base64 encoding image; binascii.Error: Invalid base64-encoded - python

As a project for a course, I'm developing an image encoder/decoder using Python, but I'm a little stuck and would really appreciate some help.
The error I'm getting is:
runfile('D:/Documentos/Python/Proyecto_Final/Main.py', wdir='D:/Documentos/Python/Proyecto_Final')
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\...\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
return self.func(*args)
File "D:\Documentos\Python\Proyecto_Final\Main.py", line 32, in decode64
imagen.write(base64.decodebytes(baset.encode()))
File "C:\...\anaconda3\lib\base64.py", line 546, in decodebytes
return binascii.a2b_base64(s)
binascii.Error: Invalid base64-encoded string: number of data characters (1) cannot be 1 more than a multiple of 4
My code is:
from tkinter import *
from tkinter import filedialog
import os
import base64

def browseBtn():
    filename = filedialog.askopenfilename()
    texto.insert(0, filename)

def convbase64():
    path = str(texto.get())
    imagen = open(path, 'rb')
    leeimg = imagen.read()
    codigo64 = base64.encodebytes(leeimg)
    texto2.insert("1.0", codigo64)

def decode64():
    myFormats = [('JPEG / JFIF', '*.jpg'),
                 ('Portable Network Graphics', '*.png'),
                 ('Windows Bitmap', '*.bmp'),
                 ('CompuServer GIF', '*.gif')]
    baset = texto2.get(1.0)
    filepath = filedialog.asksaveasfilename(filetypes=myFormats)
    imagen = open(filepath, 'wb')
    imagen.write(base64.decodebytes(baset.encode()))
    imagen.close()
ventana = Tk()
ventana.title("Convertidor imagen a Base64")
ventana.geometry("800x480")
letrero = Label(ventana, text = "Imagen:")
texto = Entry(ventana, width = 100)
buscarImg = Button(ventana, text = "Elegir", command=browseBtn)
letrero2 = Label(ventana, text = "codigo base64:")
texto2 = Text(width = 75, height = 1)
btnconvertir = Button(ventana, text = "Codificar", command=convbase64)
btndecodificar = Button(ventana, text = "decodificar", command=decode64)
letrero.place(x = 32, y = 32)
letrero2.place(x = 16, y = 64)
texto.place(x = 114, y = 35)
texto2.place(x = 114, y = 69)
buscarImg.place(x = 724, y = 32)
btnconvertir.place(x = 724, y = 64)
btndecodificar.place (x = 724, y = 96)
ventana.mainloop()
I'm using Anaconda's Spyder 3.7.

When using Text.get to get more than one character, you need to give it an end position. You can use the special string "end" for this, or "end-1c" to ignore the trailing newline character that tkinter adds:
baset = texto2.get("1.0", "end-1c")
See this answer for more information
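Applied to the question's code, that one-line change is all decode64 needs (a minimal sketch; the with block also makes sure the output file gets closed):

def decode64():
    myFormats = [('JPEG / JFIF', '*.jpg'),
                 ('Portable Network Graphics', '*.png'),
                 ('Windows Bitmap', '*.bmp'),
                 ('CompuServer GIF', '*.gif')]
    # grab the full contents of the Text widget, minus the trailing newline
    baset = texto2.get("1.0", "end-1c")
    filepath = filedialog.asksaveasfilename(filetypes=myFormats)
    with open(filepath, 'wb') as imagen:
        imagen.write(base64.decodebytes(baset.encode()))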

Related

Convert Image to ASCII Art

I have used a tutorial to program this Image to ASCII Art converter, but it gives me an error. Here is the code:
import PIL.Image
from tkinter import filedialog
# Ascii characters
ASCII_CHARS = ["#", "#" ,"S" ,"%" ,"?" ,"*" ,"+" , ";", ":", "," "."]
def resize_image(image, new_width=100):
    width, height = image.size
    ratio = height / width / 1.65
    new_height = int(new_width * ratio)
    resized_image = image.resize((new_width, new_height))
    return resized_image

# convert each pixel to grayscale
def grayscaler(image):
    grayscale_image = image.convert("L")
    return grayscale_image

# convert pixels to a string of ASCII characters
def pixels_to_ascii(image):
    pixels = image.getdata()
    characters = "".join([ASCII_CHARS[pixel // 25] for pixel in pixels])
    return characters

def main(new_width=100):
    # Get image path
    path = filedialog.askopenfilename(title="Select an image", filetypes=(("Image File", "*.png"), ("all files", "*.*")))
    try:
        image = PIL.Image.open(path)
    except:
        print(path, "is not a valid pathname to an image")
    # convert image to ascii
    new_image_data = pixels_to_ascii(grayscaler(resize_image(image)))
    # format
    pixel_count = len(new_image_data)
    ascii_image = "\n".join(new_image_data[i:(i + new_width)] for i in range(0, pixel_count, new_width))
    # print result
    print(ascii_image)
    # save result to "ascii_image.txt"
    with open("ascii_image.txt", "w") as f:
        f.write(ascii_image)

main()
It is supposed to print and save the ASCII art. Instead, it gives me this error at lines 34 and 22:
Traceback (most recent call last):
File "c:\Users\Forna\Documents\PYTHON\Image to Ascii\main.py", line 47, in <module>
main()
File "c:\Users\Forna\Documents\PYTHON\Image to Ascii\main.py", line 34, in main
new_image_data = pixels_to_ascii(grayscaler(resize_image(image)))
File "c:\Users\Forna\Documents\PYTHON\Image to Ascii\main.py", line 22, in pixels_to_ascii
characters = "".join([ASCII_CHARS[pixel // 25]for pixel in pixels])
File "c:\Users\Forna\Documents\PYTHON\Image to Ascii\main.py", line 22, in <listcomp>
characters = "".join([ASCII_CHARS[pixel // 25]for pixel in pixels])
IndexError: list index out of range
It would be very nice if someone could help me.
The error was in this line:
ASCII_CHARS = ["#", "#" ,"S" ,"%" ,"?" ,"*" ,"+" , ";", ":", "," "."]
I was missing a comma after ",". My bad.
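For reference, the corrected list:

ASCII_CHARS = ["#", "#", "S", "%", "?", "*", "+", ";", ":", ",", "."]

Without that comma, "," "." concatenates into a single ",." entry, leaving only 10 elements; grayscale values run 0 to 255, so pixel // 25 can reach 10, which needs an 11th element.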

Tensorflow Object Detection API: "FailedPreconditionError: HashTable has different value for same key. Key item { has 0 and trying to add value 4"

I am trying to work with this YOLOv4-in-TensorFlow-2.0 model; it requires the dataset to be in TensorFlow Object Detection API format. Since I had no prior experience with this API, I followed Adrian Rosebrock's tutorial from his ImageNet bundle book.
I'm working with the CrowdHuman dataset, and I've simplified its annotations a bit, into the format below:
/content/gdrive/MyDrive/Datasets/CrowdHuman/crowdHuman_2671.jpg,head,126,243,133,251
/content/gdrive/MyDrive/Datasets/CrowdHuman/crowdHuman_2671.jpg,person,-20,241,19,343
/content/gdrive/MyDrive/Datasets/CrowdHuman/crowdHuman_2671.jpg,head,-12,242,0,254
/content/gdrive/MyDrive/Datasets/CrowdHuman/crowdHuman_1343.jpg,person,-10,281,203,825
/content/gdrive/MyDrive/Datasets/CrowdHuman/crowdHuman_1343.jpg,head,65,293,127,374
I barely changed the code in the tutorial other than paths and file names. The code files I used are as follows:
1 Config.py
import os
BASE_PATH = "/content/gdrive/MyDrive/Datasets/CrowdHuman/"
ANNOT_PATH = BASE_PATH + "annot_train_final.txt"
# Output paths
TRAIN_RECORD = BASE_PATH + "records/training.record"
TEST_RECORD = BASE_PATH + "records/testing.record"
CLASSES_FILE = BASE_PATH + "records/classes.pbtxt"
TEST_SIZE = 0.10
CLASSES = {"person": 1, "head": 2}
2 tfAnnotations.py
from object_detection.utils.dataset_util import bytes_list_feature
from object_detection.utils.dataset_util import float_list_feature
from object_detection.utils.dataset_util import int64_list_feature
from object_detection.utils.dataset_util import int64_feature
from object_detection.utils.dataset_util import bytes_feature
class TFAnnotation:
    def __init__(self):
        # initialize the bounding box and label lists
        self.xMins = []
        self.xMaxs = []
        self.yMins = []
        self.yMaxs = []
        self.textLabels = []
        self.classes = []
        self.difficult = []
        # initialize additional variables, including the image itself,
        # its spatial dimensions, encoding, and filename
        self.image = None
        self.width = None
        self.height = None
        self.encoding = None
        self.filename = None

    def build(self):
        # encode the attributes using their respective
        # TensorFlow encoding functions
        w = int64_feature(self.width)
        h = int64_feature(self.height)
        filename = bytes_feature(self.filename.encode("utf8"))
        encoding = bytes_feature(self.encoding.encode("utf8"))
        image = bytes_feature(self.image)
        xMins = float_list_feature(self.xMins)
        xMaxs = float_list_feature(self.xMaxs)
        yMins = float_list_feature(self.yMins)
        yMaxs = float_list_feature(self.yMaxs)
        textLabels = bytes_list_feature(self.textLabels)
        classes = int64_list_feature(self.classes)
        difficult = int64_list_feature(self.difficult)
        # construct the TensorFlow-compatible data dict
        data = {
            "image/height": h,
            "image/width": w,
            "image/filename": filename,
            "image/source_id": filename,
            "image/encoded": image,
            "image/format": encoding,
            "image/object/bbox/xmin": xMins,
            "image/object/bbox/xmax": xMaxs,
            "image/object/bbox/ymin": yMins,
            "image/object/bbox/ymax": yMaxs,
            "image/object/class/text": textLabels,
            "image/object/class/label": classes,
            "image/object/difficult": difficult,
        }
        return data
3 records_builder.py
import crowdHuman_config as config
from tfAnnotation import TFAnnotation
from sklearn.model_selection import train_test_split
from PIL import Image
import tensorflow.compat.v1 as tf
import os
def main(_):
    # open the classes output file
    f = open(config.CLASSES_FILE, "w")
    # loop over the classes
    for (k, v) in config.CLASSES.items():
        # construct the class info and write it to file
        item = ("item {\n"
                "\tid: " + str(v) + "\n"
                "\tname: '" + k + "'\n"
                "}\n")
        f.write(item)
    f.close()
    # initialize a data dict used to map each image filename to
    # all bounding boxes associated with the image, then load the
    # contents of the annotations file
    D = {}
    rows = open(config.ANNOT_PATH).read().strip().split('\n')
    # loop over the individual rows
    for row in rows:
        # break the row into components
        row = row.split(",")
        (imagePath, label, startX, startY, endX, endY) = row
        (startX, startY) = (float(startX), float(startY))
        (endX, endY) = (float(endX), float(endY))
        # if we are not interested in the label, ignore it
        if label not in config.CLASSES:
            continue
        # build the path to the input image, then grab any other
        # boxes and labels associated with the image path
        #p = os.path.sep.join([config.BASE_PATH, imagePath])
        p = imagePath
        b = D.get(p, [])
        # build a tuple consisting of the label and bounding box,
        # then update the list and store it in the dict
        b.append((label, (startX, startY, endX, endY)))
        D[p] = b
    #print("[INFO] list(D.keys()): {}".format(list(D.keys())))
    # create training and testing splits from our data dict
    (trainKeys, testKeys) = train_test_split(list(D.keys()),
        test_size=config.TEST_SIZE, random_state=42)
    # initialize the data split files
    datasets = [
        ("train", trainKeys, config.TRAIN_RECORD),
        ("test", testKeys, config.TEST_RECORD)
    ]
    # loop over the datasets
    for (dType, keys, outputPath) in datasets:
        # initialize the TensorFlow writer and the total number
        # of examples written to file
        print("[INFO] processing '{}'...".format(dType))
        writer = tf.python_io.TFRecordWriter(outputPath)
        total = 0
        # loop over all the keys in the current set
        for k in keys:
            # load the input image from disk as a TensorFlow object
            encoded = tf.gfile.GFile(k, "rb").read()
            encoded = bytes(encoded)
            # load the image from disk again, this time as a PIL object
            pilImage = Image.open(k)
            (w, h) = pilImage.size[:2]
            # parse the filename and encoding from the input path
            filename = k.split(os.path.sep)[-1]
            encoding = filename[filename.rfind(".") + 1:]
            # initialize the annotation object used to store
            # info about the bounding boxes and labels
            tfAnnot = TFAnnotation()
            tfAnnot.image = encoded
            tfAnnot.encoding = encoding
            tfAnnot.filename = filename
            tfAnnot.width = w
            tfAnnot.height = h
            # loop over the bounding boxes and labels associated with the image
            for (label, (startX, startY, endX, endY)) in D[k]:
                # TensorFlow assumes all bounding boxes are in the
                # range [0, 1], so we need to scale them
                xMin = startX / w
                xMax = endX / w
                yMin = startY / h
                yMax = endY / h
                # update the bounding box and label lists
                tfAnnot.xMins.append(xMin)
                tfAnnot.xMaxs.append(xMax)
                tfAnnot.yMins.append(yMin)
                tfAnnot.yMaxs.append(yMax)
                tfAnnot.textLabels.append(label.encode("utf8"))
                tfAnnot.classes.append(config.CLASSES[label])
                tfAnnot.difficult.append(0)
                # increment the total number of examples
                total += 1
            # encode the data point attributes using the TensorFlow
            # helper functions
            features = tf.train.Features(feature=tfAnnot.build())
            example = tf.train.Example(features=features)
            # add the example to the writer
            writer.write(example.SerializeToString())
        # close the writer and print diagnostic info for the user
        writer.close()
        print("[INFO] {} examples saved for '{}'".format(total, dType))

# check to see if the main thread should be started
if __name__ == "__main__":
    tf.app.run()
All these scripts ran successfully and produced the desired files, but when I tried using this dataset with the model mentioned above, it gave me the error below. I couldn't find any closely related solution, so I'd really appreciate the help.
ERROR:
Traceback (most recent call last):
File "train.py", line 195, in <module>
app.run(main)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 303, in run
_run_main(main, args)
File "/usr/local/lib/python3.7/dist-packages/absl/app.py", line 251, in _run_main
sys.exit(main(argv))
File "train.py", line 64, in main
FLAGS.dataset, FLAGS.classes, FLAGS.size)
File "/content/yolov3-tf2/yolov3_tf2/dataset.py", line 124, in load_tfrecord_dataset
class_file, tf.string, 0, tf.int64, LINE_NUMBER, delimiter="\n"), -1)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/lookup_ops.py", line 314, in __init__
super(StaticHashTable, self).__init__(default_value, initializer)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/lookup_ops.py", line 185, in __init__
self._init_op = self._initialize()
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/lookup_ops.py", line 188, in _initialize
return self._initializer.initialize(self)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/lookup_ops.py", line 744, in initialize
-1 if self._vocab_size is None else self._vocab_size, self._delimiter)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/ops/gen_lookup_ops.py", line 363, in initialize_table_from_text_file_v2
_ops.raise_from_not_ok_status(e, name)
File "/usr/local/lib/python3.7/dist-packages/tensorflow/python/framework/ops.py", line 6862, in raise_from_not_ok_status
six.raise_from(core._status_to_exception(e.code, message), None)
File "<string>", line 3, in raise_from
tensorflow.python.framework.errors_impl.FailedPreconditionError: HashTable has different value for same key. Key item { has 0 and trying to add value 4 [Op:InitializeTableFromTextFileV2]

"ValueError: Expected 2D array, got 1D array instead" when fitting data into model

I've looked at some questions about the same issue, but none of them helped. The problem is as the title says: I'm unable to fit the data into the learning model.
This is the main file, which calls the class that loads the data I use to fit the model:
def main():
    action = input(
        "Choose an action:\n A - Create LinearSVC classifier\n B - Create Random Forest Classifier\n C - Create K Nearest Neighbor classifier\n -> ").upper()
    loader = ImageLoader()
    if action == "A":
        lsvc = LinearSVC(random_state=0, tol=1e-5)
        lsvc.fit(loader.hogArray(), loader.labelArray())
        joblib.dump(lsvc, './LSVCmodel.pkl')
    elif action == "B":
        rfc = RandomForestClassifier(n_estimators=100)
        rfc.fit(loader.hogArray(), loader.labelArray())
        joblib.dump(rfc, './RFmodel.pkl')
    elif action == "C":
        knc = KNeighborsClassifier(n_neighbors=3)
        knc.fit(loader.hogArray(), loader.labelArray())
        joblib.dump(knc, './KNCmodel.pkl')
    else:
        print("That's not a valid answer")

main()
The same error occurs with all 3 models. The class that retrieves the data is written as follows:
class ImageProcess:
    def __init__(self, image, hog_data=None):
        self.hog_data = hog_data
        self.image = image

    def hog_data_extractor(self):
        self.hog_data = feature.hog(self.image) / 255.0
        return self.hog_data

    def normalize(self):
        imageRead = cv2.resize(cv2.imread(self.image), (150, 150))
        gaussImage = cv2.fastNlMeansDenoisingColored(imageRead, None, 10, 10, 7, 21)
        self.image = cv2.Canny(gaussImage, 100, 200)
        self.image = cv2.cvtColor(self.image, cv2.COLOR_GRAY2RGB)
        self.image *= np.array((0, 0, 1), np.uint8)
        return self.image

class ImageLoader:
    def __init__(self):
        self.sourcePath = "dataset/seg_train/"
        self.labels = ['Buildings', 'Forest', 'Glacier', 'Mountain', 'Sea', 'Street']
        self.x_train = []
        self.y_train = []

    def fillArray(self):
        label_train = []
        le = LabelEncoder()
        run_time = time.time()
        for scene in self.labels:
            scene_path = os.path.join(self.sourcePath, scene.lower())
            file_number = 0
            scene_length = len([image for image in os.listdir(scene_path)])
            for img in os.listdir(scene_path):
                per = (file_number / scene_length)
                arrow = '-' * int(round(per * 100) - 1) + '>'
                spaces = ' ' * (100 - len(arrow))
                sys.stdout.write(
                    "\rProgress: [{0}] {1}% - Elapsed time: {2}".format(
                        arrow + spaces, int(round(per * 100, 2)), int(time.time() - run_time)))
                file_number += 1
                img_path = os.path.join(scene_path, img)
                process = ImageProcess(img_path)
                self.x_train.append(process.hog_data_extractor())
                label_train.append(str(scene))
        self.y_train = le.fit_transform(label_train)

    def hogArray(self):
        return self.x_train

    def labelArray(self):
        return self.y_train
A side note: previously I didn't have this ImageLoader class; fillArray() was simply a function called under main(), and it didn't give this error, everything worked well. But due to some restrictions I have to follow, I moved it into a class so it can be used from other files.
Traceback (most recent call last):
File "main.py", line 35, in <module>
main()
File "main.py", line 19, in main
lsvc.fit(loader.hogArray(), loader.labelArray())
File "/home/filipe/Documents/NovaPasta/2019_20/LP_recuperacao/Trabalho_recuperacao/venv/lib/python3.6/site-packages/sklearn/svm/classes.py", line 229, in fit
accept_large_sparse=False)
File "/home/filipe/Documents/NovaPasta/2019_20/LP_recuperacao/Trabalho_recuperacao/venv/lib/python3.6/site-packages/sklearn/utils/validation.py", line 756, in check_X_y
estimator=estimator)
File "/home/filipe/Documents/NovaPasta/2019_20/LP_recuperacao/Trabalho_recuperacao/venv/lib/python3.6/site-packages/sklearn/utils/validation.py", line 552, in check_array
"if it contains a single sample.".format(array))
ValueError: Expected 2D array, got 1D array instead:
array=[].
Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
I've tried reshaping as the error recommends, but that raises AttributeError: 'list' object has no attribute 'reshape', and since I didn't need any reshaping before, I assumed that wasn't the solution.
Sorry if it's poor coding, but I'm not much of an expert (not even close), and the time I had to do this was very short, so I just focused on getting it to work properly.
You are never calling fillArray(), so the lists are empty; the array=[] in the error shows this. Try calling it at the end of the __init__ method.
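A minimal sketch of that change, so the arrays are filled before the getters are called:

class ImageLoader:
    def __init__(self):
        self.sourcePath = "dataset/seg_train/"
        self.labels = ['Buildings', 'Forest', 'Glacier', 'Mountain', 'Sea', 'Street']
        self.x_train = []
        self.y_train = []
        # populate x_train/y_train up front so hogArray() and
        # labelArray() never return empty lists
        self.fillArray()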

Trying to parse Word Documents and getting PdfReadError: EOF marker not found

I am testing some Python code to loop through resumes, open each, parse each, and create a comprehensive report based on the contents of each resume. Here is the code that I am running.
#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher

#Function to read resumes from the folder one by one
mypath = 'C:\\path_to_resumes\\' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]

def pdfextract(file):
    fileReader = PyPDF2.PdfFileReader(open(file, 'rb'))
    countpage = fileReader.getNumPages()
    count = 0
    text = []
    while count < countpage:
        pageObj = fileReader.getPage(count)
        count += 1
        t = pageObj.extractText()
        print(t)
        text.append(t)
    return text
#function to read resume ends

#function that does phrase matching and builds a candidate profile
def create_profile(file):
    text = pdfextract(file)
    text = str(text)
    text = text.replace("\\n", "")
    text = text.lower()
    #below is the csv where we have all the keywords, you can customize your own
    keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
    stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis=0)]
    NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis=0)]
    ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis=0)]
    DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis=0)]
    R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis=0)]
    python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis=0)]
    Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis=0)]
    matcher = PhraseMatcher(nlp.vocab)
    matcher.add('Stats', None, *stats_words)
    matcher.add('NLP', None, *NLP_words)
    matcher.add('ML', None, *ML_words)
    matcher.add('DL', None, *DL_words)
    matcher.add('R', None, *R_words)
    matcher.add('Python', None, *python_words)
    matcher.add('DE', None, *Data_Engineering_words)
    doc = nlp(text)
    d = []
    matches = matcher(doc)
    for match_id, start, end in matches:
        rule_id = nlp.vocab.strings[match_id]  # get the unicode ID, i.e. 'COLOR'
        span = doc[start:end]  # get the matched slice of the doc
        d.append((rule_id, span.text))
    keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i, j in Counter(d).items())
    ## converting the string of keywords to a dataframe
    df = pd.read_csv(StringIO(keywords), names=['Keywords_List'])
    df1 = pd.DataFrame(df.Keywords_List.str.split(' ', 1).tolist(), columns=['Subject', 'Keyword'])
    df2 = pd.DataFrame(df1.Keyword.str.split('(', 1).tolist(), columns=['Keyword', 'Count'])
    df3 = pd.concat([df1['Subject'], df2['Keyword'], df2['Count']], axis=1)
    df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
    base = os.path.basename(file)
    filename = os.path.splitext(base)[0]
    name = filename.split('_')
    name2 = name[0]
    name2 = name2.lower()
    ## converting str to dataframe
    name3 = pd.read_csv(StringIO(name2), names=['Candidate Name'])
    dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis=1)
    dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace=True)
    return dataf
#function ends

#code to execute/call the above functions
final_database = pd.DataFrame()
i = 0
while i < len(onlyfiles):
    file = onlyfiles[i]
    dat = create_profile(file)
    final_database = final_database.append(dat)
    i += 1
print(final_database)

#code to count words under each category and visualize it through Matplotlib
final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace=True)
final_database2.fillna(0, inplace=True)
new_data = final_database2.iloc[:, 1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25, 7), stacked=True)
labels = []
for j in new_data.columns:
    for i in new_data.index:
        label = str(j) + ": " + str(new_data.loc[i][j])
        labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
    width = rect.get_width()
    if width > 0:
        x = rect.get_x()
        y = rect.get_y()
        height = rect.get_height()
        ax.text(x + width / 2., y + height / 2., label, ha='center', va='center')
plt.show()
In the folder, I have '.doc' and '.docx' files. Everything seems to work fine up until the point directly below; when the code gets here, it throws an error. Here is the troublesome code. The weird thing is that it looks like some kind of PDF error, but I'm iterating only through '.doc' and '.docx' files.
final_database = pd.DataFrame()
i = 0
while i < len(onlyfiles):
    file = onlyfiles[i]
    dat = create_profile(file)
    final_database = final_database.append(dat)
    i += 1
print(final_database)
Here is the stack trace:
Traceback (most recent call last):
File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
dat = create_profile(file)
File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
text = pdfextract(file)
File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
raise utils.PdfReadError("EOF marker not found")
PdfReadError: EOF marker not found
The code comes from here.
https://towardsdatascience.com/do-the-keywords-in-your-resume-aptly-represent-what-type-of-data-scientist-you-are-59134105ba0d
You are using the PyPDF2 package, which reads and manipulates PDF files. In the towardsdatascience article you mentioned, all the resumes the author was working with were in PDF format.
Since your resumes are in doc/docx format, you should explore the python-docx library instead:
https://python-docx.readthedocs.io/en/latest/index.html
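For example, a minimal sketch of a docx-based replacement for pdfextract (the function name is just illustrative; note that python-docx reads .docx files only, so legacy .doc files would need converting first):

import docx  # pip install python-docx

def docxextract(file):
    # collect the text of every paragraph in the document
    document = docx.Document(file)
    return [paragraph.text for paragraph in document.paragraphs]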

How to save data in .csv file in row and column form using numpy

I am trying to read an image using OpenCV, and after reading it I get some data that I have to save in a CSV file using NumPy. Here is the program:
import cv2 as cv
import numpy as np
import os
img1 = cv.imread('C:/Users/sbans/Pictures/bird.jpg')
dataA1 = os.path.basename('C:/Users/sbans/Pictures/bird.jpg')
height, width, channels = img1.shape
dataA2 = height
dataA3 = width
dataA4 = channels
a = int(height/2)
b = int(width/2)
px1 = img1[a,b]
dataA5 = px1[0]
dataA6 = px1[1]
dataA7 = px1[2]
a = np.array([dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7])
img2 = cv.imread('C:/Users/sbans/Pictures/cat.jpg')
dataB1 = os.path.basename('C:/Users/sbans/Pictures/cat.jpg')
height, width, channels = img2.shape
dataB2 = height
dataB3 = width
dataB4 = channels
a = int(height/2)
b = int(width/2)
px2 = img2[a,b]
dataB5 = px2[0]
dataB6 = px2[1]
dataB7 = px2[2]
b = np.array([dataB1, dataB2, dataB3, dataB4, dataB5, dataB6, dataB7])
np.savetxt("stats.csv", np.stack((a,b)), delimiter=",", fmt='%s')
This is the error I am getting:
Traceback (most recent call last):
File "C:\Users\sbans\Documents\demo_opencv.py", line 32, in
np.savetxt("stats.csv", np.stack((a,b)), delimiter=",", fmt='%s')
File "<array_function internals>", line 6, in stack
File "C:\Users\sbans\AppData\Local\Programs\Python\Python37\lib\site-packages\numpy\core\shape_base.py", line 425, in stack
raise ValueError('all input arrays must have the same shape')
ValueError: all input arrays must have the same shape
You could simplify the code a bit by defining a function:
def get_array(file):
    img = cv.imread(file)
    basename = os.path.basename(file)
    height, width, channels = img.shape
    h = int(height / 2)
    w = int(width / 2)
    px = img[h, w]
    return np.array([basename, height, width, channels, px[0], px[1], px[2]])
Then np.savetxt can accept a sequence of same-sized 1D arrays:
a = get_array('C:/Users/sbans/Pictures/bird.jpg')
b = get_array('C:/Users/sbans/Pictures/cat.jpg')
np.savetxt("stats.csv", (a, b), delimiter=",", fmt='%s')
The default behavior of np.savetxt is to replace the existing file with the new data.
If you want to write data to the file sequentially, you need to open a file handle and pass it to np.savetxt.
For your case:
f = open('stats.csv','w')
...
np.savetxt(f, np.row_stack(np.column_stack((dataA1, dataA2, dataA3, dataA4, dataA5, dataA6, dataA7))), delimiter=",", fmt='%s')
...
np.savetxt(f, np.row_stack(np.column_stack((dataB1, dataB2, dataB3, dataB4, dataB5, dataB6, dataB7))), delimiter=",", fmt='%s')
f.close()
