Please help me with this error:
Traceback (most recent call last):
File "c:\Users\Dell Latitude 7480\Desktop\HC\HC.CHECKER.py", line 17, in
read_file = pd.read_excel(abs_folder_path, sheet_name = 'MSAN Cabinets')
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 457, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 1376, in init
ext = inspect_excel_format(
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 1250, in inspect_excel_format
with get_handle(
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\common.py", line 795, in get_handle
handle = open(handle, ioargs.mode)
PermissionError: [Errno 13] Permission denied: 'C:\Users\Dell Latitude 7480\Desktop\HC'
When running this:
import pandas as pd
import numpy as np
import os
print("Fixed Network Health Check Checker")
folder_path = name = input("Enter the folder path of your Raw files: ")
abs_folder_path = os.path.abspath(folder_path)
read_file = pd.read_excel(abs_folder_path, sheet_name = 'MSAN Cabinets')
read_file.to_csv(abs_folder_path, index = None, header = True)
df = pd.read_csv(abs_folder_path, encoding='unicode_escape')
#fixed
df['MSAN Interface'] = df['MSAN Interface'].replace(np.nan, 0)
df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] = df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'].replace(np.nan, 0)
df['Homing AG2'] = df['Homing AG1'].replace(np.nan, 0)
df = df.iloc[:11255]
# filter "REGION" and drop unnecessary columns
f_df1 = df[df['REGION'] == 'MIN']
dropcols_df1 = f_df1.drop(df.iloc[:, 1:6], axis = 1)
dropcols_df2 = dropcols_df1.drop(df.iloc[:, 22:27], axis = 1)
dropcols_df3 = dropcols_df2.drop(df.iloc[:, 37:50], axis = 1)
# filter "MSAN Interface" and filter the peak util for >= 50%
f_d2 = dropcols_df3['MSAN Interface'] != 0
msan_int = dropcols_df3[f_d2]
f_msan_int = msan_int['Peak Util'] >= 0.5
new_df = msan_int[f_msan_int]
# filter "ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)" and filter the peak util for >= 50%
fblank_msan_int = dropcols_df3['MSAN Interface'] == 0
msan_int1 = dropcols_df3[fblank_msan_int]
f_df3 = dropcols_df3['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] != 0
access_int1 = dropcols_df3[f_df3]
f_access_int1 = access_int1['Peak Util.1'] >= 0.5
new_df1 = access_int1[f_access_int1]
# filter "Homing AG1" and filter the peak util for >= 50%
fblank_msan_int1 = dropcols_df3['MSAN Interface'] == 0
msan_int2 = dropcols_df3[fblank_msan_int1]
f_access_int2 = msan_int2['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] == 0
new_df2 = msan_int2[f_access_int2]
ag1 = new_df2['Peak Util.3'] >= 0.5
new_df3 = new_df2[ag1]
# Concatenate all DataFrames
pdList = [new_df, new_df1, new_df3]
final_df = pd.concat(pdList)
print(final_df.to_csv('output.csv', index = False))
Could someone help me?
Python version 3.9.12, Windows 10 x64
Thanks
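The traceback shows open() being handed 'C:\Users\Dell Latitude 7480\Desktop\HC', which is the folder itself: abs_folder_path never points at a workbook, and opening a directory on Windows raises PermissionError (Errno 13). A minimal sketch of the likely fix, assuming the raw workbook is a single file inside that folder (the .xlsx and .csv file names below are placeholders):

import os
import pandas as pd

folder_path = input("Enter the folder path of your Raw files: ")
abs_folder_path = os.path.abspath(folder_path)
# point pandas at a file inside the folder, not at the folder itself
excel_path = os.path.join(abs_folder_path, 'raw.xlsx')   # placeholder name
csv_path = os.path.join(abs_folder_path, 'raw.csv')      # placeholder name
read_file = pd.read_excel(excel_path, sheet_name='MSAN Cabinets')
read_file.to_csv(csv_path, index=None, header=True)
df = pd.read_csv(csv_path, encoding='unicode_escape')

Note that the original code also passed abs_folder_path to to_csv and read_csv, so the same error would have come back on those lines even if read_excel had succeeded.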
I tried to make a dashboard using panel.
Traceback (most recent call last):
File "/home/istts/Desktop/P3/DBPANEL.py", line 70, in <module>
interDataFrame[(interDataFrame.month <= mnthSlider)]
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 499, in __call__
clone = new._clone(method(*args, **kwargs), plot=new._method == 'plot')
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 370, in _clone
return type(self)(self._obj, fn=self._fn, transform=transform, plot=plot, depth=depth,
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 276, in __init__
self._current = self._transform.apply(ds, keep_index=True, compute=False)
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 774, in apply
data = self._apply_fn(dataset, data, fn, fn_name, args,
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 672, in _apply_fn
raise e
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 666, in _apply_fn
data = method(*args, **kwargs)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/util/_decorators.py", line 331, in wrapper
return func(*args, **kwargs)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/core/frame.py", line 6912, in sort_values
k = self._get_label_or_level_values(by, axis=axis)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/core/generic.py", line 1850, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'month'
I always get that error; I tried using a different name, but it's still KeyError: 'month'. Does anyone know what this error means?
Note: I'll put my code below. I take the data from a local database, but there's no month column in it, so I make one.
import pandas as pd
import numpy as np
import panel as pn
import hvplot.pandas
import mysql.connector as connection
import datetime as dt
pn.extension()
# DATABASE VARIABLES
DBHOST = "localhost" # the host: "localhost" or the database IP
DBUSER = "ISTTS" # the user
DBPASS = "digital123" # the user password
DBDATA = "exampledb" # the database to read from
DBTABL = "Test2" # the table to read from
# END DATABASE VARIABLES
# Device list (in case of more than 1 device)
DEVICELIST = ['1'] # LATER, TAKE IT FROM DATABASE AND MAKE IT STRING
# End Device list
# Read data
try:
mydb = connection.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBDATA, use_pure=True)
query = "Select * from " + DBTABL + ";"
dataFrame = pd.read_sql(query, mydb)
#query = "Select Distinct DeviceID from " + DBTABL +";"
#DEVICELIST = pd.read_sql(query, mydb)
mydb.close()
except Exception as e:
mydb.close()
print(str(e))
# End of read data
## Some processing that needs to be done for the program
# DEVICE LIST FROM DeviceID COLUMN
# TAKE MONTH FROM Date COLUMN
dataFrame['month'] = pd.DatetimeIndex(dataFrame['Date']).month
## End of the data processing
# Make the dataframe interactive
interDataFrame = dataFrame.interactive()
# End of Process
## Widgets
# Slider to see which month to inspect
mnthSlider = pn.widgets.IntSlider(name='Month slider', start=1, end=12, step=1, value=1)
# Button to pick which data to show for top graph
topYAxisData = pn.widgets.RadioButtonGroup(
name='data select',
options=['Temperature', 'Pressure'],
button_type='primary'
)
# Selector to pick which device data to show for top graph
topDeviceSelector = pn.widgets.Select(name='Device selector', options=DEVICELIST)
# Data pipeline for plotting the top graph
topDataPipeline = (
interDataFrame[(interDataFrame.month <= mnthSlider)]
.groupby(['Date'])[topYAxisData].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
topDataPlot = topDataPipeline.hvplot(x='Date', y=topYAxisData, line_width=2, title='Sensor Data')
# Device selector for bottom one
botDeviceSelector1 = pn.widgets.Select(name='Device selector 1', options=DEVICELIST)
botDeviceSelector2 = pn.widgets.Select(name='Device selector 2', options=DEVICELIST)
botDeviceSelector3 = pn.widgets.Select(name='Device selector 3', options=DEVICELIST)
# Button to pick which data to show for bottom one
botYAxisData1 = pn.widgets.RadioButtonGroup(
name='bot data select1',
options=['Temperature', 'Pressure'],
button_type='primary'
)
botYAxisData2 = pn.widgets.RadioButtonGroup(
name='bot data select2',
options=['Temperature', 'Pressure'],
button_type='primary'
)
botYAxisData3 = pn.widgets.RadioButtonGroup(
name='bot data select3',
options=['Temperature', 'Pressure'],
button_type='primary'
)
# Data pipeline for plotting
botDataPipeline1 = (
interDataFrame[(interDataFrame.month <= mnthSlider)]
.groupby(['Date'])[botYAxisData1].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
botDataPipeline2 = (
interDataFrame[(interDataFrame.month <= mnthSlider)]
.groupby(['Date'])[botYAxisData2].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
botDataPipeline3 = (
interDataFrame[(interDataFrame.month <= mnthSlider)]
.groupby(['Date'])[botYAxisData3].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
# Plotting for bot graph
botDataPlot1 = botDataPipeline1.hvplot(x='Date', y=botYAxisData1, line_width=2, title='Sensor Data 1')
botDataPlot2 = botDataPipeline2.hvplot(x='Date', y=botYAxisData2, line_width=2, title='Sensor Data 2')
botDataPlot3 = botDataPipeline3.hvplot(x='Date', y=botYAxisData3, line_width=2, title='Sensor Data 3')
## End Widgets
# Template
template = pn.template.FastListTemplate(
title='Temperature / Pressure Data',
sidebar=[pn.pane.Markdown("# What should I write here??"),
pn.pane.Markdown("### What should I write here?? AGAIN"),
pn.pane.PNG("Temperature.png", sizing_mode='scale_both'),
pn.pane.Markdown('Month Selector'),
mnthSlider],
main=[pn.Row(pn.Column(topYAxisData, topDataPlot.panel(), margin=(0,25))),
pn.Row(pn.Column(botYAxisData1, botDataPlot1.panel(), margin=(0,25)),
pn.Column(botYAxisData2, botDataPlot2.panel(), margin=(0,25)),
pn.Column(botYAxisData3, botDataPlot3.panel(), margin=(0,25)),
pn.Column(pn.Row(botDeviceSelector1), pn.Row(botDeviceSelector2, pn.Row(botDeviceSelector3))))],
accent_base_color="#84A9FF",
header_background="#84A9FF"
)
template.show()
template.servable();
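On where the KeyError comes from: each pipeline groups by 'Date' and then calls .mean().to_frame().reset_index(), so by the time sort_values(by='month') runs, the frame only contains Date and the selected value column; 'month' was consumed by the aggregation. A minimal sketch of one fix, assuming chronological order is the goal, is to sort by the column that survives the groupby:

topDataPipeline = (
    interDataFrame[interDataFrame.month <= mnthSlider]
    .groupby(['Date'])[topYAxisData].mean()
    .to_frame()
    .reset_index()
    .sort_values(by='Date')   # 'month' no longer exists here, but 'Date' does
    .reset_index(drop=True)
)

The same change applies to the three bottom pipelines. Renaming the column can't help, because whatever the name, it disappears in the aggregation unless it is part of the groupby keys.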
I tried to use "astype" and "to_numeric" but still same error
I'm trying to make a map with the number of noise complaints for each zipcode and everything runs fine, but I can't get the count number to appear on the map when I hover over each area. I tried making it into an int as the error suggested, but nothing seems to work.
import pandas as pd
df2020 = pd.read_csv('/Users/kenia/Desktop/CSCI 233 Seminar Project/311_Noise_Complaints.csv',sep=',', low_memory = False)
df2020=df2020[df2020['Created Date'].str[6:10] == '2020']
df2020['Incident Zip'].fillna(0, inplace=True)
df2020['Incident Zip'] = df2020['Incident Zip'].astype(int)
df2020_zip = df2020['Incident Zip'].value_counts().to_frame().reset_index()
df2020_zip.columns = ['postal_code', 'counts']
df2020_zip['postal_code'] = df2020_zip['postal_code'].astype(str)
df2020_zip['counts'] = df2020_zip['counts'].astype(int)
import folium
nycMap = folium.Map(location=[40.693943, -73.985880], zoom_start=10)
zipLines = '/Users/kenia/Desktop/CSCI 233 Seminar Project/zipMap.geojson.json'
df2020_zip['counts'] = df2020_zip['counts'].astype(int)
df2020_zip['counts'] = pd.Series(zipLines['counts'])
count_col = df2020_zip['counts']
bins = list(df2020_zip['counts'].quantile([0,0.2,0.4,0.6,0.8,1]))
choropleth = folium.Choropleth(geo_data = zipLines,
data=df2020_zip,
columns=['postal_code', 'counts'],
key_on='feature.properties.postalCode',
fill_color='OrRd',
fill_opacity=0.7,
line_opacity=1.0,
bins = bins,
highlight=True,
legend_name="Noise Frequency in 2020"
).add_to(nycMap)
folium.LayerControl().add_to(nycMap)
choropleth.geojson.add_child(
folium.features.GeoJsonTooltip(['postalCode','PO_NAME','count_col'])
)
nycMap.save(outfile='index.html')
Error:
Traceback (most recent call last):
File "/Users/kenia/Desktop/throwaway.py", line 20, in <module>
df2020_zip['counts'] = pd.Series(zipLines['counts'])
TypeError: string indices must be integers
Dataset: https://data.cityofnewyork.us/Social-Services/311-Noise-Complaints/p5f6-bkga
Zipcode GeoJson: https://data.beta.nyc/dataset/nyc-zip-code-tabulation-areas/resource/6df127b1-6d04-4bb7-b983-07402a2c3f90?view_id=b34c6552-9fdb-4f95-8810-0588ad1a4cc8
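The TypeError itself comes from zipLines: it is just a path string, so zipLines['counts'] indexes into that string with a non-integer key. If the goal was to read values out of the GeoJSON, it has to be parsed first; a short sketch using the names from the code above:

import json

# zipLines is a file path; parse it before treating it like data
with open(zipLines) as f:
    zipGeo = json.load(f)   # a dict with a 'features' list

Separately, GeoJsonTooltip can only display properties that exist inside the GeoJSON features, so 'count_col' (a Python variable name) will never be found there; see the sketch after the answer below for one way to get the counts into the features.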
I'm trying to create a choropleth map using folium on python and I was able to get the base map running, but when I try to add a layer with neighborhood boundaries, it does not show up on the html page. I thought maybe I had to increase the line opacity, but that doesn't seem to be it.
This is my code:
import folium
import pandas as pd
crimeData = pd.read_csv('NYC_crime.csv')
crime2020 = crimeData[crimeData.CMPLNT_FR_DT == 2020]
nycMap = folium.Map(location=[40.693943, -73.985880],zoom_start = 10)
mapLines = 'nbhdMap.geojson.json'
folium.Choropleth(geo_data = mapLines,
data = crime2020,
fill_color = 'OrRd',
fill_opacity=0.5,
line_opacity=1.0,
key_on = 'feature.geometry.coordinates',
columns = ['Lat_Lon']
)
nycMap.save(outfile='index.html')
I'm also having trouble filling the map with data. I'm trying to make it so that each complaint documented in the CSV file from 2020 is used to show which areas received the most calls. But I get this error:
Traceback (most recent call last):
File "/Users/kenia/Desktop/CSCI233/PRAC.py", line 10, in <module>
folium.Choropleth(geo_data = mapLines,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/folium/features.py", line 1158, in __init__
color_data = data.set_index(columns[0])[columns[1]].to_dict()
IndexError: list index out of range
This is the neighborhood boundaries: https://data.beta.nyc/dataset/pediacities-nyc-neighborhoods/resource/35dd04fb-81b3-479b-a074-a27a37888ce7
And this is my data: https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243
[EDIT] So I tried @r-beginners' suggestion with a simpler dataset: https://data.cityofnewyork.us/Health/Restaurants-rolled-up-/59dk-tdhz
import pandas as pd
import folium
data = pd.read_csv('nycrestaurants.csv')
data = pd.concat([data, str(data['ZIPCODE']).split(',')], axis=1)
data.columns = ['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE']
resData = data.groupby(['ZIPCODE'])['DBA'].sum().reset_index()
nycMap = folium.Map(location=[40.693943, -73.985880],zoom_start = 10)
mapLines = 'zipMap.geojson.json'
folium.Choropleth(geo_data = mapLines,
data = resData,
key_on = 'feature.properties.postalCode',
columns = ['ZIPCODE', 'DBA'],
fill_color = 'OrRd',
fill_opacity=0.5,
line_opacity=1.0
).add_to(nycMap)
nycMap.save(outfile='index.html')
But now I'm getting this error message:
Traceback (most recent call last):
File "/Users/kenia/Desktop/CSCI233/PRAC.py", line 5, in <module>
data = pd.concat([data, str(data['ZIPCODE']).split(',')], axis=1)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/reshape/concat.py", line 274, in concat
op = _Concatenator(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/reshape/concat.py", line 359, in __init__
raise TypeError(msg)
TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid
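The concat fails because str(data['ZIPCODE']).split(',') produces a plain Python list (built from the Series' printed representation, at that), and pd.concat only accepts Series and DataFrame objects. If the intent was to split comma-separated zip codes into their own columns, the pandas string accessor returns a DataFrame that concat will take; a sketch under that assumption:

# split each row's comma-separated zip codes into separate columns
zip_cols = data['ZIPCODE'].astype(str).str.split(',', expand=True)
data = pd.concat([data, zip_cols], axis=1)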
Since you presented the complaint data in another question, I got the GeoJSON data from here for the corresponding zip code areas. As for the process, it tabulates the complaints per zip code and ties each zip code to its number of occurrences.
import pandas as pd
import numpy as np
df = pd.read_csv('./data/311_Noise_Complaints.csv', sep=',')
df['Incident Zip'].fillna(0, inplace=True)
df['Incident Zip'] = df['Incident Zip'].astype(int)
df_zip = df['Incident Zip'].value_counts().to_frame().reset_index()
df_zip.columns = ['postal_code', 'counts']
df_zip['postal_code'] = df_zip['postal_code'].astype(str)
import folium
nycMap = folium.Map(location=[40.693943, -73.985880], zoom_start=10)
mapLines = './data/nyc_zip_code_tabulation_areas_polygons.geojson'
choropleth = folium.Choropleth(geo_data = mapLines,
data = df_zip,
columns = ['postal_code', 'counts'],
key_on = 'feature.properties.postalcode',
fill_color = 'BuPu',
fill_opacity=0.5,
line_opacity=1.0
).add_to(nycMap)
choropleth.geojson.add_child(
folium.features.GeoJsonTooltip(['po_name'], labels=False)
)
nycMap.save(outfile='index.html')
nycMap
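To also get the count to appear on hover, which was the original goal, the numbers have to be written into the GeoJSON itself, because GeoJsonTooltip only reads feature properties. A sketch of one way to inject them before building the choropleth, assuming each feature carries the postalcode property that key_on already relies on:

import json

with open(mapLines) as f:
    geo = json.load(f)

counts = dict(zip(df_zip['postal_code'], df_zip['counts']))
for feature in geo['features']:
    zipcode = feature['properties'].get('postalcode')
    # attach the tally (0 for zip codes with no complaints)
    feature['properties']['counts'] = int(counts.get(zipcode, 0))

choropleth = folium.Choropleth(geo_data = geo,   # the enriched dict, not the path
    data = df_zip,
    columns = ['postal_code', 'counts'],
    key_on = 'feature.properties.postalcode',
    fill_color = 'BuPu',
    fill_opacity=0.5,
    line_opacity=1.0
).add_to(nycMap)
choropleth.geojson.add_child(
    folium.features.GeoJsonTooltip(['po_name', 'counts'])
)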
As a project for a course, I'm developing an image encoder/decoder using Python, but it seems I got a little stuck and I would really appreciate getting some help.
The error I'm getting is:
runfile('D:/Documentos/Python/Proyecto_Final/Main.py', wdir='D:/Documentos/Python/Proyecto_Final')
Exception in Tkinter callback
Traceback (most recent call last):
File "C:\...\anaconda3\lib\tkinter\__init__.py", line 1705, in __call__
return self.func(*args)
File "D:\Documentos\Python\Proyecto_Final\Main.py", line 32, in decode64
imagen.write(base64.decodebytes(baset.encode()))
File "C:\...\anaconda3\lib\base64.py", line 546, in decodebytes
return binascii.a2b_base64(s)
binascii.Error: Invalid base64-encoded string: number of data characters (1) cannot be 1 more than a multiple of 4
My code is:
from tkinter import *
from tkinter import filedialog
import os
import base64
def browseBtn():
filename = filedialog.askopenfilename()
texto.insert(0, filename)
def convbase64():
path = str(texto.get())
imagen = open(path, 'rb')
leeimg = imagen.read()
codigo64 = base64.encodebytes(leeimg)
texto2.insert("1.0", codigo64)
def decode64():
myFormats = [('JPEG / JFIF','*.jpg'),\
('Portable Network Graphics','*.png'),\
('Windows Bitmap','*.bmp'),('CompuServer GIF','*.gif'),]
baset = texto2.get(1.0)
filepath = filedialog.asksaveasfilename(filetypes=myFormats)
imagen = open(filepath, 'wb')
imagen.write(base64.decodebytes(baset.encode()))
imagen.close()
ventana = Tk()
ventana.title("Convertidor imagen a Base64")
ventana.geometry("800x480")
letrero = Label(ventana, text = "Imagen:")
texto = Entry(ventana, width = 100)
buscarImg = Button(ventana, text = "Elegir", command=browseBtn)
letrero2 = Label(ventana, text = "codigo base64:")
texto2 = Text(width = 75, height = 1)
btnconvertir = Button(ventana, text = "Codificar", command=convbase64)
btndecodificar = Button(ventana, text = "decodificar", command=decode64)
letrero.place(x = 32, y = 32)
letrero2.place(x = 16, y = 64)
texto.place(x = 114, y = 35)
texto2.place(x = 114, y = 69)
buscarImg.place(x = 724, y = 32)
btnconvertir.place(x = 724, y = 64)
btndecodificar.place (x = 724, y = 96)
ventana.mainloop()
I'm using anaconda's Spyder 3.7
When using Text.get to get more than one character, you need to give it an end position. You can use the special index "end" for this, or "end-1c" to ignore the trailing newline character that tkinter adds:
baset = texto2.get("1.0", "end-1c")
See this answer for more information.
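Applied to the decode64 function above (base64.decodebytes tolerates the newlines that encodebytes inserts, so nothing else needs to change):

def decode64():
    myFormats = [('JPEG / JFIF', '*.jpg'),
                 ('Portable Network Graphics', '*.png'),
                 ('Windows Bitmap', '*.bmp'), ('CompuServer GIF', '*.gif')]
    baset = texto2.get("1.0", "end-1c")   # the whole widget, not one character
    filepath = filedialog.asksaveasfilename(filetypes=myFormats)
    with open(filepath, 'wb') as imagen:
        imagen.write(base64.decodebytes(baset.encode()))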
I am testing some Python code to loop through resumes, open each, parse each, and create a comprehensive report based on the contents of each resume. Here is the code that I am running.
#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher
#Function to read resumes from the folder one by one
mypath='C:\\path_to_resumes\\' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
def pdfextract(file):
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
countpage = fileReader.getNumPages()
count = 0
text = []
while count < countpage:
pageObj = fileReader.getPage(count)
count +=1
t = pageObj.extractText()
print (t)
text.append(t)
return text
#function to read resume ends
#function that does phrase matching and builds a candidate profile
def create_profile(file):
text = pdfextract(file)
text = str(text)
text = text.replace("\\n", "")
text = text.lower()
#below is the csv where we have all the keywords, you can customize your own
keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]
matcher = PhraseMatcher(nlp.vocab)
matcher.add('Stats', None, *stats_words)
matcher.add('NLP', None, *NLP_words)
matcher.add('ML', None, *ML_words)
matcher.add('DL', None, *DL_words)
matcher.add('R', None, *R_words)
matcher.add('Python', None, *python_words)
matcher.add('DE', None, *Data_Engineering_words)
doc = nlp(text)
d = []
matches = matcher(doc)
for match_id, start, end in matches:
rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
span = doc[start : end] # get the matched slice of the doc
d.append((rule_id, span.text))
keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
## converting string of keywords to dataframe
df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1)
df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
base = os.path.basename(file)
filename = os.path.splitext(base)[0]
name = filename.split('_')
name2 = name[0]
name2 = name2.lower()
## converting str to dataframe
name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
return(dataf)
#function ends
#code to execute/call the above functions
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
#code to count words under each category and visualize it through Matplotlib
final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace = True)
final_database2.fillna(0,inplace=True)
new_data = final_database2.iloc[:,1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25,7), stacked=True)
labels = []
for j in new_data.columns:
for i in new_data.index:
label = str(j)+": " + str(new_data.loc[i][j])
labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
width = rect.get_width()
if width > 0:
x = rect.get_x()
y = rect.get_y()
height = rect.get_height()
ax.text(x + width/2., y + height/2., label, ha='center', va='center')
plt.show()
In the folder, I have '.doc' and '.docx' files. Everything seems to work fine up until this point, directly below. When I get here, the code throws an error. Here is the troublesome code. The weird thing is that it looks like some kind of PDF error, but I'm iterating only through '.doc' and '.docx' files.
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
Here is the stack trace:
Traceback (most recent call last):
File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
dat = create_profile(file)
File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
text = pdfextract(file)
File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
raise utils.PdfReadError("EOF marker not found")
PdfReadError: EOF marker not found
The code comes from here.
https://towardsdatascience.com/do-the-keywords-in-your-resume-aptly-represent-what-type-of-data-scientist-you-are-59134105ba0d
You are using the package PyPDF2, which is used to read and manipulate PDF files. In the towardsdatascience article you mentioned, all the resumes the author was working with were in PDF format.
If your resumes are in .doc/.docx format, you should explore the python-docx library instead:
https://python-docx.readthedocs.io/en/latest/index.html
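A minimal sketch of a drop-in replacement for pdfextract built on python-docx (note that python-docx reads .docx only; legacy .doc files would need to be converted first):

import docx   # pip install python-docx

def docxextract(file):
    # mirror pdfextract's output: a list of text chunks, one per paragraph
    document = docx.Document(file)
    return [paragraph.text for paragraph in document.paragraphs]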