how to solve "KeyError :'month'"

how to solve "KeyError :'month'" - python

I tried to make a dashboard using panel.
Traceback (most recent call last):
File "/home/istts/Desktop/P3/DBPANEL.py", line 70, in <module>
interDataFrame[(interDataFrame.month <= mnthSlider)]
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 499, in __call__
clone = new._clone(method(*args, **kwargs), plot=new._method == 'plot')
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 370, in _clone
return type(self)(self._obj, fn=self._fn, transform=transform, plot=plot, depth=depth,
File "/home/istts/.local/lib/python3.9/site-packages/hvplot/interactive.py", line 276, in __init__
self._current = self._transform.apply(ds, keep_index=True, compute=False)
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 774, in apply
data = self._apply_fn(dataset, data, fn, fn_name, args,
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 672, in _apply_fn
raise e
File "/home/istts/.local/lib/python3.9/site-packages/holoviews/util/transform.py", line 666, in _apply_fn
data = method(*args, **kwargs)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/util/_decorators.py", line 331, in wrapper
return func(*args, **kwargs)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/core/frame.py", line 6912, in sort_values
k = self._get_label_or_level_values(by, axis=axis)
File "/home/istts/.local/lib/python3.9/site-packages/pandas/core/generic.py", line 1850, in _get_label_or_level_values
raise KeyError(key)
KeyError: 'month'
I always get that error, tried to use different name but still KeyError : 'month'. does any one know what is that error
note : I'll put my code below. I take the data from local database, but there's no month column so I make one.
import pandas as pd
import numpy as np
import panel as pn
import hvplot.pandas
import mysql.connector as connection
import datetime as dt
pn.extension()
# DATABSE VARIABLE
DBHOST = "localhost" #The Host use "localhost" or the database IP
DBUSER = "ISTTS" #The user
DBPASS = "digital123" #The user password
DBDATA = "exampledb" #The database to insert
DBTABL = "Test2" #The table to insert
# END DATABASE VARIABLE
# Device list (Incase of more than 1 device)
DEVICELIST = ['1'] # LATER, TAKE IT FROM DATABASE AND MAKE IT STRING
# End Device list
# Read data
try:
mydb = connection.connect(host=DBHOST, user=DBUSER, password=DBPASS, database=DBDATA, use_pure=True)
query = "Select * from " + DBTABL + ";"
dataFrame = pd.read_sql(query, mydb)
#query = "Select Distinct DeviceID from " + DBTABL +";"
#DEVICELIST = pd.read_sql(query, mydb)
mydb.close()
except Exception as e:
mydb.close()
print(str(e))
# End of read data
## Some process that need to be made for the program
# DEVICE LIST FROM DeviceID COLUMN
# TAKE MONTH FROM Date COLUMN
dataFrame['month'] = pd.DatetimeIndex(dataFrame['Date']).month
## End of the data processing
# Make the dataframe interactive
interDataFrame = dataFrame.interactive()
# End of Process
## Widgets
# Slider to see which month to inspect
mnthSlider = pn.widgets.IntSlider(name='Month slider', start=1, end=12, step=1, value=1)
# Button to pick which data to show for top graph
topYAxisData = pn.widgets.RadioButtonGroup(
name='data select',
options=['Temperature', 'Pressure'],
button_type='primary'
)
# Selector to pick which device data to show for top graph
topDeviceSelector = pn.widgets.Select(name='Device selector', options=DEVICELIST)
# Data pipeline for plotting the top graph
topDataPipeline = (
interDataFrame[(interDataFrame.month <= mnthSlider)]
.groupby(['Date'])[topYAxisData].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
topDataPlot = topDataPipeline.hvplot(x='Date', y=topYAxisData, line_witdh=2, title='Sensor Data')
# Device selector for bottom one
botDeviceSelector1 = pn.widgets.Select(name='Device selector 1', options=DEVICELIST)
botDeviceSelector2 = pn.widgets.Select(name='Device selector 2', options=DEVICELIST)
botDeviceSelector3 = pn.widgets.Select(name='Device selector 3', options=DEVICELIST)
# Button to pick which data to show for bottom one
botYAxisData1 = pn.widgets.RadioButtonGroup(
name='bot data select1',
options=['Temperature', 'Pressure'],
button_type='primary'
)
botYAxisData2 = pn.widgets.RadioButtonGroup(
name='bot data select2',
options=['Temperature', 'Pressure'],
button_type='primary'
)
botYAxisData3 = pn.widgets.RadioButtonGroup(
name='bot data select3',
options=['Temperature', 'Pressure'],
button_type='primary'
)
# Data pipeline for plotting
botDataPipeline1 = (
interDataFrame[(interDataFrame.month <= monthSlider)]
.groupby(['Date'])[botYAxisData1].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
botDataPipeline2 = (
interDataFrame[(interDataFrame.month <= monthSlider)]
.groupby(['Date'])[botYAxisData2].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
botDataPipeline3 = (
interDataFrame[(interDataFrame.month <= monthSlider)]
.groupby(['Date'])[botYAxisData3].mean() ## Revision, groupby. plan : month
.to_frame()
.reset_index()
.sort_values(by='month')
.reset_index(drop = True)
)
# Plotting for bot graph
botDataPlot1 = botDataPipeline1.hvplot(x='Date', y=botYAxisData1, line_witdh=2, title='Sensor Data 1')
botDataPlot2 = botDataPipeline2.hvplot(x='Date', y=botYAxisData2, line_witdh=2, title='Sensor Data 2')
botDataPlot3 = botDataPipeline3.hvplot(x='Date', y=botYAxisData3, line_witdh=2, title='Sensor Data 3')
## End Widgets
# Template
template = pn.template.FastListTemplate(
title='Temperature / Pressure Data',
sidebar=[pn.pane.markdown("# What should I write here??"),
pn.pane.markdown("### What should I write here?? AGAIN"),
pn.pane.PNG("Temperature.png", sizing_mode='scale_both'),
pn.pane.markdown('Month Selector'),
monthSlider],
main=[pn.Row(pn.Column(topYAxisData, topDataPlot.panel(), margin=(0,25))),
pn.Row(pn.Column(botYAxisData1, botDataPlot1.panel(), margin=(0,25)),
pn.Column(botYAxisData2, botDataPlot2.panel(), margin=(0,25)),
pn.Column(botYAxisData3, botDataPlot3.panel(), margin=(0,25)),
pn.Column(pn.Row(botDeviceSelector1), pn.Row(botDeviceSelector2, pn.Row(botDeviceSelector3))))],
accent_base_color="#84A9FF",
header_background="#84A9FF"
)
template.show()
template.servable();
I tried to use "astype" and "to_numeric" but still same error

Related

Input folder path error for csv automation report

Please help me with this error:
Traceback (most recent call last):
File "c:\Users\Dell Latitude 7480\Desktop\HC\HC.CHECKER.py", line 17, in
read_file = pd.read_excel(abs_folder_path, sheet_name = 'MSAN Cabinets')
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\util_decorators.py", line 311, in wrapper
return func(*args, **kwargs)
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 457, in read_excel
io = ExcelFile(io, storage_options=storage_options, engine=engine)
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 1376, in init
ext = inspect_excel_format(
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\excel_base.py", line 1250, in inspect_excel_format
with get_handle(
File "C:\Users\Dell Latitude 7480\AppData\Local\Programs\Python\Python310\lib\site-packages\pandas\io\common.py", line 795, in get_handle
handle = open(handle, ioargs.mode)
PermissionError: [Errno 13] Permission denied: 'C:\Users\Dell Latitude 7480\Desktop\HC'
When running this:
import pandas as pd
import numpy as np
import os
print("Fixed Network Health Check Checker")
folder_path = name = input("Enter the folder path of your Raw files: ")
abs_folder_path = os.path.abspath(folder_path)
read_file = pd.read_excel(abs_folder_path, sheet_name = 'MSAN Cabinets')
read_file.to_csv(abs_folder_path, index = None, header = True)
df = pd.read_csv(abs_folder_path, encoding='unicode_escape')
#fixed
df['MSAN Interface'] = df['MSAN Interface'].replace(np.nan, 0)
df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] = df['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'].replace(np.nan, 0)
df['Homing AG2'] = df['Homing AG1'].replace(np.nan, 0)
df = df.iloc[:11255]
# filter "REGION" and drop unnecessary columns
f_df1 = df[df['REGION'] == 'MIN']
dropcols_df1 = f_df1.drop(df.iloc[:, 1:6], axis = 1)
dropcols_df2 = dropcols_df1.drop(df.iloc[:, 22:27], axis = 1)
dropcols_df3 = dropcols_df2.drop(df.iloc[:, 37:50], axis = 1)
# filter "MSAN Interface" and filter the peak util for >= 50%
f_d2 = dropcols_df3['MSAN Interface'] != 0
msan_int = dropcols_df3[f_d2]
f_msan_int = msan_int['Peak Util'] >= 0.5
new_df = msan_int[f_msan_int]
# filter "ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)" and filter the peak util for >= 50%
fblank_msan_int = dropcols_df3['MSAN Interface'] == 0
msan_int1 = dropcols_df3[fblank_msan_int]
f_df3 = dropcols_df3['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] != 0
access_int1 = dropcols_df3[f_df3]
f_access_int1 = access_int1['Peak Util.1'] >= 0.5
new_df1 = access_int1[f_access_int1]
# filter "Homing AG1" and filter the peak util for >= 50%
fblank_msan_int1 = dropcols_df3['MSAN Interface'] == 0
msan_int2 = dropcols_df3[fblank_msan_int1]
f_access_int2 = msan_int2['ACCESS Interface 1 (IPRAN, ATN, LSA, VLAN)'] == 0
new_df2 = msan_int2[f_access_int2]
ag1 = new_df2['Peak Util.3'] >= 0.5
new_df3 = new_df2[ag1]
# Concatenate all DataFrames
pdList = [new_df, new_df1, new_df3]
final_df = pd.concat(pdList)
print(final_df.to_csv('output.csv', index = False))
Could someone help me.
Python version 3.9.12 Windows 10 x64
Thanks

How to read a SQL Stored Procedure into a pandas dataframe?

Thanks all for your suggestions. Adding self. before calling my functions fixed that issue. I'm now running into a different issue where the stored procedure I am using isn't getting properly read into the pandas dataframe. I don't get any errors, the tkinter window pops up like it should, but when the PDF gets generated, there is no data in the rows, just the column names and other formatting I've written in.
I added print(df) to the code to check if it was an issue with the data getting read from the dataframe to the PDF, but print(df) just returns Empty DataFrame.
from tkinter import *
import pyodbc
import pandas as pd
from reportlab.lib import colors
from reportlab.platypus import *
from reportlab.lib import styles
from reportlab.lib.units import inch
# Create connection
server = 'XXXXXXX'
database = 'XXXXXXX'
username = 'XXXXXXXX'
password = 'XXXXXXXXX'
try:
cnxn = pyodbc.connect('DRIVER={SQL Server};SERVER='+server+';DATABASE='+database+';UID='+username+';PWD='+password+'')
except:
raise NameError('unable to connect')
#save stored procedure to a global variable
storedProc = 'EXEC [presentation].[pdf_project] '
elements = []
class NewPDF:
def __init__(self):
window = tk.Tk()
window.title("Form Example")
window = tk.Frame(window)
window.grid(column=0,row=0, sticky=(tk.N,tk.W,tk.E,tk.S))
window.columnconfigure(0, weight = 1)
window.rowconfigure(0, weight = 1)
window.pack(pady = 100, padx = 100)
self.tkvar = tk.StringVar(window)
choices = {'2021','2020','2019','2018'}
self.tkvar.set('2021')
popupMenu = tk.OptionMenu(window, self.tkvar, *choices)
tk.Label(window, text = "Pick Year").grid(row = 1, column = 1)
popupMenu.grid(row = 1, column = 2)
tk.Label(window, text = "Customer").grid(row = 2, column = 1)
self.e1 = tk.Entry(window)
self.e1.grid(row = 2, column = 2)
self.param_year = self.tkvar.get()
self.param_cust = str(self.e1.get())
B = tk.Button(window, text = "Make PDF", command=self.make_df_and_pdf()).grid(row = 3, column = 1)
self.tkvar.trace('w', self.change_dropdown())
window.mainloop()
def change_dropdown(self, *args):
print(args)
def make_df_and_pdf(self):
#param_year = self.tkvar.get()
#param_cust = str(self.e1.get())
params = "'" + self.param_cust + "'," + self.param_year + ""
querystring = storedProc + params
df = pd.read_sql_query(querystring, cnxn)
lista = [df.columns[:,].values.astype(str).tolist()] + df.values.tolist()
#cust_list = (df['CUSTOMER'].unique())
#cust_list = cust_list.tolist()
#print(df)
styles = getSampleStyleSheet()
ts = [('ALIGN', (1,1), (-1,1), 'LEFT'),
('BOX', (0,0), (3,0), 2, colors.red),
('FONT', (0,0), (-1,0), 'Times-Bold'),
('GRID', (0,1), (-1,-1), 0.5, colors.grey)]
n_table = Table(lista, colWidths = (1.5*inch, 1.5*inch, 1.5*inch, 1.5*inch, 1.5*inch), repeatRows = 1)
table_style = TableStyle(ts)
n_table.setStyle(table_style)
PATH_OUT = "Desktop"
doc = SimpleDocTemplate(PATH_OUT + 'UserInputPDF_Test.pdf')
elements.append(Paragraph("CustomerTAT", styles['Title']))
elements.append(n_table)
doc.build(elements)
NewPDF()
EDIT: The stored procedure operates as it should within SQL. Here is the code for my stored procedure:
USE [XXXXX]
GO
/****** Object: StoredProcedure [presentation].[pdf_project] Script Date: 6/22/2021 4:31:20 PM ******/
SET ANSI_NULLS ON
GO
SET QUOTED_IDENTIFIER ON
GO
-- =============================================
-- Author: XXXXXX
-- Create date:
-- Description:
-- =============================================
ALTER PROCEDURE [presentation].[pdf_project]
-- Add the parameters for the stored procedure here
#CustomerName varchar(50) = '',
#Year int = 0
AS
BEGIN
-- SET NOCOUNT ON added to prevent extra result sets from
-- interfering with SELECT statements.
SET NOCOUNT ON;
-- Insert statements for procedure here
SELECT CUSTOMER, [TAT Whole Days], [SHIP DATE], YEAR([SHIP DATE]) AS YEAR,
CASE
WHEN MONTH([SHIP DATE]) IN (1,2,3) THEN 'Q1'
WHEN MONTH([SHIP DATE]) IN (4,5,6) THEN 'Q2'
WHEN MONTH([SHIP DATE]) IN (7,8,9) THEN 'Q3'
ELSE 'Q4'
END AS QUARTER
FROM presentation.CustomerTAT
WHERE (YEAR([SHIP DATE]) = #Year or YEAR([SHIP DATE]) = #Year)
AND CUSTOMER = #CustomerName
END

Issue with creating choropleth map on Python

I'm trying to create a choropleth map using folium on python and I was able to get the base map running, but when I try to add a layer with neighborhood boundaries, it does not show up on the html page. I thought maybe I had to increase the line opacity, but that doesn't seem to be it.
This is my code:
import folium
import pandas as pd
crimeData = pd.read_csv('NYC_crime.csv')
crime2020 = crimeData[crimeData.CMPLNT_FR_DT == 2020]
nycMap = folium.Map(location=[40.693943, -73.985880],zoom_start = 10)
mapLines = 'nbhdMap.geojson.json'
folium.Choropleth(geo_data = mapLines,
data = crime2020,
fill_color = 'OrRd',
fill_opacity=0.5,
line_opacity=1.0,
key_on = 'feature.geometry.coordinates',
columns = ['Lat_Lon']
)
nycMap.save(outfile='index.html')
I'm also having trouble filling the map with data. I'm trying to make it so that each complaint documented on the CSV file from 2020 is used to show which areas received the most calls. But I get this error:
Traceback (most recent call last):
File "/Users/kenia/Desktop/CSCI233/PRAC.py", line 10, in <module>
folium.Choropleth(geo_data = mapLines,
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/folium/features.py", line 1158, in __init__
color_data = data.set_index(columns[0])[columns[1]].to_dict()
IndexError: list index out of range
This is the neighborhood boundaries: https://data.beta.nyc/dataset/pediacities-nyc-neighborhoods/resource/35dd04fb-81b3-479b-a074-a27a37888ce7
And this is my data: https://data.cityofnewyork.us/Public-Safety/NYPD-Complaint-Data-Current-Year-To-Date-/5uac-w243
[EDIT] So I tried #r-beginners suggestion with a simpler dataset: https://data.cityofnewyork.us/Health/Restaurants-rolled-up-/59dk-tdhz
import pandas as pd
import folium
data = pd.read_csv('nycrestaurants.csv')
data = pd.concat([data, str(data['ZIPCODE']).split(',')], axis=1)
data.columns = ['CAMIS', 'DBA', 'BORO', 'BUILDING', 'STREET', 'ZIPCODE']
resData = data.groupby(['ZIPCODE'])['DBA'].sum().reset_index()
nycMap = folium.Map(location=[40.693943, -73.985880],zoom_start = 10)
mapLines = 'zipMap.geojson.json'
folium.Choropleth(geo_data = mapLines,
data = resData,
key_on = 'feature.properties.postalCode',
columns = ['ZIPCODE', 'DBA'],
fill_color = 'OrRd',
fill_opacity=0.5,
line_opacity=1.0
).add_to(nycMap)
nycMap.save(outfile='index.html')
But now I'm getting this error message:
Traceback (most recent call last):
File "/Users/kenia/Desktop/CSCI233/PRAC.py", line 5, in <module>
data = pd.concat([data, str(data['ZIPCODE']).split(',')], axis=1)
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/reshape/concat.py", line 274, in concat
op = _Concatenator(
File "/Library/Frameworks/Python.framework/Versions/3.8/lib/python3.8/site-packages/pandas/core/reshape/concat.py", line 359, in __init__
raise TypeError(msg)
TypeError: cannot concatenate object of type '<class 'list'>'; only Series and DataFrame objs are valid

Since you were presented with the data of the complaint in another question, you got the GEOJSON data from here for the corresponding zip code range. As for the process, we have tabulated it by the number of zip codes and tied it to the number of occurrences.
import pandas as pd
import numpy as np
df = pd.read_csv('./data/311_Noise_Complaints.csv', sep=',')
df['Incident Zip'].fillna(0, inplace=True)
df['Incident Zip'] = df['Incident Zip'].astype(int)
df_zip = df['Incident Zip'].value_counts().to_frame().reset_index()
df_zip.columns = ['postal_code', 'counts']
df_zip['postal_code'] = df_zip['postal_code'].astype(str)
import folium
nycMap = folium.Map(location=[40.693943, -73.985880], zoom_start=10)
mapLines = './data/nyc_zip_code_tabulation_areas_polygons.geojson'
choropleth = folium.Choropleth(geo_data = mapLines,
data = df_zip,
columns = ['postal_code', 'counts'],
key_on = 'feature.properties.postalcode',
fill_color = 'BuPu',
fill_opacity=0.5,
line_opacity=1.0
).add_to(nycMap)
choropleth.geojson.add_child(
folium.features.GeoJsonTooltip(['po_name'], labels=False)
)
nycMap.save(outfile='index.html')
nycMap

Trying to parse Word Documents and getting PdfReadError: EOF marker not found

I am testing some Python code to loop through resumes, open each, parse each, and create a comprehensive report based on the contents of each resume. Here is the code that I am running.
#importing all required libraries
import PyPDF2
import os
from os import listdir
from os.path import isfile, join
from io import StringIO
import pandas as pd
from collections import Counter
import en_core_web_sm
nlp = en_core_web_sm.load()
from spacy.matcher import PhraseMatcher
#Function to read resumes from the folder one by one
mypath='C:\\path_to_resumes\\' #enter your path here where you saved the resumes
onlyfiles = [os.path.join(mypath, f) for f in os.listdir(mypath) if os.path.isfile(os.path.join(mypath, f))]
def pdfextract(file):
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
countpage = fileReader.getNumPages()
count = 0
text = []
while count < countpage:
pageObj = fileReader.getPage(count)
count +=1
t = pageObj.extractText()
print (t)
text.append(t)
return text
#function to read resume ends
#function that does phrase matching and builds a candidate profile
def create_profile(file):
text = pdfextract(file)
text = str(text)
text = text.replace("\\n", "")
text = text.lower()
#below is the csv where we have all the keywords, you can customize your own
keyword_dict = pd.read_csv('D:/NLP_Resume/resume/template_new.csv')
stats_words = [nlp(text) for text in keyword_dict['Statistics'].dropna(axis = 0)]
NLP_words = [nlp(text) for text in keyword_dict['NLP'].dropna(axis = 0)]
ML_words = [nlp(text) for text in keyword_dict['Machine Learning'].dropna(axis = 0)]
DL_words = [nlp(text) for text in keyword_dict['Deep Learning'].dropna(axis = 0)]
R_words = [nlp(text) for text in keyword_dict['R Language'].dropna(axis = 0)]
python_words = [nlp(text) for text in keyword_dict['Python Language'].dropna(axis = 0)]
Data_Engineering_words = [nlp(text) for text in keyword_dict['Data Engineering'].dropna(axis = 0)]
matcher = PhraseMatcher(nlp.vocab)
matcher.add('Stats', None, *stats_words)
matcher.add('NLP', None, *NLP_words)
matcher.add('ML', None, *ML_words)
matcher.add('DL', None, *DL_words)
matcher.add('R', None, *R_words)
matcher.add('Python', None, *python_words)
matcher.add('DE', None, *Data_Engineering_words)
doc = nlp(text)
d = []
matches = matcher(doc)
for match_id, start, end in matches:
rule_id = nlp.vocab.strings[match_id] # get the unicode ID, i.e. 'COLOR'
span = doc[start : end] # get the matched slice of the doc
d.append((rule_id, span.text))
keywords = "\n".join(f'{i[0]} {i[1]} ({j})' for i,j in Counter(d).items())
## convertimg string of keywords to dataframe
df = pd.read_csv(StringIO(keywords),names = ['Keywords_List'])
df1 = pd.DataFrame(df.Keywords_List.str.split(' ',1).tolist(),columns = ['Subject','Keyword'])
df2 = pd.DataFrame(df1.Keyword.str.split('(',1).tolist(),columns = ['Keyword', 'Count'])
df3 = pd.concat([df1['Subject'],df2['Keyword'], df2['Count']], axis =1)
df3['Count'] = df3['Count'].apply(lambda x: x.rstrip(")"))
base = os.path.basename(file)
filename = os.path.splitext(base)[0]
name = filename.split('_')
name2 = name[0]
name2 = name2.lower()
## converting str to dataframe
name3 = pd.read_csv(StringIO(name2),names = ['Candidate Name'])
dataf = pd.concat([name3['Candidate Name'], df3['Subject'], df3['Keyword'], df3['Count']], axis = 1)
dataf['Candidate Name'].fillna(dataf['Candidate Name'].iloc[0], inplace = True)
return(dataf)
#function ends
#code to execute/call the above functions
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
#code to count words under each category and visulaize it through Matplotlib
final_database2 = final_database['Keyword'].groupby([final_database['Candidate Name'], final_database['Subject']]).count().unstack()
final_database2.reset_index(inplace = True)
final_database2.fillna(0,inplace=True)
new_data = final_database2.iloc[:,1:]
new_data.index = final_database2['Candidate Name']
#execute the below line if you want to see the candidate profile in a csv format
#sample2=new_data.to_csv('sample.csv')
import matplotlib.pyplot as plt
plt.rcParams.update({'font.size': 10})
ax = new_data.plot.barh(title="Resume keywords by category", legend=False, figsize=(25,7), stacked=True)
labels = []
for j in new_data.columns:
for i in new_data.index:
label = str(j)+": " + str(new_data.loc[i][j])
labels.append(label)
patches = ax.patches
for label, rect in zip(labels, patches):
width = rect.get_width()
if width > 0:
x = rect.get_x()
y = rect.get_y()
height = rect.get_height()
ax.text(x + width/2., y + height/2., label, ha='center', va='center')
plt.show()
In the folder, I have '.doc' and '.docx' files. Everything seems to work fine, up until this point, directly below. When I get here, the code throws an error. Here is the troublesome code. The weird thing is, that it looks like some kind of PDF error, but I'm iterating only through '.doc' and '.docx' files.
final_database=pd.DataFrame()
i = 0
while i < len(onlyfiles):
file = onlyfiles[i]
dat = create_profile(file)
final_database = final_database.append(dat)
i +=1
print(final_database)
Here is the StackTrace:
Traceback (most recent call last):
File "<ipython-input-2-c63fca79d39f>", line 5, in <module>
dat = create_profile(file)
File "<ipython-input-1-cdc3bf75cd26>", line 34, in create_profile
text = pdfextract(file)
File "<ipython-input-1-cdc3bf75cd26>", line 17, in pdfextract
fileReader = PyPDF2.PdfFileReader(open(file,'rb'))
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1084, in __init__
self.read(stream)
File "C:\Users\ryans\Anaconda3\lib\site-packages\PyPDF2\pdf.py", line 1696, in read
raise utils.PdfReadError("EOF marker not found")
PdfReadError: EOF marker not found
The code comes from here.
https://towardsdatascience.com/do-the-keywords-in-your-resume-aptly-represent-what-type-of-data-scientist-you-are-59134105ba0d

You are using package PyPDF2, which is used to read and manipulate pdf files. In the article from towardsdatascience that you mentioned all resumes that author was working on were in pdf format.
Maybe if your resumes are in doc/docx format you should explore python-docx library:
https://python-docx.readthedocs.io/en/latest/index.html

Plotly Geoscatter with Aggregation: show aggregation in hover Info

I have been trying to create a Geoscatter Plot with Plotly where the marker size should indicate the number of customers (row items) in one city (zip_city). I based my code on two templates from the Plotly documentation: United States Bubble Map and the aggregation part Mapping with Aggregates.
I managed to put together a code that does what I want, except for one drawback: when I hover over a bubble, I would like to see the name of the city plus number of customers (the result from the aggregation), so something like Aguadilla: 2. Can you help me on how to do this?
Here is my code (as a beginner with plotly, I am also open to code improvements):
import plotly.offline as pyo
import pandas as pd
df = pd.DataFrame.from_dict({'Customer': [111, 222, 555, 666],
'zip_city': ['Aguadilla', 'Aguadilla', 'Arecibo', 'Wrangell'],
'zip_latitude':[18.498987, 18.498987, 18.449732,56.409507],
'zip_longitude':[-67.13699,-67.13699,-66.69879,-132.33822]})
data = [dict(
type = 'scattergeo',
locationmode = 'USA-states',
lon = df['zip_longitude'],
lat = df['zip_latitude'],
text = df['Customer'],
marker = dict(
size = df['Customer'],
line = dict(width=0.5, color='rgb(40,40,40)'),
sizemode = 'area'
),
transforms = [dict(
type = 'aggregate',
groups = df['zip_city'],
aggregations = [dict(target = df['Customer'], func = 'count', enabled = True)]
)]
)]
layout = dict(title = 'Customers per US City')
fig = dict( data=data, layout=layout )
pyo.plot( fig, validate=False)
Update:
Can I access the result of the transforms argument directly in the data argument to show the number of customers per city?

You can create a list, that will contains what you want and then set text=list in data. Also do not forget specify hoverinfo='text'.
I am updated your code, so try this:
import pandas as pd
import plotly.offline as pyo
df = pd.DataFrame.from_dict({'Customer': [111, 222, 555, 666],
'zip_city': ['Aguadilla', 'Aguadilla', 'Arecibo', 'Wrangell'],
'zip_latitude':[18.498987, 18.498987, 18.449732,56.409507],
'zip_longitude':[-67.13699,-67.13699,-66.69879,-132.33822]})
customer = df['Customer'].tolist()
zipcity = df['zip_city'].tolist()
list = []
for i in range(len(customer)):
k = str(zipcity[i]) + ':' + str(customer[i])
list.append(k)
data = [dict(
type = 'scattergeo',
locationmode = 'USA-states',
lon = df['zip_longitude'],
lat = df['zip_latitude'],
text = list,
hoverinfo = 'text',
marker = dict(
size = df['Customer'],
line = dict(width=0.5, color='rgb(40,40,40)'),
sizemode = 'area'
),
transforms = [dict(
type = 'aggregate',
groups = df['zip_city'],
aggregations = [dict(target = df['Customer'], func = 'count', enabled = True)]
)]
)]
layout = dict(title = 'Customers per US City')
fig = dict(data=data, layout=layout)
pyo.plot(fig, validate=False)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

how to solve "KeyError :'month'" - python

Related

Input folder path error for csv automation report

How to read a SQL Stored Procedure into a pandas dataframe?

Issue with creating choropleth map on Python

Trying to parse Word Documents and getting PdfReadError: EOF marker not found

Plotly Geoscatter with Aggregation: show aggregation in hover Info

Categories

Resources