I am trying to create a list of objects that holds data about professional golfers. The data points are golfer name and putting percentages from different distances. I want to sort this list of objects by name once all the data has been entered for every player object. The list of these objects is called PlayerNumber. When I try to sort PlayerNumber by the attribute 'name', I get an error stating that 'int' has no attribute 'name', and I am not sure why PlayerNumber is being referred to as an integer and not a list.
Any help would be appreciated. Here is the code:
import operator
import numpy as np
import statistics
import matplotlib.pyplot as plt
from colour import Color
from bs4 import BeautifulSoup
import urllib3
############### ACCESS WEBPAGES ####################
def makeSoup(url):
    http = urllib3.PoolManager()
    response = http.request('GET', url)
    soupdata = BeautifulSoup(response.data)
    return soupdata

siteURL = []
for i in range(7):
    siteURL.append(i)
siteURL[0] = ''
siteURL[1] = 'http://www.pgatour.com/stats/stat.408.html' #>25
siteURL[2] = 'http://www.pgatour.com/stats/stat.407.html' #20-25
siteURL[3] = 'http://www.pgatour.com/stats/stat.406.html' #15-20
siteURL[4] = 'http://www.pgatour.com/stats/stat.405.html' #10-15
siteURL[5] = 'http://www.pgatour.com/stats/stat.404.html' #5-10
siteURL[6] = 'http://www.pgatour.com/stats/stat.02427.html' #3-5
############### ACCESS TABLE DATA ###################
def row_number(soupdata):
    for row in table.findAll('tr'):
        tot_row = row
    return tot_row

def parse_table(soupdata):
    currRank = []
    prevRank = []
    playerName = []
    rounds = []
    pctMake = []
    attempts = []
    puttsMade = []
    table = soupdata.find('tbody')
    tot_row = 0
    for row in table.findAll('tr'):
        #for col in row.findAll('td'):
        col = row.find_all('td')
        #column_1 = col[0]
        #currRank.append(column_1)
        #column_2 = col[1]
        #prevRank.append(column_2)
        column_3 = col[2].text
        column_3.strip()
        playerName.append(column_3)
        #column_4 = col[3]
        #rounds.append(column_4)
        column_5 = col[4].text
        pctMake.append(column_5)
        #column_6 = col[5]
        #attempts.append(column_6)
        #column_7 = col[6]
        #puttsMade.append(column_7)
        tot_row += 1
    #return currRank, prevRank, playerName, rounds, pctMake, attempts, puttsMade
    return playerName, pctMake, tot_row
"""
>25 ft: distance1
20-25 ft: distance2
15-20 ft: distance3
10-15 ft: distance4
5-10 ft: distance5
3-5 ft: distance6
"""
############### CLASS DEFINITION ###################
class Player:
    id_list = {}

    def __init__(self, name, id, dis1=0.0, dis2=0.0, dis3=0.0, dis4=0.0, dis5=0.0, dis6=0.0):
        self.name = name
        self.dis1 = dis1
        self.dis2 = dis2
        self.dis3 = dis3
        self.dis4 = dis4
        self.dis5 = dis5
        self.dis6 = dis6
        self.id = id
        Player.id_list[self.name] = self  # save the name as key and self as the value
    def addDis1(self, distance1):
        self.dis1 = float(distance1)
    def addDis2(self, distance2):
        self.dis2 = float(distance2)
    def addDis3(self, distance3):
        self.dis3 = float(distance3)
    def addDis4(self, distance4):
        self.dis4 = float(distance4)
    def addDis5(self, distance5):
        self.dis5 = float(distance5)
    def addDis6(self, distance6):
        self.dis6 = float(distance6)
    def displayPlayer(self):
        print("Player: ", self.name, '\n'
              ">25 Ft %: ", self.dis1, '\n'
              "20-25 Ft %: ", self.dis2, '\n'
              "15-20 Ft %: ", self.dis3, '\n'
              "10-15 Ft %: ", self.dis4, '\n'
              "5-10 Ft %: ", self.dis5, '\n'
              "3-5 Ft %: ", self.dis6, '\n')
    @classmethod
    def lookup_player_name_by_id(cls, name):
        try:
            return cls.id_list[name]  # return the instance stored under this name
        except KeyError:  # error check for if the name does not exist
            raise KeyError("No user with id %s" % str(name))
############### DATA POPULATION ###################
PlayerNumber=[]
for i in range(0,195):
    PlayerNumber.append(i)
for i in range(1,7):
    soupdata = makeSoup(siteURL[i])
    playerName, pctMake, tot_row = parse_table(soupdata)
    for x in range(0,tot_row):
        #PlayerNumber.append(x)
        name = playerName[x]
        name = name.replace("\xa0", " ")
        name = name.replace("\n", "")
        if i == 1:
            PlayerNumber[x] = Player(name, x)
            Player.addDis1(PlayerNumber[x],pctMake[x])
        if i == 2:
            val = Player.lookup_player_name_by_id(name)
            Player.addDis2(PlayerNumber[val.id],pctMake[x])
        if i == 3:
            val = Player.lookup_player_name_by_id(name)
            Player.addDis3(PlayerNumber[val.id],pctMake[x])
        if i == 4:
            val = Player.lookup_player_name_by_id(name)
            Player.addDis4(PlayerNumber[val.id],pctMake[x])
        if i == 5:
            val = Player.lookup_player_name_by_id(name)
            Player.addDis5(PlayerNumber[val.id],pctMake[x])
        if i == 6:
            val = Player.lookup_player_name_by_id(name)
            Player.addDis6(PlayerNumber[val.id],pctMake[x])
PlayerNumber.sort(key = operator.attrgetter("name"))
#PlayerNumber[2].displayPlayer()
I'm using Python 3.4 with the Spyder IDE. I'm relatively new to Python, as an FYI.
Thanks!
It isn't that PlayerNumber is being referred to as an integer, but rather that PlayerNumber is a list of integers, and every element of that list (an integer) doesn't have an attribute "name", which sort() is trying to access (in order to sort them).
Edit:
To elaborate, the second to last line in your sample:
PlayerNumber.sort(key = operator.attrgetter("name"))
is trying to sort PlayerNumber using the key function operator.attrgetter("name"), which means it must call that function on each element of PlayerNumber to work out its position in the sorted list. That is why it ends up trying to grab a .name attribute from the integers in PlayerNumber.
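One way to avoid the error (a minimal, stripped-down sketch, not your full scraping code; the player names here are made up) is to never pre-fill the list with integers and instead append only Player objects, so every element has a .name attribute by the time you sort:

import operator

class Player:
    def __init__(self, name, id):
        self.name = name
        self.id = id

players = []  # start empty instead of pre-filling with range(0, 195)
for i, name in enumerate(["Jordan Spieth", "Rory McIlroy"]):  # hypothetical names
    players.append(Player(name, i))  # every element is a Player, never a bare int

players.sort(key=operator.attrgetter("name"))  # works: every element has .name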
I am having trouble getting my class pickled. [This is an example of my function that creates a spike class with certain attributes.]
def load_pt(ptname, data_directory):
    """
    input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
            values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    """
    val = mat73.loadmat(data_directory + '/values/values_{}.mat'.format(ptname))
    val2 = val['values_all']
    select_spikes = loadmat(data_directory + '/randi/randi_{}.mat'.format(ptname))
    select_spikes = select_spikes['select_spikes']
    ch_labels = loadmat(data_directory + '/chlabels/chlabels_{}.mat'.format(ptname))
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)
    global spike
    class spike:
        values = val2
        select=select_spikes
        chlabels = ch_labels
        fs = fs_all
        soz = clean_SOZ_chlabels[whichpt[0]]
    return spike
I understand that the code requires some global effect. I'm fairly new to coding and would love some pointers.
The error I get is:
AttributeError: Can't pickle local object 'load_pt.<locals>.spike'
Things I've tried:
add a global spike before initializing my class object in my load_pt function (did not work)
get error: Can't get attribute 'spike' on <module 'ied_functions' from 'directory_name'
You need to instantiate the class Spike first before you can return it.
To do this the class does not need to be global. After defining the class instantiate it by writing spike = Spike(). After that, you can return the instance of spike that you just created with return spike.
Into the pickle function you now need to pass load_pt and not load_pt.spike. Now you are passing the function load_pt which returns an instance of the class Spike. load_pt.spike references a local variable inside of load_pt which was never created.
See the entire code here:
import pickle
def load_pt(ptname, data_directory):
    # input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    # output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
    # values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    val = mat73.loadmat(data_directory + '/values/values_{}.mat'.format(ptname))
    val2 = val['values_all']
    select_spikes = loadmat(data_directory + '/randi/randi_{}.mat'.format(ptname))
    select_spikes = select_spikes['select_spikes']
    ch_labels = loadmat(data_directory + '/chlabels/chlabels_{}.mat'.format(ptname))
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)

    class Spike:
        values = val2
        select = select_spikes
        chlabels = ch_labels
        fs = fs_all
        soz = clean_SOZ_chlabels[whichpt[0]]

    spike = Spike()
    return spike
pickle.dump(load_pt, open( "spike.pkl", "wb" ))
Hope this helps! :)
Edit:
Also, I would follow @MichaelButscher's advice:
It is unusual to define a specialized class in a function. Probably you only need an object of the class holding the data as instance attributes you are currently storing as class attributes. In this case you can define a generic class at global level and create an object in the function.
That might look something like this:
import pickle
def load_pt(ptname, data_directory):
    # input: ptname, a string containing the name of the patient you want to load. example: 'ABC123'
    # output: object: spike ---- contains: List of 1000 random spikes: spike.select and their subsequent:
    # values: spike.values, chlabels: spike.chlabels, fs: spike.fs, soz channels: spike.soz
    val2 = val['values_all']
    select_spikes = select_spikes['select_spikes']
    ch_labels = ch_labels['ch_labels_all']
    fs_all = loadmat(data_directory + '/fs/fs_{}.mat'.format(ptname))
    fs_all = fs_all['fs_all']
    SOZ_chlabels = pd.read_csv(data_directory + '/pt_data/SOZ_channels.csv')
    pt_all = pd.read_csv(data_directory + '/pt_data/ptname_all.csv')
    pt_name = ("'{}'".format(ptname))
    whichpt = pt_all.index[pt_all['ptname'] == pt_name].tolist()
    clean_SOZ_chlabels = prep_clean_soz(SOZ_chlabels)
    spike = Spike(val2, select_spikes, ch_labels, fs_all, clean_SOZ_chlabels[whichpt[0]])
    return spike

class Spike():
    def __init__(self, values, select, chlabels, fs, soz):
        self.values = values
        self.select = select
        self.chlabels = chlabels
        self.fs = fs
        self.soz = soz
spike = load_pt('pt_name', 'data_directory')
pickle.dump(spike, open( "spike.pkl", "wb" ))
print(pickle.load(open( "spike.pkl", "rb" )))
I inherited a bit of Python code and I have no background in it. I am getting an UnboundLocalError, and it is probably something really silly on my part.
UnboundLocalError Traceback (most recent call last)
Input In [1], in <cell line: 372>()
368 print('\n----------------------------------------------------------------------\n')
369 ##############################################################################
--> 372 main()
Input In [1], in main()
319 Gender = p.getGender()
320 StateRes = p.getStateRes()
--> 321 children = immunte(Fname, Lname, DOB, Gender, driver)
323 if children == []:
324 not_found += 1
Input In [1], in immunte(Fname, Lname, DOB, Gender, driver)
204 except WebDriverException:
205 al = []
--> 207 return al
UnboundLocalError: local variable 'al' referenced before assignment
I have looked at this for a few days now and I can't seem to find an answer to this problem, even though it is likely simple. It seems a solution to this error is a global keyword somewhere, but I am not sure if that applies here, as every time I tried to apply global to al = [] I got an error or the same result. Any help is appreciated, thank you.
# Imports
import csv
import datetime
import os
import os.path
import time
import pandas as pd
from dateutil.parser import parse
from pandas import DataFrame
from selenium import webdriver
from selenium.common.exceptions import (NoSuchElementException,
WebDriverException)
from selenium.webdriver.support.select import Select
from selenium.webdriver.chrome.service import Service
##############################################################################
# Classes
class Person(object):
def __init__(self, measYr, MEMID, MIDSK, fname, lname, LNMSFX, DRB, GDR, STRES, meas):
self.measYr = measYr
self.MEMID = MEMID
self.MIDSK = MIDSK
self.fname = fname
self.lname = lname
self.LNMSFX = LNMSFX
self.DRB = DRB
self.GDR = GDR
self.STRES = STRES
self.meas = meas
def GTMESYR(self):
return self.measYr
def GTMEMSKY(self):
return self.MIDSK
def GTMEMID(self):
return self.MEMID
def GTFSNM(self):
return self.fname
def GTLSNM(self):
return self.lname
def GTLSTNMSF(self):
return self.LNMSFX
def GTDRB(self):
return self.DRB
def GTGDR(self):
return self.GDR
def getStateRes(self):
return self.STRES
def getMeas(self):
return self.meas
###############################################################################
# Function
def is_date(string, fuzzy=False):
try:
parse(string, fuzzy=fuzzy)
return True
except ValueError:
return False
def immunte(Fname, Lname, DRB, GDR, driver):
# work on search button
driver.find_element_by_xpath("//*[#id='edittesttest']").click()
# work on
lastname = driver.find_element_by_id("LM")
lastname.clear()
lastname.send_keys(Lname)
# work on
firstname = driver.find_element_by_id("FN")
firstname.clear()
firstname.send_keys(Fname)
# work on
birthdate = driver.find_element_by_id("DRB")
birthdate.clear()
birthdate.send_keys(DRB)
# work on advanced search button to input GDR
try:
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
# work on GDR selection button
obj = Select(driver.find_element_by_name("OSC"))
if GDR == 'W':
obj.select_by_index(2)
elif GDR == 'S':
obj.select_by_index(1)
else:
obj.select_by_index(3)
# work on search button
driver.find_element_by_name("cmdFindClient").click()
# two scenarios could emerge as a search result: 1, not found 2, the found
if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
al = []
elif "the found" in driver.find_element_by_id("queryResultsForm").text:
# work on button
driver.find_element_by_xpath(
"//*[#id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
# work on pt button
driver.find_element_by_id("redirect1").click()
# work on getting rid of opt out - header
header = driver.find_elements_by_class_name("large")[1].text
if "Access Restricted" in header:
print(Fname+' '+Lname+' '+" Opt out")
al = []
elif "Information" in header:
# find the first line
first = driver.find_element_by_xpath(
"//*[#id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]").text
if (first == None):
al = []
else:
even = driver.find_elements_by_class_name("evenRow")
odd = driver.find_elements_by_class_name("oddRow")
o = []
e = []
for value in odd:
o.append(value.text)
for value in even:
e.append(value.text)
length = len(o)
i = 0
al = []
# merge odd and even row together and remove the row marked with complete
while i < length:
al.append(e[i])
al.append(o[i])
i = i+1
# parse each row of information with a comma, add group name for row that are without one
for x in range(len(al)):
if is_date(al[x][1:10]):
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
al[x] = group + ',' + al[x][2:]
else:
al[x] = al[x].replace(' ', ',')
al[x] = al[x].replace(',of,', ' of ')
g = al[x].split(',', 1)
group = g[0]
# work on returning to home page
driver.find_element_by_xpath(
"//*[#id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
except NoSuchElementException:
al = []
except WebDriverException:
al = []
return al
def main():
# Welcome message and input info
print('\nThis is the test.')
print('You will be prompted to type .')
print('If you need to exit the script and stop its process press \'CTRL\' + \'C\'.')
file = input("\nEnter file name: ")
user = input("\nEnter username: ")
pw = input("\nEnter password: ")
date = str(datetime.date.today())
# output file
fileOutputName = 'FILELIST' + \
date.replace('-', '_') + '.csv'
fileOutputNameNotFound = 'NOTFOUNDFILELIST' + \
date.replace('-', '_') + '.csv'
fileOutput = open(fileOutputName, 'w')
fileOutputNotFound = open(fileOutputNameNotFound, 'w')
fileOutput.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
fileOutputNotFound.write('MEAS_YR,MEMLFIDSK,MEMLFID,MEMB_FRST_NM,MEMLSTNM,MEMB_SUFFIX,' +
'DRB,GNDR,RSDNC_STATE,IMUN_RGSTRY_STATE,VCCN_GRP,VCCN_ADMN_DT,DOSE_SERIES,' +
'BRND_NM,DOSE_SIZE,RCTN\n')
# If the file exists
try:
os.path.isfile(file)
except:
print('File Not Found\n')
df = pd.read_excel(file)
# create array of People objects and member ID
peopleArray = []
memberIdArray = []
df.dropna()
total = len(df)
not_found = 0
found = 0
# assign each record in the data frame into Person class
for i in range(total):
measYr = str(df.loc[i, "MEAS_YR"])
MEMID = str(df.loc[i, "MEMLFID"])
MIDSK = str(df.loc[i, "MEMLFIDSK"])
fname = str(df.loc[i, "MEMLFID"])
lname = str(df.loc[i, "MEMLSTNM"])
inputDate = str(df.loc[i, "DRB"])
# If date is null then assign an impossible date
if not inputDate:
DRB = '01/01/1900'
if '-' in inputDate:
DRB = datetime.datetime.strptime(
inputDate, "%Y-%m-%d %H:%M:%S").strftime('%m/%d/%Y')
else:
DRB = datetime.datetime.strptime(
str(df.loc[i, "DRB"]), '%m/%d/%Y').strftime('%m/%d/%Y')
GDR = str(df.loc[i, "GDR"])
STRES = str(df.loc[i, "STATE_RES"])
meas = str(df.loc[i, "MEAS"])
p = Person(measYr, MEMID, MIDSK, fname, lname,
LNMSFX, DRB, GDR, STRES, meas)
# append array
m = df.loc[i, "MEMLFID"]
if (m not in memberIdArray):
peopleArray.append(p)
memberIdArray.append(m)
# work on setting up driver for md immunet - mac forward slash/windows double backward slash
PATH = os.getcwd()+'\\'+'chromedriver'
s = Service(PATH)
driver = webdriver.Chrome(service = s)
driver.get("https://www.wow2.pe.org/prd-IR/portalmanager.do")
# work on login ID
username = driver.find_element_by_id("userField")
username.clear()
username.send_keys(user)
# work on password
password = driver.find_element_by_name("password")
password.clear()
password.send_keys(pw)
# work on getting to home page - where loop will start
driver.find_element_by_xpath(
"//*[#id='loginButtonForm']/div/div/table/tbody/tr[3]/td[1]/input").click()
for n in range(total):
p = peopleArray[n]
recordToWrite = ''
print('Looking up: ' + str(n)+' ' +
p.GTLSNM() + ', ' + p.GTFSNM())
MeasYr = p.GTMESYR()
MIDSK = p.GTMEMSKY()
MEMID = p.GTMEMID()
Fname = p.GTFSNM()
Lname = p.GTLSNM()
DRB = str(p.GTDRB())
GDR = p.GTGDR()
STRES = p.getStateRes()
children = immunte(Fname, Lname, DRB, GDR, driver)
if children == []:
not_found += 1
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + Fname + \
','+Lname + ',' + ' ' + ','+DRB+','+GDR+','+STRES+','+'MD'
fileOutputNotFound.write(recordToWrite + '\n')
elif children != []:
found += 1
for x in range(len(children)):
data_element = children[x].split(",")
# if the admin date is not valid
if is_date(data_element[1]) and is_date(data_element[3]):
children[x] = ''
elif is_date(data_element[1]) and data_element[2] == 'NOT' and data_element[3] == 'VALID':
children[x] = ''
elif is_date(data_element[1]) and is_date(data_element[3]) == False:
if data_element[5] != 'No':
data_element[4] = data_element[5]
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
data_element[5] = ''
children[x] = ','.join(data_element[0:6])
else:
children[x] = ''
for x in range(len(children)):
if children[x] != '':
recordToWrite = MeasYr+','+MIDSK+','+MEMID+',' + \
Fname+','+Lname + ','+DRB+','+GDR+','+STRES+','+'MD'
recordToWrite = recordToWrite+','+children[x]
fileOutput.write(recordToWrite + '\n')
n = +1
fileOutput.close()
fileOutputNotFound.close()
print('\n--------------------------------OUTPUT--------------------------------')
print("Script completed.")
##############################################################################
main()
You can try this in the 'immunte' function:
def immunte(Fname, Lname, DRB, GDR, driver):
    al = []
    # work on search button
    driver.find_element_by_xpath("//*[@id='edittesttest']").click()
    # work on
    lastname = driver.find_element_by_id("LM")
    lastname.clear()
    lastname.send_keys(Lname)
    # work on
    firstname = driver.find_element_by_id("FN")
    firstname.clear()
    firstname.send_keys(Fname)
    # work on
    birthdate = driver.find_element_by_id("DRB")
    birthdate.clear()
    birthdate.send_keys(DRB)
    # work on advanced search button to input GDR
    try:
        driver.find_element_by_xpath(
            "//*[@id='queryResultsForm']/table/tbody/tr/td[2]/table/tbody/tr[3]/td/table/tbody/tr[2]/td[5]/input").click()
        # work on GDR selection button
        obj = Select(driver.find_element_by_name("OSC"))
        if GDR == 'W':
            obj.select_by_index(2)
        elif GDR == 'S':
            obj.select_by_index(1)
        else:
            obj.select_by_index(3)
        # work on search button
        driver.find_element_by_name("cmdFindClient").click()
        # two scenarios could emerge as a search result: 1, not found 2, the found
        if "No were found for the requested search criteria" in driver.find_element_by_id("queryResultsForm").text:
            return al
        if "the found" in driver.find_element_by_id("queryResultsForm").text:
            # work on button
            driver.find_element_by_xpath(
                "//*[@id='queryResultsForm']/table[2]/tbody/tr[2]/td[2]/span/label").click()
            # work on pt button
            driver.find_element_by_id("redirect1").click()
            # work on getting rid of opt out - header
            header = driver.find_elements_by_class_name("large")[1].text
            if "Access Restricted" in header:
                print(Fname+' '+Lname+' '+" Opt out")
                return al
            if "Information" in header:
                # find the first line
                first = driver.find_element_by_xpath(
                    "//*[@id='container']/table[3]/tbody/tr/td[2]/table[2]/tbody/tr/td/table/tbody/tr[1]/td/table/tbody/tr[5]/td[1]"
                ).text
                if (first == None):
                    return al
                even = driver.find_elements_by_class_name("evenRow")
                odd = driver.find_elements_by_class_name("oddRow")
                o = []
                e = []
                for value in odd:
                    o.append(value.text)
                for value in even:
                    e.append(value.text)
                length = len(o)
                i = 0
                al = []
                # merge odd and even row together and remove the row marked with complete
                while i < length:
                    al.append(e[i])
                    al.append(o[i])
                    i = i+1
                # parse each row of information with a comma, add group name for row that are without one
                for x in range(len(al)):
                    if is_date(al[x][1:10]):
                        al[x] = al[x].replace(' ', ',')
                        al[x] = al[x].replace(',of,', ' of ')
                        al[x] = group + ',' + al[x][2:]
                    else:
                        al[x] = al[x].replace(' ', ',')
                        al[x] = al[x].replace(',of,', ' of ')
                        g = al[x].split(',', 1)
                        group = g[0]
                # work on returning to home page
                driver.find_element_by_xpath(
                    "//*[@id='headerMenu']/table/tbody/tr/td[2]/div/a").click()
    except NoSuchElementException:
        pass
    except WebDriverException:
        pass
    return al
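For what it's worth, the underlying rule is that any name assigned somewhere inside a function is treated as local to that whole function, so if none of the branches that assign al actually run, the final return al fails. A stripped-down, hypothetical reproduction (not the scraper code) of the broken and fixed patterns:

def lookup(found):
    try:
        if found:
            al = ["result"]  # 'al' is only bound on this branch
    except Exception:
        al = []
    return al  # UnboundLocalError when found is False and nothing raised

def lookup_fixed(found):
    al = []  # bind 'al' up front, as in the version above
    try:
        if found:
            al = ["result"]
    except Exception:
        pass
    return al

print(lookup_fixed(False))  # prints []
try:
    print(lookup(False))
except UnboundLocalError as exc:
    print("broken pattern:", exc)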
This is the output of my Python script so far.
Excel Table
The vertical axis of the table lists road names. The horizontal axis lists dates. The values indicate whether a road was under construction at the time and why. I'd like to make a line graph that groups the dates by year (2017, 2018, 2019, etc.) and plots the longest amount of time within those groups that a road was under construction, as well as the average amount for the whole year. I'm a complete novice in Excel and don't know how to leverage its features to achieve my goal, though I suspect there may be built-in functions that do what I want without much difficulty. Any suggestions on how I can achieve my desired output would be much appreciated. EDIT: It was suggested that I post my code so far.
import re
import time
startTime = time.time()
import collections
import xlsxwriter as xlswr
import scipy.spatial as spy
from itertools import islice
from itertools import groupby
from natsort import natsorted
from functools import partial
from collections import Counter
from datetime import date as DATE
from indexed import IndexedOrderedDict
from multiprocessing.dummy import Pool as ThreadPool
import multiprocessing as mp
workBook = xlswr.Workbook("testfix.xlsx")
cell_format = workBook.add_format()
format1 = workBook.add_format({'num_format': 'mm/dd/yy'})
sheet = workBook.add_worksheet()
def to_raw(string):
return fr"{string}"
def cvrt(x):
ans = re.split(r'(\d+)(?!.*\d)', x)
return int(ans[1])
def indexer(s):
pattern = re.compile(r'I, [0-9]+, ')
gm = re.split(pattern, s);
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def int2Date(x):
string = str(x)
Y = int(string[0:4])
M = int(string[4:6])
D = int(string[6:8])
return DATE(Y,M,D)
def dDelta(x, y):
string1 = str(x)
string2 = str(y)
Y1 = int(string1[0:4])
M1 = int(string1[4:6])
D1 = int(string1[6:8])
Y2 = int(string2[0:4])
M2 = int(string2[4:6])
D2 = int(string2[6:8])
f_date = DATE(Y1,M1,D1)
l_date = DATE(Y2,M2,D2)
delta = l_date - f_date
if isinstance(y, int):
return float(int((delta.days)/30.44))
else:
return int((delta.days)/30.44)
def Book(path):
file = open(path,'r')
lines = file.readlines()
file.close()
book = IndexedOrderedDict()
for line in lines:
if re.match("I", line):
IDs = indexer(line)[1]
if re.match(" 0.00,", line):
rID = line
#"GM_FINAL_AUTH,0,[1-9]"
if re.search("GM_FINAL_AUTH,0,[1-9]", line):
book.update({(rID, line): to_raw(IDs)})
return sort_book(book)
def dUpdate(dic, key, value):
return dic.update({(key[0], "GM_FINAL_AUTH,0,0"): value})
def valSplt(s):
pattern = re.compile(r'(\d+)')
gm = re.split(pattern, s)
values = s.rsplit(gm[1])
gm = gm[1]
values[1] = gm
return values
def sort_book(book):
book = natsorted([value, key] for key, value in book.items())
book = IndexedOrderedDict((data[1], data[0]) for data in book)
return book
def alph_order(word1, word2):
for i in range(min(len(word1), len(word2))):
if ord(word1[i]) == ord(word2[i]):
pass
elif ord(word1[i]) > ord(word2[i]):
return word2
else:
return word1
return word1
def read(cpdm, date_list):
sCnt = [0] * len(cpdm)
lowest_number = 999999999999
terminationCondition = [True] * len(cpdm)
saved_results = [0] * len(cpdm)
current_prefix = None
cnt = 0
while any(terminationCondition) is True:
saved_results = [0] * len(cpdm)
last_prefix = None
lowest_number = 999999999999
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
ID = cpdm[dicIdx].values()[dicVal]
# print(entry)
current_prefix, road_number = valSplt(ID)
road_number = int(road_number)
if last_prefix is None:
last_prefix = current_prefix
higherOrder_prefix = alph_order(last_prefix, current_prefix)
# print('check:',[higherOrder_prefix, last_prefix, current_prefix])
if current_prefix == higherOrder_prefix:
if current_prefix != last_prefix:
lowest_number = road_number
last_prefix = current_prefix
elif road_number < lowest_number:
lowest_number = road_number
last_prefix = current_prefix
for dicIdx, dicVal in enumerate(sCnt):
if dicVal < len(cpdm[dicIdx]):
# print(dicIdx, dicVal, len(cpdm[dicIdx]))
ID = cpdm[dicIdx].values()[dicVal]
VALUE = cpdm[dicIdx].keys()[dicVal]
# print(entry)
road_name, road_number = valSplt(ID)
road_number = int(road_number)
if road_name == last_prefix and lowest_number == road_number:
saved_results[dicIdx] = [ID, VALUE[1], date_list[dicIdx], VALUE[0]]
if dicVal < len(cpdm[dicIdx]):
sCnt[dicIdx] += 1
else:
terminationCondition[dicIdx] = False
else:
terminationCondition[dicIdx] = False
for rst in range(len(saved_results)):
if saved_results[rst] == 0:
pass
else:
sheet.write(cnt+1, 0, str(saved_results[rst][0]))
sheet.write(cnt+1, rst+1, cvrt(saved_results[rst][1]))
#sheet.write(cnt+1, 2*et+3, int2Date(saved_results[et][2]), format1)
#sheet.write(cnt+1, 0, saved_results[rst][3])
cnt += 1
def main():
# 2018 MAPS
path1 = "W:\\Scripting\\2018\\DBData_84577881.txt"
path2 = "W:\\Scripting\\2018\\DBData_84639568.txt"
path3 = "W:\\Scripting\\2018\\DBData_84652483.txt"
path4 = "W:\\Scripting\\2018\\DBData_84670490.txt"
# 2019 MAPS
path5 = "W:\\Scripting\\2019\\DBData_84706383.txt"
path6 = "W:\\Scripting\\2019\\DBData_84715201.txt"
path7 = "W:\\Scripting\\2019\\DBData_84743195.txt"
path8 = "W:\\Scripting\\2019\\DBData_84777742.txt"
path9 = "W:\\Scripting\\2019\\DBData_84815446.txt"
path10 = "W:\\Scripting\\2019\\DBData_84835743.txt"
# 2020 MAPS
path11 = "W:\\Scripting\\2020\\DBData_84882849.txt"
path12 = "W:\\Scripting\\2020\\DBData_84966202.txt"
path13 = "W:\\Scripting\\2020\\DBData_84988789.txt"
p_list = [path1, path2, path3, path4, path5, path6, path7,
path8, path9, path10, path11, path12, path13]
pool = mp.Pool(mp.cpu_count())
CPDM = pool.map(Book, p_list)
pool.close()
#pool.join()
date_list = [20180809, 20180913, 20181011, 20181204, 20190222, 20190325,
20190501, 20190628, 20190815, 20190925, 20200207, 20200501, 20200617]
#CPDM = [b1, b2, b3, b4, b5, b6, b7, b8, b9, b10, b11, b12, b13]
for i in CPDM:
print(len(i))
#sheet.write("A1", "Lat Long")
sheet.write("A1", "ID")
#for i in range(len(CPDM)):
cn = 0
for i in date_list:
#sheet.write(0, 3*i+1, "ID" + str(i+1))
sheet.write(0, cn+1, int2Date(i), format1)
cn += 1
#sheet.write(0, 2*i+3, "Date" + str(i+1))
read(CPDM, date_list)
workBook.close()
if __name__ == "__main__":
main()
executionTime = (time.time() - startTime)
print('Execution time in minutes: ' + str(executionTime/60))
Long story short, what you want is not exactly possible. Your data contains spot measurements, so what happened in between? Or after? Was the road under construction or not? This makes it impossible to calculate an accurate number of days that the road was under construction.
It is possible to do something that approximates what you want, but that will require some choices from your side. For example, if you measure that the road is under construction on 08/15/2019 but not anymore on 05/01/2020, do you count all the days between those 2 dates as closed? Or only until new years?
To help you get started I've added a little script that does some formatting on your data. It should give you an idea of how to handle the data.
import pandas
import plotly.express as px
# Read the Excel file
df = pandas.read_excel("./test.xlsx", index_col="ID")
# Flip the dataframe (dates should be on the index)
df = df.transpose()
# Fill any empty cells with 0
df = df.fillna(0)
# Combine columns with the same name
df = df.groupby(df.columns, axis=1).agg(lambda column: column.max(axis=1))
# Make sure the dates are sorted
df = df.sort_index()
# Create a list to hold all the periods per road
roads = []
for road_name in df.columns:
    # Group by consecutive 1's
    groups = df.loc[df[road_name] == 1, road_name].groupby((df[road_name] != 1).cumsum())
    # Every group denotes a period for which the road was under construction
    for _, group in groups:
        # Get the start and finish for each group
        roads.append({
            "road": road_name,
            "start": group.index[0],
            "finish": group.index[-1] + pandas.Timedelta(1, unit="D"),  # Add one day because groups with same start and finish will not be visible on the plot
        })
# Convert back to a dataframe
roads_df = pandas.DataFrame(roads)
# Create a Gantt chart with Plotly (NOTE: you'll need version 4.9+ of Plotly)
fig = px.timeline(roads_df, x_start="start", x_end="finish", y="road")
fig.update_yaxes(autorange="reversed") # otherwise tasks are listed from the bottom up
fig.show()
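If you also want the numbers from the question (the longest and the average construction time per year), a rough follow-up on the roads_df built above could look like the sketch below. It assumes you count each period in the year it starts, which is exactly the kind of choice mentioned earlier that you have to make yourself:

# Duration of each construction period in days
roads_df["days"] = (roads_df["finish"] - roads_df["start"]).dt.days
# Count every period in the year it started (one possible convention)
roads_df["year"] = roads_df["start"].dt.year
# Longest single period and average period length per year
per_year = roads_df.groupby("year")["days"].agg(longest="max", average="mean")
print(per_year)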
I have code that calculates movie similarities given a dataset (ratings.dat and movies.dat). The code, however, is written in Python 2.7.
I tried converting the code to Python 3 but was unable to get the desired results. I need some expert help to review whether there are any mistakes in the code.
Below is the code area which I need to convert to Python 3:
def makePairs((user, ratings)):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))

def filterDuplicates( (userID, ratings) ):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return movie1 < movie2
and also this
# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(lambda((pair,sim)): \
(pair[0] == movieID or pair[1] == movieID) \
and sim[0] > scoreThreshold and sim[1] > coOccurenceThreshold)
# Sort by quality score.
results = filteredResults.map(lambda((pair,sim)): (sim, pair)).sortByKey(ascending = False).take(10)
The complete code follows; I run it with:
spark-submit mycodefile.py 50
Here is the code in Python 2.7:
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt
def loadMovieNames():
movieNames = {}
with open("movies.dat") as f:
for line in f:
fields = line.split("::")
movieNames[int(fields[0])] = fields[1].decode('ascii', 'ignore')
return movieNames
def makePairs((user, ratings)):
(movie1, rating1) = ratings[0]
(movie2, rating2) = ratings[1]
return ((movie1, movie2), (rating1, rating2))
def filterDuplicates( (userID, ratings) ):
(movie1, rating1) = ratings[0]
(movie2, rating2) = ratings[1]
return movie1 < movie2
def computeCosineSimilarity(ratingPairs):
numPairs = 0
sum_xx = sum_yy = sum_xy = 0
for ratingX, ratingY in ratingPairs:
sum_xx += ratingX * ratingX
sum_yy += ratingY * ratingY
sum_xy += ratingX * ratingY
numPairs += 1
numerator = sum_xy
denominator = sqrt(sum_xx) * sqrt(sum_yy)
score = 0
if (denominator):
score = (numerator / (float(denominator)))
return (score, numPairs)
conf = SparkConf()
sc = SparkContext(conf = conf)
print("\nLoading movie names...")
nameDict = loadMovieNames()
data = sc.textFile("ratings.dat")
# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split("::")).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))
# Emit every movie rated together by the same user.
# Self-join to find every combination.
ratingsPartitioned = ratings.partitionBy(100)
joinedRatings = ratingsPartitioned.join(ratingsPartitioned)
# At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))
# Filter out duplicate pairs
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
# Now key by (movie1, movie2) pairs.
moviePairs = uniqueJoinedRatings.map(makePairs).partitionBy(100)
# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()
# We now have (movie1, movie2) = > (rating1, rating2), (rating1, rating2) ...
# Can now compute similarities.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).persist()
# Save the results if desired
moviePairSimilarities.sortByKey()
moviePairSimilarities.saveAsTextFile("movie-sims")
# Extract similarities for the movie we care about that are "good".
if (len(sys.argv) > 1):
scoreThreshold = 0.97
coOccurenceThreshold = 1000
movieID = int(sys.argv[1])
# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(lambda((pair,sim)): \
(pair[0] == movieID or pair[1] == movieID) \
and sim[0] > scoreThreshold and sim[1] > coOccurenceThreshold)
# Sort by quality score.
results = filteredResults.map(lambda((pair,sim)): (sim, pair)).sortByKey(ascending = False).take(10)
print("Top 10 similar movies for " + nameDict[movieID])
for result in results:
(sim, pair) = result
# Display the similarity result that isn't the movie we're looking at
similarMovieID = pair[0]
if (similarMovieID == movieID):
similarMovieID = pair[1]
print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))
Any help is much appreciated.
Regards.
What I have already done is convert this code to a Python 3 equivalent, as follows, but I am unable to get the desired results.
import sys
from pyspark import SparkConf, SparkContext
from math import sqrt
def loadMovieNames():
movieNames = {}
with open("movies.dat") as f:
for line in f:
fields = line.split("::")
movieNames[int(fields[0])] = fields[1] #.decode('ascii', 'ignore')
return movieNames
def makePairs(*ratings):
for t in ratings:
(movie1, rating1) = t[1][0]
(movie2, rating2) = t[1][1]
return ((movie1, movie2), (rating1, rating2))
def filterDuplicates(*ratings):
for t in ratings:
(movie1, rating1) = t[1][0]
(movie2, rating2) = t[1][1]
return movie1 < movie2
def computeCosineSimilarity(ratingPairs):
numPairs = 0
sum_xx = sum_yy = sum_xy = 0
for ratingX, ratingY in ratingPairs:
sum_xx += ratingX * ratingX
sum_yy += ratingY * ratingY
sum_xy += ratingX * ratingY
numPairs += 1
numerator = sum_xy
denominator = sqrt(sum_xx) * sqrt(sum_yy)
score = 0
if (denominator):
score = (numerator / (float(denominator)))
return (score, numPairs)
conf = SparkConf().setMaster("local[*]").setAppName("MovieSimilarities")
sc = SparkContext(conf = conf)
print("\nLoading movie names...")
nameDict = loadMovieNames()
print("\nLoading movie ratings...")
data = sc.textFile("ratings100.dat")
print("\nDone..")
# Map ratings to key / value pairs: user ID => movie ID, rating
ratings = data.map(lambda l: l.split("::")).map(lambda l: (int(l[0]), (int(l[1]), float(l[2]))))
# Emit every movie rated together by the same user.
# Self-join to find every combination.
ratingsPartitioned = ratings.partitionBy(100)
joinedRatings = ratingsPartitioned.join(ratingsPartitioned)
#joinedRatings = ratings.join(ratings)
# At this point our RDD consists of userID => ((movieID, rating), (movieID, rating))
# Filter out duplicate pairs
uniqueJoinedRatings = joinedRatings.filter(filterDuplicates)
# Now key by (movie1, movie2) pairs.
moviePairs = uniqueJoinedRatings.map(makePairs).partitionBy(100)
# We now have (movie1, movie2) => (rating1, rating2)
# Now collect all ratings for each movie pair and compute similarity
moviePairRatings = moviePairs.groupByKey()
# We now have (movie1, movie2) = > (rating1, rating2), (rating1, rating2) ...
# Can now compute similarities.
moviePairSimilarities = moviePairRatings.mapValues(computeCosineSimilarity).persist()
# Save the results if desired
moviePairSimilarities.sortByKey()
moviePairSimilarities.saveAsTextFile("movie-sims")
# Extract similarities for the movie we care about that are "good".
if (len(sys.argv) > 1):
scoreThreshold = 0.9
coOccurenceThreshold = 1000
movieID = int(sys.argv[1])
# Filter for movies with this sim that are "good" as defined by
# our quality thresholds above
filteredResults = moviePairSimilarities.filter(lambda pairSim: (pairSim[0][0] == movieID or pairSim[0][1] == movieID) and pairSim[1][0] > scoreThreshold and pairSim[1][1] > coOccurenceThreshold)
# Sort by quality score.
results = filteredResults.map(lambda pairSim: (pairSim[1], pairSim[0])).sortByKey(ascending = False).take(10)
print("Top 10 similar movies for " + str(nameDict[movieID]))
for result in results:
(sim, pair) = result
# Display the similarity result that isn't the movie we're looking at
similarMovieID = pair[0]
if (similarMovieID == movieID):
similarMovieID = pair[1]
print(nameDict[similarMovieID] + "\tscore: " + str(sim[0]) + "\tstrength: " + str(sim[1]))
Below is the expected result, which should show the top 10 similar movie results.
Top 10 similar movies for Wizard of Oz, The (1939)
Toy Story (1995) score: 661 strength: 1545
Some Other Movie score: 594 strength: 720
Another Movie score: 2018 strength: 2804
def f(*tuplex) is not the same as def f((x, y)); it is (more or less) the same as def f(x, y). That is, the first (py3) function receives a list of non-keyword arguments, the second (py2) a single tuple argument. Since you're passing a single element (which happens to be a tuple), tuplex will be a tuple of one element (and the resulting for t in tuplex will only iterate once). You should make it def f(xy), where xy will be your (x, y) tuple.
Your Python 2 code:
def makePairs((user, ratings)):
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))
The actual compatible Python 3 code:
def makePairs(user_ratings):
    _, ratings = user_ratings
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return ((movie1, movie2), (rating1, rating2))
As also mentioned somewhere in the comments, you can replace this whole function by a simple zip call, for example:
>>> a = (('movie1', 'rating1'), ('movie2', 'rating2'))
>>> list(zip(*a))
[('movie1', 'movie2'), ('rating1', 'rating2')]
(You don't need list(...) if you just need to return an iterator, but then the actual contents won't show on the command line; so leave out the call to list(...) unless you get an error about a "zip object" in your actual code.)
The unfortunate parts here are that
- the map method where you use makePairs passes only a function, so you can't specify the asterisk.
- you need to get rid of the first argument, user.
You could probably use the following:
moviePairs = uniqueJoinedRatings.map(lambda x: zip(*x[1])).partitionBy(100)
(untested)
That gets rid of the complete makePairs function, at the cost of some clarity.
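If you'd rather keep a named function for filterDuplicates as well, the same single-argument unpacking shown for makePairs applies there too; a sketch, keeping your original name so the existing filter(filterDuplicates) call still matches:
def filterDuplicates(user_ratings):
    _, ratings = user_ratings
    (movie1, rating1) = ratings[0]
    (movie2, rating2) = ratings[1]
    return movie1 < movie2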
Last tidbit: make_pairs follows the style guide; makePairs is not Python style. The same goes for all the other names in your code. Since you mentioned the word review at the top of your question: that's probably more a matter for Code Review.
Error section --> student_grade_system = StudentGradeSystem(sys.argv[1])
Errored place -->
Traceback (most recent call last):
  File "C:\Users\Daphnie\Desktop\Python_code\12\student_grade\grade_system.py", line 111, in <module>
    main()
  File "C:\Users\Daphnie\Desktop\Python_code\12\student_grade\grade_system.py", line 105, in main
    student_grade_system = StudentGradeSystem(sys.argv[1])
IndexError: list index out of range
Code:
import sys
from student import Student
class StudentGradeSystem(object):
def __init__(self, score_file):
self._score_file = score_file
self._students = []
self._class_avg = 0.0
self._kor_avg = 0.0
self._eng_avg = 0.0
self._math_avg = 0.0
self._register_students()
def _register_students(self):
with open(self._score_file, "rt") as fp:
lines = fp.readlines()
for line in lines:
items = (line.strip()).split(",")
num = items[0]
name = items[1]
kor = int(items[2])
eng = int(items[3])
math = int(items[4])
student = Student(num, name, kor, eng, math)
self._students.append(student)
def _calculate_student_order(self):
temp_students = sorted(self._students, key = lambda x: x.total, reverse = True)
order = 1
for student in temp_students:
student.order = order
order = order + 1
self._students = temp_students
def _calculate_class_avg(self):
total = 0
for student in self._students:
total = total + student.total
self._class_avg = total / len(self._students)
def _calculate_kor_avg(self):
total = 0
for student in self._students:
total = total + student.kor
self._kor_avg = total / len(self._students)
def _calculate_eng_avg(self):
total = 0
for student in self._students:
total = total + student.eng
self._eng_avg = total / len(self._students)
def _calculate_math_avg(self):
total = 0
for student in self._students:
total = total + student.math
self._math_avg = total / len(self._students)
def _calculate_class_information(self):
self._calculate_class_avg()
self._calculate_kor_avg()
self._calculate_eng_avg()
self._calculate_math_avg()
def process(self):
self._calculate_student_order()
self._calculate_class_information()
def output_result(self, output_file):
student_output_format = "번호: {:2}, 이름: {}, 국어: {}, 영어: {}, 수학: {}, 총점: {}, 평균: {:.2f}, 등수: {}\n"
with open(output_file, "wt") as fp:
for student in self._students:
student_output = student_output_format.format(student.num, student.name, student.kor, student.eng,
student.math, student.total, student.avg, student.order)
fp.write(student_output)
fp.write("\n")
fp.write("반 평균: %.2f\n" % self._class_avg)
fp.write("국어 평균: %.2f\n" % self._kor_avg)
fp.write("영어 평균: %.2f\n" % self._eng_avg)
fp.write("수학 평균: %.2f\n" % self._math_avg)
print("성적 처리가 끝났습니다.")
def main():
student_grade_system = StudentGradeSystem(sys.argv[1])
student_grade_system.process()
student_grade_system.output_result(sys.argv[2])
if __name__ == "__main__":
main()
input("end")
What don't you understand exactly? The error message is quite clear: you have an IndexError (you're trying to access a non-existent item in a list, tuple or other similar indexed sequence) at line 105, which is
student_grade_system = StudentGradeSystem(sys.argv[1])
In this line there's only one indexed access - sys.argv[1] - so it's obviously the culprit. Since your program obviously expects some arguments to be passed, it's your responsibility to make sure 1. they are documented and 2. they are actually passed to the program (which is obviously not the case here).
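A small sketch of the usual guard (the file and argument names are taken from your traceback and code above; the usage text is made up) makes that requirement explicit and fails with a readable message instead of an IndexError:

import sys

def main():
    # Check the command-line arguments before indexing sys.argv
    if len(sys.argv) < 3:
        print("usage: python grade_system.py <score_file> <output_file>")
        sys.exit(1)
    student_grade_system = StudentGradeSystem(sys.argv[1])
    student_grade_system.process()
    student_grade_system.output_result(sys.argv[2])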