splitting names by counting commas gives value error

splitting names by counting commas gives value error - python

I have been using this code for this process for around six months and now it is throwing a ValueError: Columns must be same length as key. I haven't changed anything so I am not sure what could be wrong. Basically, I am pulling data from my system and it has names formatted like this FN1, FN2 LN1, LN2 and I need the names to be FN1 LN1 and FN2 LN2. The code runs fine until this last line.
df_gs_recruiter[[f'Recruiter_{j}']] = df_gs_recruiter[f'FN{j}'] + ' ' + df_gs_recruiter[f'LN{j}']
Sample of the data in:
Job ID
Recruiters
0
729538
Bonnie,Tina Smith,Matthews
1
720954
Cindy,Ken Harris,Walsh
2
720954
Cindy,Ken Harris,Walsh
3
721061
Cindy,Ken Harris,Walsh
import numpy as np
# Count the commas in every "Recruiters" string and report the smallest and
# largest count; r_max later decides how many Recruiter_<j> columns are built.
num_comma = df_gs_recruiter.Recruiters.str.count(',')
r_min = num_comma.min()
r_max = num_comma.max()
print(r_min,r_max)
# Split "FN1,FN2 LN1,LN2" into the first-name list (FN) and the last-name
# list (LN). The first group is greedy, so the split happens at the last space.
df_gs_recruiter[['FN','LN']] = df_gs_recruiter.Recruiters.str.extract('(.*) (.*)',expand=True)
# Row-by-row pass: r is the comma count of this row's Recruiters string, so
# r/2 is the number of commas in each of the two lists (names = r/2 + 1).
for i in range(len(df_gs_recruiter)):
r = df_gs_recruiter.loc[i,'Recruiters'].count(',')
# NOTE: each branch below assigns FN<k>/LN<k> for the WHOLE frame, not just
# row i: str.extract runs on the full FN/LN columns, and rows that do not
# match the pattern come back as NaN.
if r/2 == 6:
df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5', 'FN6', 'FN7']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5', 'LN6', 'LN7']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*),(.*)',expand=True)
elif r/2 == 5:
df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5', 'FN6']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5', 'LN6']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*)',expand=True)
elif r/2 == 4:
df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*)',expand=True)
elif r/2 == 3:
df_gs_recruiter[['FN1','FN2','FN3','FN4']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2','LN3','LN4']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*)',expand=True)
elif r/2 == 2:
df_gs_recruiter[['FN1','FN2','FN3']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2','LN3']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*)',expand=True)
elif r/2 == 1:
df_gs_recruiter[['FN1','FN2']] = df_gs_recruiter.FN.str.extract('(.*),(.*)',expand=True)
df_gs_recruiter[['LN1','LN2']] = df_gs_recruiter.LN.str.extract('(.*),(.*)',expand=True)
# Number of names in this row.
df_gs_recruiter.loc[i,'num'] = r/2 + 1
# NOTE: the result of astype() is not assigned anywhere, so this line has no
# effect on the frame.
df_gs_recruiter.loc[i,'num'].astype(np.int8)
# A row with a single name (no commas) keeps it in FN0/LN0; every other row
# gets the placeholder string 'null', which is turned into NaN further down.
if df_gs_recruiter.loc[i,'num'] < 1.5:
df_gs_recruiter.loc[i,'FN0'] = df_gs_recruiter.loc[i,'FN']
df_gs_recruiter.loc[i,'LN0'] = df_gs_recruiter.loc[i,'LN']
else:
df_gs_recruiter.loc[i,'FN0'] = 'null'
df_gs_recruiter.loc[i,'LN0'] = 'null'
df_gs_recruiter.replace('null', np.nan, inplace=True)
# Build "First Last" for every position j = 0 .. int(r_max/2) + 1.
for j in range(0,int(r_max/2)+2):
# NOTE(review): the key is a one-element LIST ([[...]]) while the right-hand
# side is a single Series. Recent pandas releases appear to reject that shape
# mismatch with "Columns must be same length as key"; the usual spelling for
# assigning one column is the single-bracket key
# df_gs_recruiter[f'Recruiter_{j}'] = ... -- TODO confirm against the
# installed pandas version.
df_gs_recruiter[[f'Recruiter_{j}']] = df_gs_recruiter[f'FN{j}'] + ' ' + df_gs_recruiter[f'LN{j}']
Expected outcome:
0 4
Then this next part runs and shows the outcome.
for col in df_gs_recruiter.columns:
if 'Recruiter_' not in col and 'Job ID' not in col:
df_gs_recruiter.drop([f'{col}'],axis=1, inplace=True)
df_gs_recruiter
Expected outcome is:
Job ID
Recruiter_0
Recruiter_1
Recruiter_2
Recruiter_3
0
729538
NaN
Bonnie Smith
Tina Matthews
NaN
1
720954
NaN
Cindy Harris
Ken Walsh

Related

Why my openpyxl code is slower than my VBA code?

I have an excel file of nearly 95880 rows. I made a VBA function that runs slow, so I tried to code a python script using openpyxl, but it's even slower.
It starts fast, then after 600 rows becomes slower and slower.
The VBA Code is
Option Explicit
Function FTE(Assunzione As Date, Cess As Variant, Data)
Dim myDate As Date
Dim EndDate As Date, EndDate2 As Date
Dim check As Integer
EndDate = Application.WorksheetFunction.EoMonth(Assunzione, 0)
myDate = #1/1/2022#
If Cess = 0 Then
Call Check2(Assunzione, Data, myDate, EndDate, check)
FTE = check
Else:
EndDate2 = Application.WorksheetFunction.EoMonth(Cess, -1)
Call Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
FTE = check
End If
End Function
Sub Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
Dim Cess1 As Date
Dim gg_lav As Integer, gg_lav2 As Integer
Cess1 = Cess.Value
If Assunzione > Date Then
check = 0
Else
If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
If Assunzione > myDate Then
gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
If gg_lav >= 15 Then
If Month(Data) = (Month(EndDate2) + 1) And Year(Cess1) = 2022 Then
gg_lav2 = Application.WorksheetFunction.Days(Cess1, EndDate2)
If gg_lav2 >= 15 Then
check = 1
Else
check = 0
End If
Else
check = 1
End If
Else
check = 0
End If
Else
check = 1
End If
Else
check = 1
End If
End If
End Sub
Sub Check2(Assunzione, Data, myDate, EndDate, check)
Dim gg_lav As Integer
If Assunzione > Date Then
check = 0
Else
If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
If Assunzione > myDate Then
gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
If gg_lav >= 15 Then
check = 1
Else
check = 0
End If
Else
check = 1
End If
Else
check = 1
End If
End If
End Sub
and my openpyxl is:
def check1(a, d, c, i):
    """Return the 0/1 flag for row ``i`` when column ``c`` holds a date.

    Reads three date cells from the module-level worksheet ``ws``:
    column ``a`` (start date), column ``d`` (reference date) and column
    ``c`` (end date).  Each cell is read once and kept in a local, instead
    of calling ``ws.cell`` again for every attribute access.

    Returns 0 when the start date is after the reference date, when fewer
    than 15 days remain between the start date and the end of its month,
    or when the end date is fewer than 15 days past the end of the
    previous month; returns 1 otherwise.
    """
    start = ws.cell(row=i, column=a).value
    ref = ws.cell(row=i, column=d).value
    if start > ref:
        return 0
    if not (start.month == ref.month and start.year == 2022):
        return 1

    last_day = calendar.monthrange(start.year, start.month)[1]
    end_date = date(start.year, start.month, last_day)
    # assumes the cells hold datetime objects (``.date()``) -- TODO confirm
    gg_lav = (end_date - start.date()).days
    if gg_lav < 15:
        return 0

    end = ws.cell(row=i, column=c).value
    # Last day of the month before ``end``.  ``month - 1`` alone is 0 for a
    # January date, which ``date()``/``monthrange()`` reject, so roll back
    # to December of the previous year in that case.
    if end.month > 1:
        prev_year, prev_month = end.year, end.month - 1
    else:
        prev_year, prev_month = end.year - 1, 12
    end_date2 = date(prev_year, prev_month,
                     calendar.monthrange(prev_year, prev_month)[1])

    if ref.month == end_date2.month and end.year == 2022:
        gg_lav2 = (end.date() - end_date2).days
        return 1 if gg_lav2 >= 15 else 0
    return 1
def check2(a, d, i):
    """Return the 0/1 flag for row ``i`` when there is no end date.

    Reads the start date (column ``a``) and the reference date (column
    ``d``) from the module-level worksheet ``ws``.

    Returns 0 when the start date is after the reference date, or when the
    start date falls in the reference month of 2022 with fewer than 15
    days left in that month; returns 1 otherwise.
    """
    start = ws.cell(row=i, column=a).value
    ref = ws.cell(row=i, column=d).value
    # The original compared column ``a`` with itself, which is never true;
    # ``check1`` compares the start date with the reference date (column d).
    if start > ref:
        return 0
    if not (start.month == ref.month and start.year == 2022):
        return 1

    last_day = calendar.monthrange(start.year, start.month)[1]
    end_date = date(start.year, start.month, last_day)
    # assumes the cells hold datetime objects (``.date()``) -- TODO confirm
    gg_lav = (end_date - start.date()).days
    return 1 if gg_lav >= 15 else 0
# Write one 0/1 flag per data row (rows 2..95881) into column 1 of a new sheet.
wb1 = Workbook()
ws1 = wb1.create_sheet()
for i in range(2, 95882):
    # Rows whose column ``c`` is empty use the simpler check; ``is None`` is
    # the idiomatic identity test for a missing cell value.
    if ws.cell(row=i, column=c).value is None:
        flag = check2(a, d, i)
    else:
        flag = check1(a, d, c, i)
    ws1.cell(row=i, column=1, value=flag)
What am I doing wrong? Should I use another library, or am I making the code needlessly memory-consuming?
Thank you very much for any help!

Update: I think that the problem was with openpyxl. First I tried to reduce the number of observations, from 95K to almost 5K, but it still required two and a half hours to complete the task.
So I used numpy and it took 55 seconds. Yeah, that's the difference in processing speed.
Here I post the code:
with open('data.csv','r') as f:
data = list(csv.reader(f,delimiter =';'))
arr = np.array(data)
arr = np.resize(arr,(4797,13))
I had to change of course the code in this section:
a = 3
d = 0
c = 4
def check1(a, d, c, i):
    """Return the 0/1 flag for row ``i`` of the module-level array ``arr``.

    Columns ``a`` (start), ``d`` (reference) and ``c`` (end) hold day
    serial numbers as text; adding 693594 turns a serial into a proleptic
    Gregorian ordinal (693594 is the ordinal of 1899-12-30).

    Returns 0 when the start serial is greater than the reference serial,
    when fewer than 15 days remain between the start date and the end of
    its month, or when the end date is fewer than 15 days past the end of
    the previous month; returns 1 otherwise.
    """
    if int(arr[i][a]) > int(arr[i][d]):
        return 0

    za = datetime.fromordinal(int(arr[i][a]) + 693594)
    zd = datetime.fromordinal(int(arr[i][d]) + 693594)
    # ``za.year`` is already the calendar year after the conversion above
    # (the later ``zc.year == 2022`` test relies on the same thing), so the
    # former ``za.year + 1899 == 2022`` could never be true.
    if not (za.month == zd.month and za.year == 2022):
        return 1

    da = date(za.year, za.month, za.day)
    end_date = date(za.year, za.month,
                    calendar.monthrange(za.year, za.month)[1])
    if (end_date - da).days < 15:
        return 0

    zc = datetime.fromordinal(int(arr[i][c]) + 693594)
    dc = date(zc.year, zc.month, zc.day)
    # Last day of the month before ``zc``; a January date rolls back to
    # December of the previous year (``month - 1`` would be 0 and raise).
    if zc.month > 1:
        prev_year, prev_month = zc.year, zc.month - 1
    else:
        prev_year, prev_month = zc.year - 1, 12
    end_date2 = date(prev_year, prev_month,
                     calendar.monthrange(prev_year, prev_month)[1])

    if zd.month == end_date2.month and zc.year == 2022:
        return 1 if (dc - end_date2).days >= 15 else 0
    return 1
I don't report the check2 function
fte = np.array(10)
for i in range(1,4797):
if arr[i][c] == '':
fte = np.append(fte,check2(a,d,i))
else:
fte = np.append(fte,check1(a, d, c, i))
print(i)

How can I display max number of loses from this dataframe in Pandas?

I wrote a web scraper that downloads table tennis data. It contains info about players, match scores, etc. I would like to display the players who lost the most matches per day. I've created a data frame, and I would like to count the losses recorded in p1_status and p2_status, then display each player's surname with their number of losses next to it.
https://gyazo.com/19c70e071db78071e83045bfcea0e772
Here is my code:
s = Service("D:/setka/chromedriver.exe")
option = webdriver.ChromeOptions()
driver = webdriver.Chrome(service=s)
hall = 10
num =1
filename = "C:/Users/filip/result2.csv"
f=open(filename,"w")
headers = "p1_surname, p1_name, p1_score, p2_surname, p2_name, p2_score, p1_status, p2_status \n"
f.write(headers)
while hall <= 10:
for period in [1]:
url = 'https://tabletennis.setkacup.com/en/schedule?date=2021-12-04&hall=' + \
str(hall) + '&' + 'period=' + str(period)
driver.get(url)
time.sleep(5)
divs = driver.find_elements(By.CSS_SELECTOR, "div.score-result")
for div in divs:
data = div.text.split()
#print(data)
if(num % 2) == 0:
f.write(str(data[0]) + "," + str(data[1]) + "," + str(data[2] + "," + "\n"))
else:
f.write(str(data[0]) + "," + str(data[1]) + "," + str(data[2] + ","))
num = num +1
hall =hall + 1
f.close()
df_results=pd.read_csv('C:/Users/filip/result2.csv', sep = r',',
skipinitialspace = True)
df_results.reset_index(drop=True, inplace=True)
df_results.loc[df_results['p1_score'] > df_results['p2_score'], ['p1_status','p2_status']] = ['won','lost']
df_results.loc[df_results['p1_score'] < df_results['p2_score'], ['p1_status','p2_status']] = ['lost','won']
df_results.loc[df_results['p1_score'] == df_results['p2_score'], ['p1_status','p2_status']] = ['not played','not played']
df_results.loc[((df_results['p1_score'] < 3) & (df_results['p1_score']!=0) & (df_results['p2_score'] <3) & (df_results['p2_score']!=0)), ['p1_status','p2_status']] = ['inplay','inplays']
df_results.loc[df_results['p1_status'] != df_results['p2_status'], ['match_status']] = ['finished']
df_results.loc[df_results['p1_status'] == df_results['p2_status'], ['match_status']] = ['not played']
df_results.loc[((df_results['p1_status'] =='inplay') & (df_results['p2_status']=='inplays')), ['match_status']] = ['inplay']
df_results = df_results.dropna(axis=1)
df_results.head(30)

Split your dataframe in 2 parts (p1_, p2_) to count defeats of each player then merge them:
Setup a MRE:
df = pd.DataFrame({'p1_surname': list('AABB'), 'p2_surname': list('CDCD'),
'p1_status': list('LWWW'), 'p2_status': list('WLLL')})
print(df)
# Output:
p1_surname p2_surname p1_status p2_status
0 A C L W
1 A D W L
2 B C W L
3 B D W L
>>> pd.concat([
df.filter(like='p1_').set_index('p1_surname')['p1_status'].eq('L').rename('loses'),
df.filter(like='p2_').set_index('p2_surname')['p2_status'].eq('L').rename('loses')]) \
.groupby(level=0).sum().rename_axis('surname').reset_index()
surname loses
0 A 1
1 B 0
2 C 1
3 D 2

Single list.count instead of multiple

I have parsed a list of crew members, which looks like this:
20;mechanic;0;68
21;cook;0;43
22;scientist;0;79
23;manager;1;65
24;mechanic;1;41
etc
And now I'm trying to figure out how to count the number of workers who have 60 or more stamina (the last element in each employee record).
There is my code:
with open('employee.txt', 'r') as employee_list:
count = 0
for employee in employee_list.readlines():
employee_data = employee.rstrip().split(';')
if int(employee_data[3]) >= 60:
count += 1
print(count)
Print from terminal:
1
2
3
...
90
I think that is the right answer, but is there any way to get only one 'total' count instead of 90 lines?

Just print one line after the loop is done.
with open('employee.txt', 'r') as employee_list:
count = 0
for employee in employee_list.readlines():
employee_data = employee.rstrip().split(';')
if int(employee_data[3]) >= 60:
count += 1
print(count)
But I would also recommend using pandas for data manipulation. For example:
# The file has no header row (its first line is already an employee), so
# tell pandas not to consume line 1 as column names and supply them here.
df = pd.read_csv('employee.txt', sep=';', header=None,
                 names=['col1', 'col2', 'col3', 'stamina'])
Then just filter and get the size:
# Number of matching rows. ``.size`` would return rows * columns instead.
len(df[df.stamina >= 60])

So after a day of thinking I wrote this and got the right answer (maybe someone will find this helpful):
def total_resist_count(employee_list=None):
    """Return the employee records whose stamina is 60 or more.

    Each record is a ``;``-separated line whose fourth field is the
    stamina; the matching lines are returned split into their fields.

    ``employee_list`` is an optional iterable of such lines.  The
    ``__main__`` block of this script calls the function with one
    argument, which the original zero-parameter signature rejected with a
    ``TypeError`` (presumably that argument is the list of lines -- TODO
    confirm).  When it is omitted, 120 lines are read from standard input,
    as before.
    """
    if employee_list is None:
        employee_list = [input() for _ in range(120)]
    candidates = []
    for employee in employee_list:
        employee_data = employee.rstrip().split(';')
        if int(employee_data[3]) >= 60:
            candidates.append(employee_data)
    return candidates
# Open vacancies per profession; backtrack() decrements these as it picks people.
required_professionals = {
    'computers specialist': 5,
    'cook': 3,
    'doctor': 5,
    'electrical engineer': 4,
    'manager': 1,
    'mechanic': 8,
    'scientist': 14,
}
expedition_total = 40
female_min = 21
male_min = 12


def validate_solution(cur_team, num_females, num_males):
    """Return True when ``cur_team`` is a complete, valid expedition.

    ``cur_team`` is a list of 0/1 flags (1 = selected).  The team is valid
    when exactly ``expedition_total`` members are selected, the female and
    male minimums are met, and no vacancy in ``required_professionals`` is
    still open.
    """
    # The module-level names are only read here, so no ``global`` is needed.
    if (sum(cur_team) != expedition_total
            or num_females < female_min
            or num_males < male_min):
        return False
    return sum(required_professionals.values()) <= 0
# First valid team found by backtrack(); stays None until one is found.
TEAM = None


def backtrack(candidates, cur_team, num_females, num_males):
    """Depth-first search for a valid team; the result is stored in ``TEAM``.

    ``candidates`` is a list of records (field 1 = profession, field 2 =
    gender flag, where ``'1'`` is counted as male and anything else as
    female).  ``cur_team`` is a parallel list of 0/1 selection flags that
    is mutated during the search and restored on the way back, as is
    ``required_professionals``.
    """
    # Only TEAM is rebound here; the other module-level names are read or
    # mutated in place and need no ``global`` declaration.
    global TEAM
    if sum(cur_team) > expedition_total or TEAM is not None:
        return
    if validate_solution(cur_team, num_females, num_males):
        TEAM = [candidates[i] for i, used in enumerate(cur_team) if used == 1]
        return
    for i, candidate in enumerate(candidates):
        role = candidate[1]
        if cur_team[i] == 0 and required_professionals[role] > 0:
            cur_team[i] = 1
            required_professionals[role] -= 1
            if candidate[2] == '1':
                backtrack(candidates, cur_team, num_females, num_males + 1)
            else:
                backtrack(candidates, cur_team, num_females + 1, num_males)
            # Undo the choice so the shared state is left as it was found.
            required_professionals[role] += 1
            cur_team[i] = 0
            # A team has been found deeper down: stop scanning the remaining
            # candidates (each further call would return immediately anyway).
            if TEAM is not None:
                return
if __name__ == '__main__':
ec = decode_fcc_message()
candidates = total_resist_count(ec)
cur_team = [0] * len(candidates)
backtrack(candidates, cur_team, 0, 0)
s = ""
for t in TEAM:
s += str(t[0]) + ';'
print(s)

Error with copying values from one sheet to other

I am trying to copy the values from some cells, but it gives me this error. I even tried without using the def cell(x,y) helper, but I still get the same error.
This is the error:
learn_tar.cell(row=learn_tar, column=1).value = sheet.cell(row=learn_tar, column=1).value
AttributeError: 'int' object has no attribute 'cell'
Source:
import openpyxl
def cell(x, y):
    """Return the value at row ``x``, column ``y`` of the module-level ``sheet``."""
    # Return directly: the original stored the value in a local that was
    # also named ``cell``, shadowing this function inside its own body.
    return sheet.cell(row=x, column=y).value
def percentage(percent, whole):
    """Return ``percent`` percent of ``whole``, truncated toward zero to an int."""
    return int((percent * whole) / 100.0)
ex = openpyxl.load_workbook("Final_excel2.xlsx")
sheet = ex.get_sheet_by_name('Sheet1')
num = [0,0,0]
per = [0,0,0]
for row in range(2,4798):
if cell(row,1) == '1: Progression':
num[0] = num[0] + 1
elif cell(row,1) == '2: Incidence':
num[1] = num[1] + 1
elif cell(row,1) == '3: Non-exposed control group':
num[2] = num[2] + 1
for column in range(2,49):
#doing stuff
per[0] = percentage(70,num[0])
per[1] = percentage(70,num[1])
per[2] = percentage(70,num[2])
learn_att = ex.create_sheet('Learn-Att',2)
learn_tar = ex.create_sheet('Learn-Tar',3)
test_att = ex.create_sheet('Test-Att',4)
test_tar = ex.create_sheet('Test-Tar',5)
learn_att = 1
learn_tar = 1
test_att = 1
test_tar = 1
for row in range(2,4798):
if row<=1391:
if row<=974:
learn_tar.cell(row=learn_tar, column=1).value = cell(row,1)
learn_att+= 1
learn_tar+= 1
else:
test_tar.cell(row = test_tar,column = 1).value = cell(row,1)
test_att+= 1
test_tar+= 1
for column in range(2,49):
if row<=1391:
if row<=974:
learn_att.cell(row = learn_att,column = column - 1).value = cell(row,column)
else:
test_att.cell(row = test_att,column = column - 1).value = cell(row,column)

You override learn_tar with 1:
learn_tar = ex.create_sheet('Learn-Tar',3)
...
learn_tar = 1
Remove:
learn_tar = 1
and:
learn_tar+= 1
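from your code. Because `row=` still needs an integer, keep the row position in a separately named counter (for example `learn_tar_row = 1`, incremented after each write) instead of reusing the name of the sheet.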

Comparing 2 strings in If,Elif,Else loop

In my if/elif/else statement, my program only ever takes the else branch, when the result should be 1 B and 2 Cs.
document = open ("ClassNameGrades.txt","r")
content = document.read().splitlines()
As = 0
Bs = 0
Cs = 0
Ds = 0
Fs = 0
for line in content:
split = line.split (", ")
className = split [0]
facultyName = split [1:1]
studentName= split [2:2]
grade = split [3:]
if str(grade) == ("A"):
As = As + 1
elif str(grade) == ("B"):
Bs = Bs + 1
elif str(grade) == ("C"):
Cs = Cs + 1
elif str(grade) == ("D"):
Ds = Ds + 1
else:
Fs = Fs + 1
print (str(As)+(" students got As."))
print (str(Bs)+(" students got Bs."))
print (str(Cs)+(" students got Cs."))
print (str(Ds)+(" students got Ds."))
print (str(Fs)+(" students got Fs."))
document.close()
The results for this shows as:
0 students got As.
0 students got Bs.
0 students got Cs.
0 students got Ds.
3 students got Fs.

You are comparing a list to a string, which is why your if conditions never evaluate to True: split[3:] is a slice, so grade is a list such as ['B'], and str(grade) is "['B']", not "B". Instead of using list slice notation (split[3:], or split[2:2], which returns an empty list), you should simply fetch the items from the list using list index notation.
document = open ("ClassNameGrades.txt","r")
content = document.read().splitlines()
As = 0
Bs = 0
Cs = 0
Ds = 0
Fs = 0
for line in content:
split = line.split(", ")
className = split[0]
facultyName = split[1]
studentName= split[2]
grade = split[3]
if str(grade) == "A":
As = As + 1
elif str(grade) == "B":
Bs = Bs + 1
elif str(grade) == "C":
Cs = Cs + 1
elif str(grade) == "D":
Ds = Ds + 1
else:
Fs = Fs + 1
print(str(As)+(" students got As."))
print(str(Bs)+(" students got Bs."))
print(str(Cs)+(" students got Cs."))
print(str(Ds)+(" students got Ds."))
print(str(Fs)+(" students got Fs."))
document.close()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

splitting names by counting commas gives value error - python

Related

Why my openpyxl code is slower than my VBA code?

How can I display max number of loses from this dataframe in Pandas?

Single list.count instead of multiple

Error with copying values from one sheet to other

Comparing 2 strings in If,Elif,Else loop

Categories

Resources