I would like to identify doctors based on their title in a dataframe and create a new column indicating whether each person is a doctor, but I am struggling with my code.
doctorcriteria = ['Dr', 'dr']

def doctor(x):
    """Return 'Doctor' if the name contains any doctor title, else 'Not a doctor'.

    Bug fix: the original tested `doctorcriteria in x`, which asks whether the
    whole LIST is a substring of the string `x` (a TypeError for str inputs);
    each title must be tested individually.
    """
    if any(title in x for title in doctorcriteria):
        return 'Doctor'
    else:
        return 'Not a doctor'
# Seed the new column from the caller's name, treat missing names as
# non-doctors, then classify every value with doctor().
# NOTE(review): relies on a `df` with a `caller_name` column defined elsewhere.
df['doctorcall'] = df.caller_name
df.doctorcall.fillna('Not a doctor', inplace=True)
df.doctorcall = df.doctorcall.apply(doctor)
To create a new column with a function, you can use apply:
df = pd.DataFrame({'Title': ['Dr', 'dr', 'Mr'],
                   'Name': ['John', 'Jim', 'Jason']})

doctorcriteria = ['Dr', 'dr']

def doctor(x):
    """Label a row 'Doctor' when its Title is one of the known doctor titles."""
    return 'Doctor' if x.Title in doctorcriteria else 'Not a doctor'

# apply over axis=1 hands each row to doctor() in turn.
df['IsDoctor'] = df.apply(doctor, axis=1)
But a more direct route to the answer would be to use map on the Title column.
doctor_titles = {'Dr', 'dr'}
# Boolean column: True exactly when the title is a known doctor title.
df['IsDoctor'] = df['Title'].isin(doctor_titles)
Related
I'm looking to increase the speed of the nested for loops.
VARIABLES:
'dataframe' - The dataframe I am attempting to modify in the second for loop. It consists of a multitude of training sessions for the same people. This is the attendance document that is changed if a match exists in the reporting dataframe.
'dictNewNames' - This is a dictionary of session title names. The key is the longer session title name and the value is a stripped session title name. For example {'Week 1: Training': 'Training'} etc. The key is equal to the 'Session Title' column in each row but the value is used for searching a substring in the second for loop.
'reporting' - A dataframe that includes information regarding session titles and attendance participation. The reporting dataframe is already filtered so everyone in the 'reporting' dataframe should get credit in 'dataframe'. The only caveat is that the 'search' name is nested within the pathway title.
# Attendance sheet: one row per training session per attendee; 'Completed'
# and 'Date Completed' start empty and are filled in from `reporting`.
dataframe = {
    'Session Title': ['Organization Week 1: Train', 'Organization Week 2: Train', 'Organization Week 3: Train'],
    'Attendee Email': ['name#gmail.com', 'name2#gmail.com', 'name3#gmail.com'],
    'Completed': ['No', 'No', 'No'],
    'Date Completed': ['','','']}
dictNewNames = { 'Organization Week 1: Train': 'Train', ' Organization Week 2: Train': 'Train', 'Organization Week 3: Train': 'Train' }
Title formatting is not consistent (e.g. ':' vs '-' as seen in the pathway title below); the data is completely all over the place in terms of format.
# Reporting export: every row here should earn credit in `dataframe`; the
# stripped session name appears somewhere inside 'Pathway Title'.
reporting = {
    'Pathway Title': ['Training 1 - Train', 'Training 2: Train', 'Training 3 - Train'],
    'Email': ['name#gmail.com', 'name2#gmail.com', 'name3#gmail.com'],
    'Date Completed': ['xx/yy/xx', 'yy/xx/zz', 'zz/xx/yy']}
# Desired result: attendance marked 'Yes' with dates copied from `reporting`.
# (Name typo 'expectedOuput' kept as-is; renaming could break references.)
expectedOuput = {
    'Session Title': ['Organization Week 1: Train', 'Organization Week 2: Train', 'Organization Week 3: Train'],
    'Attendee Email': ['name#gmail.com', 'name2#gmail.com', 'name3#gmail.com'],
    'Completed': ['Yes', 'Yes', 'Yes'],
    'Date Completed': ['xx/yy/xx', 'yy/xx/zz', 'zz/xx/yy']}
My code:
def giveCredit(dataframe, dictNewNames, reporting):
    """Mark each attendance row complete when a reporting row matches it.

    A reporting row matches when its 'Organization Email' equals the
    attendee's email and its 'Pathway Title' contains the stripped session
    name from dictNewNames. The first match wins: the row is flagged 'Yes'
    and its completion date is copied across.
    """
    for attend_idx, attend_row in dataframe.iterrows():
        search_name = dictNewNames[attend_row['Session Title']]
        attendee_email = attend_row['Attendee: Email']
        for _, report_row in reporting.iterrows():
            same_person = attendee_email == report_row['Organization Email']
            if same_person and search_name in report_row['Pathway Title']:
                dataframe.at[attend_idx, 'Completed'] = 'Yes'
                dataframe.at[attend_idx, 'Date Completed'] = report_row['Date Completed']
                break
    return dataframe
Your pattern looks like merge:
for loop1 on first dataframe:
for loop2 on second dataframe:
if conditions match between both dataframes:
So:
# Derive one shared join key ("Name") from the stripped session names.
pattern = fr"({'|'.join(dictNewNames.values())})"
session_key = dataframe['Session Title'].map(dictNewNames)
pathway_key = reporting['Pathway Title'].str.extract(pattern)

# Left-join attendance against reporting on (key, email); unmatched rows
# keep NaN in the reporting columns.
merged = dataframe.assign(Name=session_key).merge(
    reporting.assign(Name=pathway_key),
    left_on=['Name', 'Attendee Email'],
    right_on=['Name', 'Email'],
    how='left', suffixes=(None, '_'))

# Fold the reporting date into the attendance columns and derive Completed.
merged['Date Completed'] = merged.pop('Date Completed_')
merged['Completed'] = np.where(merged['Date Completed'].notna(), 'Yes', 'No')
out = merged[dataframe.columns]
Output:
>>> out
Session Title Attendee Email Completed Date Completed
0 Week 1: Train 1 name#gmail.com Yes xx/yy/xx
1 Week 2: Train 2 name2#gmail.com Yes yy/xx/zz
2 Week 3: Train 3 name3#gmail.com Yes zz/xx/yy
This workaround cut my execution time from 460 seconds to under 10.
def giveCredit(dataframe, dictNewNames, reporting):
    """Mark attendance rows 'Yes' by filtering `reporting` with vectorized
    string matching instead of a nested iterrows loop.

    NOTE(review): column names 'Attendee: Email' and 'Organization Email'
    do not match the sample dicts above ('Attendee Email' / 'Email') --
    presumably they match the real spreadsheets; confirm.
    """
    # Normalize completion dates once up front.
    reporting['Date Completed'] = pd.to_datetime(reporting['Date Completed'])
    for index1, row in dataframe.iterrows():
        temp = row['Session Title']
        # Week number pulled from the session title, used to pick the best match.
        numberList = re.findall('[0-9]+', temp)
        finalNumber = str(numberList[0])
        searchName = dictNewNames[temp]
        attendeeEmail = row['Attendee: Email']
        # Candidate reporting rows: same email AND pathway contains the search name.
        row = reporting.loc[(reporting['Pathway Title'].str.contains(searchName, case=False)) & (reporting['Organization Email'] == attendeeEmail)]
        if len(row.index) != 0:
            # Prefer a candidate whose pathway also mentions the week number.
            # NOTE(review): this mask is built from `reporting`, not `row`;
            # pandas aligns it on the index so it works, but building it from
            # `row` would be clearer.
            new_row = row.loc[(reporting['Pathway Title'].str.contains(finalNumber, case=False))]
            if len(new_row.index) != 0:
                dataframe = modifyFrame(dataframe, new_row, index1)
            else:
                dataframe= modifyFrame(dataframe, row, index1)
    # NOTE(review): sorts on an "Attendee" column not shown in the sample data
    # above -- confirm it exists in the real frame.
    dataframe = dataframe.sort_values(["Completed", "Attendee"], ascending=[False, True])
    return dataframe
def modifyFrame(frame, row, index1):
    """Mark row *index1* of *frame* completed, copying the date from *row*.

    Bug fix: the original wrote to a global `dataframe` instead of the
    `frame` parameter it was given, so the argument was ignored (and the
    function raised NameError when no such global existed).
    """
    dateCompleted = row['Date Completed']
    # Render the matched date cell(s) to a plain string without index/header noise.
    dateCompleted = dateCompleted.to_string(buf=None, header=False, index=False, length=False, name=False, max_rows=None).strip()
    frame.at[index1, 'Completed'] = 'Yes'
    frame.at[index1, 'Date Completed'] = dateCompleted
    return frame
So I currently have what is above.
I've managed to separate them into categories using groupby but now I would like to put them in a subplot of tables.
## Open the comma-separated inventory export, keeping only the columns
## Name, In stock, Committed, Reorder point and Category.
file = pd.read_csv('Katana/InventoryItems-2022-01-06-09_10.csv',
                   usecols=['Name','In stock','Committed', 'Reorder point','Category'])
## Pull each column out into a plain Python list.
Name = file['Name'].tolist()
InStock = file['In stock'].tolist()
Committed = file['Committed'].tolist()
ReorderPT = file['Reorder point'].tolist()
Category = file['Category'].tolist()
## Coerce the numeric columns to int (the CSV may carry them as floats,
## so float() first, then int()).
inStock = [int(float(i)) for i in InStock]
commited = [int(float(i)) for i in Committed]
reorderpt = [int(float(i)) for i in ReorderPT]
## Reassemble the cleaned lists into a single inventory mapping.
inventory = {'Name': Name,
             'In stock': inStock,
             'Committed': commited,
             'Reorder point': reorderpt,
             'Category': Category
             }
## Build a DataFrame and split it into one sub-frame per product category.
## NOTE(review): bare `DataFrame` assumes `from pandas import DataFrame`
## was executed elsewhere -- confirm.
frame = DataFrame(inventory)
grouped = frame.groupby(frame.Category)
# get_group raises KeyError if a category is absent from the file.
df_elec = grouped.get_group('Electronics')
df_bedp = grouped.get_group('Bed Packaging')
df_fil = grouped.get_group('Filament')
df_fast = grouped.get_group('Fasteners')
df_kit = grouped.get_group('Kit Packaging')
df_pap = grouped.get_group('Paper')
Try something along the lines of:
import matplotlib.pyplot as plt

# One axis per category frame, stacked vertically; each plot carries its table.
category_frames = [df_elec, df_bedp, df_fil, df_fast, df_kit, df_pap]
fig, axs = plt.subplots(nrows=6, ncols=1)
for ax, frame in zip(axs, category_frames):
    frame.plot(ax=ax, table=True)
I am doing some web scraping using selenium and am able to return a phone number and email but unable to append it to my dataframe.
I have tried running the function, and it spits out the correct information. I have also tried saving the results of the function to a variable and then putting that into the dataframe, but it just won't save the way I intend.
df = pd.DataFrame(columns=['Phone', 'EmailAddress'])

def phonenumber():
    """Collect the text of every phone-number span on the current page."""
    numbers = []
    # Bug fix: '@class' -- the original '#class' is not valid XPath.
    for element in browser.find_elements_by_xpath('.//span[@class = "phone ng-binding ng-scope"]'):
        # Bug fix: the original `return element.text` exited on the first
        # element (or returned None when no element matched).
        numbers.append(element.text)
    return numbers

def email():
    """Collect the text of every email span on the current page."""
    addresses = []
    for element in browser.find_elements_by_xpath('.//span[@class = "email ng-scope"]'):
        addresses.append(element.text)
    return addresses

# Build the frame from the full lists (assumes equal lengths).
# DataFrame.append was removed in pandas 2.0 -- construct directly instead.
df = pd.DataFrame({'Phone': phonenumber(), 'EmailAddress': email()})
Right now, the code returns "none" in the dataframe
You can append each element in the for loop into the respective empty lists for each function, return them from the functions and then use them to create the dataframe:
def phonenumber():
    """Return the text of every matching phone-number span as a list."""
    elements = browser.find_elements_by_xpath('.//span[#class = "phone ng-binding ng-scope"]')
    return [element.text for element in elements]

def email():
    """Return the text of every matching email span as a list."""
    elements = browser.find_elements_by_xpath('.//span[#class = "email ng-scope"]')
    return [element.text for element in elements]

ph = phonenumber()
mail = email()
Now use the appended lists to create the dataframe. This is assuming that the length of the lists is equal.
df = pd.DataFrame({'Phone':ph, 'EmailAddress':mail})
I have a dataframe with a model id and associated values. The columns are date, client_id, model_id, category1, category2, color, and price. I have a simple flask app where the user can select a model id and add to their "purchase" history. Based on the model id I would like to add a row to the dataframe and bring the associated values of category1, category2, color, and price. What is the best way to do this using Pandas? I know in Excel I'd use a vlookup but I am unsure how to go about it using Python. Assume category1, category2, color, and price are unique to each model id.
# Prompt for both ids.
# Bug fix: the original asked "ENTER Model ID" for both inputs, so the
# client id prompt was wrong.
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")
def update_history(df, client_id, model_id):
    """Append a purchase of *model_id* by *client_id*, copying the model's
    attributes (category1, category2, color, price) from its existing row --
    the Excel-VLOOKUP equivalent. Assumes attributes are unique per model_id.

    Bug fix: the original wrote 8 values (with `today` twice and 'tmp'
    placeholders) into 7 columns and never performed the lookup.
    """
    today = pd.to_datetime('today')
    # First existing row for this model supplies the associated values.
    attrs = df.loc[df['model_id'] == model_id].iloc[0]
    new_row = {'date': today, 'client_id': client_id, 'model_id': model_id,
               'category1': attrs['category1'], 'category2': attrs['category2'],
               'color': attrs['color'], 'price': attrs['price']}
    # Emit the row in the frame's own column order, whatever it happens to be.
    df.loc[len(df)] = [new_row[col] for col in df.columns]
    return df
Code below adds a new row with new values to an existing dataframe. The list of new values could be passed in to the function.
Import libraries
import pandas as pd
import numpy as np
import datetime
Create sample dataframe
# Sample purchase-history frame: one row per purchase.
model_id = ['M1', 'M2', 'M3']
today = ['2018-01-01', '2018-01-02', '2018-01-01']
client_id = ['C1', 'C2', 'C3']
category1 = ['orange', 'apple', 'beans']
category2 = ['fruit', 'fruit', 'grains']
df = pd.DataFrame({'today':today, 'model_id': model_id, 'client_id':client_id,
                   'category1': category1, 'category2':category2})
# Parse the date strings into real timestamps.
df['today'] = pd.to_datetime(df['today'])
# Bare expression: displays the frame in a notebook/REPL session.
df
Function
def update_history(df, client_id, model_id, category1, category2):
    """Append one purchase row with the supplied values and today's date.

    Returns a new frame; the input is not modified in place.
    """
    today = pd.to_datetime('today')
    # Temp frame with the new values; column names match the existing frame.
    temp = pd.DataFrame({'today':[today], 'model_id': [model_id], 'client_id':[client_id],
                         'category1': [category1], 'category2':[category2]})
    # Bug fix: DataFrame.append was removed in pandas 2.0 -- concat instead.
    df = pd.concat([df, temp])
    return df
Call function to append a row with new values to existing dataframe
update_history(df, client_id='C4', model_id='M4', category1='apple', category2='fruit')
You could try this. In case you are appending more than one row at a time, appending a dictionary to list and then appending them at once to a dataframe is faster.
# Sample purchase-history frame keyed by model_id.
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']
mdf = pd.DataFrame({"model_id": modelid, "today": today, "client_id": clients, "cat_1":cat_1, "cat_2":cat_2})

def update_history(df, client_id, model_id):
    """Append a purchase of *model_id* by *client_id*, looking up cat_1/cat_2
    from the model's first existing row (VLOOKUP-style).
    """
    today = pd.to_datetime('today')
    row = df[df.model_id==model_id].iloc[0]
    # Collect dicts first -- cheaper than appending rows one at a time.
    # (Renamed from `dict`, which shadowed the builtin.)
    rows_list = [{"today": today, "client_id": client_id,
                  "model_id": model_id, "cat_1": row["cat_1"],
                  "cat_2": row["cat_2"]}]
    df2 = pd.DataFrame(rows_list)
    # Bug fix: DataFrame.append was removed in pandas 2.0 -- concat instead.
    df = pd.concat([df, df2])
    return df

mdf = update_history(mdf, "CLC", "MOD1")
This is what I ended up doing. I still think there is a more elegant solution, so please let me know!
#create dataframe
modelid = ['MOD1', 'MOD2', 'MOD3']
today = ['2018-07-15', '2018-07-18', '2018-07-20']
clients = ['CLA', 'CLA', 'CLB']
cat_1 = ['CAT1', 'CAT2', 'CAT3']
cat_2 = ['CAT11', 'CAT12', 'CAT13']
mdf = pd.DataFrame({"model_id": modelid, "today": today, "client_id": clients, "cat_1":cat_1, "cat_2":cat_2})
#reorder columns
mdf = mdf[['cat_1', 'cat_2', 'model_id', 'client_id', 'today']]
#create lookup table
lookup=mdf[['cat_1','cat_2','model_id']]
lookup.drop_duplicates(inplace=True)
#get values
client_id = input("ENTER Client ID: ")
model_id = input("ENTER Model ID: ")
#append model id to list
model_id_lst=[]
model_id_lst.append(model_id)
today=pd.to_datetime('today')
#grab associated cat_1, and cat_2 from lookup table
temp=lookup[lookup['model_id'].isin(model_id_lst)]
out=temp.values.tolist()
out[0].extend([client_id, today])
#add this as a row to the df
mdf.loc[len(mdf)]=out[0]
How can I split my big dataframe into smaller dataframes and print each of them separately on the web? Any idea how to edit the code so that a loop populates the context?
here is my code:
def read_raw_data(request):
    """Django view: read the 'Step' sheet, group its rows by Product, and
    render every group as one styled HTML table.

    Fixes to the original: the `drop_column` assignment was split across two
    lines, it referenced the undefined name `Step1_Result` (the frame is
    `Step1`), and the `for` statement was missing its colon.
    """
    # NOTE(review): newer pandas spells this keyword `sheet_name`.
    Wb = pd.read_excel(r"LookAhead.xlsm", sheetname="Step")
    Step1 = Wb.replace(np.nan, '', regex=True)
    drop_column = Step1.drop(['facility', 'volume', 'indicator_product'], axis=1)
    # Collect each product's rows, then flatten them into one list of records.
    row_array = []
    for name, group in drop_column.groupby('Product'):
        row_array.append(group.values.tolist())
    temp = [record for group_rows in row_array for record in group_rows]
    b = ['indicator', 'Product']
    test = pd.DataFrame.from_records(temp, columns=b)
    # Style the combined frame as a Bootstrap table and render it to HTML.
    table = test.style.set_table_attributes('border="" class = "dataframe table table-hover table-bordered"').set_precision(10).render()
    context = { "result": table}
    return render(request, 'result.html', context)
If you want to show a big dataframe in different pages, I recommend you using a Paginator. The documentation has a good example on how to implement it.
https://docs.djangoproject.com/en/1.10/topics/pagination/#using-paginator-in-a-view