Joining dataframe in for loop - python

My code won't work... it gives me ValueError: columns overlap but no suffix specified
# Question snippet: combine several CSV files column-wise into one DataFrame.
# NOTE(review): the loop body below is shown without its indentation.
import pandas as pd
import pickle
# NOTE: the name `list` shadows the built-in list type.
list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
"ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
maindf = pd.DataFrame()
for x in list:
df = pd.read_csv(x)
# The first file seeds maindf; every later file is joined onto it.
if x == "ZILLOW2.csv":
maindf = pd.DataFrame(df)
else:
# DataFrame.join is called without lsuffix/rsuffix; when both frames share
# column names this is the call that raises the reported
# "ValueError: columns overlap but no suffix specified".
maindf = maindf.join(df)
print(maindf)

Use concat:
import pandas as pd
import pickle

# CSV files to combine, in the order their columns should appear.
# (Named csv_files rather than `list` so the built-in list type is not shadowed.)
csv_files = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
             "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]

# Read every file once and place the frames side by side (axis=1) in a single
# concat call. Unlike DataFrame.join, concat does not require suffixes when
# the frames share column names; rows are aligned on the index.
maindf = pd.concat([pd.read_csv(path) for path in csv_files], axis=1)
print(maindf)

Related

Feature engineering, ValueError: Columns must be same length as key

I'm running into a ValueError: Columns must be same length as key when trying to encode the column Type. Here is the code; I'm not sure which part is wrong.
# Inspect the data and its missing values.
df.head()
plt.figure(figsize=(7, 5))
sns.heatmap(df.isnull(), cmap='viridis')
df.isnull().any()
df.isnull().sum()
# Fill missing ratings with the column median.
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
# Characters removed from 'Current Ver' before it is parsed as a number.
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace(i, ''))
# Each match of these patterns (separator characters, whitespace runs, letter runs) is replaced by '0'.
regex = [r'[-+|/:/;(_)#]', r'\s+', r'[A-Za-z]+']
for j in regex:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : re.sub(j, '0', x))
# Keep only the first '.' as the decimal point (swap it to ',', drop the
# remaining dots, swap it back), then convert to float.
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())
# Look up and drop the rows whose Category is the string '1.9'.
i = df[df['Category'] == '1.9'].index
df.loc[i]
df = df.drop(i)
# Drop rows with a missing 'Last Updated' or 'Content Rating'.
df = df[pd.notnull(df['Last Updated'])]
df = df[pd.notnull(df['Content Rating'])]
# Label-encode 'App'.
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])
# category_list holds the 'cat_'-prefixed category names; the one-hot
# columns for 'Category' are appended to df with concat.
category_list = df['Category'].unique().tolist()
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)
# Label-encode 'Genres' and 'Content Rating'.
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])
# Strip '$' from Price and '+' / ',' from Installs (the values stay strings).
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
# get_dummies returns a DataFrame with one column per distinct 'Type' value;
# assigning that multi-column frame to the single column 'Type' is what raises
# "ValueError: Columns must be same length as key".
df['Type'] = pd.get_dummies(df['Type'])
You are trying to assign a DataFrame with multiple columns to a single column of the original DataFrame.
pd.get_dummies returns a DataFrame with a column for each value in the column.
If you want to add those values to the original DataFrame you can use concat.
Example:
import pandas as pd

df = pd.DataFrame(data=['type1', 'type2', 'type3'], columns=['Type'])
# One indicator column per distinct value of 'Type'.
dummies_df = pd.get_dummies(df['Type'])
# concat returns a new DataFrame and leaves its inputs untouched, so the
# result has to be assigned back for the dummy columns to end up in df.
df = pd.concat([df, dummies_df], axis=1)

Dictionary creation inside a function

Let's say I have the following dataframe:
import pandas as pd

# Sample data: a grouping flag, an item label and three numeric
# characteristics per row.
data = dict(
    Flag=['a', 'b', 'a', 'b'],
    Item=['ball', 'car', 'pen', 'candy'],
    Char1=[0] * 4,
    Char2=[23, 21, 19, 13],
    Char3=[40, 43, 60, 70],
)
df = pd.DataFrame.from_dict(data)
Now, let's perform some calculation:
# Per-Flag mean and sample standard deviation of each characteristic, written
# back onto every row of its group. groupby/transform computes each group once
# instead of re-filtering the whole frame for every single row.
for _col in ('Char1', 'Char2', 'Char3'):
    _by_flag = df.groupby('Flag')[_col]
    df[f'{_col}_avg'] = _by_flag.transform('mean')
    df[f'{_col}_std'] = _by_flag.transform('std')
Finally let's create the following dictionary:
Flag_list = ['a', 'b']

# (summary key, source column) pairs, in the order they appear in sum_dict.
_summary_columns = [
    ('Char1_average', 'Char1_avg'),
    ('Char1_std', 'Char1_std'),
    ('Char2_average', 'Char2_avg'),
    ('Char2_std', 'Char2_std'),
    ('Char3_average', 'Char3_avg'),
    ('Char3_std', 'Char3_std'),
]

# Each entry takes the values of the first two rows of its source column.
sum_dict = {'Flag': Flag_list}
for _key, _column in _summary_columns:
    sum_dict[_key] = df[_column].iloc[:2].tolist()
In this way all works fine,
correct dictionary
but I need to define a function that performs the same things, so I have written the following code:
# Question version of fnctn: adds per-Flag mean / std columns for each
# parameter to dataf and builds a summary dictionary from them.
# NOTE(review): the body is shown without its indentation.
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
for param in param_list:
# For every row, average / std of `param` over all rows sharing that row's Flag.
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
# sum_dict is assigned a brand-new dict here, so whatever an earlier param
# stored is lost; this matches the reported "only the last iteration" result.
# NOTE(review): `dref` is not defined in this snippet; presumably `dataf`
# was intended - confirm.
sum_dict = {'Flag':Flag_list,
f'{param}_average':dref[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dref[f'{param}_StDev'].head(2).tolist()}
# NOTE(review): ref_avg_values is built but not returned in the code shown.
ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration:
wrong dictionary
How can I get the same dictionary as in the previous case?
You have to update a dictionary that is created outside the loop, so that it accumulates the values from every iteration of the for loop instead of being overwritten each time.
Here is the solution to your query:
def fnctn(dataf):
    """Summarise Char1..Char3 per Flag and return the summary as a DataFrame.

    Side effect: for every parameter, ``<param>_avg`` and ``<param>_StDev``
    columns holding the per-Flag mean and sample standard deviation are added
    to ``dataf`` (one value per row).

    Returns a DataFrame with one row per distinct Flag, in order of first
    appearance, with the columns ``Flag``, ``<param>_average`` and
    ``<param>_std`` for each parameter.

    The flags and their statistics are taken from the grouping itself, so the
    result does not depend on a global flag list or on the first rows of
    ``dataf`` happening to contain exactly one row per flag.
    """
    param_list = ["Char1", "Char2", "Char3"]
    flags = dataf["Flag"].dropna().unique().tolist()
    dictie = {"Flag": flags}
    for param in param_list:
        # Group the raw column by Flag once; reuse it for the per-row columns
        # and for the per-flag summary values.
        by_flag = dataf[param].groupby(dataf["Flag"], sort=False)
        dataf[f"{param}_avg"] = by_flag.transform("mean")
        dataf[f"{param}_StDev"] = by_flag.transform("std")
        # reindex pins the summary values to the order of `flags`.
        dictie[f"{param}_average"] = by_flag.mean().reindex(flags).tolist()
        dictie[f"{param}_std"] = by_flag.std().reindex(flags).tolist()
    return pd.DataFrame(dictie)
# Run on a copy: fnctn adds the *_avg and *_StDev columns to the frame it is given.
dataf = df.copy()
fnctn(dataf)
And the answer is as below:

How to append items to a list inside a dictionary at specific index locations of the list?

I have dictionary called data that looks like this:
{'Annou1_date': [NoneType,NoneType,....], 'Appro2_date': [NoneType,NoneType,....], 'Appro3_date': [NoneType,NoneType,....]}
And I want to update Appro2_date's list at index 1 so that my dictionary would look like:
{'Annou1_date': [NoneType,NoneType,....], 'Appro2_date': [NoneType,22nd July 2021,....], 'Appro3_date': [NoneType,NoneType,....]}
How would I do this?
I tried doing:
#RECRUITMENT PRE-PROCESSING SCRIPT
# NOTE(review): the loop bodies below are shown without their indentation.
import pandas as pd
import numpy as np
import math
#Reading report file
df1 = pd.read_excel("C:/Users/INT011/Desktop/WorkFlows/Dashboards/Recruitment Workflow/Recruitment Workflow Tasks.xlsx")
# Sort rows by workflow, then by task.
df1 = df1.sort_values(by=["Workflow ID","Task ID"], ascending=True)
# Pull each report column into its own plain Python list.
wf_id,tk_id,tk_name,sd,ed,delay,st,category,assignee,ini,can_name,can_desig,hod,posi,dept,jb_num,qty,ddj,fdj,lk \
= df1['Workflow ID'].tolist(),df1['Task ID'].tolist(),df1['Task Name'].tolist(),df1['Start Date'].tolist(),df1['End Date'].tolist(),df1['Delay(Days)'].tolist(),df1['Status'].tolist(), \
df1['Category'].tolist(),df1['Assignee Name'].tolist(),df1['Initiator'].tolist(),df1['Name Of Candidate'].tolist(),df1['Candidate Designation'].tolist(),df1['HOD Selection'].tolist(), \
df1['Position'].tolist(),df1['Department'].tolist(),df1['Project Number'].tolist(),df1['Quantity'].tolist(),df1['Desired Date Of Joining'].tolist(),df1['Final Date Of Joining'].tolist(), \
df1['Alfresco Link'].tolist()
# sd is rebound to a datetime Series; the 'End Date' column is converted in
# place (the list `ed` taken above still holds the unconverted values).
sd = pd.to_datetime(df1['Start Date'])
df1['End Date'] = pd.to_datetime(df1['End Date'])
# Earliest / latest End Date within each (Workflow ID, Task Name) group,
# repeated on every row of the group.
df1['Min Date'] = df1.groupby(['Workflow ID', 'Task Name'])['End Date'].transform('min')
df1['Max Date'] = df1.groupby(['Workflow ID','Task Name'])['End Date'].transform('max')
firstdate = df1['Min Date'].tolist()
lastdate = df1['Max Date'].tolist()
# One key per distinct task name: its first five characters, its 1-based
# position in tasklist, and the suffix "_date".
tasklist = np.unique(tk_name)
t = len(tasklist)
task_var_names = [str()]*t
for i in range(t):
task_var_names[i] = tasklist[i][0:5]+str(i+1)+"_date"
#New Columns for Output File
# a = number of report rows; x = distinct workflow IDs (the np.array result
# is immediately replaced by np.unique); b = number of distinct workflows.
a = len(wf_id)
x=np.array(wf_id)
x=np.unique(wf_id)
b=len(x)
# data maps every task key to a list of b None placeholders.
data = dict((task_var_names[i], [None]*b) for i in range(t))
runnintask,status,ct,initiator,HOD,jb,alflink = [str()]*b,[str()]*b,[str()]*b,[str()]*b,[str()]*b,[str()]*b,[str()]*b
# For every distinct workflow i, scan all report rows j that belong to it.
for i in range(0,b):
for j in range(0,a):
if x[i] == wf_id[j]:
alflink[i] = lk[j]
ct[i] = category[j]
for k in range(t):
if tk_name[j] == tasklist[k]:
# This rebinds the whole dict entry to a single value, which replaces the
# placeholder list (the behaviour described below).
# NOTE(review): writing into one slot of the list instead, e.g.
# data[task_var_names[k]][i] = ed[j], would keep the list - confirm that
# index i is the intended slot.
data[task_var_names[k]] = ed[j]
but it replaces the list with single values like:
{'Annou1_date': '17-Jul-2021', 'Appro2_date': '24-Jul-2021', 'Appro3_date': '22-Jul-2021'}
From the dictionary, get the value of Appro2_date, which is a list.
Then update that list at the desired index:
lst[index] = new_value
# Sample dictionary: every key maps to a list of placeholders.
d = {
    'Annou1_date': [None] * 2,
    'Appro2_date': [None] * 3,
    'Appro3_date': [None] * 3,
}
date_str = '22nd July 2021'

# Fetch the list stored under the key, then overwrite its element at index 1.
# The dictionary holds a reference to that same list, so d sees the change.
appro2_dates = d['Appro2_date']
appro2_dates[1] = date_str
print(d)
{'Annou1_date': [None, None], 'Appro2_date': [None, '22nd July 2021', None], 'Appro3_date': [None, None, None]}

Is there a way to reduce this repetitive code?

# Reads "<dt_type> <year>.csv" from `uploaded` for every year in
# range(start_year, end_year) (end_year itself is excluded) and stacks the
# frames into one DataFrame.
# NOTE(review): the body is shown without its indentation.
def frame(dt_type, start_year, end_year, columns_req):
frame = pd.DataFrame()
for i in range (start_year, end_year):
file_name = f"{dt_type} {i}"
dataframe = pd.read_csv(BytesIO(uploaded["%s.csv"%file_name]))
# One branch per supported length of columns_req (1 to 4).
# NOTE(review): `data` is not defined in this snippet and `df` is never used
# afterwards; presumably `dataframe` restricted to columns_req was meant -
# confirm.
# NOTE(review): the one-column and three-column branches are missing the
# closing "]" of their column list.
if len(columns_req) == 1:
df = pd.DataFrame(data, columns= [columns_req[0])
if len(columns_req) == 2:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1]])
if len(columns_req) == 3:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2])
if len(columns_req) == 4:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2], columns_req[3]])
# NOTE(review): DataFrame.append was removed in pandas 2.0; pd.concat is the
# replacement.
frame = frame.append(dataframe, ignore_index=True)
return (frame)
As you can see, the chain of if statements is repetitive and feels odd. I am new to programming. Is there any way to reduce that whole bunch of code?
You could do this:
# columns_req is already a list of column names, so it can be passed as-is
# whatever its length.
df = pd.DataFrame(data, columns=columns_req)
instead of all those if conditions.

How to select column if string is in column name

so I have a dict of dataframes with many columns. I want to selected all the columns that have the string 'important' in them.
So some of the frames may have important_0 or important_9_0 as their column name. How can I select them and put them into their own new dictionary with all the values each columns contains.
import pandas as pd


def _with_prefix(columns, prefix='important_'):
    """Return the column names that start with *prefix*, in their original order."""
    return [name for name in columns if name.startswith(prefix)]


df = pd.DataFrame(columns=['a', 'b', 'important_c'])
selected_cols = _with_prefix(df.columns)
print(selected_cols)
# ['important_c']

# The same selection applied to every frame in a dict of DataFrames.
dict_df = {}
for key in range(3):
    dict_df[key] = pd.DataFrame(columns=['a', 'b', 'important_c'])
new_dict = {}
for key, frame in dict_df.items():
    new_dict[key] = frame[_with_prefix(frame.columns)]

# Alternative: match 'important' anywhere in the column name.
important_columns = list(filter(lambda name: 'important' in name, df.columns))
# Keep only the matching columns of the DataFrame.
df = df[important_columns]

Categories

Resources