Feature engineering, ValueError: Columns must be same length as key - python

I'm running into a ValueError: Columns must be same length as key when trying to encode the column Type. Here is the code; I'm not sure which part is wrong.
df.head()

# Visualise missing values
plt.figure(figsize=(7, 5))
sns.heatmap(df.isnull(), cmap='viridis')
df.isnull().any()
df.isnull().sum()

# Fill missing ratings with the median
df['Rating'] = df['Rating'].fillna(df['Rating'].median())

# Clean up the 'Current Ver' column
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x: x.replace(i, ''))

regex = [r'[-+|/:/;(_)#]', r'\s+', r'[A-Za-z]+']
for j in regex:
    df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x: re.sub(j, '0', x))

# Keep only the first dot as the decimal separator, then convert to float
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x: x.replace('.', ',', 1).replace('.', '').replace(',', '.', 1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())

# Drop the malformed row where Category is '1.9'
i = df[df['Category'] == '1.9'].index
df.loc[i]
df = df.drop(i)

df = df[pd.notnull(df['Last Updated'])]
df = df[pd.notnull(df['Content Rating'])]

# Label-encode the categorical columns
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])

category_list = df['Category'].unique().tolist()
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)

le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])

df['Price'] = df['Price'].apply(lambda x: x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x: x.strip('+').replace(',', ''))

# This line raises the ValueError
df['Type'] = pd.get_dummies(df['Type'])

You are trying to assign a DataFrame with multiple columns to a single column of the original DataFrame.
pd.get_dummies returns a DataFrame with a column for each value in the column.
If you want to add those values to the original DataFrame you can use concat.
Example:
import pandas as pd
df = pd.DataFrame(data=['type1', 'type2', 'type3'], columns=['Type'])
dummies_df = pd.get_dummies(df['Type'])
pd.concat([df, dummies_df], axis=1)
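Applied to the question's Type column, the same pattern would look like the sketch below; the prefix argument and the drop are optional choices, not part of the original code:
# Sketch: concatenate the Type dummies instead of assigning them to one column
df = pd.concat([df, pd.get_dummies(df['Type'], prefix='type')], axis=1)
df = df.drop('Type', axis=1)  # optionally drop the original column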

Dictionary creation inside a function

Let's say I have the following dataframe:
import pandas as pd
data = {'Flag': ['a', 'b', 'a', 'b'],
        'Item': ['ball', 'car', 'pen', 'candy'],
        'Char1': [0, 0, 0, 0],
        'Char2': [23, 21, 19, 13],
        'Char3': [40, 43, 60, 70]}
df = pd.DataFrame(data)
Now, let's perform some calculation:
df['Char1_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.mean(), axis=1)
df['Char1_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.std(), axis=1)
df['Char2_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.mean(), axis=1)
df['Char2_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.std(), axis=1)
df['Char3_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.mean(), axis=1)
df['Char3_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.std(), axis=1)
Finally let's create the following dictionary:
Flag_list = ['a','b']
sum_dict = {'Flag': Flag_list,
            'Char1_average': df['Char1_avg'].head(2).tolist(),
            'Char1_std': df['Char1_std'].head(2).tolist(),
            'Char2_average': df['Char2_avg'].head(2).tolist(),
            'Char2_std': df['Char2_std'].head(2).tolist(),
            'Char3_average': df['Char3_avg'].head(2).tolist(),
            'Char3_std': df['Char3_std'].head(2).tolist()}
This way everything works fine and I get the correct dictionary, but I need a function that does the same thing, so I have written the following code:
def fnctn(dataf):
    param_list = ["Char1", "Char2", 'Char3']
    for param in param_list:
        dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
        dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
        sum_dict = {'Flag': Flag_list,
                    f'{param}_average': dataf[f'{param}_avg'].head(2).tolist(),
                    f'{param}_std': dataf[f'{param}_StDev'].head(2).tolist()}
    ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration (a wrong dictionary):
How can I get the same dictionary as in the previous case?
You have to update a dictionary that lives outside the loop, so that the values from every iteration are kept. Here is a solution to your query:
def fnctn(dataf):
    param_list = ["Char1", "Char2", 'Char3']
    dictie = {}
    for param in param_list:
        dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
        dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
        sum_dict = {'Flag': Flag_list,
                    f'{param}_average': dataf[f'{param}_avg'].head(2).tolist(),
                    f'{param}_std': dataf[f'{param}_StDev'].head(2).tolist()}
        dictie.update(sum_dict)
    return pd.DataFrame(dictie)

dataf = df.copy()
fnctn(dataf)
The returned DataFrame now contains the averages and standard deviations for all three parameters.
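As a side note, the per-row apply recomputes the group statistics for every row. A more idiomatic sketch of the same summary uses groupby, assuming the goal is one row of statistics per flag:
# Sketch: compute per-Flag mean and std for all three columns in one pass
summary = df.groupby('Flag')[['Char1', 'Char2', 'Char3']].agg(['mean', 'std'])
summary.columns = [f'{col}_{stat}' for col, stat in summary.columns]
summary = summary.reset_index()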

How to flag outliers/anomalies in selected columns in python?

In the dataset df below, I want to flag the anomalies in all columns except A, B, C and L.
Any value less than 1500 or greater than 400000 is regarded as an anomaly.
import pandas as pd
# initialise data of lists
data = {
    'A': ['T1', 'T2', 'T3', 'T4', 'T5'],
    'B': [1, 1, 1, 1, 1],
    'C': [1, 2, 3, 5, 9],
    'D': [12005, 18190, 1034, 15310, 31117],
    'E': [11021, 19112, 19021, 12, 24509],
    'F': [10022, 19910, 19113, 19999, 25519],
    'G': [14029, 29100, 39022, 24509, 412262],
    'H': [52119, 32991, 52883, 69359, 57835],
    'J': [41218, 52991, 55121, 69152, 79355],
    'K': [43211, 8199991, 56881, 212, 77342],
    'L': [1, 0, 1, 0, 0],
    'M': [31211, 42901, 53818, 62158, 69325],
}
# Create DataFrame
df = pd.DataFrame(data)
# Print the output.
df
Attempt:
exclude_cols = ['A', 'B', 'C', 'L']

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        return ''  # or None, or whatever df.style() needs
    else:
        s = pd.to_numeric(s, errors='coerce')
        indexes = (s < 1500) | (s > 400000)
        return ['background-color: red' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1)
Result of the code:
Desired output should look like this:
Thanks for the effort!
If you pass the subset argument to the apply function, you will get what you want.
exclude_cols = ['A', 'B', 'C', 'L']

def flag_outliers(s, exclude_cols):
    if s.name in exclude_cols:
        print(s.name)
        return ''  # or None, or whatever df.style() needs
    else:
        s = pd.to_numeric(s, errors='coerce')
        indexes = (s < 1500) | (s > 400000)
        return ['background-color: yellow' if v else '' for v in indexes]

df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=['D', 'E', 'F', 'G', 'H', 'J', 'K'])
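If you would rather not hard-code the column list twice, the subset can be derived from exclude_cols; a small sketch, assuming every remaining column should be checked:
# Sketch: build the style subset from the exclusion list
subset_cols = [c for c in df.columns if c not in exclude_cols]
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1, subset=subset_cols)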

How to select column if string is in column name

So I have a dict of dataframes with many columns, and I want to select all the columns that have the string 'important' in their name.
Some of the frames may have important_0 or important_9_0 as a column name. How can I select those columns and put them into their own new dictionary, keeping all the values each column contains?
import pandas as pd

df = pd.DataFrame(columns=['a', 'b', 'important_c'])
selected_cols = [c for c in df.columns if c.startswith('important_')]
print(selected_cols)
# ['important_c']

The same selection applied to every frame in the dict:

dict_df = {x: pd.DataFrame(columns=['a', 'b', 'important_c']) for x in range(3)}
new_dict = {x: dict_df[x][[c for c in dict_df[x].columns if c.startswith('important_')]] for x in dict_df}

Alternatively, match 'important' anywhere in the column name:

important_columns = [x for x in df.columns if 'important' in x]
# keep only the columns you need
df = df[important_columns]
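pandas also ships a shortcut for this kind of selection; a sketch using DataFrame.filter, where like matches the substring anywhere in the name:
# Sketch: filter(like=...) keeps only columns whose name contains 'important'
new_dict = {k: v.filter(like='important') for k, v in dict_df.items()}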

Joining dataframe in for loop

My code won't work; it gives me ValueError: columns overlap but no suffix specified.
import pandas as pd
import pickle

list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
        "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]

maindf = pd.DataFrame()
for x in list:
    df = pd.read_csv(x)
    if x == "ZILLOW2.csv":
        maindf = pd.DataFrame(df)
    else:
        maindf = maindf.join(df)
print(maindf)
Use concat:
import pandas as pd
import pickle

list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
        "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]

maindf = pd.DataFrame()
for x in list:
    df = pd.read_csv(x)
    if x == "ZILLOW2.csv":
        maindf = pd.DataFrame(df)
    else:
        maindf = pd.concat([maindf, df], axis=1)
print(maindf)
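With concat the if/else is no longer needed at all; a sketch that collapses the loop, using the same file list as above (note that list shadows the built-in name, so renaming it to something like files would be cleaner):
# Sketch: read every CSV and concatenate the resulting frames side by side
maindf = pd.concat([pd.read_csv(x) for x in list], axis=1)
print(maindf)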

How to get percentiles on groupby column in python?

I have a dataframe as below:
import pandas as pd
import numpy as np

df = pd.DataFrame({'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
                   'office_id': list(range(1, 7)) * 2,
                   'sales': [np.random.randint(100000, 999999) for _ in range(12)]})
To get state-wise percentiles of sales, I have written the code below:
pct_list1 = []
pct_list2 = []
for i in df['state'].unique().tolist():
    pct_list1.append(i)
    for j in range(0, 101, 10):
        pct_list1.append(np.percentile(df[df['state'] == i]['sales'], j))
    pct_list2.append(pct_list1)
    pct_list1 = []

colnm_list1 = []
for k in range(0, 101, 10):
    colnm_list1.append('perct_' + str(k))
colnm_list2 = ['state'] + colnm_list1

df1 = pd.DataFrame(pct_list2)
df1.columns = colnm_list2
df1
Can we optimize this code? I feel that we can also use:
df1 = df[['state', 'sales']].groupby('state').quantile(0.1).reset_index(level=0)
df1.columns = ['state', 'perct_0']
for i in range(10, 101, 10):
    df1.loc[:, ('perct_' + str(i))] = df[['state', 'sales']].groupby('state').quantile(float(i / 100.0)).reset_index(level=0)['sales']
If there are any other alternatives, please help. Thanks.
How about this?
quants = np.arange(.1, 1, .1)
pd.concat([df.groupby('state')['sales'].quantile(x) for x in quants],
          axis=1, keys=[str(x) for x in quants])
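Note that quantile also accepts a list of quantiles directly, which covers the question's full 0-100 range in one call; a sketch:
# Sketch: compute all percentiles at once and pivot them into columns
pcts = [i / 100 for i in range(0, 101, 10)]
out = df.groupby('state')['sales'].quantile(pcts).unstack()
out.columns = ['perct_' + str(int(q * 100)) for q in out.columns]
out = out.reset_index()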
