How to get percentiles on groupby column in python? - python

I have a dataframe as below:
df = pd.DataFrame({'state': ['CA', 'WA', 'CO', 'AZ'] * 3,
'office_id': list(range(1, 7)) * 2,
'sales': [np.random.randint(100000, 999999) for _ in range(12)]})
To get the percentiles of sales for each state, I have written the code below:
pct_list1 = []
pct_list2 = []
for i in df['state'].unique().tolist():
pct_list1.append(i)
for j in range(0,101,10):
pct_list1.append(np.percentile(df[df['state'] == i]['sales'],j))
pct_list2.append(pct_list1)
pct_list1 = []
colnm_list1 = []
for k in range(0,101,10):
colnm_list1.append('perct_'+str(k))
colnm_list2 = ['state'] + colnm_list1
df1 = pd.DataFrame(pct_list2)
df1.columns = colnm_list2
df1
Can we optimize this code?
I feel that we could also use:
df1 = df[['state','sales']].groupby('state').quantile(0.1).reset_index(level=0)
df1.columns = ['state','perct_0']
for i in range(10,101,10):
df1.loc[:,('perct_'+str(i))] = df[['state','sales']].groupby('state').quantile(float(i/100.0)).reset_index(level=0)['sales']
If there are any other alternatives, please help.
Thanks.

How about this?
# Percentile levels 0, 10, ..., 100, matching the perct_0 .. perct_100 columns
# the question asks for. Building the fractions from integers avoids the
# float noise of np.arange(.1, 1, .1) (e.g. 0.30000000000000004), which would
# otherwise leak into the column labels.
percent_levels = range(0, 101, 10)
quants = [pct / 100 for pct in percent_levels]
# A single groupby pass returns a Series indexed by (state, quantile);
# unstack() moves the quantile level into columns, and set_axis() replaces the
# float labels with stable 'perct_<n>' names.
(df.groupby('state')['sales']
   .quantile(quants)
   .unstack()
   .set_axis([f'perct_{pct}' for pct in percent_levels], axis=1))

Related

Feature engineering, ValueError: Columns must be same length as key

I'm running into "ValueError: Columns must be same length as key" when trying to encode the column Type. Here is the code; I'm not sure which part is wrong.
df.head()
plt.figure(figsize=(7, 5))
sns.heatmap(df.isnull(), cmap='viridis')
df.isnull().any()
df.isnull().sum()
df['Rating'] = df['Rating'].fillna(df['Rating'].median())
replaces = [u'\u00AE', u'\u2013', u'\u00C3', u'\u00E3', u'\u00B3', '[', ']', "'"]
for i in replaces:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace(i, ''))
regex = [r'[-+|/:/;(_)#]', r'\s+', r'[A-Za-z]+']
for j in regex:
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : re.sub(j, '0', x))
df['Current Ver'] = df['Current Ver'].astype(str).apply(lambda x : x.replace('.', ',',1).replace('.', '').replace(',', '.',1)).astype(float)
df['Current Ver'] = df['Current Ver'].fillna(df['Current Ver'].median())
i = df[df['Category'] == '1.9'].index
df.loc[i]
df = df.drop(i)
df = df[pd.notnull(df['Last Updated'])]
df = df[pd.notnull(df['Content Rating'])]
le = preprocessing.LabelEncoder()
df['App'] = le.fit_transform(df['App'])
category_list = df['Category'].unique().tolist()
category_list = ['cat_' + word for word in category_list]
df = pd.concat([df, pd.get_dummies(df['Category'], prefix='cat')], axis=1)
le = preprocessing.LabelEncoder()
df['Genres'] = le.fit_transform(df['Genres'])
le = preprocessing.LabelEncoder()
df['Content Rating'] = le.fit_transform(df['Content Rating'])
df['Price'] = df['Price'].apply(lambda x : x.strip('$'))
df['Installs'] = df['Installs'].apply(lambda x : x.strip('+').replace(',', ''))
df['Type'] = pd.get_dummies(df['Type'])
You are trying to assign a DataFrame with multiple columns to a single column of the original DataFrame.
pd.get_dummies returns a DataFrame with a column for each value in the column.
If you want to add those values to the original DataFrame you can use concat.
Example:
import pandas as pd
df = pd.DataFrame(data=['type1', 'type2', 'type3'], columns=['Type'])
dummies_df = pd.get_dummies(df['Type'])
pd.concat([df, dummies_df], axis=1)

Dictionary creation inside a function

Let's say I have the following dataframe:
import pandas as pd
data = {'Flag':['a', 'b', 'a', 'b'],
'Item':['ball', 'car', 'pen', 'candy'],
'Char1':[0, 0, 0, 0],
'Char2':[23, 21, 19, 13],
'Char3':[40, 43, 60, 70]}
df = pd.DataFrame(data)
Now, let's perform some calculation:
df['Char1_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.mean(), axis=1)
df['Char1_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char1.std(), axis=1)
df['Char2_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.mean(), axis=1)
df['Char2_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char2.std(), axis=1)
df['Char3_avg'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.mean(), axis=1)
df['Char3_std'] = df.apply(lambda x: df[df.Flag == x.Flag].Char3.std(), axis=1)
Finally let's create the following dictionary:
Flag_list = ['a','b']
sum_dict = {'Flag':Flag_list,
'Char1_average':df['Char1_avg'].head(2).tolist(),
'Char1_std':df['Char1_std'].head(2).tolist(),
'Char2_average':df['Char2_avg'].head(2).tolist(),
'Char2_std':df['Char2_std'].head(2).tolist(),
'Char3_average':df['Char3_avg'].head(2).tolist(),
'Char3_std':df['Char3_std'].head(2).tolist()}
In this way all works fine,
correct dictionary
but I need to define a function that performs the same things, so I have written the following code:
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
for param in param_list:
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
sum_dict = {'Flag':Flag_list,
f'{param}_average':dref[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dref[f'{param}_StDev'].head(2).tolist()}
ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration:
wrong dictionary
How can I get the same dictionary as in the previous case?
You have to accumulate the results in a dictionary that is created before the loop and updated on each iteration; otherwise only the values from the last iteration are kept.
Here is the solution to your query:
def fnctn(dataf):
    """Summarise Char1..Char3 per Flag and annotate *dataf* with the results.

    For every parameter in ``Char1``, ``Char2``, ``Char3`` the function adds
    two columns to *dataf* in place: ``<param>_avg`` and ``<param>_StDev``,
    holding the mean and sample standard deviation of that parameter over all
    rows sharing the row's ``Flag``.

    Returns a new DataFrame with one row per distinct ``Flag`` (in order of
    first appearance) and the columns ``Flag``, ``<param>_average`` and
    ``<param>_std`` for each parameter.

    The per-flag values are taken from a groupby aggregation rather than from
    the first rows of *dataf*, so the result does not depend on the row order
    or on an externally defined list of flags.
    """
    param_list = ["Char1", "Char2", "Char3"]

    # One pass over the data: columns become a (param, statistic) MultiIndex.
    # sort=False keeps the flags in the order they first appear in dataf.
    stats = dataf.groupby("Flag", sort=False)[param_list].agg(["mean", "std"])

    dictie = {"Flag": stats.index.tolist()}
    for param in param_list:
        mean_by_flag = stats[(param, "mean")]
        std_by_flag = stats[(param, "std")]

        # Broadcast the per-flag statistic back onto every row of that flag.
        dataf[f"{param}_avg"] = dataf["Flag"].map(mean_by_flag)
        dataf[f"{param}_StDev"] = dataf["Flag"].map(std_by_flag)

        dictie[f"{param}_average"] = mean_by_flag.tolist()
        dictie[f"{param}_std"] = std_by_flag.tolist()

    return pd.DataFrame(dictie)
dataf = df.copy()
fnctn(dataf)
And the answer is as below:

How to transpose values from top few rows in python dataframe into new columns

I am trying to select the values from the top 3 records of each group in a python sorted dataframe and put them into new columns. I have a function that is processing each group but I am having difficulties finding the right method to extract, rename the series, then combine the result as a single series to return.
Below is a simplified example of an input dataframe (df_in) and the expected output (df_out):
import pandas as pd
data_in = { 'Product': ['A', 'A', 'A', 'A', 'B', 'C', 'C'],
'Price': [25.0, 30.5, 50.0, 61.5, 120.0, 650.0, 680.0],
'Qty': [15 , 13, 14, 10, 5, 2, 1]}
df_in = pd.DataFrame (data_in, columns = ['Product', 'Price', 'Qty'])
I am reproducing below 2 examples of the functions I've tested and trying to get a more efficient option that works, especially if I have to process many more columns and records.
The function best3_prices_v1 works, but I have to explicitly specify each column or variable, which becomes a problem as I add more columns.
def best3_prices_v1(x):
d = {}
# get best 3 records if records available, else set volumes as zeroes
best_price_lv1 = x.iloc[0].copy()
rec_with_zeroes = best_price_lv1.copy()
rec_with_zeroes['Price'] = 0
rec_with_zeroes['Qty'] = 0
recs = len(x) # number of records
if (recs == 1):
# 2nd and 3rd records not available
best_price_lv2 = rec_with_zeroes.copy()
best_price_lv3 = rec_with_zeroes.copy()
elif (recs == 2):
best_price_lv2 = x.iloc[1]
# 3rd record not available
best_price_lv3 = rec_with_zeroes.copy()
else:
best_price_lv2 = x.iloc[1]
best_price_lv3 = x.iloc[2]
# 1st best
d['Price_1'] = best_price_lv1['Price']
d['Qty_1'] = best_price_lv1['Qty']
# 2nd best
d['Price_2'] = best_price_lv2['Price']
d['Qty_2'] = best_price_lv2['Qty']
# 3rd best
d['Price_3'] = best_price_lv3['Price']
d['Qty_3'] = best_price_lv3['Qty']
# return combined results as a series
return pd.Series(d, index=['Price_1', 'Qty_1', 'Price_2', 'Qty_2', 'Price_3', 'Qty_3'])
Codes to call function:
# sort dataframe by Product and Price
df_in.sort_values(by=['Product', 'Price'], ascending=True, inplace=True)
# get best 3 prices and qty as new columns
df_out = df_in.groupby(['Product']).apply(best3_prices_v1).reset_index()
Second attempt to improve/reduce codes and explicit names for each variable ... not complete and not working.
def best3_prices_v2(x):
d = {}
# get best 3 records if records available, else set volumes as zeroes
best_price_lv1 = x.iloc[0].copy()
rec_with_zeroes = best_price_lv1.copy()
rec_with_zeroes['Price'] = 0
rec_with_zeroes['Qty'] = 0
recs = len(x) # number of records
if (recs == 1):
# 2nd and 3rd records not available
best_price_lv2 = rec_with_zeroes.copy()
best_price_lv3 = rec_with_zeroes.copy()
elif (recs == 2):
best_price_lv2 = x.iloc[1]
# 3rd record not available
best_price_lv3 = rec_with_zeroes.copy()
else:
best_price_lv2 = x.iloc[1]
best_price_lv3 = x.iloc[2]
stats_columns = ['Price', 'Qty']
# get records values for best 3 prices
d_lv1 = best_price_lv1[stats_columns]
d_lv2 = best_price_lv2[stats_columns]
d_lv3 = best_price_lv3[stats_columns]
# How to rename (keys?) or combine values to return?
lv1_stats_columns = [c + '_1' for c in stats_columns]
lv2_stats_columns = [c + '_2' for c in stats_columns]
lv3_stats_columns = [c + '_3' for c in stats_columns]
# return combined results as a series
return pd.Series(d, index=lv1_stats_columns + lv2_stats_columns + lv3_stats_columns)
Let's unstack():
df_in=(df_in.set_index([df_in.groupby('Product').cumcount().add(1),'Product'])
.unstack(0,fill_value=0))
df_in.columns=[f"{x}_{y}" for x,y in df_in]
df_in=df_in.reset_index()
OR via pivot()
df_in=(df_in.assign(key=df_in.groupby('Product').cumcount().add(1))
.pivot('Product','key',['Price','Qty'])
.fillna(0,downcast='infer'))
df_in.columns=[f"{x}_{y}" for x,y in df_in]
df_in=df_in.reset_index()
Based on @AnuragDabas's pivot solution and @ceruler's feedback above, I can now expand it to a more general problem.
New dataframe with more groups and columns:
data_in = { 'Product': ['A', 'A', 'A', 'A', 'B', 'C', 'C'],
'Model': ['A1', 'A1', 'A1', 'A2', 'B1', 'C1', 'C1'],
'Price': [25.0, 30.5, 50.0, 61.5, 120.0, 650.0, 680.0],
'Qty': [15 , 13, 14, 10, 5, 2, 1],
'Ratings': [9, 7, 8, 10, 6, 7, 8 ]}
df_in = pd.DataFrame (data_in, columns = ['Product', 'Model' ,'Price', 'Qty', 'Ratings'])
group_list = ['Product', 'Model']
stats_list = ['Price','Qty', 'Ratings']
df_out = df_in.groupby(group_list).head(3)
df_out=(df_out.assign(key=df_out.groupby(group_list).cumcount().add(1))
.pivot(group_list,'key', stats_list)
.fillna(0,downcast='infer'))
df_out.columns=[f"{x}_{y}" for x,y in df_out]
df_out = df_out.reset_index()

Why does the export of a styled pandas dataframe to Excel not work?

For each PEOPLE instance, I would like to apply the same background color to the cells that contain its name and its related name. I have tried df.style.applymap; it does not raise an error, but it does not seem to have any effect. Does anyone have an idea why? Thank you.
clrs = list(mcolors.CSS4_COLORS.keys())
for k in range(len(PEOPLE)):
if PEOPLE[k].attribute == 'child':
df1_data = [PEOPLE[k].name, PEOPLE[k].related]
df.style.applymap([lambda x: 'background-color: yellow' if x in df1_data else 'background-color: red'])
df.to_excel('styledz.xlsx', engine='openpyxl')
Here is some more info on df.style. Here I'm using some simple example because I don't have your data available:
import pandas as pd
import numpy as np
# Small random example frame: two integer columns plus a boolean 'late' flag.
# The builtin bool is used for the cast because the np.bool alias was removed
# in NumPy 1.24 and raises AttributeError there.
df = pd.DataFrame({'a': np.random.randint(0, 10, 10), 'b': np.random.randint(0, 10, 10), 'late': np.random.choice([0, 1], 10).astype(bool)})
def highlight_late(s):
    """Return one CSS declaration per cell of row *s*, coloured by its 'late' value.

    Every cell in the row gets a red background when ``s['late']`` is truthy
    and a green background otherwise. Intended for use with
    ``df.style.apply(highlight_late, axis=1)``, where *s* is one row.
    """
    # The colour depends only on the row's 'late' entry, so look it up once
    # instead of once per cell.
    colour = 'background-color: red' if s['late'] else 'background-color: green'
    return [colour] * len(s)
df = df.style.apply(highlight_late, axis=1)
df.to_excel('style.xlsx', engine='openpyxl')
Looks in the excel file like this:
For cell based coloring use:
def highlight_late(s):
return ['background-color: red' if s_ else 'background-color: green' for s_ in s]
df = df.style.apply(highlight_late, subset=["late"], axis=1)
This gives you:
Basically your solution will be a modification of the following:
df = DataFrame([['mark', 2], ['mike', 4], ['manny', 6]], columns=['name', 'attribute'])
def style_row(row, people):
output = Series("", index=row.index)
if row["name"] in people:
output['attribute'] = "background-color:red;"
return output
styler = df.style.apply(style_row, axis=1, people=['mark', 'manny'])
styler

Why is my for loop overwriting my output?

I understand what I am doing wrong, but not how to fix it.
I am trying to write "Z" to excel and then save once I have looped through all my list entries, but the index in the DF resets every iteration and overwrites so I only see my last list entry and not all 5.
Any help is appreciated.
orig_df = (pd.read_excel("abc.xlsx"))
writer = pd.ExcelWriter('NEW_Frame.xlsx', engine='xlsxwriter')
List1 = ['Market_A', 'Market_B', 'Market_C', 'Market_D', 'Market_E']
new_df = pd.DataFrame(columns=['Location','Data1','Data2','Data3'], index=range(5))
for i in range(len(List1)) :
M = List1[i]
P = List1[i]
M = abc[abc.Location.str.contains(M)]
Z = [{'Location': P , 'Data1': abc['Data1'].sum(), 'Data2': abc['Data2'].sum(), 'Data3':
abc['Data3'].sum(),}]
Z = pd.DataFrame(Z)
Z.to_excel(writer, sheet_name=P)
i += 1
writer.save()
Since I don't have your input file, I am just saving a small test dataframe, but I had no trouble saving all 5 sheets with this:
# ExcelWriter is used as a context manager: leaving the block writes and
# closes the workbook. This replaces writer.save(), which was removed in
# pandas 2.0.
with pd.ExcelWriter('NEW_Frame.xlsx', engine='xlsxwriter') as writer:
    # One sheet per market, each holding the same small test frame.
    for sheet_name in ['Market_A', 'Market_B', 'Market_C', 'Market_D', 'Market_E']:
        df = pd.DataFrame({'test1': [1, 1, 1], 'test2': [2, 2, 2]})
        df.to_excel(writer, sheet_name=sheet_name)
Could you try this and see if you have any issues?
# Read the source workbook once; every market summary is derived from it.
orig_df = pd.read_excel("abc.xlsx")
List1 = ['Market_A', 'Market_B', 'Market_C', 'Market_D', 'Market_E']

# Collect one summary row per market so that all of them end up in a single
# frame instead of each iteration overwriting the previous one.
rows = []
for market in List1:
    # Keep only the rows whose Location mentions this market. The match is a
    # literal substring test, and rows with a missing Location are excluded
    # instead of producing NaN in the boolean mask.
    subset = orig_df[orig_df['Location'].str.contains(market, regex=False, na=False)]
    # Sum over the filtered rows, not over the whole file.
    rows.append({
        'Location': market,
        'Data1': subset['Data1'].sum(),
        'Data2': subset['Data2'].sum(),
        'Data3': subset['Data3'].sum(),
    })

# A list of dicts yields one DataFrame row per market.
Z = pd.DataFrame(rows, columns=['Location', 'Data1', 'Data2', 'Data3'])

# Leaving the with-block writes and closes the workbook; writer.save() was
# removed in pandas 2.0. The sheet is named after the last market, as in the
# original snippet.
with pd.ExcelWriter('NEW_Frame.xlsx', engine='xlsxwriter') as writer:
    Z.to_excel(writer, sheet_name=List1[-1])

Categories

Resources