Let's say I have the following dataframe:
import pandas as pd
# Sample frame: one row per item, tagged with a grouping flag and carrying
# three numeric characteristics.
data = {
    'Flag': ['a', 'b', 'a', 'b'],
    'Item': ['ball', 'car', 'pen', 'candy'],
    'Char1': [0, 0, 0, 0],
    'Char2': [23, 21, 19, 13],
    'Char3': [40, 43, 60, 70],
}
df = pd.DataFrame(data=data)
Now, let's perform some calculation:
# Per-row group statistics: every row receives the mean / sample standard
# deviation of its own Flag group. groupby().transform() computes each group
# once and broadcasts the result back onto the original rows, instead of
# re-filtering the whole frame for every single row.
df['Char1_avg'] = df.groupby('Flag')['Char1'].transform('mean')
df['Char1_std'] = df.groupby('Flag')['Char1'].transform('std')
df['Char2_avg'] = df.groupby('Flag')['Char2'].transform('mean')
df['Char2_std'] = df.groupby('Flag')['Char2'].transform('std')
df['Char3_avg'] = df.groupby('Flag')['Char3'].transform('mean')
df['Char3_std'] = df.groupby('Flag')['Char3'].transform('std')
Finally let's create the following dictionary:
# Input for the summary table. The first two rows of df belong to flags 'a'
# and 'b' respectively, so the leading two values of every statistics column
# are the per-flag results.
Flag_list = ['a','b']
first_two = df.head(2)
sum_dict = {
    'Flag': Flag_list,
    'Char1_average': first_two['Char1_avg'].tolist(),
    'Char1_std': first_two['Char1_std'].tolist(),
    'Char2_average': first_two['Char2_avg'].tolist(),
    'Char2_std': first_two['Char2_std'].tolist(),
    'Char3_average': first_two['Char3_avg'].tolist(),
    'Char3_std': first_two['Char3_std'].tolist(),
}
In this way all works fine,
correct dictionary
but I need to define a function that performs the same things, so I have written the following code:
# Asker's attempt at wrapping the per-column statistics in a function.
# NOTE(review): the paste has lost its indentation, so the nesting of the
# statements below cannot be read from here - TODO confirm against the post.
def fnctn(dataf):
param_list=["Char1", "Char2", 'Char3']
for param in param_list:
# Per-row mean / std of `param` over the rows that share the row's Flag.
dataf[f'{param}_avg'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].mean(), axis=1)
dataf[f'{param}_StDev'] = dataf.apply(lambda x: dataf[dataf.Flag == x.Flag][f'{param}'].std(), axis=1)
# sum_dict is assigned anew here rather than extended, so it only ever holds
# the entries of a single param.
# NOTE(review): `dref` is not defined anywhere in the visible code -
# presumably `dataf` was meant; confirm.
sum_dict = {'Flag':Flag_list,
f'{param}_average':dref[f'{param}_avg'].head(2).tolist(),
f'{param}_std':dref[f'{param}_StDev'].head(2).tolist()}
ref_avg_values = pd.DataFrame(sum_dict)
dataf = df.copy()
fnctn(dataf)
But this time the dictionary I get contains only the values of the last iteration:
wrong dictionary
How can I get the same dictionary as in the previous case?
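Each pass through the loop builds `sum_dict` from scratch, so only the values of the last parameter survive. Merge every iteration's entries into one dictionary that lives outside the loop (for example with `dict.update`), and build the DataFrame from that dictionary once the loop has finished.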
Here is the solution to your query:
def fnctn(dataf):
    """Add per-Flag mean/std columns to ``dataf`` and return a per-Flag summary.

    For each of ``Char1``, ``Char2`` and ``Char3`` two columns are added to
    ``dataf`` in place: ``<param>_avg`` and ``<param>_StDev``, holding the
    mean and sample standard deviation of that parameter over all rows that
    share the row's ``Flag``.

    Returns a new DataFrame with one row per distinct ``Flag`` (in order of
    first appearance) and the columns ``Flag``, ``<param>_average`` and
    ``<param>_std`` for every parameter. The flags are taken from the data
    itself, so no external flag list and no assumption about which rows come
    first is needed.
    """
    param_list = ["Char1", "Char2", "Char3"]
    summary = {}
    for param in param_list:
        # sort=False keeps the groups in order of first appearance.
        by_flag = dataf.groupby("Flag", sort=False)[param]
        mean_per_flag = by_flag.mean()
        std_per_flag = by_flag.std()

        # Broadcast the group statistics back onto every row of the group.
        dataf[f"{param}_avg"] = by_flag.transform("mean")
        dataf[f"{param}_StDev"] = by_flag.transform("std")

        # The Flag column is inserted once, ahead of all statistics columns.
        summary.setdefault("Flag", mean_per_flag.index.tolist())
        summary[f"{param}_average"] = mean_per_flag.tolist()
        summary[f"{param}_std"] = std_per_flag.tolist()
    return pd.DataFrame(summary)
dataf = df.copy()
fnctn(dataf)
And the answer is as below:
Related
# Per-column multiplier, keyed by column name.
multipliers = {'A' : 5, 'B' : 10, 'C' : 15, 'D' : 20}
# NOTE(review): pd.util.testing is a legacy testing namespace and is presumably
# unavailable in recent pandas releases - confirm before running.
df = pd.util.testing.makeDataFrame() # a random df with columns A,B,C,D
# f takes a column's data and that column's name, and scales the data by the
# multiplier registered for the name.
f = lambda x, col: multipliers[col] * x
Is there Pandas non-loop way to apply f to each column, like df.apply(f, axis = 0, ?)? What I can achieve with loop is
# Loop version: run f on every column, handing it the column's data together
# with the column's name.
df2 = df.copy()
for label in df.columns:
    column_values = df[label]
    df2[label] = f(column_values, label)
(real f is more complex than the above example, please treat f as a black box function of two variables, arg1 is data, arg2 is column name)
Use a lambda function with `DataFrame.apply`; to pass the column name, use `x.name`:
# Reference result produced with the explicit loop; the seed is fixed so the
# random frame is reproducible.
np.random.seed(2022)
multipliers = {'A': 5, 'B': 10, 'C': 15, 'D': 20}
df = pd.util.testing.makeDataFrame()  # random frame with columns A, B, C, D
f = lambda x, col: multipliers[col] * x

df2 = df.copy()
for label in df.columns:
    df2[label] = f(df[label], label)
print(df2.head())
A B C D
9CTWXXW3ys 2.308860 6.375789 5.362095 -23.354181
yq1PHBltEO 2.876024 1.950080 15.772909 -13.776645
lWtMioDq6A -11.206739 17.691500 -12.175996 25.957264
lEHcq1pxLr -6.510434 -6.004475 14.084401 13.999673
xvL04Y66tm -3.827731 -3.104207 -4.111277 1.440596
# Same result without an explicit loop: apply() hands each column to the
# lambda as a Series whose .name is the column label.
df2 = df.apply(lambda column: f(column, column.name))
print(df2.head())
A B C D
9CTWXXW3ys 2.308860 6.375789 5.362095 -23.354181
yq1PHBltEO 2.876024 1.950080 15.772909 -13.776645
lWtMioDq6A -11.206739 17.691500 -12.175996 25.957264
lEHcq1pxLr -6.510434 -6.004475 14.084401 13.999673
xvL04Y66tm -3.827731 -3.104207 -4.111277 1.440596
You can convert your dictionary to series and transform your function to vectorized operation. For example:
df * pd.Series(multipliers)
You can also use the method transform that accepts a dict of functions:
def func(var):
    """Return a one-argument function that multiplies its input by ``var``.

    Replace the returned lambda with your own per-column function. ``var``
    is bound at call time, so every call to ``func`` yields a function with
    its own multiplier.
    """
    return lambda x: x * var
df.transform({k: func(v) for k, v in multipliers.items()})
In the dataset df below, I want to flag the anomalies in all columns except A, B, C and L.
Any value less than 1500 or greater than 400000 is regarded as an anomaly.
import pandas as pd
# Initialise the data as a dict of lists, one list per column.
data = {
'A':['T1', 'T2', 'T3', 'T4', 'T5'],
'B':[1,1,1,1,1],
'C':[1,2,3,5,9],
'D':[12005, 18190, 1034, 15310, 31117],
'E':[11021, 19112, 19021, 12, 24509 ],
'F':[10022,19910, 19113,19999, 25519],
'G':[14029, 29100, 39022, 24509, 412262],
'H':[52119,32991,52883,69359,57835],
'J':[41218, 52991,55121,69152,79355],
'K': [43211,8199991,56881,212,77342],
'L': [1,0,1,0,0],
'M': [31211,42901,53818,62158,69325],
}
# Create the DataFrame from the dict above.
df = pd.DataFrame(data)
# Display the DataFrame (a bare expression: it only shows output in an
# interactive session or notebook).
df
Attempt:
# Columns that should never be flagged.
exclude_cols = ['A','B','C','L']
# Styling callback: given a Series `s`, produce the CSS for its values.
# NOTE(review): indentation was lost in the paste; the nesting of the lines
# below is inferred from the if/else keywords - TODO confirm.
def flag_outliers(s, exclude_cols):
if s.name in exclude_cols:
return '' # or None, or whatever df.style() needs
else:
# Non-numeric values become NaN instead of raising.
s = pd.to_numeric(s, errors='coerce')
# Anomaly: strictly below 1500 or strictly above 400000.
indexes = (s<1500)|(s>400000)
return ['background-color: red' if v else '' for v in indexes]
# NOTE(review): with axis=1 the callback is presumably handed rows, in which
# case s.name would be a row label rather than a column name - verify against
# the Styler.apply documentation.
df.style.apply(lambda s: flag_outliers(s, exclude_cols), axis=1)
Result of the code:
Desired output should look like this:
Thanks for the effort!
If you pass the columns to be checked as the `subset` argument of the `apply` function, you will get what you want.
exclude_cols = ['A','B','C','L']
def flag_outliers(s, exclude_cols):
    """Return one CSS string per value of ``s``, highlighting anomalies.

    A value is an anomaly when it is strictly below 1500 or strictly above
    400000; anomalies get a yellow background, everything else an empty
    style. Values that cannot be parsed as numbers are coerced to NaN and
    left unstyled. A Series whose name is in ``exclude_cols`` is left
    entirely unstyled.
    """
    if s.name in exclude_cols:
        # The styler needs one entry per value, so return a list of empty
        # styles rather than a bare string.
        return [''] * len(s)
    numeric = pd.to_numeric(s, errors='coerce')
    is_anomaly = (numeric < 1500) | (numeric > 400000)
    return ['background-color: yellow' if flagged else '' for flagged in is_anomaly]
# Style every column that is not excluded. Deriving the subset from
# exclude_cols means no column (for example 'M') can be forgotten.
df.style.apply(
    lambda s: flag_outliers(s, exclude_cols),
    axis=1,
    subset=[col for col in df.columns if col not in exclude_cols],
)
# Asker's loader: reads one CSV per year in range(start_year, end_year) and
# appends each to a single accumulated frame, which is returned.
# NOTE(review): indentation was lost in the paste; the nesting of the lines
# below cannot be read from here - TODO confirm against the original post.
def frame(dt_type, start_year, end_year, columns_req):
frame = pd.DataFrame()
for i in range (start_year, end_year):
# The file key is "<dt_type> <year>.csv", looked up in `uploaded`, which is
# defined outside this snippet.
file_name = f"{dt_type} {i}"
dataframe = pd.read_csv(BytesIO(uploaded["%s.csv"%file_name]))
# One branch per supported length of columns_req (1 to 4 columns).
# NOTE(review): `data` is not defined in this snippet, and the column lists on
# the 1- and 3-column branches are missing their closing `]`.
if len(columns_req) == 1:
df = pd.DataFrame(data, columns= [columns_req[0])
if len(columns_req) == 2:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1]])
if len(columns_req) == 3:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2])
if len(columns_req) == 4:
df = pd.DataFrame(data, columns= [columns_req[0], columns_req[1], columns_req[2], columns_req[3]])
# NOTE(review): this appends `dataframe` (the full CSV), not the
# column-restricted `df` built just above.
frame = frame.append(dataframe, ignore_index=True)
return (frame)
As you can see, the chain of if statements is repetitive and feels odd. I am new to programming. Is there any way to reduce that whole bunch of code?
you could do this
# columns_req is already a list of column names, so it can be passed as-is,
# whatever its length.
df = pd.DataFrame(data, columns=columns_req)
instead of all those if - conditions
I would like to apply the same background color to the cells that hold, for each PEOPLE instance, the name and the related name. I have tried df.style.applymap; it does not raise an error, but it does not seem to have any effect. Does anyone have an idea why? Thank you.
# Names of all CSS4 colours; not used again within this snippet.
clrs = list(mcolors.CSS4_COLORS.keys())
# NOTE(review): indentation was lost in the paste; the nesting of the lines
# below cannot be read from here - TODO confirm against the original post.
for k in range(len(PEOPLE)):
if PEOPLE[k].attribute == 'child':
# The two values whose cells should be highlighted for this person.
df1_data = [PEOPLE[k].name, PEOPLE[k].related]
# NOTE(review): applymap is given a list containing the lambda rather than the
# lambda itself, and the Styler it returns is not stored anywhere.
df.style.applymap([lambda x: 'background-color: yellow' if x in df1_data else 'background-color: red'])
# The export is called on df itself, not on a Styler.
df.to_excel('styledz.xlsx', engine='openpyxl')
Here is some more info on df.style. Here I'm using some simple example because I don't have your data available:
import pandas as pd
import numpy as np
# Demo frame: two random integer columns plus a boolean 'late' flag.
# The builtin bool is used for the cast: the np.bool alias was removed in
# NumPy 1.24, whereas the builtin works on every NumPy version.
df = pd.DataFrame({
    'a': np.random.randint(0, 10, 10),
    'b': np.random.randint(0, 10, 10),
    'late': np.random.choice([0, 1], 10).astype(bool),
})
def highlight_late(s):
    """Return one CSS string per cell of row ``s``.

    The whole row is coloured red when its ``'late'`` value is truthy and
    green otherwise.
    """
    # The colour depends only on the row's 'late' flag, so decide it once.
    colour = 'background-color: red' if s['late'] else 'background-color: green'
    return [colour] * len(s)
# Hold the Styler under its own name so that df stays a plain DataFrame and
# df.style remains usable afterwards.
styled = df.style.apply(highlight_late, axis=1)
styled.to_excel('style.xlsx', engine='openpyxl')
Looks in the excel file like this:
For cell based coloring use:
def highlight_late(s):
    """Return one CSS string per value of ``s``: red for truthy values, green for falsy ones."""
    return [
        'background-color: red' if is_late else 'background-color: green'
        for is_late in s
    ]
df = df.style.apply(highlight_late, subset=["late"], axis=1)
This gives you:
Basically your solution will be a modification of the following:
df = DataFrame([['mark', 2], ['mike', 4], ['manny', 6]], columns=['name', 'attribute'])
def style_row(row, people):
    """Return the CSS for every cell of ``row``.

    All cells start unstyled (empty string). When the row's ``"name"`` is in
    ``people``, the ``"attribute"`` cell gets a red background.
    """
    output = Series("", index=row.index)
    if row["name"] in people:
        output["attribute"] = "background-color:red;"
    return output
styler = df.style.apply(style_row, axis=1, people=['mark', 'manny'])
styler
My code won't work... it gives me ValueError: columns overlap but no suffix specified
import pandas as pd
import pickle
# Files to combine, in order.
# NOTE(review): the name `list` shadows the builtin for the rest of the script.
list = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
"ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]
maindf = pd.DataFrame()
# NOTE(review): indentation was lost in the paste; the nesting of the lines
# below is inferred from the for/if/else keywords - TODO confirm.
for x in list:
df = pd.read_csv(x)
# The first file seeds maindf; every later file is joined onto it.
if x == "ZILLOW2.csv":
maindf = pd.DataFrame(df)
else:
# join() is called here without lsuffix/rsuffix.
maindf = maindf.join(df)
print(maindf)
Use concat:
import pandas as pd
import pickle
# Files to combine, in order. Named csv_files so that the builtin `list` is
# not shadowed.
csv_files = ["ZILLOW2.csv", "ZILLOW3.csv", "ZILLOW4.csv",
             "ZILLOW6.csv", "ZILLOW7.csv", "ZILLOW8.csv"]

# Read every file, then place the frames side by side with one concat call
# (axis=1). Unlike join(), concat does not raise when column names overlap,
# and a single call needs no special case for the first file.
maindf = pd.concat([pd.read_csv(path) for path in csv_files], axis=1)
print(maindf)