Pandas set element style dependent on another dataframe mith multi index - python

I have previously asked the question Pandas set element style dependent on another dataframe, which I have a working solution to, but now I am trying to apply it to a data frame with a multi index and I am getting an error, which I do not understand.
Problem
I have a pandas df and accompanying boolean matrix. I want to highlight the df depending on the boolean matrix.
Data
import pandas as pd
import numpy as np
from datetime import datetime
date = pd.date_range(start = datetime(2016,1,1), end = datetime(2016,2,1), freq = "D")
i = len(date)
dic = {'X':pd.DataFrame(np.random.randn(i, 2),index = date, columns = ['A','B']),
'Y':pd.DataFrame(np.random.randn(i, 2),index = date, columns = ['A','B']),
'Z':pd.DataFrame(np.random.randn(i, 2),index = date, columns = ['A','B'])}
df = pd.concat(dic.values(),axis=1,keys=dic.keys())
boo = [True, False]
bool_matrix = {'X':pd.DataFrame(np.random.choice(boo, (i,2), p=[0.3,.7]), index = date, columns = ['A','B']),
'Y':pd.DataFrame(np.random.choice(boo, (i,2), p=[0.3,.7]), index = date, columns = ['A','B']),
'Z':pd.DataFrame(np.random.choice(boo, (i,2), p=[0.3,.7]), index = date, columns = ['A','B'])}
bool_matrix =pd.concat(bool_matrix.values(),axis=1,keys=bool_matrix.keys())
My attempted solution
def highlight(value):
return 'background-color: green'
my_style = df.style
for column in df.columns:
for i in df[column].index:
data = bool_matrix.loc[i, column]
if data:
my_style = df.style.use(my_style.export()).applymap(highlight, subset = pd.IndexSlice[i, column])
my_style
Results
The above throws an AttributeError: 'Series' object has no attribute 'applymap'
I do not understand what is returning as a Series. This is a single value I am subsetting and this solution worked for non multi-indexed df's as shown below.
Without Multi-index
import pandas as pd
import numpy as np
from datetime import datetime
np.random.seed(24)
date = pd.date_range(start = datetime(2016,1,1), end = datetime(2016,2,1), freq = "D")
df = pd.DataFrame({'A': np.linspace(1, 100, len(date))})
df = pd.concat([df, pd.DataFrame(np.random.randn(len(date), 4), columns=list('BCDE'))],
axis=1)
df['date'] = date
df.set_index("date", inplace = True)
boo = [True, False]
bool_matrix = pd.DataFrame(np.random.choice(boo, (len(date), 5),p=[0.3,.7]), index = date,columns=list('ABCDE'))
def highlight(value):
return 'background-color: green'
my_style = df.style
for column in df.columns:
for i in bool_matrix.index:
data = bool_matrix.loc[i, column]
if data:
my_style = df.style.use(my_style.export()).applymap(highlight, subset = pd.IndexSlice[i,column])
my_style
Documentation
The docs make reference to CSS Classes and say that "Index label cells include level where k is the level in a MultiIndex." I am obviouly indexing this wrong, but am stumped on how to proceed.

It's very nice that there is a runable example.
You can use df.style.apply(..., axis=None) to apply a highlight method to the whole dataframe.
With your df and bool_matrix, try this:
def highlight(value):
d = value.copy()
for c in d.columns:
for r in df.index:
if bool_matrix.loc[r, c]:
d.loc[r, c] = 'background-color: green'
else:
d.loc[r, c] = ''
return d
df.style.apply(highlight, axis=None)
Or to make codes simple, you can try:
def highlight(value):
return bool_matrix.applymap(lambda x: 'background-color: green' if x else '')
df.style.apply(highlight, axis=None)
Hope this is what you need.

Related

Pandas Conditional formatting by comparing the column values of dataframe

import io
import pandas as pd
csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''
df = pd.read_csv(io.StringIO(csv_data), sep=",")
html_table = df.tohtml()
Is there a way to compare the values of columns in dataframe and use it in conditional formatting ? I want compare if the 'prod','pre-prod' and 'stage' values are mismatching, if yes then then its bg-color should be red. I have tired the following methods present in pandas but none of them works.
df.style.apply()
df.style.apply_index()
df.style.applymap()
Current Output:
Desired output:
You can add style conditionally by applying style to a subset of your dataframe like:
import io
import pandas as pd
csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''
def add_color(row):
return ['background-color: red'] * len(row)
df = pd.read_csv(io.StringIO(csv_data), sep=",")
df.loc[(df["pre-prod"] == df["prod"]) & (df["prod"] == df["stage"])].style.apply(add_color, axis=1)
import io
import pandas as pd
csv_data = '''
App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
'''
df = pd.read_csv(io.StringIO(csv_data), sep=",")
def match_checker(row):
if row['prod'] == row['pre-prod'] == row['stage']:
return [''] * len(row)
else:
return ['background-color: red'] * len(row)
df = df.style.apply(match_checker, axis=1)
html_table = df.to_html()
with open('testpandas.html','w+') as html_file:
html_file.write(html_table)
html_file.close()
Updated #PeterSmith answer.
It's also possible to style the entire DataFrame in one go by passing axis=None to apply.
We can identify rows which have differing values in the specified columns by comparing the first column (column 0) with the remaining columns (column 1-2) and identifying where there are unequal values using ne on axis=0.
df[['prod', 'stage']].ne(df['pre-prod'], axis=0)
# prod stage
# 0 False False
# 1 False True
Then we can check across rows for any rows which have any True values (meaning there is something that's not equal in the row).
df[['prod', 'stage']].ne(df['pre-prod'], axis=0).any(axis=1)
# 0 False
# 1 True
# dtype: bool
We can then simply apply the styles anywhere there's a True value in the resulting Series.
Altogether this could look something like:
def colour_rows_that_dont_match(df_: pd.DataFrame, comparison_cols: List[str]):
# Sanity check that comparison_cols is what we expect
assert isinstance(comparison_cols, list) and len(comparison_cols) > 1, \
'Must be a list and provide at least 2 column to compare'
# Create an Empty DataFrame to hold styles of the same shape as the original df
styles_df = pd.DataFrame('', index=df_.index, columns=df_.columns)
# Compare the first column's (col 0) values to the remaining columns.
# Find rows where any values are not equal (ne)
rows_that_dont_match = df[comparison_cols[1:]].ne(df[comparison_cols[0]], axis=0).any(axis=1)
# Apply styles to rows which meet the above criteria
styles_df.loc[rows_that_dont_match, :] = 'background-color: red'
return styles_df
df.style.apply(
colour_rows_that_dont_match,
# This gets passed to the function
comparison_cols=['pre-prod', 'prod', 'stage'],
# Apply to the entire DataFrame at once
axis=None
).to_html(buf='test_df.html')
Which produces the following:
Setup, version, and imports:
from typing import List
import pandas as pd # version 1.5.2
df = pd.DataFrame({
'App_name': ['matching-image', 'mismatching-image'],
'pre-prod': ['nginx', 'nginx'],
'prod': ['nginx', 'nginx'],
'stage': ['nginx', 'nginx:1.23.3-alpine']
})

Pandas include single row in df after filtering with .loc

So, in this function:
def filter_by_freq(df, frequency):
filtered_df = df.copy()
if frequency.upper() == 'DAY':
pass
else:
date_obj = filtered_df['Date'].values[0]
target_day = pd.to_datetime(date_obj).day
target_month = pd.to_datetime(date_obj).month
final_date_obj = filtered_df['Date'].values[-1]
if frequency.upper() == 'MONTH':
filtered_df = filtered_df.loc[filtered_df['Date'].dt.day.eq(target_day)]
elif frequency.upper() == 'YEAR':
filtered_df = filtered_df.loc[filtered_df['Date'].dt.day.eq(target_day)]
filtered_df = filtered_df.loc[filtered_df['Date'].dt.month.eq(target_month)]
return filtered_df
How can I also include in the .loc the very last row from the original df? Tried doing (for month frequency): filtered_df = filtered_df.loc[(filtered_df['Date'].dt.day.eq(target_day)) | (filtered_df['Date'].dt.date.eq(final_date_obj))] but didn't work.
Thanks for your time!
Here's one way you could do it. In this example I have a df and I want to filter out all rows that have c1 > 0.5, but I want to keep the last row no matter what. I create a boolean series called lte_half to keep track of the first condition, and then I create another boolean series/list/array (all interchangeable) called end_ind which is True only for the last row. The filtered table is created by taking all rows that pass either condition with the |
import pandas as pd
import numpy as np
np.random.seed(0)
df = pd.DataFrame({'c1':np.random.rand(20)})
lte_half = df['c1'].le(0.5)
end_ind = df.index == df.index[-1]
filt_df = df[lte_half | end_ind]
print(filt_df)

Too many columns resulting in `PerformanceWarning: DataFrame is highly fragmented`

I have a list of filepaths in the first column of a dataframe. My goal is to create a second column that represents file categories, with categories reflecting the words in the filepath.
import pandas as pd
import numpy as np
data = {'filepath': ['C:/barracuda/document.doc', 'C:/dog/document.doc', 'C:/cat/document.doc']
}
df = pd.DataFrame(data)
df["Animal"] =(df['filepath'].str.contains("dog|cat",case=False,regex=True))
df["Fish"] =(df['filepath'].str.contains("barracuda",case=False))
df = df.loc[:, 'filepath':'Fish'].replace(True, pd.Series(df.columns, df.columns))
df = df.loc[:, 'filepath':'Fish'].replace(False,np.nan)
def squeeze_nan(x):
original_columns = x.index.tolist()
squeezed = x.dropna()
squeezed.index = [original_columns[n] for n in range(squeezed.count())]
return squeezed.reindex(original_columns, fill_value=np.nan)
df = df.apply(squeeze_nan, axis=1)
print(df)
This code works. The problem arises when I have 200 statements beginning with df['columnName'] =. Because I have so many, I get the error:
PerformanceWarning: DataFrame is highly fragmented. This is usually the result of calling frame.insert many times, which has poor performance. Consider joining all columns at once using pd.concat(axis=1) instead. To get a de-fragmented frame, use newframe = frame.copy()
To fix this I have tried:
dfAnimal = df.copy
dfAnimal['Animal'] = dfAnimal['filepath'].str.contains("dog|cat",case=False,regex=True)
dfFish = df.copy
dfFish["Fish"] =dfFish['filepath'].str.contains("barracuda",case=False)
df = pd.concat(dfAnimal,dfFish)
The above gives me errors such as method object is not iterable and method object is not subscriptable. I then tried df = df.loc[df['filepath'].isin(['cat','dog'])] but this only works when 'cat' or 'dog' is the only word in the column. How do I avoid the performance error?
Try creating all your new columns in a dict, and then convert that dict into a dataframe, and then use pd.concat to add the resulting dataframe (containing the new columns) to the original dataframe:
new_columns = {
'Animal': df['filepath'].str.contains("dog|cat",case=False,regex=True),
'Fish': df['filepath'].str.contains("barracuda",case=False),
}
new_df = pd.DataFrame(new_columns)
df = pd.concat([df, new_df], axis=1)
Added to your original code, it would be something like this:
import pandas as pd
import numpy as np
data = {'filepath': ['C:/barracuda/document.doc', 'C:/dog/document.doc', 'C:/cat/document.doc']
}
df = pd.DataFrame(data)
##### These are the new lines #####
new_columns = {
'Animal': df['filepath'].str.contains("dog|cat",case=False,regex=True),
'Fish': df['filepath'].str.contains("barracuda",case=False),
}
new_df = pd.DataFrame(new_columns)
df = pd.concat([df, new_df], axis=1)
##### End of new lines #####
df = df.loc[:, 'filepath':'Fish'].replace(True, pd.Series(df.columns, df.columns))
df = df.loc[:, 'filepath':'Fish'].replace(False,np.nan)
def squeeze_nan(x):
original_columns = x.index.tolist()
squeezed = x.dropna()
squeezed.index = [original_columns[n] for n in range(squeezed.count())]
return squeezed.reindex(original_columns, fill_value=np.nan)
df = df.apply(squeeze_nan, axis=1)
print(df)

Python pandas DF question : trying to drop a column but nothing happens (df.drop) - code also runs without any errors

I am trying to delete a column called Rank but nothing happens. The remaining code all executes without any issue but the column itself remains in the output file. I've highlighted the part of the code that is not working.
def read_csv():
file = "\mona" + yday+".csv"
#df=[]
df = pd.read_csv(save_path+file,skiprows=3,encoding = "ISO-8859-1",error_bad_lines=False)
return df
# replace . with / in column EPIC
def tickerchange():
df=read_csv()
df['EPIC'] = df['EPIC'].str.replace('.','/')
return df
def consolidate_AB_listings():
df=tickerchange()
Aline = df.loc[(df['EPIC'] =='RDSA'),'Mkt Cap (àm)']
Bline = df.loc[(df['EPIC'] =='RDSB'),'Mkt Cap (àm)']
df.loc[(df['EPIC'] =='RDSA'),'Mkt Cap (àm)']= float(Aline) + float(Bline)
df = df.loc[(df.Ind != 'I/E')]
df = df.loc[(df.Ind != 'FL')]
df = df.loc[(df.Ind != 'M')]
df = df.loc[(df.EPIC != 'RDSB')]
return df
def ranking_mktcap():
df = consolidate_AB_listings()
df['Rank']= df['Mkt Cap (àm)'].rank(ascending=False)
df = df.loc[(df.Rank != 1)]
df['Rank1']= df['Mkt Cap (Em)'].rank(ascending=False)
## This doesn't seem to work
df = df.drop(df['Security'], 1)
return df
def save_outputfile():
#df = drop()
df = ranking_mktcap()
df.to_csv(r'S:\Index_Analytics\UK\Index Methodology\FTSE\Py_file_download\MonitoredList.csv', index=False)
print("finished")
if __name__ == "__main__":
main()
read_csv()
tickerchange()
consolidate_AB_listings()
ranking_mktcap()
save_outputfile()
DataFrame.drop() takes the following: DataFrame.drop(self, labels=None, axis=0, index=None, columns=None, level=None, inplace=False, errors='raise').
When you call df = df.drop(df['Security'], 1) it's using df['security'] as the labels to drop. And the 1 is being passed through the axis parameter.
If you want to drop the column 'Security' then you'd want to do:
df = df.drop('Security', axis=1)
# this is same as
df = df.drop(labels='Security', axis=1)
# you can also specify the column name directly, like this
df = df.drop(columns='Security')
Note: the columns= parameter can take a single lable (str) like above, or can take a list of column names.
Try by replacing
df = df.drop(df['Security'], 1)
By
df.drop(['Security'],axis=1, inplace=True)
I had the same issue and all I did was add inplace = True
So it will be df = df.drop(df['Security'], 1, inplace = True)

Update rows of pandas dataframe based upon other rows

I have a pandas dataframe which has the following columns ( pk1, pk2 type, qty_6, qty_7 ). I have type as predicted_90, override_90, predicted_50, override 50. Now Based upon combination of pk1 and pk2 If for type predicted_50, predicted_90 contains some value for override_50, override_90 apart from NaN, I want to update my dataframe columns predicted_50, predicted_90 with override_50 and override_90 respectively. Also, I want to capture this change in a boolean column called qty_6_overridden, qty_7_overridden. Also, I want to capture the difference between the both in a column qty_6_dev, qty_7_dev.
qty_6_dev = qty_6 override - qty_6 predicted
Example dataframe :
data=[
['B01FV0FBX4','2019-01-13','predicted_90',2207.931,2217.841],
['B01FV0FBX4','2019-01-13','predicted_50',1561.033,1521.567],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.803],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1921.594]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7'])
Expected output :
data=[
['B01FV0FBX4','2019-01-13','predicted_90',1973.000,2217.841,-234.931,0,True,False],
['B01FV0FBX4','2019-01-13','predicted_50',1233.000,1521.567,-328.033,0,True,False],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN,0,0,False,False],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN,0,0,False,False],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000,0,0,False,False],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.000,0,-0.803,False,True],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000,0,0,False,False],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1973.000,0,51.406,False,True]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7','qty_6_dev','qty_7_dev', 'qty_6_overridden','qty_7_overridden'])
In the example you can see, the quantities with override exchange quantitties with predicted and we get the corresponding columns 'qty_6_dev','qty_7_dev', 'qty_6_overridden','qty_7_overridden'.
I was able to write a solution. It works but it looks horrible and very difficult to understand for others.
import pandas as pd
import numpy as np
import math
data=[
['B01FV0FBX4','2019-01-13','predicted_90',2207.931,2217.841],
['B01FV0FBX4','2019-01-13','predicted_50',1561.033,1521.567],
['B01FV0FBX4','2019-01-13','override_90',1973.000,np.NaN],
['B01FV0FBX4','2019-01-13','override_50',1233.000,np.NaN],
['B01FV0FBX4','2019-01-06','override_50',np.NaN,1233.000],
['B01FV0FBX4','2019-01-06','predicted_50',1210.129,1213.803],
['B01FV0FBX4','2019-01-06','override_90',np.NaN,1973.000],
['B01FV0FBX4','2019-01-06','predicted_90',1911.205,1921.594]
]
df = pd.DataFrame(data,columns=['pk1','pk2', 'type', 'qty_6', 'qty_7'])
override_map = {
"predicted_50" : "override_50",
"predicted_90" : "override_90"
}
def transform_df(df):
transformed_df = pd.DataFrame()
for index, row in df.iterrows():
row_type = row['type']
row_pk1 = row['pk1']
row_pk2 = row['pk2']
if row_type in override_map.keys():
override_type = override_map.get(row_type)
else:
for i in range(6,8):
qty_dev_col = 'qty_'+str(i)+'_dev'
qty_override_col = 'qty_'+str(i)+'_overridden'
row[qty_dev_col] = 0
row[qty_override_col] = False
transformed_df=transformed_df.append(row, ignore_index=True)
continue
corr_df = df.loc[(df.type == override_type)
& (df.pk1 == row_pk1)
& (df.pk2 == row_pk2)]
for i in range(6,8):
qty_col = 'qty_'+str(i)
qty_dev_col = 'qty_'+str(i)+'_dev'
qty_override_col = 'qty_'+str(i)+'_overridden'
if not (math.isnan(corr_df[qty_col])) and (corr_df[qty_col].values[0] != row[qty_col]):
row[qty_dev_col] = corr_df[qty_col].values[0] - row[qty_col]
row[qty_col] = corr_df[qty_col].values[0]
row[qty_override_col] = True
else:
row[qty_dev_col] = 0
row[qty_override_col] = False
transformed_df=transformed_df.append(row, ignore_index=True)
return transformed_df
x1 = transform_df(df)
Is there a better way to do this using lambdas or something ? Also this takes like forever to run over a bigger dataframe.

Categories

Resources