A more efficient way to iterate over multiple DataFrames - python

I am trying to create a custom DataFrame that will represent all missing (NaN) values in my data.
The solution I came up with works, but it is slow and inefficient on a set with 300 rows and 50 columns.
Pandas version = "0.24.2"
import pandas as pd

data = {
    'city_code': ['Sydney2017', 'London2017', 'Sydney2018', 'London2018'],
    'population_mil': [5.441, 7.375, pd.np.nan, pd.np.nan]
}

class NaNData:
    def __init__(self, data: dict):
        self.data: dict = data

    @property
    def data_df(self) -> pd.DataFrame:
        """ Returns the input data as a DataFrame. """
        return pd.DataFrame(self.data)

    def select_city(self, city_code: str) -> pd.DataFrame:
        """ Creates a DataFrame where the city_code column value matches
        the requested city_code string. """
        df = self.data_df
        return df.loc[df['city_code'] == city_code]

    @property
    def df(self) -> pd.DataFrame:
        """ Creates a custom summary DataFrame to represent missing data. """
        data_df = self.data_df
        # There are duplicates in the 'city_code' column. Make sure your
        # cities are unique values only.
        all_cities = list(set(data_df['city_code']))
        # Check whether a given city has any NaN values in any column.
        has_nan = [
            self.select_city(i).isnull().values.any() for i in all_cities
        ]
        data = {
            'cities': all_cities,
            'has_NaN': has_nan,
        }
        df = pd.DataFrame(data)
        return df

nan_data = NaNData(data)
print(nan_data.df)
# Output:
#        cities  has_NaN
# 0  London2018     True
# 1  London2017    False
# 2  Sydney2018     True
# 3  Sydney2017    False
I feel like the way I approach iteration in pandas is not right. Is there a proper (or common) solution for this kind of problem? Should I somehow be using groupby for these kinds of operations?
Any input is much appreciated.
Thank you for your time.

You don't need to iterate over multiple dataframes to obtain your result; you can indeed use groupby with apply:
import pandas as pd

data = {
    'city_code': ['Sydney2017', 'London2017', 'Sydney2018', 'London2018'],
    'population_mil': [5.441, 7.375, pd.np.nan, pd.np.nan],
    'temp': [28, pd.np.nan, 24, 25]
}
df = pd.DataFrame(data)
df.groupby('city_code').apply(lambda x: x.isna().any()).any(axis=1)
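For reference, on the sample data above this should produce a boolean Series keyed by city, along these lines:
# city_code
# London2017     True
# London2018     True
# Sydney2017    False
# Sydney2018     True
# dtype: bool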

I think you can use the isna() function to do the NaN check:
df = pd.DataFrame(data)
df.assign(has_NaN=df.population_mil.isna()).drop(columns='population_mil')
    city_code  has_NaN
0  Sydney2017    False
1  London2017    False
2  Sydney2018     True
3  London2018     True
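If there are several value columns, a variant of the same idea checks them all at once (a sketch; it assumes city_code itself never contains NaN):
df.assign(has_NaN=df.drop(columns='city_code').isna().any(axis=1))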

Related

Pandas Conditional formatting by comparing the column values of dataframe

import io
import pandas as pd

csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''
df = pd.read_csv(io.StringIO(csv_data), sep=",")
html_table = df.to_html()
Is there a way to compare the values of columns in a dataframe and use that in conditional formatting? I want to compare the 'prod', 'pre-prod' and 'stage' values; if they mismatch, the row's background color should be red. I have tried the following methods in pandas, but none of them works.
df.style.apply()
df.style.apply_index()
df.style.applymap()
Current output: (screenshot not included)
Desired output: (screenshot not included)
You can add style conditionally by applying style to a subset of your dataframe like:
import io
import pandas as pd

csv_data = '''App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine'''

def add_color(row):
    return ['background-color: red'] * len(row)

df = pd.read_csv(io.StringIO(csv_data), sep=",")
df.loc[(df["pre-prod"] == df["prod"]) & (df["prod"] == df["stage"])].style.apply(add_color, axis=1)
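Note that the mask above selects the rows where all three columns match; since the goal is to colour mismatches red, you would negate the condition, something like:
df.loc[~((df["pre-prod"] == df["prod"]) & (df["prod"] == df["stage"]))].style.apply(add_color, axis=1)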
import io
import pandas as pd

csv_data = '''
App_name,pre-prod,prod,stage
matching-image,nginx,nginx,nginx
matching-image,nginx,nginx,nginx
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
mismatching-image,nginx,nginx,nginx:1.23.3-alpine
'''
df = pd.read_csv(io.StringIO(csv_data), sep=",")

def match_checker(row):
    if row['prod'] == row['pre-prod'] == row['stage']:
        return [''] * len(row)
    else:
        return ['background-color: red'] * len(row)

styled = df.style.apply(match_checker, axis=1)
html_table = styled.to_html()
with open('testpandas.html', 'w+') as html_file:
    html_file.write(html_table)
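One caveat: Styler.to_html only exists in pandas 1.3 and later; on older versions the equivalent call is render():
html_table = styled.render()  # pandas < 1.3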
Updated @PeterSmith's answer.
It's also possible to style the entire DataFrame in one go by passing axis=None to apply.
We can identify rows which have differing values in the specified columns by comparing the first column (column 0) with the remaining columns (columns 1-2) and finding where there are unequal values, using ne on axis=0.
df[['prod', 'stage']].ne(df['pre-prod'], axis=0)
# prod stage
# 0 False False
# 1 False True
Then we can check across rows for any rows which have any True values (meaning there is something that's not equal in the row).
df[['prod', 'stage']].ne(df['pre-prod'], axis=0).any(axis=1)
# 0 False
# 1 True
# dtype: bool
We can then simply apply the styles anywhere there's a True value in the resulting Series.
Altogether this could look something like:
def colour_rows_that_dont_match(df_: pd.DataFrame, comparison_cols: List[str]):
    # Sanity check that comparison_cols is what we expect
    assert isinstance(comparison_cols, list) and len(comparison_cols) > 1, \
        'Must be a list and provide at least 2 columns to compare'
    # Create an empty DataFrame to hold styles, of the same shape as the original df
    styles_df = pd.DataFrame('', index=df_.index, columns=df_.columns)
    # Compare the first column's (col 0) values to the remaining columns.
    # Find rows where any values are not equal (ne)
    rows_that_dont_match = df_[comparison_cols[1:]].ne(df_[comparison_cols[0]], axis=0).any(axis=1)
    # Apply styles to rows which meet the above criteria
    styles_df.loc[rows_that_dont_match, :] = 'background-color: red'
    return styles_df

df.style.apply(
    colour_rows_that_dont_match,
    # This gets passed to the function
    comparison_cols=['pre-prod', 'prod', 'stage'],
    # Apply to the entire DataFrame at once
    axis=None
).to_html(buf='test_df.html')
Which produces the following:
Setup, version, and imports:
from typing import List
import pandas as pd # version 1.5.2
df = pd.DataFrame({
'App_name': ['matching-image', 'mismatching-image'],
'pre-prod': ['nginx', 'nginx'],
'prod': ['nginx', 'nginx'],
'stage': ['nginx', 'nginx:1.23.3-alpine']
})

Ungroup pandas dataframe after bfill

I'm trying to write a function that will backfill columns in a dataframe adhering to a condition. The backfill should only be done within groups. I am, however, having a hard time getting the group object to ungroup. I have tried reset_index as in the example below, but that gets an AttributeError.
Accessing the original df through result.obj doesn't lead to the updated values, because there is no inplace option for the groupby bfill.
def upfill(df: DataFrameGroupBy) -> DataFrameGroupBy:
    for column in df.obj.columns:
        if column.startswith("x"):
            df[column].bfill(axis="rows", inplace=True)
    return df
Assigning the dataframe column in the function doesn't work either, because the groupby object doesn't support item assignment.
def upfill(df: DataFrameGroupBy) -> DataFrameGroupBy:
    for column in df.obj.columns:
        if column.startswith("x"):
            df[column] = df[column].bfill()
    return df
The test I'm trying to get to pass:
def test_upfill():
    df = DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    grouped_df = df.groupby("group")
    result = upfill(grouped_df)
    result.reset_index()
    assert result["x_value"].equals(Series([4, 4, None, 5, 5]))
You should use the transform method on the grouped DataFrame, like this:
import pandas as pd

def test_upfill():
    df = pd.DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    result = df.groupby("group").transform(lambda x: x.bfill())
    assert result["x_value"].equals(pd.Series([4, 4, None, 5, 5]))

test_upfill()
Here you can find more information about the transform method on GroupBy objects.
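Because transform returns a result aligned to the original index, the backfilled values can also be written straight back onto the source frame; a minimal sketch:
df["x_value"] = df.groupby("group")["x_value"].transform(lambda s: s.bfill())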
Based on the accepted answer, this is the full solution I ended up with, although I have read elsewhere that there are issues with using the obj attribute.
def upfill(df: DataFrameGroupBy) -> DataFrameGroupBy:
    columns = [column for column in df.obj.columns if column.startswith("x")]
    df.obj[columns] = df[columns].transform(lambda x: x.bfill())
    return df

def test_upfill():
    df = DataFrame({
        "id": [1, 2, 3, 4, 5],
        "group": [1, 2, 2, 3, 3],
        "x_value": [4, 4, None, None, 5],
    })
    grouped_df = df.groupby("group")
    result = upfill(grouped_df)
    assert df["x_value"].equals(Series([4, 4, None, 5, 5]))

How to insert a new column into a dataframe and access rows with different indices?

I have a dataframe with one column "Numbers" and I want to add a second column "Result". The values should be the sum of the previous two values in the "Numbers" column, otherwise NaN.
import pandas as pd
import numpy as np

data = {
    "Numbers": [100, 200, 400, 0]
}
df = pd.DataFrame(data, index=["whatever1", "whatever2", "whatever3", "whatever4"])

def add_prev_two_elems_to_DF(df):
    numbers = "Numbers"  # alias
    result = "Result"    # alias
    df[result] = np.nan  # empty column
    result_index = list(df.columns).index(result)
    for i in range(len(df)):
        # row = df.iloc[i]
        if i < 2:
            df.iloc[i, result_index] = np.nan
        else:
            df.iloc[i, result_index] = df.iloc[i - 1][numbers] + df.iloc[i - 2][numbers]

add_prev_two_elems_to_DF(df)
display(df)
The output is:
           Numbers  Result
whatever1      100     NaN
whatever2      200     NaN
whatever3      400   300.0
whatever4        0   600.0
But this looks quite complicated. Can this be done more easily, and maybe faster? I am not looking for a solution with sum(). I want a general solution for any kind of function that can fill a column using values from other rows.
Edit 1: I forgot to import numpy.
Edit 2: I changed one line to this:
if i < 2: df.iloc[i,result_index] = np.nan
Looks like you could use rolling.sum together with shift. Since rolling.sum sums up to and including the current row, we have to shift the result down one row so that each row's value matches the sum of the previous 2 rows:
df['Result'] = df['Numbers'].rolling(2).sum().shift()
Output:
           Numbers  Result
whatever1      100     NaN
whatever2      200     NaN
whatever3      400   300.0
whatever4        0   600.0
This is the shortest code I could develop. It outputs exactly the same table.
import numpy as np
import pandas as pd
# import swifter  # apply() gets swifter

data = {
    "Numbers": [100, 200, 400, 0]
}
df = pd.DataFrame(data, index=["whatever1", "whatever2", "whatever3", "whatever4"])

def func(a: pd.Series) -> float:  # rolling.apply passes a Series; we expect 3 elements, but we don't check that
    a.reset_index(inplace=True, drop=True)  # the index now starts with 0, 1, ...
    return a[0] + a[1]  # we use the first two elements; the 3rd is unnecessary

df["Result"] = df["Numbers"].rolling(3).apply(func)
# df["Result"] = df["Numbers"].swifter.rolling(3).apply(func)
display(df)
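For this particular case, a shift-based sketch (assuming the same df as above) avoids apply altogether, and the pattern generalizes to any function of values from earlier rows:
df["Result"] = df["Numbers"].shift(1) + df["Numbers"].shift(2)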

How to implement python custom function on dictionary of dataframes

I have a dictionary that contains 3 dataframes.
How do I apply a custom function to each dataframe in the dictionary?
In simpler terms, I want to apply the function find_outliers, as seen below,
# User defined function: find_outliers
# (I)
import numpy as np
import pandas as pd
from scipy import stats

outlier_threshold = 1.5
ddof = 0

def find_outliers(s: pd.Series):
    outlier_mask = np.abs(stats.zscore(s, ddof=ddof)) > outlier_threshold
    # replace boolean values with corresponding strings
    return ['background-color:blue' if val else '' for val in outlier_mask]
To the dictionary of dataframes dict_of_dfs below
# the dataset
import numpy as np
import pandas as pd

df = {
    'col_A': ['A_1001', 'A_1001', 'A_1001', 'A_1001', 'B_1002', 'B_1002', 'B_1002', 'B_1002', 'D_1003', 'D_1003', 'D_1003', 'D_1003'],
    'col_X': [110.21, 191.12, 190.21, 12.00, 245.09, 4321.8, 122.99, 122.88, 134.28, 148.14, 161.17, 132.17],
    'col_Y': [100.22, 199.10, 191.13, 199.99, 255.19, 131.22, 144.27, 192.21, 7005.15, 12.02, 185.42, 198.00],
    'col_Z': [140.29, 291.07, 390.22, 245.09, 4122.62, 4004.52, 395.17, 149.19, 288.91, 123.93, 913.17, 1434.85]
}
df = pd.DataFrame(df)
df

# dictionary_of_dataframes
# (II)
dict_of_dfs = dict(tuple(df.groupby('col_A')))
and lastly, flag outliers in each df of the dict_of_dfs
# end goal is to find/flag outliers in each `df` of the `dict_of_dfs`
# (III)
desired_cols = ['col_X', 'col_Y', 'col_Z']
dict_of_dfs.style.apply(find_outliers, subset=desired_cols)  # fails: a dict has no .style attribute
In summary, I want to apply (I) to (II) and finally flag outliers as in (III).
Thanks for your attempt. :)
Desired output should look like this, but for the three dataframes
This may not be what you want, but this is how I'd approach it; you'll have to work out the details of the function, because you have it written to receive a Series rather than a DataFrame. groupby().apply() will send the subsets of rows, and then you can perform the actions on that subset and return the result.
For consideration: inside the function you may be able to handle all columns like so:
def find_outliers(x):
    for col in ['col_X', 'col_Y', 'col_Z']:
        outlier_mask = np.abs(stats.zscore(x[col], ddof=ddof)) > outlier_threshold
        x[col] = ['outlier' if val else '' for val in outlier_mask]
    return x

newdf = df.groupby('col_A').apply(find_outliers)
     col_A    col_X    col_Y  col_Z
0   A_1001           outlier
1   A_1001
2   A_1001
3   A_1001  outlier
4   B_1002           outlier
5   B_1002  outlier
6   B_1002
7   B_1002
8   D_1003           outlier
9   D_1003
10  D_1003
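If you do want to keep the dictionary from (II), one option (a sketch; note that style lives on each DataFrame, not on the dict itself) is to build a dictionary of Stylers, one per frame:
styled = {
    key: frame.style.apply(find_outliers, subset=desired_cols)
    for key, frame in dict_of_dfs.items()
}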

Update rows of pandas dataframe based upon other rows

I have a pandas dataframe with the columns (pk1, pk2, type, qty_6, qty_7). The type column takes the values predicted_90, override_90, predicted_50, override_50. Based on the combination of pk1 and pk2: if, for a type predicted_50 or predicted_90, the matching override_50 or override_90 row contains some value other than NaN, I want to update the dataframe's predicted_50 and predicted_90 rows with the override_50 and override_90 values respectively. I also want to capture this change in boolean columns called qty_6_overridden and qty_7_overridden, and capture the difference between the two in columns qty_6_dev and qty_7_dev.
qty_6_dev = qty_6 override - qty_6 predicted
Example dataframe:
import numpy as np
import pandas as pd

data = [
    ['B01FV0FBX4', '2019-01-13', 'predicted_90', 2207.931, 2217.841],
    ['B01FV0FBX4', '2019-01-13', 'predicted_50', 1561.033, 1521.567],
    ['B01FV0FBX4', '2019-01-13', 'override_90', 1973.000, np.NaN],
    ['B01FV0FBX4', '2019-01-13', 'override_50', 1233.000, np.NaN],
    ['B01FV0FBX4', '2019-01-06', 'override_50', np.NaN, 1233.000],
    ['B01FV0FBX4', '2019-01-06', 'predicted_50', 1210.129, 1213.803],
    ['B01FV0FBX4', '2019-01-06', 'override_90', np.NaN, 1973.000],
    ['B01FV0FBX4', '2019-01-06', 'predicted_90', 1911.205, 1921.594]
]
df = pd.DataFrame(data, columns=['pk1', 'pk2', 'type', 'qty_6', 'qty_7'])
Expected output:
data = [
    ['B01FV0FBX4', '2019-01-13', 'predicted_90', 1973.000, 2217.841, -234.931, 0, True, False],
    ['B01FV0FBX4', '2019-01-13', 'predicted_50', 1233.000, 1521.567, -328.033, 0, True, False],
    ['B01FV0FBX4', '2019-01-13', 'override_90', 1973.000, np.NaN, 0, 0, False, False],
    ['B01FV0FBX4', '2019-01-13', 'override_50', 1233.000, np.NaN, 0, 0, False, False],
    ['B01FV0FBX4', '2019-01-06', 'override_50', np.NaN, 1233.000, 0, 0, False, False],
    ['B01FV0FBX4', '2019-01-06', 'predicted_50', 1210.129, 1213.000, 0, -0.803, False, True],
    ['B01FV0FBX4', '2019-01-06', 'override_90', np.NaN, 1973.000, 0, 0, False, False],
    ['B01FV0FBX4', '2019-01-06', 'predicted_90', 1911.205, 1973.000, 0, 51.406, False, True]
]
df = pd.DataFrame(data, columns=['pk1', 'pk2', 'type', 'qty_6', 'qty_7', 'qty_6_dev', 'qty_7_dev', 'qty_6_overridden', 'qty_7_overridden'])
In the example you can see that the predicted quantities take on the override quantities, and we get the corresponding columns 'qty_6_dev', 'qty_7_dev', 'qty_6_overridden', 'qty_7_overridden'.
I was able to write a solution. It works, but it looks horrible and is very difficult for others to understand.
import pandas as pd
import numpy as np
import math

data = [
    ['B01FV0FBX4', '2019-01-13', 'predicted_90', 2207.931, 2217.841],
    ['B01FV0FBX4', '2019-01-13', 'predicted_50', 1561.033, 1521.567],
    ['B01FV0FBX4', '2019-01-13', 'override_90', 1973.000, np.NaN],
    ['B01FV0FBX4', '2019-01-13', 'override_50', 1233.000, np.NaN],
    ['B01FV0FBX4', '2019-01-06', 'override_50', np.NaN, 1233.000],
    ['B01FV0FBX4', '2019-01-06', 'predicted_50', 1210.129, 1213.803],
    ['B01FV0FBX4', '2019-01-06', 'override_90', np.NaN, 1973.000],
    ['B01FV0FBX4', '2019-01-06', 'predicted_90', 1911.205, 1921.594]
]
df = pd.DataFrame(data, columns=['pk1', 'pk2', 'type', 'qty_6', 'qty_7'])

override_map = {
    "predicted_50": "override_50",
    "predicted_90": "override_90"
}

def transform_df(df):
    transformed_df = pd.DataFrame()
    for index, row in df.iterrows():
        row_type = row['type']
        row_pk1 = row['pk1']
        row_pk2 = row['pk2']
        if row_type in override_map.keys():
            override_type = override_map.get(row_type)
        else:
            for i in range(6, 8):
                qty_dev_col = 'qty_' + str(i) + '_dev'
                qty_override_col = 'qty_' + str(i) + '_overridden'
                row[qty_dev_col] = 0
                row[qty_override_col] = False
            transformed_df = transformed_df.append(row, ignore_index=True)
            continue
        corr_df = df.loc[(df.type == override_type)
                         & (df.pk1 == row_pk1)
                         & (df.pk2 == row_pk2)]
        for i in range(6, 8):
            qty_col = 'qty_' + str(i)
            qty_dev_col = 'qty_' + str(i) + '_dev'
            qty_override_col = 'qty_' + str(i) + '_overridden'
            if not math.isnan(corr_df[qty_col]) and (corr_df[qty_col].values[0] != row[qty_col]):
                row[qty_dev_col] = corr_df[qty_col].values[0] - row[qty_col]
                row[qty_col] = corr_df[qty_col].values[0]
                row[qty_override_col] = True
            else:
                row[qty_dev_col] = 0
                row[qty_override_col] = False
        transformed_df = transformed_df.append(row, ignore_index=True)
    return transformed_df

x1 = transform_df(df)
Is there a better way to do this, using lambdas or something? Also, this takes forever to run on a bigger dataframe.
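For the performance question, here is a vectorized sketch (an alternative approach, not the poster's code, and not validated against every corner of the expected output): split type into its kind and level, merge the override rows back onto the frame by (pk1, pk2, level), and compute the new columns with np.where. The kind, level and *_ovr columns are helper names introduced here:
import numpy as np
import pandas as pd

parts = df['type'].str.rsplit('_', n=1, expand=True)  # 'predicted_90' -> ('predicted', '90')
df['kind'], df['level'] = parts[0], parts[1]

# Override values per (pk1, pk2, level), joined onto every matching row
overrides = df.loc[df['kind'] == 'override', ['pk1', 'pk2', 'level', 'qty_6', 'qty_7']]
merged = df.merge(overrides, on=['pk1', 'pk2', 'level'], how='left', suffixes=('', '_ovr'))

for col in ['qty_6', 'qty_7']:
    # An override applies only to predicted rows with a non-NaN override value;
    # add an inequality check here if identical values should not count as overridden.
    use = (merged['kind'] == 'predicted') & merged[f'{col}_ovr'].notna()
    merged[f'{col}_dev'] = np.where(use, merged[f'{col}_ovr'] - merged[col], 0)
    merged[f'{col}_overridden'] = use
    merged[col] = np.where(use, merged[f'{col}_ovr'], merged[col])

result = merged.drop(columns=['kind', 'level', 'qty_6_ovr', 'qty_7_ovr'])
This avoids the row-by-row iterrows/append loop entirely, which is where the time goes on a bigger dataframe.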
