What is the Pythonic way to test dataframe processing pipelines? - python

What is the best way to go about testing a pandas dataframe processing chain? I stubbed out the script file and the test file below so you can see what I mean.
I am getting confused about best practice. My only guiding intuitions are that the tests should be able to run in any order, that the CSV should be loaded from disk as few times as possible, and that no step in the chain should modify the fixture. Each step in the process depends on the previous steps, so unit testing each node amounts to testing the accumulated processing up to that point in the pipeline. So far I am accomplishing the mission, but a lot of code duplication is happening because I rebuild the pipeline incrementally in each test.
What is the right way to test this kind of Python script?
This is the data processing file stubbed out:
# main_script.py
def calc_allocation_methodology(df_row):
    print('calculating allocation methodology')
    return 'simple'

def flag_data_for_the_allocation_methodology(df):
    allocation_methodology = df.apply(calc_allocation_methodology, axis=1)
    df = df.assign(allocation_methodology=allocation_methodology)  # assign returns a new frame
    print('flagging each row for the allocation methodology')
    return df

def convert_repeating_values_to_nan(df):
    'keep one value and nan the rest of the values'
    print('convert repeating values to nan')
    return df

def melt_and_drop_accounting_columns(df):
    print('melt and drop accounting columns')
    print(f'columns remaining: {df.shape[1]}')
    return df

def melt_and_drop_engineering_columns(df):
    print('melt and drop engineering columns')
    print(f'columns remaining: {df.shape[1]}')
    return df

def process_csv_to_tiny_format(df):
    print('process the entire pipeline')
    return (df
            .pipe(flag_data_for_the_allocation_methodology)
            .pipe(convert_repeating_values_to_nan)
            .pipe(melt_and_drop_accounting_columns)
            .pipe(melt_and_drop_engineering_columns)
            )
This is the test file stubbed out:
# test_main.py
from pytest import fixture
import main_script as main
import pandas as pd

@fixture(scope='session')
def df_from_csv():
    return pd.read_csv('database_dump.csv')

@fixture
def df_copy(df_from_csv):
    df = df_from_csv.copy()
    return df

def test_expected_flag_data_for_the_allocation_methodology(df_copy):
    df = df_copy
    node_to_test = df.pipe(main.flag_data_for_the_allocation_methodology)
    assert True

def test_convert_repeating_values_to_nan(df_copy):
    df = df_copy
    node_to_test = df.pipe(main.flag_data_for_the_allocation_methodology).pipe(main.convert_repeating_values_to_nan)
    assert True

def test_melt_and_drop_accounting_columns(df_copy):
    df = df_copy
    node_to_test = (df
                    .pipe(main.flag_data_for_the_allocation_methodology)
                    .pipe(main.convert_repeating_values_to_nan)
                    .pipe(main.melt_and_drop_accounting_columns))
    assert True

def test_melt_and_drop_engineering_columns(df_copy):
    df = df_copy
    node_to_test = (df
                    .pipe(main.flag_data_for_the_allocation_methodology)
                    .pipe(main.convert_repeating_values_to_nan)
                    .pipe(main.melt_and_drop_accounting_columns)
                    .pipe(main.melt_and_drop_engineering_columns))
    assert True

def test_process_csv_to_tiny_format(df_from_csv):
    df = df_from_csv.copy()
    tiny_data = main.process_csv_to_tiny_format(df)
    assert True
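
One way to cut down on the duplication described above (a sketch only, not part of the original post) is to let fixtures chain off one another, so each test adds only the single pipe step it is responsible for. The fixture names flagged, deduplicated, and accounting_melted below are hypothetical; the pipeline functions are the stubs from main_script.py.

# test_main_chained.py -- hypothetical variant of the test file above
from pytest import fixture
import pandas as pd
import main_script as main

@fixture(scope='session')
def df_from_csv():
    return pd.read_csv('database_dump.csv')

@fixture
def flagged(df_from_csv):
    # copy so the session-scoped fixture is never modified
    return df_from_csv.copy().pipe(main.flag_data_for_the_allocation_methodology)

@fixture
def deduplicated(flagged):
    return flagged.pipe(main.convert_repeating_values_to_nan)

@fixture
def accounting_melted(deduplicated):
    return deduplicated.pipe(main.melt_and_drop_accounting_columns)

def test_flagging_adds_column(flagged):
    assert 'allocation_methodology' in flagged.columns

def test_melt_and_drop_accounting_columns(deduplicated, accounting_melted):
    # replace with a real expectation about the melted frame
    assert accounting_melted.shape[0] >= 0

Each downstream fixture still re-runs its upstream steps for every test, so the accumulation the question describes happens, but the chain is written only once.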

Related

Polars dataframe doesn't drop column

I have a function in a script that I am testing and the df.drop() function is not working as expected.
app.py
def area(df, value):
    df["area"] = df['geo'].apply(lambda row: to_area(row))
    df["area"] = df["area"].apply(lambda row: abs(row - mean))
    df = df.filter(pl.col("area") < value)
    df = df.drop("area")
    return df
test.py
def test():
    df = ...  # some df
    res = area(df, 2)
    res_2 = area(df, 4)
At res_2, I keep getting the "area" column back in the dataframe, which is causing me problems with type checking. Any ideas on what might be causing this? I know that using df.clone() works, but I don't understand what is causing this issue with how things are set up.
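
There is no answer above, but for illustration, here is a minimal non-mutating sketch of the same function. In older Polars versions, item assignment like df["area"] = ... writes into the frame the caller passed in, which is the usual reason a df.clone() makes the symptom go away; building the column with with_columns returns a new frame instead. The to_area body is a stand-in, and mean (undefined in the snippet) is assumed to be the mean of the computed areas.

import polars as pl

def to_area(geo: str) -> float:
    # hypothetical stand-in for the asker's to_area helper
    return float(len(geo))

def area(df: pl.DataFrame, value: float) -> pl.DataFrame:
    # compute the helper column without mutating the caller's frame
    areas = df["geo"].map_elements(to_area, return_dtype=pl.Float64)
    mean = areas.mean()  # assumption: `mean` in the snippet is the mean of the areas
    out = df.with_columns((areas - mean).abs().alias("area"))
    return out.filter(pl.col("area") < value).drop("area")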

Python parallel apply on dataframe

I have this piece of code in my application.
What I want is to iterate over each row in my data frame (pandas) and set a column to the result of a function.
I tried to implement it with multiprocessing, but I want to see if there is a faster and easier-to-implement way to do it.
Is there any simple way to run this part in parallel?
def _format(data: pd.DataFrame, context: pd.DataFrame):
    data['context'] = data.apply(lambda row: get_context_value(context, row), axis=1)
The data frame I work with is not too large (10,000 - 100,000 rows), and the function that evaluates the value for the column takes around 250 ms - 500 ms per row, but the whole process takes too long for a frame of that size.
Thanks
I have a project in which this is done: https://github.com/mjafari98/dm-classification/blob/main/inference.py
import pandas as pd
from functools import partial
from multiprocessing import Pool
import numpy as np

def parallelize(data, func, num_of_processes=8):
    data_split = np.array_split(data, num_of_processes)
    pool = Pool(num_of_processes)
    data = pd.concat(pool.map(func, data_split))
    pool.close()
    pool.join()
    return data

def run_on_subset(func, data_subset):
    return data_subset.apply(func, axis=1)

def parallelize_on_rows(data, func, num_of_processes=8):
    return parallelize(data, partial(run_on_subset, func), num_of_processes)

def a_function(row):
    ...  # do something with the row
    return row

df = ...  # some dataframe
new_df = parallelize_on_rows(df, a_function)
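
One caveat worth adding (not part of the answer above): with the spawn start method used by default on Windows and macOS, the parallel call should sit behind an import guard, otherwise every worker re-imports the module and tries to start its own pool.

if __name__ == '__main__':
    df = ...  # some dataframe
    new_df = parallelize_on_rows(df, a_function)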

Alternative to repeatedly printing shapes of the pandas dataframe after every step

Hello users of pandas,
I often find myself printing the shapes of the dataframes after every step of processing. I do this to monitor how the shape of the data changes and to ensure that it is done correctly.
e.g.
print(df.shape)
df=df.dropna()
print(df.shape)
df=df.melt()
print(df.shape)
...
I wonder if there is a better or more elegant way, preferably a shorthand or an automatic way to do this kind of thing.
I believe that what you're doing is entirely fine - especially as you are exploring. The code is easy to read and there isn't too much repetitive code. If you really wanted to reduce lines of code, you could utilize a helper function that could wrap whatever you are trying to run. For example:
def df_caller(df, fn, *args, **kwargs):
    new_df = getattr(df, fn)(*args, **kwargs)
    print(new_df.shape)
    assert df.shape == new_df.shape  # example check; adapt or drop it, since dropna/melt usually change the shape
    return new_df

df = df_caller(df, 'dropna')
df = df_caller(df, 'melt')
...
However, in my opinion the meta programming in the above solution is a little too magical and harder to read than what you originally posted.
I improvised on Matthew Cox's answer and registered an accessor on the pandas dataframe itself. This simplifies things a lot.
import numpy as np
import pandas as pd

# set up logger
import logging
logger = logging.getLogger()
logger.setLevel(logging.INFO)

# log changes in dataframe shape
def log_(df, fun, *args, **kwargs):
    logging.info(f'shape changed from {df.shape}')
    df1 = getattr(df, fun)(*args, **kwargs)
    logging.info(f'shape changed to {df1.shape}')
    return df1

# custom pandas dataframe accessor
@pd.api.extensions.register_dataframe_accessor("log")
class log:
    def __init__(self, pandas_obj):
        self._obj = pandas_obj
    def dropna(self, **kws):
        return log_(self._obj, fun='dropna', **kws)

# demo data
df = pd.DataFrame({"name": ['Alfred', 'Batman', 'Catwoman'],
                   "toy": [np.nan, 'Batmobile', 'Bullwhip'],
                   "born": [pd.NaT, pd.Timestamp("1940-04-25"),
                            pd.NaT]})

# trial
df.log.dropna()

# stderr
INFO:root:shape changed from (3, 3)
INFO:root:shape changed to (1, 3)

# returns dropna'd dataframe

Doing task between Multiprocessing python

I want to use the multiprocessing library to parallelize the computation. If you comment out lines 5 and 9 and uncomment line 11, the code runs in a serial fashion.
My dataframe is very big and processing takes a lot of time, so I want to use multiprocessing.
This is what I am trying:
def do_something(df):
    return df

def main(df, df_hide, df_res):
    p = Pool()  # comment to run normal way
    for i in range(0, df_hide.shape[0]):
        df = df.append(df_hide.iloc[i, :])
        df = p.map(do_something, df)  # comment to run normal way
        # df = do_something(df)  # uncomment to run normal way
        df_res.iloc[i, 0] = df.iloc[-1, 0]
    return df_res

if __name__ == '__main__':
    df = pd.DataFrame({'a': [1, 2, 3]})
    df_hide = pd.DataFrame({'a': [4, 5, 6]})
    df_res = pd.DataFrame({'z': [0, 0, 0]})
    df_res1 = main(df, df_hide, df_res)
    print(df_res1)
Expected output, which I get if I run it normally:
   z
0  4
1  5
2  6
This gives me nothing; it freezes the cmd. Even if it did run, I don't think I would get the expected results, as I have to do something after every step. Can you please suggest how to parallelize the above code using multiprocessing?
import numpy as np
import pandas as pd

def do_something(df):
    return df

def main(df, df_hide, df_res):
    for i in range(0, df_hide.shape[0]):
        df = df.append(df_hide.iloc[i, :])
        df_res.iloc[i, 0] = df.iloc[-1, 0]
    return df_res

if __name__ == '__main__':
    df = pd.DataFrame({'a': [1, 2, 3]})
    df_hide = pd.DataFrame({'a': [4, 5, 6]})
    df_res = pd.DataFrame({'z': [0, 0, 0]})
    df_res1 = main(df, df_hide, df_res)
    print(df_res1)
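
Purely as a sketch (not from the thread above): one option is to run do_something on each row of df_hide in parallel, since in the stub those calls are independent, and keep the accumulation loop sequential, because each df_res entry depends on the frame built so far. This assumes do_something really only needs the row it is given.

from multiprocessing import Pool
import pandas as pd

def do_something(row):
    # placeholder per-row work, as in the question
    return row

def main(df, df_hide, df_res):
    rows = [df_hide.iloc[i, :] for i in range(df_hide.shape[0])]
    with Pool() as p:
        processed = p.map(do_something, rows)  # parallel part
    for i, row in enumerate(processed):        # sequential accumulation
        df = pd.concat([df, row.to_frame().T], ignore_index=True)
        df_res.iloc[i, 0] = df.iloc[-1, 0]
    return df_res

if __name__ == '__main__':
    df = pd.DataFrame({'a': [1, 2, 3]})
    df_hide = pd.DataFrame({'a': [4, 5, 6]})
    df_res = pd.DataFrame({'z': [0, 0, 0]})
    print(main(df, df_hide, df_res))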

Applying parallelization when updating dictionary values

datasets = {}
datasets['df1'] = df1
datasets['df2'] = df2
datasets['df3'] = df3
datasets['df4'] = df4

def prepare_dataframe(dataframe):
    # regex=True is needed in recent pandas for pattern-based str.replace
    return dataframe.apply(lambda x: x.astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True))

for key, value in datasets.items():
    datasets[key] = prepare_dataframe(value)
I need to prepare the data in some dataframes for further analysis. I would like to parallelize the for loop that updates the dictionary with a prepared dataframe. This code will eventually run on a machine with dozens of cores and thousands of dataframes. On my local machine I do not appear to be using more than a single core in the prepare_dataframe function.
I have looked at Numba and Joblib but I cannot find a way to work with dictionary values in either library.
Any insight would be very much appreciated!
You can use the multiprocessing library. You can read about its basics here.
Here is the code that does what you need:
from multiprocessing import Pool

def prepare_dataframe(dataframe):
    # do whatever you want here
    # changes made here are *not* global
    # return a modified version of what you want
    return dataframe

def worker(dict_item):
    key, value = dict_item
    return (key, prepare_dataframe(value))

def parallelize(data, func):
    data_list = list(data.items())
    pool = Pool()
    data = dict(pool.map(func, data_list))
    pool.close()
    pool.join()
    return data

datasets = parallelize(datasets, worker)
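
Since the question mentions looking at Joblib, here is a minimal equivalent sketch (not part of the answer above) using joblib.Parallel over the dictionary items; the example input frame is made up.

from joblib import Parallel, delayed
import pandas as pd

def prepare_dataframe(dataframe):
    # same per-dataframe cleanup as in the question
    return dataframe.apply(lambda x: x.astype(str).str.lower().str.replace(r'[^\w\s]', '', regex=True))

def prepare_item(item):
    key, value = item
    return key, prepare_dataframe(value)

datasets = {'df1': pd.DataFrame({'a': ['Hello!', 'World?']})}  # made-up example input
results = Parallel(n_jobs=-1)(delayed(prepare_item)(item) for item in datasets.items())
datasets = dict(results)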
