Python - reduce number of if statements

I am filtering a pandas dataframe based on one or more conditions, like so:
def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    if position:
        df = df[df['position_id'] == position]
    if team_id:
        df = df[df['team_id'] == team_id]
    if home:
        if home == 'home':
            df = df[df['home_dummy'] == 1.0]
        elif home == 'away':
            df = df[df['home_dummy'] == 0.0]
    if window:
        df = df[df['round_id'].between(1, window)]
    if min_games:
        df = df[df['games_num'] >= min_games]
    return df
But I don't think this is elegant.
Is there a simpler way of achieving the same result?
I thought of creating rules for the conditions, as in this SO answer, and then using any(rules) to apply the filtering, but I don't know how to approach this. Any ideas?

You could try something like this:
def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    # pair each "should this filter run?" flag with a callable that builds its boolean mask,
    # so masks are only evaluated when their argument was actually supplied
    masks = {
        "mask1": [position is not None, lambda d: d["position_id"] == position],
        "mask2": [team_id is not None, lambda d: d["team_id"] == team_id],
        "mask3": [home == "home", lambda d: d["home_dummy"] == 1.0],
        "mask4": [home == "away", lambda d: d["home_dummy"] == 0.0],
        "mask5": [window is not None, lambda d: d["round_id"].between(1, window)],
        "mask6": [min_games is not None, lambda d: d["games_num"] >= min_games],
    }
    for active, mask in masks.values():
        if active:
            df = df[mask(df)]
    return df
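If the goal is mainly to avoid slicing the frame once per condition, another option (a sketch, not from the original answer) is to collect only the active boolean masks and combine them once with numpy:
import numpy as np

def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    masks = []
    # collect a mask only for the arguments that were actually supplied
    if position is not None:
        masks.append(df['position_id'] == position)
    if team_id is not None:
        masks.append(df['team_id'] == team_id)
    if home in ('home', 'away'):
        masks.append(df['home_dummy'] == (1.0 if home == 'home' else 0.0))
    if window is not None:
        masks.append(df['round_id'].between(1, window))
    if min_games:
        masks.append(df['games_num'] >= min_games)
    if not masks:
        return df
    # AND all masks together and index the frame a single time
    return df[np.logical_and.reduce(masks)]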

Related

How can I simplify this function, random pandas DF selection adding to dictionary?

How can I simplify this function I am trying to create? I would like to pull data from a CSV, turn it into a DataFrame, randomly select a choice, and add that choice to the corresponding dictionary key-value pair.
def generate_traits():
    import pandas as pd
    df_bonds = pd.read_csv('/file/location_1')
    df_alignments = pd.read_csv('/file/location_2')
    df_faiths = pd.read_csv('/file/location_3')
    df_flaws = pd.read_csv('/file/location_4')
    df_ideals = pd.read_csv('/file/location_5')
    df_lifestyles = pd.read_csv('/file/location_6')
    df_organizations = pd.read_csv('/file/location_7')
    df_personalities = pd.read_csv('/file/location_8')
    df_names = pd.read_csv('/file/location_9')
    random_bond = df_bonds.sample(1)
    random_alignment = df_alignments.sample(1)
    random_faith = df_faiths.sample(1)
    random_flaw = df_flaws.sample(1)
    random_ideal = df_ideals.sample(1)
    random_lifestyle = df_lifestyles.sample(1)
    random_organization = df_organizations.sample(1)
    random_personality = df_personalities.sample(1)
    random_name = df_names.sample(1)
    traits_dict = {"Name:": random_name.iloc[0, 0],
                   "Alignment:": random_alignment.iloc[0, 0],
                   "Bond:": random_bond.iloc[0, 0],
                   "Religion:": random_faith.iloc[0, 0],
                   "Flaw:": random_flaw.iloc[0, 0],
                   "Ideal:": random_ideal.iloc[0, 0],
                   "Lifestyle:": random_lifestyle.iloc[0, 0],
                   "Organization:": random_organization.iloc[0, 0],
                   "Personality:": random_personality.iloc[0, 0]}
    return traits_dict
The function does behave as expected; however, I know there must be a way to loop through this, I just have not found one.
You can chain your operations:
import pandas as pd

def generate_traits():
    return {'Name': pd.read_csv('/file/location_9').sample(1).iloc[0, 0],
            'Alignment': pd.read_csv('/file/location_2').sample(1).iloc[0, 0],
            'Bond': pd.read_csv('/file/location_1').sample(1).iloc[0, 0],
            'Religion': pd.read_csv('/file/location_3').sample(1).iloc[0, 0],
            'Flaw': pd.read_csv('/file/location_4').sample(1).iloc[0, 0],
            'Ideal': pd.read_csv('/file/location_5').sample(1).iloc[0, 0],
            'Lifestyle': pd.read_csv('/file/location_6').sample(1).iloc[0, 0],
            'Organization': pd.read_csv('/file/location_7').sample(1).iloc[0, 0],
            'Personality': pd.read_csv('/file/location_8').sample(1).iloc[0, 0]}
Or, to avoid repeating the read-and-sample pattern, drive it from a dictionary of file locations:
def generate_traits():
    import pandas as pd
    name_location = {'Bond': 'location_1',
                     'Alignment': 'location_2',
                     'Religion': 'location_3',
                     'Flaw': 'location_4',
                     'Ideal': 'location_5',
                     'Lifestyle': 'location_6',
                     'Organization': 'location_7',
                     'Personality': 'location_8',
                     'Name': 'location_9'}
    all_df = {name: pd.read_csv(f'/file/{loc}') for name, loc in name_location.items()}
    traits_dict = {name: df.sample(1).iloc[0, 0] for name, df in all_df.items()}
    return traits_dict
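A quick usage check (assuming the /file/location_* CSVs exist and each has at least one row):
traits = generate_traits()
for trait, value in traits.items():
    print(trait, value)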

parsing json file with function into dataframe for analysis

Hi, I am working with two JSON files, and I'm having problems with the data cleaning.
Suppose a record in g1j or g2j looks like this:
{
    'cls_loc': 'QOEBBG_K0101',
    'date': 1584957443013,
    'dur': 32,
    'exp': [
        {
            'm': 'spot_excited',
            's': 8.5,
            't': 8.5,
            'w': 'spot_bored',
            'x': 'A'
        },
        {
            's': 1.1,
            't': 11.4,
            'w': 'spot_scared',
            'x': 'A'
        }
    ],
    'mod': 'Poster',
    'pre': False,
    'scr': 67,
    'usr': 'QOGOBN',
    'ver': '20.5.3'
}
What we want per row in our DataFrame is this:
{
    'student_pin': 'QOGOBN',              # from `usr`
    'date': datetime.date(2020, 3, 23),   # from `date`, but parsed
    'duration': 32,                       # from `dur`
    'level': 3,                           # the "K" from `cls_loc`, mapped to int
    'unit': 1,                            # from `cls_loc`, mapped to int
    'module': 1,                          # from `cls_loc`, mapped to int
    'accuracy': 0.5,                      # calculated from `exp`
}
my code so far:
from datetime import datetime
import json
import numpy as np
import pandas as pd
from scipy import stats

with open('/content/drive/MyDrive/group1_exp_2020-04-08.json', 'r') as f:
    g1j = json.loads(f.read())
with open('/content/drive/MyDrive/group2_exp_2020-04-22.json', 'r') as f:
    g2j = json.loads(f.read())

# convert the integer timestamp to a datetime.date
def timestamp_to_date():
    l = []
    for item in g1j:
        timestamp = item['date']
        timestamp = timestamp / 1000
        dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
        l.append(dt_obj)
    return l

timestamp_to_date()

def timestamp_to_date():
    l = []
    for item in g2j:
        timestamp = item['date']
        timestamp = timestamp / 1000
        dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
        l.append(dt_obj)
    return l

# extract the level, unit, module, and accuracy here
def get_level(x):
    loc = x['cls_loc'].split('_')[-1]
    return level_map[loc[0]]

def get_unit(x):
    loc = x['cls_loc'].split('_')[-1]
    unit = loc[1:3]
    return int(unit)

def get_module(x):
    loc = x['cls_loc'].split('_')[-1]
    module = loc[3:]
    return int(module)

def get_accuracy(x):
    challenges = [x for x in x['exp'] if x['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return 'N/A'
    mistakes = [x for x in challenges if 'm' in x.keys()]
    correct = n - len(mistakes)
    return correct / n

# create the function to convert experience records to the pandas.DataFrame
def exp_to_df(g1j):
    df = pd.DataFrame(f, columns=['exp'])
    return df

def exp_to_df(g2j):
    df = pd.DataFrame(f, columns=['exp'])
    return df

# uses the function you just implemented, and checks that your function keeps the records and uses the right column names
g1 = exp_to_df(g1j)
g2 = exp_to_df(g2j)
assert len(g1) == len(g1j)
assert len(g2) == len(g2j)
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
assert all(c in g1.columns for c in columns)
assert all(c in g2.columns for c in columns)
What am I doing wrong? It seems like the exp_to_df(g1j) and exp_to_df(g2j) definitions are wrong. Any suggestions?
Is my timestamp_to_date() also wrong?
I suggest using the pandas read_json() function to load your json directly into a dataframe (I added a couple dummy records):
g1 = pd.read_json('/content/drive/MyDrive/group1_exp_2020-04-08.json')
# cls_loc date dur exp mod pre scr usr ver
# 0 QOEBBG_K0101 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 1 QOEBBG_K0102 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 2 QOEBBG_K0103 2020-03-23 09:57:23.013 32 [{'s': 1.1, 't': 11.4, 'x': 'C'}] Poster False 67 QOGOBN 20.5.3
Then you can do all the data wrangling with pandas functions like str.extract(), assign(), to_datetime(), map(), and apply():
# extract level, unit, module as columns
g1 = g1.assign(**g1.cls_loc
                   .str.extract(r'_([a-zA-Z])([0-9]{2})([0-9]{2})')
                   .rename({0: 'level', 1: 'unit', 2: 'module'}, axis=1))

# convert date to datetime
g1.date = pd.to_datetime(g1.date, unit='ms')

# map level to int
level_map = {'K': 3}
g1.level = g1.level.map(level_map)

# compute accuracy
def accuracy(exp):
    challenges = [e for e in exp if e['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return np.nan
    mistakes = [c for c in challenges if 'm' in c.keys()]
    correct = n - len(mistakes)
    return correct / n

g1['accuracy'] = g1.exp.apply(accuracy)

# rename usr -> student_pin
g1 = g1.rename({'usr': 'student_pin'}, axis=1)

# keep desired columns
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
g1 = g1[columns]
Output:
student_pin date level unit module accuracy
0 QOGOBN 2020-03-23 09:57:23.013 3 01 01 0.500000
1 QOGOBN 2020-03-23 09:57:23.013 3 01 02 0.333333
2 QOGOBN 2020-03-23 09:57:23.013 3 01 03 NaN
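To match the target row format exactly (integer unit/module and a plain date instead of a full timestamp), a small follow-up sketch on the frame above:
# cast the extracted string columns to int and reduce the timestamp to a date
g1[['unit', 'module']] = g1[['unit', 'module']].astype(int)
g1['date'] = g1['date'].dt.date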

Performance problem when using pandas apply on big dataframes

I'm having some performance issues with the code below, mostly because of the apply function I'm using on a huge dataframe. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?
from functools import partial

def my_function_1(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data"].append(random_dict)

def my_function_2(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data2"].append(random_dict)

dictionary_list = []
for v in values:
    df_1_rows = df_1_rows[(df_1_rows.values == v)]
    df_2_rows = df_2_rows[(df_2_rows.values == v)]
    semi_dict = dict(value=v, data=[], data2=[])
    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)
    df_1_rows.apply(lambda row: function(row), axis=1)
    df_2_rows.apply(lambda row: function_2(row), axis=1)
    dictionary_list.append(semi_dict)
This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random

len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")

df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])

def make_dict(df):
    # some calculations on the df
    return {
        'data': df.head(1).values.tolist(),
    }

def make_dict_2(df):
    # some calculations on the df
    return {
        'data_2': df.head(1).values.tolist(),
    }

def merge(a, b, path=None):
    "merges b into a, taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries"
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            a[key] = b[key]
    return a

dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()

result = merge(dict1, dict2)
result
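If you would rather end up with something shaped like the original dictionary_list (one dict per value, with data and data2 keys), a minimal sketch on top of the frames above:
# one dict per group value, mirroring the semi_dict structure from the question
grouped_1 = df_1.groupby('col1').apply(make_dict)     # Series of dicts indexed by value
grouped_2 = df_2.groupby('col2').apply(make_dict_2)

dictionary_list = [
    dict(value=v,
         data=grouped_1.get(v, {}).get('data', []),
         data2=grouped_2.get(v, {}).get('data_2', []))
    for v in row_values
]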

'PicklingError' raised when applying functions in a certain class with pyspark

I'm trying to use pandas functions in Spark with applyInPandas. When I run the transform inside a class, it raises an error like this: pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
My script runs fine when written as plain functions:
from scipy.stats import kendalltau
import numpy as np
import pandas as pd

def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

def kendall_process(pdf):
    result = pdf.groupby(['step_id', 'unit_id']).apply(kendall, 'process', 'label')
    result = pd.DataFrame(result).reset_index()
    #result.columns = ['step_id','unit_id','corr','N']
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'unit_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'unit_id'], how='left')
    result.columns = ['step_id', 'unit_id', 'corr', 'N', 'ratio']
    return result

result = datInOut.groupBy('step_id', 'unit_id').applyInPandas(kendall_process,
                                                              schema='step_id string, unit_id string, corr float, N long, ratio float')
result.show(5)
result.show(5)
+--------------+--------+-----------+----+-----+
| step_id| unit_id| corr| N|ratio|
+--------------+--------+-----------+----+-----+
|10303_A2AOI300|A2AOI300| null|null| 0.0|
|17613_A2AOI500|A2AOI500|-0.13477948| 14| 0.5|
|1B304_A2MAC100|A2MAC100| null|null| 1.0|
|1A106_A2SPR100|A2SPR100| null|null| 1.0|
|19103_A2AOI800|A2AOI800| null|null| 0.5|
+--------------+--------+-----------+----+-----+
only showing top 5 rows
But when I move it into a class, it raises the PicklingError:
@staticmethod
def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

@staticmethod
def kendall_delay(pdf):
    result = pdf.groupby(['step_id', 'equip_id']).apply(QTWorker.kendall, 'delay', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result

ret = datQ.groupBy(self.step, self.equip).applyInPandas(self.kendall_delay,
                                                        schema='step_id string, equip_id string, corr float, N long, ratio float')
As you can see, I've already decorated the functions with @staticmethod, but it still does not work. I'd really like to know how to fix it!
I don't know why, but I solved it by defining the kendall function inside the grouping function.
I would still like to figure out the reason for it!
@staticmethod
def kendall_process(pdf):
    def kendall(dat, a, b):
        kentmp = []
        ken = [np.nan, np.nan]
        if type(a) is list:
            if dat.shape[0] > 3:
                for item in a:
                    kentmp.append(kendalltau(dat[item], dat[b])[0])
                tmp = pd.Series(kentmp, index=a).dropna()
                if tmp.shape[0] > 0:
                    cato = tmp.idxmax()
                    if (tmp < 0).any():
                        cato = tmp.abs().idxmax()
                    ken = [cato, tmp[cato]]
            index = ['category', 'corr']
        else:
            if dat.shape[0] >= 10:
                ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
            index = ['corr', 'N']
        return pd.Series(ken, index=index)

    result = pdf.groupby(['step_id', 'equip_id']).apply(kendall, 'process', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result
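One plausible explanation (an assumption, not verified against the original class): referencing QTWorker.kendall and self.kendall_delay pulls the class/instance into the pickled closure, and if that object holds a SparkSession or SparkContext attribute, the serializer trips over it on the workers. A hypothetical sketch that avoids both the nesting and the class reference by keeping the pandas-only helpers at module level:
from scipy.stats import kendalltau
import numpy as np
import pandas as pd

# module-level helpers: pickling them never touches the QTWorker class
def kendall(dat, a, b):
    ...  # same Kendall-tau logic as in the question

def kendall_delay(pdf):
    result = pdf.groupby(['step_id', 'equip_id']).apply(kendall, 'delay', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result

class QTWorker:
    def run(self, datQ):
        # pass the plain module-level function, not self.kendall_delay
        return datQ.groupBy(self.step, self.equip).applyInPandas(
            kendall_delay,
            schema='step_id string, equip_id string, corr float, N long, ratio float')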

how to fix the function so that the resulting dataframe has only the subsetted column?

I am trying to subset a dataframe to remove/drop certain columns from a given dataframe.
Please help me fix this function:
dataframe = pd.DataFrame({
    "X": range(10),
    "Y": range(10, 20),
    "Z": range(5, 15)
})

def subset_dataframe(dataframe, drop_cols):
    subset_df = data.drop([drop_cols], inplace=True)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["x", "z"])
dataframe = pd.DataFrame({"X": range(10), "Y": range(10, 20), "Z": range(5, 15)})

def subset_dataframe(dataframe, drop_cols):
    subset_df = data.drop(columns=[drop_cols], inplace=True)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["x", "z"])
Add the columns= keyword in the drop() call as well.
EDIT 1:
final function:
dataframe = pd.DataFrame({"X": range(10), "Y": range(10, 20), "Z": range(5, 15)})

def subset_dataframe(dataframe, drop_cols):
    subset_df = dataframe.drop(columns=drop_cols)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["X", "Z"])
EDIT 2:
If you add inplace=True, the returned object is NoneType, so nothing is stored in subset_df. Check the syntax as well; the final function in EDIT 1 works.
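A quick illustration of that pitfall (a small check, not from the original answer):
import pandas as pd

df = pd.DataFrame({"X": range(3), "Y": range(3)})
print(df.drop(columns=["X"], inplace=True))   # prints None: with inplace=True nothing is returned
print(df)                                     # the column was removed from df itself
print(df.drop(columns=["Y"]))                 # without inplace, a new DataFrame is returned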
Hope this helps :)
