Python - reduce number of if statements

I am filtering a pandas dataframe based on one or more conditions, like so:
def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    if position:
        df = df[df['position_id'] == position]
    if team_id:
        df = df[df['team_id'] == team_id]
    if home:
        if home == 'home':
            df = df[df['home_dummy'] == 1.0]
        elif home == 'away':
            df = df[df['home_dummy'] == 0.0]
    if window:
        df = df[df['round_id'].between(1, window)]
    if min_games:
        df = df[df['games_num'] >= min_games]
    return df
But I don't think this is elegant.
Is there a simpler way of achieving the same result?
I thought of creating rules for the conditions, as in this SO answer, and then using any(rules) to apply the filtering, but I don't know how to approach this. Any ideas?

You could try something like this:
def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    # pair each "should this filter run?" flag with a callable that builds its boolean mask,
    # so masks are only evaluated when their argument was actually supplied
    masks = {
        "mask1": [position is not None, lambda d: d["position_id"] == position],
        "mask2": [team_id is not None, lambda d: d["team_id"] == team_id],
        "mask3": [home == "home", lambda d: d["home_dummy"] == 1.0],
        "mask4": [home == "away", lambda d: d["home_dummy"] == 0.0],
        "mask5": [window is not None, lambda d: d["round_id"].between(1, window)],
        "mask6": [min_games is not None, lambda d: d["games_num"] >= min_games],
    }
    for active, mask in masks.values():
        if active:
            df = df[mask(df)]
    return df
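If the goal is mainly to avoid slicing the frame once per condition, another option (a sketch, not from the original answer) is to collect only the active boolean masks and combine them once with numpy:
import numpy as np

def filter_dataframe(dataframe, position=None, team_id=None, home=None, window=None, min_games=0):
    df = dataframe.copy()
    masks = []
    # collect a mask only for the arguments that were actually supplied
    if position is not None:
        masks.append(df['position_id'] == position)
    if team_id is not None:
        masks.append(df['team_id'] == team_id)
    if home in ('home', 'away'):
        masks.append(df['home_dummy'] == (1.0 if home == 'home' else 0.0))
    if window is not None:
        masks.append(df['round_id'].between(1, window))
    if min_games:
        masks.append(df['games_num'] >= min_games)
    if not masks:
        return df
    # AND all masks together and index the frame a single time
    return df[np.logical_and.reduce(masks)]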

Related

How can I simplify this function, random pandas DF selection adding to dictionary?

How can I simplify this function I am trying to create? I would like to pull data from a CSV, turn it into a DataFrame, randomly select a choice, and add that choice to the corresponding dictionary key-value pair.
def generate_traits():
    import pandas as pd
    df_bonds = pd.read_csv('/file/location_1')
    df_alignments = pd.read_csv('/file/location_2')
    df_faiths = pd.read_csv('/file/location_3')
    df_flaws = pd.read_csv('/file/location_4')
    df_ideals = pd.read_csv('/file/location_5')
    df_lifestyles = pd.read_csv('/file/location_6')
    df_organizations = pd.read_csv('/file/location_7')
    df_personalities = pd.read_csv('/file/location_8')
    df_names = pd.read_csv('/file/location_9')
    random_bond = df_bonds.sample(1)
    random_alignment = df_alignments.sample(1)
    random_faith = df_faiths.sample(1)
    random_flaw = df_flaws.sample(1)
    random_ideal = df_ideals.sample(1)
    random_lifestyle = df_lifestyles.sample(1)
    random_organization = df_organizations.sample(1)
    random_personality = df_personalities.sample(1)
    random_name = df_names.sample(1)
    traits_dict = {"Name:": random_name.iloc[0, 0],
                   "Alignment:": random_alignment.iloc[0, 0],
                   "Bond:": random_bond.iloc[0, 0],
                   "Religion:": random_faith.iloc[0, 0],
                   "Flaw:": random_flaw.iloc[0, 0],
                   "Ideal:": random_ideal.iloc[0, 0],
                   "Lifestyle:": random_lifestyle.iloc[0, 0],
                   "Organization:": random_organization.iloc[0, 0],
                   "Personality:": random_personality.iloc[0, 0]}
    return traits_dict
The function does behave as expected; however, I know there must be a way to loop through this, I just have not found one.
You can chain your operations:
import pandas as pd

def generate_traits():
    return {'Name': pd.read_csv('/file/location_9').sample(1).iloc[0, 0],
            'Alignment': pd.read_csv('/file/location_2').sample(1).iloc[0, 0],
            'Bond': pd.read_csv('/file/location_1').sample(1).iloc[0, 0],
            'Religion': pd.read_csv('/file/location_3').sample(1).iloc[0, 0],
            'Flaw': pd.read_csv('/file/location_4').sample(1).iloc[0, 0],
            'Ideal': pd.read_csv('/file/location_5').sample(1).iloc[0, 0],
            'Lifestyle': pd.read_csv('/file/location_6').sample(1).iloc[0, 0],
            'Organization': pd.read_csv('/file/location_7').sample(1).iloc[0, 0],
            'Personality': pd.read_csv('/file/location_8').sample(1).iloc[0, 0]}
Or, to avoid repeating the read-and-sample pattern, drive it from a dictionary of file locations:
def generate_traits():
    import pandas as pd
    name_location = {'Bond': 'location_1',
                     'Alignment': 'location_2',
                     'Religion': 'location_3',
                     'Flaw': 'location_4',
                     'Ideal': 'location_5',
                     'Lifestyle': 'location_6',
                     'Organization': 'location_7',
                     'Personality': 'location_8',
                     'Name': 'location_9'}
    all_df = {name: pd.read_csv(f'/file/{loc}') for name, loc in name_location.items()}
    traits_dict = {name: df.sample(1).iloc[0, 0] for name, df in all_df.items()}
    return traits_dict
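A quick usage check (assuming the /file/location_* CSVs exist and each has at least one row):
traits = generate_traits()
for trait, value in traits.items():
    print(trait, value)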

parsing json file with function into dataframe for analysis

Hi, I am working with two JSON files, and I'm having problems with the data cleaning.
Suppose a record in g1j or g2j looks like this:
{
    'cls_loc': 'QOEBBG_K0101',
    'date': 1584957443013,
    'dur': 32,
    'exp': [
        {
            'm': 'spot_excited',
            's': 8.5,
            't': 8.5,
            'w': 'spot_bored',
            'x': 'A'
        },
        {
            's': 1.1,
            't': 11.4,
            'w': 'spot_scared',
            'x': 'A'
        }
    ],
    'mod': 'Poster',
    'pre': False,
    'scr': 67,
    'usr': 'QOGOBN',
    'ver': '20.5.3'
}
What we want per row in our DataFrame is this:
{
    'student_pin': 'QOGOBN',              # from `usr`
    'date': datetime.date(2020, 3, 23),   # from `date`, but parsed
    'duration': 32,                       # from `dur`
    'level': 3,                           # the "K" from `cls_loc`, mapped to int
    'unit': 1,                            # from `cls_loc`, mapped to int
    'module': 1,                          # from `cls_loc`, mapped to int
    'accuracy': 0.5,                      # calculated from `exp`
}
my code so far:
from datetime import datetime
import json
import numpy as np
import pandas as pd
from scipy import stats

with open('/content/drive/MyDrive/group1_exp_2020-04-08.json', 'r') as f:
    g1j = json.loads(f.read())
with open('/content/drive/MyDrive/group2_exp_2020-04-22.json', 'r') as f:
    g2j = json.loads(f.read())

# convert the integer timestamp to a datetime.date
def timestamp_to_date():
    l = []
    for item in g1j:
        timestamp = item['date']
        timestamp = timestamp / 1000
        dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
        l.append(dt_obj)
    return l

timestamp_to_date()

def timestamp_to_date():
    l = []
    for item in g2j:
        timestamp = item['date']
        timestamp = timestamp / 1000
        dt_obj = datetime.fromtimestamp(timestamp).strftime('%Y, %m, %d ')
        l.append(dt_obj)
    return l

# extract the level, unit, module, and accuracy here
def get_level(x):
    loc = x['cls_loc'].split('_')[-1]
    return level_map[loc[0]]

def get_unit(x):
    loc = x['cls_loc'].split('_')[-1]
    unit = loc[1:3]
    return int(unit)

def get_module(x):
    loc = x['cls_loc'].split('_')[-1]
    module = loc[3:]
    return int(module)

def get_accuracy(x):
    challenges = [x for x in x['exp'] if x['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return 'N/A'
    mistakes = [x for x in challenges if 'm' in x.keys()]
    correct = n - len(mistakes)
    return correct / n

# create the function to convert experience records to the pandas.DataFrame
def exp_to_df(g1j):
    df = pd.DataFrame(f, columns=['exp'])
    return df

def exp_to_df(g2j):
    df = pd.DataFrame(f, columns=['exp'])
    return df

# uses the function you just implemented, and checks that your function keeps the records and uses the right column names
g1 = exp_to_df(g1j)
g2 = exp_to_df(g2j)
assert len(g1) == len(g1j)
assert len(g2) == len(g2j)
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
assert all(c in g1.columns for c in columns)
assert all(c in g2.columns for c in columns)
What am I doing wrong? It seems like the exp_to_df(g1j) and exp_to_df(g2j) definitions are wrong. Any suggestions?
Is my timestamp_to_date() also wrong?
I suggest using the pandas read_json() function to load your json directly into a dataframe (I added a couple dummy records):
g1 = pd.read_json('/content/drive/MyDrive/group1_exp_2020-04-08.json')
# cls_loc date dur exp mod pre scr usr ver
# 0 QOEBBG_K0101 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 1 QOEBBG_K0102 2020-03-23 09:57:23.013 32 [{'m': 'spot_excited', 's': 8.5, 't': 8.5, 'w'... Poster False 67 QOGOBN 20.5.3
# 2 QOEBBG_K0103 2020-03-23 09:57:23.013 32 [{'s': 1.1, 't': 11.4, 'x': 'C'}] Poster False 67 QOGOBN 20.5.3
Then you can do all the data wrangling with pandas functions like str.extract(), assign(), to_datetime(), map(), and apply():
# extract level, unit, module as columns
g1 = g1.assign(**g1.cls_loc
                   .str.extract(r'_([a-zA-Z])([0-9]{2})([0-9]{2})')
                   .rename({0: 'level', 1: 'unit', 2: 'module'}, axis=1))

# convert date to datetime
g1.date = pd.to_datetime(g1.date, unit='ms')

# map level to int
level_map = {'K': 3}
g1.level = g1.level.map(level_map)

# compute accuracy
def accuracy(exp):
    challenges = [e for e in exp if e['x'] == 'A']
    n = len(challenges)
    if n == 0:
        return np.nan
    mistakes = [c for c in challenges if 'm' in c.keys()]
    correct = n - len(mistakes)
    return correct / n

g1['accuracy'] = g1.exp.apply(accuracy)

# rename usr -> student_pin
g1 = g1.rename({'usr': 'student_pin'}, axis=1)

# keep desired columns
columns = ['student_pin', 'date', 'level', 'unit', 'module', 'accuracy']
g1 = g1[columns]
Output:
student_pin date level unit module accuracy
0 QOGOBN 2020-03-23 09:57:23.013 3 01 01 0.500000
1 QOGOBN 2020-03-23 09:57:23.013 3 01 02 0.333333
2 QOGOBN 2020-03-23 09:57:23.013 3 01 03 NaN
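To match the target row format exactly (integer unit/module and a plain date instead of a full timestamp), a small follow-up sketch on the frame above:
# cast the extracted string columns to int and reduce the timestamp to a date
g1[['unit', 'module']] = g1[['unit', 'module']].astype(int)
g1['date'] = g1['date'].dt.date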

Performance problem when using pandas apply on big dataframes

I'm having some performance issues with the code below, mostly because of the apply function I'm using on a huge dataframe. I want to update the semi_dict dictionary with some other data that I'm calculating with some functions. Is there any way to improve this?
from functools import partial

def my_function_1(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data"].append(random_dict)

def my_function_2(semi_dict, row):
    # do some calculation/other stuff based on the row data and append it to the dictionary
    random_dict = dict(data=some_data, more_data=more_data)
    semi_dict["data2"].append(random_dict)

dictionary_list = []
for v in values:
    df_1_rows = df_1_rows[(df_1_rows.values == v)]
    df_2_rows = df_2_rows[(df_2_rows.values == v)]
    semi_dict = dict(value=v, data=[], data2=[])
    function = partial(my_function_1, semi_dict)
    function_2 = partial(my_function_2, semi_dict)
    df_1_rows.apply(lambda row: function(row), axis=1)
    df_2_rows.apply(lambda row: function_2(row), axis=1)
    dictionary_list.append(semi_dict)
This answer uses dictionary merge from How to merge dictionaries of dictionaries?, but depending on your use case, you might not need it in the end:
import pandas as pd
import random

len_df = 10
row_values = list("ABCD")
extra_col_values = list("12345")

df_1 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col1', 'extra1'])
df_2 = pd.DataFrame([[random.choice(row_values), random.choice(extra_col_values)] for _ in range(len_df)], columns=['col2', 'extra2'])

def make_dict(df):
    # some calculations on the df
    return {
        'data': df.head(1).values.tolist(),
    }

def make_dict_2(df):
    # some calculations on the df
    return {
        'data_2': df.head(1).values.tolist(),
    }

def merge(a, b, path=None):
    "merges b into a, taken from https://stackoverflow.com/questions/7204805/how-to-merge-dictionaries-of-dictionaries"
    if path is None:
        path = []
    for key in b:
        if key in a:
            if isinstance(a[key], dict) and isinstance(b[key], dict):
                merge(a[key], b[key], path + [str(key)])
            elif a[key] == b[key]:
                pass  # same leaf value
            else:
                raise Exception('Conflict at %s' % '.'.join(path + [str(key)]))
        else:
            a[key] = b[key]
    return a

dict1 = df_1.groupby('col1').apply(make_dict).to_dict()
dict2 = df_2.groupby('col2').apply(make_dict_2).to_dict()

result = merge(dict1, dict2)
result
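If you would rather end up with something shaped like the original dictionary_list (one dict per value, with data and data2 keys), a minimal sketch on top of the frames above:
# one dict per group value, mirroring the semi_dict structure from the question
grouped_1 = df_1.groupby('col1').apply(make_dict)     # Series of dicts indexed by value
grouped_2 = df_2.groupby('col2').apply(make_dict_2)

dictionary_list = [
    dict(value=v,
         data=grouped_1.get(v, {}).get('data', []),
         data2=grouped_2.get(v, {}).get('data_2', []))
    for v in row_values
]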

'PicklingError' raised when applying functions in a certain class with pyspark

I'm trying to use pandas functions in Spark with applyInPandas. When I run the transform inside a class, it raises an error like this: pickle.PicklingError: Could not serialize object: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063.
My script runs fine when written as plain functions:
from scipy.stats import kendalltau
import numpy as np
import pandas as pd

def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

def kendall_process(pdf):
    result = pdf.groupby(['step_id', 'unit_id']).apply(kendall, 'process', 'label')
    result = pd.DataFrame(result).reset_index()
    #result.columns = ['step_id','unit_id','corr','N']
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'unit_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'unit_id'], how='left')
    result.columns = ['step_id', 'unit_id', 'corr', 'N', 'ratio']
    return result

result = datInOut.groupBy('step_id', 'unit_id').applyInPandas(kendall_process,
                                                              schema='step_id string, unit_id string, corr float, N long, ratio float')
result.show(5)
result.show(5)
+--------------+--------+-----------+----+-----+
| step_id| unit_id| corr| N|ratio|
+--------------+--------+-----------+----+-----+
|10303_A2AOI300|A2AOI300| null|null| 0.0|
|17613_A2AOI500|A2AOI500|-0.13477948| 14| 0.5|
|1B304_A2MAC100|A2MAC100| null|null| 1.0|
|1A106_A2SPR100|A2SPR100| null|null| 1.0|
|19103_A2AOI800|A2AOI800| null|null| 0.5|
+--------------+--------+-----------+----+-----+
only showing top 5 rows
But when I move it into a class, it raises the PicklingError:
@staticmethod
def kendall(dat, a, b):
    kentmp = []
    ken = [np.nan, np.nan]
    if type(a) is list:
        if dat.shape[0] > 3:
            for item in a:
                kentmp.append(kendalltau(dat[item], dat[b])[0])
            tmp = pd.Series(kentmp, index=a).dropna()
            if tmp.shape[0] > 0:
                cato = tmp.idxmax()
                if (tmp < 0).any():
                    cato = tmp.abs().idxmax()
                ken = [cato, tmp[cato]]
        index = ['category', 'corr']
    else:
        if dat.shape[0] >= 10:
            ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
        index = ['corr', 'N']
    return pd.Series(ken, index=index)

@staticmethod
def kendall_delay(pdf):
    result = pdf.groupby(['step_id', 'equip_id']).apply(QTWorker.kendall, 'delay', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result

ret = datQ.groupBy(self.step, self.equip).applyInPandas(self.kendall_delay,
                                                        schema='step_id string, equip_id string, corr float, N long, ratio float')
As you can see, I've already decorated the functions with @staticmethod, but it still does not work. I'd really like to know how to fix it!
I don't know why, but I solved it by defining the kendall function inside the grouping function.
I would still like to figure out the reason for it!
@staticmethod
def kendall_process(pdf):
    def kendall(dat, a, b):
        kentmp = []
        ken = [np.nan, np.nan]
        if type(a) is list:
            if dat.shape[0] > 3:
                for item in a:
                    kentmp.append(kendalltau(dat[item], dat[b])[0])
                tmp = pd.Series(kentmp, index=a).dropna()
                if tmp.shape[0] > 0:
                    cato = tmp.idxmax()
                    if (tmp < 0).any():
                        cato = tmp.abs().idxmax()
                    ken = [cato, tmp[cato]]
            index = ['category', 'corr']
        else:
            if dat.shape[0] >= 10:
                ken = [kendalltau(dat[a], dat[b])[0], dat.shape[0]]
            index = ['corr', 'N']
        return pd.Series(ken, index=index)

    result = pdf.groupby(['step_id', 'equip_id']).apply(kendall, 'process', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result
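One plausible explanation (an assumption, not verified against the original class): referencing QTWorker.kendall and self.kendall_delay pulls the class/instance into the pickled closure, and if that object holds a SparkSession or SparkContext attribute, the serializer trips over it on the workers. A hypothetical sketch that avoids both the nesting and the class reference by keeping the pandas-only helpers at module level:
from scipy.stats import kendalltau
import numpy as np
import pandas as pd

# module-level helpers: pickling them never touches the QTWorker class
def kendall(dat, a, b):
    ...  # same Kendall-tau logic as in the question

def kendall_delay(pdf):
    result = pdf.groupby(['step_id', 'equip_id']).apply(kendall, 'delay', 'label')
    result = pd.DataFrame(result).reset_index()
    pdf['label'] = pdf.label.astype('int')
    result_ = pdf.groupby(['step_id', 'equip_id'])['label'].mean().reset_index()
    result = pd.merge(result, result_, on=['step_id', 'equip_id'], how='left')
    result.columns = ['step_id', 'equip_id', 'corr', 'N', 'ratio']
    return result

class QTWorker:
    def run(self, datQ):
        # pass the plain module-level function, not self.kendall_delay
        return datQ.groupBy(self.step, self.equip).applyInPandas(
            kendall_delay,
            schema='step_id string, equip_id string, corr float, N long, ratio float')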

how to fix the function so that the resulting dataframe has only the subsetted column?

I am trying to subset a dataframe to remove/drop certain columns from a given dataframe.
Please help me fix this function:
dataframe = pd.DataFrame({
    "X": range(10),
    "Y": range(10, 20),
    "Z": range(5, 15)
})

def subset_dataframe(dataframe, drop_cols):
    subset_df = data.drop([drop_cols], inplace=True)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["x", "z"])
dataframe = pd.DataFrame({"X": range(10), "Y": range(10, 20), "Z": range(5, 15)})

def subset_dataframe(dataframe, drop_cols):
    subset_df = data.drop(columns=[drop_cols], inplace=True)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["x", "z"])
Add the columns= keyword in the drop() call as well.
EDIT 1:
final function:
dataframe = pd.DataFrame({"X": range(10), "Y": range(10, 20), "Z": range(5, 15)})

def subset_dataframe(dataframe, drop_cols):
    subset_df = dataframe.drop(columns=drop_cols)
    return subset_df

subsetted_df = subset_dataframe(dataframe, drop_cols=["X", "Z"])
EDIT 2:
If you add inplace=True, the returned object is NoneType, so nothing is stored in subset_df. Check the syntax as well; the final function in EDIT 1 works.
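A quick illustration of that pitfall (a small check, not from the original answer):
import pandas as pd

df = pd.DataFrame({"X": range(3), "Y": range(3)})
print(df.drop(columns=["X"], inplace=True))   # prints None: with inplace=True nothing is returned
print(df)                                     # the column was removed from df itself
print(df.drop(columns=["Y"]))                 # without inplace, a new DataFrame is returned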
Hope this helps :)
