pandas dataframe vectorize for loop with logical statements - python

I want to vectorize the following loop in Python. My dataframe has 50,000 rows, so the loop is too slow and I need a vectorized version of the code below.
for i in range(1, len(df)):
    if df['temperature'].iloc[i] > df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = df['qty'].iloc[i]
        df['value'].iloc[i] = 1
    elif df['temperature'].iloc[i] < df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = -1 * df['qty'].iloc[i]
        df['value'].iloc[i] = -1
    elif df['temperature'].iloc[i] == df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = df['value'].iloc[i-1] * df['qty'].iloc[i]
        df['value'].iloc[i] = df['value'].iloc[i-1]

I expect this will do the job, but without input and expected output to compare with, I can't check:
gt_idx = df['temperature'] > df['temperature'].shift()
df.loc[gt_idx, 'delta'] = df.loc[gt_idx, 'qty']
df.loc[gt_idx, 'value'] = 1
lt_idx = df['temperature'] < df['temperature'].shift()
df.loc[lt_idx, 'delta'] = df.loc[lt_idx, 'qty'] * -1
df.loc[lt_idx, 'value'] = -1
eq_idx = df['temperature'] == df['temperature'].shift()
df.loc[eq_idx, 'delta'] = df['value'].shift().loc[eq_idx] * df.loc[eq_idx, 'qty']
df.loc[eq_idx, 'value'] = df['value'].shift().loc[eq_idx]

You can try using np.select(), as follows:
import numpy as np

cond_list = [df['temperature'] > df['temperature'].shift(),
             df['temperature'] < df['temperature'].shift(),
             df['temperature'] == df['temperature'].shift()]
delta_choices = [df['qty'], -df['qty'], df['qty'] * df['value'].shift()]
value_choices = [1, -1, df['value'].shift()]
df['delta'] = np.select(cond_list, delta_choices)
df['value'] = np.select(cond_list, value_choices)
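One caveat: when several consecutive rows share the same temperature, value[i-1] is itself updated within the same pass of the original loop, so a plain shift() will not reproduce it exactly. A sketch that forward-fills the sign of the temperature change handles such runs (the first row stays NaN, just as the loop skips it):
import numpy as np

sign = np.sign(df['temperature'].diff())        # +1 rise, -1 fall, 0 unchanged, NaN for the first row
df['value'] = sign.replace(0, np.nan).ffill()   # carry the last direction forward through equal runs
df['delta'] = df['value'] * df['qty']           # qty, -qty, or previous direction * qty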

Related

Conditional method chaining in pandas

Is there a simple general way to make a method conditional to an if-statement when using method chaining with pandas?
Mock example:
import pandas as pd

df = pd.DataFrame({'A': ['one', 'two'], 'B': ['one', 'two']})
change_to_numeric = False
df = (df
      .query("A == 'one'")
      .replace('one', 1)  # <-- execute this step only if change_to_numeric == True
      )
Thank you!
You can use pipe:
df = pd.DataFrame({'A': ['one', 'two'], 'B': ['one', 'two']})
change_to_numeric = False
df = (df
      .query("A == 'one'")
      .pipe(lambda d: d.replace('one', 1) if change_to_numeric else d)
      )
output for change_to_numeric = False:
A B
0 one one
output for change_to_numeric = True:
A B
0 1 1

Apply for loop in multiple dataframe for multiple columns?

The dataframes are shown below. I want to change a dataframe's values to 'dead' wherever the age is more than 100.
import pandas as pd
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
Desired outcome
df=
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
df2=
age1 age2 name
0 80 70 a
1 90 dead b
2 dead 90 c
I was trying something like this:
col_list = ['age1', 'age2']
df_list = [df, df2]

def dead(df):
    for df in df_list:
        if df.columns in col_list:
            if df.columns >= 100:
                return 'dead'
            else:
                return df.columns

df.apply(dead)
Error shown:
The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I am looking for a loop that works across all dataframes.
Please also correct my function, for future learning :)
With your shown samples, please try the following, which uses pandas' filter and numpy's np.where:
c = df.filter(regex=r'age\d+').columns
df[c] = np.where(df[c].ge(100), 'dead', df[c])
df
Alternative approach with where:
c = df.filter(like='age').columns
df[c] = df[c].where(~df[c].ge(100), 'dead')
Explanation:
First, collect the columns whose names match 'age' into the variable c.
Then use np.where to check whether each of those age columns is greater than or equal to 100; if so, set the value to 'dead', otherwise keep it as it is.
I did the following:
col_list = ['age1', 'age2']
df_list = [df, df2]
for d in df_list:
    for c in col_list:
        d.loc[d[c] > 100, c] = 'dead'
# inspired by #jib and #ravinder
col_list = ['age1', 'age2']
df_list = [df, df2]
for d in df_list:
    for c in col_list:
        d[c] = np.where(d[c] > 100, 'dead', d[c])
df  # or df2
output:
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
One possible solution is to use Pandas' mask, which is similar to if-else, but vectorized.
def dead(df):
    col_list = ['age1', 'age2']
    df = df.copy()
    temporary = df.filter(col_list)
    temporary = temporary.mask(temporary >= 100, "dead")
    df.loc[:, col_list] = temporary
    return df
Apply the function to the dataframe:
df.pipe(dead)
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
You can do:
def check_more_than_100(x):
    v = None
    try:
        v = int(x)
    except (TypeError, ValueError):
        pass
    if v is not None:
        return v > 100
    return False
df['age1'] = df['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df['age2'] = df['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age1'] = df2['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age2'] = df2['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
This should take care of non-int values if any.
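For reference, a vectorized sketch of the same idea, assuming pd.to_numeric's coercion is acceptable (cells that are not numbers become NaN, fail the comparison, and are left untouched):
import pandas as pd

for d in (df, df2):
    for col in ('age1', 'age2'):
        numeric = pd.to_numeric(d[col], errors='coerce')  # non-numeric values -> NaN
        d[col] = d[col].mask(numeric > 100, 'dead')        # replace only where the check passes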
I used this answer to a similar question. Basically, you can use numpy's where() function to set values based on a condition.
import pandas as pd
import numpy as np

raw_data = {'age1': [23, 45, 210], 'age2': [10, 20, 150], 'name': ['a', 'b', 'c']}
df = pd.DataFrame(raw_data, columns=['age1', 'age2', 'name'])
raw_data = {'age1': [80, 90, 110], 'age2': [70, 120, 90], 'name': ['a', 'b', 'c']}
df2 = pd.DataFrame(raw_data, columns=['age1', 'age2', 'name'])

col_list = ['age1', 'age2']
df_list = [df, df2]

def dead(df_list, col_list):
    for df in df_list:
        for col in col_list:
            df[col] = np.where(df[col] >= 100, "dead", df[col])
    return df_list

dead([df], col_list)
df
Extracting the numeric columns and then using numpy's where:
df_cols = df._get_numeric_data().columns.values
df2_cols = df2._get_numeric_data().columns.values
df[df_cols] = np.where(df[df_cols].to_numpy() > 100, 'dead', df[df_cols])
df2[df2_cols] = np.where(df2[df2_cols].to_numpy() > 100, 'dead', df2[df2_cols])
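Note that _get_numeric_data is a private pandas accessor; a sketch of the same column selection using the public select_dtypes:
df_cols = df.select_dtypes(include='number').columns
df[df_cols] = np.where(df[df_cols].to_numpy() > 100, 'dead', df[df_cols])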

Fastest way to filter a pandas dataframe many times in a loop

I have a dataframe with 3 million rows (df1) and another with 10k rows (df2). What is the fastest way to filter df1 for each row of df2?
Here is exactly what I need to do in the loop:
for i in range(len(df2)):  # for each row
    x = df1[(df1['column1'].isin([df2['info1'][i]]))
            & (df1['column2'].isin([df2['info2'][i]]))
            & (df1['column3'].isin([df2['info3'][i]]))]
    # ..... more code using the x variable every time ......
This code is not fast enough to be viable. Note that I used the .isin() function even though there is always only one item inside it; I found that df1['column1'].isin([df2['info1'][i]]) was faster than df1['column1'] == df2['info1'][i].
import pandas as pd
import numpy as np

def make_filter(x, y, match_dict, unique=False):
    mask = None
    for x_key in x.columns:
        if x_key in match_dict:
            y_key = match_dict[x_key]
            y_col = y[y_key]
            if unique:
                y_col = y_col.unique()
            col_mask = x[x_key].isin(y_col)
            if mask is None:
                mask = col_mask
            else:
                mask = mask & col_mask
    return mask

def main():
    n_rows = 100
    x = np.random.randint(4, size=(n_rows, 2))
    x = pd.DataFrame(x, columns=["col1", "col2"])
    y = np.random.randint(2, 4, size=(n_rows, 2))
    y = pd.DataFrame(y, columns=["info1", "info2"])
    match_dict = {"col1": "info1", "col2": "info2"}
    z = make_filter(x, y, match_dict, unique=True)
    print(x[z])

main()
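Note that the combined isin filter above keeps every df1 row whose individual column values appear anywhere in df2, not rows matching a particular df2 row on all columns at once. If the latter is needed, a sketch using a single merge plus groupby (assuming the column names from the question) avoids re-filtering df1 on every iteration:
import pandas as pd

# one inner merge finds all df1 rows matching some df2 row on all three columns
merged = df1.merge(
    df2[['info1', 'info2', 'info3']].drop_duplicates(),
    left_on=['column1', 'column2', 'column3'],
    right_on=['info1', 'info2', 'info3'],
    how='inner',
)

# iterate over the per-key sub-frames instead of scanning df1 10k times
for key, x in merged.groupby(['column1', 'column2', 'column3']):
    pass  # ..... more code using the x variable every time ......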

How to fix erratic NaN values in pandas generated by column operations?

I have some forex data here, on which I am trying to do some pandas operations.
import pandas as pd
import numpy as np
df = pd.read_excel(r"History_M1.xlsx", sheet_name='Sheet1', dtype={'high': float, 'low':float, 'open':float, 'close':float, 'hour': str})
df['time'] = pd.to_datetime(df['time'], utc=True)
df.set_index('time', inplace=True)
df[['high','low','open','close']] = df[['high','low','open','close']].apply(pd.to_numeric, errors='coerce')
df['hour'] = df.index.hour
df['hl'] = (df['high'] - df['low'])*10**4
df['oc'] = (df['close'] - df['open'])*10**4
df['ab'] = (df['close'] - df['open']).abs()*10**4
df['dir'] = df[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
I downsampled df to an hourly frequency and performed some column operations.
dfh = df[['volume','high','low','open','close']].resample('1H').agg({'volume': 'sum','open': 'first','high': 'max','low': 'min','close': 'last'}).ffill()
dfh['day'] = dfh.index.weekday
dfh['hour'] = dfh.index.hour
dfh['hl'] = (dfh['high'] - dfh['low'])*10**4
dfh['oc'] = (dfh['close'] - dfh['open'])*10**4
dfh['ab'] = (dfh['close'] - df['open']).abs()*10**4
dfh['dir'] = dfh[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
The column dfh['ab'] is giving some NaN values for no apparent reason. How can I fix this?
Maybe it does not work because you used df instead of dfh here:
dfh['ab'] = (dfh['close'] - df['open']).abs()*10**4 # should be dfh['open']
Since df has a minute-level index and dfh an hourly one, the subtraction aligns on the index and produces NaN wherever the timestamps do not match.
Also, try replacing this lambda operation:
df['dir'] = df[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
with a numpy operation (much faster):
df['dir'] = np.where(df['close'] > df['open'], 1, -1)

Pandas: df_left.merge(df_right) Summary Statistics

Regarding pandas' df.merge() method, is there a convenient way to obtain merge summary statistics (such as the number of matched and unmatched rows)? I know these stats depend on the how argument, but it would be handy to know how much is being 'discarded' when using an inner join, etc. I could simply use:
df = df_left.merge(df_right, on='common_column', how='inner')
set1 = set(df_left['common_column'].unique())
set2 = set(df_right['common_column'].unique())
set1.issubset(set2)  # True -> no further analysis required
set2.issubset(set1)  # False
num_shared = len(set2.intersection(set1))
num_diff = len(set2.difference(set1))
# ... and so on
But I thought this might be implemented already. Have I missed it (i.e. something like a report=True flag for merge that would return the new dataframe plus a report series or dataframe)?
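Worth noting: merge accepts an indicator=True argument, which adds a _merge column labelling each row as 'left_only', 'right_only', or 'both', and covers much of this bookkeeping. A minimal sketch, reusing the column name from the question:
df = df_left.merge(df_right, on='common_column', how='outer', indicator=True)
print(df['_merge'].value_counts())  # counts of 'both', 'left_only', 'right_only'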
Try this function... You can then just pass your arguments into it like this:
df = merge_like_stata(df1, df2, mergevars)
Function definition:
def merge_like_stata(master, using, mergevars):
    master['_master_merge_'] = 'master'
    using['_using_merge_'] = 'using'
    df = pd.merge(master, using, on=mergevars, how='outer')
    df['_master_merge_'] = df['_master_merge_'].apply(lambda x: 'miss' if pd.isnull(x) else x)
    df['_using_merge_'] = df['_using_merge_'].apply(lambda x: 'miss' if pd.isnull(x) else x)
    df['_merge'] = df.apply(lambda row: '3 - Matched' if row['_master_merge_'] == 'master' and row['_using_merge_'] == 'using' else None, axis=1)
    df['_merge'] = df.apply(lambda row: '1 - Master Only' if row['_master_merge_'] == 'master' and row['_using_merge_'] == 'miss' else row['_merge'], axis=1)
    df['_merge'] = df.apply(lambda row: '2 - Using Only' if row['_master_merge_'] == 'miss' and row['_using_merge_'] == 'using' else row['_merge'], axis=1)
    df['_count_'] = 'Count'
    print(pd.crosstab(df['_merge'], df['_count_'], margins=True))
    df = df.drop(['_master_merge_', '_using_merge_', '_count_'], axis=1)
    return df
This is what I use so far.
It is part of a function that concords data from one coding system to another.
if report:
    report_df = pd.DataFrame(data[match_on].describe(), columns=['left'])
    report_df = report_df.merge(pd.DataFrame(concord[match_on].describe(), columns=['right']),
                                left_index=True, right_index=True)
    set_left = set(data[match_on])
    set_right = set(concord[match_on])
    set_info = pd.DataFrame({'left': set_left.issubset(set_right),
                             'right': set_right.issubset(set_left)}, index=['subset'])
    report_df = pd.concat([report_df, set_info])
    set_info = pd.DataFrame({'left': len(set_left.difference(set_right)),
                             'right': len(set_right.difference(set_left))}, index=['differences'])
    report_df = pd.concat([report_df, set_info])
    # keep up to 5 example differences from each side
    left_diff = list(set_left.difference(set_right))[0:5]
    if len(left_diff) < 5:
        left_diff = (left_diff + [np.nan] * 5)[0:5]
    right_diff = list(set_right.difference(set_left))[0:5]
    if len(right_diff) < 5:
        right_diff = (right_diff + [np.nan] * 5)[0:5]
    set_info = pd.DataFrame({'left': left_diff, 'right': right_diff},
                            index=['diff1', 'diff2', 'diff3', 'diff4', 'diff5'])
    report_df = pd.concat([report_df, set_info])
Sample Report
