I have some forex data here, on which I am trying to do some pandas operations.
import pandas as pd
import numpy as np
df = pd.read_excel(r"History_M1.xlsx", sheet_name='Sheet1', dtype={'high': float, 'low':float, 'open':float, 'close':float, 'hour': str})
df['time'] = pd.to_datetime(df['time'], utc=True)
df.set_index('time', inplace=True)
df[['high','low','open','close']] = df[['high','low','open','close']].apply(pd.to_numeric, errors='coerce')
df['hour'] = df.index.hour
df['hl'] = (df['high'] - df['low'])*10**4
df['oc'] = (df['close'] - df['open'])*10**4
df['ab'] = (df['close'] - df['open']).abs()*10**4
df['dir'] = df[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
I downsampled df to an hourly frequency and performed some column operations.
dfh = df[['volume','high','low','open','close']].resample('1H').agg({'volume': 'sum','open': 'first','high': 'max','low': 'min','close': 'last'}).ffill()
dfh['day'] = dfh.index.weekday
dfh['hour'] = dfh.index.hour
dfh['hl'] = (dfh['high'] - dfh['low'])*10**4
dfh['oc'] = (dfh['close'] - dfh['open'])*10**4
dfh['ab'] = (dfh['close'] - df['open']).abs()*10**4
dfh['dir'] = dfh[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
The column dfh['ab'] is giving some NaN values for no reason. How can we fix this?
Maybe it does not work because you used df instead of dfh here:
dfh['ab'] = (dfh['close'] - df['open']).abs()*10**4 # should be dfh['open']
Also, try replacing this lambda operation
df['dir'] = df[['close','open']].apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1)
with a NumPy operation (much faster):
df['dir'] = np.where(df['close'] > df['open'], 1, -1)
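To get a feel for the speed difference, here is a minimal timing sketch (the data size and setup are hypothetical, not from the original post):
import timeit
import numpy as np
import pandas as pd

# 100,000 synthetic rows with random open/close prices
rng = np.random.default_rng(0)
sample = pd.DataFrame(rng.random((100_000, 2)), columns=['open', 'close'])

# row-wise apply runs a Python lambda per row; np.where is a single vectorised pass
t_apply = timeit.timeit(lambda: sample.apply(lambda x: 1 if x['close'] > x['open'] else -1, axis=1), number=3)
t_where = timeit.timeit(lambda: np.where(sample['close'] > sample['open'], 1, -1), number=3)
print(f"apply: {t_apply:.2f}s  np.where: {t_where:.2f}s")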
I want to vectorize the following function in Python. I have 50,000 rows in the dataframe, so it needs to run fast; that is why I need to vectorize the code below.
for i in range(1, len(df)):
    if df['temperature'].iloc[i] > df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = df['qty'].iloc[i]
        df['value'].iloc[i] = 1
    elif df['temperature'].iloc[i] < df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = -1 * df['qty'].iloc[i]
        df['value'].iloc[i] = -1
    elif df['temperature'].iloc[i] == df['temperature'].iloc[i-1]:
        df['delta'].iloc[i] = df['value'].iloc[i-1] * df['qty'].iloc[i]
        df['value'].iloc[i] = df['value'].iloc[i-1]
I expect this will do the job, but without input and expected output to compare with, I can't check. Note that the loop compares each row with the previous one, so the shift is by +1, and the equal-temperature branch reads the pre-existing 'value' column rather than the value just computed for the previous row:
gt_idx = df['temperature'] > df['temperature'].shift(1)
df.loc[gt_idx, 'delta'] = df.loc[gt_idx, 'qty']
df.loc[gt_idx, 'value'] = 1
lt_idx = df['temperature'] < df['temperature'].shift(1)
df.loc[lt_idx, 'delta'] = df.loc[lt_idx, 'qty'] * -1
df.loc[lt_idx, 'value'] = -1
eq_idx = df['temperature'] == df['temperature'].shift(1)
df.loc[eq_idx, 'delta'] = df.shift(1).loc[eq_idx, 'value'] * df.loc[eq_idx, 'qty']
df.loc[eq_idx, 'value'] = df.shift(1).loc[eq_idx, 'value']
You can try using np.select(), as follows:
cond_list = [df['temperature'] > df['temperature'].shift(),
             df['temperature'] < df['temperature'].shift(),
             df['temperature'] == df['temperature'].shift()]

# the equal-temperature branch reuses the previous row of the existing 'value' column
df['delta'] = np.select(cond_list, [df['qty'], -df['qty'], df['qty'] * df['value'].shift()])
df['value'] = np.select(cond_list, [1, -1, df['value'].shift()])
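As a hedged follow-up sketch (an editor's alternative, not part of the answer above): in every branch of the loop, delta ends up equal to qty * value, while value is 1 on a rise, -1 on a fall, and carried forward on a flat reading. np.select plus ffill therefore reproduces the loop, apart from row 0, which the loop never touches:
import numpy as np
import pandas as pd

rises = df['temperature'] > df['temperature'].shift()
falls = df['temperature'] < df['temperature'].shift()

# 1 on a rise, -1 on a fall, NaN on a flat reading, then carry the last value forward
value = pd.Series(np.select([rises, falls], [1, -1], default=np.nan), index=df.index)
df['value'] = value.ffill()
df['delta'] = df['qty'] * df['value']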
The dataframes are like below, where I want to change a value to 'dead' if the age is more than 100.
import pandas as pd
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
Desired outcome
df=
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
df2=
age1 age2 name
0 80 70 a
1 90 dead b
2 dead 90 c
I was trying something like this:
col_list=['age1','age2']
df_list=[df,df2]
def dead(df):
    for df in df_list:
        if df.columns in col_list:
            if df.columns >= 100:
                return 'dead'
            else:
                return df.columns

df.apply(dead)
Error shown:
The truth value of an array with more than one element is ambiguous. Use a.any() or a.all()
I am looking for a loop that works on all the dataframes.
Please also correct my function, for future learning :)
With the samples you've shown, please try the following, using pandas' filter and NumPy's np.where:
c = df.filter(regex=r'age\d+').columns
df[c] = np.where(df[c].ge(100),'dead',df[c])
df
Alternative approach with where:
c=df.filter(like='age').columns
df[c] = df[c].where(~df[c].ge(100), 'dead')
Explanation:
Getting all the columns whose names contain 'age' into the variable c.
Then using np.where to check whether the respective age columns are greater than or equal to 100; if yes, set the value to 'dead', otherwise keep it as it is.
I did the following:
col_list=['age1','age2']
df_list=[df,df2]
for d in df_list:
    for c in col_list:
        d.loc[d[c] > 100, c] = 'dead'
#inspired by #jib and #ravinder
col_list=['age1','age2']
df_list=[df,df2]
for d in df_list:
    for c in col_list:
        d[c] = np.where(d[c] > 100, 'dead', d[c])
df #or df2
output:
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
One possible solution is to use Pandas' mask, which is similar to if-else, but vectorized.
def dead(df):
    col_list = ['age1', 'age2']
    df = df.copy()
    temporary = df.filter(col_list)
    temporary = temporary.mask(temporary >= 100, "dead")
    df.loc[:, col_list] = temporary
    return df
Apply function to the dataframe:
df.pipe(dead)
age1 age2 name
0 23 10 a
1 45 20 b
2 dead dead c
You can do:
def check_more_than_100(x):
    v = None
    try:
        v = int(x)
    except:
        pass
    if v is not None:
        return v > 100
    return False
df['age1'] = df['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df['age2'] = df['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age1'] = df2['age1'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
df2['age2'] = df2['age2'].apply(lambda x : 'dead' if check_more_than_100(x) else x)
This should take care of non-int values if any.
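If the columns may already hold non-numeric entries (for example 'dead' from an earlier pass), a hedged alternative sketch using pd.to_numeric, assuming the same df, df2 and age columns as above:
import pandas as pd

for frame in (df, df2):
    for col in ('age1', 'age2'):
        # coerce non-numeric entries to NaN so the comparison never raises
        numeric = pd.to_numeric(frame[col], errors='coerce')
        frame[col] = frame[col].mask(numeric > 100, 'dead')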
I used this answer to a similar question. Basically, you can use NumPy's .where() function to set values based on the condition.
import pandas as pd
import numpy as np
raw_data = {'age1': [23,45,210],'age2': [10,20,150],'name': ['a','b','c']}
df = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
raw_data = {'age1': [80,90,110],'age2': [70,120,90],'name': ['a','b','c']}
df2 = pd.DataFrame(raw_data, columns = ['age1','age2','name'])
col_list=['age1','age2']
df_list=[df,df2]
def dead(df_list, col_list):
    for df in df_list:
        for col in col_list:
            df[col] = np.where(df[col] >= 100, "dead", df[col])
    return df_list

dead([df], col_list)
df
Extracting numeric columns and then using numpy where -
df_cols = df._get_numeric_data().columns.values
df2_cols = df2._get_numeric_data().columns.values
df[df_cols] = np.where(df[df_cols].to_numpy() > 100, 'dead', df[df_cols])
df2[df2_cols] = np.where(df2[df2_cols].to_numpy() > 100, 'dead', df2[df2_cols])
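_get_numeric_data() is a private pandas method; as an editor's hedged sketch (not part of the original answer), the public select_dtypes API expresses the same idea:
import numpy as np

# pick the numeric columns through the public API, then apply the same np.where rule
num_cols = df.select_dtypes(include='number').columns
df[num_cols] = np.where(df[num_cols].to_numpy() > 100, 'dead', df[num_cols])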
I have the following dataset (check the link below):
The stock returns from 2006 to 2018
I need to assign values for each company's stock return in each year in 4 quantiles (check the link below).
Distributed stock returns from 2006 to 2018
df = pd.read_excel('data.xlsx', sheet_name=['Excess Returns'], index_col='Data')
df = pd.concat(df[frame] for frame in df.keys())
df = df.replace('-', '')
df = df.apply(pd.to_numeric, errors='coerce')
Could you help to write the cycle for sorting?
The following code does not work:
for i in range(0, df.shape[0]):
    if df.iloc[i] <= df.quantile(0.2, axis=1).iloc[i]:
        df.iloc[i] = 1
    elif df.iloc[i] > (df.quantile(0.2, axis=1) and df.iloc[i] <= df.quantile(0.4, axis=1)).iloc[i]:
        df.iloc[i] = 2
    elif df.iloc[i] > (df.quantile(0.4, axis=1) and df.iloc[i] <= df.quantile(0.6, axis=1)).iloc[i]:
        df.iloc[i] = 3
    elif df.iloc[i] > (df.quantile(0.6, axis=1) and df.iloc[i] <= df.quantile(0.8, axis=1)).iloc[i]:
        df.iloc[i] = 4
    else:
        df.iloc[i] = 5
ValueError: The truth value of a Series is ambiguous. Use a.empty, a.bool(), a.item(), a.any() or a.all().
You could try something like this -
import pandas as pd
import numpy as np
quantiles_075 = pd.DataFrame(np.where(df <= df.quantile(0.75), 1, 2), columns=df.columns)
Please note that the above will create a dataframe quantiles_075 with the same columns as df, with values set according to the rule you specified.
If there are more conditions, I think a modified version of @David's post will also be useful, in case you want the result as a dataframe:
temp = pd.DataFrame(np.where(df <= df.quantile(0.2), 1,
                    np.where((df > df.quantile(0.2)) & (df <= df.quantile(0.4)), 2,
                    np.where((df > df.quantile(0.4)) & (df <= df.quantile(0.6)), 3, 4))),
                    columns=df.columns)
My idea, based on previous contributors' answers. I feel it is quite an optimal solution:
df = pd.DataFrame(np.where(df <= df.quantile(0.2), 1,
                  np.where((df > df.quantile(0.2)) & (df <= df.quantile(0.4)), 2,
                  np.where((df > df.quantile(0.4)) & (df <= df.quantile(0.6)), 3, 4))),
                  columns=df.columns)
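Since the goal is to bucket each year's returns across companies, a hedged alternative sketch (not from the answers above, and assuming the returns within a row are mostly distinct, since pd.qcut raises on duplicate bin edges) is to apply pd.qcut row-wise:
import pandas as pd

# label each row's values 1..5 by within-row quantile group
buckets = df.apply(lambda row: pd.qcut(row, 5, labels=[1, 2, 3, 4, 5]), axis=1)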
I have 2-dimensional data (columns: Cell1, Cell2, ...; rows: Gene1, Gene2, ...) in which I want to delete rows with 99% zeroes and then, from the resulting matrix, drop columns with 99% zeroes. I have written the following code to do this, but since the matrix is very large it takes a long time to run. Is there a better way to approach this?
import pandas as pd
import numpy as np
def read_in(matrix_file):
    matrix_df = pd.read_csv(matrix_file, index_col=0)
    return matrix_df

def genes_less_exp(matrix_df):
    num_columns = matrix_df.shape[1]
    for index, row in matrix_df.iterrows():
        zero_els = np.count_nonzero(row.values == 0)
        gene_per_zero = (float(zero_els) / float(num_columns)) * 100
        if gene_per_zero >= 99:
            matrix_df.drop([index], axis=0, inplace=True)
    return matrix_df

def cells_less_exp(matrix_df):
    num_rows = matrix_df.shape[0]
    for label, content in matrix_df.iteritems():
        zero_els = np.count_nonzero(content.values == 0)
        cells_per_zero = (float(zero_els) / float(num_rows)) * 100
        if cells_per_zero >= 99:
            matrix_df.drop(label, axis=1, inplace=True)
    return matrix_df

if __name__ == "__main__":
    matrix_df = read_in("Data/big-matrix.csv")
    print("original:" + str(matrix_df.shape))
    filtered_genes = genes_less_exp(matrix_df)
    print("filtered_genes:" + str(filtered_genes.shape))
    filtered_cells = cells_less_exp(filtered_genes)
    print("filtered_cells:" + str(filtered_cells.shape))
    filtered_cells.to_csv("abi.99.percent.filtered.csv", sep=',')
It's easier if you reframe your question as "keep those with less than 99% zeros".
def drop_almost_zero(df, percentage):
    row_cut_off = int(percentage / 100 * len(df.columns))
    df = df[(df == 0).sum(axis='columns') <= row_cut_off]

    column_cut_off = int(percentage / 100 * len(df))
    b = (df == 0).sum(axis='rows')
    df = df[b[b <= column_cut_off].index.values]

    return df

# test
size = 50
percentage = 90
rows = size // 2
columns = size

a = np.random.choice(2, size=(rows, columns), p=[(1 - 0.1), 0.1])
df = pd.DataFrame(a, columns=[f'c{i}' for i in range(size)])

df = drop_almost_zero(df, percentage)

assert (df == 0).sum(axis='rows').max() <= percentage / 100 * rows
assert (df == 0).sum(axis='columns').max() <= percentage / 100 * columns
With regard to pandas' df.merge() method, is there a convenient way to obtain the merge summary statistics (such as the number of matched and unmatched rows)? I know these stats depend on the how='inner' flag, but it would be handy to know how much is being 'discarded' when using an inner join. I could simply use:
df = df_left.merge(df_right, on='common_column', how='inner')
set1 = set(df_left['common_column'].unique())
set2 = set(df_right['common_column'].unique())
set1.issubset(set2) #True No Further Analysis Required
set2.issubset(set1) #False
num_shared = len(set2.intersection(set1))
num_diff = len(set2.difference(set1))
# And So on ...
But I thought this might be implemented already. Have I missed it (i.e. something like report=True for merge, which would return the new dataframe plus a report series or dataframe)?
Try this function... You can then just pass your arguments into it like this:
df = merge_like_stata(df1, df2, mergevars)
Function definition:
def merge_like_stata(master, using, mergevars):
    master['_master_merge_'] = 'master'
    using['_using_merge_'] = 'using'
    df = pd.merge(master, using, on=mergevars, how='outer')
    df['_master_merge_'] = df['_master_merge_'].apply(lambda x: 'miss' if pd.isnull(x) else x)
    df['_using_merge_'] = df['_using_merge_'].apply(lambda x: 'miss' if pd.isnull(x) else x)
    df['_merge'] = df.apply(lambda row: '3 - Matched' if row['_master_merge_'] == 'master' and row['_using_merge_'] == 'using' else None, axis=1)
    df['_merge'] = df.apply(lambda row: '2 - Master Only' if row['_master_merge_'] == 'master' and row['_using_merge_'] == 'miss' else row['_merge'], axis=1)
    df['_merge'] = df.apply(lambda row: '1 - Using Only' if row['_master_merge_'] == 'miss' and row['_using_merge_'] == 'using' else row['_merge'], axis=1)
    df['column'] = "Count"
    print(pd.crosstab(df._merge, df.column, margins=True))
    df = df.drop(['_master_merge_', '_using_merge_'], axis=1)
    return df
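For reference, pandas' merge also has a built-in indicator flag that tags each row's origin; a short sketch of how it yields similar counts, assuming the same df1, df2 and mergevars as above:
# indicator=True adds a '_merge' column with 'left_only', 'right_only' and 'both';
# value_counts() then summarises how many rows matched on each side
merged = pd.merge(df1, df2, on=mergevars, how='outer', indicator=True)
print(merged['_merge'].value_counts())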
This is what I use thus far.
It is part of a function that concords data from one coding system to another.
if report == True:
    report_df = pd.DataFrame(data[match_on].describe(), columns=['left'])
    report_df = report_df.merge(pd.DataFrame(concord[match_on].describe(), columns=['right']), left_index=True, right_index=True)
    set_left = set(data[match_on])
    set_right = set(concord[match_on])
    set_info = pd.DataFrame({'left': set_left.issubset(set_right), 'right': set_right.issubset(set_left)}, index=['subset'])
    report_df = report_df.append(set_info)
    set_info = pd.DataFrame({'left': len(set_left.difference(set_right)), 'right': len(set_right.difference(set_left))}, index=['differences'])
    report_df = report_df.append(set_info)
    # Return a random sample of [5 differences]
    left_diff = list(set_left.difference(set_right))[0:5]
    if len(left_diff) < 5:
        left_diff = (left_diff + [np.nan] * 5)[0:5]
    right_diff = list(set_right.difference(set_left))[0:5]
    if len(right_diff) < 5:
        right_diff = (right_diff + [np.nan] * 5)[0:5]
    set_info = pd.DataFrame({'left': left_diff, 'right': right_diff}, index=['diff1', 'diff2', 'diff3', 'diff4', 'diff5'])
    report_df = report_df.append(set_info)
Sample Report