Tiling in groupby on dataframe - python

I have a data frame that contains returns, size and sedols for a couple of dates.
My goal is to identify the top and bottom values for a certain condition per date, i.e I want the top decile largest size entries and the bottom decile smallest size entries for each date and flag them in a new column by 'xx' and 'yy'.
I am confused how to apply the tiling while grouping as well as creating a new column, here is what I already have.
import pandas as pd
import numpy as np
import datetime as dt
from random import choice
from string import ascii_uppercase
def create_dummy_data(start_date, days, entries_pday):
date_sequence_lst = [dt.datetime.strptime(start_date,'%Y-%m-%d') +
dt.timedelta(days=x) for x in range(0,days)]
date_sequence_lst = date_sequence_lst * entries_pday
returns_lst = [round(np.random.uniform(low=-0.10,high=0.20),2) for _ in range(entries_pday*days)]
size_lst = [round(np.random.uniform(low=10.00,high=10000.00),0) for _ in range(entries_pday*days)]
rdm_sedol_lst = [(''.join(choice(ascii_uppercase) for i in range(7))) for x in range(entries_pday)]
rdm_sedol_lst = rdm_sedol_lst * days
dates_returns_df = pd.DataFrame({'Date':date_sequence_lst , 'Sedols':rdm_sedol_lst, 'Returns':returns_lst,'Size':size_lst})
dates_returns_df = dates_returns_df.sort_values('Date',ascending=True)
dates_returns_df = dates_returns_df.reset_index(drop=True)
return dates_returns_df
def order_df_by(df_in,column_name):
df_out = df_in.sort_values(['Date',column_name],ascending=[True,False])
return df_out
def get_ntile(df_in,ntile):
df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x : pd.qcut(x,ntile))
return df_in
if __name__ == "__main__":
# create dummy returns
data_df = create_dummy_data('2001-01-01',31,10)
# sort by attribute
data_sorted_df = order_df_by(data_df,'Size')
#ntile data per date
data_ntiled = get_ntile(data_sorted_df, 10)
for key, item in data_ntiled:
print(data_ntiled.get_group(key))
so far I would be expecting deciled results based on 'Size' for each date, the next step would be to filter only for decile 1 and decile 10 and flag the entries 'xx' and 'yy' respectively.
thanks

Consider using transform on the pandas.qcut method with labels 1 through ntile+1 for a decile column, then conditionally set flag with np.where using decile values:
...
def get_ntile(df_in, ntile):
df_in['Tiled'] = df_in.groupby(['Date'])['Size'].transform(lambda x: pd.qcut(x, ntile, labels=list(range(1, ntile+1))))
return df_in
if __name__ == "__main__":
# create dummy returns
data_df = create_dummy_data('2001-01-01',31,10)
# sort by attribute
data_sorted_df = order_df_by(data_df,'Size')
#ntile data per date
data_ntiled = get_ntile(data_sorted_df, 10)
data_ntiled['flag'] = np.where(data_ntiled['Tiled']==1.0, 'YY',
np.where(data_ntiled['Tiled']==10.0, 'XX', np.nan))
print(data_ntiled.reset_index(drop=True).head(15))
# Date Returns Sedols Size Tiled flag
# 0 2001-01-01 -0.03 TEEADVJ 8942.0 10.0 XX
# 1 2001-01-01 -0.03 PDBWGBJ 7142.0 9.0 nan
# 2 2001-01-01 0.03 QNVVPIC 6995.0 8.0 nan
# 3 2001-01-01 0.04 NTKEAKB 6871.0 7.0 nan
# 4 2001-01-01 0.20 ZVVCLSJ 6541.0 6.0 nan
# 5 2001-01-01 0.12 IJKXLIF 5131.0 5.0 nan
# 6 2001-01-01 0.14 HVPDRIU 4490.0 4.0 nan
# 7 2001-01-01 -0.08 XNOGFET 3397.0 3.0 nan
# 8 2001-01-01 -0.06 JOARYWC 2582.0 2.0 nan
# 9 2001-01-01 0.12 FVKBQGU 723.0 1.0 YY
# 10 2001-01-02 0.03 ZVVCLSJ 9291.0 10.0 XX
# 11 2001-01-02 0.14 HVPDRIU 8875.0 9.0 nan
# 12 2001-01-02 0.08 PDBWGBJ 7496.0 8.0 nan
# 13 2001-01-02 0.02 FVKBQGU 7307.0 7.0 nan
# 14 2001-01-02 -0.01 QNVVPIC 7159.0 6.0 nan

Related

Subtracting value from column gives NaN only

I have multiple column csv file and I want to subtract values of column X31-X27,Y31-Y27,Z31-Z27 from the same dataframe but when I am subtracting it gives me NaN values.
Here is the values of csv file:
It gives me the result as shown in picture
Help me to figure out this problem
import pandas as pd
import os
import numpy as np
df27 = pd.read_csv('D:27.txt', names=['No27','X27','Y27','Z27','Date27','Time27'], sep='\s+')
df28 = pd.read_csv('D:28.txt', names=['No28','X28','Y28','Z28','Date28','Time28'], sep='\s+')
df29 = pd.read_csv('D:29.txt', names=['No29','X29','Y29','Z29','Date29','Time29'], sep='\s+')
df30 = pd.read_csv('D:30.txt', names=['No30','X30','Y30','Z30','Date30','Time30'], sep='\s+')
df31 = pd.read_csv('D:31.txt', names=['No31','X31','Y31','Z31','Date31','Time31'], sep='\s+')
total=pd.concat([df27,df28,df29,df30,df31], axis=1)
total.to_csv('merge27-31.csv', index = False)
print(total)
df2731 = pd.read_csv('C:\\Users\\finalmerge27-31.csv')
df2731.reset_index(inplace=True)
print(df2731)
df227 = df2731[['X31', 'Y31', 'Z31']] - df2731[['X27', 'Y27', 'Z27']]
print(df227)
# input data
df = pd.DataFrame({'x27':[-1458.88, 181.78, 1911.84, 3739.3, 5358.19], 'y27':[-5885.8, -5878.1,-5786.5,-5735.7, -5545.6],
'z27':[1102,4139,4616,4108,1123], 'x31':[-1458, 181, 1911, np.nan, 5358], 'y31':[-5885, -5878, -5786, np.nan, -5554],
'z31':[1102,4138,4616,np.nan,1123]})
df
x27 y27 z27 x31 y31 z31
0 -1458.88 -5885.8 1102 -1458.0 -5885.0 1102.0
1 181.78 -5878.1 4139 181.0 -5878.0 4138.0
2 1911.84 -5786.5 4616 1911.0 -5786.0 4616.0
3 3739.30 -5735.7 4108 NaN NaN NaN
4 5358.19 -5545.6 1123 5358.0 -5554.0 1123.0
pd.DataFrame(df1.values - df2.values).rename(columns={0:'x32-x27', 1:'y31-y27', 2:'z31-x31'})
Out:
x32-x27 y31-y27 z31-x31
0 -0.88 -0.8 0.0
1 0.78 -0.1 1.0
2 0.84 -0.5 0.0
3 NaN NaN NaN
4 0.19 8.4 0.0

how to simultaneously print all the matched values in a new column in pandas

I am a newer in pandas. I would like to search for some targets in a particular column and print all the matched targets in a new column. Here is my code.
# coding=utf-8
import pandas as pd
import numpy as np
#########
classes = [('Carbon;Pyruvate;vitamins', 16.7),
('Pyruvate;Carbohydrate;Pentose and glucuronate', 30),
('Lipid;Carbon;Galactose', 40.5),
('Galactose;Pyruvate;Fatty acid', 57),
('Fatty acid;Lipid', 22)]
labels = ['Ko_class','FPKM']
alls = pd.DataFrame.from_records(classes, columns=labels)
target =['Carbon', 'Pyruvate','Galactose']
#####
def match(x):
for i in target:
if i in x:
return i
else:
return np.nan
alls['Pathways'] = alls['Ko_class'].apply(match)
Its results are:
Ko_class FPKM Pathways
0 Carbon;Pyruvate;vitamins 16.7 Carbon
1 Pyruvate;Carbohydrate;Pentose and glucuronate 30.0 Pyruvate
2 Lipid;Carbon;Galactose 40.5 Carbon
3 Galactose;Pyruvate;Fatty acid 57.0 Pyruvate
4 Fatty acid;Lipid 22.0 NaN
The expected results are:
Ko_class FPKM Pathways
0 Carbon;Pyruvate;vitamins 16.7 Carbon;Pyruvate
1 Pyruvate;Carbohydrate;Pentose and glucuronate 30.0 Pyruvate
2 Lipid;Carbon;Galactose 40.5 Carbon;Galactose
3 Galactose;Pyruvate;Fatty acid 57.0 Galactose;Pyruvate
4 Fatty acid;Lipid 22.0 NaN
My question is: how to print all the matched targets in the new column like "Carbon;Pyruvate" not only "Carbon".
Use str.extractall with aggregation as string instead of your custom function:
regex = '|'.join(target)
alls['Pathsways'] = (alls['Ko_class'].str.extractall(f'({regex})')[0]
.groupby(level=0).agg(';'.join))
Or with a list comprehension and a set:
S = set(target)
alls['Pathways'] = [';'.join(x for x in s.split(';') if x in S)
for s in alls['Ko_class']]
output:
Ko_class FPKM Pathways
0 Carbon;Pyruvate;vitamins 16.7 Carbon;Pyruvate
1 Pyruvate;Carbohydrate;Pentose and glucuronate 30.0 Pyruvate
2 Lipid;Carbon;Galactose 40.5 Carbon;Galactose
3 Galactose;Pyruvate;Fatty acid 57.0 Galactose;Pyruvate
4 Fatty acid;Lipid 22.0 NaN

How to handle Cells containing only NaN values in pandas?

I am setting up a stock price prediction data set,in that while applying the following code for Ichimoku Cloud Indicator:
from datetime import timedelta
high_9 = df['High'].rolling(window= 9).max()
low_9 = df['Low'].rolling(window= 9).min()
df['tenkan_sen'] = (high_9 + low_9) /2
high_26 = df['High'].rolling(window= 26).max()
low_26 = df['Low'].rolling(window= 26).min()
df['kijun_sen'] = (high_26 + low_26) /2
# this is to extend the 'df' in future for 26 days
# the 'df' here is numerical indexed df
# the problem is here
last_index = df.iloc[-1:].index[0]
last_date = df['Date'].iloc[-1].date()
for i in range(26):
df.loc[last_index+1 +i, 'Date'] = last_date + timedelta(days=i)
df['senkou_span_a'] = ((df['tenkan_sen'] + df['kijun_sen']) / 2).shift(26)
high_52 = df['High'].rolling(window= 52).max()
low_52 = df['Low'].rolling(window= 52).min()
df['senkou_span_b'] = ((high_52 + low_52) /2).shift(26)
# most charting softwares dont plot this line
df['chikou_span'] = df['Close'].shift(-26)
The above code works great but the problem is while extending to the next 26 time steps(rows) in 'senoku span a' and 'b' columns it turns other rest columns row's values to NaN.
So i need the help to make 'Senoku span a' & 'Senoku span b' predicted rows in my data set without making other rows vlaues to NaN.
The current output is:
Date Open High Low Close Senoku span a Senoku span b
2019-03-16 50 51 52 53 56.0 55.82
2019-03-17 NaN NaN NaN NaN 55.0 56.42
2019-03-18 NaN NaN NaN NaN 54.0 57.72
2019-03-19 NaN NaN NaN NaN 53.0 58.12
2019-03-20 NaN NaN NaN NaN 52.0 59.52
The expected output is:
Date Open High Low Close Senoku span a Senoku span b
2019-03-16 50 51 52 53 56.0 55.82
2019-03-17 55.0 56.42
2019-03-18 54.0 57.72
2019-03-19 53.0 58.12
2019-03-20 52.0 59.52

Pandas: rolling windows with a sum product

There are a number of answers that each provide me with a portion of my desired result, but I am challenged putting them all together. My core Pandas data frame looks like this, where I am trying to estimate volume_step_1:
date volume_step_0 volume_step_1
2018-01-01 100 a
2018-01-02 101 b
2018-01-03 105 c
2018-01-04 123 d
2018-01-05 121 e
I then have a reference table with the conversion rates, for e.g.
step conversion
0 0.60
1 0.81
2 0.18
3 0.99
4 0.75
I have another table containing point estimates of a Poisson distribution:
days_to_complete step_no pc_cases
0 0 0.50
1 0 0.40
2 0 0.07
Using these data, I now want to estimate
volume_step_1 =
(volume_step_0(today) * days_to_complete(step0, day0) * conversion(step0)) +
(volume_step_0(yesterday) * days_to_complete(step0,day1) * conversion(step0))
and so forth.
How do I write some Python code to do so?
Calling your dataframes (from top to bottom as df1, df2, and df3):
df1['volume_step_1'] = (
(df1['volume_step_0']*
df2.loc[(df2['days_to_complete'] == 0) & (df2['step_no'] == 0), 'pc_cases']*
df3.loc[df3['step'] == 0, 'conversion']) +
df1['volume_step_0'].shift(1)*
df2.loc[(df2['days_to_complete'] == 1) & (df2['step_no'] == 0), 'pc_cases']*
df3.loc[df3['step'] == 0, 'conversion'])
EDIT:
IIUC, you are trying to get a 'dot product' of sorts between the volume_step_0 column and the product of the pc_cases and conversionfor a particular step_no. You can merge df2 and df3 to match steps:
df_merged = df_merged = df2.merge(df3, how = 'left', left_on = 'step', right_on = 'step_no')
df_merged.head(3)
step conversion days_to_complete step_no pc_cases
0 0.0 0.6 0.0 0.0 0.50
1 0.0 0.6 1.0 0.0 0.40
2 0.0 0.6 2.0 0.0 0.07
I'm guessing you're only using stepk to get volume_step_k+1, and you want to iterate the sum over the days. The following code generates a vector of days_to_complete(step0, dayk) and conversion(step0) for all values of k that are available in days_to_complete, and finds their product:
df_fin = df_merged[df_merged['step'] == 0][['conversion', 'pc_cases']].product(axis = 1)
0 0.300
1 0.240
2 0.042
df_fin = df_fin[::-1].reset_index(drop = True)
Finally, you want to take the dot product of the days_to_complete * conversion vector by the volume_step_0 vector, for a rolling window (as many values exist in days_to_complete):
vol_step_1 = pd.Series([df1['volume_step_0'][i:i+len(df3)].reset_index(drop = True).dot(df_fin) for i in range(0,len(df3))])
df1['volume_step_1'] = df1['volume_step_1'][::-1].reset_index(drop = True)
Output:
df1
date volume_step_0 volume_step_1
0 2018-01-01 100 NaN
1 2018-01-02 101 NaN
2 2018-01-03 105 70.230
3 2018-01-04 123 66.342
4 2018-01-05 121 59.940
While this is by no means a comprehensive solution, the code is meant to provide the logic to "sum multiple products", as you had asked.

Populating new DataFrame by multi-criteria selection from old one with different structure

I'm using Pandas for data analysis. I have an input file like this snippet:
VEH SEC POS ACCELL SPEED
2 8.4 36.51 -0.2929 27.39
3 8.4 23.57 -0.7381 33.09
4 8.4 6.18 0.6164 38.8
1 8.5 47.76 0 25.57
I need to reorganize the data so that the rows are the unique (ordered) values from SEC as the 1st column, and then the other columns would be VEH1_POS, VEH1_SPEED, VEH1_ACCELL, VEH2_POS, VEH2_SPEED, VEH2_ACCELL, etc.:
TIME VEH1_POS VEH1_SPEED VEH1_ACCEL VEH2_POS, VEH2_SPEED, etc.
0.1 6.2 3.7 0.0 7.5 2.1
0.2 6.8 3.2 -0.5 8.3 2.1
etc.
So, for example, the value for VEH1_POS for each row in the new dataframe would be filled in by selecting values from the POS column in the original dataframe using the row where the SEC value matches the TIME value for the row in the new dataframe and the VEH value == 1.
To set up the rows in the new data frame I'm doing this:
start = inputdf['SIMSEC'].min()
end = inputdf['SIMSEC'].max()
time_steps = frange(start, end, 0.1)
outputdf['TIME'] = time_steps
But I'm lost at how to select the right values from the input dataframe and create the rest of the new dataframe for further analysis. Note also that the input file will NOT have data for every VEH for every SEC (time stamp). So the solution needs to handle that as well. My best guess was:
outputdf['veh1_pos'] = np.where((inputdf['VEH NO'] == 1) & (inputdf['SIMSEC'] == row['Time Step']))
but that doesn't work.
import pandas as pd
# your data
# ==========================
print(df)
Out[272]:
VEH SEC POS ACCELL SPEED
0 2 8.4 36.51 -0.2929 27.39
1 3 8.4 23.57 -0.7381 33.09
2 4 8.4 6.18 0.6164 38.80
3 1 8.5 47.76 0.0000 25.57
# reshaping
# ==========================
result = df.set_index(['SEC','VEH']).unstack()
Out[278]:
POS ACCELL SPEED
VEH 1 2 3 4 1 2 3 4 1 2 3 4
SEC
8.4 NaN 36.51 23.57 6.18 NaN -0.2929 -0.7381 0.6164 NaN 27.39 33.09 38.8
8.5 47.76 NaN NaN NaN 0 NaN NaN NaN 25.57 NaN NaN NaN
So here, the column has multi-level index where 1st level is POS, ACCELL, SPEED and 2nd level is VEH=1,2,3,4.
# if you want to rename the column
temp_z = result.columns.get_level_values(0)
temp_y = result.columns.get_level_values(1)
temp_x = ['VEH'] * len(temp_y)
result.columns = ['{}{}_{}'.format(x,y,z) for x,y,z in zip(temp_x, temp_y, temp_z)]
Out[298]:
VEH1_POS VEH2_POS VEH3_POS VEH4_POS VEH1_ACCELL VEH2_ACCELL VEH3_ACCELL VEH4_ACCELL VEH1_SPEED VEH2_SPEED VEH3_SPEED VEH4_SPEED
SEC
8.4 NaN 36.51 23.57 6.18 NaN -0.2929 -0.7381 0.6164 NaN 27.39 33.09 38.8
8.5 47.76 NaN NaN NaN 0 NaN NaN NaN 25.57 NaN NaN NaN

Categories

Resources