numpy - any way to improve this further? (pandas took 1h 26m, numpy takes 38m) - python
Initially everything was written in pandas, and for this exploratory exercise I did a lot of groupby's; running against the whole data it took 1h 26m. Over the last weekend I changed everything from pandas to numpy, and it currently takes 38min 27s (wall time). I would like to know if it can be improved further.
While converting to numpy, I additionally used numpy_indexed.
Overall, what I am doing is calling the function below in this loop (I have read in lots of places that loops are bad). The dataset has around 657,058 rows and there are around 5,000 tickers.
for idx, ticker in enumerate(ticker_list):
    ...
    df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
    ...
    df_weekly_all = pd.concat([df_weekly_all, df_temp], sort=False)
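One thing that is known to be slow in a loop like this is growing df_weekly_all with pd.concat on every iteration, which re-copies all previously accumulated rows each time. A minimal sketch of an alternative, assuming df_weekly_all is only accumulated here and not read back inside the loop (same names as above; the function itself follows below):

results = []  # collect the per-ticker weekly frames instead of growing df_weekly_all
for idx, ticker in enumerate(ticker_list):
    ...
    df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
    ...
    results.append(df_temp)

# a single concat at the end copies every row once instead of repeatedly
df_weekly_all = pd.concat(results, sort=False, ignore_index=True)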
import numpy as np
import pandas as pd
import numpy_indexed as npi

def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
    if exchange == 'BSE':
        ticker = df_daily.iloc[0]['sc_code']
    else:
        ticker = df_daily.iloc[0]['symbol']

    arr_yearWeek = df_daily['yearWeek'].to_numpy()
    arr_close = df_daily['close'].to_numpy()
    arr_prevclose = df_daily['prevclose'].to_numpy()
    arr_chng = df_daily['chng'].to_numpy()
    arr_chngp = df_daily['chngp'].to_numpy()
    arr_ts = df_daily['ts'].to_numpy()
    arr_volumes = df_daily['volumes'].to_numpy()

    # Close
    arr_concat = np.column_stack((arr_yearWeek, arr_close))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
    #a = df_temp[['yearWeek', 'close']].to_numpy()
    yearWeek, daysTraded = np.unique(arr_concat[:, 0], return_counts=True)

    cmaxs, cmins = [], []
    first, last, wChng, wChngp = [], [], [], []
    for idx, subarr in enumerate(npi_gb):
        cmaxs.append(np.amax(subarr))
        cmins.append(np.amin(subarr))
        first.append(subarr[0])
        last.append(subarr[-1])
        wChng.append(subarr[-1] - subarr[0])
        wChngp.append(((subarr[-1] / subarr[0]) * 100) - 100)
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Chng
    arr_concat = np.column_stack((arr_yearWeek, arr_chng))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDL, HSDG = [], []
    for idx, subarr in enumerate(npi_gb):
        HSDL.append(np.amin(subarr))
        HSDG.append(np.amax(subarr))
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Chngp
    arr_concat = np.column_stack((arr_yearWeek, arr_chngp))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDLp, HSDGp = [], []
    for idx, subarr in enumerate(npi_gb):
        HSDLp.append(np.amin(subarr))
        HSDGp.append(np.amax(subarr))
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Last traded date of the week
    i = df_daily[['yearWeek', 'ts']].to_numpy()
    j = npi.group_by(i[:, 0]).split(i[:, 1])

    lastTrdDoW = []
    for idx, subarr in enumerate(j):
        lastTrdDoW.append(subarr[-1])
    i = np.empty((100, 100))
    #j.clear()

    # Times increased
    TI = np.where(arr_close > arr_prevclose, 1, 0)

    # npi_gb_yearWeekTI below is reused in the volumes section
    arr_concat = np.column_stack((arr_yearWeek, TI))
    npi_gb_yearWeekTI = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    tempArr, TI = npi.group_by(arr_yearWeek).sum(TI)

    # Volume (depends on npi_gb_yearWeekTI from the section above, which is why it was moved from the top to here)
    arr_concat = np.column_stack((arr_yearWeek, arr_volumes))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    vmaxs, vavgs, volAvgWOhv, HVdAV, CPveoHVD, lastDVotWk, lastDVdAV = [], [], [], [], [], [], []
    for idx, subarr in enumerate(npi_gb):
        vavgs.append(np.mean(subarr))
        ldvotWk = subarr[-1]
        lastDVotWk.append(ldvotWk)
        #print(idx, 'O - ', subarr, np.argmax(subarr), ', average : ', np.mean(subarr))
        ixDel = np.argmax(subarr)
        hV = subarr[ixDel]
        vmaxs.append(hV)

        if len(subarr) > 1:
            subarr = np.delete(subarr, ixDel)
            vawoHV = np.mean(subarr)
        else:
            vawoHV = np.mean(subarr)

        volAvgWOhv.append(vawoHV)
        HVdAV.append(hV / vawoHV)
        CPveoHVD.append(npi_gb_yearWeekTI[idx][ixDel])
        lastDVdAV.append(ldvotWk / vawoHV)
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Preparing the dataframe
    # yearWeek and occurrences
    #yearWeek, daysTraded = np.unique(a[:,0], return_counts=True)
    yearWeek = yearWeek.astype(int)

    HSDL = np.round(HSDL, 2)
    HSDG = np.round(HSDG, 2)
    HSDLp = np.round(HSDLp, 2)
    HSDGp = np.round(HSDGp, 2)

    first = np.round(first, 2)
    last = np.round(last, 2)
    wChng = np.round(wChng, 2)
    wChngp = np.round(wChngp, 2)

    vavgs = np.array(vavgs).astype(int)
    volAvgWOhv = np.array(volAvgWOhv).astype(int)
    HVdAV = np.round(HVdAV, 2)

    dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh': vmaxs, 'volAvg': vavgs, 'daysTraded': daysTraded
                 , 'HSDL': HSDL, 'HSDG': HSDG, 'HSDLp': HSDLp, 'HSDGp': HSDGp, 'first': first, 'last': last, 'wChng': wChng, 'wChngp': wChngp
                 , 'lastTrdDoW': lastTrdDoW, 'TI': TI, 'volAvgWOhv': volAvgWOhv, 'HVdAV': HVdAV, 'CPveoHVD': CPveoHVD
                 , 'lastDVotWk': lastDVotWk, 'lastDVdAV': lastDVdAV}
    df_weekly = pd.DataFrame(data=dict_temp)

    df_weekly['sc_code'] = ticker

    cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
            , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
            , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
    df_weekly = df_weekly[cols].copy()

    # df_weekly_all will be empty when it's a new company or an FTA (First Time Analysis)
    if df_weekly_all.shape[0] == 0:
        df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))

    # Keep only the yearWeeks of df_weekly_all that are not present in df_weekly
    a = set(df_weekly_all['yearWeek'])
    b = set(df_weekly['yearWeek'])
    c = list(a.difference(b))
    #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)))
    df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()

    # Append the latest week data to df_weekly
    df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
    #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))

    return df_weekly_all
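On further improvement: the inner Python loops over the split groups compute per-week reductions (max, min, first, last, mean) that can be vectorised. Since the daily rows of one ticker are sorted by date, each yearWeek forms a contiguous block, so np.unique plus ufunc.reduceat can replace the split-and-loop for the close section. A rough, self-contained sketch of the idea, using toy arrays standing in for arr_yearWeek / arr_close inside the function (it assumes arr_yearWeek is non-decreasing):

import numpy as np

# toy stand-ins for the per-ticker columns used inside weekly_trend_analysis
arr_yearWeek = np.array([201913, 201913, 201913, 201914, 201914])
arr_close    = np.array([1313.55, 206.65, 723.05, 56.0, 136.85])

# np.unique gives the first row index and the length of each weekly block
yearWeek, starts, daysTraded = np.unique(arr_yearWeek, return_index=True, return_counts=True)
ends = starts + daysTraded - 1                    # last row index of each week

cmaxs  = np.maximum.reduceat(arr_close, starts)   # weekly close high, replaces np.amax in the loop
cmins  = np.minimum.reduceat(arr_close, starts)   # weekly close low, replaces np.amin in the loop
first  = arr_close[starts]                        # first close of the week
last   = arr_close[ends]                          # last close of the week
wChng  = last - first
wChngp = (last / first) * 100 - 100

The same pattern covers the Chng, Chngp, ts and volume sections, and numpy_indexed exposes such reductions directly as well (group_by(...).sum(...) is already used above; min, max and mean work the same way), which would also remove the repeated np.column_stack calls.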
Input data
ts = ['2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00']
sc_code = ['500002','500002','500002','500002','500002','500002','500002','500002','500002','500002']
high = [1326.6, 208.45, 732.15, 14.87, 1979.0, 57.8, 11.55, 1.68, 8.1, 139.4]
low = [1306.35, 204.0, 717.05, 13.41, 1937.65, 54.65, 11.2, 1.52, 7.75, 135.65]
close = [1313.55, 206.65, 723.05, 13.53, 1955.25, 56.0, 11.21, 1.68, 8.1, 136.85]
prevclose = [1319.85, 202.95, 718.95, 14.29, 1967.3, 54.65, 11.22, 1.6, 7.75, 135.05]
volumes = [7785, 6150, 21641, 46296, 707019, 40089, 25300, 5920, 500, 235355]
yearWeek = [201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913]
chng = [-6.29, 3.70, 4.09, -0.75, -12.04, 1.35, -0.09, 0.079, 0.34, 1.79]
chngp = [-0.48, 1.82, 0.57, -5.32, -0.61, 2.47, -0.09, 5.0, 4.52, 1.33]
dict_temp = {'ts':ts, 'sc_code':sc_code, 'high':high, 'low':low, 'close':close, 'prevclose':prevclose, 'volumes':volumes, 'yearWeek':yearWeek, 'chng':chng, 'chngp':chngp}
df_weekly = pd.DataFrame(data=dict_temp)
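For reference, the sample frame above (named df_weekly in the snippet, although it holds daily rows) can be fed straight into the function; a minimal call, assuming an empty accumulator for a first-time analysis:

df_daily = pd.DataFrame(data=dict_temp)   # the sample data above
result = weekly_trend_analysis('BSE', pd.DataFrame(), df_daily)
print(result[['sc_code', 'yearWeek', 'daysTraded', 'closeL', 'closeH', 'volAvg']])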
Adding line-profiler details.
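For anyone wanting to reproduce this kind of report, line_profiler can be driven roughly like this (a sketch; it assumes weekly_trend_analysis is decorated with @profile in BTD-Analysis1V3.py when using kernprof):

# from the command line:
#   kernprof -l BTD-Analysis1V3.py
#   python -m line_profiler BTD-Analysis1V3.py.lprof
#
# or interactively in IPython/Jupyter, without editing the source:
%load_ext line_profiler
%lprun -f weekly_trend_analysis weekly_trend_analysis(exchange, df_weekly, df_daily)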
('getcwd : ', '/home/bobby_dreamer')
Timer unit: 1e-06 s
Total time: 0.043637 s
File: BTD-Analysis1V3.py
Function: weekly_trend_analysis at line 36
Line # Hits Time Per Hit % Time Line Contents
==============================================================
36 def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
37
38 1 3.0 3.0 0.0 if exchange == 'BSE':
39 1 963.0 963.0 2.2 ticker = df_daily.iloc[0]['sc_code']
40 else:
41 ticker = df_daily.iloc[0]['symbol']
42
95 # Last Traded Date of the Week
96 1 3111.0 3111.0 7.1 i = df_daily[['yearWeek', 'ts']].to_numpy()
97 1 128.0 128.0 0.3 j = npi.group_by(i[:, 0]).split(i[:, 1])
98
160
161 1 3.0 3.0 0.0 dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
162 1 2.0 2.0 0.0 ,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
163 1 2.0 2.0 0.0 ,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
164 1 2.0 2.0 0.0 ,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
165 1 3677.0 3677.0 8.4 df_weekly = pd.DataFrame(data=dict_temp)
166
167 1 1102.0 1102.0 2.5 df_weekly['sc_code'] = ticker
168
169 1 3.0 3.0 0.0 cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
170 1 1.0 1.0 0.0 , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
171 1 2.0 2.0 0.0 , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
172
173 1 2816.0 2816.0 6.5 df_weekly = df_weekly[cols].copy()
174
175 # df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
176 1 13.0 13.0 0.0 if df_weekly_all.shape[0] == 0:
177 1 20473.0 20473.0 46.9 df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))
178
179 # Removing all yearWeek in df_weekly2 from df_weekly
180 1 321.0 321.0 0.7 a = set(df_weekly_all['yearWeek'])
181 1 190.0 190.0 0.4 b = set(df_weekly['yearWeek'])
182 1 5.0 5.0 0.0 c = list(a.difference(b))
183 #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
184 1 1538.0 1538.0 3.5 df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()
185
186 # Append the latest week data to df_weekly
187 1 6998.0 6998.0 16.0 df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
188 #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))
189
190 1 2.0 2.0 0.0 return df_weekly_all
After reviewing the above profile, I made changes to the code that consumed the most time, basically adding more numpy and removing pandas inside the function. The updated code, run with the whole data, took only 7min 47s (wall time).
I encountered some numpy errors like the one below and handled them by writing intermediate files. I am using a Windows machine and the intermediate files were under 3 MB; I am not sure if there were any other limitations.
MemoryError: Unable to allocate array with shape (82912, 22) and data type <U32
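The shape and dtype in that error point at the likely cause: np.column_stack over columns of mixed types (strings, ints, floats) promotes every cell to a fixed-width unicode dtype, so an (82912, 22) <U32 array needs roughly 82912 × 22 × 32 characters × 4 bytes ≈ 230 MB in one contiguous block, which can fail on a 32-bit Python build or a fragmented address space even when enough RAM is free. A tiny, self-contained illustration of the promotion (hypothetical values, just to show the dtype):

import numpy as np

ticker = np.full(3, '500002')                  # string column
week   = np.array([201913, 201914, 201915])    # int64 column
close  = np.array([1313.55, 206.65, 723.05])   # float64 column

stacked = np.column_stack((ticker, week, close))
print(stacked.dtype)    # fixed-width text, <U32 on this data, matching the error above
print(stacked.nbytes)   # each cell now costs dtype.itemsize bytes

Keeping the numeric columns in a separate float array (or using a structured array) instead of stacking them with the ticker strings avoids the blow-up.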
Line # Hits Time Per Hit % Time Line Contents
==============================================================
38 def weekly_trend_analysis_np(exchange, np_weekly_all, df_daily):
39
40 1 4.0 4.0 0.0 if exchange == 'BSE':
43 1 152.0 152.0 1.2 ticker = df_daily['sc_code'].to_numpy()[0]
44 else:
47 ticker = df_daily['symbol'].to_numpy()[0]
48
101 # Last Traded Date of the Week
102 1 33.0 33.0 0.3 arr_concat = np.column_stack((arr_yearWeek, arr_ts))
103 1 341.0 341.0 2.6 npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
104
152 1 5.0 5.0 0.0 yearWeek = yearWeek.astype(int)
153 1 59.0 59.0 0.5 HSDL = np.round(HSDL,2)
154 1 26.0 26.0 0.2 HSDG = np.round(HSDG,2)
155 1 23.0 23.0 0.2 HSDLp = np.round(HSDLp,2)
156 1 23.0 23.0 0.2 HSDGp = np.round(HSDGp,2)
157
158 1 23.0 23.0 0.2 first = np.round(first,2)
159 1 23.0 23.0 0.2 last = np.round(last,2)
160 1 23.0 23.0 0.2 wChng = np.round(wChng,2)
161 1 23.0 23.0 0.2 wChngp = np.round(wChngp,2)
162
163 1 12.0 12.0 0.1 vavgs = np.array(vavgs).astype(int)
164 1 16.0 16.0 0.1 volAvgWOhv = np.array(volAvgWOhv).astype(int)
165 1 24.0 24.0 0.2 HVdAV = np.round(HVdAV,2)
166
167 1 16.0 16.0 0.1 ticker = np.full(yearWeek.shape[0], ticker)
168 1 2.0 2.0 0.0 np_weekly = np.column_stack((ticker, yearWeek, lastTrdDoW, daysTraded, cmins, cmaxs, vavgs, vmaxs, HSDL
169 1 2.0 2.0 0.0 , HSDG, HSDLp, HSDGp, first, last, wChng, wChngp, TI, volAvgWOhv, HVdAV
170 1 546.0 546.0 4.2 , CPveoHVD, lastDVotWk, lastDVdAV))
171
173 1 2.0 2.0 0.0 if len(np_weekly_all) > 0:
175 1 2.0 2.0 0.0 a = np_weekly_all[:,1]
176 1 1.0 1.0 0.0 b = np_weekly[:,1]
177 1 205.0 205.0 1.6 tf_1 = np.isin(a, b, invert=True)
179 1 13.0 13.0 0.1 t_result = list(compress(range(len(tf_1)), tf_1))
181 1 13.0 13.0 0.1 np_weekly_all = np_weekly_all[t_result]
182 1 40.0 40.0 0.3 np_weekly_all = np.vstack((np_weekly_all, np_weekly))
183 else:
184 np_weekly_all = []
185 np_weekly_all = np.vstack((np_weekly))
186
187 1 2.0 2.0 0.0 return np_weekly_all
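One small follow-up visible in that profile: tf_1 from np.isin(..., invert=True) is already a boolean mask, so the itertools.compress step is unnecessary and the mask can index the rows directly. A sketch, assuming np_weekly_all and np_weekly are stacked exactly as in the profiled function:

import numpy as np

keep = np.isin(np_weekly_all[:, 1], np_weekly[:, 1], invert=True)   # weeks not replaced by the new block
np_weekly_all = np.vstack((np_weekly_all[keep], np_weekly))          # boolean-mask indexing, no compress needed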
I would be glad to hear your suggestions, and thanks for pointing me to the profiler, I didn't know about that.
I have edited the question and posted the line-profiler output above. I have updated the code from pandas to numpy, which reduced the time considerably, from the initial 1h 26min to now about 7 min.