numpy - any way to improve this further? (pandas took 1h 26m, numpy takes 38m) - python
Initially everything was written in pandas, and for this exploratory exercise I did a lot of groupby's; running against the whole data it took 1h 26m. Over the last weekend I changed everything from pandas to numpy, and it currently takes 38min 27s (wall time). I would like to know if it can be improved further.
While converting to numpy, I additionally used numpy_indexed.
Overall, what I am doing is calling the function below in this loop (I have read in lots of places that loops are bad). The dataset has around 657,058 rows and there are around 5,000 tickers.
for idx, ticker in enumerate(ticker_list):
    ...
    df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
    ...
    df_weekly_all = pd.concat([df_weekly_all, df_temp], sort=False)
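One thing that is known to be slow in a loop like this is growing df_weekly_all with pd.concat on every iteration, which re-copies all previously accumulated rows each time. A minimal sketch of an alternative, assuming df_weekly_all is only accumulated here and not read back inside the loop (same names as above; the function itself follows below):

results = []  # collect the per-ticker weekly frames instead of growing df_weekly_all
for idx, ticker in enumerate(ticker_list):
    ...
    df_temp = weekly_trend_analysis(exchange, df_weekly, df_daily)
    ...
    results.append(df_temp)

# a single concat at the end copies every row once instead of repeatedly
df_weekly_all = pd.concat(results, sort=False, ignore_index=True)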
import numpy as np
import pandas as pd
import numpy_indexed as npi

def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
    if exchange == 'BSE':
        ticker = df_daily.iloc[0]['sc_code']
    else:
        ticker = df_daily.iloc[0]['symbol']

    arr_yearWeek = df_daily['yearWeek'].to_numpy()
    arr_close = df_daily['close'].to_numpy()
    arr_prevclose = df_daily['prevclose'].to_numpy()
    arr_chng = df_daily['chng'].to_numpy()
    arr_chngp = df_daily['chngp'].to_numpy()
    arr_ts = df_daily['ts'].to_numpy()
    arr_volumes = df_daily['volumes'].to_numpy()

    # Close
    arr_concat = np.column_stack((arr_yearWeek, arr_close))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
    #a = df_temp[['yearWeek', 'close']].to_numpy()
    yearWeek, daysTraded = np.unique(arr_concat[:, 0], return_counts=True)

    cmaxs, cmins = [], []
    first, last, wChng, wChngp = [], [], [], []
    for idx, subarr in enumerate(npi_gb):
        cmaxs.append(np.amax(subarr))
        cmins.append(np.amin(subarr))
        first.append(subarr[0])
        last.append(subarr[-1])
        wChng.append(subarr[-1] - subarr[0])
        wChngp.append(((subarr[-1] / subarr[0]) * 100) - 100)
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Chng
    arr_concat = np.column_stack((arr_yearWeek, arr_chng))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDL, HSDG = [], []
    for idx, subarr in enumerate(npi_gb):
        HSDL.append(np.amin(subarr))
        HSDG.append(np.amax(subarr))
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Chngp
    arr_concat = np.column_stack((arr_yearWeek, arr_chngp))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    HSDLp, HSDGp = [], []
    for idx, subarr in enumerate(npi_gb):
        HSDLp.append(np.amin(subarr))
        HSDGp.append(np.amax(subarr))
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Last traded date of the week
    i = df_daily[['yearWeek', 'ts']].to_numpy()
    j = npi.group_by(i[:, 0]).split(i[:, 1])

    lastTrdDoW = []
    for idx, subarr in enumerate(j):
        lastTrdDoW.append(subarr[-1])
    i = np.empty((100, 100))
    #j.clear()

    # Times increased
    TI = np.where(arr_close > arr_prevclose, 1, 0)

    # npi_gb_yearWeekTI below is reused in the volumes section
    arr_concat = np.column_stack((arr_yearWeek, TI))
    npi_gb_yearWeekTI = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    tempArr, TI = npi.group_by(arr_yearWeek).sum(TI)

    # Volume (depends on npi_gb_yearWeekTI from the section above, which is why it was moved from the top to here)
    arr_concat = np.column_stack((arr_yearWeek, arr_volumes))
    npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])

    vmaxs, vavgs, volAvgWOhv, HVdAV, CPveoHVD, lastDVotWk, lastDVdAV = [], [], [], [], [], [], []
    for idx, subarr in enumerate(npi_gb):
        vavgs.append(np.mean(subarr))
        ldvotWk = subarr[-1]
        lastDVotWk.append(ldvotWk)
        #print(idx, 'O - ', subarr, np.argmax(subarr), ', average : ', np.mean(subarr))
        ixDel = np.argmax(subarr)
        hV = subarr[ixDel]
        vmaxs.append(hV)

        if len(subarr) > 1:
            subarr = np.delete(subarr, ixDel)
            vawoHV = np.mean(subarr)
        else:
            vawoHV = np.mean(subarr)

        volAvgWOhv.append(vawoHV)
        HVdAV.append(hV / vawoHV)
        CPveoHVD.append(npi_gb_yearWeekTI[idx][ixDel])
        lastDVdAV.append(ldvotWk / vawoHV)
    #npi_gb.clear()
    arr_concat = np.empty((100, 100))

    # Preparing the dataframe
    # yearWeek and occurrences
    #yearWeek, daysTraded = np.unique(a[:,0], return_counts=True)
    yearWeek = yearWeek.astype(int)

    HSDL = np.round(HSDL, 2)
    HSDG = np.round(HSDG, 2)
    HSDLp = np.round(HSDLp, 2)
    HSDGp = np.round(HSDGp, 2)

    first = np.round(first, 2)
    last = np.round(last, 2)
    wChng = np.round(wChng, 2)
    wChngp = np.round(wChngp, 2)

    vavgs = np.array(vavgs).astype(int)
    volAvgWOhv = np.array(volAvgWOhv).astype(int)
    HVdAV = np.round(HVdAV, 2)

    dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh': vmaxs, 'volAvg': vavgs, 'daysTraded': daysTraded
                 , 'HSDL': HSDL, 'HSDG': HSDG, 'HSDLp': HSDLp, 'HSDGp': HSDGp, 'first': first, 'last': last, 'wChng': wChng, 'wChngp': wChngp
                 , 'lastTrdDoW': lastTrdDoW, 'TI': TI, 'volAvgWOhv': volAvgWOhv, 'HVdAV': HVdAV, 'CPveoHVD': CPveoHVD
                 , 'lastDVotWk': lastDVotWk, 'lastDVdAV': lastDVdAV}
    df_weekly = pd.DataFrame(data=dict_temp)

    df_weekly['sc_code'] = ticker

    cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
            , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
            , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
    df_weekly = df_weekly[cols].copy()

    # df_weekly_all will be empty when it's a new company or an FTA (First Time Analysis)
    if df_weekly_all.shape[0] == 0:
        df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))

    # Keep only the yearWeeks of df_weekly_all that are not present in df_weekly
    a = set(df_weekly_all['yearWeek'])
    b = set(df_weekly['yearWeek'])
    c = list(a.difference(b))
    #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)))
    df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()

    # Append the latest week data to df_weekly
    df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
    #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))

    return df_weekly_all
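On further improvement: the inner Python loops over the split groups compute per-week reductions (max, min, first, last, mean) that can be vectorised. Since the daily rows of one ticker are sorted by date, each yearWeek forms a contiguous block, so np.unique plus ufunc.reduceat can replace the split-and-loop for the close section. A rough, self-contained sketch of the idea, using toy arrays standing in for arr_yearWeek / arr_close inside the function (it assumes arr_yearWeek is non-decreasing):

import numpy as np

# toy stand-ins for the per-ticker columns used inside weekly_trend_analysis
arr_yearWeek = np.array([201913, 201913, 201913, 201914, 201914])
arr_close    = np.array([1313.55, 206.65, 723.05, 56.0, 136.85])

# np.unique gives the first row index and the length of each weekly block
yearWeek, starts, daysTraded = np.unique(arr_yearWeek, return_index=True, return_counts=True)
ends = starts + daysTraded - 1                    # last row index of each week

cmaxs  = np.maximum.reduceat(arr_close, starts)   # weekly close high, replaces np.amax in the loop
cmins  = np.minimum.reduceat(arr_close, starts)   # weekly close low, replaces np.amin in the loop
first  = arr_close[starts]                        # first close of the week
last   = arr_close[ends]                          # last close of the week
wChng  = last - first
wChngp = (last / first) * 100 - 100

The same pattern covers the Chng, Chngp, ts and volume sections, and numpy_indexed exposes such reductions directly as well (group_by(...).sum(...) is already used above; min, max and mean work the same way), which would also remove the repeated np.column_stack calls.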
Input data
ts = ['2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-01 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00','2019-04-02 00:00:00']
sc_code = ['500002','500002','500002','500002','500002','500002','500002','500002','500002','500002']
high = [1326.6, 208.45, 732.15, 14.87, 1979.0, 57.8, 11.55, 1.68, 8.1, 139.4]
low = [1306.35, 204.0, 717.05, 13.41, 1937.65, 54.65, 11.2, 1.52, 7.75, 135.65]
close = [1313.55, 206.65, 723.05, 13.53, 1955.25, 56.0, 11.21, 1.68, 8.1, 136.85]
prevclose = [1319.85, 202.95, 718.95, 14.29, 1967.3, 54.65, 11.22, 1.6, 7.75, 135.05]
volumes = [7785, 6150, 21641, 46296, 707019, 40089, 25300, 5920, 500, 235355]
yearWeek = [201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913, 201913]
chng = [-6.29, 3.70, 4.09, -0.75, -12.04, 1.35, -0.09, 0.079, 0.34, 1.79]
chngp = [-0.48, 1.82, 0.57, -5.32, -0.61, 2.47, -0.09, 5.0, 4.52, 1.33]
dict_temp = {'ts':ts, 'sc_code':sc_code, 'high':high, 'low':low, 'close':close, 'prevclose':prevclose, 'volumes':volumes, 'yearWeek':yearWeek, 'chng':chng, 'chngp':chngp}
df_weekly = pd.DataFrame(data=dict_temp)
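For reference, the sample frame above (named df_weekly in the snippet, although it holds daily rows) can be fed straight into the function; a minimal call, assuming an empty accumulator for a first-time analysis:

df_daily = pd.DataFrame(data=dict_temp)   # the sample data above
result = weekly_trend_analysis('BSE', pd.DataFrame(), df_daily)
print(result[['sc_code', 'yearWeek', 'daysTraded', 'closeL', 'closeH', 'volAvg']])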
Adding line-profiler details.
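For anyone wanting to reproduce this kind of report, line_profiler can be driven roughly like this (a sketch; it assumes weekly_trend_analysis is decorated with @profile in BTD-Analysis1V3.py when using kernprof):

# from the command line:
#   kernprof -l BTD-Analysis1V3.py
#   python -m line_profiler BTD-Analysis1V3.py.lprof
#
# or interactively in IPython/Jupyter, without editing the source:
%load_ext line_profiler
%lprun -f weekly_trend_analysis weekly_trend_analysis(exchange, df_weekly, df_daily)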
('getcwd : ', '/home/bobby_dreamer')
Timer unit: 1e-06 s
Total time: 0.043637 s
File: BTD-Analysis1V3.py
Function: weekly_trend_analysis at line 36
Line # Hits Time Per Hit % Time Line Contents
==============================================================
36 def weekly_trend_analysis(exchange, df_weekly_all, df_daily):
37
38 1 3.0 3.0 0.0 if exchange == 'BSE':
39 1 963.0 963.0 2.2 ticker = df_daily.iloc[0]['sc_code']
40 else:
41 ticker = df_daily.iloc[0]['symbol']
42
95 # Last Traded Date of the Week
96 1 3111.0 3111.0 7.1 i = df_daily[['yearWeek', 'ts']].to_numpy()
97 1 128.0 128.0 0.3 j = npi.group_by(i[:, 0]).split(i[:, 1])
98
160
161 1 3.0 3.0 0.0 dict_temp = {'yearWeek': yearWeek, 'closeH': cmaxs, 'closeL': cmins, 'volHigh':vmaxs, 'volAvg':vavgs, 'daysTraded':daysTraded
162 1 2.0 2.0 0.0 ,'HSDL':HSDL, 'HSDG':HSDG, 'HSDLp':HSDLp, 'HSDGp':HSDGp, 'first':first, 'last':last, 'wChng':wChng, 'wChngp':wChngp
163 1 2.0 2.0 0.0 ,'lastTrdDoW':lastTrdDoW, 'TI':TI, 'volAvgWOhv':volAvgWOhv, 'HVdAV':HVdAV, 'CPveoHVD':CPveoHVD
164 1 2.0 2.0 0.0 ,'lastDVotWk':lastDVotWk, 'lastDVdAV':lastDVdAV}
165 1 3677.0 3677.0 8.4 df_weekly = pd.DataFrame(data=dict_temp)
166
167 1 1102.0 1102.0 2.5 df_weekly['sc_code'] = ticker
168
169 1 3.0 3.0 0.0 cols = ['sc_code', 'yearWeek', 'lastTrdDoW', 'daysTraded', 'closeL', 'closeH', 'volAvg', 'volHigh'
170 1 1.0 1.0 0.0 , 'HSDL', 'HSDG', 'HSDLp', 'HSDGp', 'first', 'last', 'wChng', 'wChngp', 'TI', 'volAvgWOhv', 'HVdAV'
171 1 2.0 2.0 0.0 , 'CPveoHVD', 'lastDVotWk', 'lastDVdAV']
172
173 1 2816.0 2816.0 6.5 df_weekly = df_weekly[cols].copy()
174
175 # df_weekly_all will be 0, when its a new company or its a FTA(First Time Analysis)
176 1 13.0 13.0 0.0 if df_weekly_all.shape[0] == 0:
177 1 20473.0 20473.0 46.9 df_weekly_all = pd.DataFrame(columns=list(df_weekly.columns))
178
179 # Removing all yearWeek in df_weekly2 from df_weekly
180 1 321.0 321.0 0.7 a = set(df_weekly_all['yearWeek'])
181 1 190.0 190.0 0.4 b = set(df_weekly['yearWeek'])
182 1 5.0 5.0 0.0 c = list(a.difference(b))
183 #print('df_weekly_all={}, df_weekly={}, difference={}'.format(len(a), len(b), len(c)) )
184 1 1538.0 1538.0 3.5 df_weekly_all = df_weekly_all[df_weekly_all.yearWeek.isin(c)].copy()
185
186 # Append the latest week data to df_weekly
187 1 6998.0 6998.0 16.0 df_weekly_all = pd.concat([df_weekly_all, df_weekly], sort=False)
188 #print('After concat : df_weekly_all={}'.format(df_weekly_all.shape[0]))
189
190 1 2.0 2.0 0.0 return df_weekly_all
After reviewing the above profile, I made changes to the code that consumed the most time, basically adding more numpy and removing pandas inside the function. The updated code, run with the whole data, took only 7min 47s (wall time).
I encountered some numpy errors like the one below and handled them by writing intermediate files. I am using a Windows machine and the intermediate files were under 3 MB; I am not sure if there were any other limitations.
MemoryError: Unable to allocate array with shape (82912, 22) and data type <U32
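The shape and dtype in that error point at the likely cause: np.column_stack over columns of mixed types (strings, ints, floats) promotes every cell to a fixed-width unicode dtype, so an (82912, 22) <U32 array needs roughly 82912 × 22 × 32 characters × 4 bytes ≈ 230 MB in one contiguous block, which can fail on a 32-bit Python build or a fragmented address space even when enough RAM is free. A tiny, self-contained illustration of the promotion (hypothetical values, just to show the dtype):

import numpy as np

ticker = np.full(3, '500002')                  # string column
week   = np.array([201913, 201914, 201915])    # int64 column
close  = np.array([1313.55, 206.65, 723.05])   # float64 column

stacked = np.column_stack((ticker, week, close))
print(stacked.dtype)    # fixed-width text, <U32 on this data, matching the error above
print(stacked.nbytes)   # each cell now costs dtype.itemsize bytes

Keeping the numeric columns in a separate float array (or using a structured array) instead of stacking them with the ticker strings avoids the blow-up.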
Line # Hits Time Per Hit % Time Line Contents
==============================================================
38 def weekly_trend_analysis_np(exchange, np_weekly_all, df_daily):
39
40 1 4.0 4.0 0.0 if exchange == 'BSE':
43 1 152.0 152.0 1.2 ticker = df_daily['sc_code'].to_numpy()[0]
44 else:
47 ticker = df_daily['symbol'].to_numpy()[0]
48
101 # Last Traded Date of the Week
102 1 33.0 33.0 0.3 arr_concat = np.column_stack((arr_yearWeek, arr_ts))
103 1 341.0 341.0 2.6 npi_gb = npi.group_by(arr_concat[:, 0]).split(arr_concat[:, 1])
104
152 1 5.0 5.0 0.0 yearWeek = yearWeek.astype(int)
153 1 59.0 59.0 0.5 HSDL = np.round(HSDL,2)
154 1 26.0 26.0 0.2 HSDG = np.round(HSDG,2)
155 1 23.0 23.0 0.2 HSDLp = np.round(HSDLp,2)
156 1 23.0 23.0 0.2 HSDGp = np.round(HSDGp,2)
157
158 1 23.0 23.0 0.2 first = np.round(first,2)
159 1 23.0 23.0 0.2 last = np.round(last,2)
160 1 23.0 23.0 0.2 wChng = np.round(wChng,2)
161 1 23.0 23.0 0.2 wChngp = np.round(wChngp,2)
162
163 1 12.0 12.0 0.1 vavgs = np.array(vavgs).astype(int)
164 1 16.0 16.0 0.1 volAvgWOhv = np.array(volAvgWOhv).astype(int)
165 1 24.0 24.0 0.2 HVdAV = np.round(HVdAV,2)
166
167 1 16.0 16.0 0.1 ticker = np.full(yearWeek.shape[0], ticker)
168 1 2.0 2.0 0.0 np_weekly = np.column_stack((ticker, yearWeek, lastTrdDoW, daysTraded, cmins, cmaxs, vavgs, vmaxs, HSDL
169 1 2.0 2.0 0.0 , HSDG, HSDLp, HSDGp, first, last, wChng, wChngp, TI, volAvgWOhv, HVdAV
170 1 546.0 546.0 4.2 , CPveoHVD, lastDVotWk, lastDVdAV))
171
173 1 2.0 2.0 0.0 if len(np_weekly_all) > 0:
175 1 2.0 2.0 0.0 a = np_weekly_all[:,1]
176 1 1.0 1.0 0.0 b = np_weekly[:,1]
177 1 205.0 205.0 1.6 tf_1 = np.isin(a, b, invert=True)
179 1 13.0 13.0 0.1 t_result = list(compress(range(len(tf_1)), tf_1))
181 1 13.0 13.0 0.1 np_weekly_all = np_weekly_all[t_result]
182 1 40.0 40.0 0.3 np_weekly_all = np.vstack((np_weekly_all, np_weekly))
183 else:
184 np_weekly_all = []
185 np_weekly_all = np.vstack((np_weekly))
186
187 1 2.0 2.0 0.0 return np_weekly_all
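One small follow-up visible in that profile: tf_1 from np.isin(..., invert=True) is already a boolean mask, so the itertools.compress step is unnecessary and the mask can index the rows directly. A sketch, assuming np_weekly_all and np_weekly are stacked exactly as in the profiled function:

import numpy as np

keep = np.isin(np_weekly_all[:, 1], np_weekly[:, 1], invert=True)   # weeks not replaced by the new block
np_weekly_all = np.vstack((np_weekly_all[keep], np_weekly))          # boolean-mask indexing, no compress needed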
I would be glad to hear your suggestions, and thanks for pointing me to the profiler, I didn't know about that.
I have edited the question and posted the line-profiler output above. I have updated the code from pandas to numpy, which reduced the time considerably, from the initial 1h 26min to now about 7 min.