I have an Excel file with 95,880 rows. I wrote a VBA function that runs slowly, so I tried coding a Python script with openpyxl, but it's even slower.
It starts fast, then after about 600 rows it becomes slower and slower.
The VBA code is:
Option Explicit

Function FTE(Assunzione As Date, Cess As Variant, Data)
    Dim myDate As Date
    Dim EndDate As Date, EndDate2 As Date
    Dim check As Integer
    EndDate = Application.WorksheetFunction.EoMonth(Assunzione, 0)
    myDate = #1/1/2022#
    If Cess = 0 Then
        Call Check2(Assunzione, Data, myDate, EndDate, check)
        FTE = check
    Else
        EndDate2 = Application.WorksheetFunction.EoMonth(Cess, -1)
        Call Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
        FTE = check
    End If
End Function

Sub Check1(Assunzione, Cess, Data, myDate, EndDate, EndDate2, check)
    Dim Cess1 As Date
    Dim gg_lav As Integer, gg_lav2 As Integer
    Cess1 = Cess.Value
    If Assunzione > Date Then
        check = 0
    Else
        If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
            If Assunzione > myDate Then
                gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
                If gg_lav >= 15 Then
                    If Month(Data) = (Month(EndDate2) + 1) And Year(Cess1) = 2022 Then
                        gg_lav2 = Application.WorksheetFunction.Days(Cess1, EndDate2)
                        If gg_lav2 >= 15 Then
                            check = 1
                        Else
                            check = 0
                        End If
                    Else
                        check = 1
                    End If
                Else
                    check = 0
                End If
            Else
                check = 1
            End If
        Else
            check = 1
        End If
    End If
End Sub

Sub Check2(Assunzione, Data, myDate, EndDate, check)
    Dim gg_lav As Integer
    If Assunzione > Date Then
        check = 0
    Else
        If Month(Assunzione) <= Month(Data) And Year(Assunzione) = 2022 Then
            If Assunzione > myDate Then
                gg_lav = Application.WorksheetFunction.Days(EndDate, Assunzione) + 1
                If gg_lav >= 15 Then
                    check = 1
                Else
                    check = 0
                End If
            Else
                check = 1
            End If
        Else
            check = 1
        End If
    End If
End Sub
and my openpyxl code is:
import calendar
from datetime import datetime, date
from openpyxl import Workbook


def check1(a, d, c, i):
    if ws.cell(row=i, column=a).value > ws.cell(row=i, column=d).value:
        return 0
    else:
        if ws.cell(row=i, column=a).value.month == ws.cell(row=i, column=d).value.month and ws.cell(row=i, column=a).value.year == 2022:
            EndDate = date(ws.cell(row=i, column=a).value.year, ws.cell(row=i, column=a).value.month,
                           calendar.monthrange(ws.cell(row=i, column=a).value.year,
                                               ws.cell(row=i, column=a).value.month)[1])
            gg_lav = (EndDate - datetime.date(ws.cell(row=i, column=a).value)).days
            if gg_lav >= 15:
                EndDate2 = date(ws.cell(row=i, column=c).value.year, ws.cell(row=i, column=c).value.month - 1,
                                calendar.monthrange(ws.cell(row=i, column=c).value.year,
                                                    ws.cell(row=i, column=c).value.month - 1)[1])
                if ws.cell(row=i, column=d).value.month == EndDate2.month and ws.cell(row=i, column=c).value.year == 2022:
                    gg_lav2 = (datetime.date(ws.cell(row=i, column=c).value) - EndDate2).days
                    if gg_lav2 >= 15:
                        return 1
                    else:
                        return 0
                else:
                    return 1
            else:
                return 0
        else:
            return 1


def check2(a, d, i):
    if ws.cell(row=i, column=a).value > ws.cell(row=i, column=d).value:
        return 0
    else:
        if ws.cell(row=i, column=a).value.month == ws.cell(row=i, column=d).value.month and ws.cell(row=i, column=a).value.year == 2022:
            EndDate = date(ws.cell(row=i, column=a).value.year, ws.cell(row=i, column=a).value.month,
                           calendar.monthrange(ws.cell(row=i, column=a).value.year,
                                               ws.cell(row=i, column=a).value.month)[1])
            gg_lav = (EndDate - datetime.date(ws.cell(row=i, column=a).value)).days
            if gg_lav >= 15:
                return 1
            else:
                return 0
        else:
            return 1
wb1 = Workbook()
ws1 = wb1.create_sheet()
for i in range(2, 95882):
    if ws.cell(row=i, column=c).value is None:
        ws1.cell(row=i, column=1, value=check2(a, d, i))
    else:
        ws1.cell(row=i, column=1, value=check1(a, d, c, i))
What am I doing wrong? Should I use another library, or am I making the code needlessly memory-consuming?
Thank you very much for any help!
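One pattern that might explain the slowdown: every comparison calls ws.cell(row=..., column=...) again, several times per row, and if the workbook is opened in read-only mode openpyxl re-parses the sheet for each such random access. A minimal single-pass sketch (hypothetical file name; the loading code isn't shown above):

from openpyxl import load_workbook

wb = load_workbook('data.xlsx', read_only=True)  # hypothetical file name
ws = wb.active

# One streaming pass: each row arrives as a plain tuple of values, so
# check1/check2 can index the tuple instead of calling ws.cell repeatedly.
rows = list(ws.iter_rows(min_row=2, values_only=True))

check1 and check2 would then take a row tuple and index it with a, d, c directly (minus 1, since tuples are zero-based).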
Update: I think the problem was with openpyxl. First I tried reducing the number of observations from 95K to almost 5K, but it still required two and a half hours to complete the task.
So I used numpy and it took 55 seconds. Yeah, that's the difference in processing speed.
Here is the code:
import csv
import numpy as np

with open('data.csv', 'r') as f:
    data = list(csv.reader(f, delimiter=';'))
arr = np.array(data)
arr = np.resize(arr, (4797, 13))
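One caveat with this load (an assumption about the intent): np.resize never errors on a shape mismatch; it truncates or repeats the input to fill the requested shape, so a stray extra column in data.csv would silently wrap the data around. A tiny self-contained example:

import numpy as np

# np.resize fills the requested shape by repeating the input if needed.
print(np.resize(np.array([1, 2, 3]), (2, 3)))
# [[1 2 3]
#  [1 2 3]]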
Of course I had to change the code in this section:
a = 3
d = 0
c = 4

def check1(a, d, c, i):
    if int(arr[i][a]) > int(arr[i][d]):
        return 0
    else:
        za = datetime.fromordinal(int(arr[i][a]) + 693594)
        zd = datetime.fromordinal(int(arr[i][d]) + 693594)
        da = date(za.year, za.month, za.day)
        dd = date(zd.year, zd.month, zd.day)
        if za.month == zd.month and za.year + 1899 == 2022:
            EndDate = date(za.year, za.month,
                           calendar.monthrange(za.year, za.month)[1])
            gg_lav = (EndDate - da).days
            if gg_lav >= 15:
                zc = datetime.fromordinal(int(arr[i][c]) + 693594)
                dc = date(zc.year, zc.month, zc.day)
                EndDate2 = date(zc.year, zc.month - 1,
                                calendar.monthrange(zc.year, zc.month - 1)[1])
                if zd.month == EndDate2.month and zc.year == 2022:
                    gg_lav2 = (dc - EndDate2).days
                    if gg_lav2 >= 15:
                        return 1
                    else:
                        return 0
                else:
                    return 1
            else:
                return 0
        else:
            return 1
I don't report the check2 function here.
fte = np.array(10)
for i in range(1, 4797):
    if arr[i][c] == '':
        fte = np.append(fte, check2(a, d, i))
    else:
        fte = np.append(fte, check1(a, d, c, i))
    print(i)
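One further speed-up that should be safe: np.append copies the entire array on every call, so appending in a loop is quadratic, while collecting results in a list (here a comprehension) and converting once is linear. A sketch with the same a, d, c (note it also drops the stray leading 10 that fte = np.array(10) seeds):

# np.append reallocates and copies on each call; build the list in one pass.
fte = np.array([
    check2(a, d, i) if arr[i][c] == '' else check1(a, d, c, i)
    for i in range(1, 4797)
])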
I am trying to create an indicator that will find all the divergences between two signals.
The output of the function so far looks like this.
But the problem is that it is painfully slow when I use it with long signals. Could anyone help me make it faster, if possible?
My code:
import concurrent.futures
from concurrent.futures import ThreadPoolExecutor

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema
from sortedcontainers import SortedSet


def find_divergence(price: pd.Series, indicator: pd.Series, width_divergence: int, order: int):
    div = pd.DataFrame(index=range(price.size), columns=[
        f"Bullish_{width_divergence}_{order}",
        f"Berish_{width_divergence}_{order}"
    ])
    div[f'Bullish_idx_{width_divergence}_{order}'] = False
    div[f'Berish_idx_{width_divergence}_{order}'] = False

    def calc_argrelextrema(price_: np.ndarray):
        return argrelextrema(price_, np.less_equal, order=order)[0]

    price_ranges = []
    for i in range(len(price)):
        price_ranges.append(price.values[0:i + 1])

    f = []
    with ThreadPoolExecutor(max_workers=16) as exe:
        for i in price_ranges:
            f.append(exe.submit(calc_argrelextrema, i))

    prices_lows = SortedSet()
    for r in concurrent.futures.as_completed(f):
        data = r.result()
        for d in reversed(data):
            if d not in prices_lows:
                prices_lows.add(d)
            else:
                break

    price_lows_idx = pd.Series(prices_lows)

    for idx_1 in range(price_lows_idx.size):
        min_price = price[price_lows_idx[idx_1]]
        min_indicator = indicator[price_lows_idx[idx_1]]
        for idx_2 in range(idx_1 + 1, idx_1 + width_divergence):
            if idx_2 >= price_lows_idx.size:
                break
            if price[price_lows_idx[idx_2]] < min_price:
                min_price = price[price_lows_idx[idx_2]]
            if indicator[price_lows_idx[idx_2]] < min_indicator:
                min_indicator = indicator[price_lows_idx[idx_2]]
            consistency_price_rd = min_price == price[price_lows_idx[idx_2]]
            consistency_indicator_rd = min_indicator == indicator[price_lows_idx[idx_1]]
            consistency_price_hd = min_price == price[price_lows_idx[idx_1]]
            consistency_indicator_hd = min_indicator == indicator[price_lows_idx[idx_2]]
            diff_price = price[price_lows_idx[idx_1]] - price[price_lows_idx[idx_2]]  # should be neg
            diff_indicator = indicator[price_lows_idx[idx_1]] - indicator[price_lows_idx[idx_2]]  # should be pos
            is_regular_divergence = diff_price > 0 and diff_indicator < 0
            is_hidden_divergence = diff_price < 0 and diff_indicator > 0
            if is_regular_divergence and consistency_price_rd and consistency_indicator_rd:
                div.at[price_lows_idx[idx_2], f'Bullish_{width_divergence}_{order}'] = (price_lows_idx[idx_1], price_lows_idx[idx_2])
                div.at[price_lows_idx[idx_2], f'Bullish_idx_{width_divergence}_{order}'] = True
            elif is_hidden_divergence and consistency_price_hd and consistency_indicator_hd:
                div.at[price_lows_idx[idx_2], f'Berish_{width_divergence}_{order}'] = (price_lows_idx[idx_1], price_lows_idx[idx_2])
                div.at[price_lows_idx[idx_2], f'Berish_idx_{width_divergence}_{order}'] = True
    return div
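A likely bottleneck is that the loop runs argrelextrema over every prefix of the series, which is O(n²) in total work. If it is acceptable for an extremum near the start to be confirmed using later data (the per-prefix version only ever sees past data, which may be intentional), a single call over the full array finds the lows in one pass. A sketch of what would replace everything from price_ranges = [] down to the price_lows_idx assignment inside find_divergence:

import numpy as np
import pandas as pd
from scipy.signal import argrelextrema

# One call over the whole series instead of one call per prefix.
price_lows_idx = pd.Series(argrelextrema(price.values, np.less_equal, order=order)[0])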
I have been using this code for around six months and now it is throwing a ValueError: Columns must be same length as key. I haven't changed anything, so I am not sure what could be wrong. Basically, I am pulling data from my system, which has names formatted like FN1,FN2 LN1,LN2, and I need the names to be FN1 LN1 and FN2 LN2. The code runs fine until this last line:
df_gs_recruiter[[f'Recruiter_{j}']] = df_gs_recruiter[f'FN{j}'] + ' ' + df_gs_recruiter[f'LN{j}']
Sample of the data in:
   Job ID                  Recruiters
0  729538  Bonnie,Tina Smith,Matthews
1  720954      Cindy,Ken Harris,Walsh
2  720954      Cindy,Ken Harris,Walsh
3  721061      Cindy,Ken Harris,Walsh
import numpy as np

num_comma = df_gs_recruiter.Recruiters.str.count(',')
r_min = num_comma.min()
r_max = num_comma.max()
print(r_min, r_max)

df_gs_recruiter[['FN', 'LN']] = df_gs_recruiter.Recruiters.str.extract('(.*) (.*)', expand=True)

for i in range(len(df_gs_recruiter)):
    r = df_gs_recruiter.loc[i, 'Recruiters'].count(',')
    if r/2 == 6:
        df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5','FN6','FN7']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5','LN6','LN7']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*),(.*)', expand=True)
    elif r/2 == 5:
        df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5','FN6']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5','LN6']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*),(.*)', expand=True)
    elif r/2 == 4:
        df_gs_recruiter[['FN1','FN2','FN3','FN4','FN5']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2','LN3','LN4','LN5']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*),(.*)', expand=True)
    elif r/2 == 3:
        df_gs_recruiter[['FN1','FN2','FN3','FN4']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2','LN3','LN4']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*),(.*)', expand=True)
    elif r/2 == 2:
        df_gs_recruiter[['FN1','FN2','FN3']] = df_gs_recruiter.FN.str.extract('(.*),(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2','LN3']] = df_gs_recruiter.LN.str.extract('(.*),(.*),(.*)', expand=True)
    elif r/2 == 1:
        df_gs_recruiter[['FN1','FN2']] = df_gs_recruiter.FN.str.extract('(.*),(.*)', expand=True)
        df_gs_recruiter[['LN1','LN2']] = df_gs_recruiter.LN.str.extract('(.*),(.*)', expand=True)
    df_gs_recruiter.loc[i, 'num'] = r/2 + 1
    df_gs_recruiter.loc[i, 'num'].astype(np.int8)
    if df_gs_recruiter.loc[i, 'num'] < 1.5:
        df_gs_recruiter.loc[i, 'FN0'] = df_gs_recruiter.loc[i, 'FN']
        df_gs_recruiter.loc[i, 'LN0'] = df_gs_recruiter.loc[i, 'LN']
    else:
        df_gs_recruiter.loc[i, 'FN0'] = 'null'
        df_gs_recruiter.loc[i, 'LN0'] = 'null'

df_gs_recruiter.replace('null', np.nan, inplace=True)

for j in range(0, int(r_max/2) + 2):
    df_gs_recruiter[[f'Recruiter_{j}']] = df_gs_recruiter[f'FN{j}'] + ' ' + df_gs_recruiter[f'LN{j}']
The print(r_min, r_max) output is:
0 4
Then this next part runs and shows the outcome:
for col in df_gs_recruiter.columns:
    if 'Recruiter_' not in col and 'Job ID' not in col:
        df_gs_recruiter.drop([f'{col}'], axis=1, inplace=True)
df_gs_recruiter
Expected outcome is:
   Job ID Recruiter_0   Recruiter_1    Recruiter_2 Recruiter_3
0  729538         NaN  Bonnie Smith  Tina Matthews         NaN
1  720954         NaN  Cindy Harris      Ken Walsh         NaN
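For what it's worth, this ValueError usually points at the double-bracketed assignment df[['col']] = <Series>, where pandas expects as many columns on the right as keys on the left; single brackets sidestep it. A minimal sketch of the whole reshaping with str.split, on a hypothetical two-row frame:

import pandas as pd

df = pd.DataFrame({'Job ID': [729538, 720954],
                   'Recruiters': ['Bonnie,Tina Smith,Matthews',
                                  'Cindy,Ken Harris,Walsh']})

# Split once on the space separating the first-name group from the
# last-name group (assumes no spaces inside names), then split each
# group on commas and pair the pieces up column by column.
names = df['Recruiters'].str.split(' ', n=1, expand=True)
fn = names[0].str.split(',', expand=True)
ln = names[1].str.split(',', expand=True)
for j in fn.columns:
    df[f'Recruiter_{j}'] = fn[j] + ' ' + ln[j]  # single brackets on the left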
I had issues with the line below, which throws
TypeError: Cannot index by location index with a non-integer key
How should I change the code?
exitprice = df.iloc[exittime + dt.timedelta(minutes=1)].Open
import yfinance as yf
import pandas as pd
import datetime as dt

asset = "TSLA"
intraday = yf.download(asset, start="2021-03-17", end="2021-03-18", interval="1m")

def Intradaytrend(df, entry, exit):
    ret_120min = df.iloc[120].Open / df.iloc[0].Open - 1
    tickret = df.Open.pct_change()
    if ret_120min > entry:
        buyprice = df.iloc[121].Open
        buytime = df.iloc[121].name
        cumulated = (tickret.loc[buytime:] + 1).cumprod() - 1
        exittime = cumulated[(cumulated < -exit) | (cumulated > exit)].first_valid_index()
        if exittime is None:
            exitprice = df.iloc[-1].Open
        else:
            exitprice = df.iloc[exittime + dt.timedelta(minutes=1)].Open
        profit = exitprice - buyprice
        profitrel = profit / buyprice
        return profitrel
    else:
        return None

Intradaytrend(intraday, 0.01, 0.01)
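Presumably the fix is label-based indexing: exittime comes from first_valid_index() on a minute-indexed Series, so it is a Timestamp label, not an integer position, and .iloc rejects it with exactly this TypeError. A sketch of the corrected line (assuming the index contains an entry one minute after exittime):

# exittime is an index label (a Timestamp), so use label-based .loc;
# .iloc only accepts integer positions, hence the TypeError.
exitprice = df.loc[exittime + dt.timedelta(minutes=1)].Open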
Below is a script I had some help with. I would like to alter it to add two new columns, each with three possible values: Date | gamePK | Home | Home Rest | Away | Away Rest.
The current matches.csv format is Date | gamePK | Home | Away.
Home Rest & Away Rest: -1 if the team played the day prior vs. a team that didn't, 1 if the team didn't play the day prior vs. an opponent who did, 0 otherwise.
Any information on how to create the columns and write this logic for them would be much appreciated.
import csv
import requests
import datetime
from pprint import pprint
import time
import pandas as pd

kp = []
for i in range(20001, 20070):
    req = requests.get('https://statsapi.web.nhl.com/api/v1/schedule?site=en_nhl&gamePk=20180' + str(i) + '&leaderGameTypes=R&expand=schedule.broadcasts.all,schedule.radioBroadcasts,schedule.teams,schedule.ticket,schedule.game.content.media.epg')
    data = req.json()
    for item in data['dates']:
        date = item['date']
        games = item['games']
        for game in games:
            gamePk = game['gamePk']
            season = game['season']
            teams = game['teams']
            home = teams['home']
            home_tm = home['team']['abbreviation']
            away = teams['away']
            away_tm = away['team']['abbreviation']
            print(date, gamePk, away_tm, home_tm)
            kp.append([date, gamePk, away_tm, home_tm])

pprint(kp)
df = pd.DataFrame(kp, columns=['Date', 'gamePk', 'Home', 'Away'])
df.to_csv('matches.csv', sep=',', header=True, index=False)
time.sleep(5)
def find_last(match_date, da, team):
    home_play = da[da['Home'] == team].tail(1)  # find last matches played at home, select greatest
    away_play = da[da['Away'] == team].tail(1)  # find last matches played away, select greatest
    # then take the last match played, either home or away, whichever is more recent
    if home_play.empty and away_play.empty:
        print(team, "no matches before this date")
        last_match = 'NA'
    elif home_play.empty:
        last_match = away_play.Date.item()
    elif away_play.empty:
        last_match = home_play.Date.item()
    else:
        last_match = max([home_play.Date.item(), away_play.Date.item()])
    if last_match != 'NA':
        # and then subtract this from "todays" date (match_date)
        duration_since_last = pd.to_datetime(match_date) - pd.to_datetime(last_match)
        print("Team:", team)
        print("Todays game date =", match_date)
        print("Last match played =", last_match)
        print("Rest Period =", duration_since_last)
        print()
        return duration_since_last
df = pd.read_csv('matches.csv', sep=',')

for k in df.index:
    home_team = df.Home[k]
    away_team = df.Away[k]
    match_date = df.Date[k]
    gamePk = df.gamePk[k]
    # we want to find all date values less than todays match date
    da = df[df['Date'] < match_date]
    ## if not da.empty:
    for team in [home_team, away_team]:
        print("Record", k, home_team, 'vs', away_team)
        find_last(match_date, da, team)
    print('________________________________________')
The script you provided has been broken into separate sections for greater understanding.
The following new sections are required to produce your desired addition to the DataFrame:
On Game Day, What is the Previous Day
Did We Play on the Previous Day
Determine Game Day Handicap
Here's a Jupyter notebook of the work: nhl_stats_parsing
Code:
import csv
import requests
import datetime
from pprint import pprint
import time
import pandas as pd
from pprint import pprint as pp
import json

pd.set_option('max_columns', 100)
pd.set_option('max_rows', 300)

# ### make request to NHL stats server for data and save it to a file
address_p1 = 'https://statsapi.web.nhl.com/api/v1/schedule?site=en_nhl&gamePk=20180'
address_p2 = '&leaderGameTypes=R&expand=schedule.broadcasts.all,schedule.radioBroadcasts,schedule.teams,schedule.ticket,schedule.game.content.media.epg'

with open('data.json', 'w') as outfile:
    data_list = []
    for i in range(20001, 20070):  # end 20070
        req = requests.get(address_p1 + str(i) + address_p2)
        data = req.json()
        data_list.append(data)  # append each request to the data list; will be a list of dicts
    json.dump(data_list, outfile)  # save the json file so you don't have to keep hitting the nhl server with your testing

# ### read the json file back in
with open('data.json') as f:
    data = json.load(f)

# ### this is what 1 record looks like
for i, x in enumerate(data):
    if i == 0:
        pp(x)
# ### parse each dict
kp = []
for json_dict in data:
    for item in json_dict['dates']:
        date = item['date']
        games = item['games']
        for game in games:
            gamePk = game['gamePk']
            season = game['season']
            teams = game['teams']
            home = teams['home']
            home_tm = home['team']['abbreviation']
            away = teams['away']
            away_tm = away['team']['abbreviation']
            print(date, gamePk, away_tm, home_tm)
            kp.append([date, gamePk, away_tm, home_tm])

# ### create DataFrame and save to csv
df = pd.DataFrame(kp, columns=['Date', 'gamePk', 'Home', 'Away'])
df.to_csv('matches.csv', sep=',', header=True, index=False)

# ### read in csv into DataFrame
df = pd.read_csv('matches.csv', sep=',')
print(df.head())  # first 5
## On Game Day, What is the Previous Day
def yesterday(date):
    today = datetime.datetime.strptime(date, '%Y-%m-%d')
    return datetime.datetime.strftime(today - datetime.timedelta(1), '%Y-%m-%d')

def yesterday_apply(df):
    df['previous_day'] = df.apply(lambda row: yesterday(date=row['Date']), axis=1)

yesterday_apply(df)

## Did We Play on the Previous Day
def played_previous_day(df, date, team):
    filter_t = f'(Date == "{date}") & ((Home == "{team}") | (Away == "{team}"))'
    filtered_df = df.loc[df.eval(filter_t)]
    if filtered_df.empty:
        return False  # didn't play previous day
    else:
        return True  # played previous day

def played_previous_day_apply(df):
    df['home_played_previous_day'] = df.apply(lambda row: played_previous_day(df, date=row['previous_day'], team=row['Home']), axis=1)
    df['away_played_previous_day'] = df.apply(lambda row: played_previous_day(df, date=row['previous_day'], team=row['Away']), axis=1)

played_previous_day_apply(df)
# # Determine Game Day Handicap
# Home Rest & Away Rest (-1 if the team played the day prior vs a team that didn't,
# 1 if the team didn't play the day prior vs an opponent who did, 0 otherwise)
def handicap(team, home, away):
    if (team == 'home') and not home and away:
        return 1
    elif (team == 'away') and not home and away:
        return -1
    elif (team == 'home') and home and not away:
        return -1
    elif (team == 'away') and home and not away:
        return 1
    else:
        return 0

def handicap_apply(df):
    df['home_rest'] = df.apply(lambda row: handicap(team='home', home=row['home_played_previous_day'], away=row['away_played_previous_day']), axis=1)
    df['away_rest'] = df.apply(lambda row: handicap(team='away', home=row['home_played_previous_day'], away=row['away_played_previous_day']), axis=1)

handicap_apply(df)
print(df)
# ### data presentation method
def find_last(match_date, da, team):
    home_play = da[da['Home'] == team].tail(1)  # find last matches played at home, select greatest
    away_play = da[da['Away'] == team].tail(1)  # find last matches played away, select greatest
    # then take the last match played, either home or away, whichever is more recent
    if home_play.empty and away_play.empty:
        print(team, "no matches before this date")
        last_match = 'NA'
    elif home_play.empty:
        last_match = away_play.Date.item()
    elif away_play.empty:
        last_match = home_play.Date.item()
    else:
        last_match = max([home_play.Date.item(), away_play.Date.item()])
    if last_match != 'NA':
        # and then subtract this from "todays" date (match_date)
        duration_since_last = pd.to_datetime(match_date) - pd.to_datetime(last_match)
        print("Team:", team)
        print("Todays game date =", match_date)
        print("Last match played =", last_match)
        print("Rest Period =", duration_since_last)
        print()
        return duration_since_last
# ### produce your output
for k in df.index:
    home_team = df.Home[k]
    away_team = df.Away[k]
    match_date = df.Date[k]
    gamePk = df.gamePk[k]
    # we want to find all date values less than todays match date
    da = df[df['Date'] < match_date]
    ## if not da.empty:
    for team in [home_team, away_team]:
        print("Record", k, home_team, 'vs', away_team)
        find_last(match_date, da, team)  # call your method
    print('_' * 40)
Output:
         Date      gamePk Home Away previous_day  home_played_previous_day  away_played_previous_day  home_rest  away_rest
0  2018-10-03  2018020001  MTL  TOR   2018-10-02                     False                     False          0          0
1  2018-10-03  2018020002  BOS  WSH   2018-10-02                     False                     False          0          0
2  2018-10-03  2018020003  CGY  VAN   2018-10-02                     False                     False          0          0
3  2018-10-03  2018020004  ANA  SJS   2018-10-02                     False                     False          0          0
4  2018-10-04  2018020005  BOS  BUF   2018-10-03                      True                     False         -1          1
5  2018-10-04  2018020006  NSH  NYR   2018-10-03                     False                     False          0          0
6  2018-10-04  2018020007  WSH  PIT   2018-10-03                      True                     False         -1          1
7  2018-10-04  2018020008  NYI  CAR   2018-10-03                     False                     False          0          0
8  2018-10-04  2018020009  CHI  OTT   2018-10-03                     False                     False          0          0