Could someone please explain how to rewrite this code to run faster using pools? Sorry for the question, but I'm a beginner and have spent a lot of time trying without being able to figure it out.
for i in constituents:
    print(i)  # print out the ticker so we know the downloading progress
    prc = yf.download(i, interval="1d", start=start_date, end=end_date)
    prc = pd.DataFrame(prc['Adj Close'])  # select adjusted close price only
    prc.columns = [i]  # rename the column with the ticker of the stock
    try:
        df_prc = pd.concat([df_prc, prc], axis=1)  # if the dataframe already exists, join the newly downloaded data to the existing table
    except:
        df_prc = prc  # create the dataframe for the first ticker
    stk = yf.Ticker(i)
    try:
        stk.info['floatShares']
    except:
        stk.info['floatShares'] = None
    try:
        stk.info['sharesOutstanding']
    except:
        stk.info['sharesOutstanding'] = None
    if stk.info['floatShares']:
        mcap = prc * stk.info['floatShares']
    elif stk.info['sharesOutstanding']:
        mcap = prc * stk.info['sharesOutstanding']
    else:
        mcap = prc * (stk.info['marketCap'] / stk.info['previousClose'])
    try:
        df_mcap = pd.concat([df_mcap, mcap], axis=1)
    except:
        df_mcap = mcap
Further, to clarify my question, here is the code that runs before the block posted above:
import yfinance as yf
import pandas as pd

start_date = "2021-01-04"
end_date = "2021-11-29"

idx = "^STOXX50E"
Index = yf.download(idx,                            # ticker
                    interval="1d",                  # daily frequency
                    start=start_date, end=end_date) # sampling period
Index = pd.DataFrame(Index['Adj Close'].rename(idx))  # select adjusted close price

page = pd.read_html('https://en.wikipedia.org/wiki/EURO_STOXX_50')
constituents = page[2]['Ticker']  # we only need tickers
constituents.pop(46)  # Ticker UMG.AS is removed because otherwise the for loop produces an error
I was able to reduce the running time from 386 seconds to 17 seconds with the following code. Note that I had to import the ssl module and set ssl._create_default_https_context = ssl._create_unverified_context to overcome an SSL certificate error raised by the pd.read_html() call.
import yfinance as yf
import pandas as pd
from multiprocessing.pool import ThreadPool
from functools import partial
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

start_date = "2021-01-04"
end_date = "2021-11-29"

def process_constituent(data, constituent):
    prc = pd.DataFrame(data[constituent]['Adj Close'])  # select adjusted close price only
    prc.columns = [constituent]  # rename the column with the ticker of the stock
    stk = yf.Ticker(constituent)
    try:
        stk.info['floatShares']
    except:
        stk.info['floatShares'] = None
    try:
        stk.info['sharesOutstanding']
    except:
        stk.info['sharesOutstanding'] = None
    if stk.info['floatShares']:
        mcap = prc * stk.info['floatShares']
    elif stk.info['sharesOutstanding']:
        mcap = prc * stk.info['sharesOutstanding']
    else:
        mcap = prc * (stk.info['marketCap'] / stk.info['previousClose'])
    return mcap

def process_constituents(constituents):
    # Download all the tickers in one batched request:
    data = yf.download(
        tickers=' '.join(constituents),
        interval='1d',
        start=start_date,
        end=end_date,
        group_by='ticker',
        adjust=False,
        threads=True,
        proxy=None
    )
    pool = ThreadPool(len(constituents))
    for idx, mcap in enumerate(pool.imap(partial(process_constituent, data), constituents)):
        if idx == 0:
            df_mcap = mcap
        else:
            df_mcap = pd.concat([df_mcap, mcap], axis=1)
    return df_mcap

def main():
    idx = "^STOXX50E"
    Index = yf.download(idx,                            # ticker
                        interval="1d",                  # daily frequency
                        start=start_date, end=end_date) # sampling period
    Index = pd.DataFrame(Index['Adj Close'].rename(idx))  # select adjusted close price
    page = pd.read_html('https://en.wikipedia.org/wiki/EURO_STOXX_50')
    constituents = page[2]['Ticker']  # we only need tickers
    constituents.pop(46)  # Ticker UMG.AS is removed because otherwise the for loop produces an error
    df_mcap = process_constituents(constituents)
    print(df_mcap)

if __name__ == '__main__':
    main()
Prints:
[*********************100%***********************] 1 of 1 completed
[*********************100%***********************] 49 of 49 completed
ADS.DE ADYEN.AS AD.AS AI.PA ... TTE.PA DG.PA VOW.DE VNA.DE
Date ...
2021-01-04 5.083322e+10 4.880933e+10 2.414267e+10 6.200227e+10 ... 8.326552e+10 3.746300e+10 3.842743e+10 3.322534e+10
2021-01-05 4.983515e+10 4.800875e+10 2.403104e+10 6.134340e+10 ... 8.545638e+10 3.682896e+10 3.849667e+10 3.338207e+10
2021-01-06 5.019652e+10 4.548888e+10 2.411223e+10 6.147971e+10 ... 8.921219e+10 3.824197e+10 3.886594e+10 3.203872e+10
2021-01-07 4.964585e+10 4.500328e+10 2.407163e+10 6.195684e+10 ... 9.018724e+10 3.830537e+10 3.946601e+10 3.183722e+10
2021-01-08 5.078160e+10 4.610573e+10 2.400059e+10 6.232034e+10 ... 9.024743e+10 3.879449e+10 3.893518e+10 3.225142e+10
... ... ... ... ... ... ... ... ... ...
2021-11-22 4.851034e+10 6.454539e+10 3.178177e+10 7.108912e+10 ... 1.073903e+11 4.175263e+10 6.518791e+10 2.937961e+10
2021-11-23 4.727562e+10 6.298360e+10 3.187473e+10 7.017166e+10 ... 1.086315e+11 4.224230e+10 6.532881e+10 2.843048e+10
2021-11-24 4.667566e+10 6.206490e+10 3.153388e+10 7.028287e+10 ... 1.092141e+11 4.271798e+10 6.326233e+10 2.985586e+10
2021-11-25 4.659740e+10 6.453227e+10 3.159068e+10 7.013459e+10 ... 1.091381e+11 4.279726e+10 6.298054e+10 3.005144e+10
2021-11-26 4.405841e+10 6.358732e+10 3.132214e+10 6.882791e+10 ... 1.026661e+11 3.918302e+10 6.072620e+10 2.859604e+10
[233 rows x 49 columns]
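As a side note (my own sketch, not part of the answer above): because the batched download uses group_by='ticker', the columns of data form a MultiIndex of (ticker, field) pairs, so the whole table of adjusted closes can also be pulled out in a single step without a pool:

import pandas as pd

def adj_close_table(data: pd.DataFrame) -> pd.DataFrame:
    # keep only the 'Adj Close' field of every ticker;
    # the result has one column per ticker, indexed by date
    return data.xs('Adj Close', axis=1, level=1)

The ThreadPool still pays off for the per-ticker yf.Ticker(...).info lookups, since each of those is a separate HTTP request.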
import yfinance as yf
import pandas as pd
from multiprocessing.pool import ThreadPool
from functools import partial
import ssl

ssl._create_default_https_context = ssl._create_unverified_context

start_date = "2021-01-04"
end_date = "2021-11-29"

# download data
idx = "^STOXX50E"
Index = yf.download(idx,                            # ticker
                    interval="1d",                  # daily frequency
                    start=start_date, end=end_date) # sampling period
Index = pd.DataFrame(Index['Adj Close'].rename(idx))  # select adjusted close price

page = pd.read_html('https://en.wikipedia.org/wiki/EURO_STOXX_50')
constituents = page[2]['Ticker']  # we only need tickers
constituents.pop(46)  # Ticker UMG.AS is removed because otherwise the for loop produces an error

data = yf.download(
    tickers=' '.join(constituents),
    interval='1d',
    start=start_date,
    end=end_date,
    group_by='ticker',
    adjust=False,
    threads=True,
    proxy=None
)

def process_prc(data, constituent):
    prc = pd.DataFrame(data[constituent]['Adj Close'])  # select adjusted close price only
    prc.columns = [constituent]  # rename the column with the ticker of the stock
    return prc

def process_constituent(data, constituent):
    prc = pd.DataFrame(data[constituent]['Adj Close'])  # select adjusted close price only
    prc.columns = [constituent]  # rename the column with the ticker of the stock
    stk = yf.Ticker(constituent)
    try:
        stk.info['floatShares']
    except:
        stk.info['floatShares'] = None
    try:
        stk.info['sharesOutstanding']
    except:
        stk.info['sharesOutstanding'] = None
    if stk.info['floatShares']:
        mcap = prc * stk.info['floatShares']
    elif stk.info['sharesOutstanding']:
        mcap = prc * stk.info['sharesOutstanding']
    else:
        mcap = prc * (stk.info['marketCap'] / stk.info['previousClose'])
    return mcap

def process_dfprc(constituents):
    pool = ThreadPool(len(constituents))
    for idx, prc in enumerate(pool.imap(partial(process_prc, data), constituents)):
        try:
            df_prc = pd.concat([df_prc, prc], axis=1)  # if the dataframe already exists, join the newly downloaded data to the existing table
        except:
            df_prc = prc  # create the dataframe for the first ticker
    return df_prc

def process_constituents(constituents):
    # Process all the tickers with a thread pool:
    pool = ThreadPool(len(constituents))
    for idx, mcap in enumerate(pool.imap(partial(process_constituent, data), constituents)):
        #if idx == 0:
        #    df_mcap = mcap
        #else:
        #    df_mcap = pd.concat([df_mcap, mcap], axis=1)
        try:
            df_mcap = pd.concat([df_mcap, mcap], axis=1)
        except:
            df_mcap = mcap
    return df_mcap

page = pd.read_html('https://en.wikipedia.org/wiki/EURO_STOXX_50')
constituents = page[2]['Ticker']  # we only need tickers
constituents.pop(46)  # Ticker UMG.AS is removed because otherwise the for loop produces an error
Calculating df_mcap:
if __name__ == '__main__':
    df_mcap = process_constituents(constituents)
    df_mcap
Prints:
ADS.DE ADYEN.AS AD.AS AI.PA AIR.PA ALV.DE ABI.BR ASML.AS CS.PA BAS.DE ... SAF.PA SAN.PA SAP.DE SU.PA SIE.DE STLA.MI TTE.PA DG.PA VOW.DE VNA.DE
Date
2021-01-04 5.083322e+10 4.880933e+10 2.414267e+10 6.200227e+10 6.001130e+10 7.936595e+10 5.306992e+10 1.647683e+11 3.507867e+10 5.679746e+10 ... 4.005571e+10 8.460465e+10 1.080643e+11 6.363292e+10 9.197665e+10 2.174885e+10 8.763420e+10 3.746300e+10 3.842743e+10 3.316382e+10
2021-01-05 4.983515e+10 4.800875e+10 2.403104e+10 6.134340e+10 5.997124e+10 7.855080e+10 5.304209e+10 1.650319e+11 3.506423e+10 5.636858e+10 ... 4.014193e+10 8.459394e+10 1.077770e+11 6.303187e+10 9.178897e+10 2.169038e+10 8.994003e+10 3.682896e+10 3.849667e+10 3.332026e+10
2021-01-06 5.019652e+10 4.548888e+10 2.411223e+10 6.147971e+10 6.019823e+10 8.263458e+10 5.451703e+10 1.633893e+11 3.656208e+10 5.896818e+10 ... 4.010744e+10 8.407989e+10 1.082285e+11 6.478275e+10 9.530789e+10 2.123436e+10 9.389289e+10 3.824197e+10 3.886594e+10 3.197940e+10
2021-01-07 4.964585e+10 4.500328e+10 2.407163e+10 6.195684e+10 5.983105e+10 8.195529e+10 5.417381e+10 1.638151e+11 3.678766e+10 5.987848e+10 ... 3.993501e+10 8.323385e+10 1.072435e+11 6.611552e+10 9.720029e+10 2.161438e+10 9.491911e+10 3.830537e+10 3.946601e+10 3.177828e+10
2021-01-08 5.078160e+10 4.610573e+10 2.400059e+10 6.232034e+10 6.015150e+10 8.221501e+10 5.367288e+10 1.687430e+11 3.675157e+10 6.002728e+10 ... 4.012468e+10 8.437975e+10 1.089467e+11 6.682110e+10 9.696569e+10 2.121390e+10 9.498246e+10 3.879449e+10 3.893518e+10 3.219170e+10
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-11-22 4.851034e+10 6.454539e+10 3.178177e+10 7.108912e+10 7.262242e+10 8.481195e+10 4.905974e+10 3.047505e+11 4.871214e+10 5.686775e+10 ... 3.862038e+10 9.936060e+10 1.269160e+11 8.638617e+10 1.251472e+11 3.187450e+10 1.074238e+11 4.175263e+10 6.518791e+10 2.937961e+10
2021-11-23 4.727562e+10 6.298360e+10 3.187473e+10 7.017166e+10 7.361048e+10 8.489549e+10 4.868553e+10 2.879491e+11 4.890396e+10 5.780438e+10 ... 3.883494e+10 9.827092e+10 1.247044e+11 8.521374e+10 1.230764e+11 3.105655e+10 1.086654e+11 4.224230e+10 6.532881e+10 2.843048e+10
2021-11-24 4.667566e+10 6.206490e+10 3.153388e+10 7.028287e+10 7.441161e+10 8.483284e+10 4.900361e+10 2.891317e+11 4.899028e+10 5.683102e+10 ... 3.892492e+10 9.708117e+10 1.240785e+11 8.322063e+10 1.219527e+11 3.068774e+10 1.092482e+11 4.271798e+10 6.326233e+10 2.985586e+10
2021-11-25 4.659740e+10 6.453227e+10 3.159068e+10 7.013459e+10 7.494570e+10 8.464487e+10 5.084663e+10 2.899473e+11 4.885600e+10 5.657391e+10 ... 3.898721e+10 9.634731e+10 1.249965e+11 8.351906e+10 1.232691e+11 3.050516e+10 1.091722e+11 4.279726e+10 6.298054e+10 3.005144e+10
2021-11-26 4.405841e+10 6.358732e+10 3.132214e+10 6.882791e+10 6.633355e+10 7.996257e+10 4.789032e+10 2.795891e+11 4.646787e+10 5.317635e+10 ... 3.498675e+10 9.452378e+10 1.201978e+11 8.077986e+10 1.165751e+11 2.842012e+10 1.026981e+11 3.918302e+10 6.072620e+10 2.859604e+10
233 rows × 49 columns
Calculating df_prc:
if __name__ == '__main__':
    df_prc = process_dfprc(constituents)
    df_prc
Prints:
ADS.DE ADYEN.AS AD.AS AI.PA AIR.PA ALV.DE ABI.BR ASML.AS CS.PA BAS.DE ... SAF.PA SAN.PA SAP.DE SU.PA SIE.DE STLA.MI TTE.PA DG.PA VOW.DE VNA.DE
Date
2021-01-04 292.307343 1859.5 23.374092 133.809341 89.889999 190.011627 56.726482 404.039764 18.287489 61.853455 ... 115.747612 76.089233 103.588997 119.404495 114.593018 11.912073 34.584999 80.331764 163.641632 57.650417
2021-01-05 286.568085 1829.0 23.266014 132.387405 89.830002 188.060059 56.696735 404.686218 18.279963 61.386387 ... 115.996742 76.079597 103.313599 118.276649 114.359184 11.880051 35.494999 78.972191 163.936478 57.922352
2021-01-06 288.646088 1733.0 23.344616 132.681595 90.169998 197.837112 58.273296 400.658264 19.060837 64.217407 ... 115.897095 75.617287 103.746368 121.562111 118.743378 11.630281 37.055000 82.002113 165.509003 55.591476
2021-01-07 285.479584 1714.5 23.305313 133.711288 89.620003 196.210800 57.906425 401.702545 19.178438 65.208740 ... 115.398827 74.856400 102.802139 124.062988 121.101105 11.838422 37.459999 82.138069 168.064377 55.241844
2021-01-08 292.010498 1756.5 23.236538 134.495789 90.099998 196.832611 57.370987 413.786438 19.159622 65.370781 ... 115.946915 75.886971 104.434860 125.386978 120.808823 11.619075 37.485001 83.186890 165.803864 55.960529
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
2021-11-22 278.950012 2459.0 30.770000 153.419998 108.779999 203.050003 52.439999 747.299988 25.395000 61.930000 ... 111.599998 89.360001 121.660004 162.100006 155.919998 17.458000 42.395000 89.529999 277.600006 51.072109
2021-11-23 271.850006 2399.5 30.860001 151.440002 110.260002 203.250000 52.040001 706.099976 25.495001 62.950001 ... 112.220001 88.379997 119.540001 159.899994 153.339996 17.010000 42.884998 90.580002 278.200012 49.422203
2021-11-24 268.399994 2364.5 30.530001 151.679993 111.459999 203.100006 52.380001 709.000000 25.540001 61.889999 ... 112.480003 87.309998 118.940002 156.160004 151.940002 16.808001 43.115002 91.599998 269.399994 51.900002
2021-11-25 267.950012 2458.5 30.584999 151.360001 112.260002 202.649994 54.349998 711.000000 25.469999 61.610001 ... 112.660004 86.650002 119.820000 156.720001 153.580002 16.708000 43.084999 91.769997 268.200012 52.240002
2021-11-26 253.350006 2422.5 30.325001 148.539993 99.360001 191.440002 51.189999 685.599976 24.225000 57.910000 ... 101.099998 85.010002 115.220001 151.580002 145.240005 15.566000 40.529999 84.019997 258.600006 49.709999
233 rows × 49 columns
Related
I want to calculate the difference between two time columns without counting non-business hours. I have used pyholidays, which worked fine, but even when I define starttime and endtime for businessDuration, the result still includes non-business hours, as shown in the attached photos.
for index, row in df.iterrows():
    first = row['New']
    second = row['Assigned']
    third = row['In Progress']
    if pd.notnull(second):
        starttime = (8, 0, 0)
        endtime = (17, 0, 0)
        holidaylist = pyholidays.Germany()
        unit = 'hour'
        row['AP'] = businessDuration(first, second, holidaylist=holidaylist, unit=unit)
    else:
        starttime = (8, 0, 0)
        endtime = (17, 0, 0)
        holidaylist = pyholidays.Germany()
        unit = 'hour'
        row['AP'] = businessDuration(first, third, holidaylist=holidaylist, unit=unit)
    ap.append(row['AP'])
(Screenshots attached: the DataFrame and the printed result.)
Thank you for your suggestion. I have tried your method and also defined a Calendar instance. I was then getting a 'relativedelta' error, which I solved via 'dateutil'. Now I am at the final stage of computing the business-hour difference between two columns.
de_holidays = pyholidays.Germany()
cal = Calendar(holidays=de_holidays, weekdays=['Saturday', 'Sunday'])
df['rp'] = df.apply(lambda row: compute_bizhours_diff(row['Resolved'], row['Pending'], cal=cal, biz_open_time=time(8, 0, 0), biz_close_time=time(17, 0, 0)), axis=1)
Now I am getting an error saying the month number cannot be NaN. I have attached photos of the errors.
I do not know if this works, but try this:
# == Imports needed ===========================
from __future__ import annotations

from typing import Any

import pandas as pd
import holidays as pyholidays
from datetime import time
from bizdays import Calendar
from dateutil.relativedelta import relativedelta

# == Functions ==================================
def is_null_dates(*dates: Any) -> bool:
    """Determine whether objects are valid dates.

    Parameters
    ----------
    dates : Any
        Variables to check whether they hold a valid date, or not.

    Returns
    -------
    bool
        True, if at least one informed value is not a date.
        False otherwise.
    """
    for date in dates:
        if pd.isna(pd.to_datetime(date, errors='coerce')):
            return True
    return False
def compute_bizhours_diff(
    start_date: str | pd.Timestamp,
    end_date: str | pd.Timestamp,
    biz_open_time: datetime.time | None = None,
    biz_close_time: datetime.time | None = None,
    cal: bizdays.Calendar | None = None,
) -> float:
    """Compute the number of business hours between two dates.

    Parameters
    ----------
    start_date : str | pd.Timestamp
        The first date.
    end_date : str | pd.Timestamp
        The final date.
    biz_open_time : datetime.time | None
        The beginning hour/minute of a business day.
    biz_close_time : datetime.time | None
        The ending hour/minute of a business day.
    cal : bizdays.Calendar | None
        The calendar object used to figure out the number of days between
        `start_date` and `end_date` that are not holidays. If None, consider
        every day as a business day, except Saturdays and Sundays.

    Returns
    -------
    float
        The total number of business hours between `start_date` and `end_date`.

    Examples
    --------
    >>> import holidays as pyholidays
    >>> from datetime import time
    >>> from bizdays import Calendar
    >>> # 2022-09-07 is a national holiday in Brazil, therefore only
    >>> # the hours between 2022-09-08 09:00:00, and 2022-09-08 15:48:00
    >>> # should be considered. This should equal 6.8 hours.
    >>> start_date = pd.to_datetime('2022-09-07 15:55:00')
    >>> end_date = pd.to_datetime('2022-09-08 15:48:00')
    >>> BR_holiday_list = pyholidays.BR(years={start_date.year, end_date.year}, state='RJ')
    >>> cal = Calendar(holidays=BR_holiday_list, weekdays=['Saturday', 'Sunday'])
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    6.8
    >>> # Both dates in the next example are holidays, therefore the result should be 0.0
    >>> start_date = pd.to_datetime('2022-09-07 15:55:00')
    >>> end_date = pd.to_datetime('2022-09-07 15:48:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    0.0
    >>> # What if end_date precedes start_date by mistake?
    >>> # In such cases, we switch start_date to end_date, and vice-versa.
    >>> start_date = pd.to_datetime('2022-09-02 00:00:00')
    >>> end_date = pd.to_datetime('2022-09-01 15:55:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    2.0833333333333335
    >>> # What if start_date and end_date begin and finish on the same day,
    >>> # but both have timestamps before or after business hours?
    >>> # In such cases, the total number of hours is equal to 0.0
    >>> start_date = pd.to_datetime('2022-09-02 00:00:00')
    >>> end_date = pd.to_datetime('2022-09-02 8:00:00')
    >>> print(compute_bizhours_diff(start_date, end_date, cal=cal))
    0.0
    """
    if is_null_dates(start_date, end_date):
        return pd.NA
    if biz_open_time is None:
        biz_open_time = time(9, 0, 0)
    if biz_close_time is None:
        biz_close_time = time(18, 0, 0)
    if cal is None:
        cal = Calendar(weekdays=['Saturday', 'Sunday'])

    open_delta = relativedelta(hour=biz_open_time.hour, minute=biz_open_time.minute)
    end_delta = relativedelta(hour=biz_close_time.hour, minute=biz_close_time.minute)

    start_date = pd.to_datetime(start_date)
    end_date = pd.to_datetime(end_date)

    _end_date = max(start_date, end_date)
    _start_date = min(start_date, end_date)
    start_date = _start_date
    end_date = _end_date

    start_date = (
        start_date if cal.isbizday(start_date) else cal.following(start_date) + open_delta
    )
    end_date = (
        end_date if cal.isbizday(end_date) else cal.preceding(end_date) + end_delta
    )
    if end_date < start_date:
        return 0.00

    start_date_biz = max(start_date, start_date + open_delta)
    end_first_day = start_date_biz + end_delta
    end_date_biz = min(
        end_date,
        end_date + end_delta
    )
    start_last_day = end_date_biz + open_delta

    if start_last_day > end_date:
        end_date_biz = start_last_day
    if end_first_day < start_date:
        end_first_day = start_date_biz

    if end_first_day.date() == end_date_biz.date():
        return (end_date_biz - start_date_biz).seconds / 3600

    return (
        (end_first_day - start_date_biz).seconds
        + (end_date_biz - start_last_day).seconds
        + (
            max((len(list(cal.seq(start_date, end_date))) - 2), 0)
            * (end_first_day - (start_date + open_delta)).seconds
        )
    ) / 3600
Before running the preceding code, you need to install the following packages, if you do not already have them:
pip install holidays bizdays
Link to both packages' documentation:
bizdays
python-holidays
Examples
Here is how you can use compute_bizhours_diff:
import pandas as pd
import holidays as pyholidays
from datetime import time
from bizdays import Calendar
# OPTIONAL: define custom start, and end to your business hours.
biz_open_time = time(9, 0, 0)
biz_close_time = time(18, 0, 0)
# Define your start, and end dates.
start_date = pd.to_datetime('2022-09-07 04:48:00')
end_date = pd.to_datetime('2022-09-10 15:55:00')
# Create a list of holidays, and create a Calendar instance.
BR_holiday_list = pyholidays.BR(years={start_date.year, end_date.year}, state='RJ')
# For German holidays, you can use something like:
German_holiday_list = pyholidays.Germany(years={start_date.year, end_date.year})
# Define the Calendar instance. Here, we use the German holidays, excluding Saturday, and Sunday from weekdays.
cal = Calendar(holidays=German_holiday_list, weekdays=['Saturday', 'Sunday'])
# Finally, compute the total number of working hours between your two dates:
compute_bizhours_diff(start_date, end_date, cal=cal)
# Returns: 27.0
You can also use the function with pandas dataframes, using apply:
df['working_hours_delta'] = df.apply(lambda row: compute_bizhours_diff(row[START_DATE_COLNAME], row[END_DATE_COLNAME], cal=cal), axis=1)
Notes
The function compute_bizhours_diff is far from perfect. Before using it in any production environment, or for any serious use case, I strongly recommend refactoring it.
Edit
I made some changes to the original answer, to account for instances where start_date, or end_date have null or invalid representations of dates.
Using the example dataframe from your question it now runs fine:
de_holidays = pyholidays.Germany()
cal = Calendar(holidays=de_holidays, weekdays=['Saturday', 'Sunday'])

df = pd.DataFrame(
    {
        'Assigned': [None, '2022-07-28 10:53:00', '2022-07-28 18:08:00', None, '2022-07-29 12:56:00'],
        'In Progress': ['2022-08-01 10:53:00', '2022-08-02 09:32:00', '2022-07-29 12:08:00', '2022-08-02 10:23:00', '2022-07-29 14:54:00'],
        'New': ['2022-07-27 15:01:00', '2022-07-28 10:09:00', '2022-07-28 13:37:00', '2022-07-29 00:12:00', '2022-07-29 09:51:00'],
    }
).apply(pd.to_datetime)

df['rp'] = df.apply(
    lambda row: compute_bizhours_diff(
        row['Assigned'], row['In Progress'], cal=cal,
        biz_open_time=time(8, 0, 0), biz_close_time=time(17, 0, 0)
    ), axis=1
)
print(df)
# Prints:
# Assigned In Progress New rp
# 0 NaT 2022-08-01 10:53:00 2022-07-27 15:01:00 <NA>
# 1 2022-07-28 10:53:00 2022-08-02 09:32:00 2022-07-28 10:09:00 25.65
# 2 2022-07-28 18:08:00 2022-07-29 12:08:00 2022-07-28 13:37:00 4.133333
# 3 NaT 2022-08-02 10:23:00 2022-07-29 00:12:00 <NA>
# 4 2022-07-29 12:56:00 2022-07-29 14:54:00 2022-07-29 09:51:00 1.966667
I have this error:
KeyError: 'id_cont'
During handling of the above exception, another exception occurred:
<ipython-input-11-4604edb9a0b7> in generateID(self, outputMode, data_df)
84
85 if outputMode.getModeCB() == CONST_MODE_CONT:
---> 86 data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'-'+row['hour_local'],axis=1)
87 #data_df['id_cont'] = data_df.apply(lambda row:row['equipement']+'-'+row['product_name']+'-'+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
88 else:
/dataiku/dss_data/code-envs/python/Python3_6/lib/python3.6/site-packages/pandas/core/frame.py in __setitem__(self, key, value)
2936 else:
2937 # set column
-> 2938 self._set_item(key, value)
2939
2940 def _setitem_slice(self, key, value):
ValueError: Wrong number of items passed 149, placement implies 1
Adding this line brings up the error above; I think it's a data type problem:
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'-'+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
hour_shift is a datetime, and product_name and equipement are object columns.
I think you're getting this error because data_df is an empty dataframe: no rows satisfy the condition data_df['hour_local'].isin(target_hours), so every hour_shift value is NaT and all rows are dropped at data_df = data_df.dropna(subset=['hour_shift']). You can test this by comparing sample data whose hour_local values satisfy the condition against data that doesn't.
Satisfy condition:
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
data_df = pd.DataFrame({'local_time': [datetime.strptime("08:30:00",'%H:%M:%S'), datetime.strptime("08:24:00",'%H:%M:%S')], 'product_name': ['A', 'B']})
delta = timedelta(minutes=5)
# Start time
start_time = datetime.strptime("08:20:00",'%H:%M:%S')
cur_time = start_time
target_hours = []
while cur_time.date() <= start_time.date():
    target_hours.append(cur_time.time())
    cur_time += delta
data_df['hour_local'] = pd.to_datetime(data_df["local_time"].astype(str)).dt.time
data_df = data_df.drop(columns=['hour_shift'], errors='ignore')
data_df.loc[data_df['hour_local'].isin(target_hours),'hour_shift'] = data_df['local_time']
data_df = data_df.sort_values(by=['local_time'])
data_df['hour_shift'] = data_df['hour_shift'].ffill()
data_df = data_df.dropna(subset=['hour_shift'])
# This will print dataframe with one row
print(data_df)
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
print(data_df)
Not satisfy condition:
from datetime import datetime
from datetime import timedelta
import time
import pandas as pd
# NOTE: no data satisfy the below condition
data_df = pd.DataFrame({'local_time': [datetime.strptime("08:31:00",'%H:%M:%S'), datetime.strptime("08:24:00",'%H:%M:%S')], 'product_name': ['A', 'B']})
delta = timedelta(minutes=5)
# Start time
start_time = datetime.strptime("08:20:00",'%H:%M:%S')
cur_time = start_time
target_hours = []
while cur_time.date() <= start_time.date():
    target_hours.append(cur_time.time())
    cur_time += delta
data_df['hour_local'] = pd.to_datetime(data_df["local_time"].astype(str)).dt.time
data_df = data_df.drop(columns=['hour_shift'], errors='ignore')
data_df.loc[data_df['hour_local'].isin(target_hours),'hour_shift'] = data_df['local_time']
data_df = data_df.sort_values(by=['local_time'])
data_df['hour_shift'] = data_df['hour_shift'].ffill()
data_df = data_df.dropna(subset=['hour_shift'])
# This will print empty dataframe
print(data_df)
data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
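By the way, here is a distilled repro (my own toy example, not the question's data) of why the empty frame raises this particular ValueError: DataFrame.apply(..., axis=1) on an empty frame returns an empty DataFrame that still carries all the original columns, and assigning that multi-column result to the single column id_cont fails:

import pandas as pd

# empty two-column frame; the question's frame has 149 columns,
# hence "Wrong number of items passed 149, placement implies 1"
empty_df = pd.DataFrame({'product_name': [], 'hour_shift': []})
out = empty_df.apply(lambda row: row['product_name'] + '-' + str(row['hour_shift']), axis=1)
print(type(out), out.shape)  # <class 'pandas.core.frame.DataFrame'> (0, 2)
empty_df['id_cont'] = out    # ValueError: Wrong number of items passed 2, placement implies 1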
One way to avoid this error is to add a check so the apply line only runs when the dataframe is not empty:
if len(data_df):
    data_df['id_cont'] = data_df.apply(lambda row:row['product_name']+'- '+row['hour_shift'].strftime('%Y-%m-%d %H:%M:%S'),axis=1)
    print(data_df)
I'm downloading historical candlestick data for multiple crypto pairs across different timeframes from the Binance API. I would like to know how to sort this data by pair and timeframe, and check which pair on which timeframe triggers my strategy code. The following code is what I use to get historical data:
import requests

class BinanceFuturesClient:
    def __init__(self):
        self.base_url = "https://fapi.binance.com"

    def make_requests(self, method, endpoint, data):
        if method == "GET":
            response = requests.get(self.base_url + endpoint, params=data)
            return response.json()

    def get_symbols(self):
        symbols = []
        exchange_info = self.make_requests("GET", "/fapi/v1/exchangeInfo", None)
        if exchange_info is not None:
            for symbol in exchange_info['symbols']:
                if symbol['contractType'] == 'PERPETUAL' and symbol['quoteAsset'] == 'USDT':
                    symbols.append(symbol['pair'])
        return symbols

    def initial_historical_data(self, symbol, interval):
        data = dict()
        data['symbol'] = symbol
        data['interval'] = interval
        data['limit'] = 35
        raw_candle = self.make_requests("GET", "/fapi/v1/klines", data)
        candles = []
        if raw_candle is not None:
            for c in raw_candle:
                candles.append(float(c[4]))
            return candles[:-1]
Running this code
print(binance.initial_historical_data("BTCUSDT", "5m"))
returns this output:
[55673.63, 55568.0, 55567.89, 55646.19, 55555.0, 55514.53, 55572.46, 55663.91, 55792.83, 55649.43,
55749.98, 55680.0, 55540.25, 55470.44, 55422.01, 55350.0, 55486.56, 55452.45, 55507.03, 55390.23,
55401.39, 55478.63, 55466.48, 55584.2, 55690.03, 55760.81, 55515.57, 55698.35, 55709.78, 55760.42,
55719.71, 55887.0, 55950.0, 55980.47]
which is a list of closes.
I want to loop through the code in such a way that I can return all the close prices for the pairs and timeframes I need, and sort them accordingly. I gave it a try but am stuck at this point:
period = ["1m", "3m", "5m", "15m"]
binance = BinanceFuturesClient()
symbols = binance.get_symbols()

for symbol in symbols:
    for tf in period:
        historical_candles = binance.initial_historical_data(symbol, tf)
        # store values and run through strategy
You can use my code posted below. It requires the python-binance package to be installed in your environment, plus an API key/secret from your Binance account. The method loads data in weekly chunks (parameter step) and supports resending requests after a timeout on failure, which helps when you need to fetch a huge amount of data.
import pandas as pd
import pytz, time, datetime
from binance.client import Client
from tqdm.notebook import tqdm

def binance_client(api_key, secret_key):
    return Client(api_key=api_key, api_secret=secret_key)

def load_binance_data(client, symbol, start='1 Jan 2017 00:00:00', timeframe='1M', step='4W', timeout_sec=5):
    tD = pd.Timedelta(timeframe)
    now = (pd.Timestamp(datetime.datetime.now(pytz.UTC).replace(second=0)) - tD).strftime('%d %b %Y %H:%M:%S')
    tlr = pd.DatetimeIndex([start]).append(pd.date_range(start, now, freq=step).append(pd.DatetimeIndex([now])))
    print(f' >> Loading {symbol} {timeframe} for [{start} -> {now}]')
    df = pd.DataFrame()
    s = tlr[0]
    for e in tqdm(tlr[1:]):
        if s + tD < e:
            _start, _stop = (s + tD).strftime('%d %b %Y %H:%M:%S'), e.strftime('%d %b %Y %H:%M:%S')
            nerr = 0
            while nerr < 3:
                try:
                    chunk = client.get_historical_klines(symbol, timeframe.lower(), _start, _stop)
                    nerr = 100  # success: leave the retry loop
                except Exception as err:
                    nerr += 1
                    print(str(err))
                    time.sleep(10)
            if chunk:
                data = pd.DataFrame(chunk, columns=['timestamp', 'open', 'high', 'low', 'close', 'volume',
                                                    'close_time', 'quote_av', 'trades', 'tb_base_av',
                                                    'tb_quote_av', 'ignore'])
                data.index = pd.to_datetime(data['timestamp'].rename('time'), unit='ms')
                data = data.drop(columns=['timestamp', 'close_time']).astype(float).astype({
                    'ignore': bool,
                    'trades': int,
                })
                df = df.append(data)
            s = e
            time.sleep(timeout_sec)
    return df
How to use
c = binance_client(<your API code>, <your API secret>)
# loading daily data from 1/Mar/21 till now (you can use other timeframes like 1m, 5m etc.)
data = load_binance_data(c, 'BTCUSDT', '2021-03-01', '1D')
It returns an indexed DataFrame with the loaded data:
time                    open     high     low      close    volume   quote_av     trades   tb_base_av  tb_quote_av  ignore
2021-03-02 00:00:00     49595.8  50200    47047.6  48440.7  64221.1  3.12047e+09  1855583  31377       1.52515e+09  False
2021-03-03 00:00:00     48436.6  52640    48100.7  50349.4  81035.9  4.10952e+09  2242131  40955.4     2.07759e+09  False
2021-03-04 00:00:00     50349.4  51773.9  47500    48374.1  82649.7  4.07984e+09  2291936  40270       1.98796e+09  False
2021-03-05 00:00:00     48374.1  49448.9  46300    48751.7  78192.5  3.72713e+09  2054216  38318.3     1.82703e+09  False
2021-03-06 00:00:00     48746.8  49200    47070    48882.2  44399.2  2.14391e+09  1476474  21500.6     1.03837e+09  False
The next steps are up to you and depend on how you would like to design your data structure. In the simplest case you could store the data in dictionaries:
from collections import defaultdict

data = defaultdict(dict)
for symbol in ['BTCUSDT', 'ETHUSDT']:
    for tf in ['1d', '1w']:
        historical_candles = load_binance_data(c, symbol, '2021-05-01', timeframe=tf)
        # store values and run through strategy
        data[symbol][tf] = historical_candles
To access your OHLC data you then just need the following: data['BTCUSDT']['1d'], etc.
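For instance, a small sketch (hypothetical, assuming the dictionaries built above) of walking that nested structure:

# iterate every pair/timeframe combination and inspect the latest close
for symbol, by_tf in data.items():
    for tf, candles in by_tf.items():
        last_close = candles['close'].iloc[-1]
        print(symbol, tf, last_close)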
So I am trying to get multiple stock prices using pandas and pandas-datareader. If I only try to import one ticker it runs fine, but if I use more than one then an error arises. The code is:
import pandas as pd
import pandas_datareader as web
import datetime as dt
stocks = ['BA', 'AMD']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
Though I get the error:
ValueError: Wrong number of items passed 2, placement implies 1
So how do I get around it only allowing me to pass one stock?
So far I have tried using quandl and google instead, which don't work either. I have also tried pdr.get_data_yahoo and yf.download(), and still get the same issue. Does anyone have any ideas to get around this? Thank you.
EDIT: Full code:
import pandas as pd
import pandas_datareader as web
import datetime as dt
import yfinance as yf
import numpy as np
stocks = ['BA', 'AMD', 'AAPL']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
d['sma50'] = np.round(d['Close'].rolling(window=2).mean(), decimals=2)
d['sma200'] = np.round(d['Close'].rolling(window=14).mean(), decimals=2)
d['200-50'] = d['sma200'] - d['sma50']
_buy = -2
d['Crossover_Long'] = np.where(d['200-50'] < _buy, 1, 0)
d['Crossover_Long_Change']=d.Crossover_Long.diff()
d['buy'] = np.where(d['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d['sell'] = np.where(d['Crossover_Long_Change'] == -1, 'sell', 'n/a')
pd.set_option('display.max_rows', 5093)
d.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d.dropna(inplace=True)
#make 2 dataframe
d.set_index(d['Adj Close'], inplace=True)
buy_price = d.index[d['Crossover_Long_Change']==1]
sell_price = d.index[d['Crossover_Long_Change']==-1]
d['Crossover_Long_Change'].value_counts()
profit_loss = (sell_price - buy_price)*10
commision = buy_price*.01
position_value = (buy_price + commision)*10
percent_return = (profit_loss/position_value)*100
percent_rounded = np.round(percent_return, decimals=2)
prices = {
    "Buy Price": buy_price,
    "Sell Price": sell_price,
    "P/L": profit_loss,
    "Return": percent_rounded
}
df = pd.DataFrame(prices)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return'].sum(), decimals=2),
                                                               np.round(df['P/L'].sum(), decimals=2)))
d
I tried 3 stocks in your code and it returns data for all 3, not sure I understood the problem you're facing?
import pandas as pd
import pandas_datareader as web
import datetime as dt
stocks = ['BA', 'AMD', 'AAPL']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d = web.DataReader(stocks, 'yahoo', start, end)
print(d)
Output:
Attributes Adj Close Close ... Open Volume
Symbols BA AMD AAPL BA AMD AAPL ... BA AMD AAPL BA AMD AAPL
Date ...
2018-01-02 282.886383 10.980000 166.353714 296.839996 10.980000 172.259995 ... 295.750000 10.420000 170.160004 2978900.0 44146300.0 25555900.0
2018-01-03 283.801239 11.550000 166.324722 297.799988 11.550000 172.229996 ... 295.940002 11.610000 172.529999 3211200.0 154066700.0 29517900.0
2018-01-04 282.724396 12.120000 167.097290 296.670013 12.120000 173.029999 ... 297.940002 12.100000 172.539993 4171700.0 109503000.0 22434600.0
2018-01-05 294.322296 11.880000 168.999741 308.839996 11.880000 175.000000 ... 296.769989 12.190000 173.440002 6177700.0 63808900.0 23660000.0
2018-01-08 295.570740 12.280000 168.372040 310.149994 12.280000 174.350006 ... 308.660004 12.010000 174.350006 4124900.0 63346000.0 20567800.0
... ... ... ... ... ... ... ... ... ... ... ... ... ...
2019-12-24 331.030457 46.540001 282.831299 333.000000 46.540001 284.269989 ... 339.510010 46.099998 284.690002 4120100.0 44432200.0 12119700.0
2019-12-26 327.968689 46.630001 288.442780 329.920013 46.630001 289.910004 ... 332.700012 46.990002 284.820007 4593400.0 57562800.0 23280300.0
2019-12-27 328.187408 46.180000 288.333313 330.140015 46.180000 289.799988 ... 330.200012 46.849998 291.119995 4124000.0 36581300.0 36566500.0
2019-12-30 324.469513 45.520000 290.044617 326.399994 45.520000 291.519989 ... 330.500000 46.139999 289.459991 4525500.0 41149700.0 36028600.0
2019-12-31 323.833313 45.860001 292.163818 325.760010 45.860001 293.649994 ... 325.410004 45.070000 289.929993 4958800.0 31673200.0 25201400.0
I think the error comes from your moving average and the line
d['sma50'] = np.round(d['Close'].rolling(window=2).mean(), decimals=2)
because d represents 3 stocks; I think you have to separate each stock and compute the moving average separately.
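A minimal sketch of that idea (my own, assuming numpy is imported as np): d['Close'] is itself a DataFrame with one column per ticker, so the rolling means can be computed for all tickers at once; the ValueError only appears when that multi-column result is assigned to a single column such as d['sma50'], so each ticker gets its own column instead:

close = d['Close']                                 # one column per ticker
sma50 = np.round(close.rolling(window=2).mean(), decimals=2)
sma200 = np.round(close.rolling(window=14).mean(), decimals=2)
for ticker in close.columns:
    d[('sma50', ticker)] = sma50[ticker]           # one MultiIndex column per ticker
    d[('sma200', ticker)] = sma200[ticker]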
EDIT: I tried something for two stocks only (BA and AMD), but it is not the best solution because I keep repeating myself for every line. I'm just a beginner in Python, but maybe this will help you find a solution to your problem.
PS: The last line (the printing of the P&L and Return) doesn't work really well.
import pandas as pd
import pandas_datareader as web
import datetime as dt
import numpy as np

stock1 = ['BA']
stock2 = ['AMD']
start = dt.datetime(2018, 1, 1)
end = dt.datetime(2020, 1, 1)
d1 = web.DataReader(stock1, 'yahoo', start, end)
d2 = web.DataReader(stock2, 'yahoo', start, end)

d1['sma50'] = np.round(d1['Close'].rolling(window=2).mean(), decimals=2)
d2['sma50'] = np.round(d2['Close'].rolling(window=2).mean(), decimals=2)
d1['sma200'] = np.round(d1['Close'].rolling(window=14).mean(), decimals=2)
d2['sma200'] = np.round(d2['Close'].rolling(window=14).mean(), decimals=2)
d1['200-50'] = d1['sma200'] - d1['sma50']
d2['200-50'] = d2['sma200'] - d2['sma50']

_buy = -2
d1['Crossover_Long'] = np.where(d1['200-50'] < _buy, 1, 0)
d2['Crossover_Long'] = np.where(d2['200-50'] < _buy, 1, 0)
d1['Crossover_Long_Change'] = d1.Crossover_Long.diff()
d2['Crossover_Long_Change'] = d2.Crossover_Long.diff()
d1['buy'] = np.where(d1['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d2['buy'] = np.where(d2['Crossover_Long_Change'] == 1, 'buy', 'n/a')
d1['sell_BA'] = np.where(d1['Crossover_Long_Change'] == -1, 'sell', 'n/a')
d2['sell_AMD'] = np.where(d2['Crossover_Long_Change'] == -1, 'sell', 'n/a')

pd.set_option('display.max_rows', 5093)
d1.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d2.drop(['High', 'Low', 'Close', 'Volume', 'Open'], axis=1, inplace=True)
d2.dropna(inplace=True)
d1.dropna(inplace=True)
d1.set_index("Adj Close", inplace=True)
d2.set_index("Adj Close", inplace=True)

buy_price_BA = np.array(d1.index[d1['Crossover_Long_Change'] == 1])
buy_price_AMD = np.array(d2.index[d2['Crossover_Long_Change'] == 1])
sell_price_BA = np.array(d1.index[d1['Crossover_Long_Change'] == -1])
sell_price_AMD = np.array(d2.index[d2['Crossover_Long_Change'] == -1])
d1['Crossover_Long_Change'].value_counts()
d2['Crossover_Long_Change'].value_counts()

profit_loss_BA = (sell_price_BA - buy_price_BA) * 10
profit_loss_AMD = (sell_price_AMD - buy_price_AMD) * 10
commision_BA = buy_price_BA * .01
commision_AMD = buy_price_AMD * .01
position_value_BA = (buy_price_BA + commision_BA) * 10
position_value_AMD = (buy_price_AMD + commision_AMD) * 10
percent_return_BA = np.round(((profit_loss_BA / position_value_BA) * 100), decimals=2)
percent_return_AMD = np.round(((profit_loss_AMD / position_value_AMD) * 100), decimals=2)

prices_BA = {
    "Buy Price BA": [buy_price_BA],
    "Sell Price BA": [sell_price_BA],
    "P/L BA": [profit_loss_BA],
    "Return BA": [percent_return_BA]}
df = pd.DataFrame(prices_BA)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return BA'].sum(), decimals=2),
                                                               np.round(df['P/L BA'].sum(), decimals=2)))

prices_AMD = {
    "Buy Price AMD": [buy_price_AMD],
    "Sell Price AMD": [sell_price_AMD],
    "P/L AMD": [profit_loss_AMD],
    "Return AMD": [percent_return_AMD]}
df = pd.DataFrame(prices_AMD)
print('The return was {}%, and profit or loss was ${} '.format(np.round(df['Return AMD'].sum(), decimals=2),
                                                               np.round(df['P/L AMD'].sum(), decimals=2)))
It seems like there's a bug in the pandas datareader. I work around it by initialising with one symbol and then setting the symbols property on the instantiated object. After doing that, calling read() on tmp below works fine.
import pandas_datareader as pdr
all_symbols = ['ibb', 'xly', 'fb', 'exx1.de']
tmp = pdr.yahoo.daily.YahooDailyReader(symbols=all_symbols[0])
# this is a work-around, pdr is broken...
tmp.symbols = all_symbols
data = tmp.read()
So I am trying to build trading software and am using code from an online YouTuber. I gather all of the data for the companies on the S&P 500 in the get_data_from_yahoo() function. When I run that code it says "Already have" (then the given ticker), which is fine, but when I go to print the data in the following function, complied_data(), it only prints one ticker, which is ZTS.
Anyone have any ideas?
import bs4 as bs
import datetime as dt
import os
import pandas as pd
from pandas_datareader import data as pdr
import pickle
import requests
import fix_yahoo_finance as yf

def save_sp500_tickers():
    resp = requests.get('http://en.wikipedia.org/wiki/List_of_S%26P_500_companies')
    soup = bs.BeautifulSoup(resp.text, 'lxml')
    table = soup.find('table', {'class': 'wikitable sortable'})
    tickers = []
    for row in table.findAll('tr')[1:]:
        ticker = row.findAll('td')[0].text.replace('.', '-')
        ticker = ticker[:-1]
        tickers.append(ticker)
    with open("sp500tickers.pickle", "wb") as f:
        pickle.dump(tickers, f)
    print(tickers)
    return tickers

save_sp500_tickers()

def get_data_from_yahoo(reload_sp500=False):
    if reload_sp500:
        tickers = save_sp500_tickers()
    else:
        with open("sp500tickers.pickle", "rb") as f:
            tickers = pickle.load(f)
    if not os.path.exists('stock_dfs'):
        os.makedirs('stock_dfs')
    start = dt.datetime(2019, 6, 8)
    end = dt.datetime.now()
    for ticker in tickers:
        print(ticker)
        if not os.path.exists('stock_dfs/{}.csv'.format(ticker)):
            df = pdr.get_data_yahoo(ticker, start, end)
            df.reset_index(inplace=True)
            df.set_index("Date", inplace=True)
            df.to_csv('stock_dfs/{}.csv'.format(ticker))
        else:
            print('Already have {}'.format(ticker))

save_sp500_tickers()
get_data_from_yahoo()

def complied_data():
    with open("sp500tickers.pickle", "rb") as f:
        tickers = pickle.load(f)
    main_df = pd.DataFrame()
    for count, ticker in enumerate(tickers):
        df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
        df.set_index('Date', inplace=True)
        df.rename(columns={'Adj Close': ticker}, inplace=True)
        df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], 1, inplace=True)

    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df, how='outer')

    if count % 10 == 0:
        print(count)

    print(main_df.head())
    main_df.to_csv('sp500_joined_closes.csv')

complied_data()
When I run this code this is what it says:
MMM
Already have MMM
ABT
Already have ABT
ABBV
Already have ABBV
ABMD
Already have ABMD
ACN
Already have ACN
ATVI
Already have ATVI
ADBE
Already have ADBE
AMD
Already have AMD
AAP
Already have AAP
AES
Already have AES
AMG
Already have AMG
AFL
Already have AFL
A
Already have A
APD
Already have APD
AKAM
Already have AKAM
ALK
Already have ALK
ALB
Already have ALB
It then continues to say that it already has all of the 500 companies (I did not show the whole thing because the list is very long). But when I run the complied_data() function it only prints the data for one ticker:
ZTS
Date
2019-01-02 83.945038
2019-01-03 81.043526
2019-01-04 84.223267
2019-01-07 84.730026
2019-01-08 85.991997
The problem is in a for loop, specifically the one in complied_data.
The if-else and if blocks should be included in the for loop:
for count, ticker in enumerate(tickers):
    df = pd.read_csv('stock_dfs/{}.csv'.format(ticker))
    df.set_index('Date', inplace=True)
    df.rename(columns={'Adj Close': ticker}, inplace=True)
    df.drop(['Open', 'High', 'Low', 'Close', 'Volume'], 1, inplace=True)
    if main_df.empty:
        main_df = df
    else:
        main_df = main_df.join(df, how='outer')
    if count % 10 == 0:
        print(count)
Otherwise they are evaluated only after the loop has finished, and only the last element is processed.
The following is the output when changing to the above indentation:
(... omitted counting from 0)
470
480
490
500
MMM ABT ABBV ABMD ... YUM ZBH ZION ZTS
Date ...
2019-06-10 165.332672 80.643486 74.704918 272.429993 ... 107.794380 121.242027 43.187107 109.920105
2019-06-11 165.941788 80.494644 75.889320 262.029999 ... 106.722885 120.016762 43.758469 109.860268
2019-06-12 166.040024 81.318237 76.277657 254.539993 ... 108.082100 120.225945 43.512192 111.136780
2019-06-13 165.882843 81.655624 76.646561 255.529999 ... 108.121788 119.329407 44.063854 109.730621
2019-06-14 163.760803 81.586166 76.394157 250.960007 ... 108.925407 116.998398 44.211620 110.488556
[5 rows x 505 columns]