I've built a small download manager to get data for the SHARADAR tables in Quandl. GIT
This is functioning well but the downloads are very slow for the larger files (up to 2 gb over 10 years).
I attempted to use asyncio but this didn't speed up the downloads. This may be because Quandl doesn't allow concurrent downloads. Am I making an error in my code, or is this restriction I will have to live with from Quandl?
import asyncio
import math
import time
import pandas as pd
import quandl
import update
def segment_dates(table, date_start, date_end):
# Determine the number of days per asyncio loop. Determined by the max size of the
# range of data divided by the size of the files in 100 mb chunks.
# reduce this number for smaller more frequent downloads.
total_days = 40
# Number of days per download should be:
sizer = math.ceil(total_days / update.sharadar_tables[table][2])
# Number of days between start and end.
date_diff = date_end - date_start
loop_count = int(math.ceil(date_diff.days / sizer))
sd = date_start
sync_li = []
for _ in range(loop_count):
ed = sd + pd.Timedelta(days=sizer)
if ed > date_end:
ed = date_end
sync_li.append((sd, ed,))
sd = ed + pd.Timedelta(days=1)
return sync_li
async def get_data(table, kwarg):
"""
Using the table name and kwargs retrieves the most current data.
:param table: Name of table to update.
:param kwarg: Dictionary containing the parameters to send to Quandl.
:return dataframe: Pandas dataframe containing latest data for the table.
"""
return quandl.get_table("SHARADAR/" + table.upper(), paginate=True, **kwarg)
async def main():
table = "SF1"
# Name of the column that has the date field for this particular table.
date_col = update.sharadar_tables[table][0]
date_start = pd.to_datetime("2020-03-15")
date_end = pd.to_datetime("2020-04-01")
apikey = "API Key"
quandl.ApiConfig.api_key = apikey
# Get a list containing the times start and end for loops.
times = segment_dates(table, date_start, date_end)
wait_li = []
for t in times:
kwarg = {date_col: {"gte": t[0].strftime("%Y-%m-%d"), "lte": t[1].strftime("%Y-%m-%d")}}
wait_li.append(loop.create_task(get_data(table, kwarg)))
await asyncio.wait(wait_li)
return wait_li
if __name__ == "__main__":
starter = time.time()
try:
loop = asyncio.get_event_loop()
res = loop.run_until_complete(main())
for r in res:
df = r.result()
print(df.shape)
print(df.head())
except:
raise ValueError("error")
finally:
# loop.close()
print("Finished in {}".format(time.time() - starter))
Related
This question already has an answer here:
Pandas - Explanation on apply function being slow
(1 answer)
Closed 7 months ago.
I am actively running some Python code in jupyter on a df consisting of about 84k rows. I'm estimating this is going to take somewhere in the neighborhood of 9 hours at this rate. My code is below, I have read that ideally one would vectorize for max speed but being sort of new to Python and coding in general, I'm not sure how I can go about changing the below code to vectorize it. The goal is to look at the value in the first column of the dataframe and add that value to the end of a url. I then check the first line in the url and compare it to some predetermined values to find out if there is a match. Any advice would be greatly appreciated!
#Python 3
import pandas as pd
import urllib
no_res = "Item Not Found"
error = "Does Not Exist"
for i in df1.index:
path = 'http://xxxx/xxx/xxx.pl?part=' + str(df1['ITEM_ID'][i])
parsed_path = path.replace(' ','%20')
f = urllib.request.urlopen(parsed_path)
raw = str(f.read().decode("utf-8"))
lines = raw.split('\n')
r = lines[0]
if r == no_res:
sap = 'NO'
elif r == error:
sap = 'ERROR'
else:
sap = 'YES'
df1["item exists"][i] = sap
df1["Path"][i] = path
df1["URL return value"][i] = r
Edit adding test code below
import concurrent.futures
import pandas as pd
import urllib
import numpy as np
def my_func(df_row):
no_res = "random"
error = "entered"
path = "http://www.google.com"
parsed_path = path.replace(' ','%20')
f = urllib.request.urlopen(parsed_path)
raw = str(f.read().decode("utf-8"))
lines = raw.split('\n')
r = df_row['0']
if r == no_res:
sap = "NO"
elif r == error:
sap = "ERROR"
else:
sap = "YES"
df_row['4'] = sap
df_row['5'] = lines[0]
df_row['6'] = r
n = 1000
my_df = pd.DataFrame(np.random.choice(['random','words','entered'], size=(n,3)))
my_df['4'] = ""
my_df['5'] = ""
my_df['6'] = ""
my_df = my_df.apply(lambda col: col.astype('category'))
executor = concurrent.futures.ProcessPoolExecutor(8)
futures = [executor.submit(my_func, row) for _,row in my_df.iterrows()]
concurrent.futures.wait(futures)
This is throwing the following error (shortened):
DoneAndNotDoneFutures(done={<Future at 0x1cfe4938040 state=finished raised BrokenProcessPool>, <Future at 0x1cfe48b8040 state=finished raised BrokenProcessPool>,
Since you are doing some outside operation with a URL, I do not think vectorization is a solution (let possible).
The bottleneck of your operation is the following line
f = urllib.request.urlopen(parsed_path)
This line waits for the response and is blocking, as mentioned your operation is I/O bound. The CPU can start other jobs while waiting for the response. The solution to address this is using concurrency.
Edit: My original answer was using python built-in multi threading which was problematic. The best way to do multiprocessing/threading with pandas data frame is using "dask" library.
The following code is tested with the dummy data set on my PC and on average speeds up the naive for loop by ~ 12 times.
#%%
import time
import urllib.request
import pandas as pd
import numpy as np
import dask.dataframe as dd
def my_func(df_row):
df_row = df_row.copy()
no_res = "random"
error = "entered"
path = "http://www.google.com"
parsed_path = path.replace(' ','%20')
f = urllib.request.urlopen(parsed_path)
# I had to change the encoding on my machine.
raw = str(f.read().decode("'windows-1252"))
lines = raw.split('\n')
r = df_row[0]
if r == no_res:
sap = "NO"
elif r == error:
sap = "ERROR"
else:
sap = "YES"
df_row['4'] = sap
df_row['5'] = lines[0]
df_row['6'] = r
return df_row
def run():
print("started.")
n = 1000
my_df = pd.DataFrame(np.random.choice(['random','words','entered'], size=(n,3)))
my_df = my_df.apply(lambda col: col.astype('category'))
my_df['4'] = ""
my_df['5'] = ""
my_df['6'] = ""
# Literally dask partitions the original dataframe into
# npartitions chunks and use them in apply function
# in parallel.
my_ddf = dd.from_pandas(my_df, npartitions=15)
start = time.time()
q = my_ddf.apply(my_func, axis= 1, meta=my_ddf)
# num_workers is number of threads used,
print(q.compute(num_workers= 50))
time_end = time.time()
print(f"Elapsed: {time_end - start:10.2f}")
if __name__ == "__main__":
run()
dask provides many other tools and options to facilitate concurrent processing and it would be a good idea to take a look at its documentation to investigate other options.
P.S. : if you run the above code too many times on google you will receive "HTTP Error 429: Too Many Requests". This happens to prevent something like DDoS attack on a public server. So, if for your real job you are querying a public website, you may end up receiving the same 429 response, if you try 84K queries in a short time.
I am exploring azure management APIs. The ADF monitor pipeline, returns only 100 records at a time. So I created a while loop, but for some reason, not sure what, not able to get the next token.
ct = d.get('continuationToken','')
c = 1
while ct!='':
req_body = self.getDataBody(ct)
data = self.getResponse(data_url,data_headers,req_body)
nct = self.getContinuationToken(data,c)
c = c+1
print(c)
if ct == nct:
print(ct)
print(nct)
print('duplicate token')
break
ct = nct
if ct == '':
break
Here in the next iteration next token is not getting updated.
Update:
following the functions that the above code is using
def getDataBody(self,ct):
start_date = datetime.now().strftime("%Y-%m-%d")
end_date = (datetime.now() + timedelta(days=1)).strftime("%Y-%m-%d")
data_body = {'lastUpdatedAfter': start_date, 'lastUpdatedBefore': end_date}
if ct!='':
data_body['continuationToken'] = ct
return data_body
def getResponse(self,url,headers,body):
data = requests.post(url,headers=headers,data=body)
return data.text
def getContinuationToken(self,data,c):
d = json.loads(data)
with open(f'data/{c}.json','w') as f:
json.dump(d,f)
return d.get('continuationToken','')
you can try with increasing the timeout in the ADF activity may be due to the timeout setting in your current ADF activity is less than the actual time
taking to execute that API .
I'm trying to getting data from kraken exchange by the krakenex API. But i'm facing several problems, 'cause, I want getting the data in a range time bigger than the alllowed by the API.
The API only allows getting a dataframe with 720 rows, so 'cause that I need to do a loop while to getting more data and concat in another dataframe.
I've already read other topics about it, but I'm still not reaching good results.
import krakenex
import time
import krakenex
import pandas as pd
from pykrakenapi import KrakenAPI
from datetime import datetime
k = krakenex.API()
start = '28/01/2021 00:00:00'
start = datetime.strptime(start, "%d/%m/%Y %H:%M:%S")
start = int(time.mktime(start.timetuple()))
stop = '03/02/2021 00:00:00'
stop = datetime.strptime(stop, "%d/%m/%Y %H:%M:%S")
stop = int(time.mktime(stop.timetuple()))
prices = pd.DataFrame()
while start < stop:
time.sleep(5)
data = k.query_public('OHLC', {'pair':'XXBTZUSD', 'interval':1, 'since':start})
df = pd.DataFrame( data['result']['XXBTZUSD'])
daily_prices = df[0].to_list()
start = int(daily_prices[0])
prices = pd.concat([precos , df])
For weeks I have been working on a script that does exactly that. In my case I collect all pairs with BTC and ETH but you can use the script with any pair. To do this I used the REST API and defined some functions that automate everything. I download the data with 1 minute timeframe but it can be used for any timeframe.
First I defined a function that downloads the data in full or from a specific date, it's necessary because at the first run it will download all the data and then it will download only the new data. The parameter 'interval' defines the number of minutes of the timeframe while 'since' defines the beginning of the data to download.
def get_ohlc (pair, interval=1, since='last'):
endpoint = 'https://api.kraken.com/0/public/OHLC'
payLoad = {
'pair': pair,
'interval': interval,
'since' : since
}
response = requests.get(endpoint, payLoad)
data = response.json()
OHLC = data['result'][pair]
data = pd.DataFrame.from_records(OHLC, columns=['Time', 'Open', 'High', 'Low', 'Close', 'vwap', 'volume', 'count'])
data['Time'] = pd.to_datetime(data['Time'], unit='s')
data.set_index('Time',inplace=True)
data = data.drop(['vwap', 'volume', 'count'], axis=1)
data['Open'] = data.Open.astype(float)
data['High'] = data.High.astype(float)
data['Low'] = data.Low.astype(float)
data['Close'] = data.Close.astype(float)
return data
Then I defined a function to load the .json file that was saved into memory. The function returns the dataframe with the old data and a timestamp that indicates from where to download the new data. I also created a function for calculate the timestamp.
def load_data(pair, path):
data = pd.read_json(path + pair + '.json' , orient='split')
tmp = data.tail(1).index
tmp = tmp.strftime('%Y-%m-%d %H:%M:%S')
dt = str_to_datetime(tmp[0])
ts = dt.timestamp()
return data, ts
def str_to_datetime(datestr):
Y = int(datestr[0:4])
M = int(datestr[5:7])
D = int(datestr[8:10])
H = int(datestr[11:13])
m = int(datestr[14:16])
return datetime.datetime(Y, M, D, H, m, 0, tzinfo=tz.gettz("Etc/GMT"))
Now your main should be something like:
from countdown import countdown
import pandas as pd
import datetime
import os
path = os.getcwd() + '/historical_data/'
pair = 'XBTUSD'
while True:
if os.path.exists(path + pair + '.json') == False:
data = get_ohlc(pair, 1) # 1 minute timeframe
data.to_json(path + pair + '.json', orient='split')
else:
data1, ts = load_data(pair, path)
data2 = get_ohlc(pair, 1, ts)
data3 = pd.concat([data1, data2])
data3.drop(data3.tail(1).index,inplace=True) # delete last record because it's not ended
data3.to_json(path + pair + '.json', orient='split')
countdown(60) # update every hour
I delete the last record because when you download it it's not ended so we will download at the next update. I haven't tested if it works because I took pieces of code from my program, if it doesn't work let me know and I'll fix it.
I am learning Python and had a question regarding for and if loops. This is my scenario:
I have an endpoint that i make API-call with request.get
I need to retrieve all the historic data
I have a start_date (2017-06-17)
So i need to make multiple API-call because they have a limit of 60-days period. So i made my code like this:
date = datetime.strptime("2017-06-17", "%Y-%m-%d") # Start Date
current_date = date.date() # timedelta need date object so i make it a date object
days_after = (current_date+timedelta(days=60)).isoformat() # days_after is set to 60-days because limit in API
date_string = current_date.strftime('%Y-%m-%d') # made to string again since API need string not date object
So this is how i make the dates for 60 days period. Starting from 2017-06-17 and 60-days ahead.
This is how i make the API-request:
response = requests.get("https://reporting-api/campaign?token=xxxxxxxxxx&format=json&fromDate="+date_string+"&toDate="+days_after)
response_data = response.json() # Added this because i am writing temprorary to a JSON file
This is how i write to JSON file:
if response_data:
print("WE GOT DATA") # Debugging
data = response.json() # This is duplicate?
with open('data.json', 'w') as f: # Open my data.json file as write
json.dump(data, f) # dumps my json-data from API to the file
else:
print("NO DATA") # Debugging if no data / response. Should make a skip statement here
So my question is how can i proceed with my code so that every time i make a API-call starting from 2017-06-17 the date date_string and days_after should go 60 days forward for each API-call and append those data to data.json. I would maybe need some for loops or something?
Please note i have been using Python for 3 days now, be gentle.
Thanks!
You could use a while loop that changes the start and end date until a specified condition is met. Also, you can append the response to a file for every run. the example below I used the date of "today":
import os
from datetime import datetime, timedelta
x = 0
y = 60
date = datetime.strptime("2017-06-17", "%Y-%m-%d")
current_date = date.date()
date_start = current_date+timedelta(days=x)
while date_start < datetime.now().date():
date_start = current_date+timedelta(days=x)
days_after = current_date+timedelta(days=y)
x = x + 60
y = y + 60
response = requests.get("https://reporting-api/campaign?token=xxxxxxxxxx&format=json&fromDate="+date_start.isoformat() +"&toDate="+days_after.isoformat())
response_data = response.json()
if response_data:
print("WE GOT DATA")
data = response.json()
#create a file if not exists or append new data to it.
if os.path.exists('data.json'):
append_write = 'a' # append if already exists
else:
append_write = 'w' # make a new file if not
with open('data.json', append_write) as f:
json.dump(data, f)
else:
print("NO DATA")
Basically, on every run the time of start and end is increased by 60 days and appended to the data.json file.
Been trying to extract websocket information from Bitfinex websocket client service. Below is the code. The script works fine when I search for under 30 crypto pairs (ie. "p" or "PAIRS" has 30 elements) but if I try to go higher the script never gets to the "save_data" co-routine. Any ideas why this could be happening.
I modified the script from: "https://mmquant.net/replicating-orderbooks-from-websocket-stream-with-python-and-asyncio/", kudos to Mmquant for making the code available and giving an awesome script description.
import aiohttp
import asyncio
import ujson
from tabulate import tabulate
from copy import deepcopy
import pandas as pd
from openpyxl import load_workbook
import datetime
from datetime import datetime
import numpy as np
from collections import OrderedDict
from time import sleep
"""
Load the workbook to dump the API data as well as instruct it to not generate a new sheet.
The excel work book must:
1. Be of the type ".xlsx", only this because the load_workbook function was set to call a specific sheet with .xlsx format. This can be changed.
2. Must have the worksheets, "apidata" and "Test". This can also be adjusted below.
3. The excel workbooks name is "bitfinexws.xlsx". This can be changed below.
4. The excel spreadsheet is in the same folder as this script.
"""
book = load_workbook('bitfinexwsasync.xlsx') #.xlsx Excel spreadsheet that will be used for the placement and extracting of data.
apdat = book['Sheet1'] #Assign a variable to the sheet where the trade ratios will be put. This is case sensitive.
#The next 3 lines are critical to allow overwriting of data and not creating a new worksheet when using panda dataframes.
writer = pd.ExcelWriter('bitfinexwsasync.xlsx', engine='openpyxl')
writer.book = book
writer.sheets = dict((ws.title, ws) for ws in book.worksheets)
#Get a list of all the ratios and add the standard trade url: "https://api.bitfinex.com/v1/book/" before the ratios.
burl = 'https://api.bitfinex.com/v1/book/' #This is the standard url for retrieving trade ratios, the pair symbol must be added after this.
sym = pd.read_json('https://api.bitfinex.com/v1/symbols',orient='values') #This is a list of all the symbols on the Bitfinex website.
p=[]
p=[0]*len(sym)
for i in range(0,len(sym)):
p[i]=sym.loc[i,0]
p=tuple(p)
m=len(p) #Max number of trade ratios to extract for this script. Script cannot run the full set of 105 trade ratios, it will time-out.
p=p[0:m]
d=[]
e=[]
j=[]
"""
NOTE:
The script cannot run for the full 105 pairs, it timesout and becomes unresponsive.
By testig the stability it was found that calling 21 pairs per script at a refresh rate of 5seconds did not allow for any time-out problems.
"""
print('________________________________________________________________________________________________________')
print('')
print('Bitfinex Websocket Trading Orderbook Extraction - Asynchronous.')
print('There are a total of ', len(sym), ' trade ratios in this exchange.')
print('Only ',m,' trading pairs will be extracted by this script, namely:',p)
print('Process initiated at',datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'.') #Tells me the date and time that the data extraction was intiated.
print('________________________________________________________________________________________________________')
print('')
# Pairs which generate orderbook for.
PAIRS = p
# If there is n pairs we need to subscribe to n websocket channels.
# This the subscription message template.
# For details about settings refer to https://bitfinex.readme.io/v2/reference#ws-public-order-books.
SUB_MESG = {
'event': 'subscribe',
'channel': 'book',
'freq': 'F0', #Adjust for real time
'len': '25',
'prec': 'P0'
# 'pair': <pair>
}
def build_book(res, pair):
""" Updates orderbook.
:param res: Orderbook update message.
:param pair: Updated pair.
"""
global orderbooks
# Filter out subscription status messages.
if res.data[0] == '[':
# String to json
data = ujson.loads(res.data)[1]
# Build orderbook
# Observe the structure of orderbook. The prices are keys for corresponding count and amount.
# Structuring data in this way significantly simplifies orderbook updates.
if len(data) > 10:
bids = {
str(level[0]): [str(level[1]), str(level[2])]
for level in data if level[2] > 0
}
asks = {
str(level[0]): [str(level[1]), str(level[2])[1:]]
for level in data if level[2] < 0
}
orderbooks[pair]['bids'] = bids
orderbooks[pair]['asks'] = asks
# Update orderbook and filter out heartbeat messages.
elif data[0] != 'h':
# Example update message structure [1765.2, 0, 1] where we have [price, count, amount].
# Update algorithm pseudocode from Bitfinex documentation:
# 1. - When count > 0 then you have to add or update the price level.
# 1.1- If amount > 0 then add/update bids.
# 1.2- If amount < 0 then add/update asks.
# 2. - When count = 0 then you have to delete the price level.
# 2.1- If amount = 1 then remove from bids
# 2.2- If amount = -1 then remove from asks
data = [str(data[0]), str(data[1]), str(data[2])]
if int(data[1]) > 0: # 1.
if float(data[2]) > 0: # 1.1
orderbooks[pair]['bids'].update({data[0]: [data[1], data[2]]})
elif float(data[2]) < 0: # 1.2
orderbooks[pair]['asks'].update({data[0]: [data[1], str(data[2])[1:]]})
elif data[1] == '0': # 2.
if data[2] == '1': # 2.1
if orderbooks[pair]['bids'].get(data[0]):
del orderbooks[pair]['bids'][data[0]]
elif data[2] == '-1': # 2.2
if orderbooks[pair]['asks'].get(data[0]):
del orderbooks[pair]['asks'][data[0]]
async def save_data():
""" Save the data to the excel spreadsheet specified """
#NOTE, Adjusted this for every 5 seconds, ie "await asyncio.sleep(10)" to "await asyncio.sleep(5)"
global orderbooks
while 1:
d=[]
e=[]
j=[]
await asyncio.sleep(5)
for pair in PAIRS:
bids2 = [[v[1], v[0], k] for k, v in orderbooks[pair]['bids'].items()]
asks2 = [[k, v[0], v[1]] for k, v in orderbooks[pair]['asks'].items()]
bids2.sort(key=lambda x: float(x[2]), reverse=True)
asks2.sort(key=lambda x: float(x[0]))
table2 = [[*bid, *ask] for (bid, ask) in zip(bids2, asks2)]
d.extend(table2)
e.extend([0]*len(table2))
e[len(e)-len(table2)]=pair
j.extend([0]*len(d))
j[0]=datetime.now().strftime('%Y-%m-%d %H:%M:%S')
s = pd.DataFrame(d, columns=['bid:amount', 'bid:count', 'bid:price', 'ask:price', 'ask:count', 'ask:amount'])
r = pd.DataFrame(e, columns=['Trade pair'])
u = pd.DataFrame(j, columns=['Last updated'])
z = pd.concat([s, r, u], axis=1, join_axes=[s.index])
z.to_excel(writer, 'Sheet1', startrow=0, startcol=0, index=False)
writer.save()
print('Update completed at',datetime.now().strftime('%Y-%m-%d %H:%M:%S'),'.')
async def get_book(pair, session):
""" Subscribes for orderbook updates and fetches updates. """
#print('enter get_book, pair: {}'.format(pair))
pair_dict = deepcopy(SUB_MESG) #Allows for changes to a made within a variable.
pair_dict.update({'pair': pair}) #Updates the dictionary SUB_MESG with the new pair to be evaluated. Will be added to the end of the dictionary.
async with session.ws_connect('wss://api.bitfinex.com/ws/2') as ws:
asyncio.ensure_future(ws.send_json(pair_dict)) #This was added and replaced "ws.send_json(pair_dict)" as Ubuntu python required a link to asyncio for this function.
while 1: #Loops infinitely.
res = await ws.receive()
print(pair_dict['pair'], res.data) # debug
build_book(res, pair)
async def main():
""" Driver coroutine. """
async with aiohttp.ClientSession() as session:
coros = [get_book(pair, session) for pair in PAIRS]
# Append coroutine for printing orderbook snapshots every 10s.
coros.append(save_data())
await asyncio.wait(coros)
orderbooks = {
pair: {}
for pair in PAIRS
}
loop = asyncio.get_event_loop()
loop.run_until_complete(main())