Accessing Attributes of Ticker Object from IB API - python

I'm having issues accessing the attributes of the ticker object returned by reqMktData. I iterated over the object to see it has attributes that match the [ tick list][1]. When I try and print out the data from these attributes I get nan or None. I used the default settings for reqMktData self.ib.reqMktData( self.underlying, "", False, False, []).
Now this may be because the tick types I'm looking for aren't available from TWS, but I don't believe this is the case. I don't believe that it's an issue of including generic tick types either in the second parameter.
# https://www.youtube.com/channel/UC3GoIgz6agdJzFNZWJrIR6g
#Imports
from datetime import datetime
from ib_insync import *
from apscheduler.schedulers.background import BackgroundScheduler
import asyncio
import pandas as pd
import numpy as np
class RiskyOptionsBot:
"""
Gap Trading Bot
"""
#Initialize variables
def __init__(self):
print("Options Bot Running, connecting to IB ...")
#Connect to IB
try:
self.ib = IB()
self.ib.connect('127.0.0.1',7497,clientId=1) #7496 is for live trading, 7497 for paper trading
print("Successfully connected to IB")
except Exception as e:
print(str(e))
# Create SPY Contract
self.underlying = Stock('AMZN', 'SMART', 'USD')
self.ib.qualifyContracts(self.underlying)
print("Backfilling data to catchup ...")
print ("Contract", self.ib.qualifyContracts)
# Request Streaming bars
self.data = self.ib.reqHistoricalData(self.underlying,
endDateTime='',
durationStr='2 D', #Loads the last 2 days
barSizeSetting='1 min',
whatToShow='TRADES', #Can also do bidask
useRTH=False, #Not sure what this does
keepUpToDate=True,)
#print("Data:", self.data)
histData = np.array(self.data)
#print("HistData", histData)
#self.ib.reqMktData returns a ticker object. We iterate over the objects attributes to reveal tick types
#associated with the ticker object. We may need to use the second parameter of reqMktData to specifify which tick types we are using
#first param should be 1 for live market data and 3rd param should be the number
#https://interactivebrokers.github.io/tws-api/market_data_type.html
#https://interactivebrokers.github.io/tws-api/tick_types.html
self.open = self.ib.reqMktData( self.underlying, "100", False, False, []);
print("Open:", self.open)
for i in dir(self.open):
print (i)
print(self.open.ask)
print(self.open.askGreeks)
print(self.open.bidGreeks)
print(self.open.lastGreeks)
print(self.open.vwap)
print(self.open.open)
print(self.open.last)
print(self.open.callVolume)
print(self.open.volume)
#Local vars
self.in_trade = False
#Get current options chains
self.chains = self.ib.reqSecDefOptParams(self.underlying.symbol, '', self.underlying.secType, self.underlying.conId)
for chain in self.chains:
print("Chain", chain)
#Update Chains every hour - can't update chains in event loop causes asyncio issues
update_chain_scheduler = BackgroundScheduler(job_defaults={'max_instances': 1000})
#https://apscheduler.readthedocs.io/en/3.x/userguide.html
update_chain_scheduler.add_job(func=self.update_options_chains,trigger='interval', seconds=5)
update_chain_scheduler.start()
print("Running Live")
# Set callback function for streaming bars
self.data.updateEvent += self.on_bar_update
#self.ib.execDetailsEvent += self.exec_status
#Run forever
self.ib.run()
print("Printing historical data ...")
#print(self.data)
#Update options chains
def update_options_chains(self):
try:
loop = asyncio.new_event_loop()
asyncio.set_event_loop(loop)
print("Updating options chains")
#Get current options chains
self.chains = self.ib.reqSecDefOptParams(self.underlying.symbol, '', self.underlying.secType, self.underlying.conId)
for chain in self.chains:
print("Chains:", chain)
except Exception as e:
print(str(e))
#On Bar Update, when we get new data
def on_bar_update(self, bars: BarDataList, has_new_bar: bool):
if has_new_bar:
#Convert BarDataList to pandas Dataframe
df = util.df(bars)
#Instantiate Class to get things rolling
RiskyOptionsBot()

Related

SQL Server Driver compatible with Async

Using the API CCXT I am pulling the orderbook for multiple symbols using async, this is an example provided by the API themselves and I have tweaked it to my requirements.
However, sometimes when I am running the program I find that some symbols do not download, after speaking to CCXT they advised that it is most likely due to the fact I am writing to the SQL Server table in a way that isn't compatible with async, meaning I need a driver that works with async. Below is my current code that works but misses some symbols sometimes.
import ccxt.async_support as ccxt
from asyncio import gather, run
import pandas as pd
from datetime import datetime
import sqlalchemy as sa
import urllib
from sqlalchemy.ext.asyncio import create_async_engine
async def symbol_loop(exchange, symbol):
print('Connecting to Database Server')
params = urllib.parse.quote_plus("DRIVER={SQL Server};"
"SERVER=PC;"
"DATABASE=crypto;"
'UID=abc;'
'PWD=123;'
"Trusted_Connection=yes")
engine = sa.create_engine("mssql+pyodbc:///?odbc_connect={}".format(params))
print('Starting the', exchange.id, 'symbol loop with', symbol)
level = 0
while True:
try:
level += 1
orderbook = await exchange.fetch_order_book(symbol, 5)
now = exchange.milliseconds()
print(exchange.iso8601(now), exchange.id, symbol, orderbook['asks'][0], len(orderbook['asks']),
orderbook['bids'][0], len(orderbook['bids']))
# Create DataFrame
orderbook_df = pd.DataFrame(index=range(1))
# Clean Symbol
clean_symbol = symbol.replace('USD_PERP', '/USD')
# Add Unique Identifier - Symbol+Exchange
orderbook_df['ID'] = f'{clean_symbol}_BINANCEFUTURES'
# Add Loop Level
orderbook_df['Level'] = level
# Add Date/Time
orderbook_df['DateTime'] = datetime.now()
# Add Ticker Symbol
orderbook_df['Symbol'] = str(symbol)
# Add Exchange
orderbook_df['Exchange'] = str(exchange)
# Add Bid Levels
orderbook_df['Bid1'] = orderbook['bids'][0][0]
# Add Ask Levels
orderbook_df['Ask1'] = orderbook['asks'][0][0]
# Add Bid Volume Levels
orderbook_df['BidVol1'] = orderbook['bids'][0][1]
# Add Ask Volume Levels
orderbook_df['AskVol1'] = orderbook['asks'][0][1]
# Add Trading Fee
orderbook_df['Fee'] = 0.2
orderbook_df.to_sql("Historical_Crypto", engine, if_exists='append')
except Exception as e:
print(str(e))
raise e # uncomment to break all loops in case of an error in any one of them
# break # you can break just this one loop if it fails
async def exchange_loop(exchange_id, symbols):
print('Starting the', exchange_id, 'exchange loop with', symbols)
exchange = getattr(ccxt, exchange_id)()
loops = [symbol_loop(exchange, symbol) for symbol in symbols]
await gather(*loops)
await exchange.close()
async def main():
exchanges = {
'binancecoinm': ['ADAUSD_PERP', 'AVAXUSD_PERP', 'BCHUSD_PERP', 'BNBUSD_PERP', 'BTCUSD_PERP', 'DOGEUSD_PERP'],
}
loops = [exchange_loop(exchange_id, symbols) for exchange_id, symbols in exchanges.items()]
await gather(*loops)
run(main())
I was hoping someone might have an idea of how to replace how I write to SQL in a way that is compatible with async. I have looked at SQLAlchemy and tried their example but I could not get it to work, maybe I missed something.
I am hoping it is an easy fix and someone has a simple solution.
The line that causes the issue is orderbook_df.to_sql("Historical_Crypto", engine, if_exists='append') so this is what needs to be replicated in some way that is asynchronous.

How to break down script into smaller function and create main.py? [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 1 year ago.
Improve this question
I wrote the script in python that works perfectly fine if executed as-is. What I am trying to do is to break this script into meaningful functions and create main.py to execute this as a proper python application.
Here is my LiveStream.py code with which I am collecting data from the sensor at the beginning of every minute, and sending it to the MySQL database, and also posting it to the URL. As mentioned this works perfectly fine if I execute: python3 LiveStream.py
# Import Dependencies
import board
import pandas as pd
from busio import I2C
import adafruit_bme680
from datetime import datetime, timedelta
import time
import requests
import mysql.connector
import json
import sqlalchemy
# read database config file
with open("config.json") as config:
param = json.load(config)
# Create library object using Bus I2C port
i2c = I2C(board.SCL, board.SDA)
bme680 = adafruit_bme680.Adafruit_BME680_I2C(i2c, debug=False)
# change this to match the location's pressure (hPa) at sea level
bme680.sea_level_pressure = 1013.25
# Read data from sensors
while True:
# Create the now variable to capture the current moment
TimeStamp = datetime.now()
Temperature = round((bme680.temperature * 9/5) + 32, 2)
Gas = round(bme680.gas, 2)
Humidity = round(bme680.humidity, 2)
Pressure = round(bme680.pressure, 2)
Altitude = round(bme680.altitude, 2)
now = datetime.strftime(TimeStamp,"%Y-%m-%dT%H:%M:%S")
# Adding collected measurements into dataframe
data = pd.DataFrame([
{
"TimeStamp": now,
"Temperature": Temperature,
"Gas": Gas,
"Humidity": Humidity,
"Pressure": Pressure,
"Altitude": Altitude
}
])
# Try establishing connection with database
try:
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}#{2}/{3}'.
format(param['MyDemoServer'][0]['user'],
param['MyDemoServer'][0]['password'],
param['MyDemoServer'][0]['host'],
param['MyDemoServer'][0]['database']), echo=False)
# Cleaning the data from existing tables MetricValues and Metrics
db_con = engine.connect()
if db_con.connect():
try:
data.to_sql('sensordata', con = db_con, if_exists = 'append', index = False)
db_con.close()
# Dispose the engine
engine.dispose()
except OSError as e:
print(e)
except OSError as e:
print(e)
# Power BI API
# BI Address to push the data to
url = 'https://api.powerbi.com/beta/94cd2fa9-eb6a-490b-af36-53bf7f5ef485/datasets/2a7a2529-dbfd-4c32-9513-7d5857b61137/rows?noSignUpCheck=1&key=nS3bP1Mo4qN9%2Fp6XJcTBgHBUV%2FcOZb0edYrK%2BtVWDg6iWwzRtY16HWUGSqB9YsqF3GHMNO2fe3r5ltB7NhVIvw%3D%3D'
# post/push data to the streaming API
headers = {
"Content-Type": "application/json"
}
response = requests.request(
method="POST",
url=url,
headers=headers,
data=json.dumps(data.to_json())
)
data = pd.DataFrame()
# Re-run the script at the beginning of every new minute.
dt = datetime.now() + timedelta(minutes=1)
dt = dt.replace(second=1)
while datetime.now() < dt:
time.sleep(1)
Here is what I have tried so far... I created a lib folder where I have etl.py file. in this file I tried creating functions such us:
def sensorsreading():
# Create library object using Bus I2C port
i2c = I2C(board.SCL, board.SDA)
bme680 = adafruit_bme680.Adafruit_BME680_I2C(i2c, debug=False)
# change this to match the location's pressure (hPa) at sea level
bme680.sea_level_pressure = 1013.25
# Read data from sensors
while True:
# Create the now variable to capture the current moment
TimeStamp = datetime.now()
Temperature = round((bme680.temperature * 9 / 5) + 32, 2)
Gas = round(bme680.gas, 2)
Humidity = round(bme680.humidity, 2)
Pressure = round(bme680.pressure, 2)
Altitude = round(bme680.altitude, 2)
now = datetime.strftime(TimeStamp, "%Y-%m-%dT%H:%M:%S")
# Adding collected measurements into dataframe
data = pd.DataFrame([
{
"TimeStamp": now,
"Temperature": Temperature,
"Gas": Gas,
"Humidity": Humidity,
"Pressure": Pressure,
"Altitude": Altitude
}
])
return data
And also function:
def dataload(data):
# Try establishing connection with database
try:
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}#{2}/{3}'.
format(param['MyDemoServer'][0]['user'],
param['MyDemoServer'][0]['password'],
param['MyDemoServer'][0]['host'],
param['MyDemoServer'][0]['database']), echo=False)
# Cleaning the data from existing tables MetricValues and Metrics
db_con = engine.connect()
if db_con.connect():
try:
data.to_sql('sensordata', con=db_con, if_exists='append', index=False)
db_con.close()
# Dispose the engine
engine.dispose()
except OSError as e:
print(e)
except OSError as e:
print(e)
And my main.py looks like this:
import pandas as pd
from datetime import datetime, timedelta
import time
from lib.etl import *
def etl(name):
data = sensorsreading()
dataload(data)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
etl('PyCharm')
# Re-run the script at the beginning of every new minute.
dt = datetime.now() + timedelta(minutes=1)
dt = dt.replace(second=1)
while datetime.now() < dt:
time.sleep(1)
When I run main.py it seems that I am not passing the data frame from sensorsreading() to dataload() function.
Any idea what am I doing wrong here?
To address you original question, you were using yield instead of return. Yields is used in generators, as you can read more here: https://www.geeksforgeeks.org/use-yield-keyword-instead-return-keyword-python/
In the case you don't need a precise execution, this will call the function each 60 seconds. Anyways, I'll sugest using a scheduler like systemctl or cron.
import time
while True:
etl('PyCharm')
time.sleep(60)
If you want something more precise you could use:
import time
starttime = time.time()
while True:
etl('PyCharm')
time.sleep(60.0 - ((time.time() - starttime) % 60.0))
as explained in What is the best way to repeatedly execute a function every x seconds?

How to add time delay in asynchronous coroutines?

I am attempting to retrieve historical data concurrently from Binance for each crypto pair in my database. I am running into bans with APIErrors, stating "APIError(code=-1003): Way too much request weight used; IP banned until 1629399758399. Please use the websocket for live updates to avoid bans."
How can I add a time delay to prevent reaching the API request weight limit which is 1200 per 1 Minute?
here's what I have as of now
import numpy as np
import json
import requests
import datetime, time
import aiohttp, asyncpg, asyncio
from asyncio import gather, create_task
from binance.client import AsyncClient
from multiprocessing import Process
import time
import config
async def main():
# create database connection pool
pool = await asyncpg.create_pool(user=config.DB_USER, password=config.DB_PASS, database=config.DB_NAME, host=config.DB_HOST, command_timeout=60)
# get a connection
async with pool.acquire() as connection:
cryptos = await connection.fetch("SELECT * FROM crypto")
symbols = {}
for crypto in cryptos:
symbols[crypto['id']] = crypto['symbol']
await get_prices(pool, symbols)
async def get_prices(pool, symbols):
try:
# schedule requests to run concurrently for all symbols
tasks = [create_task(get_price(pool, crypto_id, symbols[crypto_id])) for crypto_id in symbols]
await gather(*tasks)
print("Finalized all. Retrieved price data of {} outputs.".format(len(tasks)))
except Exception as e:
print("Unable to fetch crypto prices due to {}.".format(e.__class__))
print(e)
async def get_price(pool, crypto_id, url):
try:
candlesticks = []
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
async for kline in await client.get_historical_klines_generator(f"{crypto_id}".format(), AsyncClient.KLINE_INTERVAL_1HOUR, "18 Aug, 2021", "19 Aug, 2021"):
candlesticks.append(kline)
df = pd.DataFrame(candlesticks, columns = ["date","open","high","low","close","volume","Close time","Quote Asset Volume","Number of Trades","Taker buy base asset volume","Taker buy quote asset volume","Ignore"])
df["date"] = pd.to_datetime(df.loc[:, "date"], unit ='ms')
df.drop(columns=['Close time','Ignore', 'Quote Asset Volume', 'Number of Trades', 'Taker buy base asset volume', 'Taker buy quote asset volume'], inplace=True)
df.loc[:, "id"] = crypto_id
df
print(df)
except Exception as e:
print("Unable to get {} prices due to {}.".format(url, e.__class__))
print(e)
start = time.time()
if __name__ == "__main__":
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
end = time.time()
print("Took {} seconds.".format(end - start))
You can create an instance of a custom class that will keep the count of currently active requests (and timing of requests) - and only allow one request to proceed if that guard says it is ok.
Python´s async with command would be nice to use in such a construct since it can both guard a block, and decrease the active request count with minimal intervention in the code you already have.
This can proceed like this- the line in your code that actually trigger the requests is:
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
So, if we can ensure this line is called at most 1200 times per minute, having to yield to the mainloop while it does not happen, we are good.
While it would be possible to burst 1200 (-1) calls and them waiut one minute, the code will be both easier to write, and the API limit will be more respected in its spirit, if we simply yield one call each (60s / 1200) ( x 90% for a 10% nice margin) seconds.
The async with will call the class' __aenter__ method. In there we can simply check the time interval since the last API call and sleep until this time has passed.
(Actually, we will need one instance of the class per task, as __aenter__ needs to be called in each instance). But in order not to depend on a global "counter", we can create a factory function that will create a guard per API that needs limiting - and we keep that one in a global variable)
So, you can add this factory function to your program, and then create a guard-class on your main function and use "async with" inside the tasks code:
def create_rate_limit_guard(rate_limit=1200, safety_margin=0.9):
"""Rate limit is given in maximum requests per minute.
"""
# TBD: it would easy to have code to throttle by maximum active requests
# instead of total requests per minute.
# I will just let the accounting of concurrent_requests in place, though
class Guard:
request_interval = (60 / rate_limit) * safety_margin
current_requests = 0
max_concurrent_requests = 0
last_request = 0
async def __aenter__(self):
cls = self.__class__
cls.current_requests += 1
if (throttle_wait:= time.time() - last_request) < cls.request_interval:
await asyncio.sleep(throttle_wait)
cls.current_requests += 1
cls.last_request = time.time()
async def __aexit__(self, exc_type, exc, tb):
cls = self.__class__
cls.max_concurrent_requests = max(cls.max_concurrent_requests, cls.current_requests)
cls.current_requests -= 1
return Guard
And in your code, you could just change get_price to this, and create the guard class (last line before if ...__main__:
async def get_price(pool, crypto_id, url):
try:
candlesticks = []
# consider having a single client application wise - you are creating one per task.
with BinanceLimitGuard():
client = await AsyncClient.create(config.BINANCE_API_KEY, config.BINANCE_SECRET_KEY)
# as the actual calls to the remote endpoint are done inside the client code itself,
# we can't just run "async for" on the generator - instead we have to throttle
# all the "for" interactions. So we "unfold" the async for in a while/anext
# structure so that we can place the guard before each interation:
klines_generator = await client.get_historical_klines_generator(
f"{crypto_id}".format(), AsyncClient.KLINE_INTERVAL_1HOUR, "18 Aug, 2021", "19 Aug, 2021")
while True:
try:
with BinanceLimitGuard():
kline = await klines_generator.__anext__()
except StopAsyncIteration:
break
candlesticks.append(kline)
df = pd.DataFrame(candlesticks, columns = ["date","open","high","low","close","volume","Close time","Quote Asset Volume","Number of Trades","Taker buy base asset volume","Taker buy quote asset volume","Ignore"])
df["date"] = pd.to_datetime(df.loc[:, "date"], unit ='ms')
df.drop(columns=['Close time','Ignore', 'Quote Asset Volume', 'Number of Trades', 'Taker buy base asset volume', 'Taker buy quote asset volume'], inplace=True)
df.loc[:, "id"] = crypto_id
print(df)
except Exception as e:
print("Unable to get {} prices due to {}.".format(url, e.__class__))
print(e)
BinanceLimitGuard = create_rate_limit_guard(300)
if __name__ == "__main__":
# all code that is meant to take place when your file is run as a program
# should be guarded in this if block. Importing your file should not "print"
start = time.time()
loop = asyncio.get_event_loop()
loop.run_until_complete(main())
end = time.time()
print("Took {} seconds.".format(end - start))
Note that while I designed the guard to "1200 requests per minute" - I sugested a limit of "300" parallel tasks per minute above, in BinanceLimitGuard = create_rate_limit_guard(300) - because, checking the source code for the binance client itself, it does perform several requests of itself in a call to "get_historical_klines" - and that code has embedded a limit of 3 calls per second - but which take place per generator, so we can't account for them on the outside code.
If this still not work, it can be made to work by subclassing (or monkeypatching) the AsyncClient itself and placing the limit rate on its internal _request_api internal method, at this place https://github.com/sammchardy/python-binance/blob/a6f3048527f0f2fd9bc6591ac1fdd926b2a29f3e/binance/client.py#L330 - then you can go back to the "1200 limit" as it will account all internal calls. (drop a comment if you need to resort to this, I could complete this answer or add another one)

How to run multiple Azure Functions in parallel which scroll through Elasticsearch?

I have a setup where I need to extract data from Elasticsearch and store it on an Azure Blob. Now to get the data I am using Elasticsearch's _search and _scroll API. The indexes are pretty well designed and are formatted something like game1.*, game2.*, game3.* etc.
I've created a worker.py file which I stored in a folder called shared_code as Microsoft suggests and I have several Timer Trigger Functions which import and call worker.py. Due to the way ES was setup on our side I had to create a VNET and a static Outbound IP address which we've then whitelisted on ES. Conversely, the data is only available to be extracted from ES only on port 9200. So I've created an Azure Function App which has the connection setup and I am trying to create multiple Functions (game1-worker, game2-worker, game3-worker) to pull the data from ES running in parallel on minute 5. I've noticed if I add the FUNCTIONS_WORKER_PROCESS_COUNT = 1 setting then the functions will wait until the first triggered one finishes its task and then the second one triggers. If I don't add this app setting or increase the number, then once a function stopped because it finished working, it will try to start it again and then I get a OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted error. Is there a way I can make these run in parallel but not have the mentioned error?
Here is the code for the worker.py:
#!/usr/bin/env python
# coding: utf-8
# # Elasticsearch to Azure Microservice
import json, datetime, gzip, importlib, os, re, logging
from elasticsearch import Elasticsearch
import azure.storage.blob as azsb
import azure.identity as azi
import os
import tempfile
def batch(game_name, env='prod'):
# #### Global Variables
env = env.lower()
connection_string = os.getenv('conn_storage')
lowerFormat = game_name.lower().replace(" ","_")
azFormat = re.sub(r'[^0-9a-zA-Z]+', '-', game_name).lower()
storageContainerName = azFormat
stateStorageContainerName = "azure-webjobs-state"
minutesOffset = 5
tempFilePath = tempfile.gettempdir()
curFileName = f"{lowerFormat}_cursor.py"
curTempFilePath = os.path.join(tempFilePath,curFileName)
curBlobFilePath = f"cursors/{curFileName}"
esUrl = os.getenv('esUrl')
# #### Connections
es = Elasticsearch(
esUrl,
port=9200,
timeout=300)
def uploadJsonGzipBlob(filePathAndName, jsonBody):
blob = azsb.BlobClient.from_connection_string(
conn_str=connection_string,
container_name=storageContainerName,
blob_name=filePathAndName
)
blob.upload_blob(gzip.compress(bytes(json.dumps(jsonBody), encoding='utf-8')))
def getAndLoadCursor(filePathAndName):
# Get cursor from blob
blob = azsb.BlobClient.from_connection_string(
conn_str=os.getenv('AzureWebJobsStorage'),
container_name=stateStorageContainerName,
blob_name=filePathAndName
)
# Stream it to Temp file
with open(curTempFilePath, "wb") as f:
data = blob.download_blob()
data.readinto(f)
# Load it by path
spec = importlib.util.spec_from_file_location("cursor", curTempFilePath)
cur = importlib.util.module_from_spec(spec)
spec.loader.exec_module(cur)
return cur
def writeCursor(filePathAndName, body):
blob = azsb.BlobClient.from_connection_string(
conn_str=os.getenv('AzureWebJobsStorage'),
container_name=stateStorageContainerName,
blob_name=filePathAndName
)
blob.upload_blob(body, overwrite=True)
# Parameter and state settings
if os.getenv(f"{lowerFormat}_maxSizeMB") is None:
maxSizeMB = 10 # Default to 10 MB
else:
maxSizeMB = int(os.getenv(f"{lowerFormat}_maxSizeMB"))
if os.getenv(f"{lowerFormat}_maxProcessTimeSeconds") is None:
maxProcessTimeSeconds = 300 # Default to 300 seconds
else:
maxProcessTimeSeconds = int(os.getenv(f"{lowerFormat}_maxProcessTimeSeconds"))
try:
cur = getAndLoadCursor(curBlobFilePath)
except Exception as e:
dtStr = f"{datetime.datetime.utcnow():%Y/%m/%d %H:%M:00}"
writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{dtStr}'")
logging.info(f"No cursor file. Generated {curFileName} file with date {dtStr}")
return 0
# # Scrolling and Batching Engine
lastRowDateOffset = cur.lastPolled
nrFilesThisInstance = 0
while 1:
# Offset the current time by -5 minutes to account for the 2-3 min delay in Elasticsearch
initTime = datetime.datetime.utcnow()
## Filter lt (less than) endDate to avoid infinite loops.
## Filter lt manually when compiling historical based on
endDate = initTime-datetime.timedelta(minutes=minutesOffset)
endDate = f"{endDate:%Y/%m/%d %H:%M:%S}"
doc = {
"query": {
"range": {
"baseCtx.date": {
"gt": lastRowDateOffset,
"lt": endDate
}
}
}
}
Index = lowerFormat + ".*"
if env == 'dev': Index = 'dev.' + Index
if nrFilesThisInstance == 0:
page = es.search(
index = Index,
sort = "baseCtx.date:asc",
scroll = "2m",
size = 10000,
body = doc
)
else:
page = es.scroll(scroll_id = sid, scroll = "10m")
pageSize = len(page["hits"]["hits"])
data = page["hits"]["hits"]
sid = page["_scroll_id"]
totalSize = page["hits"]["total"]
print(f"Total Size: {totalSize}")
cnt = 0
# totalSize might be flawed as it returns at times an integer > 0 but array is empty
# To overcome this, I've added the below check for the array size instead
if pageSize == 0: break
while 1:
cnt += 1
page = es.scroll(scroll_id = sid, scroll = "10m")
pageSize = len(page["hits"]["hits"])
sid = page["_scroll_id"]
data += page["hits"]["hits"]
sizeMB = len(gzip.compress(bytes(json.dumps(data), encoding='utf-8'))) / (1024**2)
loopTime = datetime.datetime.utcnow()
processTimeSeconds = (loopTime-initTime).seconds
print(f"{cnt} Results pulled: {pageSize} -- Cumulative Results: {len(data)} -- Gzip Size MB: {sizeMB} -- processTimeSeconds: {processTimeSeconds} -- pageSize: {pageSize} -- startDate: {lastRowDateOffset} -- endDate: {endDate}")
if sizeMB > maxSizeMB: break
if processTimeSeconds > maxProcessTimeSeconds: break
if pageSize < 10000: break
lastRowDateOffset = max([x['_source']['baseCtx']['date'] for x in data])
lastRowDateOffsetDT = datetime.datetime.strptime(lastRowDateOffset, '%Y/%m/%d %H:%M:%S')
outFile = f"elasticsearch/live/{lastRowDateOffsetDT:%Y/%m/%d/%H}/{lowerFormat}_live_{lastRowDateOffsetDT:%Y%m%d%H%M%S}.json.gz"
uploadJsonGzipBlob(outFile, data)
writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{lastRowDateOffset}'")
nrFilesThisInstance += 1
logging.info(f"File compiled: {outFile} -- {sizeMB} MB\n")
# If the while loop ran for more than maxProcessTimeSeconds then end it
if processTimeSeconds > maxProcessTimeSeconds: break
if pageSize < 10000: break
logging.info(f"Closing Connection to {esUrl}")
es.close()
return 0
And these are 2 of the timing triggers I am calling:
game1-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
utc_timestamp = datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc).isoformat()
if mytimer.past_due:
logging.info('The timer is past due!')
# Load a new instance of worker.py
spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
worker = importlib.util.module_from_spec(spec)
spec.loader.exec_module(worker)
worker.batch('game1name')
logging.info('Python timer trigger function ran at %s', utc_timestamp)
game2-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
utc_timestamp = datetime.datetime.utcnow().replace(
tzinfo=datetime.timezone.utc).isoformat()
if mytimer.past_due:
logging.info('The timer is past due!')
# Load a new instance of worker.py
spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
worker = importlib.util.module_from_spec(spec)
spec.loader.exec_module(worker)
worker.batch('game2name')
logging.info('Python timer trigger function ran at %s', utc_timestamp)
TL;DR
Based on what you described, multiple worker-processes share underlying runtime's resources (sockets).
For your usecase you just need to leave FUNCTIONS_WORKER_PROCESS_COUNT at 1. Default value is supposed to be 1, so not specifying it should mean the same as setting it to 1.
You need to understand how Azure Functions scale. It is very unnatural/confusing.
Assumes Consumption Plan.
Coding: You write Functions. Say F1 an F2. How you organize is up to you.
Provisioning:
You create a Function App.
You deploy F1 and F2 to this App.
You start the App. (not function).
Runtime:
At start
Azure spawns one Function Host. Think of this as a container/OS.
Inside the Host, one worker-process is created. This worker-process will host one instance of App.
If you change FUNCTIONS_WORKER_PROCESS_COUNT to say 10 then Host will spawn 10 processes and run your App inside each of them.
When a Function is triggered (function could be triggered due to timer, or REST calls or message in Q, ...)
Each worker-process is capable of servicing one request at a time. Be it a request for F1 or F2. One at a time.
Each Host is capable servicing one request per worker-process in it.
If backlog of requests grows, then Azure load balancer would trigger scale-out and create new Function Hosts.
Based on limited info, it seems like bad design to create 3 functions. You could instead create a single timer-triggered function, which sends out 3 messages to a Q (Storage Q should be more than plenty for such minuscule traffic), which in turn triggers your actual Function/implementation (which is storage Q triggered Function). Message would be something like {"game_name": "game1"}.

How to JSON dump to a rotating file object

I'm writing a program which periodically dumps old data from a RethinkDB database into a file and removes it from the database. Currently, the data is dumped into a single file which grows without limit. I'd like to change this so that the maximum file size is, say, 250 Mb, and the program starts to write to a new output file just before this size is exceeded.
It seems like Python's RotatingFileHandler class for loggers does approximately what I want; however, I'm not sure whether logging can be applied to any JSON-dumpable object or just to strings.
Another possible approach would be to use (a variant of) Mike Pennington's
RotatingFile class (see python: outfile to another text file if exceed certain file size).
Which of these approaches is likely to be the most fruitful?
For reference, my current program is as follows:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
class Controller(RethinkDB):
def __init__(self, db_address=(os.environ['DB_ADDR'], int(os.environ['DB_PORT'])), db_name=os.environ['DB_NAME']):
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the IperCronComponent with the default logger name (in this case, "Controller")
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable in RethinkDBMixIn
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in r.table(self.db_table).index_list().run(self.db): # If "timestamp" is a secondary index
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (i.e., the year 1400 in the UTC timezone)
data_to_archive = r.table(self.db_table).between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster)
else:
data_to_archive = r.table(self.db_table).filter(r.row['timestamp'] < expiry_time) # Generate the same query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None, output_file="archived_sensor_data.json"):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON
with open(output_file, 'a') as f:
ids_to_delete = []
for item in old_data:
print item
# msgpack.dump(item, f)
json.dump(item, f)
f.write('\n') # Separate each document by a new line
ids_to_delete.append(item['id'])
r.table(self.db_table).get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for loop.
def test_job_1():
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
conn = r.connect("localhost", 28015 + port_offset)
r.db(db_name).table(table_name).delete().run(conn)
import rethinkdb_add_data
controller = Controller(db_address=("localhost", 28015+port_offset))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query())
return archiving_job
if __name__ == "__main__":
archiving_job = test_job_1()
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
It is not completely 'runnable' from the part shown, but the key point is that I would like to replace the line
json.dump(item, f)
with a similar line in which f is a rotating, and not fixed, file object.
Following Stanislav Ivanov, I used json.dumps to convert each RethinkDB document to a string and wrote this to a RotatingFileHandler:
import os
import sys
import json
import rethinkdb as r
import pytz
from datetime import datetime, timedelta
import schedule
import time
import functools
from iclib import RethinkDB
import msgpack
import logging
from logging.handlers import RotatingFileHandler
from random_data_generator import RandomDataGenerator
''' The purpose of the Controller is to periodically archive data from the "sensor_data" table so that it does not grow without limit.'''
os.environ['DB_ADDR'] = 'localhost'
os.environ['DB_PORT'] = '28015'
os.environ['DB_NAME'] = 'ipercron'
class Controller(RethinkDB):
def __init__(self, db_address=None, db_name=None):
if db_address is None:
db_address = (os.environ['DB_ADDR'], int(os.environ['DB_PORT'])) # The default host ("rethinkdb") and port (28015) are stored as environment variables
if db_name is None:
db_name = os.environ['DB_NAME'] # The default database is "ipercron" and is stored as an environment variable
super(Controller, self).__init__(db_address=db_address, db_name=db_name) # Initialize the instance of the RethinkDB class. IperCronComponent will be initialized with its default logger name (in this case, "Controller")
self.db_name = db_name
self.db_table = RethinkDB.SENSOR_DATA_TABLE # The table name is "sensor_data" and is stored as a class variable of RethinkDBMixIn
self.table = r.db(self.db_name).table(self.db_table)
self.archiving_logger = logging.getLogger("archiving_logger")
self.archiving_logger.setLevel(logging.DEBUG)
self.archiving_handler = RotatingFileHandler("archived_sensor_data.log", maxBytes=2000, backupCount=10)
self.archiving_logger.addHandler(self.archiving_handler)
def generate_archiving_query(self, retention_period=timedelta(days=3)):
expiry_time = r.now() - retention_period.total_seconds() # Timestamp before which data is to be archived
if "timestamp" in self.table.index_list().run(self.db):
beginning_of_time = r.time(1400, 1, 1, 'Z') # The minimum time of a ReQL time object (namely, the year 1400 in UTC)
data_to_archive = self.table.between(beginning_of_time, expiry_time, index="timestamp") # Generate query using "between" (faster, requires "timestamp" to be a secondary index)
else:
data_to_archive = self.table.filter(r.row['timestamp'] < expiry_time) # Generate query using "filter" (slower, but does not require "timestamp" to be a secondary index)
return data_to_archive
def archiving_job(self, data_to_archive=None):
if data_to_archive is None:
data_to_archive = self.generate_archiving_query() # By default, the call the "generate_archiving_query" function to generate the query
old_data = data_to_archive.run(self.db, time_format="raw") # Without time_format="raw" the output does not dump to JSON or msgpack
ids_to_delete = []
for item in old_data:
print item
self.dump(item)
ids_to_delete.append(item['id'])
self.table.get_all(r.args(ids_to_delete)).delete().run(self.db) # Delete based on ID. It is preferred to delete the entire batch in a single operation rather than to delete them one by one in the for-loop.
def dump(self, item, mode='json'):
if mode == 'json':
dump_string = json.dumps(item)
elif mode == 'msgpack':
dump_string = msgpack.packb(item)
self.archiving_logger.debug(dump_string)
def populate_database(db_name, table_name, conn):
if db_name not in r.db_list().run(conn):
r.db_create(db_name).run(conn) # Create the database if it does not yet exist
if table_name not in r.db(db_name).table_list().run(conn):
r.db(db_name).table_create(table_name).run(conn) # Create the table if it does not yet exist
r.db(db_name).table(table_name).delete().run(conn) # Empty the table to start with a clean slate
# Generate random data with timestamps uniformly distributed over the past 6 days
random_data_time_interval = timedelta(days=6)
start_random_data = datetime.utcnow().replace(tzinfo=pytz.utc) - random_data_time_interval
random_generator = RandomDataGenerator(seed=0)
packets = random_generator.packets(N=100, start=start_random_data)
# print packets
print "Adding data to the database..."
r.db(db_name).table(table_name).insert(packets).run(conn)
if __name__ == "__main__":
db_name = "ipercron"
table_name = "sensor_data"
port_offset = 1 # To avoid interference of this testing program with the main program, all ports are initialized at an offset of 1 from the default ports using "rethinkdb --port_offset 1" at the command line.
host = "localhost"
port = 28015 + port_offset
conn = r.connect(host, port) # RethinkDB connection object
populate_database(db_name, table_name, conn)
# import rethinkdb_add_data
controller = Controller(db_address=(host, port))
archiving_job = functools.partial(controller.archiving_job, data_to_archive=controller.generate_archiving_query()) # This ensures that the query is only generated once. (This is sufficient since r.now() is re-evaluated every time a connection is made).
schedule.every(0.1).minutes.do(archiving_job)
while True:
schedule.run_pending()
In this context the RethinkDB class does little other than define the class variable SENSOR_DATA_TABLE and the RethinkDB connection, self.db = r.connect(self.address[0], self.address[1]). This is run together with a module for generating fake data, random_data_generator.py:
import random
import faker
from datetime import datetime, timedelta
import pytz
import rethinkdb as r
class RandomDataGenerator(object):
def __init__(self, seed=None):
self._seed = seed
self._random = random.Random()
self._random.seed(seed)
self.fake = faker.Faker()
self.fake.random.seed(seed)
def __getattr__(self, x):
return getattr(self._random, x)
def name(self):
return self.fake.name()
def datetime(self, start=None, end=None):
if start is None:
start = datetime(2000, 1, 1, tzinfo=pytz.utc) # Jan 1st 2000
if end is None:
end = datetime.utcnow().replace(tzinfo=pytz.utc)
if isinstance(end, datetime):
dt = end - start
elif isinstance(end, timedelta):
dt = end
assert isinstance(dt, timedelta)
random_dt = timedelta(microseconds=self._random.randrange(int(dt.total_seconds() * (10 ** 6))))
return start + random_dt
def packets(self, N=1, start=None, end=None):
return [{'name': self.name(), 'timestamp': self.datetime(start=start, end=end)} for _ in range(N)]
When I run controller it produces several rolled-over output logs, each at most 2 kB in size, as expected:

Categories

Resources