get results from kafka for a specific period of time - python

Here is my code, which uses kafka-python:
from datetime import datetime
from dateutil.relativedelta import relativedelta
from kafka import KafkaConsumer

now = datetime.now()
month_ago = now - relativedelta(months=1)  # note: "months", not "month" (which would set the month absolutely)
topic = 'some_topic_name'
consumer = KafkaConsumer(topic, bootstrap_servers=PROD_KAFKA_SERVER,
                         security_protocol=PROTOCOL,
                         group_id=GROUP_ID,
                         enable_auto_commit=False,
                         sasl_mechanism=SASL_MECHANISM,
                         sasl_plain_username=SASL_USERNAME,
                         sasl_plain_password=SASL_PASSWORD)
for msg in consumer:
    print(msg)
I want to consume, in a loop, only the messages produced between month_ago and now. How can I do this?
Thanks for any help!

Get the topic partitions assigned to your consumer:
partitions = consumer.assignment()
Get offsets for those partitions by datetime (Kafka expects a timestamp in milliseconds):
month_ago_timestamp = int(month_ago.timestamp() * 1000)
partition_to_timestamp = {part: month_ago_timestamp for part in partitions}
mapping = consumer.offsets_for_times(partition_to_timestamp)
Seek each partition to its offset:
for partition, offset_and_timestamp in mapping.items():
    consumer.seek(partition, offset_and_timestamp[0])
Warning! offsets_for_times can return None (or an offset of zero) for a partition, or block indefinitely, in cases such as a missing topic, a missing partition, or messages without timestamps.
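Given that, a minimal defensive version of the seek loop might look like this (a sketch; mapping comes from offsets_for_times above):
for partition, offset_and_timestamp in mapping.items():
    if offset_and_timestamp is None:
        # no message at or after the requested timestamp in this partition
        continue
    consumer.seek(partition, offset_and_timestamp.offset)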

Finally, I did this :) My code looks like this:
import json
from datetime import datetime
from dateutil.relativedelta import relativedelta
from kafka import KafkaConsumer, TopicPartition

topic = 'some_topic_name'
consumer = KafkaConsumer(bootstrap_servers=PROD_KAFKA_SERVER,
                         security_protocol=PROTOCOL,
                         group_id=GROUP_ID,
                         sasl_mechanism=SASL_MECHANISM,
                         sasl_plain_username=SASL_USERNAME,
                         sasl_plain_password=SASL_PASSWORD)
month_ago = (datetime.now() - relativedelta(months=1)).timestamp()
topic_partition = TopicPartition(topic, 0)
assigned_topic = [topic_partition]
consumer.assign(assigned_topic)
partitions = consumer.assignment()
partition_to_timestamp = {part: int(month_ago * 1000) for part in partitions}
end_offsets = consumer.end_offsets(list(partition_to_timestamp.keys()))
mapping = consumer.offsets_for_times(partition_to_timestamp)
for partition, ts in mapping.items():
    end_offset = end_offsets.get(partition)
    consumer.seek(partition, ts[0])

for msg in consumer:
    value = json.loads(msg.value.decode('utf-8'))
    # do something
    if msg.offset == end_offset - 1:
        # reached the last message that existed when we started; stop
        consumer.close()
        break
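Note that the msg.offset == end_offset - 1 stop condition only works here because a single partition is assigned. If you assign several partitions, one way (a sketch, untested) is to track each partition's end offset and stop once all of them are drained:
remaining = {p: off for p, off in end_offsets.items() if off > 0}  # skip empty partitions
for msg in consumer:
    tp = TopicPartition(msg.topic, msg.partition)
    # ... process msg as above ...
    if tp in remaining and msg.offset >= remaining[tp] - 1:
        del remaining[tp]  # this partition is drained up to its snapshot end
        if not remaining:
            consumer.close()
            break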

Related

Getting Expired Options Contract pricing from Interactive Brokers

I am looking to reconstruct expired options pricing with the help of the ib_insync library and the data available from Interactive Brokers.
Because IB provides OPTION_IMPLIED_VOLATILITY as an output for reqHistoricalData, I was thinking of proceeding this way:
Have a function to infer expired options contract prices from the Black-Scholes model:
import numpy as np
from scipy.stats import norm

def black_scholes(stock_price, strike_price, vol, time, rate, right="Call"):
    d1 = (np.log(stock_price/strike_price) + (rate + 0.5*vol**2)*time) / (vol*np.sqrt(time))
    d2 = (np.log(stock_price/strike_price) + (rate - 0.5*vol**2)*time) / (vol*np.sqrt(time))
    nd1 = norm.cdf(d1)
    nd2 = norm.cdf(d2)
    n_d1 = norm.cdf(-1*d1)
    n_d2 = norm.cdf(-1*d2)
    if right.capitalize()[0] == "C":
        return round((stock_price*nd1) - (strike_price*np.exp(-1*rate*time)*nd2), 2)
    else:
        return round((strike_price*np.exp(-1*rate*time)*n_d2) - (stock_price*n_d1), 2)
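For a quick sanity check of the helper, a call with purely illustrative numbers (not real market data):
# hypothetical spot check: slightly in-the-money call, ~30 days out, 25% vol, 3% rate
print(black_scholes(150.0, 148.0, 0.25, 30/365, 0.03, right="Call"))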
Then I use the contract on the underlying stock, assuming I have a valid ib connection opened elsewhere in my code, to retrieve the data:
from ib_insync import Stock, util

def get_stock_history(symbol, whattoshow_string):
    # assumes a connected IB instance `ib` exists elsewhere
    contract = Stock(symbol, 'SMART', 'USD')
    ib.reqMarketDataType(2)
    bars = ib.reqHistoricalData(
        contract,
        endDateTime='',
        durationStr='2 Y',
        barSizeSetting='1 Hour',
        whatToShow=whattoshow_string,
        useRTH=True,
        formatDate=1)
    ib.sleep(5)
    df = util.df(bars)
    df['date'] = pd.to_datetime(df['date']).dt.date
    return df
I also have a handy function to compute maturity in the BSM based on an hourly time decay:
def hourCount(DF, expiry):
    DF["maturity"] = ((dt.datetime.strptime(expiry, "%Y-%m-%d") - pd.to_datetime(DF.index)) / pd.Timedelta(hours=1)) / (365*24)
I could then get the data as below, assuming I have an expiration date and strike from elsewhere that I wish to backtest:
strike = 148
expiration_date = '2022-12-02'
symbol = 'AAPL'
historicalData = get_stock_history(symbol, 'ADJUSTED_LAST')
impVolData = get_stock_history(symbol, 'OPTION_IMPLIED_VOLATILITY')
option_price_call = pd.DataFrame(columns=["open", "high", "low", "close"])
option_price_put = pd.DataFrame(columns=["open", "high", "low", "close"])
hourCount(historicalData, expiration_date)
hourCount(impVolData, expiration_date)
historicalData = historicalData[historicalData["maturity"] > 0]
impVolData = impVolData[impVolData["maturity"] > 0]
for column in ["open", "high", "low", "close"]:
    option_price_call[column] = black_scholes(historicalData[column], strike, impVolData[column], historicalData["maturity"], 0.03, right="Call")
    option_price_put[column] = black_scholes(historicalData[column], strike, impVolData[column], historicalData["maturity"], 0.03, right="Put")
Would that be a good approach to reconstruct/backtest the expired options contract pricing, or am I overlooking something here? Is there maybe a smarter way to achieve this?
Thanks in advance for your suggestions!

How to loop function using a list of variables?

I have a function that prints OHLCV data for stock prices from a websocket. It works, but I have to copy it for each variable (var1 to var14) to get each individual stock's data. How would I automate this process, given that I have the list:
varlist = [var1, var2, var3, ..., var14]
and my code is:
def process_messages_for_var1(msg):
    if msg['e'] == 'error':
        print(msg['m'])
    # If message is a trade, print the OHLC data
    else:
        # Convert time into understandable structure
        transactiontime = msg['k']['T'] / 1000
        transactiontime = datetime.fromtimestamp(transactiontime).strftime('%d %b %Y %H:%M:%S')
        # Process this message once websocket starts
        print("{} - {} - Interval {} - Open: {} - Close: {} - High: {} - Low: {} - Volume: {}".
              format(transactiontime, msg['s'], msg['k']['i'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v']))
        # Also, put information into an array
        kline_array_msg = "{},{},{},{},{},{}".format(
            msg['k']['T'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v'])
        # Insert at first position
        kline_array_dct[var1].insert(0, kline_array_msg)
        if len(kline_array_dct[var1]) > window:
            # Remove last message when res_array size is > of FIXED_SIZE
            del kline_array_dct[var1][-1]
I'm hoping to get the following result (notice how the function name also changes):
def process_messages_for_var2(msg):
    if msg['e'] == 'error':
        print(msg['m'])
    # If message is a trade, print the OHLC data
    else:
        # Convert time into understandable structure
        transactiontime = msg['k']['T'] / 1000
        transactiontime = datetime.fromtimestamp(transactiontime).strftime('%d %b %Y %H:%M:%S')
        # Process this message once websocket starts
        print("{} - {} - Interval {} - Open: {} - Close: {} - High: {} - Low: {} - Volume: {}".
              format(transactiontime, msg['s'], msg['k']['i'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v']))
        # Also, put information into an array
        kline_array_msg = "{},{},{},{},{},{}".format(
            msg['k']['T'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v'])
        # Insert at first position
        kline_array_dct[var2].insert(0, kline_array_msg)
        if len(kline_array_dct[var2]) > window:
            # Remove last message when res_array size is > of FIXED_SIZE
            del kline_array_dct[var2][-1]
You can adjust the function so that it takes one of the vars as an argument. I.e.,
def process_messages(msg, var):
    ...
    kline_array_dct[var].insert(0, kline_array_msg)
    if len(kline_array_dct[var]) > window:
        # Remove last message when res_array size is > of FIXED_SIZE
        del kline_array_dct[var][-1]
If the processes are generally the same, just define one of them and give it an extra argument. Then adjust your process code to run through each var when you call it, by removing the numbered vars from the process code:
def process_messages(msg, var):
    if msg['e'] == 'error':
        print(msg['m'])
    # If message is a trade, print the OHLC data
    else:
        # Convert time into understandable structure
        transactiontime = msg['k']['T'] / 1000
        transactiontime = datetime.fromtimestamp(transactiontime).strftime('%d %b %Y %H:%M:%S')
        # Process this message once websocket starts
        print("{} - {} - Interval {} - Open: {} - Close: {} - High: {} - Low: {} - Volume: {}".
              format(transactiontime, msg['s'], msg['k']['i'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v']))
        # Also, put information into an array
        kline_array_msg = "{},{},{},{},{},{}".format(
            msg['k']['T'], msg['k']['o'], msg['k']['c'], msg['k']['h'], msg['k']['l'], msg['k']['v'])
        # Insert at first position
        kline_array_dct[var].insert(0, kline_array_msg)
        if len(kline_array_dct[var]) > window:
            # Remove last message when res_array size is > of FIXED_SIZE
            del kline_array_dct[var][-1]
Then create a simple for loop to call the process for each var in the list (here msg is whatever message you are currently handling):
for var in varlist:
    process_messages(msg, var)
The for loop will call the process once for each var in the list.
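If your websocket client instead expects a one-argument callback per stream, a common pattern is to bind var ahead of time with functools.partial. A minimal sketch (the start_kline_socket registration call is a hypothetical placeholder for whatever your client provides):
from functools import partial

for var in varlist:
    callback = partial(process_messages, var=var)  # callback(msg) now calls process_messages(msg, var=var)
    # e.g. socket_manager.start_kline_socket(symbol=var, callback=callback)  # hypothetical registration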

How can I speed up a python loop with a timestamp interval condition

I have this code, which was rather done in a hurry, but it works in general. The only problem is that it runs forever. The idea is to update two columns on a table holding 1,495,748 rows, which is where the list of timestamps queried in the first step comes from. For each update value there has to be a comparison in which the timestamp must fall within an hourly interval formed by two timestamps coming from the API in two different dicts. Is there a way to speed things up a little, or maybe to multiprocess it?
Note: db_mac is a connection to a Postgres database.
The response looks like this:
{'meta': {'source': 'National Oceanic and Atmospheric Administration, Deutscher Wetterdienst'}, 'data': [{'time': '2019-11-26 23:00:00', 'time_local': '2019-11-27 00:00', 'temperature': 8.3, 'dewpoint': 5.9, 'humidity': 85, 'precipitation': 0, 'precipitation_3': None, 'precipitation_6': None, 'snowdepth': None, 'windspeed': 11, 'peakgust': 21, 'winddirection': 160, 'pressure': 1004.2, 'condition': 4}, {'time': '2019-11-27 00:00:00', ....
import requests
import db_mac
from collections import defaultdict
import datetime
import time

t = time.time()
station = [10382, "DE", "Berlin / Tegel", 52.5667, 13.3167, 37, "EDDT", 10382, "TXL", "Europe/Berlin"]
dates = [("2019-11-20", "2019-11-22"), ("2019-11-27", "2019-12-02")]
insert_dict = defaultdict(tuple)
hist_weather_list = []
for d in dates:
    end = d[1]
    start = d[0]
    print(start, end)
    url = "https://api.meteostat.net/v1/history/hourly?station={station}&start={start}&end={end}&time_zone={timezone}&&time_format=Y-m-d%20H:i&key=<APIKEY>".format(station=station[0], start=start, end=end, timezone=station[-1])
    response = requests.get(url)
    weather = response.json()
    print(weather)
    for i in weather["data"]:
        hist_weather_list.append(i)
sql = "select timestamp from dump order by timestamp asc"
result = db_mac.execute(sql)
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step1 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
for row in result:
    try:
        ts_dump = datetime.datetime.timestamp(row[0])
        for i, hour in enumerate(hist_weather_list):
            ts1 = datetime.datetime.timestamp(datetime.datetime.strptime(hour["time"], '%Y-%m-%d %H:%M:%S'))
            ts2 = datetime.datetime.timestamp(datetime.datetime.strptime(hist_weather_list[i + 1]["time"], '%Y-%m-%d %H:%M:%S'))
            if ts1 <= ts_dump and ts_dump < ts2:
                insert_dict[row[0]] = (hour["temperature"], hour["pressure"])
    except Exception as e:
        pass
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step2 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
for key, value in insert_dict.items():
    sql2 = """UPDATE dump SET temperature = """ + str(value[0]) + """, pressure = """ + str(value[1]) + """ WHERE timestamp = '""" + str(key) + """';"""
    db_mac.execute(sql2)
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step3 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
UPDATE: the code, rewritten for multiprocessing. I'll let it run overnight and give an update on the running time.
import requests
import db_mac
from collections import defaultdict
import datetime
import time
import multiprocessing as mp

t = time.time()
station = [10382, "DE", "Berlin / Tegel", 52.5667, 13.3167, 37, "EDDT", 10382, "TXL", "Europe/Berlin"]
dates = [("2019-11-20", "2019-11-22"), ("2019-11-27", "2019-12-02")]
insert_dict = defaultdict(tuple)
hist_weather_list = []
for d in dates:
    end = d[1]
    start = d[0]
    print(start, end)
    url = "https://api.meteostat.net/v1/history/hourly?station={station}&start={start}&end={end}&time_zone={timezone}&&time_format=Y-m-d%20H:i&key=<APIKEY>".format(station=station[0], start=start, end=end, timezone=station[-1])
    response = requests.get(url)
    weather = response.json()
    print(weather)
    for i in weather["data"]:
        hist_weather_list.append(i)
sql = "select timestamp from dump order by timestamp asc"
result = db_mac.execute(sql)
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step1 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))

def find_parameters(x):
    for row in result[x[0]:x[1]]:
        try:
            ts_dump = datetime.datetime.timestamp(row[0])
            for i, hour in enumerate(hist_weather_list):
                ts1 = datetime.datetime.timestamp(datetime.datetime.strptime(hour["time"], '%Y-%m-%d %H:%M:%S'))
                ts2 = datetime.datetime.timestamp(datetime.datetime.strptime(hist_weather_list[i + 1]["time"], '%Y-%m-%d %H:%M:%S'))
                if ts1 <= ts_dump and ts_dump < ts2:
                    insert_dict[row[0]] = (hour["temperature"], hour["pressure"])
        except Exception as e:
            pass

step1 = int(len(result) / 4)
step2 = 2 * step1
step3 = 3 * step1
step4 = len(result)
steps = [[0, step1], [step1, step2], [step2, step3], [step3, step4]]
pool = mp.Pool(mp.cpu_count())
# Caveat: each worker process gets its own copy of insert_dict, so mutations made
# inside find_parameters do not propagate back to the parent process.
pool.map(find_parameters, steps)
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step2 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
for key, value in insert_dict.items():
    sql2 = """UPDATE dump SET temperature = """ + str(value[0]) + """, pressure = """ + str(value[1]) + """ WHERE timestamp = '""" + str(key) + """';"""
    db_mac.execute(sql2)
hours, rem = divmod(time.time() - t, 3600)
minutes, seconds = divmod(rem, 60)
print("step3 {:0>2}:{:0>2}:{:05.2f}".format(int(hours), int(minutes), seconds))
UPDATE 2
It finished and ran for 2:45 hours on 4 cores of a Raspberry Pi. Still, is there a more efficient way to do such things?
There are a few minor things I can think of to speed this up a little, and every little bit helps, especially if you have a lot of rows to process. For starters, print statements can slow down your code a lot; I'd get rid of them if they are unneeded.
Most importantly, you are calling the API on every iteration of your dates loop. Waiting for a response from the API is probably taking up the bulk of your time. I looked a bit at the API you are using, but I don't know your exact use case or what your "start" and "end" dates look like; if you could fetch the data in fewer calls, that would surely speed things up a lot. It also looks like the API has a .csv version of the data that you can download and use; running this on local data would be far faster. If you choose to go that route, I'd suggest using pandas (sorry if you already know pandas and I'm over-explaining). You can use df = pd.read_csv("filename.csv") and edit the table from there easily, and df.to_sql(params) to write to your database. Let me know if you want help forming a pandas version of this code; a rough sketch follows.
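Here is that sketch of the pandas route (the CSV file name, its column names, and the SQLAlchemy engine are all assumptions about your setup):
import pandas as pd
from sqlalchemy import create_engine

engine = create_engine("postgresql://user:password@localhost/dbname")  # hypothetical connection string

# local copy of the hourly weather data, sorted by time
weather_df = pd.read_csv("weather_hourly.csv", parse_dates=["time"]).sort_values("time")
dump_df = pd.read_sql("SELECT timestamp FROM dump ORDER BY timestamp", con=engine)

# merge_asof matches each dump timestamp to the most recent weather hour at or
# before it, replacing the O(rows * hours) nested loop with one sorted merge
merged = pd.merge_asof(dump_df, weather_df[["time", "temperature", "pressure"]],
                       left_on="timestamp", right_on="time")
From there you could write merged to a staging table with to_sql and update dump with a single SQL join, instead of one UPDATE per row.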
Also, I'm not sure from your code whether this would cause an error, but instead of your for loop (for i in weather["data"]) I would try:
hist_weather_list += weather["data"]
or possibly
hist_weather_list += [weather["data"]]
Let me know how it goes!

How to get the latest offset from each partition using kafka-python?

I'm trying to get the latest offset (not committed offset) from each partition for a given topic.
from kafka import KafkaConsumer, TopicPartition

topic = 'test-topic'
broker = 'localhost:9092'
consumer = KafkaConsumer(bootstrap_servers=broker)
tp = TopicPartition(topic, 0)       #1
consumer.assign([tp])               #2
consumer.seek_to_end(tp)            #3
last_offset = consumer.position(tp) #4
for i in consumer.partitions_for_topic(topic):
    tp = TopicPartition(topic, i)
    consumer.assign([tp])
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    print(last_offset)
The preceding code works and prints the offset of each partition. However, notice how I have the same 4 lines outside of the loop as well as inside it. If I remove any of the lines #1-#4 (the 4 lines directly preceding the for loop), I get the error:
File "check_kafka_offset.py", line 19, in <module>
    for i in consumer.partitions_for_topic(topic):
TypeError: 'NoneType' object is not iterable
Why do I need to have the 4 lines before the for loop?
You can use the end_offsets(partitions) function in that client to get the last offset for the partitions specified. Note that the returned offset is the next offset to be written, that is, the offset of the last available message plus one. See the kafka-python documentation for details.
Edit: Example implementation:
from kafka import KafkaProducer, KafkaConsumer, TopicPartition
from kafka.errors import KafkaError
import json
import logging

logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)

BOOTSTRAP = """cluster:9092"""
API_KEY = """redacted"""
API_SECRET = """redacted"""
TOPIC = "python-test"

consumer = KafkaConsumer(
    group_id="my-group",
    bootstrap_servers=[BOOTSTRAP],
    security_protocol="SASL_SSL",
    sasl_mechanism="PLAIN",
    sasl_plain_username=API_KEY,
    sasl_plain_password=API_SECRET,
    value_deserializer=lambda m: json.loads(m.decode('ascii')),
    auto_offset_reset='earliest'
)

PARTITIONS = []
for partition in consumer.partitions_for_topic(TOPIC):
    PARTITIONS.append(TopicPartition(TOPIC, partition))

end_offsets = consumer.end_offsets(PARTITIONS)
print(end_offsets)
and end_offsets looks like this:
{TopicPartition(topic=u'python-test', partition=0): 5,
TopicPartition(topic=u'python-test', partition=1): 20,
TopicPartition(topic=u'python-test', partition=2): 0}
Here is a simple and well-documented function:
from kafka import TopicPartition

def getTopicInfos(consumer, topic: str):
    """
    Get a topic's information, such as its partitions with their last offsets.
    Example of result: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    - Parameters:
        consumer: A Kafka consumer.
        topic: A topic name.
    - Return:
        The topic's information.
    """
    # Get topic-partition pairs
    # E.g.: [TopicPartition(topic='myTopic', partition=0), TopicPartition(topic='myTopic', partition=1)]
    tp = [TopicPartition(topic, partition) for partition in consumer.partitions_for_topic(topic)]
    # Get the last offset of each partition
    # E.g.: {TopicPartition(topic='myTopic', partition=0): 47, TopicPartition(topic='myTopic', partition=1): 98}
    tplo = consumer.end_offsets(tp)
    # Format partition-lastOffset pairs
    # E.g.: ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']
    plo = ['{' + f'"partition": {item.partition}, "lastOffset": {tplo.get(item)}' + '}' for item in tplo]
    # Concat the topic with the partition-lastOffset pairs and return the result
    # E.g.: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    return {"topic": topic, "partitions": plo}
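Example usage (the broker address and topic name here are assumptions):
from kafka import KafkaConsumer

consumer = KafkaConsumer(bootstrap_servers="localhost:9092")  # assumed local broker
print(getTopicInfos(consumer, "test-topic"))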

Operation timed out error in Cassandra cluster

My cluster size is 6 machines, and I often receive this error message, which I don't really know how to solve:
code=1100 [Coordinator node timed out waiting for replica nodes' responses] message="Operation timed out - received only 0 responses." info={'received_responses': 0, 'required_responses': 1, 'consistency': 'LOCAL_ONE'}
This is my complete code; the part of the code where the error message occurs is this:
batch.add(schedule_remove_stmt, (source, type, row['scheduled_for'], row['id']))
session.execute(batch, 30)
Complete code:
import logging
from datetime import datetime, timedelta
from cassandra.cluster import Cluster
from cassandra.query import BatchStatement

log = logging.getLogger(__name__)

cluster = Cluster(['localhost'])
session = cluster.connect('keyspace')
d = datetime.utcnow()
scheduled_for = d.replace(second=0, microsecond=0)
rowid = []
stmt = session.prepare('SELECT * FROM schedules WHERE source=? AND type= ? AND scheduled_for = ?')
schedule_remove_stmt = session.prepare("DELETE FROM schedules WHERE source = ? AND type = ? AND scheduled_for = ? AND id = ?")
schedule_insert_stmt = session.prepare("INSERT INTO schedules(source, type, scheduled_for, id) VALUES (?, ?, ?, ?)")
schedules_to_delete = []
articles = {}
source = ''
type = ''
try:
    rows = session.execute(stmt, [source, type, scheduled_for])
    article_schedule_delete = ''
    for row in rows:
        schedules_to_delete.append({'id': row.id, 'scheduled_for': row.scheduled_for})
        article_schedule_delete = article_schedule_delete + '\'' + row.id + '\','
        rowid.append(row.id)
    article_schedule_delete = article_schedule_delete[0:-1]
    cql = 'SELECT * FROM articles WHERE id in (%s)' % article_schedule_delete
    articles_row = session.execute(cql)
    for row in articles_row:
        articles[row.id] = row.created_at
except Exception as e:
    print(e)
    log.info('select error is:%s' % e)
try:
    for row in schedules_to_delete:
        batch = BatchStatement()
        batch.add(schedule_remove_stmt, (source, type, row['scheduled_for'], row['id']))
        try:
            if row['id'] in articles.keys():
                next_schedule = d
                elapsed = datetime.utcnow() - articles[row['id']]
                if elapsed <= timedelta(hours=1):
                    next_schedule += timedelta(minutes=6)
                elif elapsed <= timedelta(hours=3):
                    next_schedule += timedelta(minutes=18)
                elif elapsed <= timedelta(hours=6):
                    next_schedule += timedelta(minutes=36)
                elif elapsed <= timedelta(hours=12):
                    next_schedule += timedelta(minutes=72)
                elif elapsed <= timedelta(days=1):
                    next_schedule += timedelta(minutes=144)
                elif elapsed <= timedelta(days=3):
                    next_schedule += timedelta(minutes=432)
                elif elapsed <= timedelta(days=30):
                    next_schedule += timedelta(minutes=1440)
                if not next_schedule == d:
                    batch.add(schedule_insert_stmt, (source, type, next_schedule.replace(second=0, microsecond=0), row['id']))
                    # log.info('schedule id:%s' % row['id'])
        except Exception as e:
            print('key error:', e)
            log.info('HOW IT CHANGES %s %s %s %s ERROR:%s' % (source, type, next_schedule.replace(second=0, microsecond=0), row['id'], e))
        session.execute(batch, 30)
except Exception as e:
    print('schedules error is =======================>', e)
    log.info('schedules error is:%s' % e)
Thanks a lot for the help; I really don't know how to solve this!
I think you shouldn't use a batch statement in this case, because you are trying to use the batch to perform a big number of operations for different partition keys, and that leads to timeout exceptions. You should use batches to keep tables in sync, not for performance optimization.
You can find more about misusing batches in this article.
Using an asynchronous driver API is more suitable for performing a lot of delete queries in your case. It will let your code keep its performance and avoid overloading the coordinator.
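As a sketch of that route with the DataStax Python driver's concurrent execution helper (reusing schedule_remove_stmt and schedules_to_delete from the code above; the concurrency value is only an example):
from cassandra.concurrent import execute_concurrent_with_args

# run the single-partition deletes concurrently instead of batching them together
params = [(source, type, row['scheduled_for'], row['id']) for row in schedules_to_delete]
results = execute_concurrent_with_args(session, schedule_remove_stmt, params, concurrency=50)
for success, result_or_exc in results:
    if not success:
        log.info('delete failed: %s' % result_or_exc)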
