How to get the latest offset from each partition using kafka-python? - python

I'm trying to get the latest offset (not committed offset) from each partition for a given topic.
from kafka import KafkaConsumer, TopicPartition
topic = 'test-topic'
broker = 'localhost:9092'
consumer = KafkaConsumer(bootstrap_servers=broker)
tp = TopicPartition(topic, 0) #1
consumer.assign([tp]) #2
consumer.seek_to_end(tp) #3
last_offset = consumer.position(tp) #4
for i in consumer.partitions_for_topic(topic):
    tp = TopicPartition(topic, i)
    consumer.assign([tp])
    consumer.seek_to_end(tp)
    last_offset = consumer.position(tp)
    print(last_offset)
The preceding code does work and prints the offset of each partition. However, notice how I have the same 4 lines outside of the loop as well as inside of the loop. If I remove any of the lines #1 - #4 (the 4 lines directly preceding the for loop) I get the error:
File "check_kafka_offset.py", line 19, in
for i in consumer.partitions_for_topic(topic):
TypeError: 'NoneType' object is not iterable
Why do I need to have the 4 lines before the for loop?

You can use the end_offsets(partitions) method of the consumer to get the last offset for the specified partitions. Note that the returned offset is the next offset to be written, i.e. the offset of the last message plus one (see the kafka-python documentation for end_offsets).
Edit: Example implementation:
from kafka import KafkaProducer, KafkaConsumer, TopicPartition
from kafka.errors import KafkaError
import json
import logging
logging.basicConfig(level=logging.DEBUG)
logger = logging.getLogger(__name__)
BOOTSTRAP="""cluster:9092"""
API_KEY="""redacted"""
API_SECRET="""redacted"""
TOPIC="python-test"
consumer = KafkaConsumer(
    group_id="my-group",
    bootstrap_servers=[BOOTSTRAP],
    security_protocol="SASL_SSL",
    sasl_mechanism="PLAIN",
    sasl_plain_username=API_KEY,
    sasl_plain_password=API_SECRET,
    value_deserializer=lambda m: json.loads(m.decode('ascii')),
    auto_offset_reset='earliest'
)
PARTITIONS = []
for partition in consumer.partitions_for_topic(TOPIC):
    PARTITIONS.append(TopicPartition(TOPIC, partition))
end_offsets = consumer.end_offsets(PARTITIONS)
print(end_offsets)
and end_offsets looks like this:
{TopicPartition(topic=u'python-test', partition=0): 5,
TopicPartition(topic=u'python-test', partition=1): 20,
TopicPartition(topic=u'python-test', partition=2): 0}

Here is a simple and well-documented function:
from kafka import TopicPartition
def getTopicInfos(consumer, topic: str):
    """
    Get a topic's information, such as its partitions with their last offsets.
    Example of result: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    - Parameters:
        consumer: A Kafka consumer.
        topic: A topic name.
    - Return:
        The topic's information.
    """
    # Get topic-partition pairs
    # E.g.: [TopicPartition(topic='myTopic', partition=0), TopicPartition(topic='myTopic', partition=1)]
    tp = [TopicPartition(topic, partition) for partition in consumer.partitions_for_topic(topic)]
    # Get last offsets
    # E.g.: {TopicPartition(topic='myTopic', partition=0): 47, TopicPartition(topic='myTopic', partition=1): 98}
    tplo = consumer.end_offsets(tp)
    # Format partition-lastOffset pairs
    # E.g.: ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']
    plo = ['{' + f'"partition": {item.partition}, "lastOffset": {tplo.get(item)}' + '}' for item in tplo]
    # Combine the topic with its partition-lastOffset pairs
    # E.g.: {'topic': 'myTopic', 'partitions': ['{"partition": 0, "lastOffset": 47}', '{"partition": 1, "lastOffset": 98}']}
    tplo = {"topic": topic, "partitions": plo}
    # Return the result
    return tplo
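For example, it could be called like this (a minimal sketch; the broker address and topic name are placeholders):
from kafka import KafkaConsumer

# Placeholder connection details; replace with your own broker and topic.
consumer = KafkaConsumer(bootstrap_servers='localhost:9092')
print(getTopicInfos(consumer, 'test-topic'))
# e.g. {'topic': 'test-topic', 'partitions': ['{"partition": 0, "lastOffset": 47}']}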

Related

Multiprocessing and relationship traversal?

I am trying to implement multiprocessing to speed up traversing a relationship graph. I want to capture items that have a total less than 1000. If the parent is over 1000, process the children until there's no more to check.
I've mocked up an illustration that shows that ThreadPoolExecutor only processes the initial items provided to the class, even though class.search_queue_list is still populated. I also tried using a Queue instead of a list, with similar results. Synchronous processing works as expected for both the list and the Queue. Is there a way to make multiprocessing work here when the initial array of items can change?
from concurrent.futures import ThreadPoolExecutor
from queue import Queue
from time import sleep

dummy_data = {
    'id1': {'total': 1001, 'children': ['id101', 'id102']},  # over 1000, children will be processed
    'id2': {'total': 999, 'children': ['id201', 'id202']},   # under 1000, children won't be processed
    'id101': {'total': 501, 'children': ['more_children']},
    'id102': {'total': 500, 'children': ['more_children']},
    'id201': {'total': 499, 'children': ['more_children']},
    'id202': {'total': 500, 'children': ['more_children']},
}

class SearchDummy(object):
    def __init__(self, start_list):
        # with list
        self.search_queue_list = start_list
        # with Queue
        self.search_queue_queue = Queue()
        for item in self.search_queue_list:
            self.search_queue_queue.put(item)
        self.good_ids = []

    def get_total(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['total']

    def get_children(self, search_id):
        # artificial delay
        sleep(0.5)
        return dummy_data[search_id]['children']

    # START LIST
    def current_check_list(self):
        # get first element in search_queue_list
        current_id = self.search_queue_list.pop(0)
        # check if current_id['total'] is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # add children to search_queue_list
            self.search_queue_list.extend(self.get_children(current_id))

    def search_list(self):
        while self.search_queue_list:
            self.current_check_list()

    def multi_search_list(self):
        with ThreadPoolExecutor() as e:
            while self.search_queue_list:
                e.submit(self.current_check_list)
    # END LIST

    # START QUEUE
    def current_check_queue(self):
        # get item from search_queue_queue
        current_id = self.search_queue_queue.get()
        # check if current_id['total'] is over 1000
        if self.get_total(current_id) <= 1000:
            self.good_ids.append(current_id)
        else:
            # put children in search_queue_queue
            for child in self.get_children(current_id):
                self.search_queue_queue.put(child)

    def search_queue(self):
        while not self.search_queue_queue.empty():
            self.current_check_queue()

    def multi_search_queue(self):
        with ThreadPoolExecutor() as e:
            while not self.search_queue_queue.empty():
                e.submit(self.current_check_queue)
    # END QUEUE

# synchronous list
s = SearchDummy(['id1', 'id2'])
s.search_list()
print('List output', s.good_ids)  # returns ['id101', 'id102', 'id2']
print('Remaining list size', len(s.search_queue_list))  # returns 0

# synchronous queue
s = SearchDummy(['id1', 'id2'])
s.search_queue()
print('Queue output', s.good_ids)  # returns ['id101', 'id102', 'id2']
print('Remaining queue size', s.search_queue_queue.qsize())  # returns 0

# multiprocessing list
s = SearchDummy(['id1', 'id2'])
s.multi_search_list()
print('Multi list output', s.good_ids)  # returns ['id2']
print('Multi list remaining', s.search_queue_list)  # returns ['id101', 'id102']

# multiprocessing queue
s = SearchDummy(['id1', 'id2'])
s.multi_search_queue()
print('Multi queue output', s.good_ids)  # returns ['id2']
print('Multi queue remaining', list(s.search_queue_queue.queue))  # returns ['id101', 'id102']
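One way to make the threaded list version pick up the children that workers add is to track the submitted futures and keep looping while either the queue or the in-flight futures are non-empty. Below is a rough sketch of that approach (SearchDummyTracked, check_one and multi_search_tracked are hypothetical names, not part of the original code): ids are popped in the main thread, and each worker only classifies a single id and returns either the id or its children.
from concurrent.futures import FIRST_COMPLETED, ThreadPoolExecutor, wait

class SearchDummyTracked(SearchDummy):
    def check_one(self, search_id):
        # Classify a single id: return the id itself if it is "good",
        # otherwise return its children for further processing.
        if self.get_total(search_id) <= 1000:
            return ('good', search_id)
        return ('children', self.get_children(search_id))

    def multi_search_tracked(self):
        with ThreadPoolExecutor() as e:
            pending = set()
            while self.search_queue_list or pending:
                # Pop ids in the main thread and hand each one to a worker.
                while self.search_queue_list:
                    pending.add(e.submit(self.check_one, self.search_queue_list.pop(0)))
                # Wait for at least one worker to finish, then fold its result
                # back into the shared state before looping again.
                done, pending = wait(pending, return_when=FIRST_COMPLETED)
                for future in done:
                    kind, value = future.result()
                    if kind == 'good':
                        self.good_ids.append(value)
                    else:
                        self.search_queue_list.extend(value)

s = SearchDummyTracked(['id1', 'id2'])
s.multi_search_tracked()
print('Tracked output', sorted(s.good_ids))      # expected: ['id101', 'id102', 'id2']
print('Tracked remaining', s.search_queue_list)  # expected: []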

How to get rid of the rest of the text after getting the results I want?

import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    coauthors_dict = {}
    url_str = ('https://api.semanticscholar.org/graph/v1/author/47490276?fields=name,papers.authors')
    respons = urllib.request.urlopen(url_str)
    text = respons.read().decode()
    for line in respons:
        print(line.decode().rstrip())
    data = json.loads(text)
    print(type(data))
    print(list(data.keys()))
    print(data["name"])
    print(data["authorId"])
    name = []
    for lines in data["papers"]:
        for authors in lines["authors"]:
            name.append(authors.get("name"))
    print(name)
    count = dict()
    names = name
    for i in names:
        if i not in count:
            count[i] = 1
        else:
            count[i] += 1
    print(count)
    c = Counter(count)
    top = c.most_common(10)
    print(top)
    return coauthors_dict

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc.items(), key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
This is how my code looks so far; there are no errors. I need to get rid of the rest of the text when I run it, so the output should look like this:
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
I have tried using rstrip and split on my 'c' variable, but it doesn't work. I'm only allowed to import what I have already imported, and I must use the link included above.
Tips on simplifying or improving the code are also appreciated!
("Extend the program below so that it prints the names of the top-10 coauthors together with the numbers of the coauthored publications")
From what I understand, you are not quite sure where your successful output originates from. It is not the five lines at the end.
Your result is printed by the print(top) call near the end of count_coauthors. This top variable is what you want to return from the function, as the coauthors_dict you are currently returning never actually gets any data written to it.
You will also have to slightly adjust your sorted(...) call, as you will then have a list and not a dictionary, but with that you should get the correct result.
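A minimal sketch of those two adjustments (only the end of the function and the call site change; everything else stays as in your code):
    # ...inside count_coauthors, replace the final lines with:
    c = Counter(count)
    top = c.most_common(10)  # already a list of (name, count) pairs, most frequent first
    print(top)
    return top               # return the printed data instead of the empty coauthors_dict

# At the call site, cc is now a list of tuples, so drop .items():
cc = count_coauthors(author_id)
top_coauthors = sorted(cc, key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)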
If I understand correctly, you want this function to return a count of each distinct co-author (excluding the author), which it seems like you already have in your count variable, which you don't return. The variable you DO return is empty.
Instead consider:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    # The statement above can be written long-hand like:
    # names = []
    # for l in data["papers"]:
    #     for a in l["authors"]:
    #         if a['authorId'] != author_id:
    #             names.append(a.get("name"))
    return list(Counter(names).items())

author_id = '47490276'
cc = count_coauthors(author_id)
top_coauthors = sorted(cc, key=lambda item: item[1], reverse=True)
for co_author in top_coauthors[:10]:
    print(co_author)
('Diego Calvanese', 47)
('D. Lanti', 28)
('Martín Rezk', 21)
('Elem Güzel Kalayci', 18)
('B. Cogrel', 17)
('E. Botoeva', 16)
('E. Kharlamov', 16)
('I. Horrocks', 12)
('S. Brandt', 11)
('V. Ryzhikov', 11)
You might also consider moving the top-N logic into the function as an optional parameter:
import urllib.request
import json
from collections import Counter

def count_coauthors(author_id, top=0):
    url_str = (f'https://api.semanticscholar.org/graph/v1/author/{author_id}?fields=name,papers.authors')
    response = urllib.request.urlopen(url_str)
    text = response.read().decode()
    data = json.loads(text)
    names = [a.get("name") for l in data["papers"] for a in l["authors"] if a['authorId'] != author_id]
    name_count = list(Counter(names).items())
    top = top if top != 0 else len(name_count)
    return sorted(name_count, key=lambda x: x[1], reverse=True)[:top]

author_id = '47490276'
for auth in count_coauthors(author_id, top=10):
    print(auth)

Generating a nested dictionary in Python through iterations

I'm new to Python. I have to retrieve data from a txt file (which I have already done), and then I need to build a nested dictionary like this:
new_dict = {"2009-10-16": {"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}
"2009-10-24":{"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}
"2009-11-13":{"KitchenSensor":"active for x minutes today",
"BathroomSensor":"active for y minutes today"...}}
my code looks like this
namesFile = open("data.txt", "r")
listaDati = namesFile.readlines()
listaDivisa = []
for i in listaDati:
    if i[27] != "T":
        listaDivisa.append(i.split())
and the data in my txt file has this format:
2009-10-16 00:01:04.000059 KITCHENSENSOR ON
2009-10-16 02:33:12.000093 KITCHENSENSOR OFF
2009-10-24 21:25:52.000023 BATHROOMSENSOR ON
2009-10-24 23:13:52.000014 BATHROOMSENSOR OFF
2009-11-13 09:03:23.000053 BATHROOMSENSOR ON
2009-11-13 12:13:42.000014 BATHROOMSENSOR OFF
The timestamp changes every now and then, so I want to create a new key each time I meet a new one and save the information I need under it. I was trying to do this with an enumerate for loop, but I don't understand how I can create the dictionary.
Thank you!
You're maybe looking for something like this; I separated the task into:
- parsing the input lines (they could come from a file, but here they're just a list) into events (3-tuples of datetime, sensor name, and state), and
- grouping the events by date and looking at the state changes.
import datetime
from itertools import groupby

def parse_line(line):
    # Split the line into its date, time, sensor and state fields.
    date_string, time_string, sensor, event = line.split(" ", 3)
    # Parse the date and time strings into a real datetime object.
    t = datetime.datetime.strptime(f"{date_string} {time_string}", "%Y-%m-%d %H:%M:%S.%f")
    return (t, sensor, event == "ON")

def collate_sorted_events(sorted_events):
    zero_delta = datetime.timedelta(0)
    for day, events in groupby(
        sorted_events, lambda event_triple: event_triple[0].date()
    ):
        # We're assuming all sensors start off each day.
        turn_on_times = {}
        durations = {}
        for time, sensor, state in events:
            if state:  # Turning on?
                # If it was on already, that's not an issue; we just consider that a glitch.
                if sensor not in turn_on_times:
                    turn_on_times[sensor] = time
            else:
                if sensor not in turn_on_times:
                    raise ValueError("Sensor was turned off before it was turned on.")
                this_duration = time - turn_on_times[sensor]
                durations[sensor] = durations.get(sensor, zero_delta) + this_duration
                del turn_on_times[sensor]
        yield (day, durations)
        if turn_on_times:
            # This check could be removed, but for now it's a good sanity check...
            raise ValueError(
                "Some sensors were left on at the end of the day; this could be a problem"
            )

listaDati = [
    "2009-10-16 00:01:04.000059 KITCHENSENSOR ON",
    "2009-10-16 02:33:12.000093 KITCHENSENSOR OFF",
    "2009-10-24 21:25:52.000023 BATHROOMSENSOR ON",
    "2009-10-24 23:13:52.000014 BATHROOMSENSOR OFF",
    "2009-11-13 09:03:23.000053 BATHROOMSENSOR ON",
    "2009-11-13 12:13:42.000014 BATHROOMSENSOR OFF",
]

# Parse and sort input lines. It's imperative that the events are sorted
# so the rest of the code works as it should.
sorted_events = sorted(parse_line(i) for i in listaDati)

# Collate events by day; the function yields day/durations tuples,
# and `dict` accepts that format to create a dict with.
output = dict(collate_sorted_events(sorted_events))

print(output)
for date, deltas in sorted(output.items()):
    for sensor, delta in sorted(deltas.items()):
        print(f"{date} {sensor} {delta.total_seconds() / 60:.2f} minutes")
The output is
{
datetime.date(2009, 10, 16): {'KITCHENSENSOR': datetime.timedelta(seconds=9128, microseconds=34)},
datetime.date(2009, 10, 24): {'BATHROOMSENSOR': datetime.timedelta(seconds=6479, microseconds=999991)},
datetime.date(2009, 11, 13): {'BATHROOMSENSOR': datetime.timedelta(seconds=11418, microseconds=999961)},
}
followed by the formatted lines:
2009-10-16 KITCHENSENSOR 152.13 minutes
2009-10-24 BATHROOMSENSOR 108.00 minutes
2009-11-13 BATHROOMSENSOR 190.32 minutes
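If you need the exact string-valued nested dictionary from your question, a minimal sketch that converts the output dict above (the wording and rounding of the values are assumptions):
new_dict = {
    str(date): {
        sensor: f"active for {delta.total_seconds() / 60:.0f} minutes today"
        for sensor, delta in deltas.items()
    }
    for date, deltas in output.items()
}
print(new_dict)
# e.g. {'2009-10-16': {'KITCHENSENSOR': 'active for 152 minutes today'}, ...}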

Get results from Kafka for a specific period of time

Here is my code, which uses kafka-python.
from datetime import datetime

from dateutil.relativedelta import relativedelta
from kafka import KafkaConsumer

now = datetime.now()
month_ago = now - relativedelta(months=1)

topic = 'some_topic_name'
consumer = KafkaConsumer(topic, bootstrap_servers=PROD_KAFKA_SERVER,
                         security_protocol=PROTOCOL,
                         group_id=GROUP_ID,
                         enable_auto_commit=False,
                         sasl_mechanism=SASL_MECHANISM, sasl_plain_username=SASL_USERNAME,
                         sasl_plain_password=SASL_PASSWORD)

for msg in consumer:
    print(msg)
I want to get only the results from the topic between month_ago and now in the loop. How can I do this?
Thanks for any help!
Get the topic partitions assigned to your consumer:
partitions = consumer.assignment()
Get offsets for partitions by datetime:
month_ago_timestamp = int(month_ago.timestamp() * 1000)
partition_to_timestamp = {part: month_ago_timestamp for part in partitions}
mapping = consumer.offsets_for_times(partition_to_timestamp)
Seek partitions to offsets:
for partition, offset_and_timestamp in mapping.items():
    consumer.seek(partition, offset_and_timestamp[0])
Warning! The consumer can return None, return an offset of zero, or block indefinitely in cases such as a missing topic, a missing partition, or messages without a timestamp.
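A minimal guard for the None case might look like this (a sketch; it reuses the mapping variable from the snippet above and simply skips to the end of any partition that has no message at or after the requested timestamp):
for partition, offset_and_timestamp in mapping.items():
    if offset_and_timestamp is None:
        # No message at or after month_ago in this partition; jump to its end instead.
        consumer.seek_to_end(partition)
    else:
        consumer.seek(partition, offset_and_timestamp.offset)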
Finally, I did it :) My code looks like this:
topic = 'some_topic_name'
consumer = KafkaConsumer(bootstrap_servers=PROD_KAFKA_SERVER,
                         security_protocol=PROTOCOL,
                         group_id=GROUP_ID,
                         sasl_mechanism=SASL_MECHANISM, sasl_plain_username=SASL_USERNAME,
                         sasl_plain_password=SASL_PASSWORD)

month_ago = (datetime.now() - relativedelta(months=1)).timestamp()
topic_partition = TopicPartition(topic, 0)
assigned_topic = [topic_partition]
consumer.assign(assigned_topic)

partitions = consumer.assignment()
partition_to_timestamp = {part: int(month_ago * 1000) for part in partitions}
end_offsets = consumer.end_offsets(list(partition_to_timestamp.keys()))

mapping = consumer.offsets_for_times(partition_to_timestamp)
for partition, ts in mapping.items():
    end_offset = end_offsets.get(partition)
    consumer.seek(partition, ts[0])
    for msg in consumer:
        value = json.loads(msg.value.decode('utf-8'))
        # do something
        if msg.offset == end_offset - 1:
            consumer.close()
            break

boto3 glue get_job_runs - check execution with certain date exists in the response object

I am trying to fetch Glue job executions that failed the previous day using the 'get_job_runs' function available through boto3's Glue client.
https://boto3.amazonaws.com/v1/documentation/api/latest/reference/services/glue.html#Glue.Client.get_job_runs.
The request syntax does not have an option to filter executions or job runs by date/status:
response = client.get_job_runs(
    JobName='string',
    NextToken='string',
    MaxResults=123
)
The response I receive looks something like this:
{
    "JobRuns": [
        {
            "Id": "jr_89bfa55b544f7eec4f6ea574dfb0345345uhi4df65e59869e93c5d8f5efef989",
            "Attempt": 0,
            "JobName": "GlueJobName",
            "StartedOn": datetime.datetime(2021, 1, 27, 4, 32, 47, 718000, tzinfo=tzlocal()),
            "LastModifiedOn": datetime.datetime(2021, 1, 27, 4, 36, 14, 975000, tzinfo=tzlocal()),
            "CompletedOn": datetime.datetime(2021, 1, 27, 4, 36, 14, 975000, tzinfo=tzlocal()),
            "JobRunState": "FAILED",
            "Arguments": {
                "--additional-python-modules": "awswrangler",
                "--conf": "spark.executor.memory=40g",
                "--conf ": "spark.driver.memory=40g",
                "--enable-spark-ui": "true",
                "--extra-py-files": "s3://GlueJobName/lttb.py",
                "--job-bookmark-option": "job-bookmark-disable",
                "--spark-event-logs-path": "s3://GlueJobName/glue-script/spark-event-logs"
            },
            "ErrorMessage": "MemoryError: Unable to allocate xxxxx",
            "PredecessorRuns": [],
            "AllocatedCapacity": 8,
            "ExecutionTime": 199,
            "Timeout": 2880,
            "MaxCapacity": 8.0,
            "WorkerType": "G.2X",
            "NumberOfWorkers": 4,
            "LogGroupName": "/aws-glue/jobs",
            "GlueVersion": "2.0"
        }
    ],
    "NextToken": "string"
}
So, what I am doing now is looping through the response object to check whether the "CompletedOn" date matches yesterday's date (prev_day, calculated with datetime and timedelta). I do this in a while loop to fetch the last 10,000 executions, since a single 'get_job_runs' call is not enough.
import logging

import boto3
from datetime import datetime, timedelta

logger = logging.getLogger()
logger.setLevel(logging.INFO)

glue_client = boto3.client("glue")

def filter_failed_exec_prev_day(executions, prev_day) -> list:
    filtered_resp = []
    for execution in executions['JobRuns']:
        if execution['JobRunState'] == 'FAILED' and execution['CompletedOn'].date() == prev_day:
            filtered_resp.append(execution)
    return filtered_resp

def get_final_executions() -> list:
    final_job_runs_list = []
    MAX_EXEC_SEARCH_CNT = 10000
    prev_day = (datetime.utcnow() - timedelta(days=1)).date()
    buff_exec_cnt = 0
    l_job = 'GlueJobName'
    response = glue_client.get_job_runs(
        JobName=l_job
    )
    resp_count = len(response['JobRuns'])
    if resp_count > 0:
        buff_exec_cnt += resp_count
        filtered_resp = filter_failed_exec_prev_day(response, prev_day)
        final_job_runs_list.extend(filtered_resp)
    while buff_exec_cnt <= MAX_EXEC_SEARCH_CNT:
        if 'NextToken' in response:
            response = glue_client.get_job_runs(
                JobName=l_job
            )
            buff_exec_cnt += len(response['JobRuns'])
            filtered_resp = filter_failed_exec_prev_day(response, prev_day)
            final_job_runs_list.extend(filtered_resp)
        else:
            logger.info(f"{l_job} executions list: {final_job_runs_list}")
            break
    return final_job_runs_list
Here, I am using a while loop to stop the calls after hitting 10K executions; this is triple the number of executions we see each day for this job.
Now, I am hoping to break the while loop once I encounter an execution that belongs to prev_day - 1. So, is it possible to search the entire response dict for prev_day - 1 to make sure all of the previous day's executions are covered, given the datetime.datetime object we receive from boto3 in the CompletedOn attribute?
Appreciate reading through.
Thank you
I looked at your code, and I think it might always return the same result, as you're not iterating through the result set correctly.
here:
while buff_exec_cnt <= MAX_EXEC_SEARCH_CNT:
    if 'NextToken' in response:
        response = glue_client.get_job_runs(
            JobName=l_job
        )
you need to pass the NextToken value to the get_job_runs method, like this:
response = glue_client.get_job_runs(
    JobName=l_job, NextToken=response['NextToken']
)
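Putting it together, a sketch of the full pagination loop might look like the code below. It reuses glue_client, filter_failed_exec_prev_day and the prev_day calculation from your code; the early-exit check assumes get_job_runs returns runs newest-first (descending by start time), so once the oldest run on a page completed before prev_day there is nothing left to collect.
def get_final_executions() -> list:
    final_job_runs_list = []
    prev_day = (datetime.utcnow() - timedelta(days=1)).date()
    l_job = 'GlueJobName'
    next_token = None
    while True:
        kwargs = {'JobName': l_job}
        if next_token:
            kwargs['NextToken'] = next_token
        response = glue_client.get_job_runs(**kwargs)
        runs = response['JobRuns']
        final_job_runs_list.extend(filter_failed_exec_prev_day(response, prev_day))
        # Stop once the oldest run on this page is older than prev_day
        # (guarding against runs that have no CompletedOn yet).
        oldest = runs[-1] if runs else None
        if oldest and oldest.get('CompletedOn') and oldest['CompletedOn'].date() < prev_day:
            break
        next_token = response.get('NextToken')
        if not next_token:
            break
    return final_job_runs_list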
