I have an async receive method for Event Hub written in Python, with checkpointing alongside it. The code was taken from the official Event Hubs samples repo on GitHub.
Problem- With the code below, I am only able to receive 20-35 messages per minute when I keep the receiver running all day, whereas my Event Hub ingests a lot of stream data (~200 messages per minute). Because of the poor throughput at the receiver's end, the enqueued time at the Event Hub now lags by 90 minutes: data enqueued at minute X is pulled out at minute X+90.
Investigation- I looked at the receive subclass in the Event Hubs Python SDK and came across a prefetch parameter (line 318) which defaults to 300. If it is already set to 300, I should be able to pull more than 30-35 messages per minute by default.
Any idea how I can increase my pull capacity? I'm stuck at this point with no direction forward; any help is highly appreciated.
EDIT 1-
I'm attaching my Python code below:
import asyncio
import json
import logging
import os
import sys
import time
from datetime import date
import requests
from azure.eventhub.aio import EventHubConsumerClient
from azure.eventhub.extensions.checkpointstoreblobaio import BlobCheckpointStore
import log_handler
import threading
import traceback
try:
    ## Set env variables
    CONNECTION_STR = os.environ["ECS"].strip()
    EVENTHUB_NAME = os.environ['EN'].strip()
    EVENTHUB_CONSUMER = os.environ["EC"].strip()
    API = os.environ['API_variable'].strip()
    AZURE_BLOB_CONNECTION_STR = os.environ["ACS"].strip()
    BLOB_CONTAINER_NAME = os.environ["BCN"].strip()
    BLOB_ACCOUNT_URL = os.environ["BAU"].strip()
    PREFETCH_COUNT = int(os.environ["PREFETCH_COUNT"])
    MAX_WAIT_TIME = float(os.environ["MAX_WAIT_TIME"])
except Exception as exception:
    logging.debug(traceback.format_exc())
    logging.warning(
        "*** Please check the environment variables for {}".format(str(exception)))
    sys.exit()
def API_CALL(event_data):
    """
    Sends the request to the API
    """
    try:
        url = event_data['image_url']
        payload = {"url": url}
        ## API call to the server
        service_response = requests.post(API, json=payload)
        logging.info(f"*** service_response.status_code : {service_response.status_code}")
        cloud_response = json.loads(
            service_response.text) if service_response.status_code == 200 else None
        today = date.today()
        response_date = today.strftime("%B %d, %Y")
        response_time = time.strftime("%H:%M:%S", time.gmtime())
        response_data = {
            "type": "response_data",
            "consumer_group": EVENTHUB_CONSUMER,
            'current_date': response_date,
            'current_time': response_time,
            'image_url': url,
            'status_code': service_response.status_code,
            'response': cloud_response,
            'api_response_time': int(service_response.elapsed.total_seconds()*1000),
            "eventhub_data": event_data
        }
        logging.info(f"*** response_data {json.dumps(response_data)}")
        logging.debug(f"*** response_data {json.dumps(response_data)}")
    except Exception as exception:
        logging.debug(traceback.format_exc())
        logging.error(
            "**** RaiseError: Failed request url %s, Root Cause of error: %s", url, exception)
async def event_operations(partition_context, event):
    start_time = time.time()
    data_ = event.body_as_str(encoding='UTF-8')
    json_data = json.loads(data_)
    ## forming data payload
    additional_data = {
        "type": "event_data",
        "consumer_group": EVENTHUB_CONSUMER,
        "image_name": json_data["image_url"].split("/")[-1]
    }
    json_data.update(additional_data)
    logging.info(f"*** Data fetched from EH : {json_data}")
    logging.debug(f"*** Data fetched from EH : {json_data}")
    API_CALL(json_data)
    logging.info(f"*** time taken to process an event(ms): {(time.time()-start_time)*1000}")
def between_callback(partition_context, event):
    """
    Create a fresh event loop in this thread and run event_operations to completion
    """
    loop = asyncio.new_event_loop()
    asyncio.set_event_loop(loop)
    loop.run_until_complete(event_operations(partition_context, event))
    loop.close()
async def on_event(partition_context, event):
    """
    Put your code here.
    Do some sync or async operations.
    If the operation is i/o intensive, async will have better performance.
    """
    t1 = time.time()
    _thread = threading.Thread(target=between_callback, args=(partition_context, event))
    _thread.start()
    logging.info(f"*** time taken to start a thread(ms): {(time.time()-t1)*1000}")
    logging.info("*** Fetching the next event")
    ## Update checkpoint per event
    t2 = time.time()
    await partition_context.update_checkpoint(event)
    logging.info(f"*** time taken to update checkpoint(ms): {(time.time()-t2)*1000}")
async def main(client):
    """
    Run the on_event method for each event received
    Args:
        client ([type]): Azure Eventhub listener client
    """
    try:
        async with client:
            # Call the receive method. Only read current data (#latest)
            logging.info("*** Listening to event")
            await client.receive(on_event=on_event,
                                 prefetch=PREFETCH_COUNT,
                                 max_wait_time=MAX_WAIT_TIME)
    except KeyboardInterrupt:
        print("*** Stopped receiving due to keyboard interrupt")
    except Exception as err:
        logging.debug(traceback.format_exc())
        print("*** some error occurred :", err)
## Minimal no-op lifecycle callbacks (assumed; their definitions were not included in the original snippet)
async def on_partition_initialize(partition_context):
    logging.info("*** Partition {} initialized".format(partition_context.partition_id))

async def on_partition_close(partition_context, reason):
    logging.info("*** Partition {} closed: {}".format(partition_context.partition_id, reason))

async def on_error(partition_context, error):
    partition_id = partition_context.partition_id if partition_context else "N/A"
    logging.error("*** Error on partition %s: %s", partition_id, error)

if __name__ == '__main__':
    ## Checkpoint initialization
    checkpoint_store = BlobCheckpointStore(
        blob_account_url=BLOB_ACCOUNT_URL,
        container_name=BLOB_CONTAINER_NAME,
        credential=AZURE_BLOB_CONNECTION_STR
    )
    ## Client initialization
    client = EventHubConsumerClient.from_connection_string(
        CONNECTION_STR,
        consumer_group=EVENTHUB_CONSUMER,
        eventhub_name=EVENTHUB_NAME,
        checkpoint_store=checkpoint_store,  # COMMENT TO RUN WITHOUT CHECKPOINT
        logging_enable=True,
        on_partition_initialize=on_partition_initialize,
        on_partition_close=on_partition_close,
        idle_timeout=10,
        on_error=on_error,
        retry_total=3
    )
    logging.info("Connecting to eventhub {} consumer {}".format(
        EVENTHUB_NAME, EVENTHUB_CONSUMER))
    logging.info("Registering receive callback.")
    loop = asyncio.get_event_loop()
    try:
        loop.run_until_complete(main(client))
    except KeyboardInterrupt:
        pass
    finally:
        loop.stop()
Execution flow: main() --> on_event() --> Thread(between_callback --> API_CALL) --> update_checkpoint
Change the receiver function to the format below; with it we are able to get the events and run them until they complete.
import asyncio
from azure.eventhub.aio import EventHubConsumerClient
from azure.eventhub.extensions.checkpointstoreblobaio import BlobCheckpointStore

async def on_event(partition_context, event):
    # Print the event data.
    print("Received the event: \"{}\" from the partition with ID: \"{}\"".format(event.body_as_str(encoding='UTF-8'), partition_context.partition_id))
    # Update the checkpoint so that the program doesn't read the events
    # that it has already read when you run it next time.
    await partition_context.update_checkpoint(event)

async def main():
    # Create an Azure blob checkpoint store to store the checkpoints.
    checkpoint_store = BlobCheckpointStore.from_connection_string("AZURE STORAGE CONNECTION STRING", "BLOB CONTAINER NAME")
    # Create a consumer client for the event hub.
    client = EventHubConsumerClient.from_connection_string("EVENT HUBS NAMESPACE CONNECTION STRING", consumer_group="$Default", eventhub_name="EVENT HUB NAME", checkpoint_store=checkpoint_store)
    async with client:
        # Call the receive method. Read from the beginning of the partition (starting_position: "-1")
        await client.receive(on_event=on_event, starting_position="-1")

if __name__ == '__main__':
    loop = asyncio.get_event_loop()
    # Run the main method.
    loop.run_until_complete(main())
Also, as per the suggestion in the comments, call the API on an interval basis; a sketch of a non-blocking variant follows.
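On the throughput point, two receiver-side costs stand out in the posted code: the synchronous requests.post (each on_event stalls for a full API round trip before the next event is handled) and the blob-storage checkpoint written for every single event. A minimal sketch of both mitigations, assuming a hypothetical API URL and an arbitrary checkpoint interval of 50 events:

import asyncio
import requests

API = "https://example.invalid/analyze"  # hypothetical endpoint
CHECKPOINT_EVERY = 50                    # arbitrary interval; tune per workload
event_counts = {}                        # events seen per partition since last checkpoint

async def on_event(partition_context, event):
    payload = {"url": event.body_as_str(encoding='UTF-8')}
    loop = asyncio.get_event_loop()
    # Offload the blocking HTTP POST to the default thread pool so the
    # receive loop can pull the next event immediately (fire-and-forget
    # here; keep the returned future if you need the response).
    loop.run_in_executor(None, lambda: requests.post(API, json=payload))

    # Checkpoint every N events per partition instead of on every event,
    # so most events skip the blob-storage round trip.
    pid = partition_context.partition_id
    event_counts[pid] = event_counts.get(pid, 0) + 1
    if event_counts[pid] >= CHECKPOINT_EVERY:
        await partition_context.update_checkpoint(event)
        event_counts[pid] = 0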
Related
I want to use grpc-python in the following scenario, but I don't know how to realize it.
The scenario is that, in the Python server, a class calculates and updates the instance's state, then sends that state to the corresponding client; on the client side, more than one client needs to communicate with the server, each getting its own result without interference from the others.
Specifically, suppose there is a class with initial value self.i = 0; each time a client calls the class's update function, it does self.i = self.i + 1 and returns self.i. Two clients may call this update function simultaneously, e.g. client1 calls update for the third time while client2 calls it for the first time.
I think this could be solved by creating a thread for each client to avoid conflicts: if a new client calls, a new thread is created; if an existing client calls, its existing thread is used. But I don't know how to realize it.
Hope you can help me. Thanks in advance.
I think I solved this problem by myself. If you have any better solutions, you can post them here.
I edited the helloworld example from the grpc-python introduction to explain my aim.
For helloworld.proto
syntax = "proto3";

option java_multiple_files = true;
option java_package = "io.grpc.examples.helloworld";
option java_outer_classname = "HelloWorldProto";
option objc_class_prefix = "HLW";

package helloworld;

// The greeting service definition.
service Greeter {
  // Sends a greeting
  rpc SayHello (HelloRequest) returns (HelloReply) {}
  rpc Unsubscribe (HelloRequest) returns (HelloReply) {}
}

// The request message containing the user's name.
message HelloRequest {
  string name = 1;
}

// The response message containing the greetings
message HelloReply {
  string message = 1;
}
I added an Unsubscribe function to allow one specific client to disconnect from the server.
In hello_server.py
import grpc
import helloworld_pb2
import helloworld_pb2_grpc
import threading
from threading import RLock
import time
from concurrent import futures
import logging

class Calculate:
    def __init__(self):
        self.i = 0

    def add(self):
        self.i += 1
        return self.i

class PeerSet(object):
    def __init__(self):
        self._peers_lock = RLock()
        self._peers = {}
        self.instances = {}

    def connect(self, peer):
        with self._peers_lock:
            if peer not in self._peers:
                print("Peer {} connecting".format(peer))
                self._peers[peer] = 1
                a = Calculate()
                self.instances[peer] = a
                output = a.add()
                return output
            else:
                self._peers[peer] += 1
                a = self.instances[peer]
                output = a.add()
                return output

    def disconnect(self, peer):
        print("Peer {} disconnecting".format(peer))
        with self._peers_lock:
            if peer not in self._peers:
                raise RuntimeError("Tried to disconnect peer '{}' but it was never connected.".format(peer))
            del self._peers[peer]
            del self.instances[peer]

    def peers(self):
        with self._peers_lock:
            return self._peers.keys()

class Greeter(helloworld_pb2_grpc.GreeterServicer):
    def __init__(self):
        self._peer_set = PeerSet()

    def _record_peer(self, context):
        return self._peer_set.connect(context.peer())

    def SayHello(self, request, context):
        output = self._record_peer(context)
        print("[thread {}] Peers: {}, output: {}".format(threading.current_thread().ident, self._peer_set.peers(), output))
        time.sleep(1)
        return helloworld_pb2.HelloReply(message='Hello, {}, {}!'.format(request.name, output))

    def Unsubscribe(self, request, context):
        self._peer_set.disconnect(context.peer())
        return helloworld_pb2.HelloReply(message='{} disconnected!'.format(context.peer()))

def serve():
    server = grpc.server(futures.ThreadPoolExecutor(max_workers=10))
    helloworld_pb2_grpc.add_GreeterServicer_to_server(Greeter(), server)
    server.add_insecure_port('[::]:50051')
    server.start()
    server.wait_for_termination()

if __name__ == '__main__':
    logging.basicConfig()
    serve()
The use of context.peer() is adapted from Richard Belleville's answer in this post. You can change the add() function to any other function that updates the instance's state, as in the sketch below.
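For instance, a hypothetical replacement that accumulates an amount per client instead of counting calls (Accumulator and its amount parameter are illustrative, not part of the original example):

class Accumulator:
    """Per-client running total; a drop-in replacement for Calculate in PeerSet."""
    def __init__(self):
        self.total = 0

    def add(self, amount=1):
        self.total += amount
        return self.total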
In hello_client.py
from __future__ import print_function
import logging

import grpc
import helloworld_pb2
import helloworld_pb2_grpc

def run():
    # NOTE(gRPC Python Team): .close() is possible on a channel and should be
    # used in circumstances in which the with statement does not fit the needs
    # of the code.
    with grpc.insecure_channel('localhost:50051') as channel:
        stub = helloworld_pb2_grpc.GreeterStub(channel)
        response = stub.SayHello(helloworld_pb2.HelloRequest(name='you'))
        print("Greeter client received: " + response.message)
        response = stub.SayHello(helloworld_pb2.HelloRequest(name='Tom'))
        print("Greeter client received: " + response.message)
        response = stub.SayHello(helloworld_pb2.HelloRequest(name='Jerry'))
        print("Greeter client received: " + response.message)
        stub.Unsubscribe(helloworld_pb2.HelloRequest(name="end"))

if __name__ == '__main__':
    logging.basicConfig()
    run()
If we run several hello_client.py processes simultaneously, the server can distinguish the different clients and send the correct corresponding info to each of them; the driver below shows one way to try this.
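A tiny hypothetical driver (not in the original post) that launches three clients in parallel processes, each of which the server sees as a distinct peer:

import subprocess

# Each process gets its own channel, hence its own context.peer() on the server.
procs = [subprocess.Popen(['python', 'hello_client.py']) for _ in range(3)]
for p in procs:
    p.wait()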
Let's say I have a web app driven by a Uvicorn server. The app implements a GraphQL API with a mutation that starts a long calculation on the server side and a query endpoint that checks the status of the server. Let's say we want to know how many tasks are running in the background.
I have simplified code, which does not work:
import asyncio
import logging
import time

import ariadne
import ariadne.asgi
import uvicorn
import starlette as sl
import starlette.applications
import starlette.routing

query_t = ariadne.QueryType()
mutation_t = ariadne.MutationType()
FIFO = []

async def long_task():
    print('Starting long task...')
    global FIFO
    FIFO = [1, *FIFO]
    # mock long calc with 5sec sleep
    time.sleep(5)
    FIFO.pop()
    print('Long task finished!')

@mutation_t.field('longTask')
async def resolve_long_task(_parent, info):
    print('Start to resolve long task...')
    asyncio.create_task(long_task())
    print('Resolve finished!')
    return {}

@query_t.field('ping')
async def resolve_ping(_parent, info):
    return f'FIFO has {len(FIFO)} elements'

def main():
    schema_str = ariadne.gql('''
        type Mutation{
            longTask: longTaskResponse
        }
        type longTaskResponse {
            message: String
        }
        type Query {
            ping: String
        }
    ''')
    schema = ariadne.make_executable_schema(schema_str, query_t, mutation_t)
    gql_app = ariadne.asgi.GraphQL(schema)
    app = sl.applications.Starlette(routes=[sl.routing.Mount('/graphql', gql_app)])
    uvicorn.run(app,
                host='0.0.0.0',
                port=9002,
                log_level='error')

if __name__ == '__main__':
    main()
After running
$ python main.py
I send a mutation in the GraphQL GUI in the first tab:
mutation longTaskQueue{
  longTask {
    message
  }
}
In the second tab, I try to retrieve the length of the FIFO:
query ping {
  ping
}
It seems that it's possible to run two long_tasks, but ping waits until all long_tasks have finished. My general question is: how do I run heavy code in the background without blocking the GraphQL API?
After many attempts I made it work: now I can put many tasks in the background and track their number (the API no longer freezes on one long task). The root cause was that time.sleep(5) blocks the event loop thread, so asyncio.create_task alone cannot help; what happens now is that the blocking computations are run in a pool:
import asyncio
import logging
import time
from concurrent.futures import ProcessPoolExecutor, ThreadPoolExecutor

import ariadne
import ariadne.asgi
import uvicorn
import starlette as sl
import starlette.applications
import starlette.routing

query_t = ariadne.QueryType()
mutation_t = ariadne.MutationType()
FIFO = []

async def long_task():
    print('Starting long task...')
    global FIFO
    FIFO = [1, *FIFO]
    # mock long calc with 5sec sleep
    time.sleep(5)
    FIFO.pop()
    print('Long task finished!')

def run(corofn, *args):
    # drive the coroutine to completion on a private event loop inside the worker
    loop = asyncio.new_event_loop()
    try:
        coro = corofn(*args)
        asyncio.set_event_loop(loop)
        return loop.run_until_complete(coro)
    finally:
        loop.close()

@mutation_t.field('longTask')
async def resolve_long_task(_parent, info):
    loop = asyncio.get_event_loop()
    executor = ThreadPoolExecutor(max_workers=5)
    loop.set_default_executor(ProcessPoolExecutor())
    print('Start to resolve long task...')
    loop.run_in_executor(executor, run, long_task)
    print('Resolve finished!')
    return {}

@query_t.field('ping')
async def resolve_ping(_parent, info):
    return f'FIFO has {len(FIFO)} elements'

def main():
    schema_str = ariadne.gql('''
        type Mutation{
            longTask: longTaskResponse
        }
        type longTaskResponse {
            message: String
        }
        type Query {
            ping: String
        }
    ''')
    schema = ariadne.make_executable_schema(schema_str, query_t, mutation_t)
    gql_app = ariadne.asgi.GraphQL(schema)
    app = sl.applications.Starlette(routes=[sl.routing.Mount('/graphql', gql_app)])
    uvicorn.run(app,
                host='0.0.0.0',
                port=9002,
                log_level='error')

if __name__ == '__main__':
    main()
The solution was inspired by this answer.
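One design note, offered as a hedged simplification rather than a correction: the ProcessPoolExecutor installed as the default executor is never actually used (run_in_executor is given the ThreadPoolExecutor explicitly), and the per-thread event loop in run() is only needed because long_task is a coroutine. If the heavy work is a plain blocking function, one shared pool gives the same effect; a sketch with illustrative names blocking_calc and resolve_long_task_lean:

import asyncio
import time
from concurrent.futures import ThreadPoolExecutor

POOL = ThreadPoolExecutor(max_workers=5)  # one shared pool for the whole app
FIFO = []

def blocking_calc():
    """Stand-in for the heavy computation; a plain blocking function."""
    time.sleep(5)

async def resolve_long_task_lean(_parent, info):
    FIFO.insert(0, 1)                      # register the running task
    loop = asyncio.get_event_loop()
    future = loop.run_in_executor(POOL, blocking_calc)
    # remove the marker when the work completes, without blocking the resolver
    future.add_done_callback(lambda _: FIFO.pop())
    return {}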
Newbie to Azure Stream Analytics here. I am sending events to an Azure Event Hub with the Python program below, serializing the response data to JSON format so that it can be read by the Stream Analytics job input. However, something is either wrong or missing, because I cannot view the test data in the Stream Analytics job.
import asyncio
import time
import requests
import json
from azure.eventhub.aio import EventHubProducerClient
from azure.eventhub import EventData

connection_str = 'XXX'
consumer_group = 'XXX'
eventhub_name = 'XXX'

async def run():
    # create a producer client to send messages to the event hub
    # specify connection string to your event hubs namespace and
    # the event hub name
    producer = EventHubProducerClient.from_connection_string(connection_str, consumer_group=consumer_group, eventhub_name=eventhub_name)
    async with producer:
        # create a batch
        event_data_batch = await producer.create_batch(partition_id='1')
        while True:
            # add events to the batch
            response = requests.get("https://jsonplaceholder.typicode.com/todos/1")
            data = json.loads(response.text)
            event_data_batch.add(EventData(data))
            time.sleep(5)
        # send the batch of events to the event hub
        await producer.send_batch(event_data_batch)

loop = asyncio.get_event_loop()
loop.run_until_complete(run())
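For comparison with the Event Hubs samples, two things stand out in this producer: EventData expects a str or bytes body (the code passes a dict), and send_batch sits after an infinite loop, so nothing is ever sent; consumer_group is also a consumer-side setting rather than a producer one. A sketch of the send path restructured under those assumptions:

import asyncio
import json

import requests
from azure.eventhub import EventData
from azure.eventhub.aio import EventHubProducerClient

connection_str = 'XXX'
eventhub_name = 'XXX'

async def run():
    producer = EventHubProducerClient.from_connection_string(
        connection_str, eventhub_name=eventhub_name)
    async with producer:
        while True:
            response = requests.get("https://jsonplaceholder.typicode.com/todos/1")
            batch = await producer.create_batch()
            # EventData takes str/bytes, so serialize the payload explicitly
            batch.add(EventData(json.dumps(response.json())))
            await producer.send_batch(batch)  # send inside the loop
            await asyncio.sleep(5)            # yield instead of blocking the loop

asyncio.get_event_loop().run_until_complete(run())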
I'm trying to create a simple Kafka producer based on confluent_kafka. My code is the following:
#!/usr/bin/env python
from confluent_kafka import Producer
import json

def delivery_report(err, msg):
    """Called once for each message produced to indicate delivery result.
    Triggered by poll() or flush().
    see https://github.com/confluentinc/confluent-kafka-python/blob/master/README.md"""
    if err is not None:
        print('Message delivery failed: {}'.format(err))
    else:
        print('Message delivered to {} [{}]'.format(
            msg.topic(), msg.partition()))

class MySource:
    """Kafka producer"""
    def __init__(self, kafka_hosts, topic):
        """
        :kafka_hosts list(str): hostnames or 'host:port' of Kafka
        :topic str: topic to produce messages to
        """
        self.topic = topic
        # see https://github.com/edenhill/librdkafka/blob/master/CONFIGURATION.md
        config = {
            'metadata.broker.list': ','.join(kafka_hosts),
            'group.id': 'mygroup',
        }
        self.producer = Producer(config)

    @staticmethod
    def main():
        topic = 'my-topic'
        message = json.dumps({
            'measurement': [1, 2, 3]})
        mys = MySource(['kafka'], topic)
        mys.producer.produce(
            topic, message, on_delivery=delivery_report)
        mys.producer.flush()

if __name__ == "__main__":
    MySource.main()
The first time I use a topic (here: "my-topic"), Kafka reacts with "Auto creation of topic my-topic with 1 partitions and replication factor 1 is successful (kafka.server.KafkaApis)". However, the callback function (on_delivery=delivery_report) is never called, and the producer hangs at flush() (it terminates if I set a timeout for flush), both the first time and on subsequent runs. The Kafka logs do not show anything if I use an existing topic.
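When flush() hangs with no delivery callback, a common cause is that the client never reaches the broker (for example, the hostname 'kafka' not resolving from where the script runs); the group.id setting is also a consumer property and can be dropped from a producer config. A hedged diagnostic sketch using standard confluent_kafka settings to surface what librdkafka is doing:

from confluent_kafka import Producer

config = {
    'metadata.broker.list': 'kafka:9092',  # assumption: spell out the port
    'error_cb': lambda err: print('librdkafka error: {}'.format(err)),
    'debug': 'broker,topic,msg',           # verbose connection/produce logging
}
producer = Producer(config)
producer.produce('my-topic', b'probe', on_delivery=lambda err, msg: print(err, msg))
remaining = producer.flush(timeout=10)     # bounded flush instead of hanging forever
print('{} message(s) still queued'.format(remaining))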
I am trying to call multiple pykafka consumer functions using async. However, the first pykafka consumer function blocks the other functions from working.
The QueueConsumer lib:
import json
from pykafka import KafkaClient
import configparser
import asyncio

class QueueConsumer(object):
    def __init__(self):
        config = configparser.ConfigParser()
        config.read('config.ini')
        self.config = config

    async def test(self):
        defaultTopic = 'test'
        client = KafkaClient(hosts=self.config['kafka']['host'])
        topic = client.topics[defaultTopic.encode('utf-8')]
        consumer = topic.get_simple_consumer()
        # msg = next(consumer)
        for message in consumer:
            print(defaultTopic+' '+message.value.decode("utf-8"))

    async def coba(self):
        defaultTopic = 'coba'
        client = KafkaClient(hosts=self.config['kafka']['host'])
        topic = client.topics[defaultTopic.encode('utf-8')]
        consumer = topic.get_simple_consumer()
        # msg = next(consumer)
        for message in consumer:
            print(defaultTopic+' '+message.value.decode("utf-8"))
Then I call those functions using:
import asyncio

queueConsumer = QueueConsumer()
loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(
    queueConsumer.test(),
    queueConsumer.coba(),
))
loop.close()
The result only returns queue messages from topic 'test'.
Edit:
I tried adding another function:
async def factorial(self, name, number):
    f = 1
    for i in range(2, number+1):
        print("Task %s: Compute factorial(%s)..." % (name, i))
        await asyncio.sleep(1)
        f *= i
    print("Task %s: factorial(%s) = %s" % (name, number, f))
And then called them like:
loop.run_until_complete(asyncio.gather(
    queueConsumer.test(),
    queueConsumer.coba(),
    queueConsumer.factorial('a', 3),
    queueConsumer.factorial('b', 5),
    queueConsumer.factorial('c', 7),
))
Some prints from the factorial function are executed, but as soon as a print from either test or coba runs, it stops all the others.
SimpleConsumer.consume is a blocking call, so you'll need to adjust your code to periodically poll for new messages while relinquishing control between polls so that other asynchronous operations can take over. One way this could be accomplished is with the use_greenlets=True kwarg on KafkaClient, relying on gevent to handle control flow between multiple asynchronous operations; another is non-blocking polling, sketched below.
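A minimal sketch of the polling variant (not from the original answer): consume(block=False) returns a message or None immediately, and the await between polls hands control back to the event loop so the other coroutines can make progress:

import asyncio
from pykafka import KafkaClient

async def consume_topic(hosts, topic_name):
    client = KafkaClient(hosts=hosts)
    consumer = client.topics[topic_name.encode('utf-8')].get_simple_consumer()
    while True:
        message = consumer.consume(block=False)  # non-blocking poll
        if message is not None:
            print(topic_name + ' ' + message.value.decode('utf-8'))
        else:
            await asyncio.sleep(0.1)  # yield so the other consumers can run

loop = asyncio.get_event_loop()
loop.run_until_complete(asyncio.gather(
    consume_topic('127.0.0.1:9092', 'test'),
    consume_topic('127.0.0.1:9092', 'coba'),
))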