I'm using the confluent_kafka package to work with Kafka.
I create the topic this way:
from confluent_kafka import avro
from confluent_kafka.avro import AvroProducer

def my_producer():
    bootstrap_servers = ['my_adress.com:9092',
                         'my_adress.com:9092']
    value_schema = avro.load('/home/ValueSchema.avsc')
    avroProducer = AvroProducer({
        'bootstrap.servers': bootstrap_servers[0] + ',' + bootstrap_servers[1],
        'schema.registry.url': 'http://my_adress.com:8081',
        },
        default_value_schema=value_schema
    )
    for i in range(0, 25000):
        value = {"name": "Yuva", "favorite_number": 10, "favorite_color": "green", "age": i*2}
        avroProducer.produce(topic='my_topik14', value=value)
    avroProducer.flush(0)
    print('Finished!')

if __name__ == '__main__':
    my_producer()
It works. (It gets 24820 messages instead of 25000, by the way...)
We can check it:
kafka-run-class kafka.tools.GetOffsetShell --broker-list my_adress.com:9092 --topic my_topik14
my_topik14:0:24819
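A likely cause of the shortfall, for what it's worth: flush(0) waits at most 0 seconds for outstanding deliveries, so anything still queued when the process exits is dropped. A minimal sketch that actually waits (same avroProducer as above):

# flush() blocks until every queued message is delivered or the
# timeout expires; flush(0) returns immediately and can leave
# messages undelivered when the process exits.
remaining = avroProducer.flush(10)  # wait up to 10 seconds
if remaining > 0:
    print('%d messages were not delivered' % remaining)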
Now I want to consume:
from confluent_kafka import KafkaError
from confluent_kafka.avro import AvroConsumer
from confluent_kafka.avro.serializer import SerializerError

bootstrap_servers = ['my_adress.com:9092',
                     'my_adress.com:9092']
c = AvroConsumer({
    'bootstrap.servers': bootstrap_servers[0] + ',' + bootstrap_servers[1],
    'group.id': 'avroneversleeps',
    'schema.registry.url': 'http://my_adress.com:8081',
    'api.version.request': True,
    'fetch.min.bytes': 100000,
    'consume.callback.max.messages': 1000,
    'batch.num.messages': 2
})
c.subscribe(['my_topik14'])

running = True
while running:
    msg = None
    try:
        msg = c.poll(0.1)
        if msg:
            if not msg.error():
                print(msg.value())
                c.commit(msg)
            elif msg.error().code() != KafkaError._PARTITION_EOF:
                print(msg.error())
                running = False
        else:
            print("No Message!! Happily trying again!!")
    except SerializerError as e:
        print("Message deserialization failed for %s: %s" % (msg, e))
        running = False
c.commit()
c.close()
But there is a problem: I read messages only one at a time.
My question is: how do I read a batch of messages?
I tried different parameters in the consumer config, but they didn't change anything!
I also found this question on SO and tried the same parameters - it still doesn't work.
I also read this. But it contradicts the previous link...
You can do it using the consume([num_messages=1][, timeout=-1]) method. API reference here:
For Consumer:
https://docs.confluent.io/current/clients/confluent-kafka-python/index.html#confluent_kafka.Consumer.consume
For AvroConsumer:
https://docs.confluent.io/current/clients/confluent-kafka-python/index.html?highlight=avroconsumer#confluent_kafka.Consumer.consume
More about the issue here:
https://github.com/confluentinc/confluent-kafka-python/issues/252
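For a plain Consumer, a minimal sketch of batch reading (broker, topic, group id, and sizes are illustrative):

from confluent_kafka import Consumer

c = Consumer({'bootstrap.servers': 'my_adress.com:9092',
              'group.id': 'batchreader'})
c.subscribe(['my_topik14'])

# Pull up to 100 messages in one call, waiting at most 1 second.
msgs = c.consume(num_messages=100, timeout=1.0)
for m in msgs:
    if m.error() is None:
        print(m.value())
c.close()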
AvroConsumer has no consume method, but it is easy to write my own implementation of it, since one exists in the Consumer class (the parent of AvroConsumer).
Here is the code:
def consume_batch(self, num_messages=1, timeout=None):
    """
    Overridden from the confluent_kafka.Consumer class. Handles
    deserialization of a batch of messages using the Avro schema.

    :param int num_messages: number of messages to read in one batch (default=1)
    :param float timeout: poll timeout in seconds (default: indefinite)
    :returns: list of Message objects with deserialized key and value as dicts
    :rtype: list(Message)
    """
    messages_out = []
    if timeout is None:
        timeout = -1
    messages = super(AvroConsumer, self).consume(num_messages=num_messages, timeout=timeout)
    if messages is None:
        return None
    for m in messages:
        # Events such as partition EOF carry neither key nor value;
        # hand the raw batch back untouched in that case.
        if not m.value() and not m.key():
            return messages
        if not m.error():
            if m.value() is not None:
                decoded_value = self._serializer.decode_message(m.value())
                m.set_value(decoded_value)
            if m.key() is not None:
                decoded_key = self._serializer.decode_message(m.key())
                m.set_key(decoded_key)
            messages_out.append(m)
    return messages_out
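A hypothetical usage, assuming the method above is defined on an AvroConsumer subclass (the class name BatchAvroConsumer and all sizes are illustrative):

consumer = BatchAvroConsumer({
    'bootstrap.servers': 'my_adress.com:9092',
    'group.id': 'avroneversleeps',
    'schema.registry.url': 'http://my_adress.com:8081',
})
consumer.subscribe(['my_topik14'])

# Read up to 1000 already-deserialized messages per call.
batch = consumer.consume_batch(num_messages=1000, timeout=1.0)
for m in batch or []:
    print(m.value())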
But after running tests, this method gives no performance increase, so it looks like it exists only for usability. Or perhaps I need to do some additional work on deserializing not a single message, but the whole batch at once.
I'm new to Python so this could be a simple fix.
I am using Flask and sockets for this Python project. I am starting the socket on another thread so I can actively listen for new messages. I have an array variable called 'SocketConnections' that lives in my UdpComms class. The variable gets a new 'Connection' appended to it when a new socket connection is made; that works correctly. My issue is that when I try to read 'SocketConnections' from outside the thread, it is an empty array.
server.py
from flask import Flask, jsonify
import threading
import time

import UdpComms as U

app = Flask(__name__)

@app.route('/api/talk', methods=['POST'])
def talk():
    global global_server_socket
    apples = global_server_socket.SocketConnections
    return jsonify(message=apples)

global_server_socket = None

def start_server():
    global global_server_socket
    sock = U.UdpComms(udpIP="127.0.0.1", portTX=8000, portRX=8001, enableRX=True, suppressWarnings=True)
    i = 0
    global_server_socket = sock
    while True:
        i += 1
        data = sock.ReadReceivedData()  # read data
        if data != None:  # if NEW data has been received since last ReadReceivedData call
            print(data)  # print new received data
        time.sleep(1)

if __name__ == '__main__':
    server_thread = threading.Thread(target=start_server)
    server_thread.start()
    app.run(debug=True, host='192.168.0.25')
UdpComms.py
import json
import uuid

class UdpComms():
    def __init__(self, udpIP, portTX, portRX, enableRX=False, suppressWarnings=True):
        self.SocketConnections = []

        import socket
        self.udpIP = udpIP
        self.udpSendPort = portTX
        self.udpRcvPort = portRX
        self.enableRX = enableRX
        self.suppressWarnings = suppressWarnings  # when true warnings are suppressed
        self.isDataReceived = False
        self.dataRX = None

        # Connect via UDP
        self.udpSock = socket.socket(socket.AF_INET, socket.SOCK_DGRAM)  # internet protocol, udp (DGRAM) socket
        self.udpSock.setsockopt(socket.SOL_SOCKET, socket.SO_REUSEADDR, 1)  # allows the address/port to be reused immediately instead of it being stuck in the TIME_WAIT state waiting for late packets to arrive.
        self.udpSock.bind((udpIP, portRX))

        # Create receiving thread if required
        if enableRX:
            import threading
            self.rxThread = threading.Thread(target=self.ReadUdpThreadFunc, daemon=True)
            self.rxThread.start()

    def __del__(self):
        self.CloseSocket()

    def CloseSocket(self):
        # Function to close socket
        self.udpSock.close()

    def SendData(self, strToSend):
        # Use this function to send string to C#
        self.udpSock.sendto(bytes(strToSend, 'utf-8'), (self.udpIP, self.udpSendPort))

    def SendDataAddress(self, strToSend, guid):
        # Use this function to send string to C#
        print('finding connection: ' + guid)
        if self.SocketConnections:
            connection = self.GetConnectionByGUID(guid)
            print('found connection: ' + guid)
            if connection is not None:
                self.udpSock.sendto(bytes(strToSend, 'utf-8'), connection.Address)

    def ReceiveData(self):
        if not self.enableRX:  # if RX is not enabled, raise error
            raise ValueError("Attempting to receive data without enabling this setting. Ensure this is enabled from the constructor")
        data = None
        try:
            data, _ = self.udpSock.recvfrom(1024)
            print('Socket data received from: ', _)
            if self.IsNewConnection(_) == True:
                print('New socket')
                self.SendDataAddress("INIT:" + self.SocketConnections[-1].GUID, self.SocketConnections[-1].GUID)
            data = data.decode('utf-8')
        except WindowsError as e:
            if e.winerror == 10054:  # An error occurs if you try to receive before connecting to other application
                if not self.suppressWarnings:
                    print("Are You connected to the other application? Connect to it!")
                else:
                    pass
            else:
                raise ValueError("Unexpected Error. Are you sure that the received data can be converted to a string")
        return data

    def ReadUdpThreadFunc(self):  # Should be called from thread
        self.isDataReceived = False  # Initially nothing received
        while True:
            data = self.ReceiveData()  # Blocks (in thread) until data is returned (OR MAYBE UNTIL SOME TIMEOUT AS WELL)
            self.dataRX = data  # Populate AFTER new data is received
            self.isDataReceived = True
            # When it reaches here, data received is available

    def ReadReceivedData(self):
        data = None
        if self.isDataReceived:  # if data has been received
            self.isDataReceived = False
            data = self.dataRX
            self.dataRX = None  # Empty receive buffer
        if data != None and data.startswith('DIALOG:'):  # send it info
            split = data.split(':')[1]
        return data

    class Connection:
        def __init__(self, gUID, address) -> None:
            self.GUID = gUID
            self.Address = address

    def IsNewConnection(self, address):
        for connection in self.SocketConnections:
            if connection.Address == address:
                return False
        print('Appending new connection...')
        connection = self.Connection(str(uuid.uuid4()), address)
        self.SocketConnections.append(connection)
        return True

    def GetConnectionByGUID(self, guid):
        for connection in self.SocketConnections:
            if connection.GUID == guid:
                return connection
        return None
As mentioned above, when IsNewConnection() is called in UdpComms it does append a new object to SocketConnections. It is just that SocketConnections appears empty when viewed from the app.route. My plan is to be able to send socket messages from the app.routes.
For interprocess communication you may try to use something like shared memory, documented here.
Instead of declaring your self.SocketConnections as a list = [],
you'd use self.SocketConnections = Array('i', range(10)) (you are then limited to remembering only 10 connections, though).
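A minimal sketch of that idea with multiprocessing (the worker and values are illustrative; an 'i' array holds plain C ints, so connection details would need an integer encoding):

from multiprocessing import Array, Process

def worker(shared):
    # The child process writes into the shared buffer; the parent
    # sees the change because the memory itself is shared.
    shared[0] = 42

if __name__ == '__main__':
    connections = Array('i', range(10))  # ten shared C ints
    p = Process(target=worker, args=(connections,))
    p.start()
    p.join()
    print(connections[:])  # first slot is now 42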
I have a problem with Kafka message deserializing. I use confluent-kafka.
There is no schema registry - schemas are hardcoded.
I can connect a consumer to any topic and receive messages, but I can't deserialize these messages.
Output after deserialization looks something like this:
print(reader) line:
<avro.io.DatumReader object at 0x000002354235DBB0>
I think I have the wrong code for deserializing, but how do I solve this problem?
In the end I want to extract the deserialized key and value.
from confluent_kafka import Consumer, KafkaException, KafkaError
import sys
import time
import avro.schema
from avro.io import DatumReader, DatumWriter

def kafka_conf():
    conf = {''' MY CONFIGURATION '''}
    return conf

if __name__ == '__main__':
    conf = kafka_conf()
    topic = """MY TOPIC"""
    c = Consumer(conf)
    c.subscribe([topic])
    try:
        while True:
            msg = c.poll(timeout=200.0)
            if msg is None:
                continue
            if msg.error():
                # Error or event
                if msg.error().code() == KafkaError._PARTITION_EOF:
                    # End of partition event
                    sys.stderr.write('%% %s [%d] reached end at offset %d\n' %
                                     (msg.topic(), msg.partition(), msg.offset()))
                else:
                    # Error
                    raise KafkaException(msg.error())
            else:
                print("key: ", msg.key())
                print("value: ", msg.value())
                print("offset: ", msg.offset())
                print("topic: ", msg.topic())
                print("timestamp: ", msg.timestamp())
                print("headers: ", msg.headers())
                print("partition: ", msg.partition())
                print("latency: ", msg.latency())
                schema = avro.schema.parse(open("MY_AVRO_SCHEMA.avsc", "rb").read())
                print(schema)
                reader = DatumReader(msg.value, reader_schema=schema)
                print(reader)
                time.sleep(5)  # only on test
    except KeyboardInterrupt:
        print('\nAborted by user\n')
    finally:
        c.close()
You're printing a reader object, not deserializing data; deserializing is done with reader.read().
You need a BinaryDecoder as well.
The DeserializingConsumer in the Confluent library source code does exactly the same thing, except that it fetches the schema from the registry rather than the local filesystem, so I suggest you follow what it does.
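A minimal sketch of that, assuming the payload is plain Avro bytes (if it was produced with the Confluent Avro serializer, it starts with a 5-byte header, magic byte plus schema id, that must be stripped first):

import io
import avro.schema
from avro.io import BinaryDecoder, DatumReader

schema = avro.schema.parse(open("MY_AVRO_SCHEMA.avsc", "rb").read())
reader = DatumReader(schema)

# msg.value() is the raw bytes from the consumer; wrap them in a
# BinaryDecoder and let the DatumReader turn them into a dict.
decoder = BinaryDecoder(io.BytesIO(msg.value()))
record = reader.read(decoder)
print(record)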
I'm using the Python Azure ServiceBus package and have the following snippet of code:
# receiver.py
import logging

from azure.servicebus import ServiceBusClient, ServiceBusReceiveMode

logger = logging.getLogger("test")

def receive_message(
    connection_str,
    queue_name
) -> None:
    """
    Call Azure ServiceBus API to retrieve message from a queue
    """
    with ServiceBusClient.from_connection_string(
        connection_str, logging_enable=True
    ) as servicebus_client:
        with servicebus_client.get_queue_receiver(
            queue_name=queue_name,
            max_wait_time=20,
            receive_mode=ServiceBusReceiveMode.PEEK_LOCK,
        ) as receiver:
            for message in receiver:
                logger.debug(f"Received message {message}")
I'm attempting to write unit tests for this function and want to be able to mock out the receiver. Here is my attempt at writing the unit test, which fails because I can't figure out how to get the test to enter the for message in receiver block.
# test_receiver.py
from unittest.mock import patch

from receiver import receive_message

@patch("receiver.ServiceBusClient")
@patch("receiver.logger")
def test_receive_message(mock_logger, mock_svcbus_client):
    # Figure out how to mock
    mock_svcbus_client.from_connection_string.return_value.get_queue_receiver.return_value = iter(["message"])
    receive_message("mock_connection_str", "mock_q_name")
    # Assertion fails
    mock_logger.return_value.debug.assert_called_once()
You can try from mocks import MockReceivedMessage, MockReceiver to mock the receiver
Example 1:
class MockReceivedMessage(ServiceBusReceivedMessage):
    def __init__(self, prevent_renew_lock=False, exception_on_renew_lock=False, **kwargs):
        self._lock_duration = kwargs.get("lock_duration", 2)
        self._raw_amqp_message = None
        self._received_timestamp_utc = utc_now()
        self.locked_until_utc = self._received_timestamp_utc + timedelta(seconds=self._lock_duration)
        self._settled = False
        self._receiver = MockReceiver()
        self._prevent_renew_lock = prevent_renew_lock
        self._exception_on_renew_lock = exception_on_renew_lock
Example 2:
def test_queue_message_receive_and_delete(self, servicebus_namespace_connection_string, servicebus_queue, **kwargs):
    with ServiceBusClient.from_connection_string(
            servicebus_namespace_connection_string, logging_enable=False) as sb_client:
        with sb_client.get_queue_sender(servicebus_queue.name) as sender:
            message = ServiceBusMessage("Receive and delete test")
            sender.send_messages(message)
        with sb_client.get_queue_receiver(servicebus_queue.name,
                                          receive_mode=ServiceBusReceiveMode.RECEIVE_AND_DELETE) as receiver:
            messages = receiver.receive_messages(max_wait_time=10)
            assert len(messages) == 1
            message = messages[0]
            print_message(_logger, message)
            with pytest.raises(ValueError):
                receiver.complete_message(message)
            with pytest.raises(ValueError):
                receiver.abandon_message(message)
            with pytest.raises(ValueError):
                receiver.defer_message(message)
            with pytest.raises(ValueError):
                receiver.dead_letter_message(message)
            with pytest.raises(ValueError):
                receiver.renew_message_lock(message)
        time.sleep(10)
        with sb_client.get_queue_receiver(servicebus_queue.name) as receiver:
            messages = receiver.receive_messages(max_wait_time=10)
            for m in messages:
                print_message(_logger, m)
            assert len(messages) == 0
You can refer to mocks.py and test_queues.py.
If you still have doubts, you can open an issue on GitHub: azure-sdk-for-python
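Alternatively, if you only need the for message in receiver loop to run, a minimal sketch using plain unittest.mock (following the question's names; note the two __enter__ hops, one per with block):

# test_receiver.py
from unittest.mock import patch

from receiver import receive_message

@patch("receiver.logger")
@patch("receiver.ServiceBusClient")
def test_receive_message(mock_svcbus_client, mock_logger):
    # Each `with` binds the value returned by __enter__,
    # not the mocked call itself.
    mock_client = mock_svcbus_client.from_connection_string.return_value.__enter__.return_value
    mock_receiver = mock_client.get_queue_receiver.return_value.__enter__.return_value
    # A MagicMock iterates as empty by default; hand it one fake message.
    mock_receiver.__iter__.return_value = iter(["message"])

    receive_message("mock_connection_str", "mock_q_name")

    # logger is patched directly, so assert on .debug, not .return_value.debug
    mock_logger.debug.assert_called_once()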
The "Sending and receiving logging events across a network" section of the python logging cookbook demonstrates how a clients can send logs via a TCP session.
Log messages are pickled and sent to the server thanks to the socket handler. The server then unpickle the messages and log them.
The code to get a message from the tcp socket is this one:
class LogRecordStreamHandler(SocketServer.StreamRequestHandler):
    """Handler for a streaming logging request.

    This basically logs the record using whatever logging policy is
    configured locally.
    """

    def handle(self):
        """
        Handle multiple requests - each expected to be a 4-byte length,
        followed by the LogRecord in pickle format. Logs the record
        according to whatever policy is configured locally.
        """
        while True:
            chunk = self.connection.recv(4)
            if len(chunk) < 4:
                break
            slen = struct.unpack('>L', chunk)[0]
            chunk = self.connection.recv(slen)
            while len(chunk) < slen:
                chunk = chunk + self.connection.recv(slen - len(chunk))
            obj = self.unPickle(chunk)
            record = logging.makeLogRecord(obj)
            self.handleLogRecord(record)

    # then, methods to handle the record, but that's not the interesting part

class LogRecordSocketReceiver(SocketServer.ThreadingTCPServer):
    """
    Simple TCP socket-based logging receiver suitable for testing.
    """

    allow_reuse_address = 1

    def __init__(self, host='localhost',
                 port=logging.handlers.DEFAULT_TCP_LOGGING_PORT,
                 handler=LogRecordStreamHandler):
        SocketServer.ThreadingTCPServer.__init__(self, (host, port), handler)
        self.abort = 0
        self.timeout = 1
        self.logname = None

    def serve_until_stopped(self):
        import select
        abort = 0
        while not abort:
            rd, wr, ex = select.select([self.socket.fileno()], [], [], self.timeout)
            if rd:
                self.handle_request()
            abort = self.abort
What I don't understand in this example is: how do we know that the first 4 bytes we read from the socket constitute the length of the message? I looked at the socket and logging documentation but could not find any mention of it.
Also, the docstring implicitly states that this code is not good enough for production. What is so bad about that code?
As suggested in the comments, the answer is in the sending code (handlers.py in Python 2.7.10). I just removed the docstrings/comments that were obvious or irrelevant to this question to make the code more readable.
def makePickle(self, record):
    ei = record.exc_info
    if ei:
        dummy = self.format(record)
        record.exc_info = None
    d = dict(record.__dict__)
    d['msg'] = record.getMessage()
    d['args'] = None
    s = cPickle.dumps(d, 1)
    if ei:
        record.exc_info = ei
    # slen represents a "long integer", which is usually 32 bits large.
    slen = struct.pack(">L", len(s))
    # Here is where the 4-byte length is prepended to the message
    return slen + s

def emit(self, record):
    try:
        s = self.makePickle(record)
        # s is actually (length of the message) + (message)
        self.send(s)
    except (KeyboardInterrupt, SystemExit):
        raise
    except:
        self.handleError(record)

def send(self, s):
    """
    Send a pickled string to the socket.

    This function allows for partial sends which can happen when the
    network is busy.
    """
    if self.sock is None:
        self.createSocket()
    if self.sock:
        try:
            if hasattr(self.sock, "sendall"):
                self.sock.sendall(s)
            else:
                sentsofar = 0
                left = len(s)
                while left > 0:
                    sent = self.sock.send(s[sentsofar:])
                    sentsofar = sentsofar + sent
                    left = left - sent
        except socket.error:
            self.sock.close()
            self.sock = None  # so we can call createSocket next time
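As for the second question, two production concerns stand out: on a TCP stream, recv(4) may legally return fewer than 4 bytes (the handler just breaks and silently drops the connection mid-message), and unpickling bytes received from the network will execute arbitrary code if a hostile client connects. A minimal sketch of a stricter length read (recv_exact is a hypothetical helper, not part of the cookbook):

def recv_exact(sock, n):
    """Read exactly n bytes from a TCP socket, raising on EOF.

    Plain recv(n) may return anywhere between 1 and n bytes, so the
    4-byte length prefix must be accumulated in a loop as well.
    """
    buf = b''
    while len(buf) < n:
        chunk = sock.recv(n - len(buf))
        if not chunk:
            raise EOFError('socket closed mid-message')
        buf += chunk
    return buf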
I'm trying to wrap Pyro's name server in a more convenient object that would allow me to start and stop it as I wish. For example, I would like to be able to do something like
nameServer = NameServer("localhost")
nameServer.startNS()
[... make some other operations...]
nameServer.stopNS()
nameServer = None
[... make some other operations...]
nameServer = NameServer("localhost")
nameServer.startNS()
using the following definition for the NameServer class:
class NameServer(threadutil.Thread):
    def __init__(self, host, isDeamon=True, port=None, enableBroadcast=True,
                 bchost=None, bcport=None, unixsocket=None, nathost=None, natport=None):
        super(NameServer, self).__init__()
        self.setDaemon(isDeamon)
        self.host = host
        self.started = threadutil.Event()
        self.unixsocket = unixsocket
        self.port = port
        self.enableBroadcast = enableBroadcast
        self.bchost = bchost
        self.bcport = bcport
        self.nathost = nathost
        self.natport = natport

        """
        This code is taken from Pyro4.naming.startNSloop
        """
        self.ns_daemon = naming.NameServerDaemon(self.host, self.port, self.unixsocket,
                                                 nathost=self.nathost, natport=self.natport)
        self.uri = self.ns_daemon.uriFor(self.ns_daemon.nameserver)
        internalUri = self.ns_daemon.uriFor(self.ns_daemon.nameserver, nat=False)
        self.bcserver = None
        if self.unixsocket:
            hostip = "Unix domain socket"
        else:
            hostip = self.ns_daemon.sock.getsockname()[0]
            if hostip.startswith("127."):
                enableBroadcast = False
        if enableBroadcast:
            # Make sure to pass the internal uri to the broadcast responder.
            # It is almost always useless to let it return the external uri,
            # because external systems won't be able to talk to this thing anyway.
            bcserver = naming.BroadcastServer(internalUri, self.bchost, self.bcport)
            bcserver.runInThread()

    def run(self):
        try:
            self.ns_daemon.requestLoop()
        finally:
            self.ns_daemon.close()
            if self.bcserver is not None:
                self.bcserver.close()

    def startNS(self):
        self.start()

    def stopNS(self):
        self.ns_daemon.shutdown()
        if self.bcserver is not None:
            self.bcserver.shutdown()
So far, so good. It works as expected. However, if I run Pyro4.naming.locateNS() from another thread while the name server is running, then the next time I call nameServer.stopNS(), the program freezes. Does anyone have an idea why? And what would be the best (or at least a better) way to write such a NameServer wrapper?
There is an example in the Pyro4 repository that you could adapt: instead of running the name server on its own thread, it drives the name server daemon, the broadcast server, and a regular Pyro daemon from a single select()-based event loop, which sidesteps the cross-thread shutdown you're fighting with.
https://github.com/delmic/Pyro4/blob/master/examples/eventloop/server.py
from __future__ import print_function
import socket
import select
import sys
import Pyro4.core
import Pyro4.naming

if sys.version_info < (3, 0):
    input = raw_input

print("Make sure that you don't have a name server running already.")
servertype = input("Servertype thread/multiplex (t/m)?")
if servertype == 't':
    Pyro4.config.SERVERTYPE = "thread"
else:
    Pyro4.config.SERVERTYPE = "multiplex"

hostname = socket.gethostname()

class EmbeddedServer(object):
    def multiply(self, x, y):
        return x * y

print("initializing services... servertype=%s" % Pyro4.config.SERVERTYPE)
# start a name server with broadcast server as well
nameserverUri, nameserverDaemon, broadcastServer = Pyro4.naming.startNS(host=hostname)
assert broadcastServer is not None, "expect a broadcast server to be created"
print("got a Nameserver, uri=%s" % nameserverUri)
print("ns daemon location string=%s" % nameserverDaemon.locationStr)
print("ns daemon sockets=%s" % nameserverDaemon.sockets)
print("bc server socket=%s (fileno %d)" % (broadcastServer.sock, broadcastServer.fileno()))

# create a Pyro daemon
pyrodaemon = Pyro4.core.Daemon(host=hostname)
print("daemon location string=%s" % pyrodaemon.locationStr)
print("daemon sockets=%s" % pyrodaemon.sockets)

# register a server object with the daemon
serveruri = pyrodaemon.register(EmbeddedServer())
print("server uri=%s" % serveruri)

# register it with the embedded nameserver directly
nameserverDaemon.nameserver.register("example.embedded.server", serveruri)

print("")

# below is our custom event loop.
while True:
    print("Waiting for events...")
    # create sets of the socket objects we will be waiting on
    # (a set provides fast lookup compared to a list)
    nameserverSockets = set(nameserverDaemon.sockets)
    pyroSockets = set(pyrodaemon.sockets)
    rs = [broadcastServer]  # only the broadcast server is directly usable as a select() object
    rs.extend(nameserverSockets)
    rs.extend(pyroSockets)
    rs, _, _ = select.select(rs, [], [], 3)
    eventsForNameserver = []
    eventsForDaemon = []
    for s in rs:
        if s is broadcastServer:
            print("Broadcast server received a request")
            broadcastServer.processRequest()
        elif s in nameserverSockets:
            eventsForNameserver.append(s)
        elif s in pyroSockets:
            eventsForDaemon.append(s)
    if eventsForNameserver:
        print("Nameserver received a request")
        nameserverDaemon.events(eventsForNameserver)
    if eventsForDaemon:
        print("Daemon received a request")
        pyrodaemon.events(eventsForDaemon)

nameserverDaemon.close()
broadcastServer.close()
pyrodaemon.close()
print("done")