Flink Python Datastream API Kafka Producer Sink Serialization

Flink Python Datastream API Kafka Producer Sink Serialization - python

I'm trying to read data from one Kafka topic and write it to another after doing some processing.
I'm able to read the data and process it, but when I try to write it to another topic, it gives the error shown below.
If I write the data as it is, without doing any processing on it, the Kafka producer's SimpleStringSchema accepts it.
But I want to convert the String to JSON, work with the JSON, and then write it to another topic in String format.
My Code:
import json
from pyflink.common import Row
from pyflink.common.serialization import SimpleStringSchema, SerializationSchema,JsonRowSerializationSchema,Encoder
from pyflink.common.typeinfo import Types,BasicType,TypeInformation,BasicTypeInfo
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer
def my_map(obj):
    """Extract the ``name`` field from a double-encoded JSON record.

    The input is decoded twice: the outer ``json.loads`` must yield a JSON
    string, which is then parsed again into an object.

    Args:
        obj: JSON text whose decoded value is itself a JSON document
            containing a ``"name"`` key.

    Returns:
        The value of ``"name"`` re-encoded as a JSON string (so a plain
        string comes back wrapped in double quotes).

    Raises:
        KeyError: if the decoded object has no ``"name"`` key.
        json.JSONDecodeError: if either decoding step fails.
    """
    json_obj = json.loads(json.loads(obj))
    return json.dumps(json_obj["name"])
def utf8_decoder(s):
    """Decode UTF-8 encoded bytes into a ``str``.

    Args:
        s: a bytes-like object with a ``decode`` method, or ``None``.

    Returns:
        The decoded text, or ``None`` when ``s`` is ``None``.
    """
    if s is None:
        return None
    return s.decode('utf-8')
def datastream_api_demo():
# Reads string records from the Kafka topic 'test_source_topic_input', passes each
# one through my_map, and writes the results to the Kafka topic 'test_sink_topic_4'.
# 1. create a StreamExecutionEnvironment
env = StreamExecutionEnvironment.get_execution_environment()
# the sql connector for kafka is used here as it's a fat jar and could avoid dependency issues
env.add_jars("file:///Users/niaz/Downloads/f2.jar")
# 2. create source DataStream
deserialization_schema = SimpleStringSchema()
# deserialization_schema = JsonRowDeserializationSchema.builder() \
# .type_info(type_info=Types.ROW([
# Types.("name", Types.STRING()),
# Types.FIELD("age", Types.LONG()),
# Types.FIELD("car", Types.STRING())])).build()
kafka_source = FlinkKafkaConsumer(
topics='test_source_topic_input',
deserialization_schema=deserialization_schema,
properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})
ds = env.add_source(kafka_source)
# NOTE(review): map() is called here without an output type. The stack trace below
# shows SimpleStringSchema.serialize receiving a byte array ([B) instead of a
# java.lang.String; the resolution posted further down passes Types.STRING() as the
# second argument to map().
ds = ds.map(lambda a: my_map(a))
# 3. define the execution logic
# ds = ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
# .key_by(lambda a: a[0]) \
# .reduce(lambda a, b: Row(a[0], a[1] + b[1]))
# 4. create sink and emit result to sink
serialization_schema = SimpleStringSchema()
# NOTE(review): 'group.id' is presumably a consumer-side setting and may be
# unnecessary in producer_config - confirm.
kafka_sink = FlinkKafkaProducer(
topic='test_sink_topic_4',
serialization_schema=serialization_schema,
producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})
ds.add_sink(kafka_sink)
#ds.print()
# 5. execute the job
env.execute('datastream_api_demo')
# Run the demo only when this file is executed as a script.
if __name__ == '__main__':
datastream_api_demo()
And I'm getting the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o0.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:137)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$0(AkkaInvocationHandler.java:237)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.FutureUtils$1.onComplete(FutureUtils.java:1081)
at akka.dispatch.OnComplete.internal(Future.scala:264)
at akka.dispatch.OnComplete.internal(Future.scala:261)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:191)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:188)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at org.apache.flink.runtime.concurrent.Executors$DirectExecutionContext.execute(Executors.java:73)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:44)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:252)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:572)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:22)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:21)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:436)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:435)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:90)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:44)
at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:216)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:206)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:197)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:682)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:79)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:435)
at jdk.internal.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:305)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:212)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
at akka.actor.ActorCell.invoke(ActorCell.scala:561)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
at akka.dispatch.Mailbox.run(Mailbox.scala:225)
at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
... 4 more
Caused by: org.apache.flink.streaming.runtime.tasks.AsynchronousException: Caught exception while processing timer.
at org.apache.flink.streaming.runtime.tasks.StreamTask$StreamTaskAsyncExceptionHandler.handleAsyncException(StreamTask.java:1309)
at org.apache.flink.streaming.runtime.tasks.StreamTask.handleAsyncException(StreamTask.java:1285)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1424)
at org.apache.flink.streaming.runtime.tasks.StreamTask.lambda$null$16(StreamTask.java:1413)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$SynchronizedStreamTaskActionExecutor.runThrowing(StreamTaskActionExecutor.java:93)
at org.apache.flink.streaming.runtime.tasks.mailbox.Mail.run(Mail.java:90)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:344)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMail(MailboxProcessor.java:330)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:202)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:681)
at org.apache.flink.streaming.runtime.tasks.StreamTask.executeInvoke(StreamTask.java:636)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runWithCleanUpOnFail(StreamTask.java:647)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:620)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:779)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:566)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: TimerException{org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator}
... 14 more
Caused by: org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:85)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:46)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:26)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:50)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:28)
at org.apache.flink.streaming.api.operators.TimestampedCollector.collect(TimestampedCollector.java:50)
at org.apache.flink.streaming.api.operators.python.PythonMapOperator.emitResult(PythonMapOperator.java:59)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.emitResults(AbstractPythonFunctionOperator.java:299)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.invokeFinishBundle(AbstractPythonFunctionOperator.java:322)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.checkInvokeFinishBundleByTime(AbstractPythonFunctionOperator.java:314)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.lambda$open$0(AbstractPythonFunctionOperator.java:133)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1422)
... 13 more
Caused by: java.lang.ClassCastException: class [B cannot be cast to class java.lang.String ([B and java.lang.String are in module java.base of loader 'bootstrap')
at org.apache.flink.api.common.serialization.SimpleStringSchema.serialize(SimpleStringSchema.java:36)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaSerializationSchemaWrapper.serialize(KafkaSerializationSchemaWrapper.java:71)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.invoke(FlinkKafkaProducer.java:907)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.invoke(FlinkKafkaProducer.java:99)
at org.apache.flink.streaming.api.functions.sink.TwoPhaseCommitSinkFunction.invoke(TwoPhaseCommitSinkFunction.java:223)
at org.apache.flink.streaming.api.operators.StreamSink.processElement(StreamSink.java:54)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:71)
... 24 more

Maybe you can set ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG and ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG in producer_config in FlinkKafkaProducer
props.put("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");

I resolved it by declaring the map function's output type as a Java string, like this:
from pyflink.common.typeinfo import Types
ds = ds.map(lambda a: my_map(a),Types.STRING()) # The map function needs an output type so its result is serialized as a Java String

Related

google.api_core.exceptions.Unknown: None There was a problem opening the stream. Try turning on DEBUG level logs to see the error

I am working on writing a python script to load the data from Pub/Sub to BigQuery using Storage Write API's streaming method with default stream. I am trying to adapt https://github.com/googleapis/python-bigquery-storage/blob/main/samples/snippets/append_rows_proto2.py to my needs but I am running into an error
As per the google documentation, I have converted my data in ProtoBuf format for Python client.
However I am getting this error continuously while trying to run my program.
(venv) {{MY_COMPUTER}} {{FOLDER_NAME}} % python3 default_Stream.py
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): metadata.google.internal.:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): metadata.google.internal.:80
DEBUG:google.cloud.logging_v2.handlers.transports.background_thread:Background thread started.
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): metadata.google.internal.:80
DEBUG:urllib3.connectionpool:Starting new HTTP connection (1): metadata.google.internal.:80
Traceback (most recent call last):
File "default_Stream.py", line 116, in <module>
append_rows_default("{{GCLOUD_PROJECT_NAME}}", "{{BIGQUERY_DATASET_NAME}}", "{{BIGQUERY_TABLE}}")
File "default_Stream.py", line 95, in append_rows_default
response_future_1 = append_rows_stream.send(request)
File "{{VIRTUAL_ENVIRONMENT_PATH}}/venv/lib/python3.7/site-packages/google/cloud/bigquery_storage_v1/writer.py", line 234, in send
return self._open(request)
File "{{VIRTUAL_ENVIRONMENT_PATH}}/venv/lib/python3.7/site-packages/google/cloud/bigquery_storage_v1/writer.py", line 207, in _open
raise request_exception
google.api_core.exceptions.Unknown: None There was a problem opening the stream. Try turning on DEBUG level logs to see the error.
Waiting up to 5 seconds.
Sent all pending logs.
Here is my script:
# [START bigquerystorage_append_rows_default]
"""
This code sample demonstrates how to write records
using the low-level generated client for Python.
"""
from xmlrpc.client import boolean
from google.cloud import bigquery_storage_v1
from google.cloud.bigquery_storage_v1 import types
from google.cloud.bigquery_storage_v1 import writer
from google.protobuf import descriptor_pb2
import logging
import google.cloud.logging
#from google.cloud import logging
# If you update the customer_record.proto protocol buffer definition, run:
#
# protoc --python_out=. customer_record.proto
#
# from the samples/snippets directory to generate the debezium_record_pb2.py module.
import debezium_record_pb2
def create_row_data(id: int, name: str, role: int, joining_date: int, last_updated: int, is_deleted: boolean):
    """Build one serialized ``DebeziumRecord`` row.

    The arguments are assigned, in order, to ``column1`` .. ``column6`` of
    the protobuf message. The posted body read undefined names
    (``column1`` .. ``column6``) instead of its own parameters.

    Args:
        id: value for ``column1``.
        name: value for ``column2``.
        role: value for ``column3``.
        joining_date: value for ``column4``.
        last_updated: value for ``column5``.
        is_deleted: value for ``column6``.

    Returns:
        The message serialized to bytes via ``SerializeToString()``.
    """
    row = debezium_record_pb2.DebeziumRecord()
    row.column1 = id
    row.column2 = name
    row.column3 = role
    row.column4 = joining_date
    row.column5 = last_updated
    row.column6 = is_deleted
    return row.SerializeToString()
def append_rows_default(project_id: str, dataset_id: str, table_id: str):
    """Append one sample row to a BigQuery table through its default stream.

    Rows written to the ``_default`` stream need no explicit stream creation,
    finalization or commit. The posted version finalized and committed a
    ``WriteStream`` that was never created (so its name was empty) and set an
    offset on the request; both are dropped here.

    Args:
        project_id: Google Cloud project that owns the table.
        dataset_id: BigQuery dataset containing the table.
        table_id: destination table.

    Raises:
        Whatever ``AppendRowsStream.send`` or the returned future raises when
        the stream cannot be opened or the append is rejected.
    """
    # Attach the Cloud Logging handler and enable DEBUG output, which the
    # client's error message asks for when a stream fails to open.
    client = google.cloud.logging.Client()
    logging.basicConfig(level=logging.DEBUG)
    client.setup_logging()

    write_client = bigquery_storage_v1.BigQueryWriteClient()
    parent = write_client.table_path(project_id, dataset_id, table_id)
    stream_name = f'{parent}/_default'

    # Template with the fields needed for the first request; the initial
    # request must contain the stream name.
    request_template = types.AppendRowsRequest()
    request_template.write_stream = stream_name

    # So that BigQuery knows how to parse the serialized_rows, send a
    # protocol buffer representation of the message descriptor.
    proto_schema = types.ProtoSchema()
    proto_descriptor = descriptor_pb2.DescriptorProto()
    debezium_record_pb2.DebeziumRecord.DESCRIPTOR.CopyToProto(proto_descriptor)
    proto_schema.proto_descriptor = proto_descriptor
    proto_data = types.AppendRowsRequest.ProtoData()
    proto_data.writer_schema = proto_schema
    request_template.proto_rows = proto_data

    append_rows_stream = writer.AppendRowsStream(write_client, request_template)
    try:
        # Batch of rows: each entry is one serialized protobuf message.
        proto_rows = types.ProtoRows()
        proto_rows.serialized_rows.append(create_row_data(8, "E", 13, 1643673600000, 1654556118813, False))

        # No offset is set: appends to the default stream go to the current
        # end of the stream.
        request = types.AppendRowsRequest()
        proto_data = types.AppendRowsRequest.ProtoData()
        proto_data.rows = proto_rows
        request.proto_rows = proto_data

        response_future_1 = append_rows_stream.send(request)
        print(response_future_1.result())
    finally:
        # Shut down background threads and close the streaming connection,
        # even when the append fails.
        append_rows_stream.close()

    print(f"Rows have been appended to stream: '{stream_name}'.")
# Script entry point. The arguments are placeholders for the real project,
# dataset and table names; plain ASCII quotes are required for valid Python.
if __name__ == "__main__":
    append_rows_default("{{GCLOUD_PROJECT_NAME}}", "{{BIGQUERY_DATASET_NAME}}", "{{BIGQUERY_TABLE}}")
# [END bigquerystorage_append_rows_default]
This is my proto file (the source from which debezium_record_pb2.py is generated)
syntax = "proto3";
// cannot contain fields which are not present in the table.
// NOTE(review): the answer below reports that BigQuery rejected column4 because
// the proto field type uint64 did not match the table's INTEGER column, and that
// column4 and column5 had to be declared int64 instead.
message DebeziumRecord {
uint32 column1 = 1;
string column2 = 2;
uint32 column3 = 3;
uint64 column4 = 4;
uint64 column5 = 5;
bool column6 = 6;
}
This is the definition of my BigQuery table
CREATE TABLE `{{GCLOUD_PROJECT_NAME}}.{{BIGQUERY_DATASET_NAME}}.{{BIGQUERY_TABLE}}`
(
column1 INT64 NOT NULL,
column2 STRING,
column3 INT64,
column4 INT64 NOT NULL,
column5 INT64,
column6 BOOL
);
I have been stuck on this error and cannot proceed further. Any pointers would be really appreciated.
Thanks

From another team member of the posting user:
We had to fix our logging output in order to see what the error actually was
We changed this portion of default_Stream.py
# Original entry point, before the logging change. Placeholders stand in for
# the real project, dataset and table names.
if __name__ == "__main__":
    append_rows_default("{{GCLOUD_PROJECT_NAME}}", "{{BIGQUERY_DATASET_NAME}}", "{{BIGQUERY_TABLE}}")
# [END bigquerystorage_append_rows_default]
to
# Entry point with explicit logging setup: configure the root logger to emit
# DEBUG records to the console before the append runs, so the underlying
# gRPC error becomes visible.
if __name__ == "__main__":
    logging.basicConfig(
        level=logging.DEBUG,
        format="%(asctime)s [%(levelname)s] %(message)s",
        handlers=[
            #logging.FileHandler("debug.log"),
            logging.StreamHandler()
        ]
    )
    append_rows_default("{{GCLOUD_PROJECT_NAME}}", "{{BIGQUERY_DATASET_NAME}}", "{{BIGQUERY_TABLE}}")
# [END bigquerystorage_append_rows_default]
Then we ran python3 default_Stream.py --log=DEBUG
Once we were actually getting the error message logged to the standard output, we saw that the error was
grpc._channel._MultiThreadedRendezvous: <_MultiThreadedRendezvous of RPC that terminated with:
status = StatusCode.INVALID_ARGUMENT
details = "The proto field mismatched with BigQuery field at DebeziumRecord.column4, the proto field type uint64, BigQuery field type INTEGER Entity: projects/{{GCLOUD_PROJECT_NAME}}/datasets/{{BIGQUERY_DATASET_NAME}}/tables/{{BIGQUERY_TABLE}}/_default"
debug_error_string = "{"created":"#1656037879.048726680","description":"Error received from peer ipv4:142.251.6.95:443","file":"src/core/lib/surface/call.cc","file_line":966,"grpc_message":"The proto field mismatched with BigQuery field at DebeziumRecord.column4, the proto field type uint64, BigQuery field type INTEGER Entity: projects/{{GCLOUD_PROJECT_NAME}}/datasets/{{BIGQUERY_DATASET_NAME}}/tables/{{BIGQUERY_TABLE}}/_default","grpc_status":3}"
>
To fix that error we corrected the data types of column4 and column5 to be int64 instead of uint64, per https://cloud.google.com/bigquery/docs/write-api#data_type_conversions
There are still additional errors/issues with default_Stream.py that we are working through, but this was the answer to this question

Apache Kafka setup consumer error: "JSON Decoder Error: Extra data line 1 column 4 char 4"

The consumer bin/kafka-console-consumer.sh --bootstrap-server kafka:9092 --topic my_topic works and I can see the logs as: 2021-12-11T22:40:13.800Z {"ts":1639262395.220755,"uid":"CiaUp427FXwzqySsOh","id.orig_h":"fe80::f816:3eff:fef4:a877","id.orig_p":5353,"id.resp_h":"ff02::fb","id.resp_p":5353,"proto":"udp","service":"dns","duration":0.40987586975097659,"orig_bytes":1437,"resp_bytes":0,"conn_state":"S0","missed_bytes":0,"history":"D","orig_pkts":4,"orig_ip_bytes":1629,"resp_pkts":0,"resp_ip_bytes":0}
However, with the consumer code listed below I am getting "JSON Decoder Error: Extra data line 1 column 4 char 4", which seems to be a simple error related to parsing the data: each log starts with the date and time, as shown above. This means the consumer receives the first log but cannot parse it.
Easy enough, yet it seems I cannot get around it as this is part of the KafkaConsumer object. If anyone can give a hint or show how to do it it would be great. Thanks and Regards, M
from json import loads
from kafka import KafkaConsumer, TopicPartition
import threading, time
from IPython.display import clear_output
# Broker address passed as bootstrap_servers to the KafkaConsumer below.
KAFKA_SERVER='10.10.10.10:9092'
# Topic the consumer is created with and subscribed to.
TOPIC = 'my_topic'
# Default for TrafficConsumer's offset argument (forwarded as auto_offset_reset).
AUTO_OFFSET_RESET = 'earliest'
CONSUMER_TIME_OUT=1000 # milliseconds; passed as consumer_timeout_ms
MAXIMUM_SECONDS=0.01 # seconds; not referenced in the code shown here
# Wraps a KafkaConsumer subscribed to TOPIC and prints what it polls until
# stop() is called.
# NOTE(review): indentation was lost in this listing, so the nesting of some
# statements (marked below) cannot be determined from the text alone.
class TrafficConsumer():
# Builds the consumer and then starts the polling loop.
# offset is forwarded as auto_offset_reset; verbose and close are forwarded to
# set_conn_log_traffic.
def __init__(self, offset=AUTO_OFFSET_RESET, verbose=False, close=True):
try:
self.__traffic_consumer = KafkaConsumer(
TOPIC,
bootstrap_servers = [KAFKA_SERVER],
auto_offset_reset = offset,
enable_auto_commit = True,
#group_id = GROUP_ID,
# NOTE(review): the whole payload is parsed as JSON here. The sample record in the
# question starts with a timestamp before the JSON object, which the answer below
# identifies as the cause of the "Extra data" decode error.
value_deserializer = lambda x : loads(x.decode('utf-8')),
consumer_timeout_ms = CONSUMER_TIME_OUT,
#on_commit = self.commit_completed(),
)
self.__traffic_consumer.subscribe([TOPIC])
# NOTE(review): the class statement lists no base class, yet Thread.__init__ is
# invoked on self - confirm whether subclassing threading.Thread was intended.
threading.Thread.__init__(self)
# Event checked by the polling loop; set by stop().
self.stop_event = threading.Event()
except Exception as e:
# The error is only printed. If it was raised before stop_event was assigned,
# that attribute will not exist on the instance.
print("Consumer is not accessible. Check: the connections and the settings in attributes_kafka.", e)
# NOTE(review): cannot tell from this listing whether the call below sits inside
# the except branch or after the try statement - confirm.
self.set_conn_log_traffic(verbose=verbose, close=close)
# Signals the polling loop to finish.
def stop(self):
self.stop_event.set()
# Returns the underlying KafkaConsumer.
def get_consumer(self):
return self.__traffic_consumer
# Polls the consumer and prints each item until stop_event is set.
# verbose is accepted but not used in the body shown here.
def set_conn_log_traffic(self, verbose=False, close=True):
while not self.stop_event.is_set():
# Iterates directly over the object returned by poll(2).
# presumably 2 is a timeout in milliseconds - TODO confirm against kafka-python
for ind_flow in self.__traffic_consumer.poll(2):
print(ind_flow)
if self.stop_event.is_set():
break
# Closes the consumer when close is truthy.
# NOTE(review): the nesting level of this statement is not recoverable from the listing.
if close: self.__traffic_consumer.close()

Your data isn't proper json. It includes a timestamp before the json object, which cannot be decoded using json.loads.
You should verify how the producer is sending data since the timestamp is part of the value, rather than the Kafka record timestamp
Or, you can handle the problem in the consumer by using a different deserializer function
For example
def safe_deserialize(value):
    """Decode a Kafka record value that may carry a leading timestamp token.

    Values look like ``b'2021-12-11T22:40:13.800Z {"ts": ...}'``: a
    timestamp, one space, then the JSON document. The prefix is dropped and
    the remainder parsed. A value that already starts with a JSON object or
    array is parsed as-is rather than being split at the first space inside
    the JSON.

    Args:
        value: raw record value as UTF-8 encoded bytes.

    Returns:
        The decoded JSON document.

    Raises:
        ValueError: (``json.JSONDecodeError``) if no JSON document follows
            the prefix.
    """
    # Imported locally so the function works in a script that only did
    # ``from json import loads``.
    import json

    text = value.decode('utf-8').lstrip()
    if text[:1] not in ("{", "["):
        # Drop everything up to and including the first space (the timestamp).
        _, _, text = text.partition(" ")
    return json.loads(text)
...
KafkaConsumer(
...
value_deserializer = safe_deserialize,

Confluent Kafka python schema parser causes conflict with fastavro

I am running Python 3.9 with Confluent Kafka 1.7.0, avro-python3 1.10.0 and fastavro 1.4.1.
The following code uses Avro schema encoder in order to encode a message, which succeeds only if we transform the resulting schema encoding by getting rid of the MappingProxyType:
from confluent_kafka import Producer
from confluent_kafka.avro import CachedSchemaRegistryClient, MessageSerializer
from fastavro.schema import parse_schema
from fastavro.validation import validate
from types import MappingProxyType
from typing import Any
import sys
def transformMap(item: Any) -> Any:
    """Recursively copy a schema structure into plain dicts and lists.

    Every mapping (``dict``, a ``dict`` subclass, or ``MappingProxyType``)
    becomes a new plain ``dict`` and every list a new list; all other values
    are returned unchanged. ``isinstance`` is used so that mapping proxies
    nested inside ``dict`` subclasses are converted too.

    Args:
        item: the value to convert.

    Returns:
        A structure equal to ``item`` with no ``MappingProxyType`` instances
        inside dicts or lists.
    """
    if isinstance(item, (dict, MappingProxyType)):
        return {k: transformMap(v) for k, v in item.items()}
    if isinstance(item, list):
        return [transformMap(v) for v in item]
    return item
def main(argv = None):
    """Validate, Avro-encode and publish one InstrumentIdMsg record.

    Fetches the latest schema for the message type from the schema registry,
    validates the record against it with fastavro, encodes it with the
    registry serializer and produces it to Kafka.

    Args:
        argv: unused; kept for the conventional ``main(argv)`` signature.

    Returns:
        0 on success, for use as the process exit status.
    """
    msgType = 'InstrumentIdMsg'
    idFigi = 'BBG123456789'
    head = {'sDateTime': 1, 'msgType': msgType, 'srcSeq': 1,
            'rDateTime': 1, 'src': 'Brownstone', 'reqID': None,
            'sequence': 1}
    msgMap = {'head': head, 'product': 'Port', 'idIsin': None, 'idFigi': idFigi,
              'idBB': None, 'benchmark': None, 'idCusip': None, 'idCins': None}

    registryClient = CachedSchemaRegistryClient(url = 'http://local.KafkaRegistry.com:8081')
    schemaId, schema, version = registryClient.get_latest_schema(msgType)
    serializer = MessageSerializer(registry_client = registryClient)
    schemaMap = schema.to_json()

    # NOTE:
    # schemaMap cannot be used since it uses mappingproxy
    # which causes validate() and parse_schema() to throw
    schemaDict = transformMap(schemaMap)
    # Both calls are kept for their raising behaviour; their results are not needed.
    validate(datum = msgMap, schema = schemaDict, raise_errors = True)
    parse_schema(schema = schemaDict)

    msg = serializer.encode_record_with_schema_id(schema_id = schemaId,
                                                  record = msgMap)
    producer = Producer({'bootstrap.servers': 'kafkaServer:9092'})
    producer.produce(key = idFigi,
                     topic = 'TOPIC_NAME',
                     value = msg)
    # produce() only enqueues the message; flush so it is actually delivered
    # before the process exits.
    producer.flush()
    return 0


if __name__ == '__main__':
    sys.exit(main())
The transformation basically leaves everything unchanged except altering MappingProxyType to dict instances.
Is there a problem in the way I am calling the standard library which causes mapping proxy to be used, which in turn causes fastavro to throw? Can this be fixed by something as a user, or is this really a bug in the Confluent Kafka library?
In addition, the output schemaId from registryClient.get_latest_schema() is marked in the docs to return str but returns int. If I understand correctly, this is the intended input into the schema_id parameter of serializer.encode_record_with_schema_id() (and it works correctly if I call it), which is also marked as int. Is that a typo in the docs? In other words, it seems either registryClient.get_latest_schema() should return an integer, or serializer.encode_record_with_schema_id() should take a string, or I am doing something incorrectly :) Which one is it?
Thank you very much.

How to run multiple Azure Functions in parallel which scroll through Elasticsearch?

I have a setup where I need to extract data from Elasticsearch and store it on an Azure Blob. Now to get the data I am using Elasticsearch's _search and _scroll API. The indexes are pretty well designed and are formatted something like game1.*, game2.*, game3.* etc.
I've created a worker.py file which I stored in a folder called shared_code as Microsoft suggests, and I have several Timer Trigger Functions which import and call worker.py. Due to the way ES was set up on our side, I had to create a VNET and a static outbound IP address which we've then whitelisted on ES. In addition, the data can only be extracted from ES on port 9200. So I've created an Azure Function App which has the connection set up, and I am trying to create multiple Functions (game1-worker, game2-worker, game3-worker) to pull the data from ES, running in parallel on minute 5. I've noticed that if I add the FUNCTIONS_WORKER_PROCESS_COUNT = 1 setting, then the functions will wait until the first triggered one finishes its task and then the second one triggers. If I don't add this app setting, or if I increase the number, then once a function has stopped because it finished working, it will try to start again and then I get an OSError: [WinError 10048] Only one usage of each socket address (protocol/network address/port) is normally permitted error. Is there a way I can make these run in parallel but not have the mentioned error?
Here is the code for the worker.py:
#!/usr/bin/env python
# coding: utf-8
# # Elasticsearch to Azure Microservice
import json, datetime, gzip, importlib, os, re, logging
from elasticsearch import Elasticsearch
import azure.storage.blob as azsb
import azure.identity as azi
import os
import tempfile
def batch(game_name, env='prod'):
    """Export new Elasticsearch documents for one game to gzip-compressed blobs.

    Reads the ``lastPolled`` cursor for the game from blob storage, scrolls
    through all documents whose ``baseCtx.date`` lies between the cursor and
    "now minus 5 minutes", uploads them in gzip-compressed JSON batches and
    advances the cursor after every uploaded batch.

    Args:
        game_name: Display name of the game; used to derive the index name,
            the storage container name and the cursor file name.
        env: Environment name. ``'dev'`` (case-insensitive) prefixes the
            index pattern with ``dev.``.

    Returns:
        Always ``0``. When no cursor could be loaded, a fresh cursor is
        written and no data is pulled during this run.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded, so import it explicitly before using it below.
    import importlib.util

    # #### Global Variables
    env = env.lower()
    connection_string = os.getenv('conn_storage')
    lowerFormat = game_name.lower().replace(" ", "_")
    azFormat = re.sub(r'[^0-9a-zA-Z]+', '-', game_name).lower()
    storageContainerName = azFormat
    stateStorageContainerName = "azure-webjobs-state"
    minutesOffset = 5
    pageLimit = 10000  # page size requested from ES; a shorter page means "no more data"
    tempFilePath = tempfile.gettempdir()
    curFileName = f"{lowerFormat}_cursor.py"
    curTempFilePath = os.path.join(tempFilePath, curFileName)
    curBlobFilePath = f"cursors/{curFileName}"
    esUrl = os.getenv('esUrl')

    def uploadJsonGzipBlob(filePathAndName, jsonBody):
        """Upload jsonBody as gzip-compressed JSON to the game's container."""
        blob = azsb.BlobClient.from_connection_string(
            conn_str=connection_string,
            container_name=storageContainerName,
            blob_name=filePathAndName
        )
        blob.upload_blob(gzip.compress(json.dumps(jsonBody).encode('utf-8')))

    def getAndLoadCursor(filePathAndName):
        """Download the cursor file and load it as a Python module."""
        # Get cursor from blob
        blob = azsb.BlobClient.from_connection_string(
            conn_str=os.getenv('AzureWebJobsStorage'),
            container_name=stateStorageContainerName,
            blob_name=filePathAndName
        )
        # Stream it to Temp file
        with open(curTempFilePath, "wb") as f:
            blob.download_blob().readinto(f)
        # Load it by path
        # NOTE(security): this executes whatever Python code is stored in the
        # cursor blob; anyone who can write to the state container can run
        # code inside this function app.
        spec = importlib.util.spec_from_file_location("cursor", curTempFilePath)
        cur = importlib.util.module_from_spec(spec)
        spec.loader.exec_module(cur)
        return cur

    def writeCursor(filePathAndName, body):
        """Write (or overwrite) the cursor blob with the given text body."""
        blob = azsb.BlobClient.from_connection_string(
            conn_str=os.getenv('AzureWebJobsStorage'),
            container_name=stateStorageContainerName,
            blob_name=filePathAndName
        )
        blob.upload_blob(body, overwrite=True)

    # Parameter and state settings (per-game overrides via app settings)
    maxSizeMB = int(os.getenv(f"{lowerFormat}_maxSizeMB", 10))  # Default to 10 MB
    maxProcessTimeSeconds = int(
        os.getenv(f"{lowerFormat}_maxProcessTimeSeconds", 300))  # Default to 300 seconds

    # The index pattern does not change between iterations.
    Index = lowerFormat + ".*"
    if env == 'dev':
        Index = 'dev.' + Index

    # #### Connections
    es = Elasticsearch(
        esUrl,
        port=9200,
        timeout=300)
    # try/finally so the client (and its sockets) is released on every exit
    # path, including the early return and any exception raised while scrolling.
    try:
        try:
            cur = getAndLoadCursor(curBlobFilePath)
        except Exception:
            # NOTE(review): any failure (not only a missing blob) lands here
            # and resets the cursor to the current minute.
            dtStr = f"{datetime.datetime.utcnow():%Y/%m/%d %H:%M:00}"
            writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{dtStr}'")
            logging.info(f"No cursor file. Generated {curFileName} file with date {dtStr}")
            return 0

        # # Scrolling and Batching Engine
        lastRowDateOffset = cur.lastPolled
        nrFilesThisInstance = 0
        while 1:
            # Offset the current time by -5 minutes to account for the 2-3 min delay in Elasticsearch
            initTime = datetime.datetime.utcnow()
            ## Filter lt (less than) endDate to avoid infinite loops.
            endDate = initTime - datetime.timedelta(minutes=minutesOffset)
            endDate = f"{endDate:%Y/%m/%d %H:%M:%S}"
            doc = {
                "query": {
                    "range": {
                        "baseCtx.date": {
                            "gt": lastRowDateOffset,
                            "lt": endDate
                        }
                    }
                }
            }
            if nrFilesThisInstance == 0:
                # First batch: open the scroll with the range query.
                page = es.search(
                    index=Index,
                    sort="baseCtx.date:asc",
                    scroll="2m",
                    size=pageLimit,
                    body=doc
                )
            else:
                # Later batches: continue the scroll opened by the first search.
                page = es.scroll(scroll_id=sid, scroll="10m")
            pageSize = len(page["hits"]["hits"])
            data = page["hits"]["hits"]
            sid = page["_scroll_id"]
            totalSize = page["hits"]["total"]
            print(f"Total Size: {totalSize}")
            cnt = 0
            # totalSize might be flawed as it returns at times an integer > 0 but array is empty
            # To overcome this, I've added the below check for the array size instead
            if pageSize == 0:
                break
            # Accumulate pages until the batch is big enough, has taken too
            # long, or the scroll is exhausted.
            while 1:
                cnt += 1
                page = es.scroll(scroll_id=sid, scroll="10m")
                pageSize = len(page["hits"]["hits"])
                sid = page["_scroll_id"]
                data += page["hits"]["hits"]
                sizeMB = len(gzip.compress(json.dumps(data).encode('utf-8'))) / (1024**2)
                loopTime = datetime.datetime.utcnow()
                # total_seconds() instead of .seconds, which silently drops whole days.
                processTimeSeconds = int((loopTime - initTime).total_seconds())
                print(f"{cnt} Results pulled: {pageSize} -- Cumulative Results: {len(data)} -- Gzip Size MB: {sizeMB} -- processTimeSeconds: {processTimeSeconds} -- pageSize: {pageSize} -- startDate: {lastRowDateOffset} -- endDate: {endDate}")
                if sizeMB > maxSizeMB:
                    break
                if processTimeSeconds > maxProcessTimeSeconds:
                    break
                if pageSize < pageLimit:
                    break
            # The newest document date in this batch becomes the new cursor.
            lastRowDateOffset = max(x['_source']['baseCtx']['date'] for x in data)
            lastRowDateOffsetDT = datetime.datetime.strptime(lastRowDateOffset, '%Y/%m/%d %H:%M:%S')
            outFile = f"elasticsearch/live/{lastRowDateOffsetDT:%Y/%m/%d/%H}/{lowerFormat}_live_{lastRowDateOffsetDT:%Y%m%d%H%M%S}.json.gz"
            uploadJsonGzipBlob(outFile, data)
            writeCursor(curBlobFilePath, f"# Please use format YYYY/MM/DD HH24:MI:SS\nlastPolled = '{lastRowDateOffset}'")
            nrFilesThisInstance += 1
            logging.info(f"File compiled: {outFile} -- {sizeMB} MB\n")
            # If the while loop ran for more than maxProcessTimeSeconds then end it
            if processTimeSeconds > maxProcessTimeSeconds:
                break
            if pageSize < pageLimit:
                break
        return 0
    finally:
        logging.info(f"Closing Connection to {esUrl}")
        es.close()
And these are 2 of the timing triggers I am calling:
game1-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
    """Timer-trigger entry point: run the shared worker batch for 'game1name'.

    Loads a fresh copy of shared_code/worker.py on every invocation and calls
    its ``batch`` function.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded; import it explicitly to avoid an AttributeError.
    import importlib.util

    # Timezone-aware "now" in UTC, formatted as ISO 8601.
    utc_timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    if mytimer.past_due:
        logging.info('The timer is past due!')
    # Load a new instance of worker.py
    spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
    worker = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(worker)
    worker.batch('game1name')
    logging.info('Python timer trigger function ran at %s', utc_timestamp)
game2-worker
import logging
import datetime
import azure.functions as func
#from shared_code import worker
import importlib
def main(mytimer: func.TimerRequest) -> None:
    """Timer-trigger entry point: run the shared worker batch for 'game2name'.

    Loads a fresh copy of shared_code/worker.py on every invocation and calls
    its ``batch`` function.
    """
    # `import importlib` alone does not guarantee that the `importlib.util`
    # submodule is loaded; import it explicitly to avoid an AttributeError.
    import importlib.util

    # Timezone-aware "now" in UTC, formatted as ISO 8601.
    utc_timestamp = datetime.datetime.now(datetime.timezone.utc).isoformat()
    if mytimer.past_due:
        logging.info('The timer is past due!')
    # Load a new instance of worker.py
    spec = importlib.util.spec_from_file_location("worker", "shared_code/worker.py")
    worker = importlib.util.module_from_spec(spec)
    spec.loader.exec_module(worker)
    worker.batch('game2name')
    logging.info('Python timer trigger function ran at %s', utc_timestamp)

TL;DR
Based on what you described, multiple worker-processes share underlying runtime's resources (sockets).
For your use case you just need to leave FUNCTIONS_WORKER_PROCESS_COUNT at 1. The default value is supposed to be 1, so not specifying it should mean the same as setting it to 1.
You need to understand how Azure Functions scale. It is very unnatural/confusing.
Assumes Consumption Plan.
Coding: You write Functions. Say F1 and F2. How you organize them is up to you.
Provisioning:
You create a Function App.
You deploy F1 and F2 to this App.
You start the App. (not function).
Runtime:
At start
Azure spawns one Function Host. Think of this as a container/OS.
Inside the Host, one worker-process is created. This worker-process will host one instance of App.
If you change FUNCTIONS_WORKER_PROCESS_COUNT to say 10 then Host will spawn 10 processes and run your App inside each of them.
When a Function is triggered (function could be triggered due to timer, or REST calls or message in Q, ...)
Each worker-process is capable of servicing one request at a time. Be it a request for F1 or F2. One at a time.
Each Host is capable of servicing one request per worker-process in it.
If backlog of requests grows, then Azure load balancer would trigger scale-out and create new Function Hosts.
Based on limited info, it seems like bad design to create 3 functions. You could instead create a single timer-triggered function, which sends out 3 messages to a Q (Storage Q should be more than plenty for such minuscule traffic), which in turn triggers your actual Function/implementation (which is storage Q triggered Function). Message would be something like {"game_name": "game1"}.

ERROR:SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063

I am presently working with an ASN.1 decoder. I will be getting a hexadecimal code from the producer and collecting it in the consumer.
After that, I convert the hex code to an RDD and pass the hex-value RDD to another function within the same class (decode_module), which uses the Python ASN.1 decoder to decode the hex data, return it and print it.
I don't understand what's wrong with my code. I have already installed my ASN.1 parser dependencies on the worker nodes too.
Is there anything wrong with the way I call it in the lambda expression, or is it something else?
My ERROR: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063
PLEASE HELP ME THANK YOU
My CODE:
class telco_cn:
    # Consumes messages from Kafka and decodes each one through a Spark map.
    # Python 2 code as posted (note `print d` and `str.decode('hex')`).
    def __init__(self,sc):
        # Stores the SparkContext on the instance, so `self` refers to it.
        self.sc = sc
        print ('in init function')
        logging.info('eneterd into init function')
    def decode_module(self,msg):
        # Configures the ASN.1 decoder from config_values and turns the hex
        # string `msg` into raw bytes. Any exception is only logged at debug
        # level, in which case the method implicitly returns None.
        try:
            logging.info('Entered into generate module')
            ### Providing input for module we need to load
            load_module(config_values['load_module'])
            ### Providing Value for Type of Decoding
            ASN1.ASN1Obj.CODEC = config_values['PER_DECODER']
            ### Providing Input for Align/UnAlign
            PER.VARIANT = config_values['PER_ALIGNED']
            ### Providing Input for pdu load
            pdu = GLOBAL.TYPE[config_values['pdu_load']]
            ### Providing Hex value to buf
            buf = '{}'.format(msg).decode('hex')
            # NOTE(review): `val` is never assigned in the code shown; the
            # actual decode step (presumably using `pdu` and `buf`) seems to
            # have been left out of the post.
            return val
        except Exception as e:
            logging.debug('error in decode_module function %s' %str(e))
    def consumer_input(self,sc,k_topic):
        # Subscribes to k_topic and, for every message, runs decode_module
        # through a one-element RDD and prints the collected result.
        logging.info('entered into consumer input');print(k_topic)
        # Placeholder from the post (not valid Python): the real connection
        # arguments were omitted.
        consumer = KafkaConsumer(ip and other values given)
        consumer.subscribe(k_topic)
        for msg in consumer:
            print(msg.value);
            a = sc.parallelize([msg.value])
            # The lambda references `self`, and `self.sc` is the SparkContext;
            # this is the line the reported SPARK-5063 error points at.
            d = a.map(lambda x: self.decode_module(x)).collect()
            print d
if __name__ == "__main__":
    # Driver entry point: configure Spark for YARN client mode, then start
    # consuming the configured Kafka topic.
    logging.info('Entered into main')
    conf = SparkConf().setAppName('telco_consumer').setMaster('yarn-client')
    sc = SparkContext(conf=conf)
    sqlContext = HiveContext(sc)
    cn = telco_cn(sc)
    cn.consumer_input(sc, config_values['kafka_topic'])

This is because self.decode_module is bound to an instance that contains a SparkContext.
To fix your code you can use @staticmethod:
class telco_cn:
    """Minimal example: map a static method over an RDD.

    Because ``decode_module`` is a static method, the lambda passed to
    ``map`` does not reference ``self`` (and therefore not ``self.sc``).
    """

    def __init__(self, sc):
        self.sc = sc

    @staticmethod
    def decode_module(msg):
        """Return msg unchanged (stand-in for the real decoding logic)."""
        return msg

    def consumer_input(self, sc, k_topic):
        """Run decode_module over a small sample RDD and print the result."""
        a = sc.parallelize(list('abcd'))
        # Call through the class, not through self, so the closure holds no
        # reference to the instance.
        d = a.map(lambda x: telco_cn.decode_module(x)).collect()
        print(d)
if __name__ == "__main__":
    # Create the context on the driver with a default configuration and run
    # the example once.
    sc = SparkContext(conf=SparkConf())
    telco_cn(sc).consumer_input(sc, '')
For more information:
http://spark.apache.org/docs/latest/programming-guide.html#passing-functions-to-spark

You cannot reference the instance method (self.decode_module) inside the lambda expression, because the instance object contains a SparkContext reference.
This occurs because internally PySpark tries to Pickle everything it gets to send to its workers. So when you say it should execute self.decode_module() inside the nodes, PySpark tries to pickle the whole (self) object (that contains a reference to the spark context).
To fix that, you just need to remove the SparkContext reference from the telco_cn class and use a different approach like using the SparkContext before calling the class instance (like Zhangs's answer suggests).

With me the issue was:
# Snippet that triggered the error.
# NOTE(review): text_df is presumably a DataFrame with a 'text' column; the
# string assigned here looks like a placeholder from the post.
text_df = "some text"
convertUDF = udf(lambda z: my_fynction(z), StringType())
# convertUDF is already the result of udf(); it is passed to udf() a second time here.
cleaned_fun = text_df.withColumn('cleaned', udf(convertUDF, StringType())('text'))
I was applying udf() twice. I just did this instead:
# convertUDF stays a plain Python callable and is passed to udf() exactly once.
convertUDF = lambda z: my_fynction(z)
cleaned_fun = text_df.withColumn('cleaned', udf(convertUDF, StringType())('text'))
and solved the error

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.