I am using kafka Confluent 4.0.0 to fetch data from SQL Server into kafka topics.
I want to read topics data stored in kafka from spark streaming program using below program:
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer
schema_registry_client = CachedSchemaRegistryClient(url='http://schemaregistryhostip:8081')
serializer = MessageSerializer(schema_registry_client)
ssc = StreamingContext(sc, 30)
kvs = KafkaUtils.createDirectStream(ssc, ["topic name"], {"metadata.broker.list": "brokerip:9092"}, valueDecoder=serializer.decode_message)
lines = kvs.map(lambda x: x[1])
lines.pprint()
ssc.start()
ssc.awaitTermination()
But I am getting below error when I start spark streaming using the method ssc.start().
ImportError: No module named confluent_kafka.avro.serializer
Related
I am trying to connect to Kafka producer using pyspark. Using a console producer to produce the data.
Below is the code.
import findspark
findspark.init('D:\spark-2.4.0-bin-hadoop2.7')
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--packages org.apache.spark:spark-
streaming-kafka-0-8_2.11:2.0.2 pyspark-shell'
import sys
import time
from pyspark import SparkContext
from pyspark import SparkConf
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
if __name__ == "__main__":
n_secs = 1
topic = ["streamtest1"]
## spark config
conf = SparkConf().setAppName("KafkaStreamTest").setMaster("local[2]")
# spark context creation
sc = SparkContext(conf=conf)
print(sc)
sc.setLogLevel("WARN")
# create streaming context
ssc = StreamingContext(sc,n_secs)
#kafka direct stream to connect to kafka broker
kafkastream = KafkaUtils.createDirectStream(ssc,topic,
{'bootstrap.servers':'localhost:9092',
'group.id':'test123',
'fetch.message.max.bytes':'15728640'})
print(kafkastream)
#map the Dstream data to lines
lines = kafkastream.map(lambda x : x[1])
words = lines.flatMap(lambda line: line.split(" "))
pairs=words.map(lambda word:(word,1))
counts = pairs.reduceByKey(lambda a,b: a+b)
counts.pprint()
# starts the streaming context
ssc.start()
time.sleep(60)
ssc.awaitTermination()
And below is the error:
Py4JJavaError: An error occurred while calling o27.awaitTermination.
: org.apache.spark.SparkException: An exception was raised by Python:
Traceback (most recent call last):
I set up a kafka system with a producer and a consumer, streaming as messages the lines of a json file.
Using pyspark, I need to analyze the data for the different streaming windows. To do so, I need to have a look at the data as they are streamed by pyspark... How can I do it?
To run the code I used Yannael's Docker container. Here is my python code:
# Add dependencies and load modules
import os
os.environ['PYSPARK_SUBMIT_ARGS'] = '--conf spark.ui.port=4040 --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0,com.datastax.spark:spark-cassandra-connector_2.11:2.0.0-M3 pyspark-shell'
from kafka import KafkaConsumer
from random import randint
from time import sleep
# Load modules and start SparkContext
from pyspark import SparkContext, SparkConf
from pyspark.sql import SQLContext, Row
conf = SparkConf() \
.setAppName("Streaming test") \
.setMaster("local[2]") \
.set("spark.cassandra.connection.host", "127.0.0.1")
try:
sc.stop()
except:
pass
sc = SparkContext(conf=conf)
sqlContext=SQLContext(sc)
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# Create streaming task
ssc = StreamingContext(sc, 0.60)
kafkaStream = KafkaUtils.createStream(ssc, "127.0.0.1:2181", "spark-streaming-consumer", {'test': 1})
ssc.start()
You can either call kafkaStream.pprint(), or learn more about structured streaming and you can print like so
query = kafkaStream \
.writeStream \
.outputMode("complete") \
.format("console") \
.start()
query.awaitTermination()
I see that you have cassandraendpoints, so assuming you're writing into Cassandra, you can use Kafka Connect rather than writing Spark code for this
I am using spark 2.1.0 version with kafka 0.9 in MapR environment.I am trying to read from Kafka topic into spark streaming. However i am facing error as below when i am running Kafkautils createDirectStream command.
py4j.protocol.Py4JError: An error occurred while calling z:org.apache.spark.streaming.kafka09.KafkaUtilsPythonHelper.createDirectStream.
Trace:
py4j.Py4JException: Method createDirectStream([class org.apache.spark.streaming.api.java.JavaStreamingContext, class
java.util.ArrayList, class java.util.HashMap]) does not exist
Code that i am running
from __future__ import print_function
import sys
from pyspark import SparkContext,SparkConf
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
from pyspark.streaming.kafka09 import KafkaUtils;
sqlContext = SQLContext(sc)
ssc = StreamingContext(sc, 3)
strLoc = '/home/mapr/stream:info'
kafkaparams = {"zookeeper.connect" : "x.x.x.x:5181","metadata.broker.list" : "x.x.x.x:9092"}
strarg = KafkaUtils.createDirectStream(ssc,[strLoc],kafkaparams) <- Error when i run this command on pyspark shell
Am trying to refining your code . Please try to execute with this below code .
from pyspark.sql import SQLContext, SparkSession
from pyspark.streaming import StreamingContext
from confluent_kafka.avro.cached_schema_registry_client import CachedSchemaRegistryClient
from confluent_kafka.avro.serializer.message_serializer import MessageSerializer
from pyspark.streaming.kafka import KafkaUtils
import json
var_schema_url = 'http://localhost:8081'
var_kafka_parms_src = {"metadata.broker.list": 'localhost:9092'}
schema_registry_client = CachedSchemaRegistryClient(var_schema_url)
serializer = MessageSerializer(schema_registry_client)
spark = SparkSession.builder \
.appName('Advertiser_stream') \
.master('local[*]') \
.getOrCreate()
def handler(message):
records = message.collect()
for record in records:
<<You can process that data >>
sc = spark.sparkContext
ssc = StreamingContext(sc, 5)
kvs = KafkaUtils.createDirectStream(ssc, ['Topic-name'], var_kafka_parms_src,valueDecoder=serializer.decode_message)
kvs.foreachRDD(handler)
ssc.start()
ssc.awaitTermination()
I am having problems executing the example code from the mleap repository. I wish to run the code in a script instead of a jupyter notebook (which is how the example is run). My script is as follows:
##################################################################################
# start a local spark session
# https://spark.apache.org/docs/0.9.0/python-programming-guide.html
##################################################################################
from pyspark import SparkContext, SparkConf
conf = SparkConf()
#set app name
conf.set("spark.app.name", "train classifier")
#Run Spark locally with as many worker threads as logical cores on your machine (cores X threads).
conf.set("spark.master", "local[*]")
#number of cores to use for the driver process (only in cluster mode)
conf.set("spark.driver.cores", "1")
#Limit of total size of serialized results of all partitions for each Spark action (e.g. collect)
conf.set("spark.driver.maxResultSize", "1g")
#Amount of memory to use for the driver process
conf.set("spark.driver.memory", "1g")
#Amount of memory to use per executor process (e.g. 2g, 8g).
conf.set("spark.executor.memory", "2g")
#pass configuration to the spark context object along with code dependencies
sc = SparkContext(conf=conf)
from pyspark.sql.session import SparkSession
spark = SparkSession(sc)
##################################################################################
import mleap.pyspark
# # Imports MLeap serialization functionality for PySpark
from mleap.pyspark.spark_support import SimpleSparkSerializer
# Import standard PySpark Transformers and packages
from pyspark.ml.feature import VectorAssembler, StandardScaler, OneHotEncoder, StringIndexer
from pyspark.ml import Pipeline, PipelineModel
from pyspark.sql import Row
# Create a test data frame
l = [('Alice', 1), ('Bob', 2)]
rdd = sc.parallelize(l)
Person = Row('name', 'age')
person = rdd.map(lambda r: Person(*r))
df2 = spark.createDataFrame(person)
df2.collect()
# Build a very simple pipeline using two transformers
string_indexer = StringIndexer(inputCol='name', outputCol='name_string_index')
feature_assembler = VectorAssembler(
inputCols=[string_indexer.getOutputCol()], outputCol="features")
feature_pipeline = [string_indexer, feature_assembler]
featurePipeline = Pipeline(stages=feature_pipeline)
featurePipeline.fit(df2)
featurePipeline.serializeToBundle("jar:file:/tmp/pyspark.example.zip")
On executing spark-submit script.py I get the following error:
17/09/18 13:26:43 WARN NativeCodeLoader: Unable to load native-hadoop library for your platform... using builtin-java classes where applicable
Traceback (most recent call last):
File "/Users/opringle/Documents/Repos/finn/Magellan/src/no_spark_predict.py", line 58, in <module>
featurePipeline.serializeToBundle("jar:file:/tmp/pyspark.example.zip")
AttributeError: 'Pipeline' object has no attribute 'serializeToBundle'
Any help would be much appreciated! I have installed mleap from pypy.
See Here
It seems MLeap isn't ready for Spark 2.3 yet. If you happen to be running Spark 2.3, try downgrading to 2.2 and retry. Hopefully, that helps!
I have solved the issue by attaching the following jar file when running:
spark-submit --packages ml.combust.mleap:mleap-spark_2.11:0.8.1 script.py
It seems you didn't follow the steps correctly, here http://mleap-docs.combust.ml/getting-started/py-spark.html it says that
Note: the import of mleap.pyspark needs to happen before any other PySpark libraries are imported.
Hence try importing your SparkContext after mleap
im trying to save my streaming data from spark to cassandra, spark is conected to kafka and its working ok, but saving to cassandra its making me become crazy. Im using spark 2.0.2, kafka 0.10 and cassandra 2.23,
this is how im submiting to spark
spark-submit --verbose --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.0.0 --jars /tmp/pyspark-cassandra-0.3.5.jar --driver-class-path /tmp/pyspark-cassandra-0.3.5.jar --py-files /tmp/pyspark-cassandra-0.3.5.jar --conf spark.cassandra.connection.host=localhost /tmp/direct_kafka_wordcount5.py localhost:9092 testing
and this is my code it just a little modification from the spark examples, its works but i cant save this data to cassandra....
and this what im trying to do but just with the count result
http://rustyrazorblade.com/2015/05/spark-streaming-with-python-and-kafka/
from __future__ import print_function
import sys
import os
import time
import pyspark_cassandra
import pyspark_cassandra.streaming
from pyspark_cassandra import CassandraSparkContext
import urllib
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
from pyspark.sql import SQLContext
from pyspark.sql import Row
from pyspark.sql.types import IntegerType
from pyspark.sql.functions import udf
from pyspark.sql.functions import from_unixtime, unix_timestamp, min, max
from pyspark.sql.types import FloatType
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
if __name__ == "__main__":
if len(sys.argv) != 3:
print("Usage: direct_kafka_wordcount.py <broker_list> <topic>", file=sys.stderr)
exit(-1)
sc = SparkContext(appName="PythonStreamingDirectKafkaWordCount")
ssc = StreamingContext(sc, 1)
sqlContext = SQLContext(sc)
brokers, topic = sys.argv[1:]
kvs = KafkaUtils.createDirectStream(ssc, [topic], {"metadata.broker.list": brokers})
lines = kvs.map(lambda x: x[1])
counts=lines.count()
counts.saveToCassandra("spark", "count")
counts.pprint()
ssc.start()
ssc.awaitTermination()
i got this error,
Traceback (most recent call last):
File "/tmp/direct_kafka_wordcount5.py", line 88, in
counts.saveToCassandra("spark", "count")
Pyspark Casasndra stopped being updated a while ago and the latest version only supports up to Spark 1.6
https://github.com/TargetHolding/pyspark-cassandra
Additionally
counts=lines.count() // Returns data to the driver (not an RDD)
counts is now an Integer. This means the function saveToCassandra doesn't apply since that is a function for RDDs