I am programming in Python and I want to connect to an Apache Kafka topic. The Producer works fine, but I'm having trouble with the Consumer. The following is the source code of the Consumer:
import argparse
import logging
import sys
import time
import re
import string
from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode, DataStream
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink, OutputFileConfig,
RollingPolicy)
from pyflink.datastream.connectors import FlinkKafkaConsumer
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.serialization import JsonRowDeserializationSchema
def TwitterProcessing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
    # write all the data to one file
    env.set_parallelism(1)
    env.add_jars("file:///C:/kafka/kafka_2.12-3.2.0/libs/flink-sql-connector-kafka-1.15.2.jar")

    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'Twitter'}
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=Types.ROW([Types.STRING()])).build()
    kafka_consumer = FlinkKafkaConsumer('Twitter', deserialization_schema, kafka_props)
    # kafka_consumer.set_start_from_earliest()
    kafka_consumer.set_start_from_latest()
    ds = env.add_source(kafka_consumer)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
        .map(word_munge) \
        .map(lambda i: (i.lower(), 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .key_by(lambda i: i[0]) \
        .reduce(lambda i, j: (i[0], i[1] + j[1]))
    ds.print()
    # submit for execution
    env.execute()


def word_munge(single_word):
    lower_case_word = single_word.lower()
    return re.sub(f"[{re.escape(string.punctuation)}]", "", lower_case_word)


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(message)s")
    TwitterProcessing()
No compilation error is thrown, but when the program executes the following error is thrown:
Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o12.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
I am using Windows 10, kafka-python 2.0.2 and apache-flink 1.15.2.
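One way to narrow this down (a debugging sketch of mine, not something from the original post) is to temporarily swap the JsonRowDeserializationSchema for a plain SimpleStringSchema, which consumes each record as a raw string; if the job then runs, the JSON deserialization schema (or the shape of the JSON it expects) is the likely culprit:

from pyflink.common.serialization import SimpleStringSchema
from pyflink.datastream.connectors import FlinkKafkaConsumer

# same topic and properties as in the code above, but with the simplest possible schema
kafka_consumer = FlinkKafkaConsumer(
    'Twitter',
    SimpleStringSchema(),
    {'bootstrap.servers': 'localhost:9092', 'group.id': 'Twitter'})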
Related
I'm trying to read data from one Kafka topic, do some processing, and write the result to another topic.
I'm able to read and process the data, but when I try to write it to another topic it gives the error below.
If I write the data as-is, without any processing, the Kafka producer's SimpleStringSchema accepts it.
But I want to convert the String to JSON, work with the JSON, and then write it back to the other topic in String format.
My Code:
import json
from pyflink.common import Row
from pyflink.common.serialization import SimpleStringSchema, SerializationSchema,JsonRowSerializationSchema,Encoder
from pyflink.common.typeinfo import Types,BasicType,TypeInformation,BasicTypeInfo
from pyflink.datastream import StreamExecutionEnvironment
from pyflink.datastream.connectors import FlinkKafkaConsumer, FlinkKafkaProducer
def my_map(obj):
    json_obj = json.loads(json.loads(obj))
    return json.dumps(json_obj["name"])


def utf8_decoder(s):
    """ Decode the unicode as UTF-8 """
    if s is None:
        return None
    return s.decode('utf-8')


def datastream_api_demo():
    # 1. create a StreamExecutionEnvironment
    env = StreamExecutionEnvironment.get_execution_environment()
    # the sql connector for kafka is used here as it's a fat jar and could avoid dependency issues
    env.add_jars("file:///Users/niaz/Downloads/f2.jar")

    # 2. create source DataStream
    deserialization_schema = SimpleStringSchema()
    # deserialization_schema = JsonRowDeserializationSchema.builder() \
    #     .type_info(type_info=Types.ROW([
    #         Types.("name", Types.STRING()),
    #         Types.FIELD("age", Types.LONG()),
    #         Types.FIELD("car", Types.STRING())])).build()

    kafka_source = FlinkKafkaConsumer(
        topics='test_source_topic_input',
        deserialization_schema=deserialization_schema,
        properties={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})

    ds = env.add_source(kafka_source)
    ds = ds.map(lambda a: my_map(a))

    # 3. define the execution logic
    # ds = ds.map(lambda a: Row(a % 4, 1), output_type=Types.ROW([Types.LONG(), Types.LONG()])) \
    #     .key_by(lambda a: a[0]) \
    #     .reduce(lambda a, b: Row(a[0], a[1] + b[1]))

    # 4. create sink and emit result to sink
    serialization_schema = SimpleStringSchema()
    kafka_sink = FlinkKafkaProducer(
        topic='test_sink_topic_4',
        serialization_schema=serialization_schema,
        producer_config={'bootstrap.servers': 'localhost:9092', 'group.id': 'test_group'})

    ds.add_sink(kafka_sink)
    # ds.print()

    # 5. execute the job
    env.execute('datastream_api_demo')


if __name__ == '__main__':
    datastream_api_demo()
And I'm getting the following error:
py4j.protocol.Py4JJavaError: An error occurred while calling o0.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
at org.apache.flink.runtime.jobmaster.JobResult.toJobExecutionResult(JobResult.java:144)
at org.apache.flink.runtime.minicluster.MiniClusterJobClient.lambda$getJobExecutionResult$3(MiniClusterJobClient.java:137)
at java.base/java.util.concurrent.CompletableFuture$UniApply.tryFire(CompletableFuture.java:642)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.rpc.akka.AkkaInvocationHandler.lambda$invokeRpc$0(AkkaInvocationHandler.java:237)
at java.base/java.util.concurrent.CompletableFuture.uniWhenComplete(CompletableFuture.java:859)
at java.base/java.util.concurrent.CompletableFuture$UniWhenComplete.tryFire(CompletableFuture.java:837)
at java.base/java.util.concurrent.CompletableFuture.postComplete(CompletableFuture.java:506)
at java.base/java.util.concurrent.CompletableFuture.complete(CompletableFuture.java:2073)
at org.apache.flink.runtime.concurrent.FutureUtils$1.onComplete(FutureUtils.java:1081)
at akka.dispatch.OnComplete.internal(Future.scala:264)
at akka.dispatch.OnComplete.internal(Future.scala:261)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:191)
at akka.dispatch.japi$CallbackBridge.apply(Future.scala:188)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at org.apache.flink.runtime.concurrent.Executors$DirectExecutionContext.execute(Executors.java:73)
at scala.concurrent.impl.CallbackRunnable.executeWithValue(Promise.scala:44)
at scala.concurrent.impl.Promise$DefaultPromise.tryComplete(Promise.scala:252)
at akka.pattern.PromiseActorRef.$bang(AskSupport.scala:572)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:22)
at akka.pattern.PipeToSupport$PipeableFuture$$anonfun$pipeTo$1.applyOrElse(PipeToSupport.scala:21)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:436)
at scala.concurrent.Future$$anonfun$andThen$1.apply(Future.scala:435)
at scala.concurrent.impl.CallbackRunnable.run(Promise.scala:36)
at akka.dispatch.BatchingExecutor$AbstractBatch.processBatch(BatchingExecutor.scala:55)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply$mcV$sp(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at akka.dispatch.BatchingExecutor$BlockableBatch$$anonfun$run$1.apply(BatchingExecutor.scala:91)
at scala.concurrent.BlockContext$.withBlockContext(BlockContext.scala:72)
at akka.dispatch.BatchingExecutor$BlockableBatch.run(BatchingExecutor.scala:90)
at akka.dispatch.TaskInvocation.run(AbstractDispatcher.scala:40)
at akka.dispatch.ForkJoinExecutorConfigurator$AkkaForkJoinTask.exec(ForkJoinExecutorConfigurator.scala:44)
at akka.dispatch.forkjoin.ForkJoinTask.doExec(ForkJoinTask.java:260)
at akka.dispatch.forkjoin.ForkJoinPool$WorkQueue.runTask(ForkJoinPool.java:1339)
at akka.dispatch.forkjoin.ForkJoinPool.runWorker(ForkJoinPool.java:1979)
at akka.dispatch.forkjoin.ForkJoinWorkerThread.run(ForkJoinWorkerThread.java:107)
Caused by: org.apache.flink.runtime.JobException: Recovery is suppressed by NoRestartBackoffTimeStrategy
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.handleFailure(ExecutionFailureHandler.java:138)
at org.apache.flink.runtime.executiongraph.failover.flip1.ExecutionFailureHandler.getFailureHandlingResult(ExecutionFailureHandler.java:82)
at org.apache.flink.runtime.scheduler.DefaultScheduler.handleTaskFailure(DefaultScheduler.java:216)
at org.apache.flink.runtime.scheduler.DefaultScheduler.maybeHandleTaskFailure(DefaultScheduler.java:206)
at org.apache.flink.runtime.scheduler.DefaultScheduler.updateTaskExecutionStateInternal(DefaultScheduler.java:197)
at org.apache.flink.runtime.scheduler.SchedulerBase.updateTaskExecutionState(SchedulerBase.java:682)
at org.apache.flink.runtime.scheduler.SchedulerNG.updateTaskExecutionState(SchedulerNG.java:79)
at org.apache.flink.runtime.jobmaster.JobMaster.updateTaskExecutionState(JobMaster.java:435)
at jdk.internal.reflect.GeneratedMethodAccessor41.invoke(Unknown Source)
at java.base/jdk.internal.reflect.DelegatingMethodAccessorImpl.invoke(DelegatingMethodAccessorImpl.java:43)
at java.base/java.lang.reflect.Method.invoke(Method.java:566)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcInvocation(AkkaRpcActor.java:305)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleRpcMessage(AkkaRpcActor.java:212)
at org.apache.flink.runtime.rpc.akka.FencedAkkaRpcActor.handleRpcMessage(FencedAkkaRpcActor.java:77)
at org.apache.flink.runtime.rpc.akka.AkkaRpcActor.handleMessage(AkkaRpcActor.java:158)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:26)
at akka.japi.pf.UnitCaseStatement.apply(CaseStatements.scala:21)
at scala.PartialFunction$class.applyOrElse(PartialFunction.scala:123)
at akka.japi.pf.UnitCaseStatement.applyOrElse(CaseStatements.scala:21)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:170)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at scala.PartialFunction$OrElse.applyOrElse(PartialFunction.scala:171)
at akka.actor.Actor$class.aroundReceive(Actor.scala:517)
at akka.actor.AbstractActor.aroundReceive(AbstractActor.scala:225)
at akka.actor.ActorCell.receiveMessage(ActorCell.scala:592)
at akka.actor.ActorCell.invoke(ActorCell.scala:561)
at akka.dispatch.Mailbox.processMailbox(Mailbox.scala:258)
at akka.dispatch.Mailbox.run(Mailbox.scala:225)
at akka.dispatch.Mailbox.exec(Mailbox.scala:235)
... 4 more
Caused by: org.apache.flink.streaming.runtime.tasks.AsynchronousException: Caught exception while processing timer.
at org.apache.flink.streaming.runtime.tasks.StreamTask$StreamTaskAsyncExceptionHandler.handleAsyncException(StreamTask.java:1309)
at org.apache.flink.streaming.runtime.tasks.StreamTask.handleAsyncException(StreamTask.java:1285)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1424)
at org.apache.flink.streaming.runtime.tasks.StreamTask.lambda$null$16(StreamTask.java:1413)
at org.apache.flink.streaming.runtime.tasks.StreamTaskActionExecutor$SynchronizedStreamTaskActionExecutor.runThrowing(StreamTaskActionExecutor.java:93)
at org.apache.flink.streaming.runtime.tasks.mailbox.Mail.run(Mail.java:90)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMailsWhenDefaultActionUnavailable(MailboxProcessor.java:344)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.processMail(MailboxProcessor.java:330)
at org.apache.flink.streaming.runtime.tasks.mailbox.MailboxProcessor.runMailboxLoop(MailboxProcessor.java:202)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runMailboxLoop(StreamTask.java:681)
at org.apache.flink.streaming.runtime.tasks.StreamTask.executeInvoke(StreamTask.java:636)
at org.apache.flink.streaming.runtime.tasks.StreamTask.runWithCleanUpOnFail(StreamTask.java:647)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invoke(StreamTask.java:620)
at org.apache.flink.runtime.taskmanager.Task.doRun(Task.java:779)
at org.apache.flink.runtime.taskmanager.Task.run(Task.java:566)
at java.base/java.lang.Thread.run(Thread.java:834)
Caused by: TimerException{org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator}
... 14 more
Caused by: org.apache.flink.streaming.runtime.tasks.ExceptionInChainedOperatorException: Could not forward element to next operator
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:85)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:46)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.collect(CopyingChainingOutput.java:26)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:50)
at org.apache.flink.streaming.api.operators.CountingOutput.collect(CountingOutput.java:28)
at org.apache.flink.streaming.api.operators.TimestampedCollector.collect(TimestampedCollector.java:50)
at org.apache.flink.streaming.api.operators.python.PythonMapOperator.emitResult(PythonMapOperator.java:59)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.emitResults(AbstractPythonFunctionOperator.java:299)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.invokeFinishBundle(AbstractPythonFunctionOperator.java:322)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.checkInvokeFinishBundleByTime(AbstractPythonFunctionOperator.java:314)
at org.apache.flink.streaming.api.operators.python.AbstractPythonFunctionOperator.lambda$open$0(AbstractPythonFunctionOperator.java:133)
at org.apache.flink.streaming.runtime.tasks.StreamTask.invokeProcessingTimeCallback(StreamTask.java:1422)
... 13 more
Caused by: java.lang.ClassCastException: class [B cannot be cast to class java.lang.String ([B and java.lang.String are in module java.base of loader 'bootstrap')
at org.apache.flink.api.common.serialization.SimpleStringSchema.serialize(SimpleStringSchema.java:36)
at org.apache.flink.streaming.connectors.kafka.internals.KafkaSerializationSchemaWrapper.serialize(KafkaSerializationSchemaWrapper.java:71)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.invoke(FlinkKafkaProducer.java:907)
at org.apache.flink.streaming.connectors.kafka.FlinkKafkaProducer.invoke(FlinkKafkaProducer.java:99)
at org.apache.flink.streaming.api.functions.sink.TwoPhaseCommitSinkFunction.invoke(TwoPhaseCommitSinkFunction.java:223)
at org.apache.flink.streaming.api.operators.StreamSink.processElement(StreamSink.java:54)
at org.apache.flink.streaming.runtime.tasks.CopyingChainingOutput.pushToOperator(CopyingChainingOutput.java:71)
... 24 more
Maybe you can set ProducerConfig.KEY_SERIALIZER_CLASS_CONFIG and ProducerConfig.VALUE_SERIALIZER_CLASS_CONFIG in the producer_config of the FlinkKafkaProducer:
props.put("key.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
props.put("value.serializer", "org.apache.kafka.common.serialization.ByteArraySerializer");
I have resolved it by providing the output type as a Java String, like this:
from pyflink.common.typeinfo import Types
ds = ds.map(lambda a: my_map(a), Types.STRING())  # the map function needs an output type to serialize its result to a Java String
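For context, this is how the fix slots into the pipeline from the question above (a condensed sketch reusing env, kafka_source, kafka_sink and my_map as defined there):

ds = env.add_source(kafka_source)
# with the output type declared, the mapped value reaches the sink as a Java
# String rather than as pickled Python bytes, which SimpleStringSchema cannot
# serialize (hence the ClassCastException in the trace above)
ds = ds.map(my_map, output_type=Types.STRING())
ds.add_sink(kafka_sink)
env.execute('datastream_api_demo')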
I'm integrating Kafka and Spark with Elasticsearch.
When I run this script using:
sudo spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 --jars jars/elasticsearch-hadoop-2.1.0.Beta2.jar cv.py localhost:9092 flumekafka
Script 'cv.py':
import json
import sys
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
#import org.elasticsearch.spark.rdd.EsSpark
if __name__ == "__main__":
sc = SparkContext(appName="kafka")
ssc = StreamingContext(sc, 2)
brokers, topic = sys.argv[1:]
kvs = KafkaUtils.createDirectStream(ssc, [topic],
{"metadata.broker.list": brokers})
es_write_conf = {"es.nodes" : 'localhost',"es.port" :
'9200',"es.resource" : 'rh/cv',"es.input.json" : "yes"}
parsed = kvs.map(lambda v: json.loads(v[1]))
# rdd = sc.parallelize(kvs)
#def format_data(x):
# return (data['doc_id'], json.dumps(data))
#rdd = rdd.map(lambda x: format_data(x))
parsed.saveAsNewAPIHadoopFile(path='-',
outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
keyClass="org.apache.hadoop.io.NullWritable",
valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
conf=es_write_conf)
#parsed.pprint()
ssc.start()
ssc.awaitTermination()
I got this problem:
parsed.saveAsNewAPIHadoopFile(path='-', outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
keyClass="org.apache.hadoop.io.NullWritable", valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=es_write_conf)
AttributeError: 'KafkaTransformedDStream' object has no attribute 'saveAsNewAPIHadoopFile'
What can I do to make my script insert into Elasticsearch, or is there another solution?
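No accepted fix is included above, but one common pattern (my own suggestion, not something confirmed in the original post) is based on saveAsNewAPIHadoopFile being an RDD method rather than a DStream method, so each micro-batch has to be written from inside foreachRDD. A sketch reusing the es_write_conf and parsed defined in the script:

def write_to_es(rdd):
    # es.input.json=yes expects the value to be a JSON string, so re-serialize
    # each parsed record; the key is only a placeholder for the NullWritable key class
    rdd.map(lambda doc: ('key', json.dumps(doc))).saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)

parsed.foreachRDD(write_to_es)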
I have the Python script:
import time
from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from urllib.parse import urlsplit, unquote
def extractPath(host, url):
    if host in url:
        return urlsplit(url).path
    else:
        return '-'
startCreateUdfs = time.time()
getPathUdf = udf(extractPath, StringType())
endCreateUdfs = time.time()
print("Python udf creation time: {}".format(endCreateUdfs - startCreateUdfs))
and the Scala script:
import java.net.URLDecoder
import java.nio.charset.StandardCharsets
import java.net.URL
import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf
object UdfTimes extends App {

  val spark = SparkSession.builder().master("local").getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")

  val extractPath: (String, String) => String = (host, url) => {
    if (url.contains(host))
      new URL(url).getPath
    else
      "-"
  }

  val unquote: String => String = str => URLDecoder.decode(str, StandardCharsets.UTF_8.name())

  val startTimeUdf = System.nanoTime()
  val getPathUdf = udf(extractPath)
  val endTimeUdf = System.nanoTime()
  println("Scala udf registering time: " + (endTimeUdf - startTimeUdf) / math.pow(10, 9))
}
Which I have written to do the same thing. The udf creation is instant in Python (from command line):
Python udf creation time: 2.0503997802734375e-05
but in Scala, it takes almost a second (sbt command line):
udf registering time: 0.768687091
What is the reason for this big difference?
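One thing worth checking (this is my own assumption, not something established above): PySpark's udf() only wraps the Python function and defers the JVM-side setup until the UDF is first applied to a DataFrame, whereas the Scala udf() call does its reflection and setup work eagerly. A fairer comparison would therefore time the first actual use of the Python UDF, roughly like this:

import time

from pyspark.sql import SparkSession
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType
from urllib.parse import urlsplit

spark = SparkSession.builder.master("local").getOrCreate()

def extractPath(host, url):
    return urlsplit(url).path if host in url else '-'

getPathUdf = udf(extractPath, StringType())

# a tiny made-up DataFrame, just to force the UDF to actually run once
df = spark.createDataFrame([("example.com", "http://example.com/a/b?x=1")],
                           ["host", "url"])

start = time.time()
df.select(getPathUdf("host", "url")).collect()
print("Python udf first-use time: {}".format(time.time() - start))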
My Python code for creating and running a job in AWS Glue is:
from datetime import datetime, timedelta
from time import sleep
import boto3
glue = boto3.client(
    service_name='glue',
    region_name='ap-south-1',
    endpoint_url='https://glue.ap-south-1.amazonaws.com'
)

myJob = glue.create_job(
    Name='sample',
    Role='Name_of_my_role',
    Command={
        'Name': 'glueetl',
        'ScriptLocation': 's3://s3-location'
    }
)

myNewJobRun = glue.start_job_run(JobName=myJob['Name'])

target_time = datetime.utcnow() + timedelta(minutes=5)
while datetime.utcnow() < target_time:
    status = glue.get_job_run(JobName=myJob['Name'], RunId=myNewJobRun['JobRunId'])
    print status['JobRun']['JobRunState']
    sleep(30)
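As a side note (not part of the original example), the polling loop can stop as soon as the run reaches a terminal state instead of always waiting out the five minutes. A sketch reusing glue, myJob, myNewJobRun and target_time from above:

terminal_states = {'SUCCEEDED', 'FAILED', 'STOPPED', 'TIMEOUT'}
while datetime.utcnow() < target_time:
    state = glue.get_job_run(JobName=myJob['Name'],
                             RunId=myNewJobRun['JobRunId'])['JobRun']['JobRunState']
    print(state)
    if state in terminal_states:
        break
    sleep(30)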
The script that needs to run is:
print "Hello World!"
print "Sevilla lost against Messi FC!"
This is from an example. When the job completes, it ends up in the error "Command failed with exit code 1", yet when I check the logs and error logs from the console I get my desired result, i.e. the two lines printed by the script appear fine.
Here are the error logs:
Container: ****
LogType:stdout
Log Upload Time:Mon Feb 25 10:46:40 +0000 2019
LogLength:44
Log Contents:
Hello World!
Sevilla lost against Messi FC!
End of LogType:stdout
I forgot to add job.commit() at the end.
For example:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrameReader, DataFrameWriter
from datetime import datetime
import time
# #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("All imports were successful.")
df = spark.read.orc(
    's3://****'
)
print("First dataframe read with headers set to True")

df2 = spark.read.orc(
    's3://****'
)
print("Second dataframe read with headers set to True")

# df3 = df.join(df2, ['c_0'], "outer")
# df3 = df.join(
#     df2,
#     df["column_test_1"] == df2["column_1"],
#     "outer"
# )
df3 = df.alias('l').join(df2.alias('r'), on='c_0')  # .collect()
print("Dataframes have been joined successfully.")

output_file_path = 's3://****'

df3.write.orc(
    output_file_path
)
print("Dataframe has been written to ORC.")

job.commit()
I am getting the below error when executing the code from the command line in CentOS.
"(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError(u'An error occurred while calling o313.save.\n', JavaObject id=o315), <traceback object at 0x7fca49970320>)"
I am getting this issue only when I submit it through ./bin/spark-submit test.py
If I just use spark-submit test.py, everything works fine, but then I am not able to run the code on YARN.
I have Anaconda installed on my machine and I think the second method is using the Anaconda spark-submit.
Can anyone please suggest what to do? Do I have to set any environment variables or update the libraries?
Edit:
As requested in the comments, here are more details about the script and versions.
This is the code:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession,DataFrame
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator, OneHotEncoder
import sys
from operator import add
import os
import pandas as pd
import numpy as np
from pyspark.sql.types import *
import pyspark.sql.functions as func
from IPython.display import display, HTML
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression,LogisticRegressionModel
path = 'hdfs://10.0.15.42/nih-poc-public-dataset/'
pipeline_path = '/opt/nih/pipeline_model'
try:
    conf = SparkConf().setMaster("local[8]").setAppName("sparkstream").set("spark.driver.allowMultipleContexts", "true")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder \
        .config(conf=conf) \
        .getOrCreate()

    print "SparkContext Version:", sc.version
    print "SparkContext Version:", sc.version
    print "Python version: ", sc.pythonVer
    print "Master URL to connect to: ", sc.master
    print "Path where Spark is installed on worker nodes: ", str(sc.sparkHome)
    print "Retrieve name of the Spark User running SparkContext: ", str(sc.sparkUser())
    print "Application name: ", sc.appName
    print "Application ID: ", sc.applicationId
    print "Default level of parallelism: ", sc.defaultParallelism
    print "Default number of partitions for RDDs: ", sc.defaultMinPartitions

    ####################################################################################################
    ########################################## DATA EXTRACTION ########################################
    ####################################################################################################
    file_name = path + 'maintenance_data.csv'
    df_nih = spark.read.csv(file_name, sep=';', header="true", inferSchema="true")
    df_nih.show(1)
    # print(df_nih.columns)
    print(df_nih.count())

    ####################################################################################################
    ######################################## PIPELINE FORMATION #######################################
    ####################################################################################################
    categoricalcolumns = ['team', 'provider']
    numericalcolumns = ['lifetime', 'pressureInd', 'moistureInd', 'temperatureInd']

    stages = []
    for categoricalcol in categoricalcolumns:
        stringindexer = StringIndexer(inputCol=categoricalcol, outputCol=categoricalcol + '_index')
        encoder = OneHotEncoderEstimator(inputCols=[stringindexer.getOutputCol()], outputCols=[categoricalcol + "_classVec"])
        stages += [stringindexer, encoder]
        # stages += [stringindexer]

    assemblerinputs = [c + "_classVec" for c in categoricalcolumns] + numericalcolumns
    vectorassembler_stage = VectorAssembler(inputCols=assemblerinputs,
                                            outputCol='features')
    stages += [vectorassembler_stage]
    # output = assembler.transform(df_nih)
    # print(output.show(1))

    indexer = StringIndexer(inputCol='broken', outputCol='label')
    stages += [indexer]

    # pipeline = Pipeline(stages=[stages, vectorassembler_stage, indexer])
    pipeline = Pipeline(stages=stages)
    model_pipeline = pipeline.fit(df_nih)
    model_pipeline.write().overwrite().save(pipeline_path)
except:
    print(sys.exc_info())
finally:
    print('stopping spark session')
    spark.stop()
    sc.stop()
This is the output:
/usr/hdp/3.0.0.0-1634/spark2> ./bin/spark-submit --master yarn /opt/nih/sample_spark.py
SparkContext Version: 2.3.1.3.0.0.0-1634
SparkContext Version: 2.3.1.3.0.0.0-1634
Python version:  2.7
Master URL to connect to:  yarn
Path where Spark is installed on worker nodes:  None
Retrieve name of the Spark User running SparkContext:  spark
Application name:  sparkstream
Application ID:  application_1540925663097_0023
Default level of parallelism:  2
Default number of partitions for RDDs:  2
+--------+------+----------------+----------------+----------------+-----+---------+
|lifetime|broken|     pressureInd|     moistureInd|  temperatureInd| team| provider|
+--------+------+----------------+----------------+----------------+-----+---------+
|      56|     0|92.1788540640753|104.230204454489|96.5171587259733|TeamA|Provider4|
+--------+------+----------------+----------------+----------------+-----+---------+
only showing top 1 row

999
(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError(u'An error occurred while calling o309.save.\n', JavaObject id=o311), <traceback object at 0x7f9061c86290>)
stopping spark session
If I comment out the saving part (model_pipeline.write) everything works fine. Please provide your suggestions.
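One way to narrow this down (a debugging sketch of mine, not something from the thread): print, from inside the submitted script, which Python interpreter and which Spark installation the job is actually running with, and compare the output of ./bin/spark-submit with that of the bare spark-submit:

import os
import sys
import pyspark

# single-argument print() works the same in Python 2.7 and 3; the values show
# whether the Anaconda or the HDP Python/Spark was picked up by the launcher
print("Python executable: " + sys.executable)
print("pyspark location:  " + os.path.dirname(pyspark.__file__))
print("SPARK_HOME:        " + str(os.environ.get("SPARK_HOME")))
print("PYSPARK_PYTHON:    " + str(os.environ.get("PYSPARK_PYTHON")))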