I'm integrating Kafka and Spark with Elasticsearch.
When I run this script using:
sudo spark-submit --packages org.apache.spark:spark-streaming-kafka-0-8_2.11:2.3.1 --jars jars/elasticsearch-hadoop-2.1.0.Beta2.jar cv.py localhost:9092 flumekafka
Script 'cv.py':
import json
import sys

from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.streaming.kafka import KafkaUtils
# import org.elasticsearch.spark.rdd.EsSpark

if __name__ == "__main__":
    sc = SparkContext(appName="kafka")
    ssc = StreamingContext(sc, 2)

    brokers, topic = sys.argv[1:]
    kvs = KafkaUtils.createDirectStream(ssc, [topic],
                                        {"metadata.broker.list": brokers})

    es_write_conf = {
        "es.nodes": "localhost",
        "es.port": "9200",
        "es.resource": "rh/cv",
        "es.input.json": "yes",
    }

    parsed = kvs.map(lambda v: json.loads(v[1]))

    # rdd = sc.parallelize(kvs)
    # def format_data(x):
    #     return (data['doc_id'], json.dumps(data))
    # rdd = rdd.map(lambda x: format_data(x))

    parsed.saveAsNewAPIHadoopFile(
        path='-',
        outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
        keyClass="org.apache.hadoop.io.NullWritable",
        valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
        conf=es_write_conf)

    # parsed.pprint()
    ssc.start()
    ssc.awaitTermination()
I got this error:
parsed.saveAsNewAPIHadoopFile(path='-', outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
keyClass="org.apache.hadoop.io.NullWritable", valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable", conf=es_write_conf)
AttributeError: 'KafkaTransformedDStream' object has no attribute 'saveAsNewAPIHadoopFile'
What can I do to make my script insert into Elasticsearch, or is there another solution?
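For context, saveAsNewAPIHadoopFile is an RDD method, not a DStream method, which is exactly what the AttributeError is saying. A minimal sketch of one commonly used workaround is to call it from inside foreachRDD on each micro-batch (untested here; it assumes es_write_conf from the script above and re-serializes every parsed dict back to a JSON string because "es.input.json" is set to "yes"):

def write_to_es(rdd):
    # Skip empty micro-batches, then hand the batch to the elasticsearch-hadoop
    # output format as (key, json_string) pairs; the key is ignored by ES here.
    if not rdd.isEmpty():
        rdd.map(lambda doc: ('key', json.dumps(doc))).saveAsNewAPIHadoopFile(
            path='-',
            outputFormatClass="org.elasticsearch.hadoop.mr.EsOutputFormat",
            keyClass="org.apache.hadoop.io.NullWritable",
            valueClass="org.elasticsearch.hadoop.mr.LinkedMapWritable",
            conf=es_write_conf)

parsed.foreachRDD(write_to_es)

The write itself still runs on the executors; foreachRDD only exposes each batch as an RDD so that the RDD-level save method can be used.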
Related
I am programming in Python and I want to connect to an Apache Kafka topic. The Producer works fine, but I'm having trouble with the Consumer. The following is the source code of the Consumer:
import argparse
import logging
import sys
import time
import re
import string

from pyflink.common import WatermarkStrategy, Encoder, Types
from pyflink.datastream import StreamExecutionEnvironment, RuntimeExecutionMode, DataStream
from pyflink.datastream.connectors import (FileSource, StreamFormat, FileSink, OutputFileConfig,
                                            RollingPolicy)
from pyflink.datastream.connectors import FlinkKafkaConsumer
from pyflink.common.serialization import SimpleStringSchema
from pyflink.common.serialization import JsonRowDeserializationSchema


def TwitterProcessing():
    env = StreamExecutionEnvironment.get_execution_environment()
    env.set_runtime_mode(RuntimeExecutionMode.STREAMING)
    # write all the data to one file
    env.set_parallelism(1)
    env.add_jars("file:///C:/kafka/kafka_2.12-3.2.0/libs/flink-sql-connector-kafka-1.15.2.jar")

    kafka_props = {'bootstrap.servers': 'localhost:9092', 'group.id': 'Twitter'}
    deserialization_schema = JsonRowDeserializationSchema.builder() \
        .type_info(type_info=Types.ROW([Types.STRING()])).build()
    kafka_consumer = FlinkKafkaConsumer('Twitter', deserialization_schema, kafka_props)
    # kafka_consumer.set_start_from_earliest()
    kafka_consumer.set_start_from_latest()
    ds = env.add_source(kafka_consumer)

    def split(line):
        yield from line.split()

    # compute word count
    ds = ds.flat_map(split) \
        .map(word_munge) \
        .map(lambda i: (i.lower(), 1), output_type=Types.TUPLE([Types.STRING(), Types.INT()])) \
        .key_by(lambda i: i[0]) \
        .reduce(lambda i, j: (i[0], i[1] + j[1]))
    ds.print()

    # submit for execution
    env.execute()


def word_munge(single_word):
    lower_case_word = single_word.lower()
    return re.sub(f"[{re.escape(string.punctuation)}]", "", lower_case_word)


if __name__ == '__main__':
    logging.basicConfig(stream=sys.stdout, level=logging.ERROR, format="%(message)s")
    TwitterProcessing()
No compilation error is thrown, but when the program executes, the following error is raised:
Py4JJavaError(
py4j.protocol.Py4JJavaError: An error occurred while calling o12.execute.
: org.apache.flink.runtime.client.JobExecutionException: Job execution failed.
I am using Windows 10, kafka-python 2.0.2 and apache-flink 1.15.2.
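The Py4JJavaError shown here only wraps the Java-side failure; the detail that identifies the real cause is in the chained Java exceptions. A small sketch (for debugging only, not a fix; it assumes the script above is otherwise unchanged) that prints the nested causes when the job fails:

from py4j.protocol import Py4JJavaError

try:
    TwitterProcessing()
except Py4JJavaError as e:
    # Walk the Java exception chain so the nested "Caused by" entries, which
    # name the actual problem, are printed rather than just the top-level
    # JobExecutionException.
    cause = e.java_exception
    while cause is not None:
        print(cause.toString())
        cause = cause.getCause()
    raise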
I am trying to read a file stored in Azure Data Lake Gen2 into a Spark DataFrame using Python.
The code is:
from pyspark import SparkConf
from pyspark.sql import SparkSession

# create spark session
key = "some_key"
appName = "DataExtract"
master = "local[*]"
sparkConf = SparkConf() \
    .setAppName(appName) \
    .setMaster(master) \
    .set("fs.azure.account.key.myaccount.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()

data_csv = "abfs://test-file-system@myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system@myaccount.dfs.core.windows.net/data_out.csv"

# read csv
df = spark.read.csv(data_csv)
# write csv
df.write.csv(data_out)
The file is read and written correctly, but I am getting the following error:
ERROR AzureBlobFileSystemStore: Failed to parse the date Thu, 09 Sep 2021 10:12:34 GMT
Date seems to be file creation date.
How can I parse the date to avoid getting the error?
I tried reproducing the same issue and found that these lines are causing the error:
data_csv = "abfs://test-file-system@myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system@myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = spark.read.csv(data_csv)
Here is the code that worked for me after replacing abfs with abfss in the lines above:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "<Your Storage Account Key>"
appName = "<Synapse App Name>"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.<Storage Account Name>.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfss://<ContainerName>#<Storage Account Name>.dfs.core.windows.net/<Directory>"
# read csv
df1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(data_csv + '/sample1.csv')
df1.show()
# write csv
df1.write.csv(data_csv + '/<Give the name of blob you want to write to>.csv')
Alternatively, you can try the code below, which also worked for me:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
account_name = "<StorageAccount Name>"
container_name = "<Storage Account Container Name>"
relative_path = "<Directory path>"
adls_path = 'abfss://%s@%s.dfs.core.windows.net/%s' % (container_name, account_name, relative_path)
dataframe1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(adls_path + '/sample1.csv')
dataframe1.show()
dataframe1.write.csv(adls_path + '/<Give the name of blob you want to write to>.csv')
Reference:
Synapse Spark – Reading CSV files from Azure Data Lake Storage Gen 2 with Synapse Spark using Python - SQL Stijn (sql-stijn.com)
My Python code for creating and running a job in AWS Glue is:
from datetime import datetime, timedelta
from time import sleep

import boto3

glue = boto3.client(
    service_name='glue',
    region_name='ap-south-1',
    endpoint_url='https://glue.ap-south-1.amazonaws.com'
)

myJob = glue.create_job(
    Name='sample',
    Role='Name_of_my_role',
    Command={
        'Name': 'glueetl',
        'ScriptLocation': 's3://s3-location'
    }
)

myNewJobRun = glue.start_job_run(JobName=myJob['Name'])

target_time = datetime.utcnow() + timedelta(minutes=5)
while datetime.utcnow() < target_time:
    status = glue.get_job_run(JobName=myJob['Name'], RunId=myNewJobRun['JobRunId'])
    print(status['JobRun']['JobRunState'])
    sleep(30)
The script that needs to run is:
print "Hello World!"
print "Sevilla lost against Messi FC!"
This is from an example. Upon completion, the job ends with the error Command failed with exit code 1, yet when I check the logs and error logs from the console I get my desired result, i.e. the two print lines in the script ran fine.
Here are the error logs:
Container: ****
LogType:stdout
Log Upload Time:Mon Feb 25 10:46:40 +0000 2019
LogLength:44
Log Contents:
Hello World!
Sevilla lost against Messi FC!
End of LogType:stdout
I forgot to add job.commit() at the end of the script. For example:
import sys
from awsglue.transforms import *
from awsglue.utils import getResolvedOptions
from pyspark.context import SparkContext
from awsglue.context import GlueContext
from awsglue.job import Job
from pyspark.sql import DataFrameReader, DataFrameWriter
from datetime import datetime
import time
# #params: [JOB_NAME]
args = getResolvedOptions(sys.argv, ['JOB_NAME'])
sc = SparkContext()
glueContext = GlueContext(sc)
spark = glueContext.spark_session
job = Job(glueContext)
job.init(args['JOB_NAME'], args)
print("All imports were successful.")
df = spark.read.orc(
's3://****'
)
print("First dataframe read with headers set to True")
df2 = spark.read.orc(
's3://****'
)
print("Second dataframe read with headers set to True")
# df3 = df.join(df2, ['c_0'], "outer")
# df3 = df.join(
# df2,
# df["column_test_1"] == df2["column_1"],
# "outer"
# )
df3 = df.alias('l').join(df2.alias('r'), on='c_0') #.collect()
print("Dataframes have been joined successfully.")
output_file_path = 's3://****'
df3.write.orc(
output_file_path
)
print("Dataframe has been written to csv.")
job.commit()
I am getting the below error when executing the code from the command line on CentOS.
"(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError(u'An error occurred while calling o313.save.\n', JavaObject id=o315), <traceback object at 0x7fca49970320>)"
I am getting this issue only when I submit it through ./bin/spark-submit test.py.
If I just use spark-submit test.py, everything works fine, but then I am not able to run the code on YARN.
I have Anaconda installed on my machine, and I think the second method is picking up the Anaconda spark-submit.
Can anyone please suggest what to do? Do I have to set any environment variables or update the libraries?
Edit:
As requested in the comments, here are more details about the script and versions.
This is the code:
from pyspark import SparkConf, SparkContext, SQLContext
from pyspark.sql import SparkSession, DataFrame
from pyspark.ml.feature import StringIndexer, VectorAssembler, OneHotEncoderEstimator, OneHotEncoder
import sys
from operator import add
import os
import pandas as pd
import numpy as np
from pyspark.sql.types import *
import pyspark.sql.functions as func
from IPython.display import display, HTML
from pyspark.ml import Pipeline, PipelineModel
from pyspark.ml.classification import LogisticRegression, LogisticRegressionModel

path = 'hdfs://10.0.15.42/nih-poc-public-dataset/'
pipeline_path = '/opt/nih/pipeline_model'

try:
    conf = SparkConf().setMaster("local[8]").setAppName("sparkstream").set("spark.driver.allowMultipleContexts", "true")
    sc = SparkContext(conf=conf)
    sqlContext = SQLContext(sc)
    spark = SparkSession.builder\
        .config(conf=conf)\
        .getOrCreate()

    print "SparkContext Version:", sc.version
    print "SparkContext Version:", sc.version
    print "Python version: ", sc.pythonVer
    print "Master URL to connect to: ", sc.master
    print "Path where Spark is installed on worker nodes: ", str(sc.sparkHome)
    print "Retrieve name of the Spark User running SparkContext: ", str(sc.sparkUser())
    print "Application name: ", sc.appName
    print "Application ID: ", sc.applicationId
    print "Default level of parallelism: ", sc.defaultParallelism
    print "Default number of partitions for RDDs: ", sc.defaultMinPartitions

    # ################################ DATA EXTRACTION ################################
    file_name = path + 'maintenance_data.csv'
    df_nih = spark.read.csv(file_name, sep=';', header="true", inferSchema="true")
    df_nih.show(1)
    # print(df_nih.columns)
    print(df_nih.count())

    # ############################### PIPELINE FORMATION ##############################
    categoricalcolumns = ['team', 'provider']
    numericalcolumns = ['lifetime', 'pressureInd', 'moistureInd', 'temperatureInd']

    stages = []
    for categoricalcol in categoricalcolumns:
        stringindexer = StringIndexer(inputCol=categoricalcol, outputCol=categoricalcol + '_index')
        encoder = OneHotEncoderEstimator(inputCols=[stringindexer.getOutputCol()], outputCols=[categoricalcol + "_classVec"])
        stages += [stringindexer, encoder]
        # stages += [stringindexer]

    assemblerinputs = [c + "_classVec" for c in categoricalcolumns] + numericalcolumns
    vectorassembler_stage = VectorAssembler(inputCols=assemblerinputs,
                                            outputCol='features')
    stages += [vectorassembler_stage]
    # output = assembler.transform(df_nih)
    # print(output.show(1))

    indexer = StringIndexer(inputCol='broken', outputCol='label')
    stages += [indexer]

    # pipeline = Pipeline(stages = [stages, vectorassembler_stage, indexer])
    pipeline = Pipeline(stages=stages)
    model_pipeline = pipeline.fit(df_nih)
    model_pipeline.write().overwrite().save(pipeline_path)
except:
    print(sys.exc_info())
finally:
    print('stopping spark session')
    spark.stop()
    sc.stop()
This is the output:
/usr/hdp/3.0.0.0-1634/spark2> ./bin/spark-submit --master yarn /opt/nih/sample_spark.py
SparkContext Version: 2.3.1.3.0.0.0-1634
SparkContext Version: 2.3.1.3.0.0.0-1634
Python version:  2.7
Master URL to connect to:  yarn
Path where Spark is installed on worker nodes:  None
Retrieve name of the Spark User running SparkContext:  spark
Application name:  sparkstream
Application ID:  application_1540925663097_0023
Default level of parallelism:  2
Default number of partitions for RDDs:  2
+--------+------+----------------+----------------+----------------+-----+---------+
|lifetime|broken|     pressureInd|     moistureInd|  temperatureInd| team| provider|
+--------+------+----------------+----------------+----------------+-----+---------+
|      56|     0|92.1788540640753|104.230204454489|96.5171587259733|TeamA|Provider4|
+--------+------+----------------+----------------+----------------+-----+---------+
only showing top 1 row

999
(<class 'py4j.protocol.Py4JJavaError'>, Py4JJavaError(u'An error occurred while calling o309.save.\n', JavaObject id=o311), <traceback object at 0x7f9061c86290>)
stopping spark session
If I comment out the saving part (model_pipeline.write), everything works fine. Please provide your suggestions.
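One detail worth checking (an assumption on my part, not a confirmed diagnosis): the input data is read from an explicit hdfs:// URI, while pipeline_path is a bare path that Spark on YARN resolves against the cluster's default filesystem rather than the driver's local /opt/nih. A minimal sketch of saving the fitted pipeline to an explicit URI instead, so the target filesystem is unambiguous (both locations below are illustrative):

# Sketch only: save the model to an explicit HDFS URI, matching the scheme
# already used for the input data (the directory name is illustrative).
pipeline_path = 'hdfs://10.0.15.42/nih-poc-public-dataset/pipeline_model'
model_pipeline.write().overwrite().save(pipeline_path)

# Or, to keep it on the driver's local disk (only meaningful in local mode):
# model_pipeline.write().overwrite().save('file:///opt/nih/pipeline_model')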
I am receiving a complex JSON message through Kafka. I need to extract the required fields from the JSON and store them in Hive tables. I know I cannot use the Spark driver's sqlContext in the executors. I want to know how to use the sqlContext in the code run by the executors. Here is the code:
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming", topic)
msgs = kvs.map(lambda msg: msg[1])
msgs.foreachRDD(lambda rdd: rdd.foreach(lambda m: timeline_events(m)))

def timeline_events(m):
    msg = json.loads(m)
    for msgJson in msg:
        event_id = msgJson['events'][0]['event_id']
        event_type = msgJson['events'][0]['type']
        incidence_source = msgJson['incident']['source']
        csr_description = msgJson['incident']['data']['csr_description']
        sc_display_priority = msgJson['incident']['data']['display_priority']
        launch_tool_rec_label = msgJson['incident']['data']['LaunchTool'][0]['Label']
        launch_tool_rec_uri = msgJson['incident']['data']['LaunchTool'][0]['URI']
        launch_itg_rec_label = msgJson['incident']['data']['LaunchItg'][0]['Label']
        launch_itg_rec_uri = msgJson['incident']['data']['LaunchItg'][0]['URI']
        sqlContext.sql("Insert into nexus.timeline_events values({},{},{},{},{},{},{},{},{},{},{})".format(event_id, event_type, csr_description, incidence_source, sc_display_priority, launch_tool_rec_label, launch_tool_rec_uri, launch_tool_rec_id, launch_itg_rec_label, launch_itg_rec_uri, launch_itg_rec_id))
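For reference, the usual pattern (a sketch under assumptions, not the only possible solution) is to keep the SQL work on the driver by using foreachRDD: the executors only parse the messages into plain Rows, and the driver-side sqlContext then writes each micro-batch to the Hive table. The helper names and the field subset below are illustrative, and it assumes each Kafka message is a JSON list, as the loop above implies:

from pyspark.sql import Row

def extract_fields(m):
    # Runs on the executors: one Row per element of the incoming JSON list
    # (mirrors the loop above); only a subset of the fields is shown here.
    for msgJson in json.loads(m):
        yield Row(
            event_id=msgJson['events'][0]['event_id'],
            event_type=msgJson['events'][0]['type'],
            incidence_source=msgJson['incident']['source'],
            csr_description=msgJson['incident']['data']['csr_description'],
        )

def save_batch(rdd):
    rows = rdd.flatMap(extract_fields)
    if not rows.isEmpty():
        # The DataFrame is constructed and the insert is issued from the driver,
        # so the driver-side sqlContext can be used safely here.
        sqlContext.createDataFrame(rows).write.insertInto("nexus.timeline_events")

msgs.foreachRDD(save_batch)

Note that insertInto matches columns by position, so the Row fields must line up with the column order of nexus.timeline_events.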