Unable to create a dataframe from json dstream using pyspark - python

I am attempting to create a dataframe from json in dstream but the code below does not seem to get the dataframe right -
import sys
import json
from pyspark import SparkContext
from pyspark.streaming import StreamingContext
from pyspark.sql import SQLContext
def getSqlContextInstance(sparkContext):
if ('sqlContextSingletonInstance' not in globals()):
globals()['sqlContextSingletonInstance'] = SQLContext(sparkContext)
return globals()['sqlContextSingletonInstance']
if __name__ == "__main__":
if len(sys.argv) != 3:
raise IOError("Invalid usage; the correct format is:\nquadrant_count.py <hostname> <port>")
# Initialize a SparkContext with a name
spc = SparkContext(appName="jsonread")
sqlContext = SQLContext(spc)
# Create a StreamingContext with a batch interval of 2 seconds
stc = StreamingContext(spc, 2)
# Checkpointing feature
stc.checkpoint("checkpoint")
# Creating a DStream to connect to hostname:port (like localhost:9999)
lines = stc.socketTextStream(sys.argv[1], int(sys.argv[2]))
lines.pprint()
parsed = lines.map(lambda x: json.loads(x))
def process(time, rdd):
print("========= %s =========" % str(time))
try:
# Get the singleton instance of SQLContext
sqlContext = getSqlContextInstance(rdd.context)
# Convert RDD[String] to RDD[Row] to DataFrame
rowRdd = rdd.map(lambda w: Row(word=w))
wordsDataFrame = sqlContext.createDataFrame(rowRdd)
# Register as table
wordsDataFrame.registerTempTable("mytable")
testDataFrame = sqlContext.sql("select summary from mytable")
print(testDataFrame.show())
print(testDataFrame.printSchema())
except:
pass
parsed.foreachRDD(process)
stc.start()
# Wait for the computation to terminate
stc.awaitTermination()
No errors but when the script runs, it does read the json from streaming context successfully however it does not print the values in summary or the dataframe schema.
Example json I am attempting to read -
{"reviewerID": "A2IBPI20UZIR0U", "asin": "1384719342", "reviewerName":
"cassandra tu \"Yeah, well, that's just like, u...", "helpful": [0,
0], "reviewText": "Not much to write about here, but it does exactly
what it's supposed to. filters out the pop sounds. now my recordings
are much more crisp. it is one of the lowest prices pop filters on
amazon so might as well buy it, they honestly work the same despite
their pricing,", "overall": 5.0, "summary": "good", "unixReviewTime":
1393545600, "reviewTime": "02 28, 2014"}
I am absolute new comer to spark streaming and started working on pet projects by reading documentation. Any help and guidance is greatly appreciated.

Related

How to break down script into smaller function and create main.py? [closed]

Closed. This question is not reproducible or was caused by typos. It is not currently accepting answers.
This question was caused by a typo or a problem that can no longer be reproduced. While similar questions may be on-topic here, this one was resolved in a way less likely to help future readers.
Closed 1 year ago.
Improve this question
I wrote the script in python that works perfectly fine if executed as-is. What I am trying to do is to break this script into meaningful functions and create main.py to execute this as a proper python application.
Here is my LiveStream.py code with which I am collecting data from the sensor at the beginning of every minute, and sending it to the MySQL database, and also posting it to the URL. As mentioned this works perfectly fine if I execute: python3 LiveStream.py
# Import Dependencies
import board
import pandas as pd
from busio import I2C
import adafruit_bme680
from datetime import datetime, timedelta
import time
import requests
import mysql.connector
import json
import sqlalchemy
# read database config file
with open("config.json") as config:
param = json.load(config)
# Create library object using Bus I2C port
i2c = I2C(board.SCL, board.SDA)
bme680 = adafruit_bme680.Adafruit_BME680_I2C(i2c, debug=False)
# change this to match the location's pressure (hPa) at sea level
bme680.sea_level_pressure = 1013.25
# Read data from sensors
while True:
# Create the now variable to capture the current moment
TimeStamp = datetime.now()
Temperature = round((bme680.temperature * 9/5) + 32, 2)
Gas = round(bme680.gas, 2)
Humidity = round(bme680.humidity, 2)
Pressure = round(bme680.pressure, 2)
Altitude = round(bme680.altitude, 2)
now = datetime.strftime(TimeStamp,"%Y-%m-%dT%H:%M:%S")
# Adding collected measurements into dataframe
data = pd.DataFrame([
{
"TimeStamp": now,
"Temperature": Temperature,
"Gas": Gas,
"Humidity": Humidity,
"Pressure": Pressure,
"Altitude": Altitude
}
])
# Try establishing connection with database
try:
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}#{2}/{3}'.
format(param['MyDemoServer'][0]['user'],
param['MyDemoServer'][0]['password'],
param['MyDemoServer'][0]['host'],
param['MyDemoServer'][0]['database']), echo=False)
# Cleaning the data from existing tables MetricValues and Metrics
db_con = engine.connect()
if db_con.connect():
try:
data.to_sql('sensordata', con = db_con, if_exists = 'append', index = False)
db_con.close()
# Dispose the engine
engine.dispose()
except OSError as e:
print(e)
except OSError as e:
print(e)
# Power BI API
# BI Address to push the data to
url = 'https://api.powerbi.com/beta/94cd2fa9-eb6a-490b-af36-53bf7f5ef485/datasets/2a7a2529-dbfd-4c32-9513-7d5857b61137/rows?noSignUpCheck=1&key=nS3bP1Mo4qN9%2Fp6XJcTBgHBUV%2FcOZb0edYrK%2BtVWDg6iWwzRtY16HWUGSqB9YsqF3GHMNO2fe3r5ltB7NhVIvw%3D%3D'
# post/push data to the streaming API
headers = {
"Content-Type": "application/json"
}
response = requests.request(
method="POST",
url=url,
headers=headers,
data=json.dumps(data.to_json())
)
data = pd.DataFrame()
# Re-run the script at the beginning of every new minute.
dt = datetime.now() + timedelta(minutes=1)
dt = dt.replace(second=1)
while datetime.now() < dt:
time.sleep(1)
Here is what I have tried so far... I created a lib folder where I have etl.py file. in this file I tried creating functions such us:
def sensorsreading():
# Create library object using Bus I2C port
i2c = I2C(board.SCL, board.SDA)
bme680 = adafruit_bme680.Adafruit_BME680_I2C(i2c, debug=False)
# change this to match the location's pressure (hPa) at sea level
bme680.sea_level_pressure = 1013.25
# Read data from sensors
while True:
# Create the now variable to capture the current moment
TimeStamp = datetime.now()
Temperature = round((bme680.temperature * 9 / 5) + 32, 2)
Gas = round(bme680.gas, 2)
Humidity = round(bme680.humidity, 2)
Pressure = round(bme680.pressure, 2)
Altitude = round(bme680.altitude, 2)
now = datetime.strftime(TimeStamp, "%Y-%m-%dT%H:%M:%S")
# Adding collected measurements into dataframe
data = pd.DataFrame([
{
"TimeStamp": now,
"Temperature": Temperature,
"Gas": Gas,
"Humidity": Humidity,
"Pressure": Pressure,
"Altitude": Altitude
}
])
return data
And also function:
def dataload(data):
# Try establishing connection with database
try:
engine = sqlalchemy.create_engine('mysql+mysqlconnector://{0}:{1}#{2}/{3}'.
format(param['MyDemoServer'][0]['user'],
param['MyDemoServer'][0]['password'],
param['MyDemoServer'][0]['host'],
param['MyDemoServer'][0]['database']), echo=False)
# Cleaning the data from existing tables MetricValues and Metrics
db_con = engine.connect()
if db_con.connect():
try:
data.to_sql('sensordata', con=db_con, if_exists='append', index=False)
db_con.close()
# Dispose the engine
engine.dispose()
except OSError as e:
print(e)
except OSError as e:
print(e)
And my main.py looks like this:
import pandas as pd
from datetime import datetime, timedelta
import time
from lib.etl import *
def etl(name):
data = sensorsreading()
dataload(data)
# Press the green button in the gutter to run the script.
if __name__ == '__main__':
etl('PyCharm')
# Re-run the script at the beginning of every new minute.
dt = datetime.now() + timedelta(minutes=1)
dt = dt.replace(second=1)
while datetime.now() < dt:
time.sleep(1)
When I run main.py it seems that I am not passing the data frame from sensorsreading() to dataload() function.
Any idea what am I doing wrong here?
To address you original question, you were using yield instead of return. Yields is used in generators, as you can read more here: https://www.geeksforgeeks.org/use-yield-keyword-instead-return-keyword-python/
In the case you don't need a precise execution, this will call the function each 60 seconds. Anyways, I'll sugest using a scheduler like systemctl or cron.
import time
while True:
etl('PyCharm')
time.sleep(60)
If you want something more precise you could use:
import time
starttime = time.time()
while True:
etl('PyCharm')
time.sleep(60.0 - ((time.time() - starttime) % 60.0))
as explained in What is the best way to repeatedly execute a function every x seconds?

Azure Blob storage error can't parse a date in spark

I am trying to read a file allocated in azure datalake gen2 into spark dataframe using python.
Code is
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "some_key"
appName = "DataExtract"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.myaccount.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = self.spark_session.read.csv(data_csv)
# write csv
df.write.csv(data_out)
The file is read and is written well, but I am getting following error
ERROR AzureBlobFileSystemStore: Failed to parse the date Thu, 09 Sep 2021 10:12:34 GMT
Date seems to be file creation date.
How can I parse the date to avoid getting the error?
I tried reproducing the same issue and found it is with these lines that is causing the error.
data_csv="abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv" data_out =
"abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv df = self.spark_session.read.csv(data_csv) ```
Here is the code that worked for me when I tried replacing the above lines of code i.e.. abfs to abfss
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "<Your Storage Account Key>"
appName = "<Synapse App Name>"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.<Storage Account Name>.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfss://<ContainerName>#<Storage Account Name>.dfs.core.windows.net/<Directory>"
# read csv
df1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(data_csv + '/sample1.csv')
df1.show()
# write csv
df2 = df1.write.csv(data_csv + '/<Give the name of blob you want to write to>.csv')
else you can even try the below code which perfectly worked for me
from pyspark.sql import SparkSession
from pyspark.sql.types import *
account_name = "<StorageAccount Name>"
container_name = "<Storage Account Container Name>"
relative_path = "<Directory path>"
adls_path = 'abfss://%s#%s.dfs.core.windows.net/%s'%(container_name,account_name,relative_path)
dataframe1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(adls_path + '/sample1.csv')
dataframe1.show()
dataframe2 = dataframe1.write.csv(adls_path + '/<Give the name of blob you want to write to>.csv')
REFERENCE :
Synapse Spark – Reading CSV files from Azure Data Lake Storage Gen 2 with Synapse Spark using Python - SQL Stijn (sql-stijn.com)

Writting data from pubsub to bigtable via cloud functions

I am a beginner at cloud big table and have big issues using cloud functions writing data from pub/sub to bigtable.
Cloud functions gets the messages from pubsub, but the issue is in the next step, writing it into bigtable.
The message is created in a python script and sent to pub/sub.
One example for a message:
b'{"eda":2.015176,"temperature":33.39,"bvp":-0.49,"x_acc":-36.0,"y_acc":-38.0,"z_acc":-128.0,"heart_rate":83.78,"iddevice":15.0,"timestamp":"2019-12-01T20:01:36.927Z"}'
For writing it into bigtable I created a table:
from google.cloud import bigtable
from google.cloud.bigtable import column_family
client = bigtable.Client(project="projectid", admin=True)
instance = client.instance("bigtableinstance")
table = instance.table("bigtable1")
print('Creating the {} table.'.format(table))
print('Creating columnfamily cf1 with Max Version GC rule...')
max_versions_rule = column_family.MaxVersionsGCRule(2)
column_family_id = 'cf1'
column_families = {column_family_id: max_versions_rule}
if not table.exists():
table.create(column_families=column_families)
print("Table {} is created.".format(table))
else:
print("Table {} already exists.".format(table))
This works without problems.
Now I tried to write the message via pub/sub to bigtable with the following python code in cloud functions using the main method:
import json
import base64
import os
from google.cloud import bigtable
from google.cloud.bigtable import column_family, row_filters
project_id = os.environ.get('projetid', 'UNKNOWN')
INSTANCE = 'bigtableinstance'
TABLE = 'bigtable1'
client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"
def writeToBigTable(table, data):
# Parameters row_key (bytes) – The key for the row being created.
# Returns A row owned by this table.
row_key = data[colFamily]['iddevice'].value.encode()
row = table.row(row_key)
for colFamily in data.keys():
for key in data[colFamily].keys():
row.set_cell(colFamily,
key,
data[colFamily][key])
table.mutate_rows([row])
return data
def selectTable():
stage = os.environ.get('stage', 'dev')
table_id = TABLE + stage
table = instance.table(table_id)
return table
def main(event, context):
data = base64.b64decode(event['data']).decode('utf-8')
print("DATA: {}".format(data))
eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
table = selectTable()
data = {'eda': eda,
'temperature': temperature,
'bvp': bvp,
'x_acc':x_acc,
'y_acc':y_acc,
'z_acc':z_acc,
'heart_rate':heart_rate,
'iddevice':iddevice,
'timestamp':timestamp}
writeToBigTable(table, data)
print("Data Written: {}".format(data))
I tried different versions but cannot find a solution.
Thanks for the help.
All the best
Dominik
I think this line is wrong:
row_key = data[colFamily]['iddevice'].value.encode()
You're passing in the data object, but it doesn't have a 'cf1' property. You also don't have to encode it. Give this a try:
row_key = data['iddevice']
Your for loop will also have the same issue. I think this is what you want instead
for col in data.keys():
row.set_cell(colFamily, key, data[key])
Also, I know you're just playing with it, but using a device id as the only value for a rowkey will end up poorly. What is recommended might be to combine the rowkey and the date or one of your other properties (depending on your query,) and use that as your rowkey. There is a document on Cloud Bigtable schema that is helpful, and a codelab using a more realistic sample dataset and walks through how to pick a schema for that example. It's in Java, but you can still import the data and run your own queries.
first thanks a lot for the help.
I tried to fix it with you code recommendation which is , but unfortunately it doesn't work now due to other errors.
AttributeError: 'DirectRow' object has no attribute 'append'
I guess this is within the following line of code
row.set_cell(colFamily,
key,
data[key])
I could imagine that the errors origin is in the split of the string "data"
eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
E.g. eda would look like this:
"'eda':2.015176"
which looks pretty wrong to me.
Especially when I insert it into the following dict:
data = {'eda': eda,....}
The error
AttributeError: 'DirectRow' object has no attribute 'append'
seems to say, that there is a problem with the data I want to process with set_cell. There is said set_cell with row as a list or any other iterable of Direct Row Instance. Shouldn't fit a dic for it?
I tried a workaround with a list, but this seems to make it even worse.
client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"
def writeToBigTable(table, dat):
row_key = "{}-{}".format(dat[16], dat[17])
row = table.row(row_key)
for n in range(len(dat)):
row.set_cell(colFamily,
dat[n],
dat[n+9])
table.mutate_rows([row])
return dat
def selectTable():
stage = os.environ.get('stage', 'dev')
table_id = TABLE + stage
table = instance.table(table_id)
return table
def main(event, context):
data = base64.b64decode(event['data']).decode('utf-8')
print("DATA: {}".format(data))
var_1, eda, var_2, temperature, var_3, bvp, var_4, x_acc, var_5, y_acc, var_6, z_acc, var_7, heart_rate, var_8, iddevice, var_9, timestamp = data.replace(':',',').split(',')
table = selectTable(); dat = [var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp];
# data = {'eda': eda,
# 'temperature': temperature,
# 'bvp': bvp,
# 'x_acc':x_acc,
# 'y_acc':y_acc,
# 'z_acc':z_acc,
# 'heart_rate':heart_rate,
# 'iddevice':iddevice,
# 'timestamp':timestamp}
writeToBigTable(table, dat)
print("Data Written: {}".format(data))
I am really hard stuck at this problem and have no further ideas how to solve it.

Getting profiled user groups from ALS(Alternating Least Squares) algorithm

We are using ALS (Alternating Least Squares) method in our Google Cloud spark environment to recommend some companies to our users. For making the recommendation we are using this tuple (userId, companyId, rating) and the rating value consists of a combination of the user's interests such as clicking the company page, adding a company to favorite list, making an order from the company, etc. (our method is very similar to this link)
And the results are pretty good and works for our business case, however, we are missing 1 thing which is important for us.
We need to learn which users are grouped as similar interests(a.k.a neighbors), Do you know is there any way to get grouped users from pyspark's ALS algorithm?
So we would be able to tag the users according to that grouping
Edit:
I've tried the answered code in the below but the results are strange, my data is paired like this (userId, companyId, rating)
When I run the below code, it groups the users with no common companyId in the same clusterId.
For example, one of the results of the below code is:
(userId: 471, clusterId: 2)
(userId: 490, clusterId: 2)
However users 471 and 490 have nothing in common. I think there is a mistake here:
from __future__ import print_function
import sys
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import IntegerType
from pyspark.mllib.clustering import KMeans, KMeansModel
conf = SparkConf().setAppName("user_clustering")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)
CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_DB_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD = sys.argv[4]
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])
TABLE_ITEMS = "companies"
TABLE_RATINGS = "ml_ratings"
TABLE_RECOMMENDATIONS = "ml_reco"
TABLE_USER_CLUSTERS = "ml_user_clusters"
# Read the data from the Cloud SQL
# Create dataframes
#[START read_from_sql]
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.read.jdbc(url=jdbcUrl, table=TABLE_ITEMS)
dfRates = sqlContext.read.jdbc(url=jdbcUrl, table=TABLE_RATINGS)
print("Start Clustering Users")
# print("User Ratings:")
# dfRates.show(100)
#[END read_from_sql]
# Get all the ratings rows of our user
# print("Filtered User Ratings For User:",USER_ID)
# print("------------------------------")
# for x in dfUserRatings:
# print(x)
#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6,2,2])
print("RDDTraining Size:",rddTraining.count()," RDDValidating Size:",rddValidating.count()," RDDTesting Size:",rddTesting.count())
print("Rank:",BEST_RANK," Iteration:",BEST_ITERATION," Regulation:",BEST_REGULATION)
#print("RDD Training Values:",rddTraining.collect())
#[END split_sets]
print("Start predicting")
#[START predict]
# Build our model with the best found values
# Rating, Rank, Iteration, Regulation
model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)
# print("-----------------")
# print("User Groups Are Created")
# print("-----------------")
user_features = model.userFeatures().map(lambda x: x[1])
related_users = model.userFeatures().map(lambda x: x[0])
number_of_clusters = 10
model_kmm = KMeans.train(user_features, number_of_clusters, initializationMode = "random", runs = 3)
user_features_with_cluster_id = model_kmm.predict(user_features)
user_features_with_related_users = related_users.zip(user_features_with_cluster_id)
clusteredUsers = user_features_with_related_users.map(lambda x: (x[0],x[1]))
orderedUsers = clusteredUsers.takeOrdered(200,key = lambda x: x[1])
print("Ordered Users:")
print("--------------")
for x in orderedUsers:
print(x)
#[START save user groups]
userGroupSchema = StructType([StructField("primaryUser", IntegerType(), True), StructField("groupId", IntegerType(), True)])
dfUserGroups = sqlContext.createDataFrame(orderedUsers,userGroupSchema)
try:
dfUserGroups.write.jdbc(url=jdbcUrl, table=TABLE_USER_CLUSTERS, mode='append')
except:
print("Data is already written to DB")
print("Written to DB and Finished Job")
Once you have trained your model you can get the users feature vector using userFeatures()
After that, you can calculate the distance between the users using some distance function or use a clustering model like KMeans
So if the model is already trained:
user_features = model.userFeatures().map(lambda x: x[1]).repartition(50)
number_of_clusters = 10
model_kmm = KMeans.train(user_features, number_of_clusters, initializationMode = "random", runs = 3)
user_features_with_cluster_id = model_kmm.predict(user_features).zip(user_features)

ERROR:SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063

I am presently working with ASN 1 Decoder.I will be getting a Hex decimal code from producer and i will be collecting it in consumer.
Then after i will be converting the hex code to RDD and then pass the hex value RDD to another function with in same class Decode_Module and will be using python asn1 decoder to decode the hex data and return it back and print it.
I don't understand whats wrong with my code.I have already installed my asn1 parser dependencies in worker nodes too.
Any wrong with the way i call in lambda expression or something else.
My ERROR: Exception: It appears that you are attempting to reference SparkContext from a broadcast variable, action, or transformation. SparkContext can only be used on the driver, not in code that it run on workers. For more information, see SPARK-5063
PLEASE HELP ME THANK YOU
My CODE:
class telco_cn:
def __init__(self,sc):
self.sc = sc
print ('in init function')
logging.info('eneterd into init function')
def decode_module(self,msg):
try:
logging.info('Entered into generate module')
### Providing input for module we need to load
load_module(config_values['load_module'])
### Providing Value for Type of Decoding
ASN1.ASN1Obj.CODEC = config_values['PER_DECODER']
### Providing Input for Align/UnAlign
PER.VARIANT = config_values['PER_ALIGNED']
### Providing Input for pdu load
pdu = GLOBAL.TYPE[config_values['pdu_load']]
### Providing Hex value to buf
buf = '{}'.format(msg).decode('hex')
return val
except Exception as e:
logging.debug('error in decode_module function %s' %str(e))
def consumer_input(self,sc,k_topic):
logging.info('entered into consumer input');print(k_topic)
consumer = KafkaConsumer(ip and other values given)
consumer.subscribe(k_topic)
for msg in consumer:
print(msg.value);
a = sc.parallelize([msg.value])
d = a.map(lambda x: self.decode_module(x)).collect()
print d
if __name__ == "__main__":
logging.info('Entered into main')
conf = SparkConf()
conf.setAppName('telco_consumer')
conf.setMaster('yarn-client')
sc = SparkContext(conf=conf)
sqlContext = HiveContext(sc)
cn = telco_cn(sc)
cn.consumer_input(sc,config_values['kafka_topic'])
This is because self.decode_module contain instance of SparkContext.
To fix your code you can use #staticmethod:
class telco_cn:
def __init__(self, sc):
self.sc = sc
#staticmethod
def decode_module(msg):
return msg
def consumer_input(self, sc, k_topic):
a = sc.parallelize(list('abcd'))
d = a.map(lambda x: telco_cn.decode_module(x)).collect()
print d
if __name__ == "__main__":
conf = SparkConf()
sc = SparkContext(conf=conf)
cn = telco_cn(sc)
cn.consumer_input(sc, '')
For more infomation:
http://spark.apache.org/docs/latest/programming-guide.html#passing-functions-to-spark
You cannot reference the instance method (self.decode_module) inside the lambda expression, because it the instance object contains a SparkContext reference.
This occurs because internally PySpark tries to Pickle everything it gets to send to its workers. So when you say it should execute self.decode_module() inside the nodes, PySpark tries to pickle the whole (self) object (that contains a reference to the spark context).
To fix that, you just need to remove the SparkContext reference from the telco_cn class and use a different approach like using the SparkContext before calling the class instance (like Zhangs's answer suggests).
With me the issue was:
text_df = "some text"
convertUDF = udf(lambda z: my_fynction(z), StringType())
cleaned_fun = text_df.withColumn('cleaned', udf(convertUDF, StringType())('text'))
I was giving udf() twice. Just did this:
convertUDF = lambda z: my_fynction(z)
cleaned_fun = text_df.withColumn('cleaned', udf(convertUDF, StringType())('text'))
and solved the error

Categories

Resources