SparkStreaming: How to get list like collect() - python

I am beginner of SparkStreaming.
I want to load HBase record at SparkStreaming App.
So, I write the the under code by python.
My "load_records" function is getting HBase Records and return the records.
SparkStreaming can not use collect(). sc.newAPIHadoopRDD() need to be used at Driver Program. But SparkStreaming do not have the method which can get objects from workers to driver.
How to get HBase Record at SparkStreaming? or How to call sc.newAPIHadoopRDD()?
def load_records(sc, table, keys):
host = 'localhost'
keyConv = "org.apache.spark.examples.pythonconverters.ImmutableBytesWritableToStringConverter"
valueConv = "org.apache.spark.examples.pythonconverters.HBaseResultToStringConverter"
rdd_list = []
for key in keys:
if table == "user":
conf = {"hbase.zookeeper.quorum": host, "hbase.mapreduce.inputtable": "user",
"hbase.mapreduce.scan.columns": "u:uid",
"hbase.mapreduce.scan.row.start": key, "hbase.mapreduce.scan.row.stop": key + "\x00"}
rdd = sc.newAPIHadoopRDD("org.apache.hadoop.hbase.mapreduce.TableInputFormat",
"org.apache.hadoop.hbase.io.ImmutableBytesWritable",
"org.apache.hadoop.hbase.client.Result",
keyConverter=keyConv, valueConverter=valueConv, conf=conf)
rdd_list.append(rdd)
first_rdd = rdd_list.pop(0)
for rdd in rdd_list:
first_rdd = first_rdd.union(rdd)
return first_rdd
sc = SparkContext(appName="UserStreaming")
ssc = StreamingContext(sc, 3)
topics = ["json"]
broker_list = "localhost:9092"
inputs = KafkaUtils.createDirectStream(ssc, topics, {"metadata.broker.list": broker_list})
jsons = inputs.map(lambda input: json.loads(input[1]))
user_id_rdd = jsons.map(lambda json: json["user_id"])
# the under line is not working. Any another methods?
user_id_list = user_id_rdd.collect()
user_record_rdd = load_records(sc, 'user', user_id_list)

Related

Writting data from pubsub to bigtable via cloud functions

I am a beginner at cloud big table and have big issues using cloud functions writing data from pub/sub to bigtable.
Cloud functions gets the messages from pubsub, but the issue is in the next step, writing it into bigtable.
The message is created in a python script and sent to pub/sub.
One example for a message:
b'{"eda":2.015176,"temperature":33.39,"bvp":-0.49,"x_acc":-36.0,"y_acc":-38.0,"z_acc":-128.0,"heart_rate":83.78,"iddevice":15.0,"timestamp":"2019-12-01T20:01:36.927Z"}'
For writing it into bigtable I created a table:
from google.cloud import bigtable
from google.cloud.bigtable import column_family
client = bigtable.Client(project="projectid", admin=True)
instance = client.instance("bigtableinstance")
table = instance.table("bigtable1")
print('Creating the {} table.'.format(table))
print('Creating columnfamily cf1 with Max Version GC rule...')
max_versions_rule = column_family.MaxVersionsGCRule(2)
column_family_id = 'cf1'
column_families = {column_family_id: max_versions_rule}
if not table.exists():
table.create(column_families=column_families)
print("Table {} is created.".format(table))
else:
print("Table {} already exists.".format(table))
This works without problems.
Now I tried to write the message via pub/sub to bigtable with the following python code in cloud functions using the main method:
import json
import base64
import os
from google.cloud import bigtable
from google.cloud.bigtable import column_family, row_filters
project_id = os.environ.get('projetid', 'UNKNOWN')
INSTANCE = 'bigtableinstance'
TABLE = 'bigtable1'
client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"
def writeToBigTable(table, data):
# Parameters row_key (bytes) – The key for the row being created.
# Returns A row owned by this table.
row_key = data[colFamily]['iddevice'].value.encode()
row = table.row(row_key)
for colFamily in data.keys():
for key in data[colFamily].keys():
row.set_cell(colFamily,
key,
data[colFamily][key])
table.mutate_rows([row])
return data
def selectTable():
stage = os.environ.get('stage', 'dev')
table_id = TABLE + stage
table = instance.table(table_id)
return table
def main(event, context):
data = base64.b64decode(event['data']).decode('utf-8')
print("DATA: {}".format(data))
eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
table = selectTable()
data = {'eda': eda,
'temperature': temperature,
'bvp': bvp,
'x_acc':x_acc,
'y_acc':y_acc,
'z_acc':z_acc,
'heart_rate':heart_rate,
'iddevice':iddevice,
'timestamp':timestamp}
writeToBigTable(table, data)
print("Data Written: {}".format(data))
I tried different versions but cannot find a solution.
Thanks for the help.
All the best
Dominik
I think this line is wrong:
row_key = data[colFamily]['iddevice'].value.encode()
You're passing in the data object, but it doesn't have a 'cf1' property. You also don't have to encode it. Give this a try:
row_key = data['iddevice']
Your for loop will also have the same issue. I think this is what you want instead
for col in data.keys():
row.set_cell(colFamily, key, data[key])
Also, I know you're just playing with it, but using a device id as the only value for a rowkey will end up poorly. What is recommended might be to combine the rowkey and the date or one of your other properties (depending on your query,) and use that as your rowkey. There is a document on Cloud Bigtable schema that is helpful, and a codelab using a more realistic sample dataset and walks through how to pick a schema for that example. It's in Java, but you can still import the data and run your own queries.
first thanks a lot for the help.
I tried to fix it with you code recommendation which is , but unfortunately it doesn't work now due to other errors.
AttributeError: 'DirectRow' object has no attribute 'append'
I guess this is within the following line of code
row.set_cell(colFamily,
key,
data[key])
I could imagine that the errors origin is in the split of the string "data"
eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp = data.split(',')
E.g. eda would look like this:
"'eda':2.015176"
which looks pretty wrong to me.
Especially when I insert it into the following dict:
data = {'eda': eda,....}
The error
AttributeError: 'DirectRow' object has no attribute 'append'
seems to say, that there is a problem with the data I want to process with set_cell. There is said set_cell with row as a list or any other iterable of Direct Row Instance. Shouldn't fit a dic for it?
I tried a workaround with a list, but this seems to make it even worse.
client = bigtable.Client(project=project_id, admin=True)
instance = client.instance(INSTANCE)
colFamily = "cf1"
def writeToBigTable(table, dat):
row_key = "{}-{}".format(dat[16], dat[17])
row = table.row(row_key)
for n in range(len(dat)):
row.set_cell(colFamily,
dat[n],
dat[n+9])
table.mutate_rows([row])
return dat
def selectTable():
stage = os.environ.get('stage', 'dev')
table_id = TABLE + stage
table = instance.table(table_id)
return table
def main(event, context):
data = base64.b64decode(event['data']).decode('utf-8')
print("DATA: {}".format(data))
var_1, eda, var_2, temperature, var_3, bvp, var_4, x_acc, var_5, y_acc, var_6, z_acc, var_7, heart_rate, var_8, iddevice, var_9, timestamp = data.replace(':',',').split(',')
table = selectTable(); dat = [var_1, var_2, var_3, var_4, var_5, var_6, var_7, var_8, var_9, eda, temperature, bvp, x_acc, y_acc, z_acc, heart_rate, iddevice, timestamp];
# data = {'eda': eda,
# 'temperature': temperature,
# 'bvp': bvp,
# 'x_acc':x_acc,
# 'y_acc':y_acc,
# 'z_acc':z_acc,
# 'heart_rate':heart_rate,
# 'iddevice':iddevice,
# 'timestamp':timestamp}
writeToBigTable(table, dat)
print("Data Written: {}".format(data))
I am really hard stuck at this problem and have no further ideas how to solve it.

How can I get my Python Code to restart when the network disconnects

I have a piece of Python Code running as a service that pulls weather data via API.
The code itself runs perfectly fine when everything is hunky dory, ie the network, but I have noticed that sometimes the WiFi on the Pi that is pulling the API data will drop and then the python codes seems to stop.
I have a small line of code providing the most basic of logs, but I would like to improve upon it greatly. The log code just provides me with the datetime.now so I can see when the last time the code ran was.
#!/usr/bin/python3
#import modules
import cymysql
from time import sleep
from urllib.request import urlopen
import json
import datetime
#set MySQl Variables
host = "localhost"
user = "xxx"
password = "xxx"
schema = "xxx"
#connect to MySQL DB
db = cymysql.connect(host, user, password, schema)
curs = db.cursor()
#set api key for DarkSky API
apikey="xxx"
# Latitude & longitude
lati="-26.20227"
longi="28.04363"
# Add units=si to get it in sensible ISO units.
url="https://api.forecast.io/forecast/"+apikey+"/"+lati+","+longi+"?units=si"
#begin infinite loop
while True:
#convert API reading to json and readable array 'weather'
meteo=urlopen(url).read()
meteo = meteo.decode('utf-8')
weather = json.loads(meteo)
#set variables for current weather
cTemp = (weather['currently']['temperature'])
cCond = (weather['currently']['summary'])
cRain1 = (weather['currently']['precipProbability'])
cRain2 = cRain1*100
cIcon = (weather['currently']['icon'])
oaSum = (weather['daily']['summary'])
#print variables - for testing purposes
#print (cTemp)
#print (cCond)
#print (cRain2)
#print (cIcon)
#print (oaSum)
#extract daily data from 'weather' array
daily = (weather['daily']['data'])
#create new arrays for daily variables
listHigh = []
listLow = []
listCond = []
listRain = []
listIcon = []
#set daily variables
for i in daily:
listHigh.append(i['temperatureHigh'])
for i in range(0,len(listHigh)):
high1 = listHigh[0]
high2 = listHigh[1]
high3 = listHigh[2]
high4 = listHigh[3]
high5 = listHigh[4]
high6 = listHigh[5]
high7 = listHigh[6]
high8 = listHigh[7]
for o in daily:
listLow.append(o['temperatureLow'])
for o in range(0,len(listLow)):
low1 = listLow[0]
low2 = listLow[1]
low3 = listLow[2]
low4 = listLow[3]
low5 = listLow[4]
low6 = listLow[5]
low7 = listLow[6]
low8 = listLow[7]
for p in daily:
listCond.append(p['summary'])
for p in range(0,len(listCond)):
cond1 = listCond[0]
cond2 = listCond[1]
cond3 = listCond[2]
cond4 = listCond[3]
cond5 = listCond[4]
cond6 = listCond[5]
cond7 = listCond[6]
cond8 = listCond[7]
for m in daily:
listRain.append(m['precipProbability'])
for m in range(0,len(listRain)):
rain1 = listRain[0]
rain2 = listRain[1]
rain3 = listRain[2]
rain4 = listRain[3]
rain5 = listRain[4]
rain6 = listRain[5]
rain7 = listRain[6]
rain8 = listRain[7]
#convert rain chance to readable percentage
prain1 = rain1*100
prain2 = rain2*100
prain3 = rain3*100
prain4 = rain4*100
prain5 = rain5*100
prain6 = rain6*100
prain7 = rain7*100
prain8 = rain8*100
for l in daily:
listIcon.append(l['icon'])
for l in range (0,len(listIcon)):
icon1 = listIcon[0]
icon2 = listIcon[1]
icon3 = listIcon[2]
icon4 = listIcon[3]
icon5 = listIcon[4]
icon6 = listIcon[5]
icon7 = listIcon[6]
icon8 = listIcon[7]
#print daily variables - for testing purposes
#print (high1)
#print (low1)
#print (cond1)
#print (prain1)
#print (icon1)
#print (high2)
#print (low2)
#print (cond2)
#print (prain2)
#print (icon2)
#update data in DataBase
try:
sql_update_query = """UPDATE weather SET current_temp = %s, cur$
varis = (cTemp, cCond, cRain2, cIcon, high1, low1, cond1, prain$
curs.execute(sql_update_query, varis)
db.commit()
except db.Error as error:
print("Error: {}".format(error))
db.rollback()
#write date to log file
with open ("/home/pi/CoRo/Projects/WeatherMan/weatherlog.txt", mode="w") as file:
file.write('Last Data was pulled at: %s' %(datetime.datetime.now()))
#set loop to sleep for 10 minutes and go again
sleep(600)
I understand that the Database Code is snipped, but it is just the variables being put in to the database, which I can see works.
However if the network disconnects, the code stops and the database is left with the last polled API data.
How would I restart the python code if the API get fails?
Thanks in advance,
You could rewrite the portion of your code that pulls the weather data as a function or separate module. This would allow you to call it only when the network connection is working. Some pseudo code below:
if network_connection:
pull_weather_data()
else:
do_something()
do_something() could be an effort to reconnect to the network, such as resetting your network adapter.
You could determine the state of the network connection by trying to ping your router or an external IP like one of Google's DNS server (8.8.8.8 or 8.8.4.4).
To avoid nested loops you could use the continue clause. For example:
while True:
if network_connection:
pull_weather_data()
else:
reset_network_connection()
time.sleep(180) # Sleep for 3 minutes.
continue
The continue will send the interpreter back to the start of the while loop. From there it will check the network connection and either pull data or reset the network connection and sleep for another 3 minutes.
Using Quernons answer above the code has been edited as follows:
#!/usr/bin/python3
#import modules
import os
import cymysql
from time import sleep
from urllib.request import urlopen
import json
import datetime
#set MySQl Variables
host = "localhost"
user = "xxx"
password = "xxx"
schema = "xxx"
#connect to MySQL DB
db = cymysql.connect(host, user, password, schema)
curs = db.cursor()
#set api key for DarkSky API
apikey="xxx"
# Latitude & longitude
lati="-26.20227"
longi="28.04363"
# Add units=si to get it in sensible ISO units not stupid Fahreneheit.
url="https://api.forecast.io/forecast/"+apikey+"/"+lati+","+longi+"?units=si"
#begin infinite loop
while True:
#function to check if there is an internet connection
def check_ping():
hostname = "8.8.8.8"
response = os.system("ping -c 1 " + hostname)
#and then check the response...
if response == 0:
pingstatus = 0
else:
pingstatus = 1
return pingstatus
networkstatus = check_ping()
#print check_ping() - for testing purposes
#print (networkstatus)
#function to pull weather data from API
def get_weather():
#insert weather data here with no changes
if networkstatus == 0:
get_weather()
else:
print ("Resetting Network Adapters")
dwnnw = 'ifconfig wlan0 down'
upnw = 'ifconfig wlan0 up'
os.system(dwnnw)
os.system(upnw)
sleep(180)
continue

Using Hive SQLContext in spark executors

I am getting a complex json message through kafka. I need to extract the required fields from the json and store them in hive tables. I know I cannot use the spark driver sqlContext in the executors. I want to know how to use the sqlContext in the code run by the executors. Here is the code :
kvs = KafkaUtils.createStream(ssc, zkQuorum, "spark-streaming", topic)
msgs = kvs.map(lambda msg: msg[1])
msgs.foreachRDD(lambda rdd: rdd.foreach(lambda m : timeline_events(m)))
def timeline_events(m):
msg = json.loads(m)
for msgJson in msg:
event_id = msgJson['events'][0]['event_id']
event_type = msgJson['events'][0]['type']
incidence_source = msgJson['incident']['source']
csr_description = msgJson['incident']['data']['csr_description']
sc_display_priority = msgjson['incident']['data']['display_priority']
launch_tool_rec_label = msgJson['incident']['data']['LaunchTool'][0]['Label']
launch_tool_rec_uri = msgJson['incident']['data']['LaunchTool'][0]['URI']
launch_itg_rec_label = msgJson['incident']['data']['LaunchItg'][0]['Label']
launch_itg_rec_uri = msgJson['incident']['data']['LaunchItg'][0]['URI']
sqlContext.sql("Insert into nexus.timeline_events values({},{},{},{},{},{},{},{},{},{},{})".format(event_id, event_type, csr_description, incidence_source, sc_display_priority, launch_tool_rec_label,launch_tool_rec_uri, launch_tool_rec_id, launch_itg_rec_label, launch_itg_rec_uri, launch_itg_rec_id))

load the csv file into Big query auto detect schema using python API

I'm trying to load the CSV file with schema under auto detection but I am unable to load the file into Big query. Can any one help me on this.
Please find my code below:
def load_data_from_file(dataset_name, table_name, source_file_name):
bigquery_client = bigquery.Client()
dataset = bigquery_client.dataset(dataset_name)
table = dataset.table(table_name)
table.reload()
with open(source_file_name, 'rb') as source_file:
job = table.upload_from_file(
source_file, source_format='text/csv')
wait_for_job(job)
print('Loaded {} rows into {}:{}.'.format(
job.output_rows, dataset_name, table_name))
def wait_for_job(job):
while True:
job.reload()
if job.state == 'DONE':
if job.error_result:
raise RuntimeError(job.errors)
return
time.sleep(1)
Based on the Google BigQuery python API documentation, you should set source_format to 'CSV' instead of 'text/csv':
source_format='CSV'
Code Sample:
with open(csv_file.name, 'rb') as readable:
table.upload_from_file(
readable, source_format='CSV', skip_leading_rows=1)
Source: https://googlecloudplatform.github.io/google-cloud-python/stable/bigquery-usage.html#datasets
If this does not solve your problem, please provide more specifics about the errors you are observing.
You can use the below code snippet to create and load data (CSV format) from Cloud Storage to BigQuery with auto-detect schema:
from google.cloud import bigquery
bigqueryClient = bigquery.Client()
jobConfig = bigquery.LoadJobConfig()
jobConfig.skip_leading_rows = 1
jobConfig.source_format = bigquery.SourceFormat.CSV
jobConfig.write_disposition = bigquery.WriteDisposition.WRITE_APPEND
jobConfig.autodetect=True
datasetName = "dataset-name"
targetTable = "table_name"
uri = "gs://bucket_name/file.csv"
tableRef = bigqueryClient.dataset(datasetName).table(targetTable)
bigqueryJob = bigqueryClient.load_table_from_uri(uri, tableRef, job_config=jobConfig)
bigqueryJob.result()
Currently, the Python Client has no support for loading data from file with a schema auto-detection flag (I plan on doing a pull request to add this support but still I'd like to talk to the maintainers what their opinions are on this implementation).
There are still some ways to work this around. I didn't find a very elegant solution so far but nevertheless this code allows you to add schema detection as input flag:
from google.cloud.bigquery import Client
import os
os.environ['GOOGLE_APPLICATION_CREDENTIALS'] = 'path/your/json.key'
import google.cloud.bigquery.table as mtable
def _configure_job_metadata(metadata,
allow_jagged_rows,
allow_quoted_newlines,
create_disposition,
encoding,
field_delimiter,
ignore_unknown_values,
max_bad_records,
quote_character,
skip_leading_rows,
write_disposition):
load_config = metadata['configuration']['load']
if allow_jagged_rows is not None:
load_config['allowJaggedRows'] = allow_jagged_rows
if allow_quoted_newlines is not None:
load_config['allowQuotedNewlines'] = allow_quoted_newlines
if create_disposition is not None:
load_config['createDisposition'] = create_disposition
if encoding is not None:
load_config['encoding'] = encoding
if field_delimiter is not None:
load_config['fieldDelimiter'] = field_delimiter
if ignore_unknown_values is not None:
load_config['ignoreUnknownValues'] = ignore_unknown_values
if max_bad_records is not None:
load_config['maxBadRecords'] = max_bad_records
if quote_character is not None:
load_config['quote'] = quote_character
if skip_leading_rows is not None:
load_config['skipLeadingRows'] = skip_leading_rows
if write_disposition is not None:
load_config['writeDisposition'] = write_disposition
load_config['autodetect'] = True # --> Here you can add the option for schema auto-detection
mtable._configure_job_metadata = _configure_job_metadata
bq_client = Client()
ds = bq_client.dataset('dataset_name')
ds.table = lambda: mtable.Table('table_name', ds)
table = ds.table()
with open(source_file_name, 'rb') as source_file:
job = table.upload_from_file(
source_file, source_format='text/csv')
Just wanted to show how i've used the python client.
Below is my function to create a table and load it with a csv file.
Also, self.client is my bigquery.Client()
def insertTable(self, datasetName, tableName, csvFilePath, schema=None):
"""
This function creates a table in given dataset in our default project
and inserts the data given via a csv file.
:param datasetName: The name of the dataset to be created
:param tableName: The name of the dataset in which the table needs to be created
:param csvFilePath: The path of the file to be inserted
:param schema: The schema of the table to be created
:return: returns nothing
"""
csv_file = open(csvFilePath, 'rb')
dataset_ref = self.client.dataset(datasetName)
# <import>: from google.cloud.bigquery import Dataset
dataset = Dataset(dataset_ref)
table_ref = dataset.table(tableName)
if schema is not None:
table = bigquery.Table(table_ref,schema)
else:
table = bigquery.Table(table_ref)
try:
self.client.delete_table(table)
except:
pass
table = self.client.create_table(table)
# <import>: from google.cloud.bigquery import LoadJobConfig
job_config = LoadJobConfig()
table_ref = dataset.table(tableName)
job_config.source_format = 'CSV'
job_config.skip_leading_rows = 1
job_config.autodetect = True
job = self.client.load_table_from_file(
csv_file, table_ref, job_config=job_config)
job.result()
Let me know if this solves your problem.

pymongo- upsert not able to perform insertion with $set operation

I am having an empty collection and have thousands of entries to process (entries might have redudancy for which I want to use both updates and inserts).
The python code (using pymongo) I wrote:
for mydoc in alldocs:
key = {'myid': mydoc['myid']}
data = process_doc(mydoc) # returns simple dictionary
db.mydocs.update(key, {"$set": data}, upsert = True)
The following code is unable to perform any insert operations. The collection still remains empty. But when I remove $set and use simply data, it works fine. Can't I use $set in upsert? The reason why I want $set was so that pre-existing fields for a BSON doesn't get affected. Can someone please guide. I really can't figure out what to do.
Reproducable code:
from pymongo import Connection
DB_CONTENT_BASE_KEY = 'contentbase'
def connect_to_db(dbname, hostname = 'localhost', portno = 27017, **kwargs):
connection = Connection(hostname, portno)
dbConnection = connection[dbname]
return dbConnection
class MetawebCustomCollectionBuilder(object):
# key ought to be a dictionary to filter results from contentbase.
def __init__(self, inDbConfig, outDbConfig, key = {}, verbose = False):
self.verbose = verbose
self.inDbConfig = inDbConfig
self.inDb = connect_to_db(**inDbConfig)
self.outDbConfig = outDbConfig
self.outDb = connect_to_db(**outDbConfig)
self.inDbContentBase = self.inDb[self.inDbConfig[DB_CONTENT_BASE_KEY]]
self.outDbContentBase = self.outDb[self.outDbConfig[DB_CONTENT_BASE_KEY]]
self.key = key
self.in_db_collection_constraints()
self.out_db_collection_constraints()
def in_db_collection_constraints(self):
self.inDbContentBase.ensure_index('mid')
if self.verbose: print("Assured index on mid for inDbContentBase...")
def out_db_collection_constraints(self):
self.outDbContentBase.ensure_index('mid')
if self.verbose: print("Assured index on mid for outDbContentBase...")
def process_in_record(self, inRecord):
outRecord = inRecord # [YET TO] continue from here...
return outRecord
def transit_collection(self):
for record in self.inDbContentBase.find(self.key):
outRecord = self.process_in_record(record)
key = {'mid':outRecord['mid']}
data = outRecord
print key
self.outDbContentBase.update(key, {"$set": data}, True)
if self.verbose: print 'Done with transiting collection from in DB to out DB'
def cleanup_out_collection(self):
pass
def in_db_sandbox(self):
# To have tests and analytics placed in here corresponding to inDb.
pass
if __name__ == '__main__':
inDbConfig = {'dbname':'metaweb', 'contentbase': 'content'}
outDbConfig = {'dbname': 'similarkind', 'contentbase': 'content'}
mccb = MetawebCustomCollectionBuilder(inDbConfig, outDbConfig, verbose = True)
mccb.transit_collection()
There must be a prexisting database inDb. From this collection I want to create a new modified collection.
Your claim is wrong
>>> import pymongo
>>> c = pymongo.Connection()
>>> db = c.mydb
>>> db.mydocs.find().count()
0
>>> db.mydocs.update({'myid': '438'}, {"$set": {'keyA':'valueA'}}, upsert = True)
>>> db.mydocs.find().count()
1
>>> db.mydocs.find_one()
{u'myid': u'438', u'keyA': u'valueA', u'_id': ObjectId('504c2fd1a694cc9624bbd6a2')}

Categories

Resources