PySpark - checking JSON format using accumulator - python

How do I check whether a JSON file is corrupted, e.g. missing {, }, commas, or a wrong datatype? I am trying to achieve this with an accumulator because the process runs on multiple executors.
spark_config = SparkConf().setAppName(application_name)
ss = SparkSession.builder.config(conf=spark_config).getOrCreate()

class StringAccumulatorParam(AccumulatorParam):
    def zero(self, v):
        return []
    def addInPlace(self, variable, value):
        variable.append(value)
        return variable

errorCount = ss.sparkContext.accumulator(0)
errorValues = ss.sparkContext.accumulator("", StringAccumulatorParam())

newSchema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("status", BooleanType(), True)])

errorDF = ss.read.json("/Users/test.jsonl")
errorDF2 = ss.createDataFrame(errorDF, newSchema).cache()

def checkErrorCount(row):
    global errorCount
    errorDF2["id"] = row.newSchema["id"]
    errorCount.add(1)
    errorValues.add(errorDF2["id"])

errorDF.foreach(lambda x: checkErrorCount(x))
print("{} rows had questionable values.".format(errorCount.value))
ss.stop()
Here is the corrupt JSON file:
{"name":"Standards1","id":90,"status":true}
{"name":"Standards2","id":91
{"name":"Standards3","id":92,"status":true}
{"name":781,"id":93,"status":true}

I had a play with this and came up with the following.
Of the two solutions, I think the difference-of-counts approach will be faster, since it uses Spark's native JSON processing.
The UDF solution does the JSON parsing in Python, which means you pay the cost of transferring each file line from Java to Python, so it will probably be slower.
import json

from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, udf
from pyspark.sql.types import LongType

application_name = 'Count bad JSON lines'
spark_config = SparkConf().setAppName(application_name)
ss = SparkSession.builder.config(conf=spark_config).getOrCreate()

# Difference of counts solution
input_path = '/baddata.json'
total_lines = ss.read.text(input_path).count()
good_lines = ss.read.option('mode', 'DROPMALFORMED').json(input_path).count()
bad_lines = total_lines - good_lines
print('Found {} bad JSON lines in data'.format(bad_lines))

# Parse JSON with UDF solution
def is_bad(line):
    try:
        json.loads(line)
        return 0
    except ValueError:
        return 1

is_bad_udf = udf(is_bad, LongType())
lines = ss.read.text(input_path)
bad_sum = lines.select(sum(is_bad_udf('value'))).collect()[0][0]
print('Got {} bad lines'.format(bad_sum))
ss.stop()
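If you also want to see which lines are bad, not just how many, a related option (not part of the original answer) is Spark's PERMISSIVE mode with a corrupt-record column. A minimal sketch, assuming the same active session ss and input_path as above:

from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

schema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("status", BooleanType(), True),
    StructField("_corrupt_record", StringType(), True)])  # malformed lines are captured here

bad_df = (ss.read
    .option("mode", "PERMISSIVE")
    .option("columnNameOfCorruptRecord", "_corrupt_record")
    .schema(schema)
    .json(input_path)
    .cache())  # cache first; Spark disallows queries that reference only the corrupt column on the raw relation

bad_df.filter(bad_df["_corrupt_record"].isNotNull()).show(truncate=False)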

Related

Decryption not working - how to get raw data from csv/pandas - python

Below is my code for decrypting data from a CSV file stored on Dropbox. I get the user to type in their ID, match it against a database containing hashed values, and then use the typed-in ID to search my stored CSV file for the matching row. I then pass all the row values into my decryption function.
Also, I'm aware my variable names and formatting are awful; I'm just using this code as a prototype right now.
My results are being printed like this:
b'b\xebS\x1b\xc8v\xe2\xf8\xa2\\x84\x0e7M~\x1b'
b'\x01B#6i\x1b\xfc]\xc3\xca{\xd5{B\xbe!'
b'in*V7\xf3P\xa0\xb2\xc5\xd2\xb7\x1dz~\x95'
I store my key and IV so they are always the same, yet the decryption doesn't seem to work. My only thought is that my data is somehow changed when stored in a CSV or pandas table. Does anyone know what the issue might be, or whether the bytes can be altered when stored in or imported into a dataframe?
Could I also be extracting the data from my CSV into pandas incorrectly?
def login():
    import sqlite3
    import os.path

    def decoder():
        from Crypto.Cipher import AES
        import hashlib
        from secrets import token_bytes

        cursor.execute(
            '''
            Select enc_key FROM Login where ID = (?);
            ''',
            (L_ID_entry.get(), ))
        row = cursor.fetchone()
        if row is not None:
            keys = row[0]

        #design padding function for encryption
        def padded_text(data_in):
            while len(data_in) % 16 != 0:
                data_in = data_in + b"0"
            return data_in

        #calling stored key from main file and reverting back to bytes
        key_original = bytes.fromhex(keys)
        mode = AES.MODE_CBC
        #model
        cipher = AES.new(key_original, mode, IV3)

        #padding data
        p4 = padded_text(df1.tobytes())
        p5 = padded_text(df2.tobytes())
        p6 = padded_text(df3.tobytes())

        #decrypting data
        d_fname = cipher.decrypt(p4)
        d_sname = cipher.decrypt(p5)
        d_email = cipher.decrypt(p6)

        print(d_fname)
        print(d_sname)
        print(d_email)

    #connecting to db
    try:
        conn = sqlite3.connect('login_details.db')
        cursor = conn.cursor()
        print("Connected to SQLite")
    except sqlite3.Error as error:
        print("Failure, error: ", error)
    finally:
        #downloading txt from dropbox and converting to dataframe to operate on
        import New_user
        import ast

        _, res = client.files_download("/user_details/enc_logins.csv")
        with io.BytesIO(res.content) as csvfile:
            with open("enc_logins.csv", 'rb'):
                df = pd.read_csv(csvfile, names=['ID', 'Fname', 'Sname', 'Email'], encoding='unicode_escape')
                newdf = df[(df == L_ID_entry.get()).any(axis=1)]
                print(newdf)

                df1 = newdf['Fname'].to_numpy()
                df2 = newdf['Sname'].to_numpy()
                df3 = newdf['Email'].to_numpy()
                print(df1)
                print(df2)
                print(df3)

            csvfile.close()

        decoder()
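Not part of the question, but as a small sanity check of the "can the bytes be altered" concern: if the ciphertext is stored as a hex string in the CSV and converted back with bytes.fromhex before decrypting, the round trip is lossless. A minimal sketch with an example value and a hypothetical file name:

import pandas as pd

ciphertext = b"\x01B#6i\x1b\xfc]\xc3\xca{\xd5{B\xbe!"          # example bytes, not real data
pd.DataFrame({"Fname": [ciphertext.hex()]}).to_csv("roundtrip.csv", index=False)

restored = bytes.fromhex(pd.read_csv("roundtrip.csv")["Fname"][0])
assert restored == ciphertext                                   # the bytes survive the CSV unchanged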

Reduce memory usage by json.loads with multiprocessing or Dask in Python

I have a CSV file with 1 million rows and 3 GB of data. I used pandas read_csv to convert it to a DataFrame and it works well.
Next I have to format the data columns and also append another column based on the values of some columns. To do this I am using a Dask DataFrame with npartitions and then applying the function row-wise. We have 7.5 GB of RAM on our instance, but it hangs and kills the process with a MemoryError.
This is my code to format the data columns:
import pandas as pd
import json
import dask.dataframe as dd
import multiprocessing

def formatting_data(data):
    print("cleaning and formatting data")
    data["IsBadJson"] = False
    data["BadJsonStr"] = None
    data = (dd.from_pandas(data, npartitions=4*multiprocessing.cpu_count())
            .map_partitions(lambda df: df.apply(lambda row: parse_data_to_json(row), axis=1))
            .compute(scheduler='processes'))
    return data
Below is the code for the parse_data_to_json function we are using for the formatting:
def parse_data_to_json(x):
    try:
        if x.get("RequestSent") == "nan":
            x["RequestSent"] = None
            x["IsBadJson"] = True
            x["BadJsonStr"] = str(x.get("RequestSent"))
        else:
            x["RequestSent"] = json.loads(x.get("RequestSent"))
            x["IsBadJson"] = False
            x["BadJsonStr"] = None
    except Exception as error:
        print("Found an error value in Tax Json field RequestSent: {}, error details: {}".format(x.get("RequestSent"), error))
        print("{}-{}-{}".format(None, True, str(x.get("RequestSent"))))
        x["RequestSent"] = None
        x["IsBadJson"] = True
        x["BadJsonStr"] = str(x.get("RequestSent"))

    try:
        if x.get("ResponseReceived") == "nan":
            x["ResponseReceived"] = None
            x["IsBadJson"] = True
            x["BadJsonStr"] = str(x.get("ResponseReceived"))
        else:
            x["ResponseReceived"] = json.loads(x.get("ResponseReceived"))
            x["IsBadJson"] = False
            x["BadJsonStr"] = None
    except Exception as error:
        print("Found an error value in Tax Json field ResponseReceived: {}, error details: {}".format(x.get("ResponseReceived"), error))
        print("{}-{}-{}".format(None, True, str(x.get("ResponseReceived"))))
        x["ResponseReceived"] = None
        x["IsBadJson"] = True
        x["BadJsonStr"] = str(x.get("ResponseReceived"))

    return x
I recommend allowing Dask to load your data directly from CSV, rather than passing it a pandas DataFrame. See https://docs.dask.org/en/latest/best-practices.html#load-data-with-dask
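A minimal sketch of that suggestion (the input path, block size, and output pattern are assumptions): let Dask read the CSV itself so partitions are streamed from disk instead of being split off one large in-memory pandas DataFrame, then apply the existing parse_data_to_json row-wise and write the result back out without collecting it.

import dask.dataframe as dd

# Hypothetical path; each ~64MB block becomes one partition that is read only when needed,
# so the full 3GB file never sits in RAM at once.
ddf = dd.read_csv("input.csv", blocksize="64MB")

# Reuse the row-wise formatter defined above. Dask infers the output metadata from a small
# sample; pass meta= explicitly if that inference warns in your setup.
formatted = ddf.map_partitions(lambda df: df.apply(parse_data_to_json, axis=1))

# Writing partition by partition instead of calling .compute() keeps peak memory bounded.
formatted.to_csv("formatted-*.csv", index=False)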

Why is creating this UDF so much faster in PySpark than in Scala Spark?

I have the Python script:
import time

from pyspark.sql.types import StringType
from pyspark.sql.functions import udf
from urllib.parse import urlsplit, unquote

def extractPath(host, url):
    if host in url:
        return urlsplit(url).path
    else:
        return '-'

startCreateUdfs = time.time()
getPathUdf = udf(extractPath, StringType())
endCreateUdfs = time.time()

print("Python udf creation time: {}".format(endCreateUdfs - startCreateUdfs))
and the Scala script:
import java.net.URLDecoder
import java.nio.charset.StandardCharsets
import java.net.URL

import org.apache.spark.sql.SparkSession
import org.apache.spark.sql.functions.udf

object UdfTimes extends App {

  val spark = SparkSession.builder().master("local").getOrCreate()
  spark.sparkContext.setLogLevel("ERROR")

  val extractPath: (String, String) => String = (host, url) => {
    if (url.contains(host))
      new URL(url).getPath
    else
      "-"
  }

  val unquote: String => String = str => URLDecoder.decode(str, StandardCharsets.UTF_8.name())

  val startTimeUdf = System.nanoTime()
  val getPathUdf = udf(extractPath)
  val endTimeUdf = System.nanoTime()

  println("Scala udf registering time: " + (endTimeUdf - startTimeUdf) / math.pow(10, 9))
}
I wrote both to do the same thing. The UDF creation is instant in Python (from the command line):
Python udf creation time: 2.0503997802734375e-05
but in Scala, it takes almost a second (sbt command line):
udf registering time: 0.768687091
What is the reason for this big difference?

Getting profiled user groups from the ALS (Alternating Least Squares) algorithm

We are using the ALS (Alternating Least Squares) method in our Google Cloud Spark environment to recommend companies to our users. To make the recommendations we use the tuple (userId, companyId, rating), where the rating value is built from a combination of the user's interest signals, such as clicking the company page, adding a company to the favorites list, making an order from the company, etc. (our method is very similar to this link).
The results are pretty good and work for our business case; however, we are missing one thing that is important for us.
We need to learn which users are grouped as having similar interests (a.k.a. neighbors). Is there any way to get grouped users from PySpark's ALS algorithm?
Then we would be able to tag users according to that grouping.
Edit:
I've tried the code in the answer below, but the results are strange. My data is paired like this: (userId, companyId, rating).
When I run the code below, it groups users with no common companyId into the same clusterId.
For example, one of the results is:
(userId: 471, clusterId: 2)
(userId: 490, clusterId: 2)
However, users 471 and 490 have nothing in common. I think there is a mistake here:
from __future__ import print_function
import sys
from pyspark import SparkContext, SparkConf, SQLContext
from pyspark.mllib.recommendation import ALS, MatrixFactorizationModel, Rating
from pyspark.sql.types import StructType
from pyspark.sql.types import StructField
from pyspark.sql.types import IntegerType
from pyspark.mllib.clustering import KMeans, KMeansModel
conf = SparkConf().setAppName("user_clustering")
sc = SparkContext(conf=conf)
sc.setCheckpointDir('checkpoint/')
sqlContext = SQLContext(sc)
CLOUDSQL_INSTANCE_IP = sys.argv[1]
CLOUDSQL_DB_NAME = sys.argv[2]
CLOUDSQL_USER = sys.argv[3]
CLOUDSQL_PWD = sys.argv[4]
BEST_RANK = int(sys.argv[5])
BEST_ITERATION = int(sys.argv[6])
BEST_REGULATION = float(sys.argv[7])
TABLE_ITEMS = "companies"
TABLE_RATINGS = "ml_ratings"
TABLE_RECOMMENDATIONS = "ml_reco"
TABLE_USER_CLUSTERS = "ml_user_clusters"
# Read the data from the Cloud SQL
# Create dataframes
#[START read_from_sql]
jdbcUrl = 'jdbc:mysql://%s:3306/%s?user=%s&password=%s' % (CLOUDSQL_INSTANCE_IP, CLOUDSQL_DB_NAME, CLOUDSQL_USER, CLOUDSQL_PWD)
dfAccos = sqlContext.read.jdbc(url=jdbcUrl, table=TABLE_ITEMS)
dfRates = sqlContext.read.jdbc(url=jdbcUrl, table=TABLE_RATINGS)
print("Start Clustering Users")
# print("User Ratings:")
# dfRates.show(100)
#[END read_from_sql]
# Get all the ratings rows of our user
# print("Filtered User Ratings For User:",USER_ID)
# print("------------------------------")
# for x in dfUserRatings:
# print(x)
#[START split_sets]
rddTraining, rddValidating, rddTesting = dfRates.rdd.randomSplit([6,2,2])
print("RDDTraining Size:",rddTraining.count()," RDDValidating Size:",rddValidating.count()," RDDTesting Size:",rddTesting.count())
print("Rank:",BEST_RANK," Iteration:",BEST_ITERATION," Regulation:",BEST_REGULATION)
#print("RDD Training Values:",rddTraining.collect())
#[END split_sets]
print("Start predicting")
#[START predict]
# Build our model with the best found values
# Rating, Rank, Iteration, Regulation
model = ALS.train(rddTraining, BEST_RANK, BEST_ITERATION, BEST_REGULATION)
# print("-----------------")
# print("User Groups Are Created")
# print("-----------------")
user_features = model.userFeatures().map(lambda x: x[1])
related_users = model.userFeatures().map(lambda x: x[0])
number_of_clusters = 10
model_kmm = KMeans.train(user_features, number_of_clusters, initializationMode = "random", runs = 3)
user_features_with_cluster_id = model_kmm.predict(user_features)
user_features_with_related_users = related_users.zip(user_features_with_cluster_id)
clusteredUsers = user_features_with_related_users.map(lambda x: (x[0],x[1]))
orderedUsers = clusteredUsers.takeOrdered(200,key = lambda x: x[1])
print("Ordered Users:")
print("--------------")
for x in orderedUsers:
    print(x)
#[START save user groups]
userGroupSchema = StructType([StructField("primaryUser", IntegerType(), True), StructField("groupId", IntegerType(), True)])
dfUserGroups = sqlContext.createDataFrame(orderedUsers,userGroupSchema)
try:
    dfUserGroups.write.jdbc(url=jdbcUrl, table=TABLE_USER_CLUSTERS, mode='append')
except:
    print("Data is already written to DB")
print("Written to DB and Finished Job")
Once you have trained your model, you can get the user feature vectors using userFeatures().
After that, you can calculate the distance between users with some distance function, or use a clustering model like KMeans.
So if the model is already trained:
user_features = model.userFeatures().map(lambda x: x[1]).repartition(50)
number_of_clusters = 10
model_kmm = KMeans.train(user_features, number_of_clusters, initializationMode = "random", runs = 3)
user_features_with_cluster_id = model_kmm.predict(user_features).zip(user_features)
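As a complement to the KMeans route, here is a minimal sketch of the "some distance function" alternative mentioned above, comparing two users (the IDs 471 and 490 from the edit) by the cosine similarity of their ALS factor vectors. Collecting userFeatures() like this only makes sense for a modest number of users.

import numpy as np

features_by_user = dict(model.userFeatures().collect())   # {userId: latent factor array}
u = np.array(features_by_user[471])
v = np.array(features_by_user[490])

# Cosine similarity of the two latent vectors; values close to 1 mean very similar profiles.
similarity = float(u.dot(v) / (np.linalg.norm(u) * np.linalg.norm(v)))
print(similarity)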

Parsing Json with multiple "levels" with Python

I'm trying to parse a JSON file from an API call.
I have found this code that fits my needs and am trying to adapt it to what I want:
import math, urllib2, json, re

def download():
    graph = {}
    page = urllib2.urlopen("http://fx.priceonomics.com/v1/rates/?q=1")
    jsrates = json.loads(page.read())
    pattern = re.compile("([A-Z]{3})_([A-Z]{3})")
    for key in jsrates:
        matches = pattern.match(key)
        conversion_rate = -math.log(float(jsrates[key]))
        from_rate = matches.group(1).encode('ascii', 'ignore')
        to_rate = matches.group(2).encode('ascii', 'ignore')
        if from_rate != to_rate:
            if from_rate not in graph:
                graph[from_rate] = {}
            graph[from_rate][to_rate] = float(conversion_rate)
    return graph
And I've turned it into:
import math, urllib2, json, re

def download():
    graph = {}
    page = urllib2.urlopen("https://bittrex.com/api/v1.1/public/getmarketsummaries")
    jsrates = json.loads(page.read())
    for pattern in jsrates['result'][0]['MarketName']:
        for key in jsrates['result'][0]['Ask']:
            matches = pattern.match(key)
            conversion_rate = -math.log(float(jsrates[key]))
            from_rate = matches.group(1).encode('ascii', 'ignore')
            to_rate = matches.group(2).encode('ascii', 'ignore')
            if from_rate != to_rate:
                if from_rate not in graph:
                    graph[from_rate] = {}
                graph[from_rate][to_rate] = float(conversion_rate)
    return graph
Now the problem is that there are multiple levels in the JSON ("result" > 0, 1, 2, etc.).
(screenshot of the JSON structure)
for key in jsrates['result'][0]['Ask']:
I want the zero to be able to be any number, if that's clear, so I can match every ask price to its market name.
I have shortened the code so the post doesn't get too long.
Thanks.
PS: sorry for my English, it's not my native language.
You could loop through all of the result values that are returned, ignoring the meaningless numeric index:
for result in jsrates['result'].values():
    ask = result.get('Ask')
    if ask is not None:
        # Do things with your ask...
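Building on that loop, a minimal sketch (assuming each result entry exposes 'MarketName' and 'Ask' fields, as in the screenshot) of collecting every ask price keyed by its market name:

asks = {}
for result in jsrates['result'].values():
    name = result.get('MarketName')
    ask = result.get('Ask')
    if name is not None and ask is not None:
        asks[name] = float(ask)   # e.g. asks['BTC-LTC'] -> current ask price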

Categories

Resources