Delete all subdomains and get the root domain only - pyspark - python

I have a big list in a JSON file. An example:
{"timestamp":"1600840155","name":"0.0.0.1","value":"subdomain.test.net","type":"hd"}
{"timestamp":"1600840155","name":"0.0.0.2","value":"test.net","type":"hd"}
{"timestamp":"1600846210","name":"0.0.0.3","value":"node-fwx.pool-1-0.dynamic.exmple4.net","type":"hd"}
{"timestamp":"1600846210","name":"0.0.0.4","value":"exmple4.net","type":"hd"}
{"timestamp":"1600848078","name":"0.0.0.5","value":"node-fwy.pool-1-0.dynamic.exmple5.net","type":"hd"}
{"timestamp":"1600848078","name":"0.0.0.6","value":"exmple5.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.7","value":"node-fwz.pool-1-0.dynamic.exmple6.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.8","value":"exmple6.net","type":"hd"}
{"timestamp":"1600879127","name":"0.0.0.9","value":"node-fx0.pool-1-0.dynamic.exmple7.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.10","value":"exmple7.net","type":"hd"}
{"timestamp":"1600874834","name":"0.0.0.11","value":"node-fx1.pool-1-0.dynamic.exmple8.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.12","value":"exmple8.net","type":"hd"}
{"timestamp":"1600825122","name":"0.0.0.13","value":"node-ftb.pool-1-0.dynamic.exmple9.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.14","value":"exmple9.net","type":"hd"}
{"timestamp":"1600849239","name":"0.0.0.15","value":"node-fx2.pool-1-0.dynamic.exmple10.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.16","value":"exmple10.net","type":"hd"}
{"timestamp":"1600820784","name":"0.0.0.17","value":"node-fx3.pool-1-0.dynamic.other11.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.18","value":"exmple11.net","type":"hd"}
{"timestamp":"1600840955","name":"0.0.0.19","value":"node-fx4.pool-1-0.dynamic.other12.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.20","value":"exmple12.net","type":"hd"}
{"timestamp":"1600860091","name":"0.0.0.21","value":"another -one.pool-1-0.dynamic.other13.net","type":"hd"}
{"timestamp":"1600838189","name":"0.0.0.22","value":"exmple13.net","type":"hd"}
and I would like to keep just the root domains and drop the others using PySpark, so I want to select:
df.select("name","value","type").distinct() \
.write \
.save("mycleanlist",format="json")
I want this result:
"name":"0.0.0.22","value":"exmple13.net","type":"hd"}
"name":"0.0.0.2","value":"test.net","type":"hd"}
"name":"0.0.0.4","value":"exmple4.net","type":"hd"}
"name":"0.0.0.6","value":"exmple5.net","type":"hd"}
"name":"0.0.0.8","value":"exmple6.net","type":"hd"}
"name":"0.0.0.10","value":"exmple7.net","type":"hd"}
"name":"0.0.0.12","value":"exmple8.net","type":"hd"}
"name":"0.0.0.14","value":"exmple9.net","type":"hd"}
"name":"0.0.0.16","value":"exmple10.net","type":"hd"}
"name":"0.0.0.18","value":"exmple11.net","type":"hd"}
"name":"0.0.0.20","value":"exmple12.net","type":"hd"}
"name":"0.0.0.22","value":"exmple13.net","type":"hd"}

You can use a UDF to wrap a method that extracts the root domain:
from pyspark.sql.functions import udf
from pyspark.sql.types import StringType

def extract_domain(url):
    # keep only the last two labels, e.g. "subdomain.test.net" -> "test.net"
    if url:
        parts = url.split('.')
        url = '.'.join(parts[-2:])
    return url

extract_domain_udf = udf(extract_domain, StringType())

df.select("name", extract_domain_udf("value").alias("value"), "type").distinct() \
    .write \
    .save("mycleanlist", format="json")

Related

Write partitioned csv files to a single folder - Pyspark

While using partitionBy() in PySpark, what approach should I follow to write CSV files to one single folder rather than multiple folders? Any suggested solution?
Code
from pyspark.sql import SparkSession
from pyspark.sql import functions as F
from pyspark import SparkConf

appName = "PySpark Teradata Example"
master = "local"

conf = SparkConf()  # create the configuration
conf.set("spark.repl.local.jars", "terajdbc4.jar")
conf.set("spark.executor.extraClassPath", "terajdbc4.jar")
conf.set("spark.driver.extraClassPath", "terajdbc4.jar")

spark = SparkSession.builder \
    .config(conf=conf) \
    .appName(appName) \
    .master(master) \
    .getOrCreate()

# input table name
table = "my_table_1"

df = spark.read \
    .format('jdbc') \
    .option('url', 'jdbc:teradata://xxx.xxx.xx.xx') \
    .option('user', 'dbc') \
    .option('password', 'dbc') \
    .option('driver', 'com.teradata.jdbc.TeraDriver') \
    .option('STRICT_NAMES', 'OFF') \
    .option('query', "Select eno, CAST(edata.asJSONText() AS VARCHAR(32000)) as edata from AdventureWorksDW." + table) \
    .load()

df.show()

df = df.withColumn("id_tmp", F.col(df.columns[0]) % 4).orderBy("id_tmp")

df.coalesce(4) \
    .write \
    .option("header", True) \
    .mode("overwrite") \
    .partitionBy("id_tmp") \
    .option("sep", "|") \
    .format("csv") \
    .save("C:\\Data\\" + table + "\\")
It is giving multiple folders with multiple CSV files as output. How can I write it to a single folder? Also, how can I change the name of the file while writing it to the folder?
df = df.repartition(1) will reset the number of partitions to 1, but as Kafels mentioned, it is better to use coalesce:
df = df.coalesce(1)
more info:
https://stackoverflow.com/a/31675351
https://stackoverflow.com/a/40983145
source:
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.repartition.html
https://spark.apache.org/docs/latest/api/python/reference/api/pyspark.sql.DataFrame.coalesce.html
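Putting it together for this case, a minimal sketch (assuming the same df and table as above, and that the data is small enough for one file): coalesce to a single partition, drop partitionBy, and rename the part file afterwards, since Spark itself does not let you choose the output file name.
import glob
import os
import shutil

out_dir = "C:\\Data\\" + table + "\\"
df.coalesce(1) \
    .write \
    .option("header", True) \
    .option("sep", "|") \
    .mode("overwrite") \
    .format("csv") \
    .save(out_dir)

# Spark writes a single part-00000-...csv file; rename it to whatever name you need
part_file = glob.glob(os.path.join(out_dir, "part-*.csv"))[0]
shutil.move(part_file, os.path.join(out_dir, table + ".csv"))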

Word Count with timestamp in Python

This example is extracted from the Structured Streaming Programming Guide of Spark:
from pyspark.sql import SparkSession
from pyspark.sql.functions import explode
from pyspark.sql.functions import split
spark = SparkSession \
.builder \
.appName("StructuredNetworkWordCount") \
.getOrCreate()
# Create DataFrame representing the stream of input lines from connection to localhost:9999
lines = spark \
    .readStream \
    .format("socket") \
    .option("host", "localhost") \
    .option("port", 9999) \
    .option("includeTimestamp", "true") \
    .load()
# Split the lines into words
words = lines.select(
    explode(
        split(lines.value, " ")
    ).alias("word"),
    lines.timestamp.alias('time')
)
# Generate running word count
wordCounts = words.groupBy("word").count() #line to modify
# Start running the query that prints the running counts to the console
query = wordCounts \
.writeStream \
.outputMode("complete") \
.format("console") \
.start()
query.awaitTermination()
I need to create a table with every word and its input time. The output table should be like this:
+------+---------------------+
| word | time                |
+------+---------------------+
| car  | 2021-12-16 12:21:.. |
+------+---------------------+
How can I do it? I think the line marked with "#line to modify" is the only line to modify.
Try something like this (a Scala example from the foreachBatch documentation):
streamingDF.writeStream.foreachBatch { (batchDF: DataFrame, batchId: Long) =>
  batchDF.persist()
  batchDF.write.format(...).save(...)  // location 1
  batchDF.write.format(...).save(...)  // location 2
  batchDF.unpersist()
}
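In PySpark, the equivalent would look roughly like this. This is only a sketch: streaming_df stands for whatever streaming DataFrame you are writing (for example words above), and the parquet paths are placeholders.
def write_two_locations(batch_df, batch_id):
    batch_df.persist()  # avoid recomputing the micro-batch for the two writes
    batch_df.write.format("parquet").mode("append").save("/tmp/location1")  # location 1
    batch_df.write.format("parquet").mode("append").save("/tmp/location2")  # location 2
    batch_df.unpersist()

query = streaming_df.writeStream.foreachBatch(write_two_locations).start()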
You can do something like this:
writeStream
    .format("parquet")  // can be "orc", "json", "csv", etc.
    .option("path", "path/to/destination/dir")
    .start()
and then create an external table pointing at that path, setting the path yourself if needed.
See https://spark.apache.org/docs/latest/structured-streaming-programming-guide.html#using-foreach-and-foreachbatch
Delta also writes to a file location:
df.writeStream \
    .format("delta") \
    .outputMode("append") \
    .option("checkpointLocation", "/delta/df/_checkpoints/etl-from-json") \
    .start("/delta/df")
You may also want to reconsider whether the "complete" output mode is what you need.
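For the exact output asked for (one row per word with its input time, no running count), a minimal sketch, assuming the socket source includes the timestamp as shown in the question: skip the aggregation and write words directly in append mode.
query = words \
    .writeStream \
    .outputMode("append") \
    .format("console") \
    .option("truncate", False) \
    .start()

query.awaitTermination()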

Azure Blob storage error can't parse a date in spark

I am trying to read a file stored in Azure Data Lake Gen2 into a Spark DataFrame using Python. The code is:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "some_key"
appName = "DataExtract"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.myaccount.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = spark.read.csv(data_csv)
# write csv
df.write.csv(data_out)
The file is read and written fine, but I am getting the following error:
ERROR AzureBlobFileSystemStore: Failed to parse the date Thu, 09 Sep 2021 10:12:34 GMT
Date seems to be file creation date.
How can I parse the date to avoid getting the error?
I tried reproducing the same issue and found that these lines are causing the error:
data_csv = "abfs://test-file-system#myaccount.dfs.core.windows.net/data.csv"
data_out = "abfs://test-file-system#myaccount.dfs.core.windows.net/data_out.csv"
# read csv
df = spark.read.csv(data_csv)
Here is the code that worked for me after replacing abfs with abfss in the above lines:
from pyspark import SparkConf
from pyspark.sql import SparkSession
# create spark session
key = "<Your Storage Account Key>"
appName = "<Synapse App Name>"
master = "local[*]"
sparkConf = SparkConf() \
.setAppName(appName) \
.setMaster(master) \
.set("fs.azure.account.key.<Storage Account Name>.dfs.core.windows.net", key)
spark = SparkSession.builder.config(conf=sparkConf).getOrCreate()
data_csv="abfss://<ContainerName>#<Storage Account Name>.dfs.core.windows.net/<Directory>"
# read csv
df1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(data_csv + '/sample1.csv')
df1.show()
# write csv
df1.write.csv(data_csv + '/<Give the name of blob you want to write to>.csv')
Alternatively, you can try the code below, which also worked perfectly for me:
from pyspark.sql import SparkSession
from pyspark.sql.types import *
account_name = "<StorageAccount Name>"
container_name = "<Storage Account Container Name>"
relative_path = "<Directory path>"
adls_path = 'abfss://%s#%s.dfs.core.windows.net/%s'%(container_name,account_name,relative_path)
dataframe1 = spark.read.option('header','true')\
.option('delimiter', ',')\
.csv(adls_path + '/sample1.csv')
dataframe1.show()
dataframe1.write.csv(adls_path + '/<Give the name of blob you want to write to>.csv')
REFERENCE:
Synapse Spark – Reading CSV files from Azure Data Lake Storage Gen 2 with Synapse Spark using Python - SQL Stijn (sql-stijn.com)

Pyspark - checking Json format using accumulator

How do I check whether a JSON file is corrupted, e.g. missing {, }, commas, or having a wrong datatype? I am trying to achieve this with an accumulator because the process runs on multiple executors.
from pyspark import SparkConf, AccumulatorParam
from pyspark.sql import SparkSession
from pyspark.sql.types import StructType, StructField, IntegerType, StringType, BooleanType

spark_config = SparkConf().setAppName(application_name)
ss = SparkSession.builder.config(conf=spark_config).getOrCreate()

class StringAccumulatorParam(AccumulatorParam):
    def zero(self, v):
        return []
    def addInPlace(self, variable, value):
        variable.append(value)
        return variable

errorCount = ss.sparkContext.accumulator(0)
errorValues = ss.sparkContext.accumulator("", StringAccumulatorParam())

newSchema = StructType([
    StructField("id", IntegerType(), True),
    StructField("name", StringType(), True),
    StructField("status", BooleanType(), True)])

errorDF = ss.read.json("/Users/test.jsonl")
errorDF2 = ss.createDataFrame(errorDF, newSchema).cache()

def checkErrorCount(row):
    global errorCount
    errorCount.add(1)
    errorValues.add(row["id"])
errorDF.foreach(lambda x: checkErrorCount(x))
print("{} rows had questionable values.".format(errorCount.value))
ss.stop()
Here is the corrupt JSON file:
{"name":"Standards1","id":90,"status":true}
{"name":"Standards2","id":91
{"name":"Standards3","id":92,"status":true}
{"name":781,"id":93,"status":true}
I had a play with this and came up with the following. Of the two solutions, I think the difference-of-counts approach will be faster since it uses native Spark JSON processing.
The UDF solution does the JSON parsing in Python, which means you pay the cost of transferring each file line from Java to Python, so it will probably be slower.
import json
from pyspark import SparkConf
from pyspark.sql import SparkSession
from pyspark.sql.functions import sum, udf
from pyspark.sql.types import LongType
application_name = 'Count bad JSON lines'
spark_config = SparkConf().setAppName(application_name)
ss = SparkSession.builder.config(conf=spark_config).getOrCreate()
# Difference of counts solution
input_path = '/baddata.json'
total_lines = ss.read.text(input_path).count()
good_lines = ss.read.option('mode', 'DROPMALFORMED').json(input_path).count()
bad_lines = total_lines - good_lines
print('Found {} bad JSON lines in data'.format(bad_lines))
# Parse JSON with UDF solution
def is_bad(line):
    try:
        json.loads(line)
        return 0
    except ValueError:
        return 1
is_bad_udf = udf(is_bad, LongType())
lines = ss.read.text(input_path)
bad_sum = lines.select(sum(is_bad_udf('value'))).collect()[0][0]
print('Got {} bad lines'.format(bad_sum))
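# If you also want to see which lines are bad rather than only counting them, here is a
# sketch using Spark's corrupt-record column (assumes the default PERMISSIVE mode; the
# parsed result is cached because Spark refuses queries that reference only the
# corrupt-record column of a raw JSON file):
from pyspark.sql.functions import col
parsed = ss.read \
    .option('mode', 'PERMISSIVE') \
    .option('columnNameOfCorruptRecord', '_corrupt_record') \
    .json(input_path)
parsed.cache()  # needed before filtering on _corrupt_record
bad = parsed.filter(col('_corrupt_record').isNotNull())  # rows that failed to parse keep the raw line
bad.show(truncate=False)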
ss.stop()

create labeledpoint from mongodb using python

I want to create a LabeledPoint from MongoDB using Python. I already tried to do that with a CSV file instead of MongoDB.
Here is the code of the function that returns the LabeledPoint:
from numpy import array
from pyspark.mllib.regression import LabeledPoint

def createLabeledPoints(fields):
    q1 = int(fields[0])
    q2 = int(fields[1])
    # ... (the other q fields are elided in the original snippet)
    result = int(fields[38])
    return LabeledPoint(result, array([q1, q2, q3]))
This code works for me with the CSV file, and I get my collection from MongoDB as a pandas DataFrame using the code below:
from pymongo import MongoClient
from pandas import DataFrame

client = MongoClient('localhost', 27017)
db1 = client.newumc
collection1 = db1.data_classification
rawData1 = DataFrame(list(collection1.find({})))
and I get each field using the code below:
field_for_test = collection1.find({}, {'field_from_mongodb': 1, '_id': 0})
I solved the problem by using:
spark = SparkSession \
.builder \
.appName("myApp") \
.config("spark.mongodb.input.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.config("spark.mongodb.output.uri", "mongodb://127.0.0.1/newumc.classification_data") \
.getOrCreate()
df = spark.read.format("com.mongodb.spark.sql.DefaultSource").load()
field1 = df[1]
field2 = df[2]
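To then build the LabeledPoints from that Spark DataFrame, a minimal sketch (the column names "q1", "q2" and "result" are assumptions standing in for whatever your collection actually uses):
from numpy import array
from pyspark.mllib.regression import LabeledPoint

# map each Mongo document (now a Row) to a LabeledPoint
labeled = df.rdd.map(
    lambda row: LabeledPoint(int(row["result"]), array([int(row["q1"]), int(row["q2"])]))
)
print(labeled.take(5))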
