I'm using PySpark v1.6.1 and I want to create a dataframe using another one:
Convert a field that has a struct of three values in different columns
Convert the timestamp from string to datetime
Create more columns using that timestamp
Change the rest of the column names and types
Right now I am using .map(func), creating an RDD with a function that transforms one row of the original type and returns a row of the new one. But this creates an RDD, and I don't want that.
Is there a nicer way to do this?
from pyspark.sql.functions import unix_timestamp, col, to_date, struct
####
#sample data
####
df = sc.parallelize([[25, 'Prem', 'M', '12-21-2006 11:00:05','abc', '1'],
[20, 'Kate', 'F', '05-30-2007 10:05:00', 'asdf', '2'],
[40, 'Cheng', 'M', '12-30-2017 01:00:01', 'qwerty', '3']]).\
toDF(["age","name","sex","datetime_in_strFormat","initial_col_name","col_in_strFormat"])
#create 'struct' type column by combining first 3 columns of sample data - (this is built to answer query #1)
df = df.withColumn("struct_col", struct('age', 'name', 'sex')).\
drop('age', 'name', 'sex')
df.show()
df.printSchema()
####
#query 1
####
#Convert a field that has a struct of three values (i.e. 'struct_col') into different columns (i.e. 'name', 'age' & 'sex')
df = df.withColumn('name', col('struct_col.name')).\
withColumn('age', col('struct_col.age')).\
withColumn('sex', col('struct_col.sex')).\
drop('struct_col')
df.show()
df.printSchema()
####
#query 2
####
#Convert the timestamp from string (i.e. 'datetime_in_strFormat') to datetime (i.e. 'datetime_in_tsFormat')
df = df.withColumn('datetime_in_tsFormat',
unix_timestamp(col('datetime_in_strFormat'), 'MM-dd-yyyy hh:mm:ss').cast("timestamp"))
df.show()
df.printSchema()
####
#query 3
####
#create more columns using above timestamp (e.g. fetch date value from timestamp column)
df = df.withColumn('datetime_in_dateFormat', to_date(col('datetime_in_tsFormat')))
df.show()
####
#query 4.a
####
#Change column name (e.g. 'initial_col_name' is renamed to 'new_col_name')
df = df.withColumnRenamed('initial_col_name', 'new_col_name')
df.show()
####
#query 4.b
####
#Change column type (e.g. string type in 'col_in_strFormat' is converted to double type in 'col_in_doubleFormat')
df = df.withColumn("col_in_doubleFormat", col('col_in_strFormat').cast("double"))
df.show()
df.printSchema()
Sample data:
+---------------------+----------------+----------------+------------+
|datetime_in_strFormat|initial_col_name|col_in_strFormat| struct_col|
+---------------------+----------------+----------------+------------+
| 12-21-2006 11:00:05| abc| 1| [25,Prem,M]|
| 05-30-2007 10:05:00| asdf| 2| [20,Kate,F]|
| 12-30-2017 01:00:01| qwerty| 3|[40,Cheng,M]|
+---------------------+----------------+----------------+------------+
root
|-- datetime_in_strFormat: string (nullable = true)
|-- initial_col_name: string (nullable = true)
|-- col_in_strFormat: string (nullable = true)
|-- struct_col: struct (nullable = false)
| |-- age: long (nullable = true)
| |-- name: string (nullable = true)
| |-- sex: string (nullable = true)
Final output data:
+---------------------+------------+----------------+-----+---+---+--------------------+----------------------+-------------------+
|datetime_in_strFormat|new_col_name|col_in_strFormat| name|age|sex|datetime_in_tsFormat|datetime_in_dateFormat|col_in_doubleFormat|
+---------------------+------------+----------------+-----+---+---+--------------------+----------------------+-------------------+
| 12-21-2006 11:00:05| abc| 1| Prem| 25| M| 2006-12-21 11:00:05| 2006-12-21| 1.0|
| 05-30-2007 10:05:00| asdf| 2| Kate| 20| F| 2007-05-30 10:05:00| 2007-05-30| 2.0|
| 12-30-2017 01:00:01| qwerty| 3|Cheng| 40| M| 2017-12-30 01:00:01| 2017-12-30| 3.0|
+---------------------+------------+----------------+-----+---+---+--------------------+----------------------+-------------------+
root
|-- datetime_in_strFormat: string (nullable = true)
|-- new_col_name: string (nullable = true)
|-- col_in_strFormat: string (nullable = true)
|-- name: string (nullable = true)
|-- age: long (nullable = true)
|-- sex: string (nullable = true)
|-- datetime_in_tsFormat: timestamp (nullable = true)
|-- datetime_in_dateFormat: date (nullable = true)
|-- col_in_doubleFormat: double (nullable = true)
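For reference, the four steps above can also be collapsed into a single chained transformation; a minimal sketch, assuming a df in the state shown under "Sample data" above (i.e. right after struct_col is created):
from pyspark.sql.functions import unix_timestamp, to_date, col

# expand the struct, parse the timestamp, rename and cast in one select,
# then derive the date column from the freshly parsed timestamp
df_final = df.select(
    col('datetime_in_strFormat'),
    col('initial_col_name').alias('new_col_name'),
    col('col_in_strFormat'),
    col('struct_col.name').alias('name'),
    col('struct_col.age').alias('age'),
    col('struct_col.sex').alias('sex'),
    unix_timestamp(col('datetime_in_strFormat'), 'MM-dd-yyyy hh:mm:ss')
        .cast('timestamp').alias('datetime_in_tsFormat'),
    col('col_in_strFormat').cast('double').alias('col_in_doubleFormat')
).withColumn('datetime_in_dateFormat', to_date(col('datetime_in_tsFormat')))
Everything here stays in the DataFrame API, so no intermediate RDD or .map() is needed.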
Related question:
I have a spark df as follows:
+----------+-----------+
| date|impressions|
+----------+-----------+
|22/04/2020| 136821|
|23/04/2020| 159688|
|24/04/2020| 165053|
|25/04/2020| 165609|
|26/04/2020| 183574|
+----------+-----------+
Where the column date is of type string, formatted as %d/%m/%Y. I need that same column to be changed to a date format supported by Spark and to be of type date.
Use either the to_date, to_timestamp, or from_unixtime(unix_timestamp()) function for this case.
Example:
df.show()
#+----------+-----------+
#| date|impressions|
#+----------+-----------+
#|22/04/2020| 136821|
#+----------+-----------+
from pyspark.sql.functions import *
from pyspark.sql.types import *
df.withColumn("dt",to_date(col("date"),"dd/MM/yyyy")).\
withColumn("dt1",to_timestamp(col("date"),"dd/MM/yyyy").cast("date")).\
withColumn("dt2",from_unixtime(unix_timestamp(col("date"),"dd/MM/yyyy")).cast("date")).\
show()
#+----------+-----------+----------+----------+----------+
#| date|impressions| dt| dt1| dt2|
#+----------+-----------+----------+----------+----------+
#|22/04/2020| 136821|2020-04-22|2020-04-22|2020-04-22|
#+----------+-----------+----------+----------+----------+
df.withColumn("dt",to_date(col("date"),"dd/MM/yyyy")).\
withColumn("dt1",to_timestamp(col("date"),"dd/MM/yyyy").cast("date")).\
withColumn("dt2",from_unixtime(unix_timestamp(col("date"),"dd/MM/yyyy")).cast("date")).\
printSchema()
#root
# |-- date: string (nullable = true)
# |-- impressions: string (nullable = true)
# |-- dt: date (nullable = true)
# |-- dt1: date (nullable = true)
# |-- dt2: date (nullable = true)
from pyspark.sql.functions import unix_timestamp, from_unixtime
# note: Spark expects a Java-style pattern ('dd/MM/yyyy'), not a strftime one ('%d/%m/%Y')
df.select(
    'date',
    from_unixtime(unix_timestamp('date', 'dd/MM/yyyy')).alias('date')
)
If you don't want to create a new column and just want to change the original column's data type, you can reuse the original column name in withColumn():
changed_df = df.withColumn("date", to_date(col("date"),"dd/MM/yyyy"))
changed_df.printSchema()
#root
# |-- date: date (nullable = true)
# |-- impression: integer (nullable = true)
changed_df.show(10, False)
+----------+----------+
|date |impression|
+----------+----------+
|2020-04-22|136821 |
|2020-04-23|159688 |
|2020-04-24|165053 |
|2020-04-25|165609 |
|2020-04-26|183574 |
+----------+----------+
I have the following schema for a pyspark dataframe
root
|-- maindata: array (nullable = true)
| |-- element: array (containsNull = true)
| | |-- element: struct (containsNull = true)
| | | |-- label: string (nullable = true)
| | | |-- value: string (nullable = true)
| | | |-- unit: string (nullable = true)
| | | |-- dateTime: string (nullable = true)
Here is some data for a particular row, which I retrieved with df.select(F.col("maindata")).show(1, False):
|[[[a1, 43.24, km/h, 2019-04-06T13:02:08.020], [TripCount, 135, , 2019-04-06T13:02:08.790],["t2", 0, , 2019-04-06T13:02:08.040], [t4, 0, , 2019-04-06T13:02:08.050], [t09, 0, , 2019-04-06T13:02:08.050], [t3, 1, , 2019-04-06T13:02:08.050], [t7, 0, , 2019-04-06T13:02:08.050],[TripCount, ,136, 2019-04-06T13:02:08.790]]
I want to access the TripCount values inside this, e.g. [TripCount -> 136, 135, etc.]. What is the best way to access this data, given that TripCount is present multiple times?
Also, is there any way to access only the label data, for example something like maindata.label?
I would suggest exploding multiple times to convert the array elements into individual rows, and then either converting the struct into individual columns or working with the nested elements using dot syntax. For example:
from pyspark.sql.functions import col, explode
df = spark.createDataFrame([[[[('k1', 'v1', 'v2')]]]], ['d'])
df2 = df.select(explode(col('d')).alias('data')).select(explode(col('data')).alias('data'))
>>> df2.printSchema()
root
|-- data: struct (nullable = true)
| |-- _1: string (nullable = true)
| |-- _2: string (nullable = true)
| |-- _3: string (nullable = true)
>>> df2.filter(col("data._1") == "k1").show()
+------------+
| data|
+------------+
|[k1, v1, v2]|
+------------+
or you can extract members of the struct as individual columns:
from pyspark.sql.functions import col, explode
df = spark.createDataFrame([[[[('k1','v1', 'v2')]]]], ['d'])
df2 = df.select(explode(col('d')).alias('d')).select(explode(col('d')).alias('d')).select("d.*").drop("d")
>>> df2.printSchema()
root
|-- _1: string (nullable = true)
|-- _2: string (nullable = true)
|-- _3: string (nullable = true)
>>> df2.filter(col("_1") == "k1").show()
+---+---+---+
| _1| _2| _3|
+---+---+---+
| k1| v1| v2|
+---+---+---+
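Applied to the maindata schema from the question (an array of arrays of structs), the same double explode could look like this; a sketch, assuming the dataframe is named df as in the question:
from pyspark.sql.functions import col, explode

# explode the outer and the inner array so that each struct becomes its own row
records = df.select(explode(col('maindata')).alias('inner')) \
            .select(explode(col('inner')).alias('rec'))

# all TripCount values ('label' and 'value' are fields of the struct)
records.filter(col('rec.label') == 'TripCount').select(col('rec.value')).show()

# only the label field across all records
records.select(col('rec.label')).show()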
I want to convert a nested JSON to TSV in a Databricks notebook using PySpark.
Below is the JSON structure; the columns can change.
{"tables":[{"name":"Result","columns":[{"name":"JobTime","type":"datetime"},{"name":"Status","type":"string"}]
,"rows":[
["2020-04-19T13:45:12.528Z","Failed"]
,["2020-04-19T14:05:40.098Z","Failed"]
,["2020-04-19T13:46:31.655Z","Failed"]
,["2020-04-19T14:01:16.275Z","Failed"],
["2020-04-19T14:03:16.073Z","Failed"],
["2020-04-19T14:01:16.672Z","Failed"],
["2020-04-19T14:02:13.958Z","Failed"],
["2020-04-19T14:04:41.099Z","Failed"],
["2020-04-19T14:04:41.16Z","Failed"],
["2020-04-19T14:05:14.462Z","Failed"]
]}
]}
I am new to Databricks. Please help.
You have two ways to deal with this problem: either do some preprocessing in Python with the json library (or an equivalent; a sketch of that route is at the end of this answer), or load the JSON directly into PySpark and work with it there, such as:
from pyspark.sql import SparkSession
import pyspark.sql.functions as f
spark = SparkSession.builder.getOrCreate()
# your json
so_json = """
{"tables":[{"name":"Result","columns":[{"name":"JobTime","type":"datetime"},{"name":"Status","type":"string"}]
,"rows":[
["2020-04-19T13:45:12.528Z","Failed"]
,["2020-04-19T14:05:40.098Z","Failed"]
,["2020-04-19T13:46:31.655Z","Failed"]
,["2020-04-19T14:01:16.275Z","Failed"],
["2020-04-19T14:03:16.073Z","Failed"],
["2020-04-19T14:01:16.672Z","Failed"],
["2020-04-19T14:02:13.958Z","Failed"],
["2020-04-19T14:04:41.099Z","Failed"],
["2020-04-19T14:04:41.16Z","Failed"],
["2020-04-19T14:05:14.462Z","Failed"]
]}
]}
"""
# load in directly using read.json(), you'll see that this becomes
# a nested ArrayType/StructType wombo combo
json_df = spark.read.json(spark.sparkContext.parallelize([so_json]))
json_df.printSchema()
root
|-- tables: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- columns: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- name: string (nullable = true)
| | | | |-- type: string (nullable = true)
| | |-- name: string (nullable = true)
| | |-- rows: array (nullable = true)
| | | |-- element: array (containsNull = true)
| | | | |-- element: string (containsNull = true)
# select nested columns "tables" and "rows" and explode
array_df = json_df.select(f.explode(f.col('tables')['rows'][0]))
Exploding takes rows (an ArrayType column) and splits it into actual DataFrame rows. Then you can sub-select either by dot or slice notation:
array_df.printSchema()
root
|-- col: array (nullable = true)
| |-- element: string (containsNull = true)
tabular_df = array_df.select(
array_df.col[0].alias("JobTime"),
array_df.col[1].alias("Status")
)
tabular_df.show()
+--------------------+------+
| JobTime|Status|
+--------------------+------+
|2020-04-19T13:45:...|Failed|
|2020-04-19T14:05:...|Failed|
|2020-04-19T13:46:...|Failed|
|2020-04-19T14:01:...|Failed|
|2020-04-19T14:03:...|Failed|
|2020-04-19T14:01:...|Failed|
|2020-04-19T14:02:...|Failed|
|2020-04-19T14:04:...|Failed|
|2020-04-19T14:04:...|Failed|
|2020-04-19T14:05:...|Failed|
+--------------------+------+
Finally, you want to save as CSV with a custom separator (\t). Hence:
tabular_df.write.csv("path/to/file.tsv", sep="\t")
NB: You may need to manually control for types, such as converting JobTime to TimestampType, but I'll leave that up to you.
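For example, since the JobTime values are ISO-8601 strings, a plain cast should be enough; a small sketch of that conversion:
from pyspark.sql.functions import col

# ISO-8601 strings such as '2020-04-19T13:45:12.528Z' cast cleanly to timestamp
typed_df = tabular_df.withColumn("JobTime", col("JobTime").cast("timestamp"))
typed_df.printSchema()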
Hope this helps.
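For completeness, the preprocessing route mentioned at the top of this answer could look roughly like this; a sketch using only the standard json and csv modules, reusing the so_json string from above and writing to an illustrative local path:
import csv
import json

parsed = json.loads(so_json)
table = parsed["tables"][0]
header = [c["name"] for c in table["columns"]]  # ['JobTime', 'Status']

# write a plain TSV file; '/tmp/result.tsv' is just an illustrative path
with open("/tmp/result.tsv", "w", newline="") as fh:
    writer = csv.writer(fh, delimiter="\t")
    writer.writerow(header)
    writer.writerows(table["rows"])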
My question is: while converting from an RDD to a DataFrame in PySpark, does the schema depend on the first row?
data1 = [('A','abc',0.1,'',0.562),('B','def',0.15,0.5,0.123),('A','ghi',0.2,0.2,0.1345),('B','jkl','',0.1,0.642),('B','mno',0.1,0.1,'')]
>>> val1=sc.parallelize(data1).toDF()
>>> val1.show()
+---+---+----+---+------+
| _1| _2| _3| _4| _5|
+---+---+----+---+------+
| A|abc| 0.1| | 0.562| <------ Does it depends on type of this row?
| B|def|0.15|0.5| 0.123|
| A|ghi| 0.2|0.2|0.1345|
| B|jkl|null|0.1| 0.642|
| B|mno| 0.1|0.1| null|
+---+---+----+---+------+
>>> val1.printSchema()
root
|-- _1: string (nullable = true)
|-- _2: string (nullable = true)
|-- _3: double (nullable = true)
|-- _4: string (nullable = true)
|-- _5: double (nullable = true)
As you can see, column _4 should have been double but it was inferred as string.
Any suggestions would be helpful.
Thanks!
@Prathik, I think you are right.
toDF() is a shorthand for spark.createDataFrame(rdd, schema, sampleRatio).
Here's the signature for createDataFrame:
def createDataFrame(self, data, schema=None, samplingRatio=None, verifySchema=True)
So by default, the parameters schema and samplingRatio are None.
According to the doc:
If schema inference is needed, samplingRatio is used to determine the ratio of
rows used for schema inference. The first row will be used if samplingRatio is None.
So by default, toDF() will use only the first row to infer the data types, which gives StringType for column 4 ('' in the first row) but DoubleType for column 5 (0.562 in the first row).
Here you can't simply declare columns 4 and 5 as DoubleType in a schema, since they contain strings ('') in some rows.
But you can try setting sampleRatio to 0.3 as below:
data1 = [('A','abc',0.1,'',0.562),('B','def',0.15,0.5,0.123),('A','ghi',0.2,0.2,0.1345),('B','jkl','',0.1,0.642),('B','mno',0.1,0.1,'')]
val1=sc.parallelize(data1).toDF(sampleRatio=0.3)
val1.show()
val1.printSchema()
Sometimes the above code will throw an error if it happens to sample a row containing a string:
Can not merge type <class 'pyspark.sql.types.DoubleType'> and <class 'pyspark.sql.types.StringType'>
But if you are patient and try a few more times (< 10 for me), you may get something like this. You can see that both columns 4 and 5 are now DoubleType, because by luck the sampled rows happened not to contain any empty strings while running createDataFrame.
+---+---+----+----+------+
| _1| _2| _3| _4| _5|
+---+---+----+----+------+
| A|abc| 0.1|null| 0.562|
| B|def|0.15| 0.5| 0.123|
| A|ghi| 0.2| 0.2|0.1345|
| B|jkl|null| 0.1| 0.642|
| B|mno| 0.1| 0.1| null|
+---+---+----+----+------+
root
|-- _1: string (nullable = true)
|-- _2: string (nullable = true)
|-- _3: double (nullable = true)
|-- _4: double (nullable = true)
|-- _5: double (nullable = true)
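If you need the resulting types to be deterministic rather than dependent on which rows get sampled, another route (a sketch, not part of the answer above) is to replace the empty strings with None and pass an explicit schema, which skips inference entirely:
from pyspark.sql.types import StructType, StructField, StringType, DoubleType

data1 = [('A','abc',0.1,'',0.562),('B','def',0.15,0.5,0.123),('A','ghi',0.2,0.2,0.1345),('B','jkl','',0.1,0.642),('B','mno',0.1,0.1,'')]

schema = StructType([
    StructField('_1', StringType(), True),
    StructField('_2', StringType(), True),
    StructField('_3', DoubleType(), True),
    StructField('_4', DoubleType(), True),
    StructField('_5', DoubleType(), True),
])

# turn '' into None so the values fit the declared double columns
rdd = sc.parallelize(data1).map(lambda r: tuple(None if v == '' else v for v in r))
val1 = spark.createDataFrame(rdd, schema)
val1.printSchema()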
I need to convert a PySpark df column type from array to string and also remove the square brackets. This is the schema for the dataframe; the columns that need to be processed are CurrencyCode and TicketAmount.
>>> plan_queryDF.printSchema()
root
|-- event_type: string (nullable = true)
|-- publishedDate: string (nullable = true)
|-- plannedCustomerChoiceID: string (nullable = true)
|-- assortedCustomerChoiceID: string (nullable = true)
|-- CurrencyCode: array (nullable = true)
| |-- element: string (containsNull = true)
|-- TicketAmount: array (nullable = true)
| |-- element: string (containsNull = true)
|-- currentPlan: boolean (nullable = true)
|-- originalPlan: boolean (nullable = true)
|-- globalId: string (nullable = true)
|-- PlanJsonData: string (nullable = true)
Sample data from the dataframe:
+--------------------+--------------------+-----------------------+------------------------+------------+------------+-----------+------------+------------+--------------------+
| event_type| publishedDate|plannedCustomerChoiceID|assortedCustomerChoiceID|CurrencyCode|TicketAmount|currentPlan|originalPlan| globalId| PlanJsonData|
+--------------------+--------------------+-----------------------+------------------------+------------+------------+-----------+------------+------------+--------------------+
|PlannedCustomerCh...|2016-08-23T04:46:...| 087d1ff1-5f3a-496...| 2539cc4a-37e5-4f3...| [GBP]| [0]| false| false|000576015000|{"httpStatus":200...|
|PlannedCustomerCh...|2016-08-23T04:30:...| 0a1af217-d1e8-4ab...| 61bc5fda-0160-484...| [CNY]| [329]| true| false|000189668017|{"httpStatus":200...|
|PlannedCustomerCh...|2016-08-23T05:49:...| 1028b477-f93e-47f...| c6d5b761-94f2-454...| [JPY]| [3400]| true| false|000576058003|{"httpStatus":200...|
How can I do it? Currently I am casting to string and then replacing the square brackets with regexp_replace, but this approach fails when I process huge amounts of data.
Is there any other way I can do it?
This is what I want:
+--------------------+--------------------+-----------------------+------------------------+------------+------------+-----------+------------+------------+--------------------+
| event_type| publishedDate|plannedCustomerChoiceID|assortedCustomerChoiceID|CurrencyCode|TicketAmount|currentPlan|originalPlan| globalId| PlanJsonData|
+--------------------+--------------------+-----------------------+------------------------+------------+------------+-----------+------------+------------+--------------------+
|PlannedCustomerCh...|2016-08-23T04:46:...| 087d1ff1-5f3a-496...| 2539cc4a-37e5-4f3...| GBP| 0| false| false|000576015000|{"httpStatus":200...|
|PlannedCustomerCh...|2016-08-23T04:30:...| 0a1af217-d1e8-4ab...| 61bc5fda-0160-484...| CNY| 329| true| false|000189668017|{"httpStatus":200...|
|PlannedCustomerCh...|2016-08-23T05:49:...| 1028b477-f93e-47f...| c6d5b761-94f2-454...| JPY| 3400| true| false|000576058003|{"httpStatus":200...|
You can try getItem(0):
df = df \
    .withColumn("CurrencyCode", df["CurrencyCode"].getItem(0).cast("string")) \
    .withColumn("TicketAmount", df["TicketAmount"].getItem(0).cast("string"))
The final cast to string is optional.
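getItem(0) assumes each array holds exactly one value. If an array could ever contain several values, concat_ws joins them into a single bracket-free string instead; a small sketch, assuming the same column names:
from pyspark.sql.functions import concat_ws

# joins all elements of the array with a comma, e.g. ['GBP'] -> 'GBP'
df = df \
    .withColumn("CurrencyCode", concat_ws(",", df["CurrencyCode"])) \
    .withColumn("TicketAmount", concat_ws(",", df["TicketAmount"]))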