The code below flattens the entire JSON, but the output is not what I want and needs to be modified.
# Read every JSON file found under the bronze "categories" folder of the data lake.
# The path is a plain literal without placeholders, so no f-string prefix is needed.
df = spark.read.json('/mnt/bronze/categories/**/*.json')
# Flatten the struct columns of a DataFrame by one level.
def flatten_df(nested_df):
    """Return ``nested_df`` with its struct columns expanded by one level.

    Columns whose dtype starts with ``string`` are kept unchanged.  Every
    column whose dtype starts with ``struct`` is replaced by one column per
    direct child field, named ``<struct>_<field>``.  Columns of any other
    dtype are not included in the result.

    :param nested_df: DataFrame to flatten.
    :return: DataFrame holding the string columns followed by the expanded
        struct fields.
    """
    string_names = []
    struct_names = []
    for name, dtype in nested_df.dtypes:
        if dtype.startswith('string'):
            string_names.append(name)
        elif dtype.startswith('struct'):
            struct_names.append(name)

    expanded = []
    for parent in struct_names:
        # Selecting '<struct>.*' exposes the names of the direct child fields.
        for child in nested_df.select(parent + '.*').columns:
            expanded.append(col(parent + '.' + child).alias(parent + '_' + child))

    return nested_df.select(string_names + expanded)
# Step into the top-level "_embedded" struct.
df = df.select('_embedded.*')
# Produce one row per element of the "items" array.
df1 = df.select(explode(df['items']).alias('required'))
# Promote the fields of each item to columns; this is the input for flatten_df.
df2 = df1.select('required.*')
# Expand the struct columns found under "items" and show the result.
final = flatten_df(df2)
display(final)
Structure of the file is here:
root
|-- _embedded: struct (nullable = true)
| |-- items: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- _links: struct (nullable = true)
| | | | |-- self: struct (nullable = true)
| | | | | |-- href: string (nullable = true)
| | | |-- code: string (nullable = true)
| | | |-- labels: struct (nullable = true)
| | | | |-- bg_BG: string (nullable = true)
| | | | |-- cs_CZ: string (nullable = true)
| | | | |-- da_DK: string (nullable = true)
| | | | |-- de_AT: string (nullable = true)
| | | | |-- zh_TW: string (nullable = true)
| | | |-- parent: string (nullable = true)
| | | |-- updated: string (nullable = true)
|-- _links: struct (nullable = true)
| |-- first: struct (nullable = true)
| | |-- href: string (nullable = true)
| |-- next: struct (nullable = true)
| | |-- href: string (nullable = true)
| |-- previous: struct (nullable = true)
| | |-- href: string (nullable = true)
| |-- self: struct (nullable = true)
| | |-- href: string (nullable = true)
|-- current_page: long (nullable = true)
Output: it takes each key and turns it into a column header, with the values as cell values.
Desired output: the labels' keys and values should each be presented in their own column (with column names labels_key and labels_values).
Related
I have a pyspark dataframe created from XML. Because of the way XML is structured I have an extra, unnecessary level of nesting in the schema of the dataframe.
The schema of my current dataframe:
root
|-- a: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- movies: struct (nullable = true)
| | | |-- movie: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- b: string (nullable = true)
| | | | | |-- c: string (nullable = true)
| | | | | |-- d: integer (nullable = true)
| | | | | |-- e: string (nullable = true)
| | |-- f: string (nullable = true)
| | |-- g: string (nullable = true)
I'm trying to replace the movies struct with the movie array underneath it as follows:
root
|-- a: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- movies: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- b: string (nullable = true)
| | | | |-- c: string (nullable = true)
| | | | |-- d: integer (nullable = true)
| | | | |-- e: string (nullable = true)
| | |-- f: string (nullable = true)
| | |-- g: string (nullable = true)
The closest I've gotten was using:
from pyspark.sql import functions as F
# NOTE: F.col("a.movies.movie") refers to the top-level column "a", not to the
# lambda variable x, so each element receives the nested "movie" arrays of every
# element of "a"; the schema printed below shows movies_new as an array of arrays.
df.withColumn("a", F.transform('a', lambda x: x.withField("movies_new", F.col("a.movies.movie"))))
which results in the following schema:
root
|-- a: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- movies: struct (nullable = true)
| | | |-- movie: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- b: string (nullable = true)
| | | | | |-- c: string (nullable = true)
| | | | | |-- d: integer (nullable = true)
| | | | | |-- e: string (nullable = true)
| | |-- f: string (nullable = true)
| | |-- g: string (nullable = true)
| | |-- movies_new: array (nullable = true)
| | | |-- element: array (containsNull = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- b: string (nullable = true)
| | | | | |-- c: string (nullable = true)
| | | | | |-- d: integer (nullable = true)
| | | | | |-- e: string (nullable = true)
I understand why this is happening, but thought if I never extracted the nested array out of 'a' that it might not become an array of an array.
Any suggestions?
The logic is:
Explode array "a".
Recompute new struct as (movies.movie, f, g)
Collect "a" back as array.
# 1) One row per element of "a".
df = df.withColumn("a", F.explode("a"))
# 2) Rebuild the element, promoting movies.movie to the "movies" field.
df = df.withColumn(
    "a",
    F.struct(
        df["a"]["movies"]["movie"].alias("movies"),
        df["a"]["f"].alias("f"),
        df["a"]["g"].alias("g"),
    ),
)
# 3) Gather the rebuilt elements back into an array. No grouping key is given,
#    so the elements of all rows are aggregated together.
df = df.select(F.collect_list("a").alias("a"))
The full working code:
import pyspark.sql.functions as F
# Single-row sample matching the nested schema from the question.
df = spark.createDataFrame(
    data=[
        [[(([("b1", "c1", "d1", "e1")],), "f1", "g1")]]
    ],
    schema="a array<struct<movies struct<movie array<struct<b string, c string, d string, e string>>>, f string, g string>>",
)
df.printSchema()
# df.show(truncate=False)

# One row per element of "a".
df = df.withColumn("a", F.explode("a"))
# Rebuild the element, promoting movies.movie to the "movies" field.
df = df.withColumn(
    "a",
    F.struct(
        df["a"]["movies"]["movie"].alias("movies"),
        df["a"]["f"].alias("f"),
        df["a"]["g"].alias("g"),
    ),
)
# Gather the rebuilt elements back into an array. No grouping key is given,
# so the elements of all rows are aggregated together.
df = df.select(F.collect_list("a").alias("a"))
df.printSchema()
# df.show(truncate=False)
Output schema before:
root
|-- a: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- movies: struct (nullable = true)
| | | |-- movie: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- b: string (nullable = true)
| | | | | |-- c: string (nullable = true)
| | | | | |-- d: string (nullable = true)
| | | | | |-- e: string (nullable = true)
| | |-- f: string (nullable = true)
| | |-- g: string (nullable = true)
Output schema after:
root
|-- a: array (nullable = false)
| |-- element: struct (containsNull = false)
| | |-- movies: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- b: string (nullable = true)
| | | | |-- c: string (nullable = true)
| | | | |-- d: string (nullable = true)
| | | | |-- e: string (nullable = true)
| | |-- f: string (nullable = true)
| | |-- g: string (nullable = true)
I have a nested array of structs and I would like to rename one of its fields, as shown in the example below.
Source format
|-- HelloWorld: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- version: string (nullable = true)
| | |-- abc-version: string (nullable = true) ----->This part needs to be renamed
| | |-- again_something: array (nullable = true)
| | | |-- element: map (containsNull = true)
| | | | |-- key: string
| | | | |-- value: string (valueContainsNull = true)
Output format should look like below.
|-- HelloWorld: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- version: string (nullable = true)
| | |-- abc_version: string (nullable = true). ----->This part has changed
| | |-- again_something: array (nullable = true)
| | | |-- element: map (containsNull = true)
| | | | |-- key: string
| | | | |-- value: string (valueContainsNull = true)
I tried different approaches with withField and F.expr to rename the field, but none of them really worked.
Please help.
I would recast it with the same dtype while changing the column name
# Rebuild every element of HelloWorld so that "abc-version" becomes
# "abc_version". The cast keeps the field's original string type, the field
# name "again_something" matches the source schema, and the fields are listed
# in their original order.
df3 = df.withColumn(
    "HelloWorld",
    F.expr(
        "transform(HelloWorld, x -> struct("
        "x.version as version, "
        "cast(x['abc-version'] as string) as abc_version, "
        "x.again_something as again_something))"
    ),
)
# Expected schema of df3:
# root
# |-- HelloWorld: array (nullable = true)
# | |-- element: struct (containsNull = false)
# | | |-- version: string (nullable = true)
# | | |-- abc_version: string (nullable = true)
# | | |-- again_something: array (nullable = true)
# | | | |-- element: map (containsNull = true)
# | | | | |-- key: string
# | | | | |-- value: string (valueContainsNull = true)
I have around 30 JSON files with nested attributes; a sample is shown below. I would like to drop the "questionnaire" column from each file and then union all the files.
Could you please suggest how to achieve this using Python?
Sample file:
|-- profileEntity: struct (nullable = true)
| |-- consent: string (nullable = true)
| |-- documents: array (nullable = true)
| | |-- element: struct (containsNull = true)
| | | |-- documentProperties: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- field: string (nullable = true)
| | | | | |-- value: string (nullable = true)
| | | |-- documentType: string (nullable = true)
| | | |-- files: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- edmFileIndex: long (nullable = true)
| | | | | |-- edmFilename: string (nullable = true)
| |-- questionnaire: struct (nullable = true)
| | |-- deposit3rdParties: array (nullable = true)
| | | |-- element: string (containsNull = true)
| | |-- deposit3rdPartiesOther: string (nullable = true)
The reason is that the "questionnaire" column does not have the same structure in every file, and hence the UNION fails.
Error:
Py4JJavaError: An error occurred while calling o72.union.
: org.apache.spark.sql.AnalysisException: Union can only be performed on tables with the compatible column types.
I'm trying to transform the json output of aws glue get-tables command into a PySpark dataframe.
After reading the json output with this command:
# Read the multi-line JSON document produced by `aws glue get-tables`.
df = (
    spark.read
    .option("inferSchema", "true")
    .option("multiline", "true")
    .json("tmp/my_json.json")
)
I get the following from printSchema:
root
|-- TableList: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- CatalogId: string (nullable = true)
| | |-- CreateTime: string (nullable = true)
| | |-- CreatedBy: string (nullable = true)
| | |-- DatabaseName: string (nullable = true)
| | |-- IsRegisteredWithLakeFormation: boolean (nullable = true)
| | |-- LastAccessTime: string (nullable = true)
| | |-- Name: string (nullable = true)
| | |-- Owner: string (nullable = true)
| | |-- Parameters: struct (nullable = true)
| | | |-- CrawlerSchemaDeserializerVersion: string (nullable = true)
| | | |-- CrawlerSchemaSerializerVersion: string (nullable = true)
| | | |-- UPDATED_BY_CRAWLER: string (nullable = true)
| | | |-- averageRecordSize: string (nullable = true)
| | | |-- classification: string (nullable = true)
| | | |-- compressionType: string (nullable = true)
| | | |-- objectCount: string (nullable = true)
| | | |-- recordCount: string (nullable = true)
| | | |-- sizeKey: string (nullable = true)
| | | |-- spark.sql.create.version: string (nullable = true)
| | | |-- spark.sql.sources.schema.numPartCols: string (nullable = true)
| | | |-- spark.sql.sources.schema.numParts: string (nullable = true)
| | | |-- spark.sql.sources.schema.part.0: string (nullable = true)
| | | |-- spark.sql.sources.schema.part.1: string (nullable = true)
| | | |-- spark.sql.sources.schema.partCol.0: string (nullable = true)
| | | |-- spark.sql.sources.schema.partCol.1: string (nullable = true)
| | | |-- typeOfData: string (nullable = true)
| | |-- PartitionKeys: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- Name: string (nullable = true)
| | | | |-- Type: string (nullable = true)
| | |-- Retention: long (nullable = true)
| | |-- StorageDescriptor: struct (nullable = true)
| | | |-- BucketColumns: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- Columns: array (nullable = true)
| | | | |-- element: struct (containsNull = true)
| | | | | |-- Name: string (nullable = true)
| | | | | |-- Type: string (nullable = true)
| | | |-- Compressed: boolean (nullable = true)
| | | |-- InputFormat: string (nullable = true)
| | | |-- Location: string (nullable = true)
| | | |-- NumberOfBuckets: long (nullable = true)
| | | |-- OutputFormat: string (nullable = true)
| | | |-- Parameters: struct (nullable = true)
| | | | |-- CrawlerSchemaDeserializerVersion: string (nullable = true)
| | | | |-- CrawlerSchemaSerializerVersion: string (nullable = true)
| | | | |-- UPDATED_BY_CRAWLER: string (nullable = true)
| | | | |-- averageRecordSize: string (nullable = true)
| | | | |-- classification: string (nullable = true)
| | | | |-- compressionType: string (nullable = true)
| | | | |-- objectCount: string (nullable = true)
| | | | |-- recordCount: string (nullable = true)
| | | | |-- sizeKey: string (nullable = true)
| | | | |-- spark.sql.create.version: string (nullable = true)
| | | | |-- spark.sql.sources.schema.numPartCols: string (nullable = true)
| | | | |-- spark.sql.sources.schema.numParts: string (nullable = true)
| | | | |-- spark.sql.sources.schema.part.0: string (nullable = true)
| | | | |-- spark.sql.sources.schema.part.1: string (nullable = true)
| | | | |-- spark.sql.sources.schema.partCol.0: string (nullable = true)
| | | | |-- spark.sql.sources.schema.partCol.1: string (nullable = true)
| | | | |-- typeOfData: string (nullable = true)
| | | |-- SerdeInfo: struct (nullable = true)
| | | | |-- Parameters: struct (nullable = true)
| | | | | |-- serialization.format: string (nullable = true)
| | | | |-- SerializationLibrary: string (nullable = true)
| | | |-- SortColumns: array (nullable = true)
| | | | |-- element: string (containsNull = true)
| | | |-- StoredAsSubDirectories: boolean (nullable = true)
| | |-- TableType: string (nullable = true)
| | |-- UpdateTime: string (nullable = true)
But just one column with the whole json is created in df:
+--------------------+
| TableList|
+--------------------+
|[[903342277921, 2...|
+--------------------+
Is there a way to programmatically (and dynamically) create the dataframe in the same way that is referenced in printSchema?
Thanks in advance!
You can use the explode() function to turn the elements of an array into separate rows:
# explode() yields one row per TableList element in a column named "col";
# selecting "col.*" then promotes the fields of that struct to top-level columns.
df = df.select(explode(df['TableList'])).select('col.*')
How can I apply a UDF to the tags array (root>bankAccounts>transactions>tags) in the following dataframe?
data_df.printSchema():
root
|-- reference: string (nullable = true)
|-- bankAccounts: array (nullable = true)
| |-- element: struct (containsNull = true)
| | |-- accountName: string (nullable = true)
| | |-- currentBalance: string (nullable = true)
| | |-- id: long (nullable = true)
| | |-- transactions: array (nullable = true)
| | | |-- element: struct (containsNull = true)
| | | | |-- amount: double (nullable = true)
| | | | |-- balance: string (nullable = true)
| | | | |-- date: string (nullable = true)
| | | | |-- tags: array (nullable = true)
| | | | | |-- element: struct (containsNull = true)
| | | | | | |-- category: string (nullable = true)
| | | | | | |-- creditDebit: string (nullable = true)
| | | | | | |-- thirdParty: string (nullable = true)
| | | | |-- text: string (nullable = true)
| | | | |-- type: string (nullable = true)
|-- countryCode: string (nullable = true)
|-- text: string (nullable = true)
The python method:
def transform_array_to_dict(tag):
    """Merge a sequence of dicts into a single dict.

    When the same key appears in more than one entry, the later entry wins.

    :param tag: iterable of dicts, e.g.
        ``[{"category": "x"}, {"creditDebit": "y"}]``.  ``None`` is treated
        as an empty sequence and ``None`` entries are skipped.
    :return: a new dict holding every key/value pair of the entries.
    """
    merged = {}
    # A missing (None) tags value is handled like an empty list instead of
    # raising a TypeError.
    for entry in tag or ():
        if entry is not None:
            merged.update(entry)
    return merged
which changes the value of each transaction's tags from something like:
[
{
"category": "Dishonours"
},
{
"creditDebit": "debit"
}
]
into:
{
"category": "Dishonours",
"creditDebit": "debit"
}
The method runs as intended when I test it:
# Quick local check of the function with plain Python dicts.
sample = transform_array_to_dict([{"category": "Dishonours"},{"creditDebit": "debit"}])
sample
My outcome is
{'category': 'Dishonours', 'creditDebit': 'debit'}