Databricks - Pyspark - Handling nested json with a dynamic key - python

I have an example json data file which has the following structure:
{
  "Header": {
    "Code1": "abc",
    "Code2": "def",
    "Code3": "ghi",
    "Code4": "jkl"
  },
  "TimeSeries": {
    "2020-11-25T03:00:00+00:00": {
      "UnitPrice": 1000,
      "Amount": 10000
    },
    "2020-11-26T03:00:00+00:00": {
      "UnitPrice": 1000,
      "Amount": 10000
    }
  }
}
When I parse this in Databricks with the command:
df = spark.read.json("/FileStore/test.txt")
I get two objects as output: Header and TimeSeries. For TimeSeries I want to flatten the structure so that it has the following schema:
Date
UnitPrice
Amount
As the date field is a key, I am currently only able to access it by iterating through the column names and then using them in dot notation dynamically:
from pyspark.sql.functions import lit

def flatten_json(data):
    columnlist = data.select("TimeSeries.*")
    count = 0
    for name in data.select("TimeSeries.*"):
        df1 = data.select("Header.*").withColumn("Timeseries", lit(columnlist.columns[count])).withColumn("join", lit("a"))
        df2 = data.select("TimeSeries." + columnlist.columns[count] + ".*").withColumn("join", lit("a"))
        if count == 0:
            df3 = df1.join(df2, on=['join'], how="inner")
        else:
            df3 = df3.union(df1.join(df2, on=['join'], how="inner"))
        count = count + 1
    return df3
This is far from ideal. Does anyone know a better method to create the described dataframe?

The idea:
Step 1: Extract Header and TimeSeries separately.
Step 2: For each field in the TimeSeries object, extract the Amount and UnitPrice, together with the name of the field, stuff them into a struct.
Step 3: Merge all these structs into an array column, and explode it.
Step 4: Extract Timeseries, Amount and UnitPrice from the exploded column.
Step 5: Cross join with the Header row.
import pyspark.sql.functions as F

header_df = df.select("Header.*")
timeseries_df = df.select("TimeSeries.*")
fieldNames = enumerate(timeseries_df.schema.fieldNames())
cols = [
    F.struct(
        F.lit(name).alias("Timeseries"),
        F.col(name).getItem("Amount").alias("Amount"),
        F.col(name).getItem("UnitPrice").alias("UnitPrice"),
    ).alias("ts_" + str(idx))
    for idx, name in fieldNames
]
combined = F.explode(F.array(cols)).alias("comb")
timeseries = timeseries_df.select(combined).select("comb.Timeseries", "comb.Amount", "comb.UnitPrice")
result = header_df.crossJoin(timeseries)
result.show(truncate=False)
Output:
+-----+-----+-----+-----+-------------------------+------+---------+
|Code1|Code2|Code3|Code4|Timeseries |Amount|UnitPrice|
+-----+-----+-----+-----+-------------------------+------+---------+
|abc |def |ghi |jkl |2020-11-25T03:00:00+00:00|10000 |1000 |
|abc |def |ghi |jkl |2020-11-26T03:00:00+00:00|10000 |1000 |
+-----+-----+-----+-----+-------------------------+------+---------+
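A side note beyond the answer above: if the dynamic timestamp keys can be treated as map entries rather than struct fields, a round trip through to_json/from_json avoids building one struct per key. This is only a sketch, assuming Spark 2.4+ and the example schema from the question (integer values under each timestamp):
from pyspark.sql import functions as F
from pyspark.sql.types import MapType, StringType, StructType, StructField, LongType

# Schema of each TimeSeries entry, taken from the example file (assumed integer values).
value_schema = StructType([
    StructField("UnitPrice", LongType()),
    StructField("Amount", LongType()),
])

# Re-read the struct of dynamic keys as a map<string, struct>, then explode it into rows.
ts_map = F.from_json(F.to_json(F.col("TimeSeries")), MapType(StringType(), value_schema))
flat = (
    df.select("Header.*", F.explode(ts_map).alias("Date", "ts"))
      .select("Code1", "Code2", "Code3", "Code4", "Date", "ts.UnitPrice", "ts.Amount")
)
flat.show(truncate=False)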

Related

Flattening multi nested json into a pandas dataframe

I'm trying to flatten this json response into a pandas dataframe to export to csv.
It looks like this:
j = [
    {
        "id": 401281949,
        "teams": [
            {
                "school": "Louisiana Tech",
                "conference": "Conference USA",
                "homeAway": "away",
                "points": 34,
                "stats": [
                    {"category": "rushingTDs", "stat": "1"},
                    {"category": "puntReturnYards", "stat": "24"},
                    {"category": "puntReturnTDs", "stat": "0"},
                    {"category": "puntReturns", "stat": "3"},
                ],
            }
        ],
    }
]
...Many more items in the stats area.
If I run this and flatten to the teams level:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
I get:
school conference homeAway points stats
0 Louisiana Tech Conference USA away 34 [{'category': 'rushingTDs', 'stat': '1'}, {'ca...
How do I flatten it twice so that all of the stats are on their own column in each row?
If I do this:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
multiple_level_data = multiple_level_data.explode('stats').reset_index(drop=True)
multiple_level_data=multiple_level_data.join(pd.json_normalize(multiple_level_data.pop('stats')))
I end up with multiple rows instead of more columns:
You can try:
df = pd.DataFrame(j).explode("teams")
df = pd.concat([df, df.pop("teams").apply(pd.Series)], axis=1)
df["stats"] = df["stats"].apply(lambda x: {d["category"]: d["stat"] for d in x})
df = pd.concat(
    [
        df,
        df.pop("stats").apply(pd.Series),
    ],
    axis=1,
)
print(df)
Prints:
id school conference homeAway points rushingTDs puntReturnYards puntReturnTDs puntReturns
0 401281949 Louisiana Tech Conference USA away 34 1 24 0 3
can you try this:
multiple_level_data = pd.json_normalize(j, record_path =['teams'])
multiple_level_data = multiple_level_data.explode('stats').reset_index(drop=True)
multiple_level_data=multiple_level_data.join(pd.json_normalize(multiple_level_data.pop('stats')))
#convert rows to columns.
multiple_level_data=multiple_level_data.set_index(multiple_level_data.columns[0:4].to_list())
dfx=multiple_level_data.pivot_table(values='stat',columns='category',aggfunc=list).apply(pd.Series.explode).reset_index(drop=True)
multiple_level_data=multiple_level_data.reset_index().drop(['stat','category'],axis=1).drop_duplicates().reset_index(drop=True)
multiple_level_data=multiple_level_data.join(dfx)
Output:
   school          conference      homeAway  points  puntReturnTDs  puntReturnYards  puntReturns  rushingTDs
0  Louisiana Tech  Conference USA  away      34      0              24               3            1
Instead of calling explode() on the output of json_normalize(), you can explicitly pass the paths to the metadata for each column in a single json_normalize() call. For example, ['teams', 'school'] would be one path, ['teams', 'conference'] another, etc. This will create a long dataframe similar to what you already have.
Then you can call pivot() to reshape this output into the correct shape.
# normalize json
df = pd.json_normalize(
    j, record_path=['teams', 'stats'],
    meta=['id', *(['teams', c] for c in ('school', 'conference', 'homeAway', 'points'))]
)
# column names contain a 'teams' prefix; remove it
df.columns = [c.split('.')[1] if '.' in c else c for c in df]
# pivot the intermediate result
df = (
    df.astype({'points': int, 'id': int})
    .pivot(['id', 'school', 'conference', 'homeAway', 'points'], 'category', 'stat')
    .reset_index()
)
# remove index name
df.columns.name = None
df

How to reference a dynamically created dataframe in a for loop?

I have some variables and a dictionary mapping strings to imported Google Sheets:
grad_year = '2029'
df_dict = {'grade_1': grade_1_class_2029,
           'grade_2': grade_2_class_2029,
           'grade_3': grade_3_class_2029,
           'grade_4': grade_4_class_2029,
           'grade_5': grade_5_class_2029}
I then turn the google sheets into dataframes, naming them dynamically:
for key, val in df_dict.items():
    rows = val.get_all_values()
    vars()["df_" + key + "_class_" + grad_year] = pd.DataFrame.from_records(
        rows[2:], columns=rows[1]
    )
Now I would like to reference them without a pre-created dictionary of their names.
There is still a bunch of stuff I would like to do to the new dataframes such as deleting blank rows. I have tried:
for key, val in df_dict.items():
    rows = val.get_all_values()
    vars()["df_" + key + "_class_" + grad_year] = pd.DataFrame.from_records(
        rows[2:], columns=rows[1]
    )
    vars()["df_" + key + "_class_" + grad_year].replace("", nan_value, inplace=True)
    vars()["df_" + key + "_class_" + grad_year].dropna(
        subset=["Last Name"], inplace=True
    )
and
for key, val in df_dict.items():
    rows = val.get_all_values()
    vars()["df_" + key + "_class_" + grad_year] = (
        pd.DataFrame.from_records(rows[2:], columns=rows[1])
        .replace("", nan_value, inplace=True)
        .dropna(subset=["Last Name"], inplace=True)
    )
but neither worked.
If you replace nan_value with pd.NA (available since pandas 1.0.0), your first code snippet works fine:
import pandas as pd

grad_year = "2029"
vars()[f"df_{grad_year}"] = pd.DataFrame(
    {
        "class": {
            0: "class1",
            1: "class2",
            2: "class3",
            3: "class4",
        },
        "name": {0: "John", 1: "Jack", 2: "", 3: "Butch"},
    }
)
vars()[f"df_{grad_year}"].replace("", pd.NA, inplace=True)
vars()[f"df_{grad_year}"].dropna(subset=["name"], inplace=True)
print(vars()[f"df_{grad_year}"])
# Outputs
class name
0 class1 John
1 class2 Jack
3 class4 Butch
In your second code snippet, you also have to set inplace to False instead of True both times in order for the method chaining to work:
vars()[f"df_{grad_year}"] = (
    pd.DataFrame(
        {
            "class": {
                0: "class1",
                1: "class2",
                2: "class3",
                3: "class4",
            },
            "name": {0: "John", 1: "Jack", 2: "", 3: "Butch"},
        }
    )
    .replace("", pd.NA, inplace=False)
    .dropna(subset=["name"], inplace=False)
)
print(vars()[f"df_{grad_year}"])
# Output
class name
0 class1 John
1 class2 Jack
3 class4 Butch

Filter nested JSON structure and get field names as values in Pyspark

I have the following complex data that I would like to parse in PySpark:
records = '[{"segmentMembership":{"ups":{"FF6KCPTR6AQ0836R":{"lastQualificationTime":"2021-01-16 22:05:11.074357","status":"exited"},"QMS3YRT06JDEUM8O":{"lastQualificationTime":"2021-01-16 22:05:11.074357","status":"realized"},"8XH45RT87N6ZV4KQ":{"lastQualificationTime":"2021-01-16 22:05:11.074357","status":"exited"}}},"_aepgdcdevenablement2":{"emailId":{"address":"stuff#someemail.com"},"person":{"name":{"firstName":"Name2"}},"identities":{"customerid":"PH25PEUWOTA7QF93"}}},{"segmentMembership":{"ups":{"FF6KCPTR6AQ0836R":{"lastQualificationTime":"2021-01-16 22:05:11.074457","status":"realized"},"D45TOO8ZUH0B7GY7":{"lastQualificationTime":"2021-01-16 22:05:11.074457","status":"realized"},"QMS3YRT06JDEUM8O":{"lastQualificationTime":"2021-01-16 22:05:11.074457","status":"existing"}}},"_aepgdcdevenablement2":{"emailId":{"address":"stuff4#someemail.com"},"person":{"name":{"firstName":"TestName"}},"identities":{"customerid":"9LAIHVG91GCREE3Z"}}}]'
df = spark.read.json(sc.parallelize([records]))
df.show()
df.printSchema()
The problem I am having is with the segmentMembership object. The JSON object looks like this:
"segmentMembership": {
  "ups": {
    "FF6KCPTR6AQ0836R": {
      "lastQualificationTime": "2021-01-16 22:05:11.074357",
      "status": "exited"
    },
    "QMS3YRT06JDEUM8O": {
      "lastQualificationTime": "2021-01-16 22:05:11.074357",
      "status": "realized"
    },
    "8XH45RT87N6ZV4KQ": {
      "lastQualificationTime": "2021-01-16 22:05:11.074357",
      "status": "exited"
    }
  }
}
The annoying thing is that the key values ("FF6KCPTR6AQ0836R", "QMS3YRT06JDEUM8O", "8XH45RT87N6ZV4KQ") each end up being defined as a column in PySpark.
In the end, if the status of the segment is "exited", I was hoping to get the results as follows.
+--------------------+----------------+---------+------------------+
|address |customerid |firstName|segment_id |
+--------------------+----------------+---------+------------------+
|stuff#someemail.com |PH25PEUWOTA7QF93|Name2 |[8XH45RT87N6ZV4KQ]|
|stuff4#someemail.com|9LAIHVG91GCREE3Z|TestName |[8XH45RT87N6ZV4KQ]|
+--------------------+----------------+---------+------------------+
After loading the data into a dataframe (above), I tried the following:
dfx = df.select("_aepgdcdevenablement2.emailId.address", "_aepgdcdevenablement2.identities.customerid", "_aepgdcdevenablement2.person.name.firstName", "segmentMembership.ups")
dfx.show(truncate=False)
seg_list = array(*[lit(k) for k in ["8XH45RT87N6ZV4KQ", "QMS3YRT06JDEUM8O"]])
print(seg_list)
# if v["status"] in ['existing', 'realized']
def confusing_compare(ups, seg_list):
    seg_id_filtered_d = dict((k, ups[k]) for k in seg_list if k in ups)
    # This is the line I am having a problem with.
    # seg_id_status_filtered_d = {key for key, value in seg_id_filtered_d.items() if v["status"] in ['existing', 'realized']}
    return list(seg_id_filtered_d)
final_conf_dx_pred = udf(confusing_compare, ArrayType(StringType()))
result_df = dfx.withColumn("segment_id", final_conf_dx_pred(dfx.ups, seg_list)).select("address", "customerid", "firstName", "segment_id")
result_df.show(truncate=False)
I am not able to check the status field within the value field of the dict.
You can actually do that without using UDF. Here I'm using all the segment names present in the schema and filtering out those with status = 'exited'. You can adapt it depending on which segments and status you want.
First, using the schema fields, get the list of all segment names like this:
segment_names = df.select("segmentMembership.ups.*").schema.fieldNames()
Then, by looping through the list created above and using the when function, you can create a column that holds either the segment name or null, depending on the status:
active_segments = [
    when(col(f"segmentMembership.ups.{c}.status") != lit("exited"), lit(c))
    for c in segment_names
]
Finally, add new column segments of array type and use filter function to remove null elements from the array (which corresponds to status 'exited'):
dfx = df.withColumn("segments", array(*active_segments)) \
    .withColumn("segments", expr("filter(segments, x -> x is not null)")) \
    .select(
        col("_aepgdcdevenablement2.emailId.address"),
        col("_aepgdcdevenablement2.identities.customerid"),
        col("_aepgdcdevenablement2.person.name.firstName"),
        col("segments").alias("segment_id")
    )
dfx.show(truncate=False)
#+--------------------+----------------+---------+------------------------------------------------------+
#|address |customerid |firstName|segment_id |
#+--------------------+----------------+---------+------------------------------------------------------+
#|stuff#someemail.com |PH25PEUWOTA7QF93|Name2 |[QMS3YRT06JDEUM8O] |
#|stuff4#someemail.com|9LAIHVG91GCREE3Z|TestName |[D45TOO8ZUH0B7GY7, FF6KCPTR6AQ0836R, QMS3YRT06JDEUM8O]|
#+--------------------+----------------+---------+------------------------------------------------------+
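To match the output shown in the question (only the segments whose status is "exited"), flipping the comparison in the when condition above should be enough; a small variation on the same code:
exited_segments = [
    when(col(f"segmentMembership.ups.{c}.status") == lit("exited"), lit(c))
    for c in segment_names
]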

Json file not formatted correctly when writing json differences with pandas and numpy

I am trying to compare two JSON files and then write another JSON file with the column names and the differences marked as yes or no. I am using pandas and numpy.
Below are sample files. In reality these JSON files are dynamic, meaning we don't know upfront how many keys there will be.
Input files:
fut.json:
[
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]
curr.json:
[
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]
Below is the code I have tried:
import json
import pandas as pd
import numpy as np

with open(r"c:\csv\fut.json", 'r+') as f:
    data_b = json.load(f)
with open(r"c:\csv\curr.json", 'r+') as f:
    data_a = json.load(f)

df_a = pd.json_normalize(data_a)
df_b = pd.json_normalize(data_b)
_, df_a = df_b.align(df_a, fill_value=np.NaN)
_, df_b = df_a.align(df_b, fill_value=np.NaN)

with open(r"c:\csv\report.json", 'w') as _file:
    for col in df_a.columns:
        df_temp = pd.DataFrame()
        df_temp[col + '_curr'], df_temp[col + '_fut'], df_temp[col + '_diff'] = df_a[col], df_b[col], np.where((df_a[col] == df_b[col]), 'No', 'Yes')
        #[df_temp.rename(columns={c:'Missing'}, inplace=True) for c in df_temp.columns if df_temp[c].isnull().all()]
        df_temp.fillna('Missing', inplace=True)
        with pd.option_context('display.max_colwidth', -1):
            _file.write(df_temp.to_json(orient='records'))
Expected output:
[
    {
        "AlarmName_curr": "test",
        "AlarmName_fut": "test",
        "AlarmName_diff": "No"
    },
    {
        "StateValue_curr": "OK",
        "StateValue_fut": "OK",
        "StateValue_diff": "No"
    }
]
Actual output: I am not able to parse it in a JSON validator. The problem is below: the ][ should be replaced by ',' to get valid JSON, and I don't know why it is printing like that:
[{"AlarmName_curr":"test","AlarmName_fut":"test","AlarmName_diff":"No"}][{"StateValue_curr":"OK","StateValue_fut":"OK","StateValue_diff":"No"}]
Edit 1:
I tried the below as well:
_file.write(df_temp.to_json(orient='records',lines=True))
Now I get JSON which is again not parseable: the ',' is missing, and unless I manually add a ',' between the two dicts and '[' and ']' at the beginning and end, it does not parse:
[{"AlarmName_curr":"test","AlarmName_fut":"test","AlarmName_diff":"No"}{"StateValue_curr":"OK","StateValue_fut":"OK","StateValue_diff":"No"}]
Honestly, pandas is overkill for this... however:
Load the dataframes as you did.
Concat them as columns and rename the columns.
Do the calcs and map the booleans to the desired Yes/No.
to_json() returns a string, so use json.loads() to get it back into a list/dict. Filter the columns to get to your required format.
import json
import pandas as pd

data_b = [
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]
data_a = [
    {
        "AlarmName": "test",
        "StateValue": "OK"
    }
]
df_a = pd.json_normalize(data_a)
df_b = pd.json_normalize(data_b)
df = pd.concat([df_a, df_b], axis=1)
df.columns = [c + "_curr" for c in df_a.columns] + [c + "_fut" for c in df_a.columns]
# equal values mean there is no difference, hence the True -> "No" mapping
df["AlarmName_diff"] = df["AlarmName_curr"] == df["AlarmName_fut"]
df["StateValue_diff"] = df["StateValue_curr"] == df["StateValue_fut"]
df = df.replace({True: "No", False: "Yes"})
js = json.loads(df.loc[:, [c for c in df.columns if c.startswith("Alarm")]].to_json(orient="records"))
js += json.loads(df.loc[:, [c for c in df.columns if c.startswith("State")]].to_json(orient="records"))
js
Output:
[{'AlarmName_curr': 'test', 'AlarmName_fut': 'test', 'AlarmName_diff': 'No'},
 {'StateValue_curr': 'OK', 'StateValue_fut': 'OK', 'StateValue_diff': 'No'}]
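If the end goal is still the report.json file from the question, the combined list can then be written out with a single json.dump call (the path below is the one used in the question):
with open(r"c:\csv\report.json", "w") as _file:
    json.dump(js, _file, indent=4)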

What is the data format returned by the AdWords API TargetingIdeaPage service?

When I query the AdWords API to get search volume data and trends through their TargetingIdeaSelector using the Python client library, the returned data looks like this:
(TargetingIdeaPage){
   totalNumEntries = 1
   entries[] =
      (TargetingIdea){
         data[] =
            (Type_AttributeMapEntry){
               key = "KEYWORD_TEXT"
               value =
                  (StringAttribute){
                     Attribute.Type = "StringAttribute"
                     value = "keyword phrase"
                  }
            },
            (Type_AttributeMapEntry){
               key = "TARGETED_MONTHLY_SEARCHES"
               value =
                  (MonthlySearchVolumeAttribute){
                     Attribute.Type = "MonthlySearchVolumeAttribute"
                     value[] =
                        (MonthlySearchVolume){
                           year = 2016
                           month = 2
                           count = 2900
                        },
                        ...
                        (MonthlySearchVolume){
                           year = 2015
                           month = 3
                           count = 2900
                        },
                  }
            },
      },
}
This isn't JSON and appears to just be a messy Python list. What's the easiest way to flatten the monthly data into a Pandas dataframe with a structure like this?
Keyword | Year | Month | Count
keyword phrase 2016 2 10
The output is a sudsobject. I found that this code does the trick:
import suds.sudsobject as sudsobject
import pandas as pd
a = [sudsobject.asdict(x) for x in output]
df = pd.DataFrame(a)
Addendum: This was once correct, but newer versions of the API (I tested v201802) now return zeep objects. However, zeep.helpers.serialize_object should do the same trick.
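A minimal sketch of the zeep variant, assuming stats_page is the TargetingIdeaPage returned by the service (as in the complete code below):
from zeep.helpers import serialize_object
import pandas as pd

# Convert the zeep response objects into plain dicts, then into a dataframe.
entries = [serialize_object(entry, dict) for entry in stats_page['entries']]
df = pd.DataFrame(entries)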
Here's the complete code that I used to query the TargetingIdeaSelector with requestType STATS, and the method I used to parse the data into a usable dataframe; note the section starting "Parse results to pandas dataframe", as this takes the output given in the question above and converts it to a dataframe. Probably not the fastest or best, but it works! Tested with Python 2.7.
"""This code pulls trends for a set of keywords, and parses into a dataframe.
The LoadFromStorage method is pulling credentials and properties from a
"googleads.yaml" file. By default, it looks for this file in your home
directory. For more information, see the "Caching authentication information"
section of our README.
"""
from googleads import adwords
import pandas as pd

adwords_client = adwords.AdWordsClient.LoadFromStorage()
PAGE_SIZE = 10

# Initialize appropriate service.
targeting_idea_service = adwords_client.GetService(
    'TargetingIdeaService', version='v201601')

# Construct selector object and retrieve related keywords.
offset = 0
stats_selector = {
    'searchParameters': [
        {
            'xsi_type': 'RelatedToQuerySearchParameter',
            'queries': ['donald trump', 'bernie sanders']
        },
        {
            # Language setting (optional).
            # The ID can be found in the documentation:
            # https://developers.google.com/adwords/api/docs/appendix/languagecodes
            'xsi_type': 'LanguageSearchParameter',
            'languages': [{'id': '1000'}],
        },
        {
            # Location setting
            'xsi_type': 'LocationSearchParameter',
            'locations': [{'id': '1027363'}]  # Burlington, Vermont
        }
    ],
    'ideaType': 'KEYWORD',
    'requestType': 'STATS',
    'requestedAttributeTypes': ['KEYWORD_TEXT', 'TARGETED_MONTHLY_SEARCHES'],
    'paging': {
        'startIndex': str(offset),
        'numberResults': str(PAGE_SIZE)
    }
}
stats_page = targeting_idea_service.get(stats_selector)
##########################################################################
# Parse results to pandas dataframe
stats_pd = pd.DataFrame()
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        stats_attributes = {}
        for stats_attribute in stats_result['data']:
            # print(stats_attribute)
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for i, val in enumerate(stats_attribute['value'][1]):
                    data = {'keyword': kt,
                            'year': val['year'],
                            'month': val['month'],
                            'count': val['count']}
                    data = pd.DataFrame(data, index=[i])
                    stats_pd = stats_pd.append(data, ignore_index=True)
print(stats_pd)
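One caveat not in the original answer: DataFrame.append was removed in pandas 2.0, so on a current pandas the same parsing loop can collect plain dicts and build the dataframe once at the end, for example:
rows = []
if 'entries' in stats_page:
    for stats_result in stats_page['entries']:
        for stats_attribute in stats_result['data']:
            if stats_attribute['key'] == 'KEYWORD_TEXT':
                kt = stats_attribute['value']['value']
            else:
                for val in stats_attribute['value'][1]:
                    rows.append({'keyword': kt, 'year': val['year'],
                                 'month': val['month'], 'count': val['count']})
stats_pd = pd.DataFrame(rows)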
