Parsing nested JSON into multiple DataFrames using pandas in Python

I have a nested JSON document as shown below and want to parse it into multiple dataframes in Python. Please help.
{
"tableName": "cases",
"url": "EndpointVoid",
"tableDataList": [{
"_id": "100017252700",
"title": "Test",
"type": "TECH",
"created": "2016-09-06T19:00:17.071Z",
"createdBy": "193164275",
"lastModified": "2016-10-04T21:50:49.539Z",
"lastModifiedBy": "1074113719",
"notes": [{
"id": "30",
"title": "Multiple devices",
"type": "INCCL",
"origin": "D",
"componentCode": "PD17A",
"issueCode": "IP321",
"affectedProduct": "134322",
"summary": "testing the json",
"caller": {
"email": "katie.slabiak#spps.org",
"phone": "651-744-4522"
}
}, {
"id": "50",
"title": "EDU: Multiple Devices - Lightning-to-USB Cable",
"type": "INCCL",
"origin": "D",
"componentCode": "PD17A",
"issueCode": "IP321",
"affectedProduct": "134322",
"summary": "parsing json 2",
"caller": {
"email": "testing1#test.org",
"phone": "123-345-1111"
}
}],
"syncCount": 2316,
"repair": [{
"id": "D208491610",
"created": "2016-09-06T19:02:48.000Z",
"createdBy": "193164275",
"lastModified": "2016-09-21T12:49:47.000Z"
}, {
"id": "D208491610"
}, {
"id": "D208491628",
"created": "2016-09-06T19:03:37.000Z",
"createdBy": "193164275",
"lastModified": "2016-09-21T12:49:47.000Z"
}
],
"enterpriseStatus": "8"
}],
"dateTime": 1475617849,
"primaryKeys": ["$._id"],
"primaryKeyVals": ["100017252700"],
"operation": "UPDATE"
}
I want to parse this and create 3 tables/dataframes/CSV files as shown below.
Output tables in this format:

I don't think this is the best way, but I wanted to show one possibility.
import json
import pandas as pd

with open('your_sample.json') as f:
    dt = json.load(f)
Table 1
df1 = pd.json_normalize(dt, 'tableDataList', 'dateTime')[['_id', 'title', 'type', 'created', 'createdBy', 'lastModified', 'lastModifiedBy', 'dateTime']]
print(df1)
_id title type created createdBy \
0 100017252700 Test TECH 2016-09-06T19:00:17.071Z 193164275
lastModified lastModifiedBy dateTime
0 2016-10-04T21:50:49.539Z 1074113719 1475617849
Table 2
df2 = pd.json_normalize(dt['tableDataList'], 'notes', '_id')
# newer pandas flattens the nested caller dict into caller.email / caller.phone; older versions keep a 'caller' column of dicts
if 'caller' in df2.columns:
    df2['email'] = df2['caller'].map(lambda x: x['email'])
    df2['phone'] = df2['caller'].map(lambda x: x['phone'])
else:
    df2 = df2.rename(columns={'caller.email': 'email', 'caller.phone': 'phone'})
df2 = df2[['_id', 'id', 'title', 'email', 'phone']]
print(df2)
_id id title \
0 100017252700 30 Multiple devices
1 100017252700 50 EDU: Multiple Devices - Lightning-to-USB Cable
email phone
0 katie.slabiak#spps.org 651-744-4522
1 testing1#test.org 123-345-1111
Table 3
df3 = pd.json_normalize(dt['tableDataList'], 'repair', '_id').dropna()
print(df3)
created createdBy id lastModified \
0 2016-09-06T19:02:48.000Z 193164275 D208491610 2016-09-21T12:49:47.000Z
2 2016-09-06T19:03:37.000Z 193164275 D208491628 2016-09-21T12:49:47.000Z
_id
0 100017252700
2 100017252700
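If CSV files are needed as well (the question asks for tables/dataframes/CSV), each frame can be written out with to_csv; the filenames below are only placeholders:
df1.to_csv('cases.csv', index=False)
df2.to_csv('notes.csv', index=False)
df3.to_csv('repair.csv', index=False)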

Related

Python: parse JSON with 2 arrays via json_normalize

Could you please help me parse a JSON with two arrays via Python's json_normalize?
Here is the code:
import json
import pandas as pd
from pandas.io.json import json_normalize
data5 = {
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
df2 = json_normalize(data5
, record_path = ['topping']
, meta = ['id', 'type', 'name', 'ppu', 'batters']
, record_prefix='_'
, errors='ignore'
)
This parses the "topping" object but doesn't parse "batters".
To parse "batters", the following code may be applied:
# parse that part of the json into another dataframe
df3 = json_normalize(data5
,record_path = ['batters', 'batter'])
# cross join 2 dataframes
df2['key_'] = 1
df3['key_'] = 1
result = pd.merge(df2, df3, on='key_').drop(columns="key_")
But this looks complicated.
Is it possible to combine the 2 steps above into a single call? E.g.:
df2 = json_normalize(data5
, record_path = ['topping', ['batters', 'batter']]
, meta = ['id', 'type', 'name', 'ppu', ]
, record_prefix='_'
, errors='ignore'
)
Thank you.
I don't think you can specify that within json_normalize. However, you can avoid creating the key_ column by specifying how="cross" in pd.merge (available since pandas 1.2); there is also no need to keep batters in df2:
import pandas as pd
df2 = pd.json_normalize(data5
, record_path = ['topping']
, meta = ['id', 'type', 'name', 'ppu']
, record_prefix='_'
)
df3 = pd.json_normalize(data5
,record_path = ['batters', 'batter'])
pd.merge(df2, df3, how="cross")
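For reference, the cross join returns one row per (topping, batter) pair, i.e. 7 x 4 = 28 rows for this sample, and pd.merge appends _x/_y suffixes to the overlapping id and type columns:
result = pd.merge(df2, df3, how="cross")
print(result.shape)  # should be (28, 8) with the sample above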

Merge remaining columns after groupBy and remove NaT/NaNs

Input
id   name   country  lost_item  year  status    resolved_date  closed_date  refunded_date
123  John   US       Bike       2020  Resolved  2021-12-25
125  Mike   CAN      Car        2021  Refunded                              2021-11-22
123  John   US       Car        2019  Resolved  2021-12-25
563  Steve  CAN      Battery    2022  Closed                   2019-02-03
Desired output
{
"items": {
"item": [
{
"id": "123",
"name": "John",
"categories": {
"category": [
{
"lost_item": "Bike",
"year": "2020"
},
{
"lost_item": "Car",
"year": "2019"
}
]
},
"country": "US",
"status": "Resolved",
"resolved_date":"2021-12-25",
},
{
"id": "125",
"name": "Mike",
"categories": {
"category": [
{
"lost_item": "Car",
"year": "2021"
},
]
},
"country": "CAN",
"status": "Reopened",
"refunded_date":"2021-11-22",
},
{
"id": "563",
"name": "Steve",
"categories": {
"category": [
{
"lost_item": "Bike",
"year": "2020"
},
]
},
"country": "CAN",
"status": "Closed",
"closed_date":"2019-02-03",
}
]
}
}
My code:
import json
import pandas as pd

df = pd.read_excel('C:/Users/hero/Desktop/sample.xlsx', sheet_name='catalog')
df["closed_date"] = df["closed_date"].astype(str)
df["resolved_date"] = df["resolved_date"].astype(str)
df["refunded_date"] = df["refunded_date"].astype(str)
partial = (
    df.groupby(['id', 'name', 'country', 'status', 'closed_date', 'resolved_date', 'refunded_date'], dropna=False)
      .apply(lambda x: {"category": x[['lost_item', 'year']].to_dict('records')})
      .reset_index(name="categories")
      .to_dict(orient="records")
)
res = []
for record in partial:
    clean = {key: value for (key, value) in record.items() if value != "NaT"}
    res.append(clean)
print(json.dumps(res, indent=2))  # I will be writing the final payload to a JSON file.
In my input, the fields id, name, country, and status are mandatory. The fields resolved_date, closed_date, and refunded_date are not mandatory and may be empty.
My questions:
Will including columns that have NaN values in the groupby have side effects for large datasets? I didn't find any problem with the above sample input.
Can I remove the fields resolved_date, closed_date, refunded_date from the groupby and append these columns after grouping?
What's the best way to handle the NaNs in the dataset? For my use case, if a NaN is present then I have to drop that particular key, not the entire row (see the sketch below).
Please let me know if there is any room for improvement in my existing code. Any help is appreciated.
Thanks
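Not a full answer, but a minimal sketch for the third question: dropping NaN/NaT keys per record with pd.isna instead of casting the date columns to strings first. It assumes df as read from Excel (before the astype(str) casts) and only illustrates the key-dropping step, not the categories grouping:
import json
import pandas as pd

records = df.to_dict(orient="records")  # raw rows; missing dates are still NaT/NaN here
# keep only keys whose values are present; the mandatory fields are never missing
clean = [{k: v for k, v in rec.items() if not pd.isna(v)} for rec in records]
print(json.dumps(clean, indent=2, default=str))  # default=str renders Timestamps and numpy scalars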

Convert PANDAS dataframe to nested JSON + add array name

I've been wrestling with this for many days now and would appreciate any help.
I'm importing an Excel file into a Pandas dataframe, resulting in the following dataframe [record]:
account_id  name    timestamp            value
A0001C      Fund_1  1588618800000000000  1
B0001B      Dev_2   1601578800000000000  1
I'm looking to produce a nested JSON output (it will be used to submit data to an API), including adding "records" and "metrics" labels for the arrays.
Here is the output I'm looking for:
{
"records": [
{
"name": "Fund_1",
"account_id": "A0001C",
"metrics": [
{
"timestamp": 1588618800000000000,
"value": 1
}
]
}
{
"name": "Dev_2",
"account_id": "B0001B",
"metrics": [
{
"timestamp": 1601578800000000000,
"value": 1
}
]
}
]
}
I've gotten a non-nested JSON data set as output, but I'm not able to split out the timestamp and value to add the metrics part.
for record in df.to_dict(orient='records'):
    record_data = {'records': [record]}
    payload_json = json.dumps(record_data)
    print(payload_json)
I get the following output:
{"records": [{"account_id": "A0001C", "name": "Fund_1", "Date Completed": 1588618800000000000, "Count": "1"}]}
{"records": [{"account_id": "B0001B", "name": "Dev_2", "Date Completed": 1601578800000000000, "Count": "1"}]}
Any help on how I can modify my code to add the metrics label and nest the data would be appreciated.
Thanks in advance.
One approach is through the use of DataFrame.apply. This allows you to apply a function to each row or column (passed as a Series) of your dataframe.
In your particular case, you want to apply the function row by row, so you have to use apply with axis=1:
records = list(df.apply(lambda row: {"name": row["name"],
"account_id": row["account_id"],
"metrics": [{
"timestamp": row["timestamp"],
"value": row["value"]}]
},
axis=1).values)
payload = {"records": records}
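Printing json.dumps(payload, indent=4) should then give the same nested structure as the desired output above (json needs to be imported for that).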
Alternatively, you could introduce an auxiliary column "metrics" in which you store your metrics, and then convert the frame to records with to_dict:
df["metrics"] = df.apply(lambda e: [{"timestamp": e.timestamp,
"value": e.value}],
axis=1)
records = df[["account_id", "name", "metrics"]].to_dict(orient="records")
payload = {"records": records}
Here's a full example applying option 2:
import io
import json
import pandas as pd
data = io.StringIO("""account_id name timestamp value
A0001C Fund_1 1588618800000000000 1
B0001B Dev_2 1601578800000000000 1""")
df = pd.read_csv(data, sep="\t")
df["metrics"] = df.apply(lambda e: [{"timestamp": e.timestamp,
"value": e.value}],
axis=1)
records = df[["account_id", "name", "metrics"]].to_dict(orient="records")
payload = {"records": records}
print(json.dumps(payload, indent=4))
Output:
{
"records": [
{
"account_id": "A0001C",
"name": "Fund_1",
"metrics": [
{
"timestamp": 1588618800000000000,
"value": 1
}
]
},
{
"account_id": "B0001B",
"name": "Dev_2",
"metrics": [
{
"timestamp": 1601578800000000000,
"value": 1
}
]
}
]
}
Edit: The second approach also makes grouping by accounts (in case you want to do that) rather easy. Below is a small example and output:
import io
import json
import pandas as pd
data = io.StringIO("""account_id name timestamp value
A0001C Fund_1 1588618800000000000 1
A0001C Fund_1 1588618900000000000 2
B0001B Dev_2 1601578800000000000 1""")
df = pd.read_csv(data, sep="\t")
# adding the metrics column as above
df["metrics"] = df.apply(lambda e: {"timestamp": e.timestamp,
"value": e.value},
axis=1)
# group metrics by account
df_grouped = df.groupby(by=["name", "account_id"]).metrics.agg(list).reset_index()
records = df_grouped[["account_id", "name", "metrics"]].to_dict(orient="records")
payload = {"records": records}
print(json.dumps(payload, indent=4))
Output:
{
"records": [
{
"account_id": "B0001B",
"name": "Dev_2",
"metrics": [
{
"timestamp": 1601578800000000000,
"value": 1
}
]
},
{
"account_id": "A0001C",
"name": "Fund_1",
"metrics": [
{
"timestamp": 1588618800000000000,
"value": 1
},
{
"timestamp": 1588618900000000000,
"value": 2
}
]
}
]
}

Import JSON to dataframe and normalize

I have the following JSON document which I want to import into a dataframe:
{
"agents": [
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101819,
"name": "O Laptops"
}
],
"id": 2198802,
"ip": "x.x.x.x",
"last_connect": 1539962159,
"last_scanned": 1539373347,
"linked_on": 1534964847,
"name": "x1x1x1x1",
"platform": "WINDOWS",
"plugin_feed_id": "201810182051",
"status": "on",
"uuid": "ca8b941a-80cd-4c1c-8044-760e69781eb7"
},
{
"core_build": "17",
"core_version": "7.1.1",
"distro": "win-x86-64",
"groups": [
{
"id": 101839,
"name": "G Personal"
},
{
"id": 102037,
"name": "W6"
},
{
"id": 102049,
"name": "MS8"
}
],
"id": 2097601,
"ip": "x.x.x.x",
"last_connect": 1539962304,
"last_scanned": 1539437865,
"linked_on": 1529677890,
"name": "x2xx2x2x2",
"platform": "WINDOWS",
"plugin_feed_id": "201810181351",
"status": "on",
"uuid": "7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2"
},
{
"core_build": "14",
"core_version": "7.1.0",
"distro": "win-x86-64",
"id": 2234103,
"ip": "x6x6x6x6x",
"last_connect": 1537384290,
"linked_on": 1537384247,
"name": "x7x7x7x",
"platform": "WINDOWS",
"status": "off",
"uuid": "0696ee38-402a-4866-b753-2816482dfce6"
}],
"pagination": {
"limit": 5000,
"offset": 0,
"sort": [
{
"name": "name",
"order": "asc"
}
],
"total": 14416
}
}
I've written the following code for the same purpose:
import json
import pandas as pd

with open('out.json') as f:
    data = json.load(f)

df = pd.json_normalize(data, 'agents', [['groups', 'name']], errors='ignore')
print(df)
This unpacks all the fields within 'agents' (along with the 'groups' field as a multi-value field) as-is, along with a new field called 'groups.name' which is null (all values are NaN).
I only wish to unpack the fields within 'agents' into a dataframe, with the fields within 'groups' being unpacked into individual columns ('core_build', 'core_version', 'distro', 'groups.name', 'id', 'ip', 'last_connect', 'last_scanned', 'linked_on', 'name', 'platform', 'plugin_feed_id', 'status', 'uuid').
How can I achieve this?
Edit:
Doing the following
df = pd.json_normalize(pd.concat([pd.DataFrame(i) for i in data['agents']]).to_dict('records'))
returns an error
ValueError: If using all scalar values, you must pass an index
You can use pd.concat() with a list comprehension:
df = pd.concat([pd.DataFrame(i) for i in my_json['agents']])
Or try this if you would like to unpack the groups column of type dict into separate columns:
df = pd.json_normalize(pd.concat([pd.DataFrame(i) for i in my_json['agents']]).to_dict('records'))
Yields:
core_build core_version distro groups.id groups.name id \
0 17 7.1.1 win-x86-64 101819 O Laptops 2198802
1 17 7.1.1 win-x86-64 101893 V Laptops 2169839
2 17 7.1.1 win-x86-64 101839 Personal 2097601
3 17 7.1.1 win-x86-64 102037 Wi 2097601
4 17 7.1.1 win-x86-64 102049 MS8 2097601
ip last_connect last_scanned linked_on name platform \
0 x.x.x.x 1539962159 1539373347 1534964847 x1x1x1x1 WINDOWS
1 x.x.x.x 1539962767 1539374603 1533666075 x2x2x2x2 WINDOWS
2 x.x.x.x 1539962304 1539437865 1529677890 x3x3x3x3 WINDOWS
3 x.x.x.x 1539962304 1539437865 1529677890 x3x3x3x3 WINDOWS
4 x.x.x.x 1539962304 1539437865 1529677890 x3x3x3x3 WINDOWS
plugin_feed_id status uuid
0 201810182051 on ca8b941a-80cd-4c1c-8044-760e69781eb7
1 201810171657 on 9400817b-235b-423b-b163-c4c86f973232
2 201810181351 on 7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2
3 201810181351 on 7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2
4 201810181351 on 7e3ef1ff-4f08-445a-b500-e7ce3ca9a2f2
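Note that the concat approach fails for agents that have no "groups" key (the third agent above), which is what raises the ValueError mentioned in the question's edit. A plain-Python flattening sketch that keeps such agents, assuming the JSON has been loaded into data as in the question:
import pandas as pd

rows = []
for agent in data['agents']:
    # one row per (agent, group) pair; agents without groups get a single row with empty group columns
    for group in (agent.get('groups') or [{}]):
        row = {k: v for k, v in agent.items() if k != 'groups'}
        row['groups.id'] = group.get('id')
        row['groups.name'] = group.get('name')
        rows.append(row)

df = pd.DataFrame(rows)
print(df[['id', 'name', 'groups.id', 'groups.name']])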

Convert JSON with nested objects to Pandas Dataframe

I am trying to load JSON from a URL and convert it to a Pandas dataframe, so that the dataframe would look like the sample below.
I've tried json_normalize, but it duplicates the columns, one for each data type (value and stringValue). Is there a simpler way than that approach followed by dropping and renaming columns after creating the dataframe? I want to keep the stringValue.
   Person ID Position ID  Job ID Manager
0        192         936      93     Tom
my_json = {
"columns": [
{
"alias": "c3",
"label": "Person ID",
"dataType": "integer"
},
{
"alias": "c36",
"label": "Position ID",
"dataType": "string"
},
{
"alias": "c40",
"label": "Job ID",
"dataType": "integer",
"entityType": "job"
},
{
"alias": "c19",
"label": "Manager",
"dataType": "integer"
},
],
"data": [
{
"c3": {
"value": 192,
"stringValue": "192"
},
"c36": {
"value": "936",
"stringValue": "936"
},
"c40": {
"value": 93,
"stringValue": "93"
},
"c19": {
"value": 12412453,
"stringValue": "Tom"
}
}
]
}
If c19 is of type string, this should work
import pandas as pd

alias_to_label = {x['alias']: x['label'] for x in my_json["columns"]}
is_str = {x['alias']: ('string' == x['dataType']) for x in my_json["columns"]}
data = []
for x in my_json["data"]:
    data.append({
        k: v["stringValue" if is_str[k] else 'value']
        for k, v in x.items()
    })
df = pd.DataFrame(data).rename(columns=alias_to_label)
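A quick check with the sample above: c19 is declared as "integer" there, so its numeric value (12412453) is kept; changing its dataType to "string", as the opening sentence assumes, yields "Tom" and matches the desired output:
print(df)
# roughly:
#    Person ID Position ID  Job ID Manager
# 0        192         936      93     Tom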
