Convert nested dictionary to pandas dataframe in python

Convert nested dictionary to pandas dataframe in python - python

All, I have the following nested dictionary (from a JSON API response). I would like to access the individual items by means of a pandas dataframe.
The dictionary looks as follows:
{'pagination': {'limit': 2, 'offset': 0, 'count': 2, 'total': 1474969}, 'data': [{'flight_date': '2022-10-12', 'flight_status': 'active', 'departure': {'airport': 'Tullamarine', 'timezone': 'Australia/Melbourne', 'iata': 'MEL', 'icao': 'YMML', 'terminal': '2', 'gate': '16', 'delay': 20, 'scheduled': '2022-10-12T00:50:00+00:00', 'estimated': '2022-10-12T00:50:00+00:00', 'actual': '2022-10-12T01:09:00+00:00', 'estimated_runway': '2022-10-12T01:09:00+00:00', 'actual_runway': '2022-10-12T01:09:00+00:00'}, 'arrival': {'airport': 'Hong Kong International', 'timezone': 'Asia/Hong_Kong', 'iata': 'HKG', 'icao': 'VHHH', 'terminal': '1', 'gate': None, 'baggage': None, 'delay': None, 'scheduled': '2022-10-12T06:55:00+00:00', 'estimated': '2022-10-12T06:55:00+00:00', 'actual': None, 'estimated_runway': None, 'actual_runway': None}, 'airline': {'name': 'Finnair', 'iata': 'AY', 'icao': 'FIN'}, 'flight': {'number': '5844', 'iata': 'AY5844', 'icao': 'FIN5844', 'codeshared': {'airline_name': 'cathay pacific', 'airline_iata': 'cx', 'airline_icao': 'cpa', 'flight_number': '178', 'flight_iata': 'cx178', 'flight_icao': 'cpa178'}}, 'aircraft': None, 'live': None}, {'flight_date': '2022-10-12', 'flight_status': 'active', 'departure': {'airport': 'Tullamarine', 'timezone': 'Australia/Melbourne', 'iata': 'MEL', 'icao': 'YMML', 'terminal': '2', 'gate': '5', 'delay': 25, 'scheduled': '2022-10-12T00:30:00+00:00', 'estimated': '2022-10-12T00:30:00+00:00', 'actual': '2022-10-12T00:55:00+00:00', 'estimated_runway': '2022-10-12T00:55:00+00:00', 'actual_runway': '2022-10-12T00:55:00+00:00'}, 'arrival': {'airport': 'Kuala Lumpur International Airport (klia)', 'timezone': 'Asia/Kuala_Lumpur', 'iata': 'KUL', 'icao': 'WMKK', 'terminal': '1', 'gate': None, 'baggage': None, 'delay': 3, 'scheduled': '2022-10-12T06:00:00+00:00', 'estimated': '2022-10-12T06:00:00+00:00', 'actual': None, 'estimated_runway': None, 'actual_runway': None}, 'airline': {'name': 'KLM', 'iata': 'KL', 'icao': 'KLM'}, 'flight': 
{'number': '4109', 'iata': 'KL4109', 'icao': 'KLM4109', 'codeshared': {'airline_name': 'malaysia airlines', 'airline_iata': 'mh', 'airline_icao': 'mas', 'flight_number': '128', 'flight_iata': 'mh128', 'flight_icao': 'mas128'}}, 'aircraft': None, 'live': None}]}
The dictionary is stored under the variable name api_response. I am using the following code to convert to a dataframe as described in https://sparkbyexamples.com/pandas/pandas-convert-json-to-dataframe/
My code:
import boto3
import json
from datetime import datetime
import calendar
import random
import time
import requests
import pandas as pd
aircraftdata = ''
params = {
'access_key': 'KEY',
'limit': '2',
'flight_status':'active'
}
url = "http://api.aviationstack.com/v1/flights"
api_result = requests.get('http://api.aviationstack.com/v1/flights', params)
api_statuscode = api_result.status_code
api_response = api_result.json()
print (type(api_response)) #dictionary
print (api_response)
df = pd.DataFrame.from_dict(api_response, orient = 'index')
This yields the following error:
AttributeError: 'list' object has no attribute 'items'
I would like to obtain a dataframe that contains the live data for each flight:
flight_iata, live_latitude, live_longitude
AA1004, 36.2, -106.8

# Flatten each flight record; nested dictionaries become dotted column names
# such as "live.latitude".
df = pd.json_normalize(api_response["data"])
# Keep only the columns whose name contains "live" (case-insensitive).
is_live_column = df.columns.str.contains("live", case=False)
live_columns = df.columns[is_live_column]
df = df[live_columns]
print(df)
live.updated live.latitude live.longitude live.altitude live.direction live.speed_horizontal live.speed_vertical live.is_ground
0 2019-12-12T10:00:00+00:00 36.2856 -106.807 8846.82 114.34 894.348 1.188 False
If you want to drop live. from the headers you can:
df.columns = df.columns.str.split(".").str[-1]
print(df)
updated latitude longitude altitude direction speed_horizontal speed_vertical is_ground
0 2019-12-12T10:00:00+00:00 36.2856 -106.807 8846.82 114.34 894.348 1.188 False

Considering the desired output, let's say that the dictionary dic looks like the following
dic = {
"pagination": {
"limit": 100,
"offset": 0,
"count": 100,
"total": 1669022
},
"data": [
{
"flight_date": "2019-12-12",
"flight_status": "active",
"departure": {
"airport": "San Francisco International",
"timezone": "America/Los_Angeles",
"iata": "SFO",
"icao": "KSFO",
"terminal": "2",
"gate": "D11",
"delay": 13,
"scheduled": "2019-12-12T04:20:00+00:00",
"estimated": "2019-12-12T04:20:00+00:00",
"actual": "2019-12-12T04:20:13+00:00",
"estimated_runway": "2019-12-12T04:20:13+00:00",
"actual_runway": "2019-12-12T04:20:13+00:00"
},
"arrival": {
"airport": "Dallas/Fort Worth International",
"timezone": "America/Chicago",
"iata": "DFW",
"icao": "KDFW",
"terminal": "A",
"gate": "A22",
"baggage": "A17",
"delay": 0,
"scheduled": "2019-12-12T04:20:00+00:00",
"estimated": "2019-12-12T04:20:00+00:00",
"actual": None,
"estimated_runway": None,
"actual_runway": None
},
"airline": {
"name": "American Airlines",
"iata": "AA",
"icao": "AAL"
},
"flight": {
"number": "1004",
"iata": "AA1004",
"icao": "AAL1004",
"codeshared": None
},
"aircraft": {
"registration": "N160AN",
"iata": "A321",
"icao": "A321",
"icao24": "A0F1BB"
},
"live": {
"updated": "2019-12-12T10:00:00+00:00",
"latitude": 36.28560000,
"longitude": -106.80700000,
"altitude": 8846.820,
"direction": 114.340,
"speed_horizontal": 894.348,
"speed_vertical": 1.188,
"is_ground": False
}
}
]
}
In order to obtain the desired output, one can start by converting the dictionary to a dataframe with pandas.DataFrame
df = pd.DataFrame(dic['data'], columns=['flight', 'live'])
[Out]:
flight live
0 {'number': '1004', 'iata': 'AA1004', 'icao': '... {'updated': '2019-12-12T10:00:00+00:00', 'lati...
Then, one can use .apply() with a custom lambda function as follows
df = df[['flight', 'live']].apply(lambda x: pd.Series([x['flight']['iata'], x['live']['latitude'], x['live']['longitude'], x['live']['altitude']]), axis=1)
[Out]:
0 1 2 3
0 AA1004 36.2856 -106.807 8846.82
Finally, the only thing missing is the column names. To set them, one can do the following
df.columns = ['flight_iata', 'live_latitude', 'live_longitude', 'live_altitude']
[Out]:
flight_iata live_latitude live_longitude live_altitude
0 AA1004 36.2856 -106.807 8846.82
And that is the desired output.

Related

Save dict as netCDF / xarray

I have a problem. I want to save a dict. But unfortunately I got the following error - TypeError: expected bytes, list found. I want to save my my_dict as netCDF. How could I save my dict? I looked at https://docs.xarray.dev/en/stable/user-guide/io.html , Saving Python dictionary to netCDF4 file and some other links and blogs
from netCDF4 import Dataset
my_dict = {
'_key': '1',
'group': 'test',
'data': {},
'type': '',
'code': '007',
'conType': '1',
'flag': None,
'createdAt': '2021',
'currency': 'EUR',
'detail': {
'selector': {
'number': '12312',
'isTrue': True,
'requirements': [{
'type': 'customer',
'requirement': '1'}]
}
}
'identCode': [],
}
ds = Dataset(my_dict)
[OUT] TypeError: expected bytes, list found
ds.to_netcdf("saved_on_disk.nc")

Create JSON from another JSON with duplicate values in Python

I have a JSON:
[{'job': 'fireman', 'salary': 30000', 'country':'USA'}, {'job': 'doctor', 'salary': '50000': 'country': 'Canada'},{'job': 'fireman', 'salary': 60000', 'country':'France'}, {'job': 'Engineer', 'salary': 45000', 'country':'Mexico'} ]
I want to combine the duplicate values and create a JSON like:
[
{"job": "fireman",
"sumamry": [{"country": "USA", "Salary": 40000}, {"Country": "France", "Salary": 60000}]
"total" : 100000},
{"job": "doctor",
"summary": [{"country": "Canada", "Salary": 50000}]
"total" : 50000},
....
]

Try this:
non_summarized = [
    {'job': 'fireman', 'salary': 30000, 'country': 'USA'},
    {'job': 'doctor', 'salary': 50000, 'country': 'Canada'},
    {'job': 'fireman', 'salary': 60000, 'country': 'France'},
    {'job': 'Engineer', 'salary': 45000, 'country': 'Mexico'},
]

# Sort the list of dictionaries by job so that entries with the same job are
# adjacent; one pass can then detect a new group by comparing each entry with
# the previous one.
non_summarized = sorted(non_summarized, key=lambda entry: entry['job'])

summarized = []
last_value = {}
for d in non_summarized:
    # A job that differs from the previous entry's job starts a new record.
    if last_value.get('job') != d.get('job'):
        last_value = {
            'job': d.get('job'),
            'total': 0,
            'summary': [],
        }
        summarized.append(last_value)
    # last_value is the same dict object that was appended above, so these
    # updates are visible through the summarized list as well.
    last_value['total'] += d.get('salary', 0)
    last_value['summary'].append({
        'country': d.get('country'),
        'salary': d.get('salary'),
    })

print(summarized)
Please let me know if you need any clarification.

How to iterate over a JSON array and get values for a key which itself is a JSON object

I have been trying to do something that seems simple, yet it is hard for me to solve!
I have a json object that looks like:
jsonObject = {
'attributes': {
'192': {  # this key can change from time to time (it may be a different number)
'id': '192',
'code': 'hello',
'label': 'world',
'options': [
{
'id': '211',
'label': '5'
},
{
'id': '1202',
'label': '8.5'
},
{
'id': '54',
'label': '9'
},
{
'id': '1203',
'label': '9.5'
},
{
'id': '58',
'label': '10'
}
]
}
},
'template': '12345',
'basePrice': '51233',
'oldPrice': '51212',
'productId': 'hello',
}
and what I want to do is to get the values from options (To have both id and label saved into a list)
For now I only managed to do:
for att, value in jsonObject.items():
print(f"{att} - {value}"
How can I get the label and id?

You can try the following code:
attr = jsonObject['attributes']
# The attribute key ('192' in the example) is not fixed, so take the first
# value instead of looking it up by name. For the example data this is the
# same as "temp = attr['192']".
temp = list(attr.values())[0]
options = temp['options']
for option in options:
    print(f"id: {option['id']}, label: {option['label']}")

How do I use Pandas to convert an Excel file to a nested JSON?

I am a rookie programmer and I'm trying to convert an excel file into a nested JSON using Pandas.
I am posting my code and the expected output, which I am not able to achieve so far. The problem is that the excel columns which I transform into nested info, should actually fall under the name "addresses" and I can't figure out how to do that. Will be grateful for any advice.
This is what the Excel file looks like:
import pandas as pd
import json
df = pd.read_excel("...", encoding = "utf-8-sig")
df.fillna('', inplace = True)
def get_nested_entry(key, grp):
    """Build one output record for a group of rows.

    Args:
        key: Sequence whose first three items are stored under 'Forename',
            'Middle Name' and 'Surname' respectively.
        grp: The rows of the group; must provide an 'Address - Country'
            column.

    Returns:
        A dict with the three name fields plus, under 'Address - Country',
        the list of distinct values of that column in order of appearance.
    """
    entry = {}
    entry['Forename'] = key[0]
    entry['Middle Name'] = key[1]
    entry['Surname'] = key[2]
    for field in ['Address - Country']:
        # .tolist() converts NumPy scalars to plain Python values, so the
        # record stays serialisable with json.dump for numeric columns too.
        entry[field] = grp[field].unique().tolist()
    return entry
entries = []
for key, grp in df.groupby(['Forename', 'Middle Name', 'Surname']):
entry = get_nested_entry(key, grp)
entries.append(entry)
print(entries)
with open("excel_to_json_output.json", "w", encoding = "utf-8-sig") as f:
json.dump(entries, f, indent = 4)
This is the expected outcome
[
{
"firstName": "Angela",
"lastName": "L.",
"middleName": "Johnson",
"addresses": [
{
"postcode": "32807",
"city": "Orlando",
"state": "FL",
"country": "United States of America"
}
],
What I get is this
[
{
"Forename": "Angela",
"Middle Name": "L.",
"Surname": "Johnson",
"Address - Country": [
"United States of America"
]
},

Try this
b = {
    'First_Name': ["Angela", "Peter", "John"],
    'Middle_Name': ["L", "J", "A"],
    'Last_Name': ["Johnson", "Roth", "Williams"],
    'City': ["chicago", "seattle", "st.loius"],
    'state': ["IL", "WA", "MO"],
    'zip': [60007, 98105, 63115],
    'country': ["USA", "USA", "USA"],
}
df = pd.DataFrame(b)

# The first three columns identify the person; the remaining columns make up
# the address. Each call yields one dict per row, in row order.
predict = df.iloc[:, :3].to_dict(orient='records')
postdict = df.iloc[:, 3:].to_dict(orient='records')

entities = []
# Both lists have one item per row, so pairing them up matches each person
# with the address taken from the same row.
for tm, address in zip(predict, postdict):
    tm["addresses"] = [address]
    entities.append(tm)
output
[{'First_Name': 'Angela',
'Middle_Name': 'L',
'Last_Name': 'Johnson',
'addresses': [{'City': 'chicago',
'state': 'IL',
'zip': 60007,
'country': 'USA'}]},
{'First_Name': 'Peter',
'Middle_Name': 'J',
'Last_Name': 'Roth',
'addresses': [{'City': 'seattle',
'state': 'WA',
'zip': 98105,
'country': 'USA'}]},
{'First_Name': 'John',
'Middle_Name': 'A',
'Last_Name': 'Williams',
'addresses': [{'City': 'st.loius',
'state': 'MO',
'zip': 63115,
'country': 'USA'}]}]

Create 2 records from JSON Array having Structs

I have a JSON array which is in the format below:
{
"id": "1",
"active": "True",
"gender": "female",
"coding": [
{
"system": "http://loinc.org",
"code": "8310-5",
"display": "Body temperature"
},
{
"system": "http://loinc.org",
"code": "8716-3",
"display": "Vital Signs grouping"
}
]
}
- I need the output as two records. Is this possible? Can someone help me with the Python code?
{"id": "1","active": "True","gender": "female",{"system": "http://loinc.org","code": "8310-5","display": "Body temperature"},
{"id": "1","active": "True","gender": "female",{"system": "http://loinc.org","code": "8716-3","display": "Vital Signs grouping"}

I'm going to assume you want the codings in their own key since your question wasn't clear
import json
obj = json.loads(s)  # where s is your json string
objs = []  # where we will store the results
for coding in obj['coding']:
    # Shallow copy: only the top-level 'coding' key is replaced below, so the
    # parsed input in obj is left unchanged.
    new_obj = obj.copy()
    new_obj['coding'] = coding  # set the coding entry to one coding
    objs.append(new_obj)
Output of objs:
[{'active': 'True',
'coding': {'code': '8310-5',
'display': 'Body temperature',
'system': 'http://loinc.org'},
'gender': 'female',
'id': '1'},
{'active': 'True',
'coding': {'code': '8716-3',
'display': 'Vital Signs grouping',
'system': 'http://loinc.org'},
'gender': 'female',
'id': '1'}]
If you want just a flat dict then
objs = []
for coding in obj['coding']:
    # Copy the top-level fields, drop the nested list, then merge the fields
    # of this single coding into the copy to get one flat record.
    new_obj = obj.copy()
    del new_obj['coding']
    new_obj.update(coding)
    objs.append(new_obj)
Now objs is:
[{'active': 'True',
'code': '8310-5',
'display': 'Body temperature',
'gender': 'female',
'id': '1',
'system': 'http://loinc.org'},
{'active': 'True',
'code': '8716-3',
'display': 'Vital Signs grouping',
'gender': 'female',
'id': '1',
'system': 'http://loinc.org'}]

You can do it like this:
import json
input_dict = json.loads(myjson)
# Fields shared by every output record: everything except the "coding" list.
base = input_dict.copy()
del base['coding']
# One flat record per coding entry: the shared fields plus that entry's fields.
output = [{**base, **c} for c in input_dict['coding']]
print(output)
Output:
[{'active': 'True', 'code': '8310-5', 'display': 'Body temperature', 'gender': 'female', 'id': '1', 'system': 'http://loinc.org'},
{'active': 'True', 'code': '8716-3', 'display': 'Vital Signs grouping', 'gender': 'female', 'id': '1', 'system': 'http://loinc.org'}]

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Convert nested dictionary to pandas dataframe in python - python

Related

Save dict as netCDF / xarray

Create JSON from another JSON with duplicate values in Python

How to iterate over a JSON array and get values for a key which itself is a JSON object

How do I use Pandas to convert an Excel file to a nested JSON?

Create 2 records from JSON Array having Structs

Categories

Resources