Unpacking JSON with nested lists in pandas - python

I have a JSON file that I am trying to unpack that looks like this:
[{'batter': 'LA Marsh',
  'bowler': 'MJG Nielsen',
  'non_striker': 'M Kapp',
  'runs': {'batter': 0, 'extras': 0, 'total': 0}},
 {'batter': 'LA Marsh',
  'bowler': 'MJG Nielsen',
  'non_striker': 'M Kapp',
  'runs': {'batter': 0, 'extras': 0, 'total': 0},
  'wickets': [{'player_out': 'LA Marsh', 'kind': 'bowled'}]},
 {'batter': 'EA Perry',
  'bowler': 'MJG Nielsen',
  'non_striker': 'M Kapp',
  'runs': {'batter': 0, 'extras': 0, 'total': 0}}]
using the following code:
df = pd.json_normalize(data)
I get a DataFrame in which the second entry still carries a nested list in the 'wickets' column. In place of the column 'wickets' I would like to have two columns, 'player_out' and 'kind'.

Use:
df = df.drop(columns=['wickets']).join(df['wickets'].explode().apply(pd.Series))
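A roughly equivalent sketch that avoids the comparatively slow .apply(pd.Series) step, assuming df came from pd.json_normalize(data) as above:
exploded = df['wickets'].explode()
valid = exploded.dropna()
# build the two new columns from the dicts, realigned on the original index
expanded = pd.DataFrame(valid.tolist(), index=valid.index)
df = df.drop(columns=['wickets']).join(expanded)
Rows without a wicket simply get NaN in the joined columns.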

You can try:
import pandas as pd
from collections.abc import MutableMapping  # collections.MutableMapping was removed in Python 3.10

def flatten(d, parent_key='', sep='.'):
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            # assumes list items are dicts
            for idx, value in enumerate(v):
                items.extend(flatten(value, new_key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)
data = [{'batter': 'LA Marsh',
         'bowler': 'MJG Nielsen',
         'non_striker': 'M Kapp',
         'runs': {'batter': 0, 'extras': 0, 'total': 0}},
        {'batter': 'LA Marsh',
         'bowler': 'MJG Nielsen',
         'non_striker': 'M Kapp',
         'runs': {'batter': 0, 'extras': 0, 'total': 0},
         'wickets': [{'player_out': 'LA Marsh', 'kind': 'bowled'}]},
        {'batter': 'EA Perry',
         'bowler': 'MJG Nielsen',
         'non_striker': 'M Kapp',
         'runs': {'batter': 0, 'extras': 0, 'total': 0}}]

output = []
for dict_data in data:
    output.append(flatten(dict_data))

df = pd.DataFrame(output)
print(df)
Output:
batter bowler non_striker runs.batter runs.extras runs.total wickets.player_out wickets.kind
0 LA Marsh MJG Nielsen M Kapp 0 0 0 NaN NaN
1 LA Marsh MJG Nielsen M Kapp 0 0 0 LA Marsh bowled
2 EA Perry MJG Nielsen M Kapp 0 0 0 NaN NaN
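One caveat: when a list holds more than one dict (say, two wickets recorded on one delivery), the list branch above reuses the same new_key, so later items overwrite earlier ones. A hedged variant (flatten_indexed is a name made up here) that suffixes the list position only when needed:
from collections.abc import MutableMapping

def flatten_indexed(d, parent_key='', sep='.'):
    # same as flatten() above, but suffixes the list position when a list
    # holds more than one item, so repeated keys no longer overwrite each other
    items = []
    for k, v in d.items():
        new_key = parent_key + sep + k if parent_key else k
        if isinstance(v, MutableMapping):
            items.extend(flatten_indexed(v, new_key, sep=sep).items())
        elif isinstance(v, list):
            for idx, value in enumerate(v):
                key = new_key if len(v) == 1 else f"{new_key}{sep}{idx}"
                items.extend(flatten_indexed(value, key, sep=sep).items())
        else:
            items.append((new_key, v))
    return dict(items)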

If you want to keep using json_normalize, you first need to homogenize the data, then apply json_normalize:
# homogenize data
nan_entries = [{'player_out': pd.NA, 'kind': pd.NA}]
for entry in data:
    if 'wickets' not in entry.keys():
        entry['wickets'] = nan_entries

# use json_normalize
pd.json_normalize(data,
                  record_path='wickets',
                  meta=['batter', 'bowler', 'non_striker', ['runs', 'batter'],
                        ['runs', 'extras'], ['runs', 'total']],
                  record_prefix='wickets.')
Output:
wickets.player_out wickets.kind batter bowler non_striker runs.batter runs.extras runs.total
0 <NA> <NA> LA Marsh MJG Nielsen M Kapp 0 0 0
1 LA Marsh bowled LA Marsh MJG Nielsen M Kapp 0 0 0
2 <NA> <NA> EA Perry MJG Nielsen M Kapp 0 0 0
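Note that the loop above mutates data in place. If that is undesirable, a sketch that builds a homogenized copy instead:
nan_entries = [{'player_out': pd.NA, 'kind': pd.NA}]
# shallow-copy each record, filling in 'wickets' only where it is missing
homogenized = [{**entry, 'wickets': entry.get('wickets', nan_entries)}
               for entry in data]
df = pd.json_normalize(homogenized,
                       record_path='wickets',
                       meta=['batter', 'bowler', 'non_striker', ['runs', 'batter'],
                             ['runs', 'extras'], ['runs', 'total']],
                       record_prefix='wickets.')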

Related

How to extract value from a column with multiple dictionaries

I have a column that was extracted using Pandas. Each cell in the column may contain one dictionary, more than one dictionary, or NaN.
Column B
[{'url': 'mailto:Kim_Do@dmx.com', 'type': 0, 'id': 1021857, 'name': 'KIM Do', 'entryListId': -1}, {'url': 'mailto:Angel_Kong@dmx.com', 'type': 0, 'id': 1023306, 'name': 'Angel Kong', 'entryListId': -1}, {'url': 'mailto:Alex_Do@dmx.com', 'type': 0, 'id': 1023289, 'name': 'Alex Do', 'entryListId': -1}]
[{'url': 'mailto:Ray_Chan@dmx.com', 'type': 0, 'id': 1021857, 'name': 'Ray Chan', 'entryListId': -1}, {'url': 'mailto:Paul_Jones@dmx.com', 'type': 0, 'id': 1023306, 'name': 'Paul Jones', 'entryListId': -1}]
nan
nan
[{'url': 'mailto:Ray_Chaudhry@dmx.com', 'type': 0, 'id': 1021857, 'name': 'Ray Chaudhry', 'entryListId': -1}]
What I want back is just the names from the dictionary. So, the output should be as follows:
Column B
Kim Do, Angel Kong, Alex Do, Fred Tome
Ray Chan, Paul Jones
nan
nan
Ray Chaudhry
How can I achieve this? Thank you!
You can use:
df['New'] = df['Column B'].explode().str['name'].dropna().groupby(level=0).agg(', '.join)
Output (New column only):
0 KIM Do, Angel Kong, Alex Do
1 Ray Chan, Paul Jones
2 NaN
3 NaN
4 Ray Chaudhry
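Step by step, that one-liner does the following (each intermediate shown on its own line):
s = df['Column B'].explode()        # one dict (or NaN) per row; original index preserved
names = s.str['name']               # .str[key] looks up 'name' in each dict
names = names.dropna()              # drop the rows that held NaN
df['New'] = names.groupby(level=0).agg(', '.join)  # re-join names per original row
groupby(level=0) groups the exploded rows back by their original row index before joining.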
Use this function:
def extract_names(list_data):
    row_names = []
    for n in range(len(list_data)):
        row_names.append(list_data[n]['name'])
    return row_names

store_names = []
col = 'Column B'  # column name
for idx, row in df.iterrows():
    store_names.append(extract_names(row[col]))
Now you can store the list as a column of your choice:
df['Names'] = store_names
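Note this loop raises a TypeError on the NaN rows in the sample data, since extract_names calls len() on a float. A hedged guard, assuming the NaN cells are floats as pandas usually stores them (the join also matches the comma-separated output the question asks for):
store_names = []
for idx, row in df.iterrows():
    cell = row['Column B']
    if isinstance(cell, list):                        # only real lists can be unpacked
        store_names.append(', '.join(extract_names(cell)))
    else:
        store_names.append(cell)                      # keep NaN cells as-is
df['Names'] = store_names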

How to Unpack Dictionary in Column Dataframe Pandas

Stack Overflow, please do your magic.
I have a pandas DataFrame like this:
Column_one \
{{'name': 'Marfon ', 'email': '', 'phone': '123454333', 'address': 'San Jose', 'estimated_date': 2019-10-01 00:00:00, 'estimated_time': {'minimum': 1000, 'maximum': 1200, 'min': 0, 'max': 0}}
{{'name': 'Joe Doe ', 'email': 'joe@gmail.com', 'phone': '987655444', 'address': 'Carolina', 'estimated_date': 2019-10-01 00:00:00, 'estimated_time': {'minimum': 1000, 'maximum': 1200, 'min': 0, 'max': 0}}
Column_two
[{'status': False, 'item_code': 'JSK', 'price': 15000, 'note': [], 'sub_total_price': 50}]
[{'status': False, 'item_code': 'HSO', 'price': 15000, 'note': [], 'sub_total_price': 100}]
how to create new dataframe like this?
name email phone address item_code
Marfon 123454333 San Jose JSK
Joe Doe joe@gmail.com 987655444 Carolina HSO
Solved:
column_one = pd.DataFrame(main_df['Column_one'].values.tolist(), index=main_df.index)
column_two = main_df['Column_two'].apply(lambda x: ', '.join(y['item_code'] for y in x))
data_con = pd.concat([column_one, column_two], axis=1)
print(data_con)
You have some mess in your input data. But if what you meant was this, then:
Column_one =\
[{'name': 'Marfon ', 'email': '', 'phone': '123454333', 'address': 'San Jose', 'estimated_date': '2019-10-01 00:00:00'},
{'name': 'Joe Doe ', 'email': 'joe@gmail.com', 'phone': '987655444', 'address': 'Carolina', 'estimated_date': '2019-10-01 00:00:00'}]
Column_two=\
[{'status': False, 'item_code': 'JSK', 'price': 15000, 'note': [], 'sub_total_price': 50},
{'status': False, 'item_code': 'HSO', 'price': 15000, 'note': [], 'sub_total_price': 100}]
pd.concat([pd.DataFrame(Column_one), pd.DataFrame(Column_two)], axis=1)
Output:
name email phone address estimated_date status item_code price note sub_total_price
Marfon 123454333 San Jose 2019-10-01 00:00:00 False JSK 15000 [] 50
Joe Doe joe#gmail.com 987655444 Carolina 2019-10-01 00:00:00 False HSO 15000 [] 100

googlemaps distance_matrix Creating a Matrix of Results

So I am trying to do a project for work where I create a matrix: the x-axis is one list of stores, the y-axis is another list of stores, and the values are the returned distances. See the link below for an example of what I'm looking for:
Example Output
This is the code I am running, with dummy addresses for lst_store1 and lst_store2.
INPUT:
import googlemaps

lst_store1 = ['777 Brockton Avenue, Abington MA 2351',
              '30 Memorial Drive, Avon MA',
              '250 Hartford Avenue, Bellingham MA',
              '700 Oak Street, Brockton MA',
              '591 Memorial Dr, Chicopee MA']
lst_store2 = ['55 Brooksby Village Way, Danvers MA',
              '137 Teaticket Hwy, East Falmouth MA',
              '42 Fairhaven Commons Way, Fairhaven MA',
              '374 William S Canning Blvd, Fall River MA',
              '121 Worcester Rd, Framingham MA']

my_dist = gmaps.distance_matrix(lst_store1, lst_store2)
print(my_dist)
OUTPUT:
'destination_addresses': ['55 Brooksby Village Dr, Danvers, MA 01923, USA', '137 Teaticket Hwy, Teaticket, MA 02536, USA', '42 Fairhaven Commons Way, Fairhaven, MA 02719, USA', '374 William S Canning Blvd, Fall River, MA 02721, USA', '121 Worcester Rd, Framingham, MA 01701, USA'], 'origin_addresses': ['777 Brockton Ave, Abington, MA 02351, USA', '30 Memorial Dr, Avon, MA 02322, USA', '250 Hartford Ave, Bellingham, MA 02019, USA', '700 Oak St, Brockton, MA 02301, USA', '591 Memorial Dr, Chicopee, MA 01020, USA'], 'rows': [{'elements': [{'distance': {'text': '65.0 km', 'value': 65015}, 'duration': {'text': '1 hour 4 mins', 'value': 3860}, 'status': 'OK'}, {'distance': {'text': '89.0 km', 'value': 89014}, 'duration': {'text': '1 hour 14 mins', 'value': 4437}, 'status': 'OK'}, {'distance': {'text': '72.4 km', 'value': 72367}, 'duration': {'text': '56 mins', 'value': 3339}, 'status': 'OK'}, {'distance': {'text': '63.4 km', 'value': 63418}, 'duration': {'text': '51 mins', 'value': 3034}, 'status': 'OK'}, {'distance': {'text': '58.7 km', 'value': 58690}, 'duration': {'text': '50 mins', 'value': 2998}, 'status': 'OK'}]}, {'elements': [{'distance': {'text': '62.6 km', 'value': 62649}, 'duration': {'text': '53 mins', 'value': 3189}, 'status': 'OK'}, {'distance': {'text': '96.8 km', 'value': 96832}, 'duration': {'text': '1 hour 5 mins', 'value': 3889}, 'status': 'OK'}, {'distance': {'text': '70.4 km', 'value': 70413}, 'duration': {'text': '46 mins', 'value': 2788}, 'status': 'OK'}, {'distance': {'text': '61.5 km', 'value': 61463}, 'duration': {'text': '41 mins', 'value': 2483}, 'status': 'OK'}, {'distance': {'text': '50.5 km', 'value': 50512}, 'duration': {'text': '38 mins', 'value': 2273}, 'status': 'OK'}]}, {'elements': [{'distance': {'text': '95.3 km', 'value': 95321}, 'duration': {'text': '1 hour 2 mins', 'value': 3702}, 'status': 'OK'}, {'distance': {'text': '115 km', 'value': 115239}, 'duration': {'text': '1 hour 14 mins', 'value': 4436}, 'status': 'OK'}, {'distance': {'text': '90.2 km', 'value': 90161}, 'duration': {'text': '57 mins', 'value': 3427}, 'status': 'OK'}, {'distance': {'text': '81.2 km', 'value': 81211}, 'duration': {'text': '52 mins', 'value': 3122}, 'status': 'OK'}, {'distance': {'text': '39.8 km', 'value': 39785}, 'duration': {'text': '29 mins', 'value': 1710}, 'status': 'OK'}]}, {'elements': [{'distance': {'text': '65.5 km', 'value': 65521}, 'duration': {'text': '55 mins', 'value': 3323}, 'status': 'OK'}, {'distance': {'text': '91.4 km', 'value': 91450}, 'duration': {'text': '1 hour 2 mins', 'value': 3726}, 'status': 'OK'}, {'distance': {'text': '65.0 km', 'value': 65031}, 'duration': {'text': '44 mins', 'value': 2625}, 'status': 'OK'}, {'distance': {'text': '56.1 km', 'value': 56081}, 'duration': {'text': '39 mins', 'value': 2320}, 'status': 'OK'}, {'distance': {'text': '53.4 km', 'value': 53385}, 'duration': {'text': '40 mins', 'value': 2407}, 'status': 'OK'}]}, {'elements': [{'distance': {'text': '168 km', 'value': 167923}, 'duration': {'text': '1 hour 43 mins', 'value': 6187}, 'status': 'OK'}, {'distance': {'text': '227 km', 'value': 227118}, 'duration': {'text': '2 hours 17 mins', 'value': 8217}, 'status': 'OK'}, {'distance': {'text': '183 km', 'value': 183401}, 'duration': {'text': '1 hour 54 mins', 'value': 6818}, 'status': 'OK'}, {'distance': {'text': '163 km', 'value': 163452}, 'duration': {'text': '1 hour 43 mins', 'value': 6170}, 'status': 'OK'}, {'distance': {'text': '112 km', 'value': 112386}, 'duration': {'text': '1 hour 10 mins', 'value': 4196}, 'status': 
'OK'}]}], 'status': 'OK'}
I'm having trouble deciphering the output it's returning and figuring out how to convert it into the format I described above.
P.S. I have imported the appropriate libraries and set up my API key, etc.; it is referenced in a different cell in Jupyter.
From what I understand, you want the distance from each store in x to each store in y. What I did to achieve this is loop over the stores in both lists, sending a distance-matrix request for each origin/destination combination and appending each result to an array. I then loop over the array by index to pull each result back out. Here is the breakdown of my code:
First, I used these lines from your code:
import googlemaps

lst_store1 = ['777 Brockton Avenue, Abington MA 2351',
              '30 Memorial Drive, Avon MA',
              '250 Hartford Avenue, Bellingham MA',
              '700 Oak Street, Brockton MA',
              '591 Memorial Dr, Chicopee MA']
lst_store2 = ['55 Brooksby Village Way, Danvers MA',
              '137 Teaticket Hwy, East Falmouth MA',
              '42 Fairhaven Commons Way, Fairhaven MA',
              '374 William S Canning Blvd, Fall River MA',
              '121 Worcester Rd, Framingham MA']
Then I added the line where I put the API key. Note that to use the Distance Matrix API, you need an API key:
gmaps = googlemaps.Client(key='YOUR_API_KEY_HERE')
Then declare an empty array where I will put my results:
my_result= []
Then loop over the lst_store1 array, with an inner loop over the lst_store2 array, and append the distance-matrix result for each combination. Here is the code:
for x in lst_store1:
    for y in lst_store2:
        my_result.append(gmaps.distance_matrix(x, y))
You will get combinations like these:
orig: 777 Brockton Avenue, Abington MA 2351 dest: 55 Brooksby Village Way, Danvers MA
orig: 777 Brockton Avenue, Abington MA 2351 dest: 137 Teaticket Hwy, East Falmouth MA
orig: 777 Brockton Avenue, Abington MA 2351 dest: 42 Fairhaven Commons Way, Fairhaven MA
orig: 777 Brockton Avenue, Abington MA 2351 dest: 374 William S Canning Blvd, Fall River MA
orig: 777 Brockton Avenue, Abington MA 2351 dest: 121 Worcester Rd, Framingham MA
orig: 30 Memorial Drive, Avon MA dest: 55 Brooksby Village Way, Danvers MA
orig: 30 Memorial Drive, Avon MA dest: 137 Teaticket Hwy, East Falmouth MA
... and so on. You'll have 25 combinations from your list.
Then I loop over range(len(my_result)) so that we can use the index. You can also edit the format in which the output of the array is printed:
for z in range(len(my_result)):
    print("{}- from:{} to:{} distance:{}".format(z + 1,
          my_result[z]['origin_addresses'],
          my_result[z]['destination_addresses'],
          my_result[z]['rows'][0]['elements'][0]['distance']['value']))
This is sample output from the loop, which should be easier for you to sort:
1- from:['777 Brockton Ave, Abington, MA 02351, USA'] to:['55 Brooksby Village Dr, Danvers, MA 01923, USA'] distance:65015
2- from:['777 Brockton Ave, Abington, MA 02351, USA'] to:['137 Teaticket Hwy, Teaticket, MA 02536, USA'] distance:88198
3- from:['777 Brockton Ave, Abington, MA 02351, USA'] to:['42 Fairhaven Commons Way, Fairhaven, MA 02719, USA'] distance:72367
4- from:['777 Brockton Ave, Abington, MA 02351, USA'] to:['374 William S Canning Blvd, Fall River, MA 02721, USA'] distance:63418
5- from:['777 Brockton Ave, Abington, MA 02351, USA'] to:['121 Worcester Rd, Framingham, MA 01701, USA'] distance:58690
6- from:['30 Memorial Dr, Avon, MA 02322, USA'] to:['55 Brooksby Village Dr, Danvers, MA 01923, USA'] distance:62649
7- from:['30 Memorial Dr, Avon, MA 02322, USA'] to:['137 Teaticket Hwy, Teaticket, MA 02536, USA'] distance:96832
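To get from that flat list to the matrix layout asked about, one option is to reshape the collected values into a DataFrame; a minimal sketch, assuming my_result was filled by the nested loop above (row-major: store1 outer, store2 inner):
import pandas as pd

# pull out the metre values, then cut the flat list into rows of len(lst_store2)
distances = [r['rows'][0]['elements'][0]['distance']['value'] for r in my_result]
n = len(lst_store2)
matrix = pd.DataFrame([distances[i:i + n] for i in range(0, len(distances), n)],
                      index=lst_store1,     # origins down the side
                      columns=lst_store2)   # destinations across the top
print(matrix)
Also worth noting: the single gmaps.distance_matrix(lst_store1, lst_store2) call from the question already returns all 25 results in one response (one entry in 'rows' per origin, one entry in 'elements' per destination), so the same matrix can be built from one request instead of 25.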
Hope this helps!

Rows not appending to dataframe while on loop

I was working through a database and creating a dataframe of selected information. The database can be found at www.cricsheet.org.
The code for the same is:
bat = {'Name': [], 'Runs': [], 'Balls': [], 'StrikeR': []}
batsman = pd.DataFrame(bat)
batsman.head()

index = ['Name', 'Runs', 'Balls', 'StrikeR']
data = []
count = 0
for i in items[0]["1st innings"]["deliveries"]:
    name = list(i.values())[0]["batsman"]
    run = list(i.values())[0]["runs"]["batsman"]
    if name in list(batsman['Name']):
        batsman.loc[batsman.Name == name].Runs += run
        batsman.loc[batsman.Name == name].Balls += 1
        batsman.loc[batsman.Name == name].StrikeR = batsman.loc[batsman.Name == name].Runs / batsman.loc[batsman.Name == name].Balls
    else:
        data = [name, run, 1, run]
        print(data)
        batsman.append(pd.Series(data, index=index), ignore_index=True)
To give context, the printed data arrays look like:
['GC Smith', 0, 1, 0]
['HH Dippenaar', 0, 1, 0]
['HH Dippenaar', 0, 1, 0]
['HH Dippenaar', 2, 1, 2]
['HH Dippenaar', 0, 1, 0]
I was hoping to collect this data in a pandas dataframe; however, the rows are not appending to the dataframe. Can anyone tell me why, and what the solution is?
Edit: I am adding part of the items[0] dataset.
{'1st innings': {'team': 'South Africa', 'deliveries': [{0.1: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.2: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.3: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.4: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.5: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}, {0.6: {'batsman': 'GC Smith', 'bowler': 'WPUJC Vaas', 'non_striker': 'HH Dippenaar', 'runs': {'batsman': 0, 'extras': 0, 'total': 0}}}
Hi,
Appending to a dataframe doesn't happen in place: append only returns a new dataframe containing the appended value and does not modify the original dataframe.
So,
batsman.append(pd.Series(data, index = index), ignore_index=True)
Should be
batsman = batsman.append(pd.Series(data, index = index), ignore_index=True)
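Note that DataFrame.append was deprecated in pandas 1.4 and removed in pandas 2.0. On current versions the same row-add can be written with pd.concat:
new_row = pd.DataFrame([data], columns=index)   # data and index as defined above
batsman = pd.concat([batsman, new_row], ignore_index=True)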

pandas.io.json.json_normalize with very nested json

I have been trying to normalize a very nested json file I will later analyze. What I am struggling with is how to go more than one level deep to normalize.
I went through the pandas.io.json.json_normalize documentation, since it does exactly what I want it to do.
I have been able to normalize part of it and now understand how dictionaries work, but I am still not there.
With below code I am able to get only the first level.
import json
import pandas as pd
from pandas.io.json import json_normalize
with open('authors_sample.json') as f:
    d = json.load(f)

raw = json_normalize(d['hits']['hits'])

authors = json_normalize(data=d['hits']['hits'],
                         record_path='_source',
                         meta=['_id', ['_source', 'journal'], ['_source', 'title'],
                               ['_source', 'normalized_venue_name']])
I am trying to 'dig' into the 'authors' dictionary with the code below, but record_path = ['_source', 'authors'] throws TypeError: string indices must be integers. As far as I understand json_normalize, the logic should be good, but I still don't quite understand how to dive into a JSON with dict vs. list.
I even went through this simple example.
authors = json_normalize(data=d['hits']['hits'],
                         record_path=['_source', 'authors'],
                         meta=['_id', ['_source', 'journal'], ['_source', 'title'],
                               ['_source', 'normalized_venue_name']])
Below is a chunk of the json file (5 records).
{u'_shards': {u'failed': 0, u'successful': 5, u'total': 5},
u'hits': {u'hits': [{u'_id': u'7CB3F2AD',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': None,
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Physical Review Letters',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'phys rev lett',
u'pages': None,
u'parent_keywords': [u'Chromatography',
u'Quantum mechanics',
u'Particle physics',
u'Quantum field theory',
u'Analytical chemistry',
u'Quantum chromodynamics',
u'Physics',
u'Mass spectrometry',
u'Chemistry'],
u'pub_date': u'1987-03-02 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'mass spectra', u'elementary particles', u'bound states'],
u'title': u'Evidence for a new meson: A quasinuclear NN-bar bound state',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Physical Review Letters',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7AF8EBC3',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'affiliations': [u'Punjabi University'],
u'author_id': u'780E3459',
u'author_name': u'munish puri'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'48D92C79',
u'author_name': u'rajesh dhaliwal'},
{u'affiliations': [u'Punjabi University'],
u'author_id': u'7D9BD37C',
u'author_name': u'r s singh'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'Journal of Industrial Microbiology & Biotechnology',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'j ind microbiol biotechnol',
u'pages': None,
u'parent_keywords': [u'Nuclear medicine',
u'Psychology',
u'Hydrology',
u'Chromatography',
u'X-ray crystallography',
u'Nuclear fusion',
u'Medicine',
u'Fluid dynamics',
u'Thermodynamics',
u'Physics',
u'Gas chromatography',
u'Radiobiology',
u'Engineering',
u'Organic chemistry',
u'High-performance liquid chromatography',
u'Chemistry',
u'Organic synthesis',
u'Psychotherapist'],
u'pub_date': u'2008-04-04 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'flow rate',
u'operant conditioning',
u'packed bed reactor',
u'immobilized enzyme',
u'specific activity'],
u'title': u'Development of a stable continuous flow immobilized enzyme reactor for the hydrolysis of inulin',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'Journal of Industrial Microbiology & Biotechnology',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7521A721',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7FF872BC',
u'author_name': u'barbara eileen ryan'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Social science',
u'Politics',
u'Sociology',
u'Law'],
u'pub_date': u'1992-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'social movements'],
u'title': u"Feminism and the women's movement : dynamics of change in social movement ideology, and activism",
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7DAEB9A4',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'0299B8E9',
u'author_name': u'fraser j harbutt'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Superconductivity',
u'Nuclear fusion',
u'Geology',
u'Chemistry',
u'Metallurgy'],
u'pub_date': u'1988-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'iron'],
u'title': u'The iron curtain : Churchill, America, and the origins of the Cold War',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'},
{u'_id': u'7B3236C5',
u'_index': u'scibase_listings',
u'_score': 1.0,
u'_source': {u'authors': [{u'author_id': u'7DAB7B72',
u'author_name': u'richard m freeland'}],
u'deleted': 0,
u'description': None,
u'doi': u'',
u'is_valid': 1,
u'issue': None,
u'journal': u'The American Historical Review',
u'link': None,
u'meta_description': None,
u'meta_keywords': None,
u'normalized_venue_name': u'american historical review',
u'pages': None,
u'parent_keywords': [u'Political Science', u'Economics'],
u'pub_date': u'1985-01-01 00:00:00',
u'pubtype': None,
u'rating_avg_weighted': 0,
u'rating_clarity': 0.0,
u'rating_clarity_weighted': 0.0,
u'rating_innovation': 0.0,
u'rating_innovation_weighted': 0.0,
u'rating_num_weighted': 0,
u'rating_reproducability': 0,
u'rating_reproducibility_weighted': 0.0,
u'rating_versatility': 0.0,
u'rating_versatility_weighted': 0.0,
u'review_count': 0,
u'tag': [u'foreign policy'],
u'title': u'The Truman Doctrine and the origins of McCarthyism : foreign policy, domestic politics, and internal security, 1946-1948',
u'userAvg': 0.0,
u'user_id': None,
u'venue_name': u'The American Historical Review',
u'views_count': 0,
u'volume': None},
u'_type': u'listing'}],
u'max_score': 1.0,
u'total': 36429433},
u'timed_out': False,
u'took': 170}
In the pandas example (below), what do the brackets mean? Is there a logic to be followed to go deeper with the []? [...]
result = json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
Each string or list of strings in the ['state', 'shortname', ['info', 'governor']] value is a path to an element to include, in addition to the selected rows. The second json_normalize() argument (record_path, set to 'counties' in the documentation example) tells the function how to select the elements from the input data structure that make up the rows in the output, and the meta paths add further metadata that will be included with each of the rows. Think of these as table joins in a database, if you will.
The input for the US States documentation example has two dictionaries in a list, and both of these dictionaries have a counties key that references another list of dicts:
>>> data = [{'state': 'Florida',
... 'shortname': 'FL',
... 'info': {'governor': 'Rick Scott'},
... 'counties': [{'name': 'Dade', 'population': 12345},
... {'name': 'Broward', 'population': 40000},
... {'name': 'Palm Beach', 'population': 60000}]},
... {'state': 'Ohio',
... 'shortname': 'OH',
... 'info': {'governor': 'John Kasich'},
... 'counties': [{'name': 'Summit', 'population': 1234},
... {'name': 'Cuyahoga', 'population': 1337}]}]
>>> pprint(data[0]['counties'])
[{'name': 'Dade', 'population': 12345},
{'name': 'Broward', 'population': 40000},
{'name': 'Palm Beach', 'population': 60000}]
>>> pprint(data[1]['counties'])
[{'name': 'Summit', 'population': 1234},
{'name': 'Cuyahoga', 'population': 1337}]
Between them there are 5 rows of data to use in the output:
>>> json_normalize(data, 'counties')
name population
0 Dade 12345
1 Broward 40000
2 Palm Beach 60000
3 Summit 1234
4 Cuyahoga 1337
The meta argument then names some elements that live next to those counties lists, and those are then merged in separately. The values from the first data[0] dictionary for those meta elements are ('Florida', 'FL', 'Rick Scott'), respectively, and for data[1] the values are ('Ohio', 'OH', 'John Kasich'), so you see those values attached to the counties rows that came from the same top-level dictionary, repeated 3 and 2 times respectively:
>>> data[0]['state'], data[0]['shortname'], data[0]['info']['governor']
('Florida', 'FL', 'Rick Scott')
>>> data[1]['state'], data[1]['shortname'], data[1]['info']['governor']
('Ohio', 'OH', 'John Kasich')
>>> json_normalize(data, 'counties', ['state', 'shortname', ['info', 'governor']])
name population state shortname info.governor
0 Dade 12345 Florida FL Rick Scott
1 Broward 40000 Florida FL Rick Scott
2 Palm Beach 60000 Florida FL Rick Scott
3 Summit 1234 Ohio OH John Kasich
4 Cuyahoga 1337 Ohio OH John Kasich
So, if you pass in a list for the meta argument, then each element in the list is a separate path, and each of those separate paths identifies data to add to the rows in the output.
In your example JSON, only a few nested lists could be elevated with the first argument the way 'counties' was in the example. The only candidate in that data structure is the nested 'authors' key; you'd have to extract each ['_source', 'authors'] path, after which you can add other keys from the parent object to augment those rows.
The meta argument then pulls in the _id key from the outermost objects, followed by the nested ['_source', 'title'] and ['_source', 'journal'] paths.
The record_path argument takes the authors lists as the starting point; these look like:
>>> d['hits']['hits'][0]['_source']['authors'] # this value is None, and is skipped
>>> d['hits']['hits'][1]['_source']['authors']
[{'affiliations': ['Punjabi University'],
'author_id': '780E3459',
'author_name': 'munish puri'},
{'affiliations': ['Punjabi University'],
'author_id': '48D92C79',
'author_name': 'rajesh dhaliwal'},
{'affiliations': ['Punjabi University'],
'author_id': '7D9BD37C',
'author_name': 'r s singh'}]
>>> d['hits']['hits'][2]['_source']['authors']
[{'author_id': '7FF872BC',
'author_name': 'barbara eileen ryan'}]
>>> # etc.
and so gives you the following rows:
>>> json_normalize(d['hits']['hits'], ['_source', 'authors'])
affiliations author_id author_name
0 [Punjabi University] 780E3459 munish puri
1 [Punjabi University] 48D92C79 rajesh dhaliwal
2 [Punjabi University] 7D9BD37C r s singh
3 NaN 7FF872BC barbara eileen ryan
4 NaN 0299B8E9 fraser j harbutt
5 NaN 7DAB7B72 richard m freeland
and then we can use the third meta argument to add more columns like _id, _source.title and _source.journal, using ['_id', ['_source', 'journal'], ['_source', 'title']]:
>>> json_normalize(
...     d['hits']['hits'],
...     ['_source', 'authors'],
...     ['_id', ['_source', 'journal'], ['_source', 'title']]
... )
affiliations author_id author_name _id \
0 [Punjabi University] 780E3459 munish puri 7AF8EBC3
1 [Punjabi University] 48D92C79 rajesh dhaliwal 7AF8EBC3
2 [Punjabi University] 7D9BD37C r s singh 7AF8EBC3
3 NaN 7FF872BC barbara eileen ryan 7521A721
4 NaN 0299B8E9 fraser j harbutt 7DAEB9A4
5 NaN 7DAB7B72 richard m freeland 7B3236C5
_source.journal
0 Journal of Industrial Microbiology & Biotechno...
1 Journal of Industrial Microbiology & Biotechno...
2 Journal of Industrial Microbiology & Biotechno...
3 The American Historical Review
4 The American Historical Review
5 The American Historical Review
_source.title \
0 Development of a stable continuous flow immobi...
1 Development of a stable continuous flow immobi...
2 Development of a stable continuous flow immobi...
3 Feminism and the women's movement : dynamics o...
4 The iron curtain : Churchill, America, and the...
5 The Truman Doctrine and the origins of McCarth...
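A small version note: pandas.io.json.json_normalize was deprecated in pandas 1.0 in favour of the top-level pd.json_normalize, which takes the same arguments:
# equivalent call on pandas >= 1.0
df = pd.json_normalize(d['hits']['hits'],
                       record_path=['_source', 'authors'],
                       meta=['_id', ['_source', 'journal'], ['_source', 'title']])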
You can also have a look at the library flatten_json, which does not require you to write column hierarchies as in json_normalize:
from flatten_json import flatten
data = d['hits']['hits']
dict_flattened = (flatten(record, '.') for record in data)
df = pd.DataFrame(dict_flattened)
print(df)
See https://github.com/amirziai/flatten.
Adding to Sander's comment: more context can be found in the Medium post by the creator of this function:
https://towardsdatascience.com/flattening-json-objects-in-python-f5343c794b10
It is worth keeping in mind that pandas' json_normalize can handle most JSON objects, arrays for example. The flatten_json library requires a nested dict. However, you can work around this requirement by wrapping the array in a dict like so:
flatten({'response':data}, '.')
In this case, the flatten_json library uses a counter in the dot notation to distinguish duplicates. For example:
flatten({
    'response': [
        {'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '3'},
         'segments': {'date': '2022-12-01'}},
        {'metrics': {'clicks': '1', 'cost_micros': '609240', 'impressions': '358'},
         'segments': {'date': '2022-12-01'}},
        {'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '3'},
         'segments': {'date': '2022-12-02'}},
        {'metrics': {'clicks': '2', 'cost_micros': '40000', 'impressions': '291'},
         'segments': {'date': '2022-12-02'}},
        {'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '2'},
         'segments': {'date': '2022-12-03'}},
        {'metrics': {'clicks': '2', 'cost_micros': '337754', 'impressions': '241'},
         'segments': {'date': '2022-12-03'}},
        {'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '4'},
         'segments': {'date': '2022-12-04'}},
        {'metrics': {'clicks': '2', 'cost_micros': '757299', 'impressions': '197'},
         'segments': {'date': '2022-12-04'}}
    ]
}, '.')
Produces:
{'response.0.metrics.clicks': '0',
'response.0.metrics.cost_micros': '0',
'response.0.metrics.impressions': '3',
'response.0.segments.date': '2022-12-01',
'response.1.metrics.clicks': '1',
'response.1.metrics.cost_micros': '609240',
'response.1.metrics.impressions': '358',
'response.1.segments.date': '2022-12-01',
'response.2.metrics.clicks': '0',
'response.2.metrics.cost_micros': '0',
'response.2.metrics.impressions': '3',
'response.2.segments.date': '2022-12-02',
'response.3.metrics.clicks': '2',
'response.3.metrics.cost_micros': '40000',
'response.3.metrics.impressions': '291',
'response.3.segments.date': '2022-12-02',
'response.4.metrics.clicks': '0',
'response.4.metrics.cost_micros': '0',
'response.4.metrics.impressions': '2',
'response.4.segments.date': '2022-12-03',
'response.5.metrics.clicks': '2',
'response.5.metrics.cost_micros': '337754',
'response.5.metrics.impressions': '241',
'response.5.segments.date': '2022-12-03',
'response.6.metrics.clicks': '0',
'response.6.metrics.cost_micros': '0',
'response.6.metrics.impressions': '4',
'response.6.segments.date': '2022-12-04',
'response.7.metrics.clicks': '2',
'response.7.metrics.cost_micros': '757299',
'response.7.metrics.impressions': '197',
'response.7.segments.date': '2022-12-04'}
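If one row per list element is what you actually want, rather than one numbered column per element, normalizing the list directly may be the better fit; a sketch on a trimmed version of the data above:
import pandas as pd

response = [{'metrics': {'clicks': '0', 'cost_micros': '0', 'impressions': '3'},
             'segments': {'date': '2022-12-01'}},
            {'metrics': {'clicks': '1', 'cost_micros': '609240', 'impressions': '358'},
             'segments': {'date': '2022-12-01'}}]

df = pd.json_normalize(response)   # dotted columns, one row per record
print(df)
#   metrics.clicks metrics.cost_micros metrics.impressions segments.date
# 0              0                   0                   3    2022-12-01
# 1              1              609240                 358    2022-12-01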
