I wrote code that makes requests to an API and receives the output as JSON. My question is how to write the output of each request to a file. Right now my code only writes the result of the last request.
import requests
import json
with open("query4.txt", "rt") as file:
data_file = file.read()
for line in data_file.split("\n"):
drX, drY, fromX, fromY, dist = line.split(",")
url = "https://api.openrouteservice.org/directions?"
params = [
["api_key", "my_api_key"],
["coordinates", "%s,%s|%s,%s" % (drY, drX, fromY, fromX)],
["profile", "driving-car"]
]
headers = {
"Accept": "application/json, application/geo+json,"
"application/gpx+xml, img/png; charset=utf-8"}
responce = requests.get(url=url, params=params, headers=headers)
# print(responce.url)
# print(responce.text)
result = json.loads(responce.text)
# print(result)
with open("result.txt", "w+") as f_o:
for rows in result["routes"]:
f_o.writelines(json.dumps(rows["summary"]["distance"])) # depending on how do you want the result
print(result["routes"])
I have an output like this:
{'routes': [{'warnings': [{'code': 1, 'message': 'There may be restrictions on some roads'}], 'summary': {'distance': 899.6, 'duration': 102.1}, 'geometry_format': 'encodedpolyline', 'geometry': 'u~uiHir{iFb#SXzADTlAk#JOJ]#_#CWW{AKo#k#eDEYKo#y#{EGc#G]GYCOa#gCc#iCoBsLNGlAm#VK^Sh#Un#tD', 'segments': [{'distance': 899.6, 'duration': 102.1, 'steps': [{'distance': 22.1, 'duration': 5.3, 'type': 11, 'instruction': 'Head south', 'name': '', 'way_points': [0, 1]}, {'distance': 45.4, 'duration': 10.9, 'type': 1, 'instruction': 'Turn right', 'name': '', 'way_points': [1, 3]}, {'distance': 645.5, 'duration': 52.3, 'type': 0, 'instruction': 'Turn left onto Партизанська вулиця', 'name': 'Партизанська вулиця', 'way_points': [3, 21]}, {'distance': 114.4, 'duration': 20.6, 'type': 1, 'instruction': 'Turn right', 'name': '', 'way_points': [21, 26]}, {'distance': 72.1, 'duration': 13, 'type': 1, 'instruction': 'Turn right', 'name': '', 'way_points': [26, 27]}, {'distance': 0, 'duration': 0, 'type': 10, 'instruction': 'Arrive at your destination, on the left', 'name': '', 'way_points': [27, 27]}]}], 'way_points': [0, 27], 'extras': {'roadaccessrestrictions': {'values': [[0, 1, 0], [1, 3, 2], [3, 27, 0]], 'summary': [{'value': 0, 'distance': 854.2, 'amount': 94.95}, {'value': 2, 'distance': 45.4, 'amount': 5.05}]}}, 'bbox': [38.484536, 48.941171, 38.492904, 48.943022]}], 'bbox': [38.484536, 48.941171, 38.492904, 48.943022], 'info': {'attribution': 'openrouteservice.org | OpenStreetMap contributors', 'engine': {'version': '5.0.1', 'build_date': '2019-05-29T14:22:56Z'}, 'service': 'routing', 'timestamp': 1568280549854, 'query': {'profile': 'driving-car', 'preference': 'fastest', 'coordinates': [[38.485115, 48.942059], [38.492073, 48.941676]], 'language': 'en', 'units': 'm', 'geometry': True, 'geometry_format': 'encodedpolyline', 'instructions_format': 'text', 'instructions': True, 'elevation': False}}}
{'routes': [{'summary': {'distance': 2298, 'duration': 365.6}, 'geometry_format': 'encodedpolyline', 'geometry': 'u~a{Gee`zDLIvBvDpClCtA|AXHXCp#m#bBsBvBmC`AmAtIoKNVLXHPb#c#`A_AFENGzAc#XKZCJ?PDLBH#F?T?PC~CcATOt#Sd#QLKBCBAb#]ZG|#OY_DQ}IE{DC_DAg#Eg#q#aFgBuH^GjBFj#
I tried NeverHopeless's answer, but I got the same result:
result = json.loads(responce.text)
i = 0
with open(f"result-{i}.txt", "w+") as f_o:
i += 1
for rows in result["routes"]:
f_o.writelines(json.dumps(rows["summary"]["distance"])) # depending on how do you want the result
print(result["routes"])
Output now looks like this
899.622982138.832633191.8
I'm expecting to get this:
2298
2138.8
3263
3191.8
Each value is the distance returned by a different request, so I need each one on its own line.
You need to open and keep open the output file before your loop:
import requests
import json
# Query the openrouteservice directions API once per row of query4.txt and
# write every returned route distance to result.txt, one value per line.
with open("query4.txt", "rt") as file:
    data_file = file.read()

# The endpoint and headers never change, so build them once outside the loop.
url = "https://api.openrouteservice.org/directions?"
headers = {
    "Accept": "application/json, application/geo+json,"
              "application/gpx+xml, img/png; charset=utf-8"}

# Open the output file once, before the loop: reopening it in "w" mode for
# every request would truncate it and keep only the last result.
with open("result.txt", "w") as f_o:
    for line in data_file.splitlines():
        if not line.strip():
            # Skip blank rows (e.g. the one left by a trailing newline);
            # unpacking them into five fields would raise ValueError.
            continue
        # Each row is expected to hold five comma-separated fields; the last
        # one (dist) is read but not sent to the API.
        drX, drY, fromX, fromY, dist = line.split(",")
        params = [
            ["api_key", "my_api_key"],
            # Both points are sent in Y,X order.
            ["coordinates", "%s,%s|%s,%s" % (drY, drX, fromY, fromX)],
            ["profile", "driving-car"]
        ]
        response = requests.get(url=url, params=params, headers=headers)
        result = json.loads(response.text)
        for rows in result["routes"]:
            # print() appends the newline that separates the values.
            print(rows["summary"]["distance"], file=f_o)
I think it's better to write the results to separate files named with a timestamp. That way you don't overwrite your older files, and you can also find them more easily.
current_time = time.strftime("%m_%d_%y %H_%M_%S", time.localtime())
with open(current_time + ".txt", "w+") as f_o:
for rows in result["routes"]:
f_o.writelines(json.dumps(rows["summary"]["distance"])) # depending on how do you want the result
print(result["routes"])
You need to make the filename "result.txt" dynamic. Currently each write overwrites the previous content.
Perhaps like this:
i = 0 # <----- Keep it outside your for loop or it will be always set to zero
with open(f"result-{i}.txt", "w+") as f_o:
i += 1
Or instead of integers, you may better use timestamp in filename.
Related
I have 70k files all of which look similar to this:
{'id': 24, 'name': None, 'city': 'City', 'region_id': 19,
'story_id': 1, 'description': 'text', 'uik': None, 'ustatus': 'status',
'wuiki_tik_name': '', 'reaction': None, 'reaction_official': '',
'created_at': '2011-09-07T07:24:44.420Z', 'lat': 54.7, 'lng': 20.5,
'regions': {'id': 19, 'name': 'name'}, 'stories': {'id': 1, 'name': '2011-12-04'}, 'assets': [], 'taggings': [{'tags': {'id': 6, 'name': 'name',
'tag_groups': {'id': 3, 'name': 'Violation'}}},
{'tags': {'id': 8, 'name': 'name', 'tag_groups': {'id': 5, 'name': 'resource'}}},
{'tags': {'id': 1, 'name': '01. Federal', 'tag_groups': {'id': 1, 'name': 'Level'}}},
{'tags': {'id': 3, 'name': '03. Local', 'tag_groups': {'id': 1, 'name': 'stuff'}}},
{'tags': {'id': 2, 'name': '02. Regional', 'tag_groups':
{'id': 1, 'name': 'Level'}}}], 'message_id': None, '_count': {'assets': 0, 'other_messages': 0, 'similars': 0, 'taggings': 5}}
The ultimate goal is to export it into a single CSV file. It can be successfully done without flattening. But since it has a lot of nested values, I would like to flatten it, and this is where I began facing problems related to data types. Here's the code:
import json
from pandas.io.json import json_normalize
import glob
path = glob.glob("all_messages/*.json")
for file in path:
with open(file, "r") as filer:
content = json.loads(json.dumps(filer.read()))
if content != 404:
df_main = json_normalize(content)
df_regions = json_normalize(content, record_path=['regions'], record_prefix='regions.', meta=['id'])
df_stories = json_normalize(content, record_path=['stories'], record_prefix='stories.', meta=['id'])
#... More code related to normalization
df_out.to_csv('combined_json.csv')
This code occasionally throws:
AttributeError: 'str' object has no attribute 'values' or ValueError: DataFrame constructor not properly called!. I realise that this is caused by json.dumps() JSON string output. However, I have failed to turn it into anything useable.
Any possible solutions to this?
If you only need to change ' to ":
...
for file in path:
    with open(file, "r") as filer:
        # A file object has no .replace() method: read the text first, then
        # swap single quotes for double quotes and keep the result.
        content = filer.read().replace("'", '"')
...
Making copies and using grep would be easier
While it is not the solution I was initially expecting, this approach worked as well. I kept getting error messages related to the structure of the dict literals that were reluctant to become json, so I took the csv file that I wanted to normalise and worked with each column one by one:
df = pd.read_csv("combined_json.csv")
df['regions'] = df['regions'].apply(lambda x: x.replace("'", '"'))
regions = pd.json_normalize(df['regions'].apply(json.loads).tolist()).rename(
columns=lambda x: x.replace('regions.', ''))
df['regions'] = regions['name']
Or, if it had more nested levels:
df['taggings'] = df['taggings'].apply(lambda x: x.replace("'", '"'))
taggings = pd.concat([pd.json_normalize(json.loads(j)) for j in df['taggings']])
df = df.reset_index(drop=True)
taggings = taggings.reset_index(drop=True)
df[['tags_id', 'nametag', 'group_tag', 'group_tag_name']] = taggings[['tags.id', 'tags.name', 'tags.tag_groups.id', 'tags.tag_groups.name']]
Which was eventually df.to_csv().
#This is my code
import pandas as pd
import bson
FILE="users_(1).bson"
with open(FILE,'rb') as f:
data = bson.decode_all(f.read())
main_df=pd.DataFrame(data)
main_df.describe()
#This is my .bson file
[{'_id': ObjectId('999f24f260f653401b'),
'isV2': False,
'isBeingMigratedToV2': False,
'firstName': 'Jezz',
'lastName': 'Bezos',
'subscription': {'_id': ObjectId('999f24f260f653401b'),
'chargebeeId': 'AzZdd6T847kHQ',
'currencyCode': 'EUR',
'customerId': 'AzZdd6T847kHQ',
'nextBillingAt': datetime.datetime(2022, 7, 7, 10, 14, 6),
'numberOfMonthsPaid': 1,
'planId': 'booster-v3-eur',
'startedAt': datetime.datetime(2022, 6, 7, 10, 14, 6),
'addons': [],
'campaign': None,
'maskedCardNumber': '************1234'},
'email': 'jeffbezos#gmail.com',
'groupName': None,
'username': 'jeffbezy',
'country': 'DE'},
{'_id': ObjectId('999f242660f653401b'),
'isV2': False,
'isBeingMigratedToV2': False,
'firstName': 'Caterina',
'lastName': 'Fake',
'subscription': {'_id': ObjectId('999f242660f653401b'),
'chargebeeId': '16CGLYT846t99',
'currencyCode': 'GBP',
'customerId': '16CGLYT846t99',
'nextBillingAt': datetime.datetime(2022, 7, 7, 10, 10, 41),
'numberOfMonthsPaid': 1,
'planId': 'personal-v3-gbp',
'startedAt': datetime.datetime(2022, 6, 7, 10, 10, 41),
'addons': [],
'campaign': None,
'maskedCardNumber': '************4311'},
'email': 'caty.fake#gmail.com',
'groupName': None,
'username': 'cfake',
'country': 'GB'}]
I get the error
'bson.errors.InvalidBSON: objsize too large'
Is it something to do with the datetime values, or with the structure of the .bson file? I have been at this for hours and can't see the error. I know how to work with JSON and tried to convert the file to JSON, but without success. Any tips would be appreciated.
If the main goal here is to read the data into a pandas DataFrame you could indeed format the data to json and use bson.json_util.loads:
import pandas as pd
from bson.json_util import loads
with open(filepath,'r') as f:
data = f.read()
mapper = {
'\'': '"', # using double quotes
'False': 'false',
'None': '\"None\"', # double quotes around None
# modifying the ObjectIds and timestamps
'("': '(',
'")': ')',
')': ')"',
'ObjectId': '"ObjectId',
'datetime.datetime': '"datetime.datetime'
}
for k, v in mapper.items():
data = data.replace(k, v)
data = loads(data)
df = pd.DataFrame(data)
I need to get certain values out of a list of dictionaries, which looks like that and is assigned to the variable 'segment_values':
[{'distance': 114.6,
'duration': 20.6,
'instruction': 'Head north',
'name': '-',
'type': 11,
'way_points': [0, 5]},
{'distance': 288.1,
'duration': 28.5,
'instruction': 'Turn right onto Knaufstraße',
'name': 'Knaufstraße',
'type': 1,
'way_points': [5, 17]},
{'distance': 3626.0,
'duration': 273.0,
'instruction': 'Turn slight right onto B320',
'name': 'B320',
'type': 5,
'way_points': [17, 115]},
{'distance': 54983.9,
'duration': 2679.3,
'instruction': 'Keep right onto Ennstal-Bundesstraße, B320',
'name': 'Ennstal-Bundesstraße, B320',
'type': 13,
'way_points': [115, 675]},
{'distance': 11065.1,
'duration': 531.1,
'instruction': 'Keep left onto Pyhrn Autobahn, A9',
'name': 'Pyhrn Autobahn, A9',
'type': 12,
'way_points': [675, 780]},
{'distance': 800.7,
'duration': 64.1,
'instruction': 'Keep right',
'name': '-',
'type': 13,
'way_points': [780, 804]},
{'distance': 49.6,
'duration': 4.0,
'instruction': 'Keep left',
'name': '-',
'type': 12,
'way_points': [804, 807]},
{'distance': 102057.2,
'duration': 4915.0,
'instruction': 'Keep right',
'name': '-',
'type': 13,
'way_points': [807, 2104]},
{'distance': 56143.4,
'duration': 2784.5,
'instruction': 'Keep left onto S6',
'name': 'S6',
'type': 12,
'way_points': [2104, 2524]},
{'distance': 7580.6,
'duration': 389.8,
'instruction': 'Keep left',
'name': '-',
'type': 12,
'way_points': [2524, 2641]},
{'distance': 789.0,
'duration': 63.1,
'instruction': 'Keep right',
'name': '-',
'type': 13,
'way_points': [2641, 2663]},
{'distance': 815.9,
'duration': 65.3,
'instruction': 'Keep left',
'name': '-',
'type': 12,
'way_points': [2663, 2684]},
{'distance': 682.9,
'duration': 54.6,
'instruction': 'Turn left onto Heinrich-Drimmel-Platz',
'name': 'Heinrich-Drimmel-Platz',
'type': 0,
'way_points': [2684, 2711]},
{'distance': 988.1,
'duration': 79.0,
'instruction': 'Turn left onto Arsenalstraße',
'name': 'Arsenalstraße',
'type': 0,
'way_points': [2711, 2723]},
{'distance': 11.7,
'duration': 2.1,
'instruction': 'Turn left',
'name': '-',
'type': 0,
'way_points': [2723, 2725]},
{'distance': 0.0,
'duration': 0.0,
'instruction': 'Arrive at your destination, on the left',
'name': '-',
'type': 10,
'way_points': [2725, 2725]}]
I need to get the duration values and the waypoint values out of that code segment.
For the duration I tried:
segment_values= data['features'][0]['properties']['segments'][0]['steps'] #gets me the above code
print(segment_values[0:]['duration'])
Shouldn't this print me all dictionaries, and the values at duration in each of them?
I also tried this:
duration = data['features'][0]['properties']['segments'][0]['steps'][0:]['duration']
print(duration)
Both attempts give me "TypeError: list indices must be integers or slices, not str".
Where am I going wrong?
Your data is a list of dictionaries.
For this reason you need to cycle through its content in order to access data.
Please try this print statement to look at the data more closely:
for item in data_list:
print(item)
In order to access duration per each item you can use similar code:
for item in data_list:
print(item['duration'])
You can also use list comprehension to achieve the same result:
duration = [item['duration'] for item in data_list]
List comprehension is a Pythonic way to obtain the same result, you can read more about it here.
The same principle can be applied twice if a key in your data contains a list or another iterable, here's another example:
for item in data:
print("\nPrinting waypoints for name: " + item['name'])
for way_point in item['way_points']:
print(way_point)
duration = [x['duration'] for x in segment_values]
waypoints =[x['way_points'] for x in segment_values]
You might be thinking of higher-level wrappers like pandas, which would let you do
>>> import pandas as pd
>>> import numpy as np
>>> df = pd.DataFrame(np.random.randn(3, 2), index=list('abc'), columns=list('xy'))
>>> df
x y
a -0.192041 -0.312067
b -0.595700 0.339085
c -0.524242 0.946350
>>> df.x
a -0.192041
b -0.595700
c -0.524242
Name: x, dtype: float64
>>> df[0:].x
a -0.192041
b -0.595700
c -0.524242
Name: x, dtype: float64
>>> df[1:].y
b 0.339085
c 0.946350
Name: y, dtype: float64
Another tool for this is glom, which provides helpers for logic like this (pip install glom).
>>> from glom import glom
>>> from pprint import pprint
>>> data = <your data>
>>> pprint(glom(data, [{'wp': 'way_points', 'dist': 'distance'}]))
[{'dist': 114.6, 'wp': [0, 5]},
{'dist': 288.1, 'wp': [5, 17]},
{'dist': 3626.0, 'wp': [17, 115]},
{'dist': 54983.9, 'wp': [115, 675]},
{'dist': 11065.1, 'wp': [675, 780]},
{'dist': 800.7, 'wp': [780, 804]},
{'dist': 49.6, 'wp': [804, 807]},
{'dist': 102057.2, 'wp': [807, 2104]},
{'dist': 56143.4, 'wp': [2104, 2524]},
{'dist': 7580.6, 'wp': [2524, 2641]},
{'dist': 789.0, 'wp': [2641, 2663]},
{'dist': 815.9, 'wp': [2663, 2684]},
{'dist': 682.9, 'wp': [2684, 2711]},
{'dist': 988.1, 'wp': [2711, 2723]},
{'dist': 11.7, 'wp': [2723, 2725]},
{'dist': 0.0, 'wp': [2725, 2725]}]
You can get a feel on how other cases might work from the documentation:
https://glom.readthedocs.io/en/latest/faq.html#how-does-glom-work
def glom(target, spec):
    """Simplified illustration of how glom dispatches on the type of *spec*.

    The spec is checked, in order, as a string/Path, a callable, a dict,
    a list and a tuple; any other type raises TypeError. The names
    ``basestring``, ``Path``, ``_get_path`` and ``_get_iterator`` are
    defined outside this excerpt.
    """
    # String or Path spec: perform a deep-get on the target.
    if isinstance(spec, (basestring, Path)):
        return _get_path(target, spec)

    # Callable spec: call it on the target.
    if callable(spec):
        return spec(target)

    # Dict spec: each key receives the result of glomming the target with
    # the sub-spec stored under that key.
    if isinstance(spec, dict):
        return {field: glom(target, subspec)
                for field, subspec in spec.items()}

    # List spec: apply its first element to every item of the target and
    # collect the results in a new list.
    if isinstance(spec, list):
        subspec = spec[0]
        return [glom(item, subspec) for item in _get_iterator(target)]

    # Tuple spec: chain the sub-specs, feeding each result into the next.
    if isinstance(spec, tuple):
        res = target
        for subspec in spec:
            res = glom(res, subspec)
        return res

    raise TypeError('expected one of the above types')
I am trying to get familiar with APIs in general. To test this, I wrote this little snippet to get a list of all the channels on the Swedish national public service radio, and I want to print the ID and NAME of each channel:
import requests as rq
import json
from pprint import pprint
resp = rq.get('http://api.sr.se/api/v2/channels?
format=json&indent=TRUE')
respjson = json.loads(resp.text)
pprint (respjson['id'])
And I get the error
File "sr-api.py", line 9, in <module>
pprint (respjson['id']['name'])
KeyError: 'id'
The (abbreviated) 'respjson' looks like this
{'channels': [{'channeltype': 'Rikskanal',
'color': '31a1bd',
'id': 132,
'image': 'http://static-cdn.sr.se/sida/images/132/2186745_512_512.jpg?preset=api-default-square',
'imagetemplate': 'http://static-cdn.sr.se/sida/images/132/2186745_512_512.jpg',
'liveaudio': {'id': 132,
'statkey': '/app/direkt/p1[k(132)]',
'url': 'http://sverigesradio.se/topsy/direkt/srapi/132.mp3'},
'name': 'P1',
'scheduleurl': 'http://api.sr.se/v2/scheduledepisodes?channelid=132',
'siteurl': 'http://sverigesradio.se/p1',
'xmltvid': 'p1.sr.se'},
{'channeltype': 'Lokal kanal',
'color': 'c31eaa',
'id': 200,
'image': 'http://static-cdn.sr.se/sida/images/200/2186775_512_512.jpg?preset=api-default-square',
'imagetemplate': 'http://static-cdn.sr.se/sida/images/200/2186775_512_512.jpg',
'liveaudio': {'id': 200,
'statkey': '/app/direkt/p4 jämtland[k(200)]',
'url': 'http://sverigesradio.se/topsy/direkt/srapi/200.mp3'},
'name': 'P4 Jämtland',
'scheduleurl': 'http://api.sr.se/v2/scheduledepisodes?channelid=200',
'siteurl': 'http://sverigesradio.se/jamtland/',
'xmltvid': 'p4jmtl.sr.se'}],
'copyright': 'Copyright Sveriges Radio 2017. All rights reserved.',
'pagination': {'nextpage': 'http://api.sr.se/v2/channelsformat=json&indent=true&page=2',
'page': 1,
'size': 10,
'totalhits': 55,
'totalpages': 6}}
Channels is a list. You have to iterate on it to get all channels and print their ids.
# starting from respjson
respjson = {
'channels': [
{
'channeltype': 'Rikskanal',
'color': '31a1bd',
'id': 132,
'image': 'http://static-cdn.sr.se/sida/images/132/2186745_512_512.jpg?preset=api-default-square',
'imagetemplate': 'http://static-cdn.sr.se/sida/images/132/2186745_512_512.jpg',
'liveaudio': {'id': 132,
'statkey': '/app/direkt/p1[k(132)]',
'url': 'http://sverigesradio.se/topsy/direkt/srapi/132.mp3'},
'name': 'P1',
'scheduleurl': 'http://api.sr.se/v2/scheduledepisodes?channelid=132',
'siteurl': 'http://sverigesradio.se/p1',
'xmltvid': 'p1.sr.se'},
{
'channeltype': 'Lokal kanal',
'color': 'c31eaa',
'id': 200,
'image': 'http://static-cdn.sr.se/sida/images/200/2186775_512_512.jpg?preset=api-default-square',
'imagetemplate': 'http://static-cdn.sr.se/sida/images/200/2186775_512_512.jpg',
'liveaudio': {'id': 200,
'statkey': '/app/direkt/p4 jämtland[k(200)]',
'url': 'http://sverigesradio.se/topsy/direkt/srapi/200.mp3'},
'name': 'P4 Jämtland',
'scheduleurl': 'http://api.sr.se/v2/scheduledepisodes?channelid=200',
'siteurl': 'http://sverigesradio.se/jamtland/',
'xmltvid': 'p4jmtl.sr.se'
}
],
'copyright': 'Copyright Sveriges Radio 2017. All rights reserved.',
'pagination': {
'nextpage': 'http://api.sr.se/v2/channelsformat=json&indent=true&page=2',
'page': 1,
'size': 10,
'totalhits': 55,
'totalpages': 6
}
}
for channel in respjson['channels']:
print(channel['id'])
What you want to do is loop through the dictionaries inside the channels list. You can do that with the following:
for dic in respjson['channels']:
pprint(dic['id'])
I'm trying to flatten a json file using json_normalize in Python (Pandas), but being a noob at this I always seem to end up in a KeyError.
What I would like to achieve is a DataFrame with all the Plays in a game.
I've tried numerous variants of paths and prefixes, but no success. Googled a lot as well, but I'm still falling short.
What I would like to end up with is a DataFrame like:
period, time, type, player1, player2, xcord, ycord
import pandas as pd
import json
with open('PlayByPlay.json') as data_file:
data = json.load(data_file)
from pandas.io.json import json_normalize
records = json_normalize(data)
plays = records['data.game.plays.play'][0]
plays
Would generate
{'aoi': [8470324, 8473449, 8475158, 8475215, 8477499, 8477933],
'apb': [],
'as': 0,
'asog': 0,
'desc': 'Zack Kassian hit Kyle Okposo',
'eventid': 7,
'formalEventId': 'EDM7',
'hoi': [8471678, 8475178, 8475660, 8476454, 8476457, 8476472],
'hpb': [],
'hs': 0,
'hsog': 0,
'localtime': '5:12 PM',
'p1name': 'Zack Kassian',
'p2name': 'Kyle Okposo',
'p3name': '',
'period': 1,
'pid': 8475178,
'pid1': 8475178,
'pid2': 8473449,
'pid3': '',
'playername': 'Zack Kassian',
'strength': 701,
'sweater': '44',
'teamid': 22,
'time': '00:28',
'type': 'Hit',
'xcoord': 22,
'ycoord': 38}
Json
{'data': {'game': {'awayteamid': 7,
'awayteamname': 'Buffalo Sabres',
'awayteamnick': 'Sabres',
'hometeamid': 22,
'hometeamname': 'Edmonton Oilers',
'hometeamnick': 'Oilers',
'plays': {'play': [{'aoi': [8470324,
8473449,
8475158,
8475215,
8477499,
8477933],
'apb': [],
'as': 0,
'asog': 0,
'desc': 'Zack Kassian hit Kyle Okposo',
'eventid': 7,
'formalEventId': 'EDM7',
'hoi': [8471678, 8475178, 8475660, 8476454, 8476457, 8476472],
'hpb': [],
'hs': 0,
'hsog': 0,
'localtime': '5:12 PM',
'p1name': 'Zack Kassian',
'p2name': 'Kyle Okposo',
'p3name': '',
'period': 1,
'pid': 8475178,
'pid1': 8475178,
'pid2': 8473449,
'pid3': '',
'playername': 'Zack Kassian',
'strength': 701,
'sweater': '44',
'teamid': 22,
'time': '00:28',
'type': 'Hit',
'xcoord': 22,
'ycoord': 38},
{'aoi': [8471742, 8475179, 8475215, 8475220, 8475235, 8475728],
'apb': [],
'as': 0,
'asog': 0,
'desc': 'Jesse Puljujarvi Tip-In saved by Robin Lehner',
'eventid': 59,
'formalEventId': 'EDM59',
'hoi': [8473468, 8474034, 8475660, 8477498, 8477934, 8479344],
'hpb': [],
'hs': 0,
'hsog': 1,
'localtime': '5:13 PM',
'p1name': 'Jesse Puljujarvi',
'p2name': 'Robin Lehner',
'p3name': '',
'period': 1,
'pid': 8479344,
'pid1': 8479344,
'pid2': 8475215,
'pid3': '',
'playername': 'Jesse Puljujarvi',
'strength': 701,
'sweater': '98',
'teamid': 22,
'time': '01:32',
'type': 'Shot',
'xcoord': 81,
'ycoord': 3}]}},
'refreshInterval': 0}}
If you have only one game, this will create the dataframe you want:
json_normalize(data['data']['game']['plays']['play'])
Then you just need to extract the columns you're interested in.
It might be unintuitive to use this API when the structure becomes complicated,
but the key is: json_normalize extracts JSON fields into a table.
In my case, I have a table:
----------
| fact | // each row is a json object {'a':a, 'b':b....}
----------
rrrrr = []
for index, row in data.iterrows():
r1 = json_normalize(row['fact'])
rrrrr.append(r1)
rr1 = pd.concat(rrrrr)