How to put 'flat' json data into python data frame?

I'm making an API request and getting data that I'm not sure how to deal with. I would like to have all the data within a pandas dataframe with the 'channel_id' as rows (or index I suppose) and all of the other info as columns.
This is the call I make:
with requests.Session() as s:
    r1 = s.post(url_post, data=payload)
    r2 = s.get(url_get)
    data = r2.json()
And here is what the data looks like:
{u'value': {u'data': [{u'avg_revenue': u'0.006585714285714286',
u'avg_revenue_usd': u'0.006585714285714286',
u'channel_id': u'95622',
u'channel_name': u'game',
u'clicks': u'0',
u'coverage': u'6035',
u'cpi': u'0.006585714285714286',
u'cpi_usd': u'0.0066',
u'cpm': u'6.585714285714286',
u'cpm_usd': u'6.58570',
u'ctr': u'0.0000',
u'currency_code': u'USD',
u'ecpm': u'0.007598483599802209',
u'ecpm_usd': u'0.00760',
u'fill_rate': u'0.0012',
u'high_revenue': u'0.024',
u'high_revenue_usd': u'0.024000',
u'impressions': u'7',
u'low_revenue': u'0.0221',
u'low_revenue_usd': u'0.000000',
u'net_cpm': u'6.3692857142857140',
u'net_cpm_usd': u'6.36930',
u'payout': u'0.044585000000',
u'payout_usd': u'0.044585',
u'publisher_id': u'#######',
u'publisher_name': u'Agency',
u'queries': u'6067',
u'results': u'17561',
u'revenue': u'0.0461',
u'revenue_usd': u'0.0461'},
....
{u'avg_revenue': u'7.368266666666667',
u'avg_revenue_usd': u'7.368266666666667',
u'channel_id': u'122795',
u'channel_name': u'BT3',
u'clicks': u'26',
u'coverage': u'495022',
u'cpi': u'0.007470361608651572',
u'cpi_usd': u'0.0075',
u'cpm': u'7.470361608651572',
u'cpm_usd': u'7.47040',
u'ctr': u'0.0088',
u'currency_code': u'USD',
u'ecpm': u'0.01847035191331348',
u'ecpm_usd': u'0.01850',
u'fill_rate': u'0.0025',
u'high_revenue': u'22.1048',
u'high_revenue_usd': u'5.466700',
u'impressions': u'2959',
u'low_revenue': u'22.1048',
u'low_revenue_usd': u'0.000000',
u'net_cpm': u'7.0342510983440350',
u'net_cpm_usd': u'7.03430',
u'payout': u'20.814349000000',
u'payout_usd': u'20.814349',
u'publisher_id': u'#######',
u'publisher_name': u'Agency',
u'queries': u'1196772',
u'results': u'1425193',
u'revenue': u'22.1048',
u'revenue_usd': u'22.1048'}]}}

The list under data['value']['data'] is already flat (one dict per channel), so it can go straight into a DataFrame, with channel_id promoted to the index:
df = pd.DataFrame.from_dict(data['value']['data'])
df.set_index(['channel_id'], inplace=True)
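One thing to watch for: every value in this response is a string, so the numeric columns arrive as object dtype. A small follow-up conversion (a sketch; the excluded column names are taken from the sample response above) turns everything else into numbers:
import pandas as pd

# Columns that should stay as text; everything else gets converted to numbers.
text_cols = ['channel_name', 'currency_code', 'publisher_id', 'publisher_name']
numeric_cols = df.columns.difference(text_cols)
df[numeric_cols] = df[numeric_cols].apply(pd.to_numeric)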

Related

Convert JSON to CSV with Python 3

I need to get some data from the Meetup API and convert the JSON it returns into a CSV, all in Python 3. I've never worked with JSON or Python before, so I've run into some issues. Getting the data is not a problem, but I can't seem to parse the JSON object into a proper CSV file. My code, anonymized:
import json
import requests
import csv

def main():
    # initialize variables
    output_file = 'result_meetup.csv'
    api_key = "YOUR_MEETUP_API_KEY"
    params = {'key': api_key}
    url_path = 'http://api.meetup.com/pro/:URLNAME/groups'

    # get data from API, parse to JSON
    data = requests.get(url_path, params=params)
    data_parsed = data.json()
    length_data = len(data_parsed) - 1

    data_to_file = open(output_file, 'w')
    csv_writer = csv.writer(data_to_file)
    for i in range(0, length_data):
        meetup = data_parsed[i]
        csv_writer.writerow([meetup])
    data_to_file.close()

if __name__ == "__main__":
    main()
Now, as you can see, I write into the CSV as a list. This leads to a really ugly output, looking like this (again, the values in caps are anonymized):
{u'rsvps_per_event': 0.0, u'organizers': [{u'permission': u'organizer', u'name': u'NAME', u'member_id': ID}], u'repeat_rsvpers': 0, u'topics': [{u'lang': u'en_US', u'id': ID, u'urlkey': u'socialnetwork', u'name': u'Social Networking'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'education', u'name': u'Education'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'newtech', u'name': u'New Technology'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'business-referral-networking', u'name': u'Business Referral Networking'}], u'upcoming_events': 0, u'gender_female': 0.3499999940395355, u'pro_join_date': DATE, u'id': ID, u'category': [{u'shortname': u'career-business', u'sort_name': u'Career & Business', u'id': 2, u'name': u'Career & Business'}], u'city': u'CITY', u'member_count': 73, u'lon': LON, u'organizer_photo': {u'thumb_link': u'LINK.JPEG', u'base_url': u'URL', u'id': ID, u'type': u'member', u'photo_link': u'LINK.JPEG', u'highres_link': u'LINK.JPEG'}, u'average_age': 35.555599212646484, u'status': u'Active', u'description': u'DESCRIPTION', u'founded_date': DATE, u'lat': LAT, u'urlname': u'NAME', u'gender_male': 0.6000000238418579, u'name': u'NAME', u'country': u'Portugal', u'gender_unknown': 0.05000000074505806, u'past_events': 0, u'gender_other': 0.0, u'past_rsvps': 0}
So basically, the whole JSON object in a single CSV field, with weird 'u's, in lists and so on. However, if I don't write it as a list, I only get the fields of the JSON object, without the data, so it would just be 'Organizers', 'Name' etc, without the actual name.
As I said, I am a Python beginner and haven't found any libraries to help me with this, but I'm sure they exist. Any help is really appreciated, and it would be great if it were Python 3 compatible.
Edit: What I would like it to look like in the end:
I get multiple Meetup groups in the reply, all having the same structure as shown above. Therefore, the description of the value should be listed just once, as a header, and the values listed beneath (new lines and pipes indicating a new field of a csv sheet):
RSVPs_per_event | Organizer | ID
5 | Tom | 1
20 | Jack | 2
35 | Anne | 3
To convert the JSON data to CSV you need to extract the keys, write them as the header row, and then write the values. This might help you:
data_parsed = json.loads(data.text)    # or simply data.json(); either gives a list of dicts
header = data_parsed[0].keys()         # the field names become the CSV header
csv_writer.writerow(header)
for i in range(0, length_data):
    meetup = data_parsed[i].values()
    csv_writer.writerow(meetup)        # meetup is already a sequence of values
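If the response really is a flat list of dicts with identical keys, csv.DictWriter handles the header row and the per-row field order for you. A minimal sketch, assuming the parsed list from data.json() (the function name is mine):
import csv

def write_meetups_csv(meetups, output_file='result_meetup.csv'):
    # meetups: the list of dicts returned by data.json()
    with open(output_file, 'w', newline='') as f:
        writer = csv.DictWriter(f, fieldnames=meetups[0].keys())
        writer.writeheader()
        writer.writerows(meetups)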
If anyone else has the same problem: I solved it, not very elegantly, but I needed the data. The JSON was too nested and complex to parse generically, so now I just read the fields I need from the data.
import json
import requests
import csv

def main():
    # initialize variables
    output_file = 'result_meetup.csv'
    api_key = "YOUR_API_KEY"
    params = {'key': api_key}
    url_path = 'http://api.meetup.com/pro/:URLNAME/groups'

    # get data from API, parse to JSON
    data = requests.get(url_path, params=params)
    data_parsed = data.json()

    data_to_file = open(output_file, 'w', newline='')
    csv_writer = csv.writer(data_to_file, delimiter=";")
    csv_writer.writerow(["id", "name", "city", "country", "member count", "average age",
                         "founded_date", "past_rsvps", "rsvps_per_event", "repeat_rsvpers",
                         "gender_unknown", "gender_female", "gender_male", "gender_other"])
    for meetup in data_parsed:  # iterate over every group in the response
        id = meetup['id']
        name = meetup['name']
        city = meetup['city']
        country = meetup['country']
        member_count = meetup['member_count']
        average_age = meetup['average_age']
        founded_date = meetup['founded_date']
        past_rsvps = meetup['past_rsvps']
        rsvps_per_event = meetup['rsvps_per_event']
        repeat_rsvpers = meetup['repeat_rsvpers']
        gender_unknown = meetup['gender_unknown']
        gender_female = meetup['gender_female']
        gender_male = meetup['gender_male']
        gender_other = meetup['gender_other']
        csv_writer.writerow([id, name, city, country, member_count, average_age, founded_date,
                             past_rsvps, rsvps_per_event, repeat_rsvpers, gender_unknown,
                             gender_female, gender_male, gender_other])
    data_to_file.close()

if __name__ == "__main__":
    main()

Storing Python dictionary data into a csv

I have a list of dicts that stores Facebook status data (Graph API):
len(test_statuses)
3
test_statuses
[{u'comments': {u'data': [{u'created_time': u'2016-01-27T10:47:30+0000',
u'from': {u'id': u'1755814687982070', u'name': u'Fadi Cool Panther'},
u'id': u'447173898813933_447182555479734',
u'message': u'Sidra Abrar'}],
u'paging': {u'cursors': {u'after': u'WTI5dGJXVnVkRjlqZFhKemIzSTZORFEzTVRneU5UVTFORGM1TnpNME9qRTBOVE00T1RFMk5UQT0=',
u'before': u'WTI5dGJXVnVkRjlqZFhKemIzSTZORFEzTVRneU5UVTFORGM1TnpNME9qRTBOVE00T1RFMk5UQT0='}},
u'summary': {u'can_comment': False,
u'order': u'ranked',
u'total_count': 1}},
u'created_time': u'2016-01-27T10:16:56+0000',
u'id': u'5842136044_10153381090881045',
u'likes': {u'data': [{u'id': u'729038357232696'},
{u'id': u'547422955417520'},
{u'id': u'422351987958296'},
{u'id': u'536057309903473'},
{u'id': u'206846772999449'},
{u'id': u'1671329739783719'},
{u'id': u'991398107599340'},
{u'id': u'208751836138231'},
{u'id': u'491047841097510'},
{u'id': u'664580270350825'}],
u'paging': {u'cursors': {u'after': u'NjY0NTgwMjcwMzUwODI1',
u'before': u'NzI5MDM4MzU3MjMyNjk2'},
u'next': u'https://graph.facebook.com/v2.5/5842136044_10153381090881045/likes?limit=10&summary=true&access_token=521971961312518|121ca7ef750debf4c51d1388cf25ead4&after=NjY0NTgwMjcwMzUwODI1'},
u'summary': {u'can_like': False, u'has_liked': False, u'total_count': 13}},
u'link': u'https://www.facebook.com/ukbhangrasongs/videos/447173898813933/',
u'message': u'Track : Ik Waar ( Official Music Video )\nSinger : Falak shabir ft DJ Shadow\nMusic by Dj Shadow\nFor more : UK Bhangra Songs',
u'shares': {u'count': 7},
u'type': u'video'},
{u'comments': {u'data': [],
u'summary': {u'can_comment': False,
u'order': u'chronological',
u'total_count': 0}},
u'created_time': u'2016-01-27T06:15:40+0000',
u'id': u'5842136044_10153380831261045',
u'likes': {u'data': [],
u'summary': {u'can_like': False, u'has_liked': False, u'total_count': 0}},
u'message': u'I want to work with you. tracks for flicks',
u'type': u'status'}]
I need to extract each status text and the text of each comment under the status, which I can do by appending them to separate lists e.g.,:
status_text = []
comment_text = []
for s in test_statuses:
    try:
        status_text.append(s['message'])
        for c in s['comments']['data']:
            comment_text.append(c['message'])
    except:
        continue
This gives me two lists of different lengths: len(status_text) = 2, len(comment_text) = 49.
Unfortunately that's a horrible way of dealing with the data, since I cannot keep track of which comment belongs to which status. Ideally I would like to store this as a tree structure and export it into a csv file, but I can't figure out how to do it.
Probable csv data structure:
Text is_comment
status1 0
status2 0
statusN 0
comment1 status1
comment2 status1
commentN statusN
Why do you need this to be in a CSV? It is already structured and ready to be persisted as JSON.
If you really need the tabular approach offered by CSV, then you have to either denormalize it, or use more than one CSV table with references from one to another (and again, the best approach would be to put the data in an SQL database, which takes care of the relationships for you).
That said, the way to denormalize is simply to repeat the status text on each comment's row - that is, write your CSV row in the innermost loop of your approach:
import csv

status_text = []
comment_text = []
writer = csv.writer(open("mycsv.csv", "wt"))
for s in test_statuses:
    status_text.append(s['message'])
    for c in s['comments']['data']:
        comment_text.append(c['message'])
        writer.writerow((s['message'], c['message']))
Note that you'd probably be better off writing the status id to each row, and creating a second table with the status messages where the id is the key (and putting it in a database instead of various CSV files). And then again, you are probably better off simply keeping the JSON. If you need search capabilities, use a JSON-capable database such as MongoDB or PostgreSQL.
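A minimal sketch of that two-table idea, assuming the same test_statuses list as above (the file names are placeholders):
import csv

# Table 1: one row per status, keyed by the status id.
with open("statuses.csv", "wt", newline="") as f:
    w = csv.writer(f)
    w.writerow(("status_id", "message"))
    for s in test_statuses:
        w.writerow((s["id"], s.get("message", "")))

# Table 2: one row per comment, carrying only the parent status id.
with open("comments.csv", "wt", newline="") as f:
    w = csv.writer(f)
    w.writerow(("status_id", "comment"))
    for s in test_statuses:
        for c in s.get("comments", {}).get("data", []):
            w.writerow((s["id"], c["message"]))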

How do I merge and sort two json lists using key value

I can get a JSON list of groups and devices from an API, but the key values don't allow me to do a merge without manipulating the returned lists. Unfortunately, the group info and devices info have to be retrieved using separate http requests.
The code for getting the group info looks like this:
#Python Code
import requests
import simplejson as json
import datetime
import pprintpp

print datetime.datetime.now().time()

url = 'https://www.somecompany.com/api/v2/groups/?fields=id,name'
s = requests.Session()

## Ver2 API Authentication ##
headers = {
    'X-ABC-API-ID': 'nnnn-nnnn-nnnn-nnnn-nnnn',
    'X-ABC-API-KEY': 'nnnnnnnn',
    'X-DE-API-ID': 'nnnnnnnn',
    'X-DE-API-KEY': 'nnnnnnnn'
}

r = json.loads(s.get((url), headers=headers).text)
print "Working...Groups extracted"
groups = r["data"]
print "*** Ver2 API Groups Information ***"
pprintpp.pprint(groups)
The printed output of groups looks like this:
#Groups
[
{u'id': u'0001', u'name': u'GroupA'},
{u'id': u'0002', u'name': u'GroupB'},
]
The code for getting the devices info looks like this:
url = 'https://www.somecompany.com/api/v2/devicess/?limit=500&fields=description,group,id,name'
r = json.loads(s.get((url), headers=headers).text)
print "Working...Devices extracted"
devices = r["data"]
print "*** Ver2 API Devices Information ***"
pprintpp.pprint (devices)
The devices output looks like this:
#Devices
[
{
u'description': u'GroupB 100 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
u'id': u'90001',
u'name': u'ABC550-3e9',
},
{
u'description': u'GroupA 101 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
u'id': u'90002',
u'name': u'ABC500-3e8',
}
]
What I would like to do is to be able to merge and sort the two JSON lists into an output that looks like this:
#Desired Output
#Separated list of GroupA & GroupB devices
[
{u'id': u'0001', u'name': u'GroupA'},
{
u'description': u'GroupA 101 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
u'id': u'90002',
u'name': u'ABC500-3e8',
},
{u'id': u'0002', u'name': u'GroupB'},
{
u'description': u'GroupB 100 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
u'id': u'90001',
u'name': u'ABC550-3e9',
}
]
A couple of problems I am having are that the key names in the groups and devices output are not unique. The key named 'id' in groups is actually the same value as the last 4 digits of the key named 'group' in devices, and is the value I wish to use for the sort. Also, 'id' and 'name' in groups mean something different than 'id' and 'name' in devices. My extremely limited skill with Python is making this quite the challenge. Any help pointing me in the right direction for a solution will be greatly appreciated.
This program produces your desired output:
import pprintpp
groups = [
{u'id': u'0001', u'name': u'GroupA'},
{u'id': u'0002', u'name': u'GroupB'},
]
devices = [
{
u'description': u'GroupB 100 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
u'id': u'90001',
u'name': u'ABC550-3e9',
},
{
u'description': u'GroupA 101 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
u'id': u'90002',
u'name': u'ABC500-3e8',
}
]
desired = sorted(
    groups + devices,
    key=lambda x: x.get('group', x.get('id') + '/')[-5:-1])
pprintpp.pprint(desired)
Or, if that lambda does not seem self-documenting:
def key(x):
    '''Sort on either the last few digits of x['group'], if that exists,
    or the entirety of x['id'], if x['group'] does not exist.
    '''
    if 'group' in x:
        return x['group'][-5:-1]
    return x['id']

desired = sorted(groups + devices, key=key)
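If you would rather attach each device to its group explicitly than rely on the sort key, the same merge can be written by bucketing devices under the group id embedded in their 'group' URL (a sketch; the variable names are mine):
from collections import defaultdict
import pprintpp

# Bucket devices under the trailing id of their 'group' URL, e.g. '.../groups/0002/' -> '0002'.
devices_by_group = defaultdict(list)
for d in devices:
    group_id = d['group'].rstrip('/').rsplit('/', 1)[-1]
    devices_by_group[group_id].append(d)

# Emit each group followed by its own devices, sorted by group id.
merged = []
for g in sorted(groups, key=lambda g: g['id']):
    merged.append(g)
    merged.extend(devices_by_group.get(g['id'], []))

pprintpp.pprint(merged)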

Create dictionary with different fields from different dictionaries

I have this list:
two_dimension_sizelist = \
[{u'sizeOptionId': u'1542',
u'sizeOptionName': u'1',
u'sortOrderNumber': u'915'},
{u'sizeOptionId': u'1543',
u'sizeOptionName': u'2',
u'sortOrderNumber': u'975'},
...
{u'sizeOptionId': u'1602',
u'sizeOptionName': u'Long',
u'sortOrderNumber': u'6873'}]
And I have some products:
{
"businessCatalogItemId":"5453220021802",
"inventoryStatusId":"0",
"colorName":"Medium Wash",
"sizeVariantId":"1",
"upcCode":"197476818021",
"onOrderDate":"2014-07-04T00:00:00-04:00",
"currentMinPrice":"34.94",
"currentPrice":"0.0",
"baseColorId":"1021",
"isClearanceItem":"false",
"catalogItemTypeId":"3",
"sizeDimension1Id":"1591",
"catalogItemSubtypeId":"15",
"isInStock":"true",
"skuId":"5453220021802",
"regularMaxPrice":"34.94",
"nowPrice":null,
"variantName":"Regular",
"isOnOrder":"false",
"mailOnlyReturn":"M",
"regularMinPrice":"34.94",
"reservable":"true",
"onlyAvailableOnline":"false",
"catalogItemId":"5146840",
"regularPrice":"0.0",
"isLowInventory":"false",
"sizeDimension2Id":"1601",
"priceType":"1",
"currentMaxPrice":"34.94"
},
{
"businessCatalogItemId":"5453220021803",
"inventoryStatusId":"4",
"colorName":"Medium Wash",
"sizeVariantId":"1",
"upcCode":"197476818038",
"onOrderDate":"2014-07-02T00:00:00-04:00",
"currentMinPrice":"34.94",
"currentPrice":"0.0",
"baseColorId":"1021",
"isClearanceItem":"false",
"catalogItemTypeId":"3",
"sizeDimension1Id":"1591",
"catalogItemSubtypeId":"15",
"isInStock":"true",
"skuId":"5453220021803",
"regularMaxPrice":"34.94",
"nowPrice":null,
"variantName":"Regular",
"isOnOrder":"true",
"mailOnlyReturn":"M",
"regularMinPrice":"34.94",
"reservable":"true",
"onlyAvailableOnline":"true",
"catalogItemId":"5146832",
"regularPrice":"0.0",
"isLowInventory":"false",
"sizeDimension2Id":"1602",
"priceType":"1",
"currentMaxPrice":"34.94"
}
Each product can have both sizeDimension1Id and sizeDimension2Id, only sizeDimension1Id, or neither. I need to map each size from two_dimension_sizelist onto every product to build a dict like:
{'size2Name': 'Long', 'size2Id': '1602', 'size1Name': '16', 'size1Id': '1590', {other product parameters}}
I have done this thing:
for dict_size in two_dimension_sizelist:
    two_dimension_sizedict.update(
        {dict_size['sizeOptionId']: dict_size['sizeOptionName']})
Which gives me in two_dimension_sizedict:
{u'1542': u'1',
u'1543': u'2',
u'1590': u'16',
u'1591': u'18',
u'1601': u'Regular',
u'1602': u'Long',
u'1604': u'Short',
u'1640': u'4',
u'1642': u'6',
u'1644': u'8',
u'1645': u'10',
u'1646': u'12',
u'1647': u'14'}
Am I doing this right?
Now I don't quite know how to couple these sizes with the products.
It seems to me that you will end up scanning through two_dimension_sizelist many times, searching for a matching "sizeOptionId" for each of your product dictionaries.
I would suggest doing this:
all_size_list = dict((str(x['sizeOptionId']), str(x['sizeOptionName']))
                     for x in two_dimension_sizelist)

for product in products_list:
    if "sizeDimension1Id" in product.keys():
        size = product['sizeDimension1Id']
        if size in all_size_list.keys():
            product.update({
                'size1Name': all_size_list[size],
                'size1Id': size
            })

        # This can exist only if size1 exists
        if "sizeDimension2Id" in product.keys():
            size = product['sizeDimension2Id']
            if size in all_size_list.keys():
                product.update({
                    'size2Name': all_size_list[size],
                    'size2Id': size
                })
Your product list looks like a JSON string; convert it to Python values using the json library.
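For that last conversion step, something like this is all that is needed (products_json here is a placeholder for the raw string you received):
import json

products_list = json.loads(products_json)  # parse the raw JSON string into a list of product dicts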

Using the data for 'requests' module in python

I am able to load some weather data using the requests module for Python with the following code:
from pprint import pprint
import requests
r = requests.get('http://api.openweathermap.org/data/2.5/weather?q=London')
pprint(r.json())
But how do I actually use the data it produces? I can't for the life of me find the relevant documentation or a tutorial on how to do this. This is the output of the pprint:
{u'base': u'cmc stations',
u'clouds': {u'all': 0},
u'cod': 200,
u'coord': {u'lat': 42.98, u'lon': -81.23},
u'dt': 1397676977,
u'id': 6058560,
u'main': {u'humidity': 25,
u'pressure': 1024,
u'temp': 278,
u'temp_max': 283.15,
u'temp_min': 275.37},
u'name': u'London',
u'rain': {u'1h': 0.25},
u'snow': {u'3h': 0},
u'sys': {u'country': u'CA',
u'message': 0.0467,
u'sunrise': 1397644810,
u'sunset': 1397693338},
u'weather': [{u'description': u'light rain',
u'icon': u'10d',
u'id': 500,
u'main': u'Rain'}],
u'wind': {u'deg': 168.001, u'speed': 3.52}}
How could I address an item within this data? For example, to print just the temp on its own and maybe use it as a variable. E.g.:
temp = *not sure what to put here*
print temp
Now that you have the results:
results = r.json()
just access it like any other Python dict:
main = results['main'] # Get the 'main' key's value out of results
temp = main['temp'] # Get the 'temp' key's value out of main
print temp
or more tersely (and the way you'd almost always write this in real life):
print results['main']['temp']
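One caveat worth adding: some keys in this response (rain and snow, for example) only appear under certain conditions, so dict.get with a default avoids a KeyError when they are missing:
rain_last_hour = results.get('rain', {}).get('1h', 0)  # falls back to 0 when there is no rain block
print(rain_last_hour)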
