Convert JSON to CSV with Python 3 - python

I need to get some data from the Meetup API, convert the JSON I get into a CSV, all of that in Python 3. I've never worked with JSON or Python, so I've run into some issues. Getting the data is not a problem, but I can't seem to parse the JSON object into a proper CSV file. My code, anonymized:
import json
import requests
import csv


def main():
    """Fetch Meetup Pro group data and write every record to a CSV file.

    Each JSON object becomes one CSV row; the keys of the first record
    supply the header row.
    """
    # initialize variables
    output_file = 'result_meetup.csv'
    api_key = "YOUR_MEETUP_API_KEY"
    params = {'key': api_key}
    url_path = 'http://api.meetup.com/pro/:URLNAME/groups'

    # get data from API, parse to JSON (a list of group dicts)
    data = requests.get(url_path, params=params)
    data_parsed = data.json()

    # newline='' stops the csv module from emitting blank lines on Windows;
    # the with-block guarantees the file is closed even on error.
    with open(output_file, 'w', newline='') as data_to_file:
        if not data_parsed:
            return  # nothing returned: leave the file empty
        # DictWriter writes one column per key instead of dumping the
        # whole dict repr into a single cell.
        csv_writer = csv.DictWriter(data_to_file,
                                    fieldnames=list(data_parsed[0].keys()))
        csv_writer.writeheader()
        # NOTE: the original looped range(0, len(data_parsed) - 1), which
        # silently dropped the last record; iterate the full list instead.
        csv_writer.writerows(data_parsed)


if __name__ == "__main__":
    main()
Now, as you can see, I write into the CSV as a list. This leads to a really ugly output, looking like this (again, the values in caps are anonymized):
{u'rsvps_per_event': 0.0, u'organizers': [{u'permission': u'organizer', u'name': u'NAME', u'member_id': ID}], u'repeat_rsvpers': 0, u'topics': [{u'lang': u'en_US', u'id': ID, u'urlkey': u'socialnetwork', u'name': u'Social Networking'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'education', u'name': u'Education'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'newtech', u'name': u'New Technology'}, {u'lang': u'en_US', u'id': ID, u'urlkey': u'business-referral-networking', u'name': u'Business Referral Networking'}], u'upcoming_events': 0, u'gender_female': 0.3499999940395355, u'pro_join_date': DATE, u'id': ID, u'category': [{u'shortname': u'career-business', u'sort_name': u'Career & Business', u'id': 2, u'name': u'Career & Business'}], u'city': u'CITY', u'member_count': 73, u'lon': LON, u'organizer_photo': {u'thumb_link': u'LINK.JPEG', u'base_url': u'URL', u'id': ID, u'type': u'member', u'photo_link': u'LINK.JPEG', u'highres_link': u'LINK.JPEG'}, u'average_age': 35.555599212646484, u'status': u'Active', u'description': u'DESCRIPTION', u'founded_date': DATE, u'lat': LAT, u'urlname': u'NAME', u'gender_male': 0.6000000238418579, u'name': u'NAME', u'country': u'Portugal', u'gender_unknown': 0.05000000074505806, u'past_events': 0, u'gender_other': 0.0, u'past_rsvps': 0}
So basically, the whole JSON object in a single CSV field, with weird 'u's, in lists and so on. However, if I don't write it as a list, I only get the fields of the JSON object, without the data, so it would just be 'Organizers', 'Name' etc, without the actual name.
As I said, I am a python beginner and haven't found any libraries to help me with it, but I'm sure they exist. Any help is really appreciated, and it would be great if it was Python3 compatible.
Edit: What I would like it to look like in the end:
I get multiple Meetup groups in the reply, all having the same structure as shown above. Therefore, the description of the value should be listed just once, as a header, and the values listed beneath (new lines and pipes indicating a new field of a csv sheet):
RSVPs_per_event | Organizer | ID
5 | Tom | 1
20 | Jack | 2
35 | Anne | 3

To convert the json data to csv you need to extract keys and write them in header and then work on the values. This might help you:
# Load the JSON payload, write the first record's keys as the header row,
# then one row of values per record.
data_parsed = json.loads(Data)
header = data_parsed[0].keys()
csv_writer.writerow(header)
# NOTE(review): assumes length_data == len(data_parsed); if the caller set
# it to len(data_parsed) - 1, the last record is dropped -- verify.
for i in range(0, length_data):  # original was missing the trailing colon
    meetup = data_parsed[i].values()
    # Pass the values iterable directly: wrapping it as [meetup] would put
    # the entire collection into a single CSV cell.
    csv_writer.writerow(meetup)

If anyone else has the same problem, I solved it, not very elegantly, but I needed the data. The JSON was too nested and complex to parse fully, so now I just read the fields I need from the data.
import json
import requests
import csv


def main():
    """Fetch Meetup Pro group data and write a selection of flat fields to CSV.

    The API payload is deeply nested; only the flat top-level fields listed
    below are extracted, one semicolon-separated row per group.
    """
    # initialize variables
    output_file = 'result_meetup.csv'
    api_key = "YOUR_API_KEY"
    params = {'key': api_key}
    url_path = 'http://api.meetup.com/pro/:URLNAME/groups'

    # get data from API, parse to JSON (a list of group dicts)
    data = requests.get(url_path, params=params)
    data_parsed = data.json()

    # Keys read from every record, in output-column order.  (Avoids fourteen
    # one-off locals, one of which shadowed the builtin `id`.)
    fields = ["id", "name", "city", "country", "member_count", "average_age",
              "founded_date", "past_rsvps", "rsvps_per_event",
              "repeat_rsvpers", "gender_unknown", "gender_female",
              "gender_male", "gender_other"]

    # newline='' keeps the csv module from writing blank lines on Windows.
    with open(output_file, 'w', newline='') as data_to_file:
        csv_writer = csv.writer(data_to_file, delimiter=";")
        csv_writer.writerow(["id", "name", "city", "country", "member count",
                             "average age", "founded_date", "past_rsvps",
                             "rsvps_per_event", "repeat_rsvpers",
                             "gender_unknown", "gender_female", "gender_male",
                             "gender_other"])
        # Iterate every record: the original used range(len(data_parsed) - 1)
        # and silently dropped the last group.
        for meetup in data_parsed:
            csv_writer.writerow([meetup[field] for field in fields])


if __name__ == "__main__":
    main()

Related

Converting deeply nested JSON response from an API call to pandas dataframe

I am currently having trouble parsing a deeply nested JSON response from a HTTP API call.
My JSON Response is like
{'took': 476,
'_revision': 'r08badf3',
'response': {'accounts': {'hits': [{'name': '4002238760',
'display_name': 'Googleglass-4002238760',
'selected_fields': ['Googleglass',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4002238760',
'DDMadarchod-INSTE',
None,
'Googleglass',
'0001012556',
'CC',
'Setu Non Standard',
'40022387',
320142,
4651321321333,
1324650651651]},
{'name': '4003893720',
'display_name': 'Swift-4003893720',
'selected_fields': ['Swift',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4003893720',
'DDMadarchod-UPTM-RemotexNBD',
None,
'S.W.I.F.T. SCRL',
'0001000110',
'SE',
'Setu Non Standard',
'40038937',
189508,
1464739200000,
1559260800000]},
After I receive the response I am storing it in data object using json normalize
data = response.json()
data = data['response']['accounts']['hits']
data = json_normalize(data)
However after I normalize my dataframe looks like this
My Curl Statement looks like this
curl --data 'query= {"terms":[{"type":"string_attribute","attribute":"Account Type","query_term_id":"account_type","in_list":["Contract"]},{"type":"string","term":"status_group","in_list":["paying"]},{"type":"string_attribute","attribute":"Region","in_list":["DDEU"]},{"type":"string_attribute","attribute":"Country","in_list":["Belgium"]},{"type":"string_attribute","attribute":"CSM Tag","in_list":["EU CSM"]},{"type":"date_attribute","attribute":"Contract Renewal Date","gte":1554057000000,"lte":1561833000000}],"count":1000,"offset":0,"fields":[{"type":"string_attribute","attribute":"DomainName","field_display_name":"Client Name"},{"type":"string_attribute","attribute":"Region","field_display_name":"Region"},{"type":"string_attribute","attribute":"Country","field_display_name":"Country"},{"type":"string_attribute","attribute":"Success Manager","field_display_name":"Client Success Manager"},{"type":"string","term":"identifier","field_display_name":"Account id"},{"type":"string_attribute","attribute":"DeviceSLA","field_display_name":"[FIN] Material Part Number"},{"type":"string_attribute","attribute":"SFDCAccountId","field_display_name":"SFDCAccountId"},{"type":"string_attribute","attribute":"Client","field_display_name":"[FIN] Client Sold-To Name"},{"type":"string_attribute","attribute":"Sold To Code","field_display_name":"[FIN] Client Sold To Code"},{"type":"string_attribute","attribute":"BU","field_display_name":"[FIN] Active BUs"},{"type":"string_attribute","attribute":"Service Type","field_display_name":"[FIN] Service Type"},{"type":"string_attribute","attribute":"Contract Header ID","field_display_name":"[FIN] SAP Contract Header ID"},{"type":"number_attribute","attribute":"Contract Value","field_display_name":"[FIN] ACV - Annual Contract Value","desc":true},{"type":"date_attribute","attribute":"Contract Start Date","field_display_name":"[FIN] Contract Start Date"},{"type":"date_attribute","attribute":"Contract Renewal Date","field_display_name":"[FIN] Contract 
Renewal Date"}],"scope":"all"}' --header 'app-token:YOUR-TOKEN-HERE' 'https://app.totango.com/api/v1/search/accounts'
So ultimately I want to store the Response in a dataframe along with the field names.
I've had to do this sort of thing a few times in the past (flatten out a nested json) I'll explain my process, and you can see if it works, or at least can then work the code a bit to fit your needs.
1) Took the data response, and completely flattened it out using a function. This blog was very helpful when I first had to do this.
2) Then it iterates through the flat dictionary created to find where each rows and columns are needed to be created by the numbering of the new key names within the nested parts. There are also keys that are unique/distinct, so they don't have a number to identify as a "new" row, so I account for those in what I called special_cols.
3) As it iterates through those, pulls the specified row number (embedded in those flat keys), and then constructs the dataframe in that way.
It sounds complicated, but if you debug and run line by line, you could see how it works. None-the-less, I believe it should get you what you need.
# Sample nested API response (two 'hits', each with a 'selected_fields'
# list) used to demonstrate the flattening approach below.
data = {'took': 476,
'_revision': 'r08badf3',
'response': {'accounts': {'hits': [{'name': '4002238760',
'display_name': 'Googleglass-4002238760',
'selected_fields': ['Googleglass',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4002238760',
'DDMadarchod-INSTE',
None,
'Googleglass',
'0001012556',
'CC',
'Setu Non Standard',
'40022387',
320142,
4651321321333,
1324650651651]},
{'name': '4003893720',
'display_name': 'Swift-4003893720',
'selected_fields': ['Swift',
'DDMonkey',
'Papu New Guinea',
'Jonathan Vardharajan',
'4003893720',
'DDMadarchod-UPTM-RemotexNBD',
None,
'S.W.I.F.T. SCRL',
'0001000110',
'SE',
'Setu Non Standard',
'40038937',
189508,
1464739200000,
1559260800000]}]}}}
import pandas as pd
import re
def flatten_json(y):
    """Flatten arbitrarily nested dicts/lists into a single-level dict.

    Nested keys are joined with '_'; list elements contribute their index,
    e.g. {'a': [{'b': 1}]} -> {'a_0_b': 1}.  Scalars are stored as-is.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance is the idiomatic type test (handles subclasses too).
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            # enumerate replaces the original manual index counter.
            for i, item in enumerate(x):
                flatten(item, name + str(i) + '_')
        else:
            out[name[:-1]] = x  # strip the trailing '_'

    flatten(y)
    return out
# Flatten the nested response, then rebuild a row/column table from the
# row numbers that flattening embedded in the new key names.
flat = flatten_json(data)
results = pd.DataFrame()
special_cols = []  # keys with no embedded row number: shared across all rows
columns_list = list(flat.keys())
for item in columns_list:
    # Keys like 'response_accounts_hits_0_name' embed the row index between
    # underscores.  Test for a match explicitly instead of the original
    # bare `except:`, which would also have swallowed unrelated errors.
    row_matches = re.findall(r'\_(\d+)\_', item)
    if not row_matches:
        special_cols.append(item)
        continue
    row_idx = int(row_matches[0])
    column = re.findall(r'\_\d+\_(.*)', item)[0]
    column = column.replace('_', '')
    value = flat[item]
    results.loc[row_idx, column] = value
# Broadcast the shared (unnumbered) keys to every row.
for item in special_cols:
    results[item] = flat[item]
Output:
print (results.to_string())
name displayname selectedfields0 selectedfields1 selectedfields2 selectedfields3 selectedfields4 selectedfields5 selectedfields6 selectedfields7 selectedfields8 selectedfields9 selectedfields10 selectedfields11 selectedfields12 selectedfields13 selectedfields14 took _revision
0 4002238760 Googleglass-4002238760 Googleglass DDMonkey Papu New Guinea Jonathan Vardharajan 4002238760 DDMadarchod-INSTE NaN Googleglass 0001012556 CC Setu Non Standard 40022387 320142.0 4.651321e+12 1.324651e+12 476 r08badf3
1 4003893720 Swift-4003893720 Swift DDMonkey Papu New Guinea Jonathan Vardharajan 4003893720 DDMadarchod-UPTM-RemotexNBD NaN S.W.I.F.T. SCRL 0001000110 SE Setu Non Standard 40038937 189508.0 1.464739e+12 1.559261e+12 476 r08badf3

PATCH call not updating field via API

Have a function to update a field in Netbox via API. The same data works on the Django web interface so I know it's not that, just something in my script that I'm doing wrong.
def change_allocated_server_status(api_token="", limit="",jira_access=""):
api_token_here = "Token " + api_token
headers = {'Authorization': api_token_here}
params = {'limit': limit}
sites = "https://my-url.com/api/dcim/devices/?role=server-planned"
session = requests.Session()
site_response = session.get(sites, headers=headers, params=params)
site_results = site_response.json()['results']
allocated_servers = get_devices_by_dc_loca(api_token, limit, jira_access)
url = "https://my-url.com/api/dcim/devices/239"
update = {
"device_role": 41
}
change = requests.patch(url, headers=headers, data=update)
change_results = change.json()
print change_results
The output of print change_results is
{u'status': 2, u'device_role': 40, u'name': u'device-name', u'site': 1, u'comments': u'', u'rack': 4, u'asset_tag': None, u'platform': None, u'primary_ip4': None, u'device_type': 7, u'primary_ip6': None, u'custom_fields': {}, u'position': 5, u'serial': u'', u'face': 0, u'id': 239, u'tenant': 1}
device_role isn't being changed. Doing a print change.status_code returns 200 so I know I'm hitting the API without a authentication problem, just guessing it's something simple I'm missing
Was missing a '/' from the end of my url, stopping the PATCH from being called but throwing no error.

Storing Python dictionary data into a csv

I have a list of dicts that stores Facebook status data (Graph API):
len(test_statuses)
3
test_statuses
[{u'comments': {u'data': [{u'created_time': u'2016-01-27T10:47:30+0000',
u'from': {u'id': u'1755814687982070', u'name': u'Fadi Cool Panther'},
u'id': u'447173898813933_447182555479734',
u'message': u'Sidra Abrar'}],
u'paging': {u'cursors': {u'after': u'WTI5dGJXVnVkRjlqZFhKemIzSTZORFEzTVRneU5UVTFORGM1TnpNME9qRTBOVE00T1RFMk5UQT0=',
u'before': u'WTI5dGJXVnVkRjlqZFhKemIzSTZORFEzTVRneU5UVTFORGM1TnpNME9qRTBOVE00T1RFMk5UQT0='}},
u'summary': {u'can_comment': False,
u'order': u'ranked',
u'total_count': 1}},
u'created_time': u'2016-01-27T10:16:56+0000',
u'id': u'5842136044_10153381090881045',
u'likes': {u'data': [{u'id': u'729038357232696'},
{u'id': u'547422955417520'},
{u'id': u'422351987958296'},
{u'id': u'536057309903473'},
{u'id': u'206846772999449'},
{u'id': u'1671329739783719'},
{u'id': u'991398107599340'},
{u'id': u'208751836138231'},
{u'id': u'491047841097510'},
{u'id': u'664580270350825'}],
u'paging': {u'cursors': {u'after': u'NjY0NTgwMjcwMzUwODI1',
u'before': u'NzI5MDM4MzU3MjMyNjk2'},
u'next': u'https://graph.facebook.com/v2.5/5842136044_10153381090881045/likes?limit=10&summary=true&access_token=521971961312518|121ca7ef750debf4c51d1388cf25ead4&after=NjY0NTgwMjcwMzUwODI1'},
u'summary': {u'can_like': False, u'has_liked': False, u'total_count': 13}},
u'link': u'https://www.facebook.com/ukbhangrasongs/videos/447173898813933/',
u'message': u'Track : Ik Waar ( Official Music Video )\nSinger : Falak shabir ft DJ Shadow\nMusic by Dj Shadow\nFor more : UK Bhangra Songs',
u'shares': {u'count': 7},
u'type': u'video'},
{u'comments': {u'data': [],
u'summary': {u'can_comment': False,
u'order': u'chronological',
u'total_count': 0}},
u'created_time': u'2016-01-27T06:15:40+0000',
u'id': u'5842136044_10153380831261045',
u'likes': {u'data': [],
u'summary': {u'can_like': False, u'has_liked': False, u'total_count': 0}},
u'message': u'I want to work with you. tracks for flicks',
u'type': u'status'}]
I need to extract each status text and the text of each comment under the status, which I can do by appending them to separate lists e.g.,:
# Collect all status texts and, separately, all comment texts.
status_text = []
comment_text = []
for s in test_statuses:
    try:
        status_text.append(s['message'])
        for c in s['comments']['data']:
            comment_text.append(c['message'])
    except KeyError:
        # Only a missing 'message'/'comments' key is expected here; the
        # original bare `except:` would also have hidden real bugs.
        continue
This gives me two lists of separate lengths len(status_text) = 2, len(comment_text) = 49.
Unfortunately that's a horrible way of dealing with the data since I cannot keep track of what comment belongs to what status. Ideally I would like to store this as a tree structure and export in into a cvs file, but I can't figure out how to do it.
Probable csv data structure:
Text is_comment
status1 0
status2 0
statusN 0
comment1 status1
comment2 status1
commentN statusN
Why do you need this to be in a CSV? It is already structured and ready to be persisted as JSON.
If you really need the tabular approach offered by CSV, then you have to either denormalize it, or use more than one CSV table with references from one to another (and again, the best approach would be to put the data in an SQL database which takes care of the relationships for you)
That said, the way to denormalize is simply to save the same status text to each row where a comment is - that is: record your CSV row in the innermost loop with your approach:
import csv

status_text = []
comment_text = []
# Write one CSV row per comment, repeating the parent status text on each
# row so the status<->comment association is preserved (denormalized).
# The with-block also closes the file, which the original never did.
with open("mycsv.csv", "wt") as output_file:
    writer = csv.writer(output_file)
    for s in test_statuses:
        # The original appended to undefined names `test_messages` /
        # `test_comments` (a NameError); the lists initialized above are
        # the ones intended.
        status_text.append(s['message'])
        for c in s['comments']['data']:
            comment_text.append(c['message'])
            writer.writerow((s['message'], c['message']))
Note that you'd probably be better off writing the status id to each row, and creating a second table with the status message where the id is the key (and put it in a database instead of various CSV files). And then, again, you are probably better off simply keeping the JSON. If you need search capabilities, use a JSON capable database such as MongoDB or PostgreSQL

How do I merge and sort two json lists using key value

I can get a JSON list of groups and devices from an API, but the key values don't allow me to do a merge without manipulating the returned lists. Unfortunately, the group info and devices info have to be retrieved using separate http requests.
The code for getting the group info looks like this:
#Python Code
import requests
import simplejson as json
import datetime
import pprintpp
# Log when the extraction run started.
print datetime.datetime.now().time()
# Request only the group id and name fields.
url = 'https://www.somecompany.com/api/v2/groups/?fields=id,name'
s = requests.Session()
## Ver2 API Authentication headers ##
headers = {
'X-ABC-API-ID': 'nnnn-nnnn-nnnn-nnnn-nnnn',
'X-ABC-API-KEY': 'nnnnnnnn',
'X-DE-API-ID': 'nnnnnnnn',
'X-DE-API-KEY': 'nnnnnnnn'
}
# Fetch and decode the JSON response body.
r = json.loads(s.get((url), headers=headers).text)
print "Working...Groups extracted"
# The list of group records lives under the top-level 'data' key.
groups = r["data"]
print "*** Ver2 API Groups Information ***"
pprintpp.pprint (groups)
The printed output of groups looks like this:
#Groups
[
{u'id': u'0001', u'name': u'GroupA'},
{u'id': u'0002', u'name': u'GroupB'},
]
The code for getting the devices info looks like this:
url = 'https://www.somecompany.com/api/v2/devicess/?limit=500&fields=description,group,id,name'
r = json.loads(s.get((url), headers=headers).text)
print "Working...Devices extracted"
devices = r["data"]
print "*** Ver2 API Devices Information ***"
pprintpp.pprint (devices)
The devices output looks like this:
#Devices
[
{
u'description': u'GroupB 100 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
u'id': u'90001',
u'name': u'ABC550-3e9',
},
{
u'description': u'GroupA 101 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
u'id': u'90002',
u'name': u'ABC500-3e8',
}
]
What I would like to do is to be able to merge and sort the two JSON lists into an output that looks like this:
#Desired Output
#Seperated List of GroupA & GroupB Devices
[
{u'id': u'0001', u'name': u'GroupA'},
{
u'description': u'GroupA 101 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
u'id': u'90002',
u'name': u'ABC500-3e8',
},
{u'id': u'0002', u'name': u'GroupB'},
{
u'description': u'GroupB 100 (City State)',
u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
u'id': u'90001',
u'name': u'ABC550-3e9',
}
]
A couple of problems I am having is that the key names for groups and devices output are not unique. The key named 'id' in groups is actually the same value as the last 4 digits of the key named 'group' in devices, and is the value I wish to use for the sort. Also, 'id' and 'name' in groups is different than 'id' and 'name' in devices. My extremely limited skill with Python is making this quite the challenge. Any help with pointing me in the correct direction for a solution will be greatly appreciated.
This program produces your desired output:
import pprintpp
# Two sample payloads: group records (id + name) and device records whose
# 'group' URL ends with the owning group's 4-digit id.
groups = [
    {u'id': u'0001', u'name': u'GroupA'},
    {u'id': u'0002', u'name': u'GroupB'},
]
devices = [
    {
        u'description': u'GroupB 100 (City State)',
        u'group': u'https://www.somecompany.com/api/v2/groups/0002/',
        u'id': u'90001',
        u'name': u'ABC550-3e9',
    },
    {
        u'description': u'GroupA 101 (City State)',
        u'group': u'https://www.somecompany.com/api/v2/groups/0001/',
        u'id': u'90002',
        u'name': u'ABC500-3e8',
    }
]


def _group_sort_key(record):
    # Devices embed the 4-digit group id at the tail of their 'group' URL
    # ('.../groups/0002/'); groups expose it directly as 'id'.  Appending
    # '/' to the bare id lets the same [-5:-1] slice serve both shapes.
    return record.get('group', record.get('id') + '/')[-5:-1]


desired = sorted(groups + devices, key=_group_sort_key)
pprintpp.pprint(desired)
Or, if that lambda does not seem self-documenting:
def key(x):
    """Return the 4-digit group id used for sorting.

    Device records carry the id as the last path segment of x['group']
    ('.../groups/0002/'); group records expose it directly as x['id'].
    """
    return x['group'][-5:-1] if 'group' in x else x['id']
desired = sorted(groups + devices, key=key)

Using the data for 'requests' module in python

I am able to load in some weather data from the requests module for python with the following code:
from pprint import pprint
import requests
r = requests.get('http://api.openweathermap.org/data/2.5/weather?q=London')
pprint(r.json())
But how do I actually use the data it produces? I cannot for the life of me find the relevant documentation or tutorial on how to do this. this is the output of the pprint:
{u'base': u'cmc stations',
u'clouds': {u'all': 0},
u'cod': 200,
u'coord': {u'lat': 42.98, u'lon': -81.23},
u'dt': 1397676977,
u'id': 6058560,
u'main': {u'humidity': 25,
u'pressure': 1024,
u'temp': 278,
u'temp_max': 283.15,
u'temp_min': 275.37},
u'name': u'London',
u'rain': {u'1h': 0.25},
u'snow': {u'3h': 0},
u'sys': {u'country': u'CA',
u'message': 0.0467,
u'sunrise': 1397644810,
u'sunset': 1397693338},
u'weather': [{u'description': u'light rain',
u'icon': u'10d',
u'id': 500,
u'main': u'Rain'}],
u'wind': {u'deg': 168.001, u'speed': 3.52}}
How could I address an item within the list? For example to print just the temp on it's own and maybe to use it as a variable. E.g.:
temp = *not sure what to put here*
print temp
Now that you have the results:
results = r.json()
just access it like any other Python dict:
main = results['main'] # Get the 'main' key's value out of results
temp = main['temp'] # Get the 'temp' key's value out of main
print temp
or more tersely (and the way you'd almost always write this in real life):
print results['main']['temp']

Categories

Resources