Python: Flatten Multilevel JSON (LinkedIn API) to CSV
I am trying to de-nest (flatten) the JSON below in Python to create a CSV table. Can somebody help?
Input JSON
{
  "paging": { "start": 0, "count": 10, "links": [] },
  "elements": [
    {
      "followerGains": {
        "organicFollowerGain": 2,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634169600000, "end": 1634256000000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": -1,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634256000000, "end": 1634342400000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": -2,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634342400000, "end": 1634428800000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": 0,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634428800000, "end": 1634515200000 }
    },
I tried the code below, but it flattens everything into a single line.
I read in another thread that json_normalize() will structure the data into columns, but can someone please tell me how to do it for this case?
The code I used is as follows:
Python code
import json
import pandas as pd
from pandas.io.json import json_normalize

data = json.load(open('C:/Users/Muj/Downloads/Linkedin data/follower_statistics_per_day.json'))

def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

fd = flatten_json(data)
flat_data = json_normalize(fd)
flat_data.to_csv('C:/Users/Muj/Downloads/Linkedin data/test1.csv', index=False)
Can someone please help?
The desired output is as follows:
organicFollowerGain,paidFollowerGain,organizationalEntity
2,0,urn:li:organization:28849398
-1,0,urn:li:organization:28849398
There is no need for your flatten_json function. Just pass the elements portion directly to json_normalize:
flat_data = json_normalize(data['elements'])
That returns
organizationalEntity,followerGains.organicFollowerGain,followerGains.paidFollowerGain,timeRange.start,timeRange.end
urn:li:organization:28849398,2,0,1634169600000,1634256000000
urn:li:organization:28849398,-1,0,1634256000000,1634342400000
urn:li:organization:28849398,-2,0,1634342400000,1634428800000
urn:li:organization:28849398,0,0,1634428800000,1634515200000
Then you just need to rename the column headers and remove any columns you don't want.
# Rename columns to keep only the final part of the dotted name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
And that returns
organizationalEntity,organicFollowerGain,paidFollowerGain
urn:li:organization:28849398,2,0
urn:li:organization:28849398,-1,0
urn:li:organization:28849398,-2,0
urn:li:organization:28849398,0,0
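If you prefer, the rename and the drop can also be chained into one step by passing a callable to rename. This is just an equivalent variant of the snippet above (not part of the original answer), assuming the same data dict:
flat_data = (
    json_normalize(data['elements'])
    .rename(columns=lambda c: c.split('.')[-1])  # keep only the part after the last dot
    .drop(columns=['start', 'end'])              # drop the timeRange columns
)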
Putting that all together:
from pandas.io.json import json_normalize
data = {
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        }
    ]
}
flat_data = json_normalize(data['elements'])
# Rename columns to keep only the final part of the dotted name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
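Side note: from pandas.io.json import json_normalize was deprecated in pandas 1.0 and removed in later releases. On a recent pandas version you would import pandas and call the top-level function instead; the rest of the code stays the same:
import pandas as pd

# pandas >= 1.0 exposes json_normalize at the top level
flat_data = pd.json_normalize(data['elements'])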
Try the below (not using any external lib, just core Python):
import csv
data = {
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        },
    ]
}
holder = []
for e in data['elements']:
    temp = [str(e['followerGains']['organicFollowerGain'])]
    temp.append(str(e['followerGains']['paidFollowerGain']))
    temp.append(e['organizationalEntity'])
    temp.append(str(e['timeRange']['start']))
    temp.append(str(e['timeRange']['end']))
    holder.append(temp)

# newline='' avoids extra blank rows from csv.writer on Windows
with open('out.csv', 'w', newline='') as f:
    f.write('organicFollowerGain,paidFollowerGain,organizationalEntity,start,end\n')
    writer = csv.writer(f)
    writer.writerows(holder)
out.csv
organicFollowerGain,paidFollowerGain,organizationalEntity,start,end
2,0,urn:li:organization:28849398,1634169600000,1634256000000
-1,0,urn:li:organization:28849398,1634256000000,1634342400000
-2,0,urn:li:organization:28849398,1634342400000,1634428800000
0,0,urn:li:organization:28849398,1634428800000,1634515200000
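A small variant of the same idea (my own sketch, not part of the original answer) uses csv.DictWriter, so the header row and the data rows are driven by the same field list. It assumes the data dict defined above:
import csv

fieldnames = ['organicFollowerGain', 'paidFollowerGain',
              'organizationalEntity', 'start', 'end']

# newline='' prevents blank rows between records on Windows
with open('out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fieldnames)
    writer.writeheader()
    for e in data['elements']:
        writer.writerow({
            'organicFollowerGain': e['followerGains']['organicFollowerGain'],
            'paidFollowerGain': e['followerGains']['paidFollowerGain'],
            'organizationalEntity': e['organizationalEntity'],
            'start': e['timeRange']['start'],
            'end': e['timeRange']['end'],
        })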
For anyone wondering how to do this, thanks to @Waylan and @balderman, the answer is as follows:
from pandas.io.json import json_normalize
data = {
    "paging": { "start": 0, "count": 10, "links": [] },
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634169600000, "end": 1634256000000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634256000000, "end": 1634342400000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634342400000, "end": 1634428800000 }
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": { "start": 1634428800000, "end": 1634515200000 }
        }
    ]
}
flat_data = json_normalize(data['elements'])
# Rename columns to keep only the final part of the dotted name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
Hope this helps someone down the road! Cheers!