Python Flatten Multilevel JSON (LinkedIn API) to CSV

I am trying to de-nest the JSON below in Python to create a CSV table. Can somebody help?
Input JSON
{
  "paging": { "start": 0, "count": 10, "links": [] },
  "elements": [
    {
      "followerGains": {
        "organicFollowerGain": 2,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634169600000, "end": 1634256000000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": -1,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634256000000, "end": 1634342400000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": -2,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634342400000, "end": 1634428800000 }
    },
    {
      "followerGains": {
        "organicFollowerGain": 0,
        "paidFollowerGain": 0
      },
      "organizationalEntity": "urn:li:organization:28849398",
      "timeRange": { "start": 1634428800000, "end": 1634515200000 }
    }
  ]
}
I tried the below code but it flattens everything into a single line.
I read in another thread that json_normalize() will structure the data into columns, but can someone tell me how to do it for this case?
The code I used is as follows:
Python code
import json
import pandas as pd
from pandas.io.json import json_normalize

data = json.load(open('C:/Users/Muj/Downloads/Linkedin data/follower_statistics_per_day.json'))

def flatten_json(y):
    out = {}

    def flatten(x, name=''):
        if type(x) is dict:
            for a in x:
                flatten(x[a], name + a + '_')
        elif type(x) is list:
            i = 0
            for a in x:
                flatten(a, name + str(i) + '_')
                i += 1
        else:
            out[name[:-1]] = x

    flatten(y)
    return out

fd = flatten_json(data)
flat_data = json_normalize(fd)
flat_data.to_csv('C:/Users/Muj/Downloads/Linkedin data/test1.csv', index=False)
Can someone please help? The desired output is as follows:
organicFollowerGain,paidFollowerGain,organizationalEntity
2,0,urn:li:organization:28849398
-1,0,urn:li:organization:28849398

There is no need for your flatten_json function. Just pass the elements portion directly to json_normalize:
flat_data = json_normalize(data['elements'])
That returns
organizationalEntity,followerGains.organicFollowerGain,followerGains.paidFollowerGain,timeRange.start,timeRange.end
urn:li:organization:28849398,2,0,1634169600000,1634256000000
urn:li:organization:28849398,-1,0,1634256000000,1634342400000
urn:li:organization:28849398,-2,0,1634342400000,1634428800000
urn:li:organization:28849398,0,0,1634428800000,1634515200000
Then you just need to rename the column headers and remove any columns you don't want.
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
And that returns
organizationalEntity,organicFollowerGain,paidFollowerGain
urn:li:organization:28849398,2,0
urn:li:organization:28849398,-1,0
urn:li:organization:28849398,-2,0
urn:li:organization:28849398,0,0
Putting that all together:
from pandas.io.json import json_normalize

data = {
    "paging": {"start": 0, "count": 10, "links": []},
    "elements": [
        {
            "followerGains": {"organicFollowerGain": 2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634169600000, "end": 1634256000000}
        },
        {
            "followerGains": {"organicFollowerGain": -1, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634256000000, "end": 1634342400000}
        },
        {
            "followerGains": {"organicFollowerGain": -2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634342400000, "end": 1634428800000}
        },
        {
            "followerGains": {"organicFollowerGain": 0, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634428800000, "end": 1634515200000}
        }
    ]
}

flat_data = json_normalize(data['elements'])
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
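Note, as a hedged aside rather than part of the original answer: from pandas 1.0 on, the pandas.io.json import above is deprecated and the same function is available as pd.json_normalize. An equivalent sketch with a recent pandas, reusing the data dict defined above:

import pandas as pd

# pandas >= 1.0 exposes json_normalize at the top level
flat_data = pd.json_normalize(data['elements'])

# Same rename/drop as above, written with a dict comprehension
flat_data = (
    flat_data
    .rename(columns={c: c.split('.')[-1] for c in flat_data.columns if '.' in c})
    .drop(columns=['start', 'end'])
)
flat_data.to_csv('out.csv', index=False)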

Try the below (not using any external lib, just core Python):
import csv

data = {
    "paging": {"start": 0, "count": 10, "links": []},
    "elements": [
        {
            "followerGains": {"organicFollowerGain": 2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634169600000, "end": 1634256000000}
        },
        {
            "followerGains": {"organicFollowerGain": -1, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634256000000, "end": 1634342400000}
        },
        {
            "followerGains": {"organicFollowerGain": -2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634342400000, "end": 1634428800000}
        },
        {
            "followerGains": {"organicFollowerGain": 0, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634428800000, "end": 1634515200000}
        }
    ]
}

holder = []
for e in data['elements']:
    temp = [str(e['followerGains']['organicFollowerGain'])]
    temp.append(str(e['followerGains']['paidFollowerGain']))
    temp.append(e['organizationalEntity'])
    temp.append(str(e['timeRange']['start']))
    temp.append(str(e['timeRange']['end']))
    holder.append(temp)

# newline='' avoids blank lines between rows on Windows when using the csv module
with open('out.csv', 'w', newline='') as f:
    f.write('organicFollowerGain,paidFollowerGain,organizationalEntity,start,end\n')
    writer = csv.writer(f)
    writer.writerows(holder)
out.csv
organicFollowerGain,paidFollowerGain,organizationalEntity,start,end
2,0,urn:li:organization:28849398,1634169600000,1634256000000
-1,0,urn:li:organization:28849398,1634256000000,1634342400000
-2,0,urn:li:organization:28849398,1634342400000,1634428800000
0,0,urn:li:organization:28849398,1634428800000,1634515200000
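A slightly different variant of the same idea, a sketch rather than part of the original answer: csv.DictWriter lets the header and the row values come from one field list, so the manual str() conversions are unnecessary (it reuses the data dict defined above):

import csv

fields = ['organicFollowerGain', 'paidFollowerGain', 'organizationalEntity', 'start', 'end']

with open('out.csv', 'w', newline='') as f:
    writer = csv.DictWriter(f, fieldnames=fields)
    writer.writeheader()
    for e in data['elements']:
        writer.writerow({
            'organicFollowerGain': e['followerGains']['organicFollowerGain'],
            'paidFollowerGain': e['followerGains']['paidFollowerGain'],
            'organizationalEntity': e['organizationalEntity'],
            'start': e['timeRange']['start'],
            'end': e['timeRange']['end'],
        })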

For anyone wondering how to do this, thanks to @Waylan and @balderman. The answer is as follows:
from pandas.io.json import json_normalize

data = {
    "paging": {"start": 0, "count": 10, "links": []},
    "elements": [
        {
            "followerGains": {"organicFollowerGain": 2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634169600000, "end": 1634256000000}
        },
        {
            "followerGains": {"organicFollowerGain": -1, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634256000000, "end": 1634342400000}
        },
        {
            "followerGains": {"organicFollowerGain": -2, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634342400000, "end": 1634428800000}
        },
        {
            "followerGains": {"organicFollowerGain": 0, "paidFollowerGain": 0},
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634428800000, "end": 1634515200000}
        }
    ]
}

flat_data = json_normalize(data['elements'])
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
Hope this helps someone down the road! Cheers!

Related

How to combine jsons?

So say I have a json with the following structure:
{
"json_1": {
"1": {
"banana": 0,
"corn": 5,
"apple": 5
},
"2": {
"melon": 10
},
"3": {
"onion": 9,
"garlic": 4
}
}
}
but I also have another json with the same structure but a little different data:
{
"json_2": {
"1": {
"banana": 2,
"corn": 3
},
"2": {
"melon": 1,
"watermelon": 5
},
"3": {
"onion": 4,
"garlic": 1
}
}
}
What's a fast algorithm to combine these two JSONs into one, so that for each number I have the json_1 amount and the json_2 amount for every fruit? If one JSON doesn't have a fruit that the other one has, they should not be combined:
{
"combined": {
"1": {
"banana": {
"json_1": 0,
"json_2": 2
},
"corn": {
"json_1": 5,
"json_2": 3
}
},
"2": {
"melon": {
"json_1": 10,
"json_2": 1
}
},
"3": {
"onion": {
"json_1": 9,
"json_2": 4
},
"garlic": {
"json_1": 4,
"json_2": 1
}
}
}
}
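No answer is attached to this related question, but one straightforward approach, shown here only as a sketch assuming both payloads are already loaded as dicts named a and b, is to intersect the keys of each numbered group:

def combine(a, b):
    # a == {"json_1": {...}}, b == {"json_2": {...}} as in the question
    (name_a, groups_a), = a.items()
    (name_b, groups_b), = b.items()
    combined = {}
    for group, items_a in groups_a.items():
        items_b = groups_b.get(group, {})
        # keep only the fruits present in both groups
        common = items_a.keys() & items_b.keys()
        if common:
            combined[group] = {
                fruit: {name_a: items_a[fruit], name_b: items_b[fruit]}
                for fruit in common
            }
    return {"combined": combined}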

Get fields from a JSON file with Python

I have this json file loaded in Python with json.loads('myfile.json'):
[
{
"cart": {
"items": {
"3154ba405e5c5a22bbdf9bf1": {
"item": {
"_id": "3154ba405e5c5a22bbdf9bf1",
"title": "Drink alla cannella",
"price": 5.65,
"__v": 0
},
"qty": 1,
"price": 5.65
}
},
"totalQty": 1,
"totalPrice": 5.65
}
},
{
"cart": {
"items": {
"6214ba405e4c5a31bbdf9ad7": {
"item": {
"_id": "6214ba405e4c5a31bbdf9ad7",
"title": "Drink alla menta",
"price": 5.65,
"__v": 0
},
"qty": 2,
"price": 11.3
}
},
"totalQty": 2,
"totalPrice": 11.3
}
}
]
How can I access both the totalQty and totalPrice fields at the same time and sum them?
How can I access both title fields to print them?
Let's assume that you have the JSON data available as a string then:
import json

jdata = '''
[
{
"cart": {
"items": {
"3154ba405e5c5a22bbdf9bf1": {
"item": {
"_id": "3154ba405e5c5a22bbdf9bf1",
"title": "Drink alla cannella",
"price": 5.65,
"__v": 0
},
"qty": 1,
"price": 5.65
}
},
"totalQty": 1,
"totalPrice": 5.65
}
},
{
"cart": {
"items": {
"6214ba405e4c5a31bbdf9ad7": {
"item": {
"_id": "6214ba405e4c5a31bbdf9ad7",
"title": "Drink alla menta",
"price": 5.65,
"__v": 0
},
"qty": 2,
"price": 11.3
}
},
"totalQty": 2,
"totalPrice": 11.3
}
}
]
'''
totalQty = 0
totalPrice = 0
for d in json.loads(jdata):
    c = d['cart']
    totalQty += c['totalQty']
    totalPrice += c['totalPrice']
    for sd in c['items'].values():
        print(sd['item']['title'])
print(f'{totalQty:d}', f'{totalPrice:.2f}')
Output:
Drink alla cannella
Drink alla menta
3 16.95
Note:
I suspect that what you really want to do is multiply those two values
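If that note means multiplying each item's qty by its unit price (an interpretation, not something stated in the answer), a short sketch would be:

grand_total = sum(
    entry['qty'] * entry['item']['price']
    for d in json.loads(jdata)
    for entry in d['cart']['items'].values()
)
print(f'{grand_total:.2f}')  # 16.95 for the sample data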

update_many objects by reference to objects in same documents

Let's say I have a collection like the following. For every document that contains animals.horse, I want to set animals.goat equal to animals.horse (so the horses don't get lonely or outnumbered).
[
{
"_id": 1,
"animals": {
"goat": 1
}
},
{
"_id": 2,
"animals": {
"cow": 1,
"horse": 2,
"goat": 1
}
},
{
"_id": 3,
"animals": {
"horse": 5
}
},
{
"_id": 4,
"animals": {
"cow": 1
}
}
]
In Mongo shell, this works as desired:
db.collection.update(
{"animals.horse": { "$gt": 0 }},
[ { "$set": { "animals.goat": "$animals.horse" } } ],
{ "multi": true }
)
which achieves the desired result:
[
{
"_id": 1,
"animals": {
"goat": 1
}
},
{
"_id": 2,
"animals": {
"cow": 1,
"goat": 2,
"horse": 2
}
},
{
"_id": 3,
"animals": {
"goat": 5,
"horse": 5
}
},
{
"_id": 4,
"animals": {
"cow": 1
}
}
]
However, this doesn't work in pymongo -- the collection is unaltered.
db.collection.update_many(
    filter={'animals.horse': {'$gt': 0}},
    update=[{'$set': {'animals.goat': '$animals.horse'}}],
    upsert=True
)
What am I doing wrong?
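No answer is attached to this one. As a hedged aside, pipeline-style updates (a list passed as the update argument) require MongoDB 4.2 or newer on the server and pymongo 3.9+ on the client; a minimal self-contained repro along these lines can help rule out version or data issues. The database and collection names here are made up:

from pymongo import MongoClient

client = MongoClient()                   # assumes a local MongoDB instance
coll = client['zoo_db']['animals_test']  # hypothetical database/collection names

coll.delete_many({})
coll.insert_many([
    {'_id': 1, 'animals': {'goat': 1}},
    {'_id': 2, 'animals': {'cow': 1, 'horse': 2, 'goat': 1}},
    {'_id': 3, 'animals': {'horse': 5}},
    {'_id': 4, 'animals': {'cow': 1}},
])

result = coll.update_many(
    {'animals.horse': {'$gt': 0}},
    [{'$set': {'animals.goat': '$animals.horse'}}],
)
print(result.modified_count)  # expect 2 on MongoDB 4.2+
for doc in coll.find():
    print(doc)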

How to map the dictionary values to another dictionary

I have the dictionary below:
{
"aggregations": {
"A": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ADL", "doc_count": 1 },
{ "key": "SDD", "doc_count": 1 },
{ "key": "JJD", "doc_count": 1 }
]
},
"B": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "ABC", "doc_count": 1 },
{ "key": "CDE", "doc_count": 1 },
{ "key": "FGH", "doc_count": 1 }
]
},
"C": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [
{ "key": "XYX", "doc_count": 1 },
{ "key": "NXS", "doc_count": 1 }
]
}
}
}
aggregations keys become aggregationfilters.fieldName
aggregations.buckets.key becomes aggregationfilters.values.title
aggregationfilters.values.paragraph is null every time
aggregations.buckets.doc_count becomes aggregationfilters.values.count
Basically I need to extract the aggregations keys and bucket values and put them into a different dictionary.
I need to write a general code structure to do that; I cannot just .pop()/rename keys in the dictionary.
My expected output:
{
"aggregationfilters": [
{
"name": "ABC",
"fieldName": "A",
"values": [
{ "title": "ADL", "paragraph": null, "count": 1 },
{ "title": "SDD", "paragraph": null, "count": 1 },
{ "title": "JJD", "paragraph": null, "count": 1 }
]
}, {
"name": "CDE",
"fieldName": "B",
"values": [
{ "title": "ABC", "paragraph": null, "count": 1 },
{ "title": "CDE", "paragraph": null, "count": 1 },
{ "title": "FGH", "paragraph": null, "count": 1 }
]
}, {
"name": "FGH",
"fieldName": "C",
"values": [
{ "title": "XYX", "paragraph": null, "count": 1 },
{ "title": "NXS", "paragraph": null, "count": 1 }
]
}
]
}
Well, this works, but even with my best effort this still doesn't look that clean.
import json

source = {
    "aggregations": {
        "A": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "ADL", "doc_count": 1},
                {"key": "SDD", "doc_count": 1},
                {"key": "JJD", "doc_count": 1},
            ],
        },
        "B": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [
                {"key": "ABC", "doc_count": 1},
                {"key": "CDE", "doc_count": 1},
                {"key": "FGH", "doc_count": 1},
            ],
        },
        "C": {
            "doc_count_error_upper_bound": 0,
            "sum_other_doc_count": 0,
            "buckets": [{"key": "XYX", "doc_count": 1}, {"key": "NXS", "doc_count": 1}],
        },
    }
}

convert_map = {
    "buckets": "values",
    "doc_count": "count",
    "key": "title",
}
remove_map = {"sum_other_doc_count", "doc_count_error_upper_bound"}
add_map = {"name": "Changed VAL_", "fieldName": "VAL_"}

def converting_generator(
    source_: dict, convert_map_: dict, remove_map_: set, add_map_: dict
):
    working_dict = {k: v for k, v in source_.items()}
    variable_identifier = "VAL_"
    for key, inner_dic in working_dict.items():
        inner_dic: dict
        for rm_key in remove_map_:
            try:
                inner_dic.pop(rm_key)
            except KeyError:
                pass
        for add_key, add_val in add_map_.items():
            inner_dic[add_key] = add_val.replace(variable_identifier, key)
        dumped = json.dumps(inner_dic, indent=2)
        for original, target in convert_map_.items():
            dumped = dumped.replace(original, target)
        yield json.loads(dumped)

converted = {
    "aggregation_filters": list(
        converting_generator(source["aggregations"], convert_map, remove_map, add_map)
    )
}

for inner_dict in converted["aggregation_filters"]:
    for even_inner_dict in inner_dict["values"]:
        even_inner_dict["paragraph"] = None

print(json.dumps(converted, indent=2))
Output:
{
"aggregation_filters": [
{
"values": [
{
"title": "ADL",
"count": 1,
"paragraph": null
},
{
"title": "SDD",
"count": 1,
"paragraph": null
},
{
"title": "JJD",
"count": 1,
"paragraph": null
}
],
"name": "Changed A",
"fieldName": "A"
},
{
"values": [
{
"title": "ABC",
"count": 1,
"paragraph": null
},
{
"title": "CDE",
"count": 1,
"paragraph": null
},
{
"title": "FGH",
"count": 1,
"paragraph": null
}
],
"name": "Changed B",
"fieldName": "B"
},
{
"values": [
{
"title": "XYX",
"count": 1,
"paragraph": null
},
{
"title": "NXS",
"count": 1,
"paragraph": null
}
],
"name": "Changed C",
"fieldName": "C"
}
]
}
Always show your code; ideally a working attempt, to show that you've put at least that much effort into the problem. I don't mind, since this feels like puzzle solving, but others may.
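As a hedged aside that is not part of the original answer: the same mapping can also be built directly, without the json.dumps/str.replace round trip, by constructing the target dictionaries explicitly. The question does not say where the "name" field comes from, so it is left as a placeholder here:

def build_aggregation_filters(source):
    result = []
    for field_name, agg in source["aggregations"].items():
        result.append({
            "name": field_name,  # placeholder: the source of "name" is not specified in the question
            "fieldName": field_name,
            "values": [
                {"title": b["key"], "paragraph": None, "count": b["doc_count"]}
                for b in agg["buckets"]
            ],
        })
    return {"aggregationfilters": result}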

How to aggregate data with date range?

Hello, I have the following problem: whenever I aggregate data, the aggregations, and to be more exact the date_histogram, are always different. The histogram starts at a pretty much random date.
I am using elasticpy and my query looks like this before executing. Note that I am using Python datetime objects to get real results; I had some problems with other formats.
{
"query": {
"bool": {
"filter": [
{
"range": {
"original_date": {
"gte": datetime.datetime(2020, 2, 13, 0, 0),
"lte": datetime.datetime(2020, 2, 15, 23, 0),
}
}
}
],
"must": [
{
"query_string": {
"query": "whatever string"
}
}
],
}
},
"aggs": {
"docs_histogram": {
"date_histogram": {
"field": "original_date",
"interval": "hour",
"time_zone": "EET",
},
... (other aggs)
},
},
}
The date histogram should cover this range: 2020-02-13 00:00:00 - 2020-02-15 23:00:00. But look at the output's start and end: it starts one day later and ends the same day at 18:00.
"buckets": [
{
"key_as_string": "2020-02-14T00:00:00.000+02:00",
"key": 1581631200000,
"doc_count": 1,
"source_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{"key": "WhateverKey", "doc_count": 1}],
},
},
...
{
"key_as_string": "2020-02-14T18:00:00.000+02:00",
"key": 1581696000000,
"doc_count": 1,
"source_name": {
"doc_count_error_upper_bound": 0,
"sum_other_doc_count": 0,
"buckets": [{"key": "WhateverKey2", "doc_count": 1}],
},
},
]
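No answer is attached here, but a date_histogram normally only produces buckets between the first and last matching documents, not across the whole query range. If the goal is buckets covering the full requested range, Elasticsearch's extended_bounds (together with min_doc_count: 0) is the usual lever; a hedged sketch of how the aggregation part of the query dict might look:

import datetime

aggs = {
    "docs_histogram": {
        "date_histogram": {
            "field": "original_date",
            "interval": "hour",
            "time_zone": "EET",
            # min_doc_count=0 keeps empty buckets; extended_bounds forces the
            # histogram to span the requested range even where there are no docs
            "min_doc_count": 0,
            "extended_bounds": {
                "min": datetime.datetime(2020, 2, 13, 0, 0),
                "max": datetime.datetime(2020, 2, 15, 23, 0),
            },
        },
        # ... (other aggs)
    },
}

Whether datetime objects are accepted here depends on how elasticpy serializes them; epoch milliseconds or date strings are the safe choice for extended_bounds.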
