How to combine jsons? - python

So say I have a json with the following structure:
{
"json_1": {
"1": {
"banana": 0,
"corn": 5,
"apple": 5
},
"2": {
"melon": 10
},
"3": {
"onion": 9,
"garlic": 4
}
}
}
but I also have another json with the same structure but a little different data:
{
"json_2": {
"1": {
"banana": 2,
"corn": 3
},
"2": {
"melon": 1,
"watermelon": 5
},
"3": {
"onion": 4,
"garlic": 1
}
}
}
What's a fast algorithm to combine these two JSONs into one, so that for each number I would have the json_1 amount and the json_2 amount for every fruit — and if one JSON has a fruit that the other one doesn't, that fruit is left out rather than combined:
{
"combined": {
"1": {
"banana": {
"json_1": 0,
"json_2": 2
},
"corn": {
"json_1": 5,
"json_2": 3
}
},
"2": {
"melon": {
"json_1": 10,
"json_2": 1
}
},
"3": {
"onion": {
"json_1": 9,
"json_2": 4
},
"garlic": {
"json_1": 4,
"json_2": 1
}
}
}
}

Related

Returning data that is not in ElasticSearch as 0 in doc_count

I am filtering in ElasticSearch. I want doc_count to return 0 on dates with no data, but it doesn't print those dates at all — only dates with data are returned to me. Do you know how I can do that? Here is the Python output:
0 NaN
1 NaN
2 NaN
3 NaN
4 NaN
...
33479 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33480 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33481 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33482 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
33483 {'date': '2022-04-13T08:08:00.000Z', 'value': 7}
And here is my ElasticSearch filter:
"from": 0,
"size": 0,
"query": {
"bool": {
"must":
[
{
"range": {
"#timestamp": {
"gte": "now-1M",
"lt": "now"
}
}
}
]
}
},
"aggs": {
"continent": {
"terms": {
"field": "source.geo.continent_name.keyword"
},
"aggs": {
"_source": {
"date_histogram": {
"field": "#timestamp", "interval": "8m"
}}}}}}
You need to set min_doc_count value to 0 for aggregation where you want result with zero doc_count.
{
"from": 0,
"size": 0,
"query": {
"bool": {
"must": [
{
"range": {
"#timestamp": {
"gte": "now-1M",
"lt": "now"
}
}
}
]
}
},
"aggs": {
"continent": {
"terms": {
"field": "source.geo.continent_name.keyword",
"min_doc_count": 0
},
"aggs": {
"_source": {
"date_histogram": {
"field": "#timestamp",
"interval": "8m",
"min_doc_count": 0
}
}
}
}
}
}

Get fields from a JSON file with Python

I have this json file loaded in Python with json.loads('myfile.json'):
[
{
"cart": {
"items": {
"3154ba405e5c5a22bbdf9bf1": {
"item": {
"_id": "3154ba405e5c5a22bbdf9bf1",
"title": "Drink alla cannella",
"price": 5.65,
"__v": 0
},
"qty": 1,
"price": 5.65
}
},
"totalQty": 1,
"totalPrice": 5.65
}
},
{
"cart": {
"items": {
"6214ba405e4c5a31bbdf9ad7": {
"item": {
"_id": "6214ba405e4c5a31bbdf9ad7",
"title": "Drink alla menta",
"price": 5.65,
"__v": 0
},
"qty": 2,
"price": 11.3
}
},
"totalQty": 2,
"totalPrice": 11.3
}
}
]
How can I access both the totalQty and totalPrice fields at the same time and sum them?
How can I access both title fields to print them?
Let's assume that you have the JSON data available as a string then:
jdata = '''
[
{
"cart": {
"items": {
"3154ba405e5c5a22bbdf9bf1": {
"item": {
"_id": "3154ba405e5c5a22bbdf9bf1",
"title": "Drink alla cannella",
"price": 5.65,
"__v": 0
},
"qty": 1,
"price": 5.65
}
},
"totalQty": 1,
"totalPrice": 5.65
}
},
{
"cart": {
"items": {
"6214ba405e4c5a31bbdf9ad7": {
"item": {
"_id": "6214ba405e4c5a31bbdf9ad7",
"title": "Drink alla menta",
"price": 5.65,
"__v": 0
},
"qty": 2,
"price": 11.3
}
},
"totalQty": 2,
"totalPrice": 11.3
}
}
]
'''
# Accumulate the per-cart totals while printing every item title as we go.
totalQty = 0
totalPrice = 0
for entry in json.loads(jdata):
    cart = entry['cart']
    totalQty += cart['totalQty']
    totalPrice += cart['totalPrice']
    for line_item in cart['items'].values():
        print(line_item['item']['title'])
print(f'{totalQty:d}', f'{totalPrice:.2f}')
Output:
3 16.95
Note:
I suspect that what you really want to do is multiply those two values

Python Flatten Multilevel JSON (LinkedIn API) to CSV

I am trying to de-nest (flatten) the JSON below in Python to create a CSV table — can somebody help?
Input JSON
{
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
},
I tried the below code but it flattens everything into a single line.
I read in another thread that using json_normalize() will structure data in columns. But can someone please tell me how to do it for this case?
The code I used is as follows
Python code
import json
import pandas as pd
from pandas.io.json import json_normalize
data = json.load(open('C:/Users/Muj/Downloads/Linkedin data/follower_statistics_per_day.json'))
def flatten_json(y):
    """Flatten arbitrarily nested dicts/lists into a single-level dict.

    Nested keys are joined with '_'; list elements contribute their index
    as a key segment, e.g. {'a': {'b': 1}, 'c': [5]} -> {'a_b': 1, 'c_0': 5}.
    Scalars are stored under the accumulated path with the trailing '_'
    stripped.
    """
    out = {}

    def flatten(x, name=''):
        # isinstance is the idiomatic type test (also accepts subclasses,
        # unlike the original `type(x) is dict`).
        if isinstance(x, dict):
            for key in x:
                flatten(x[key], name + key + '_')
        elif isinstance(x, list):
            # enumerate replaces the original hand-rolled counter.
            for i, item in enumerate(x):
                flatten(item, name + str(i) + '_')
        else:
            # name always ends with '_' here (or is '' for a bare scalar).
            out[name[:-1]] = x

    flatten(y)
    return out
fd=flatten_json(data)
flat_data=json_normalize(fd)
flat_data.to_csv('C:/Users/Muj/Downloads/Linkedin data/test1.csv', index = False)
Can someone please help
The desired output is as follows -
organicFollowerGain
paidFollowerGain
organizationalEntity
2
0
urn:li:organization:28849398
-1
0
urn:li:organization:28849398
There is no need for your flatten_json function. Just pass the elements portion directly to json_nomalize
flat_data = json_normalize(data['elements'])
That returns
organizationalEntity,followerGains.organicFollowerGain,followerGains.paidFollowerGain,timeRange.start,timeRange.end
urn:li:organization:28849398,2,0,1634169600000,1634256000000
urn:li:organization:28849398,-1,0,1634256000000,1634342400000
urn:li:organization:28849398,-2,0,1634342400000,1634428800000
urn:li:organization:28849398,0,0,1634428800000,1634515200000
Then you just need to rename the column headers and remove any columns you don't want.
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
And that returns
organizationalEntity,organicFollowerGain,paidFollowerGain
urn:li:organization:28849398,2,0
urn:li:organization:28849398,-1,0
urn:li:organization:28849398,-2,0
urn:li:organization:28849398,0,0
Putting that all together:
from pandas.io.json import json_normalize
data = {
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
}
]
}
flat_data = json_normalize(data['elements'])
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
Try the below (Not using any external lib - just core python)
import csv

# Source payload: LinkedIn follower statistics, one entry per day under
# 'elements' (the 'paging' section is ignored for the CSV export).
data = {
    "paging": {"start": 0, "count": 10, "links": []},
    "elements": [
        {
            "followerGains": {
                "organicFollowerGain": 2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634169600000, "end": 1634256000000}
        },
        {
            "followerGains": {
                "organicFollowerGain": -1,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634256000000, "end": 1634342400000}
        },
        {
            "followerGains": {
                "organicFollowerGain": -2,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634342400000, "end": 1634428800000}
        },
        {
            "followerGains": {
                "organicFollowerGain": 0,
                "paidFollowerGain": 0
            },
            "organizationalEntity": "urn:li:organization:28849398",
            "timeRange": {"start": 1634428800000, "end": 1634515200000}
        },
    ]
}

# One CSV row per element, flattening the nested followerGains/timeRange dicts.
holder = []
for element in data['elements']:
    gains = element['followerGains']
    window = element['timeRange']
    holder.append([
        str(gains['organicFollowerGain']),
        str(gains['paidFollowerGain']),
        element['organizationalEntity'],
        str(window['start']),
        str(window['end']),
    ])

# newline='' is required when handing a file to csv.writer; without it the
# module's own line endings get translated and Windows gets blank rows.
with open('out.csv', 'w', newline='') as f:
    f.write('organicFollowerGain,paidFollowerGain,organizationalEntity,start,end\n')
    writer = csv.writer(f)
    writer.writerows(holder)
out.csv
organicFollowerGain,paidFollowerGain,organizationalEntity,start,end
2,0,urn:li:organization:28849398,1634169600000,1634256000000
-1,0,urn:li:organization:28849398,1634256000000,1634342400000
-2,0,urn:li:organization:28849398,1634342400000,1634428800000
0,0,urn:li:organization:28849398,1634428800000,1634515200000
For anyone wondering how to do this, thanks to @Waylan and @balderman, the answer is as follows -
from pandas.io.json import json_normalize
data = {
"paging": { "start": 0, "count": 10, "links": [] },
"elements": [
{
"followerGains": {
"organicFollowerGain": 2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634169600000, "end": 1634256000000 }
},
{
"followerGains": {
"organicFollowerGain": -1,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634256000000, "end": 1634342400000 }
},
{
"followerGains": {
"organicFollowerGain": -2,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634342400000, "end": 1634428800000 }
},
{
"followerGains": {
"organicFollowerGain": 0,
"paidFollowerGain": 0
},
"organizationalEntity": "urn:li:organization:28849398",
"timeRange": { "start": 1634428800000, "end": 1634515200000 }
}
]
}
flat_data = json_normalize(data['elements'])
# Rename columns to only use the final section in dot name
flat_data.rename(dict((x, x.split('.')[-1]) for x in flat_data.columns if '.' in x), axis=1, inplace=True)
# Drop start and end columns
flat_data.drop(['start', 'end'], axis=1, inplace=True)
flat_data.to_csv('out.csv', index=False)
Hope this helps someone down the road! Cheers!

update_many objects by reference to objects in same documents

Let's say I have a collection like the following. For every document that contains animals.horse, I want to set animals.goat equal to animals.horse (so the horses don't get lonely or outnumbered).
[
{
"_id": 1,
"animals": {
"goat": 1
}
},
{
"_id": 2,
"animals": {
"cow": 1,
"horse": 2,
"goat": 1
}
},
{
"_id": 3,
"animals": {
"horse": 5
}
},
{
"_id": 4,
"animals": {
"cow": 1
}
}
]
In Mongo shell, this works as desired:
db.collection.update(
{"animals.horse": { "$gt": 0 }},
[ { "$set": { "animals.goat": "$animals.horse" } } ],
{ "multi": true }
)
which achieves the desired result:
[
{
"_id": 1,
"animals": {
"goat": 1
}
},
{
"_id": 2,
"animals": {
"cow": 1,
"goat": 2,
"horse": 2
}
},
{
"_id": 3,
"animals": {
"goat": 5,
"horse": 5
}
},
{
"_id": 4,
"animals": {
"cow": 1
}
}
]
However, this doesn't work in pymongo -- the collection is unaltered.
# Copy animals.horse into animals.goat for every document that has horses,
# using an aggregation-pipeline update (the list-valued `update` argument).
# NOTE(review): pipeline-form updates require MongoDB server 4.2+ and
# PyMongo 3.9+ — confirm both versions; an older stack is the most likely
# reason the collection appears unaltered.
# NOTE(review): the shell example used multi:true without upsert; here
# upsert=True is passed instead — verify an upsert is actually intended.
db.collection.update_many( filter = {'animals.horse': {'$gt':0} },
update = [ {'$set': {'animals.goat': '$animals.horse' } } ],
upsert = True
)
What am I doing wrong?

Use python to recurse a flat file and build a hierarchy

I have a flat text file that represents a hierarchy. It looks similar to this:
0 tom (1)
1 janet (8)
2 harry (1)
3 jules (1)
3 jacob (1)
1 mary (13)
2 jeff (1)
3 sam (2)
1 bob (28)
2 dick (1)
I want to read this in and build a nested dictionary (or some kind of data structure) to represent the hierarchy so it is easier to manage but I can't wrap my head around how to iterate and create a data structure. Maybe recursion?
The first number is the level of the hierarchy, the word is the name I want to store and the value in the parenthesis is the quantity that I also want to store.
I'd like to end up with something similar to this:
{
"tom": {
"quantity": 1,
"names": {
"janet": {
"quantity": 8,
"names": {
"harry": {
"quantity": 1,
"names": {
"jules": {
"quantity": 1
},
"jacob": {
"quantity": 1
}
}
}
}
},
"mary": {
"quantity": 13,
"names": {
"jeff": {
"quantity": 1,
"names": {
"sam": {
"quantity": 2
}
}
}
}
},
"bob": {
"quantity": 28,
"names": {
"dick": {
"quantity": 1
}
}
}
}
}
}
You can use recursion:
import re
with open('test_hierarchy.txt') as f:
d = [[int((k:=re.findall('\d+|\w+', i))[0]), k[1], int(k[-1])] for i in f]
def to_tree(data):
    """Build a nested name -> {'quantity': ..., 'names': {...}} dict from
    [level, name, quantity] rows; leaf entries omit the empty 'names' key."""
    if not data:
        return {}
    result = {}
    open_node = None    # (name, quantity) of the level-0 node being collected
    pending = []        # that node's descendant rows, levels shifted down by 1
    for level, name, quantity in data:
        if not level:
            # A new top-level row: flush the previously open node first.
            if open_node is not None:
                result[open_node[0]] = {'quantity': open_node[-1],
                                        'names': to_tree(pending)}
            open_node, pending = (name, quantity), []
        else:
            pending.append([level - 1, name, quantity])
    # Flush the final open node.
    result[open_node[0]] = {'quantity': open_node[-1], 'names': to_tree(pending)}
    # Childless nodes drop their empty 'names' mapping entirely.
    return {name: {'quantity': node['quantity']} if not node['names'] else node
            for name, node in result.items()}
import json
print(json.dumps(to_tree(d), indent=4))
Output:
{
"tom": {
"quantity": 1,
"names": {
"janet": {
"quantity": 8,
"names": {
"harry": {
"quantity": 1,
"names": {
"jules": {
"quantity": 1
},
"jacob": {
"quantity": 1
}
}
}
}
},
"mary": {
"quantity": 13,
"names": {
"jeff": {
"quantity": 1,
"names": {
"sam": {
"quantity": 2
}
}
}
}
},
"bob": {
"quantity": 28,
"names": {
"dick": {
"quantity": 1
}
}
}
}
}
}

Categories

Resources