I'm looking to flatten a JSON hierarchy of unknown structure to a dictionary, capturing the full key hierarchy in the dictionary result to uniquely identify it.
So far I am able to print the key:value pair for the all the parent/child nodes recursively but I am having trouble:
(1) figuring out how to pass the parent hierarchy keys for a recursive (child) execution and then reset it when it exits the child key.
(2) writing to a single dictionary result - when I define the dictionary within the recursive function, I end up creating multiple dictionaries ... Do I need to wrap this function in a master function to avoid this?
Thanks!
# flatten/enumerate example I'm using
with open('_json\\file.json') as f:
data = json.load(f)
def parse_json_response(content):
if len (content.keys()) > 1 :
for key, value in content.items():
if type(value) is dict:
parse_json_response(value)
else:
print('{}:{}'.format(key,value))
else:
print(value)
if __name__ == '__main__':
parse_json_response(data)
# current result as print
id = 12345
firstName = John
lastName = Smith
DOB = 1980-01-01
phone = 123
line1 = Unit 4
line2 = 3 Main st
# desired result to dictionary {}
id = 12345
fields.firstName = John
fields.lastName = Smith
fields.DOB = 1980-01-01
fields.phone = 123
fields.address.residential.line1 = Unit 4
fields.address.residential.line2 = 3 Main st
You can create the flattened dictionary (rather than just print values), by keeping track of the parent and recursing in the correct spot. That might look something like:
d = {
"ID": "12345",
"fields": {
"firstName": "John",
"lastName": "Smith",
"DOB": "1980-01-01",
"phoneLand": "610292659333",
"address": {
"residential": {
"line1": "Unit 4",
"line2": "3 Main st"
}
}
}
}
def flattenDict(d, parent=None):
ret = {}
for k, v in d.items():
if parent:
k = f'{parent}.{k}'
if isinstance(v, dict):
ret.update(flattenDict(v, k))
else:
ret[k] = v
return ret
flat = flattenDict(d)
flat will be:
{'ID': '12345',
'fields.firstName': 'John',
'fields.lastName': 'Smith',
'fields.DOB': '1980-01-01',
'fields.phoneLand': '610292659333',
'fields.address.residential.line1': 'Unit 4',
'fields.address.residential.line2': '3 Main st'}
You can also arrange the output to be a generator that yields tuples. You can then pass this to dict() for the same result:
def flattenDict(d):
for k, v in d.items():
if isinstance(v, dict):
yield from ((f'{k}.{kk}', v) for kk, v in flattenDict(v))
else:
yield (k, v)
dict(flattenDict(d))
Try this below:
test = {
"ID": "12345",
"fields": {
"firstName": "John",
"lastName": "Smith",
"DOB": "1980-01-01",
"phoneLand": "610292659333",
"address": {
"residential": {
"line1": "Unit 4",
"line2": "3 Main st"
}
}
}
}
def func(d, parent=""):
for key, value in d.items():
if isinstance(value, dict):
func(value, parent=parent+key+".")
else:
print(f"{parent+key} = {value}")
func(test)
Result:
ID = 12345
fields.firstName = John
fields.lastName = Smith
fields.DOB = 1980-01-01
fields.phoneLand = 610292659333
fields.address.residential.line1 = Unit 4
fields.address.residential.line2 = 3 Main st
Related
I have a dict list of dicts.
dict_list = [
{"nameClient": "client.name"},
{"emailClient": "client.email"},
{"Document.typeDocument": "client.type_document"},
{"Document.numberDocument": "client.number_document"},
{"Document.docOne.number": "client.docone_number"},
{"Document.docOne.type": "client.docone_type"},
{"Document.docTwo.number": "client.doctwo_number"},
{"Document.docOne.extra.number": "client.docone_extra_number"},
]
I want to create a dict based on key of this dicts and get value from my class based on value of this dicts.
Client Class and Values initialized:
class Client:
def __init__(self, data):
self.name = data['name']
self.email = data['email']
self.type_document = data['type_document']
self.number_document = data['number_document']
self.docone_number = data['docone_number']
self.docone_type = data['docone_type']
self.doctwo_number = data['doctwo_number']
self.docone_extra_number = data['docone_extra_number']
# this will be passed on build_final_dict(FINAL_DICT, i, k, v, client)
client1 = Client({
"name": "Stack",
"email": "xxxxx#xxxx.com",
"type_document": "TPS",
"number_document": "22222222",
"docone_number": "11111111",
"docone_type": "docone type",
"doctwo_number": "doc two number",
"docone_extra_number": "doc extra number",
})
So I started passing to a function the key/value
FINAL_DICT = {}
for i in dict_list:
for k, v in i.items():
build_final_dict(FINAL_DICT, i, k, v, client)
My build_final_dict()
def build_final_dict(FINAL_DICT, i, k, v, client):
if '.' not in k:
FINAL_DICT[k] = getattr(client, v.replace('client.', ''))
else:
subdict = k.split('.')[0] # ex documento
subdict_key = i.items()[0][0]
subdict_key = subdict_key.replace('%s.' % subdict, '')
subdict_value = i.items()[0][1]
subdict_value = subdict_value.replace('client.', '')
if subdict not in FINAL_DICT:
FINAL_DICT[subdict] = dict()
result_value = getattr(client, subdict_value)
if '.' not in subdict_key:
FINAL_DICT[subdict][subdict_key] = result_value
else:
new_subkey = subdict_key.split('.')[0]
new_subvalue = subdict_key.split('.')[1]
if new_subkey not in FINAL_DICT[subdict]:
FINAL_DICT[subdict][new_subkey] = dict()
build_final_dict(FINAL_DICT[subdict][new_subkey], i, new_subvalue, v, client)
Actual Result:
{
"emailClient":"xxxxx#xxxx.com",
"nameClient":"Stack",
"Document":{
"typeDocument":"TPS",
"numberDocument":"22222222",
"docTwo":{
"number":"doc two number"
},
"docOne":{
"type":"docone type",
"extra":"doc extra number", // should continue creating dict within dict...
"number":"11111111"
}
}
}
Most of the dictionary is right. But the "extra" dictionary that I put inside a dictionary (third sub level) did not create a new dictionary and put the final value.
There is the possibility of having infinite sub dictionaries, I need my script to be prepared for that.
Result I expected (based on Client class Data):
{
"emailClient":"xxxxx#xxxx.com",
"nameClient":"Stack",
"Document":{
"typeDocument":"TPS",
"numberDocument":"22222222",
"docTwo":{
"number":"doc two number"
},
"docOne":{
"type":"docone type",
"extra": { "number": "doc extra number" }, // dict, not string
"number":"11111111"
}
}
}
I think, this will help you in transforming the initial JSON to the structure you need. This is in accordance to your original requirement of
I want to create a dict based on key of this dicts and get value from
my class based on value of this dicts.
I would try this in a split approach, to create nested dicts for every dict in your list (if they have nesting in keys), and then merge them together as single unit.
See an example approach here:
from collections.abc import MutableMapping
from functools import reduce
def merge(d1, d2):
# Merge 2 dictionaries -deep.
for k, v in d1.items():
if k in d2:
if all(isinstance(e, MutableMapping) for e in (v, d2[k])):
d2[k] = merge(v, d2[k])
md = d1.copy()
md.update(d2)
return md
def explode_to_dict(key, value):
# Create nested dicts based on the key structure
if "." in key:
p, c = key.rsplit(".", 1)
return explode_to_dict(p, {c: value})
else:
return {key: value}
if __name__ == '__main__':
dict_list = [
{"nameClient": "client.name"},
{"emailClient": "client.email"},
{"Document.typeDocument": "client.type_document"},
{"Document.numberDocument": "client.number_document"},
{"Document.docOne.number": "client.docone_number"},
{"Document.docOne.type": "client.docone_type"},
{"Document.docTwo.number": "client.doctwo_number"},
{"Document.docOne.extra.number": "client.docone_extra_number"},
]
result_dict = {}
result_dict = reduce(merge, [explode_to_dict(*d.popitem()) for d in dict_list])
print(json.dumps(result_dict))
This provides a structure like below(expected):
{
"nameClient": "client.name",
"emailClient": "client.email",
"Document": {
"typeDocument": "client.type_document",
"numberDocument": "client.number_document",
"docOne": {
"number": "client.docone_number",
"type": "client.docone_type",
"extra": {
"number": "client.docone_extra_number"
}
},
"docTwo": {
"number": "client.doctwo_number"
}
}
}
I suppose, you can proceed with your class manipulation from here!
I have created a var that is equal to t.json. The JSON file is a follows:
{
"groups": {
"customerduy": {
"nonprod": {
"name": "customerduynonprod",
"id": "529646781943",
"owner": "cloudops#coerce.com",
"manager_email": ""
},
"prod": {
"name": "phishing_duyaccountprod",
"id": "241683454720",
"owner": "cloudops#coerce.com",
"manager_email": ""
}
},
"customerduyprod": {
"nonprod": {
"name": "phishing_duyaccountnonprod",
"id": "638968214142",
"owner": "cloudops#coerce.com",
"manager_email": ""
}
},
"ciasuppliergenius": {
"prod": {
"name": "ciasuppliergeniusprod",
"id": "220753788760",
"owner": "cia_developers#coerce.com",
"manager_email": "jarks#coerce.com"
}
}
}
}
my goal was to pars this JSON file and get value for "owner" and output it to a new var. Example below:
t.json = group_map
group_id_aws = group(
group.upper(),
"accounts",
template,
owner = group_map['groups']['prod'],
manager_description = "Groups for teams to access their product accounts.",
The error I keep getting is: KeyError: 'prod'
Owner occurs 4 times, so here is how to get all of them.
import json
# read the json
with open("C:\\test\\test.json") as f:
data = json.load(f)
# get all 4 occurances
owner_1 = data['groups']['customerduy']['nonprod']['owner']
owner_2 = data['groups']['customerduy']['prod']['owner']
owner_3 = data['groups']['customerduyprod']['nonprod']['owner']
owner_4 = data['groups']['ciasuppliergenius']['prod']['owner']
# print results
print(owner_1)
print(owner_2)
print(owner_3)
print(owner_4)
the result:
cloudops#coerce.com
cloudops#coerce.com
cloudops#coerce.com
cia_developers#coerce.com
You get a key error since the key 'prod' is not in 'groups'
What you have is
group_map['groups']['customerduy']['prod']
group_map['groups']['ciasuppliergenius']['prod']
So you will have to extract the 'owner' from each element in the tree:
def s(d,t):
for k,v in d.items():
if t == k:
yield v
try:
for i in s(v,t):
yield i
except:
pass
print(','.join(s(j,'owner')))
If your JSON is loaded in variable data, you can use a recursive function
that deals with the two containers types (dict and list) that can occur
in a JSON file, recursively:
def find_all_values_for_key(d, key, result):
if isinstance(d, dict):
if key in d:
result.append(d[key])
return
for k, v in d.items():
find_all_values_for_key(v, key, result)
elif isinstance(d, list):
for elem in d:
find_all_values_for_key(elem, key, result)
owners = []
find_all_values_for_key(data, 'owner', owners)
print(f'{owners=}')
which gives:
owners=['cloudops#coerce.com', 'cloudops#coerce.com', 'cloudops#coerce.com', 'cia_developers#coerce.com']
This way you don't have to bother with the names of intermediate keys, or in general the structure of your JSON file.
You don't have any lists in your example, but it is trivial to recurse through
them to any dict with an owner key that might "lurk" somewhere nested
under a a list element, so it is better to deal with potential future changes
to the JSON.
Given a list of dictionaries:
data = {
"data": [
{
"categoryOptionCombo": {
"id": "A"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "B"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "C"
},
"dataElement": {
"id": "456"
}
}
]
}
I would like to display the dataElement where the count of distinct categoryOptionCombo is larger than 1.
e.g. the result of the function would be an iterable of IDs:
[123]
because the dataElement with id 123 has two different categoryOptionCombos.
tracker = {}
for d in data['data']:
data_element = d['dataElement']['id']
coc = d['categoryOptionCombo']['id']
if data_element not in tracker:
tracker[data_element] = set()
tracker[data_element].add(coc)
too_many = [key for key,value in tracker.items() if len(value) > 1]
How can I iterate the list of dictionaries preferably with a comprehension? This solution above is not pythonic.
One approach:
import collections
counts = collections.defaultdict(set)
for d in data["data"]:
counts[d["dataElement"]["id"]].add(d["categoryOptionCombo"]["id"])
res = [k for k, v in counts.items() if len(v) > 1]
print(res)
Output
['123']
This approach creates a dictionary mapping dataElements to the different types of categoryOptionCombo:
defaultdict(<class 'set'>, {'123': {'B', 'A'}, '456': {'C'}})
Almost a one-liner:
counts = collections.Counter( d['dataElement']['id'] for d in data['data'] )
print( counts )
Output:
Counter({'123': 2, '456': 1})
No need for sets, you can just remember each data element's first coc or mark it as having 'multiple'.
tracker = {}
for d in data['data']:
data_element = d['dataElement']['id']
coc = d['categoryOptionCombo']['id']
if tracker.setdefault(data_element, coc) != coc:
tracker[data_element] = 'multiple'
too_many = [key for key,value in tracker.items() if value == 'multiple']
(If the string 'multiple' can be a coc id, then use multiple = object() and compare with is).
I have a json file for which I want to remove the $oid and $date and replace the keys like in the example below:
import json
def key_replacer(dictionary):
new_dictionary = {}
for k,v in dictionary.items():
if k in ('$oid', '$date'):
return v
if isinstance(v, dict):
v = key_replacer(v)
new_dictionary[k] = v
return new_dictionary
data = """{
"_id": {
"$oid": "5e7511c45cb29ef48b8cfcff"
},
"description": "some text",
"startDate": {
"$date": "5e7511c45cb29ef48b8cfcff"
},
"completionDate": {
"$date": "2021-01-05T14:59:58.046Z"
},
"videos":[{"$oid":"5ecf6cc19ad2a4dfea993fed"}]
}"""
info = json.loads(data)
refined = key_replacer(info)
new_data = json.dumps(refined)
print(new_data)
Output: {"_id": "5e7511c45cb29ef48b8cfcff", "description": "some text", "startDate": "5e7511c45cb29ef48b8cfcff", "completionDate": "2021-01-05T14:59:58.046Z", "videos": [{"$oid": "5ecf6cc19ad2a4dfea993fed"}]}. It works the way I want until "videos". How could I remove the $ sign for the "videos" part and replace the key like it happens in the other cases? It doesn't get into the contents of the list and I assume this is the cause.
Your original function doesn't account for the case where the dictionary value is a list. By accommodating for that, you will get the desired result
def key_replacer(dictionary):
new_dictionary = {}
for k, v in dictionary.items():
if k in ('$oid', '$date'):
return v
elif isinstance(v, dict):
v = key_replacer(v)
elif isinstance(v, list):
tmp = []
for itm in v:
tmp.append(key_replacer(itm))
v = tmp
new_dictionary[k] = v
return new_dictionary
NB: If the items in the list is not dictionaries the code will break, so be mindful of that
I have following JSON, returned from a REST service, where I want to generate a unique names for each value by combining parent keys. For example. name+phone+address+city+name , name+phone+address+city+population+skilled+male and so on.
{
"name": "name",
"phone": "343444444",
"address": {
"lat": 23.444,
"lng": 34.3322,
"city":{
"name": "city name",
"population": {
"skilled": {
"male": 2,
"female": 4
},
"uneducated": {
"male": 20,
"femail": 4
}
}
}
},
"email": "email",
"education": "phd"
}
I want to combine all key names starting from the parent of the JSON tree.
Here is what I am doing
class TestJson
def walk_through(self, json_object):
for k, v in json_object.items():
self.x_path = self.x_path + k
if type(v) is dict:
self.walk_through(v)
else:
print(self.x_path)
self.x_path = ""
This code is printing keys but only starting from the current parent node. I want to combine all keys up to root of the json.
If you ignore the name and phone keys, since they are not ancestors of city name or skilled male and the order of keys is not guaranteed, you can recursively build a flattened dict.
def walk_through(json_object):
d = {}
for k, v in json_object.items():
if isinstance(v, dict):
v = walk_through(v)
for vk, vv in v.items():
d["%s+%s" % (k, vk)] = vv
else:
d[k] = v
return d
print(json.dumps(walk_through(json_object), indent=2))
This prints:
{
"address+city+population+skilled+male": 2,
"name": "name",
"address+lng": 34.3322,
"address+city+name": "city name",
"address+lat": 23.444,
"address+city+population+uneducated+male": 20,
"phone": "343444444",
"address+city+population+uneducated+femail": 4,
"education": "phd",
"email": "email",
"address+city+population+skilled+female": 4
}
Note: this ignores lists an will not find dicts inside them.
If you want to print all keys of your python dict you can do the following:
def print_keys(d):
for key, value in d.iteritems():
print key,
if isinstance(value, dict):
print_keys(value)