Search for key inside nested lists inside nested dictionaries - python

The original post is Go through json line by line including unkown nested arrays and objects
I am trying to search for a specific value in all nested lists and nested dictionaries inside a JSON dictionary. The structure of the dictionary is not always known. The nested dictionary can have a nested list.
The key I'm looking for is Date and should not have String. Example key = Date is True but if key = DateString condition is not met.
Code:
def ConvertTimestamp(my_list_of_dicts: list):
for e in my_list_of_dicts:
# check top level keys whose values are not a list
keys_with_date = [k for k, v in e.items() if 'Date' in k and type(v) and 'String' not in k != list]
for k1 in keys_with_date:
e[k1] = 'found'
# check top level keys whose values are a list
keys_with_lists = [k for k, v in e.items() if type(v) == list]
for k1 in keys_with_lists:
for i, d in enumerate(e[k1]):
for k2, v in d.items():
if 'Date' in k2 and 'String' not in k2:
e[k1][i][k2] = 'found'
return my_list_of_dicts
Data
test_data = [{
"PurchaseOrderID": "aaff50c2-05d5-4943-9a37-421d1b326dc3",
"PurchaseOrderNumber": "PO-0001",
"DateString": "2020-06-04T00:00:00",
"Date": "2020-06-04T02:00:00.000000",
"DeliveryDateString": "2020-06-11T00:00:00",
"DeliveryDate": "2020-06-11T02:00:00.000000",
"DeliveryAddress": "",
"AttentionTo": "",
"Telephone": "",
"DeliveryInstructions": "",
"HasErrors": false,
"IsDiscounted": true,
"Reference": "",
"Type": "PURCHASEORDER",
"CurrencyRate": 1.0,
"CurrencyCode": "EUR",
"Contact": {
"ContactID": "31dcd998-026662967",
"ContactStatus": "ACTIVE",
"Name": "Test",
"FirstName": "",
"LastName": "",
"Addresses": [],
"Phones": [],
"UpdatedDateUTC": "/Date(1591272554130+0000)/",
"ContactGroups": [],
"DefaultCurrency": "EUR",
"ContactPersons": [],
"HasValidationErrors": false
},
"BrandingThemeID": "86a1c878-7b2ed792b224",
"Status": "DELETED",
"LineAmountTypes": "Exclusive",
"SubTotal": 1000.0,
"TotalTax": 0.0,
"Total": 1000.0,
"UpdatedDateUTC": "2020-06-04T12:14:26.527000",
"HasAttachments": false }]
Result should be
[{
"PurchaseOrderID": "aaff50c2-05d5-4943-9a37-421d1b326dc3",
"PurchaseOrderNumber": "PO-0001",
"DateString": "2020-06-04T00:00:00",
"Date": "2020-06-04T02:00:00.000000",
"DeliveryDateString": "2020-06-11T00:00:00",
"DeliveryDate": "2020-06-11T02:00:00.000000",
"DeliveryAddress": "",
"AttentionTo": "",
"Telephone": "",
"DeliveryInstructions": "",
"HasErrors": false,
"IsDiscounted": true,
"Reference": "",
"Type": "PURCHASEORDER",
"CurrencyRate": 1.0,
"CurrencyCode": "EUR",
"Contact": {
"ContactID": "31dcd998-026662967",
"ContactStatus": "ACTIVE",
"Name": "Test",
"FirstName": "",
"LastName": "",
"Addresses": [],
"Phones": [],
"UpdatedDateUTC": "2020-06-03T09:55:30.000000",
"ContactGroups": [],
"DefaultCurrency": "EUR",
"ContactPersons": [],
"HasValidationErrors": false
},
"BrandingThemeID": "86a1c878-7b2ed792b224",
"Status": "DELETED",
"LineAmountTypes": "Exclusive",
"SubTotal": 1000.0,
"TotalTax": 0.0,
"Total": 1000.0,
"UpdatedDateUTC": "2020-06-04T12:14:26.527000",
"HasAttachments": false }]

Since you don't know what the structure of the dictionaries are, it could be under an arbitrary number of levels.
Also, the expected result didn't really show what you want to do with the dictionary once found, so I've just added those dictionaries to a list.
Recursion helps in such cases.
def search_dict(d, results):
for k,v in d.items():
if 'Date' in k and 'String' not in k:
# Do what you want with `d` here.
# Your "Result should be" didn't really explain the
# found part, but if it gets here it means you found it.
# Appended to results as we want to continue searching for more.
# Appending (k,v,d) where d is the dictionary containing
# this key and value, incase you wanted that too.
# Adjust this accordingly.
results.append((k,v,d))
if isinstance(v, dict):
search_dict(v, results)
if isinstance(v, list):
search_list(v, results)
def search_list(l, results):
for item in l:
if not isinstance(item, dict):
continue # don't care about things that aren't dictionaries
if isinstance(item, list):
search_list(item, results)
if isinstance(item, dict):
search_dict(item, results)
def ConvertTimestamp(my_list_of_dicts: list):
results = []
search_list(my_list_of_dicts, results)
return results

Here is how you can recurse through the object, making the updates.
For sake of generality, the recursive function takes two externally defined callables an addition to the object being recursed:
a "key tester" function that should take a key (string) and return a boolean, which is used to decide which keys are to have their values updated
a "replacer" function that should take a value and return the new value
from datetime import date
from pprint import pprint
from copy import deepcopy
import re
def do_replacements(obj, key_tester, replacer):
"""
recursing through the nested list/dict structure,
and wherever key_tester(key) yields True,
use replacer function to generate the new value
"""
if isinstance(obj, dict):
for k, v in obj.items():
if key_tester(k):
obj[k] = replacer(v)
else:
do_replacements(v, key_tester, replacer)
elif isinstance(obj, list):
for item in obj:
do_replacements(item, key_tester, replacer)
def fix_time(ts):
"""
replace the timestamp if it fits a particular pattern
(based on code in original question)
"""
pattern = '\(|\)'
if not re.search(pattern, ts):
return ts
format = '%Y-%m-%dT%H:%M:%S.%f'
ts_utc = re.split(pattern, ts)[1]
ts_utc = ts_utc[:ts_utc.find("+")]
return date.fromtimestamp(float(ts_utc)/1000).strftime(format)
test_data = [{'PurchaseOrderID': 'aaff50c2-05d5-4943-9a37-421d1b326dc3', 'PurchaseOrderNumber': 'PO-0001', 'DateString': '2020-06-04T00:00:00', 'Date': '2020-06-04T02:00:00.000000', 'DeliveryDateString': '2020-06-11T00:00:00', 'DeliveryDate': '2020-06-11T02:00:00.000000', 'DeliveryAddress': '', 'AttentionTo': '', 'Telephone': '', 'DeliveryInstructions': '', 'HasErrors': False, 'IsDiscounted': True, 'Reference': '', 'Type': 'PURCHASEORDER', 'CurrencyRate': 1.0, 'CurrencyCode': 'EUR', 'Contact': {'ContactID': '31dcd998-026662967', 'ContactStatus': 'ACTIVE', 'Name': 'Test', 'FirstName': '', 'LastName': '', 'Addresses': [], 'Phones': [], 'UpdatedDateUTC': '/Date(1591272554130+0000)/', 'ContactGroups': [], 'DefaultCurrency': 'EUR', 'ContactPersons': [], 'HasValidationErrors': False}, 'BrandingThemeID': '86a1c878-7b2ed792b224', 'Status': 'DELETED', 'LineAmountTypes': 'Exclusive', 'SubTotal': 1000.0, 'TotalTax': 0.0, 'Total': 1000.0, 'UpdatedDateUTC': '2020-06-04T12:14:26.527000', 'HasAttachments': False}]
func = lambda k: "Date" in k and "String" not in k
output = deepcopy(test_data)
do_replacements(output, func, fix_time)
pprint(output)
gives:
[{'AttentionTo': '',
'BrandingThemeID': '86a1c878-7b2ed792b224',
'Contact': {'Addresses': [],
'ContactGroups': [],
'ContactID': '31dcd998-026662967',
'ContactPersons': [],
'ContactStatus': 'ACTIVE',
'DefaultCurrency': 'EUR',
'FirstName': '',
'HasValidationErrors': False,
'LastName': '',
'Name': 'Test',
'Phones': [],
'UpdatedDateUTC': '2020-06-04T00:00:00.000000'},
'CurrencyCode': 'EUR',
'CurrencyRate': 1.0,
'Date': '2020-06-04T02:00:00.000000',
'DateString': '2020-06-04T00:00:00',
'DeliveryAddress': '',
'DeliveryDate': '2020-06-11T02:00:00.000000',
'DeliveryDateString': '2020-06-11T00:00:00',
'DeliveryInstructions': '',
'HasAttachments': False,
'HasErrors': False,
'IsDiscounted': True,
'LineAmountTypes': 'Exclusive',
'PurchaseOrderID': 'aaff50c2-05d5-4943-9a37-421d1b326dc3',
'PurchaseOrderNumber': 'PO-0001',
'Reference': '',
'Status': 'DELETED',
'SubTotal': 1000.0,
'Telephone': '',
'Total': 1000.0,
'TotalTax': 0.0,
'Type': 'PURCHASEORDER',
'UpdatedDateUTC': '2020-06-04T12:14:26.527000'}]
(Note: the output shown here is the pretty-printed python object, rather than JSON, although similar.)

Related

How to get multiple key value pairs from list of JSON dictionaries

I have a JSON file called "hostnames" formatted like below
{
'propertyName': 'www.property1.com',
'propertyVersion': 1,
'etag': 'jbcas6764023nklf78354',
'rules': {
'name': 'default',
'children': [{
'name': 'Route',
'children': [],
'behaviors': [{
'name': 'origin',
'options': {
'originType': 'CUSTOMER',
'hostname': 'www.origin1.com',
and I wanted to get the values of keys "propertyName" and "hostname" and have a new JSON file like below
'properties': [{
'propertyName': 'www.property1.com',
'hostnames': ['www.origin1.com', 'www.origin2.com']
}, {
'propertyName': 'www.property1.com',
'hostnames': ['www.origin1.com', 'www.origin2.com']
}]
my code looks like this
hostnames = result.json()
hostnameslist = [host['hostname'] for host in hostnames['rules']['children']['behaviors']['options']]
print(hostnameslist)
but I'm getting the error
TypeError: list indices must be integers or slices, not str
You are trying to access a list elements with a string index ('behaviors').
Try:
hostnames = result.json()
hostnameslist = []
for child in hostnames['rules']['children']:
for behavior in child['behaviors']:
if behavior['name'] == 'origin':
hostnameslist.append(behavior['options']['hostname'])
properties = [{
'propertyName': hostnames['propertyName'],
'hostnames': hostnameslist
}]
Making an assumption about how the OP's data might be structured.
Recursive navigation of the dictionary to find all/any values associated with a dictionary key of 'hostname' appears to be well-suited here.
Doing it this way obviates the need for knowledge about the depth of the dictionary or indeed any of the dictionary key names except (obviously) 'hostname'.
Of course, there may be other dictionaries within the "master" dictionary that contain a 'hostname' key. If that's the case then this function may return values that are not needed/wanted.
data = {
'propertyName': 'www.property1.com',
'propertyVersion': 1,
'etag': 'jbcas6764023nklf78354',
'rules': {
'name': 'default',
'children': [{
'name': 'Route',
'children': [],
'behaviors': [{
'name': 'origin',
'options': {
'originType': 'CUSTOMER',
'hostname': 'www.origin1.com'
}
},
{
'name': 'origin',
'options': {
'originType': 'CUSTOMER',
'hostname': 'www.origin2.com'
}
}
]
}
]
}
}
def get_hostnames(d):
def _get_hostnames(_d, _l):
if isinstance(_d, dict):
if 'hostname' in _d:
_l.append(_d['hostname'])
else:
for _v in _d.values():
_get_hostnames(_v, _l)
else:
if isinstance(_d, list):
for _v in _d:
_get_hostnames(_v, _l)
return _l
return _get_hostnames(d, [])
result = {'properties': [{'propertyName': data.get('propertyName'), 'hostnames': get_hostnames(data)}]}
print(result)
Output:
{'properties': [{'propertyName': 'www.property1.com', 'hostnames': ['www.origin1.com', 'www.origin2.com']}]}

Extract value from list inside JSON dictionary (Python)

I need to extract a value from a dictionary, inside a list, inside a dictionary.
The value I'm tryng to get is dealId inside affectedDeals. (highlighted with ^)
data = {'date': '2022-11-04T12:36:57.016', 'status': 'OPEN', 'reason': 'SUCCESS', 'dealStatus': 'ACCEPTED', 'epic': 'SILVER', 'dealReference': 'o_0bc30104-8ddf-4d67-9daa-e7d878a8cad9', 'dealId': '006011e7-0055-311e-0000-000080507631', 'affectedDeals': [{'dealId': '006011e7-0055-311e-0000-000080507633', 'status': 'OPENED'}], 'level': 20.138, 'size': 1.0, 'direction': 'BUY', 'guaranteedStop': False, 'trailingStop': False}
^
output = 006011e7-0055-311e-0000-000080507633
May be this could help you to get the output as expected in question
Code:
data = {
"date": "2022-11-04T12:36:57.016",
"status": "OPEN",
"reason": "SUCCESS",
"dealStatus": "ACCEPTED",
"epic": "SILVER",
"dealReference": "o_0bc30104-8ddf-4d67-9daa-e7d878a8cad9",
"dealId": "006011e7-0055-311e-0000-000080507631",
"affectedDeals": [
{"dealId": "006011e7-0055-311e-0000-000080507633", "status": "OPENED"}
],
"level": 20.138,
"size": 1.0,
"direction": "BUY",
"guaranteedStop": False,
"trailingStop": False,
}
[i.get("dealId") for i in data.get("affectedDeals")]
Output :
006011e7-0055-311e-0000-000080507633
Output will be enclosed in list you can use indexing to retrieve the data from list like below
lst = [i.get("dealId") for i in data.get("affectedDeals")]
lst[0]
Try doing this:
data = {'date': '2022-11-04T12:36:57.016',
'status': 'OPEN',
'reason': 'SUCCESS',
'dealStatus': 'ACCEPTED',
'epic': 'SILVER',
'dealReference': 'o_0bc30104-8ddf-4d67-9daa-e7d878a8cad9',
'dealId': '006011e7-0055-311e-0000-000080507631',
'affectedDeals': [
{'dealId': '006011e7-0055-311e-0000-000080507633',
'status': 'OPENED'
}
],
'level': 20.138,
'size': 1.0,
'direction': 'BUY',
'guaranteedStop': False,
'trailingStop': False
}
print(data['affectedDeals'][0]['dealId'])
# Output: 006011e7-0055-311e-0000-000080507633
We basically accessed the list(affectedDeals) and then selected the 0th index, which is again a dictionary, so we accessed the specified key(dealID) and then print it.
Hope that helps

How to transform this python loop into a list comprehension?

Nevermind. As indicated by juanpa.arrivillaga, list comprehension does not return a dictionary ><
Context
I have the following python code (see next piece of code). I would like to try and optimize it to compare the execution time between regular loop and list comprehension.
Regular loop version
def flatten_json( nested_dict, flattened_dict={}, superior_level_key: str = ""):
for key, value in nested_dict.items():
if type(nested_dict[key]) is dict:
flattened_dict = flatten_json(
nested_dict[key], flattened_dict, "{}_".format(key))
else:
flattened_dict['{}{}'.format(superior_level_key, key)] = value
return flattened_dict
import json
with open('json.json') as j:
d = json.load(j)
print(flatten_json(d, {}, ""))
Current, failing, list comprehension version
def flatten_json(nested_dict, flattened_dict={}, superior_level_key: str = ""):
return [flatten_json(nested_dict[key], flattened_dict, "{}_".format(key))
if type(nested_dict) is dict
else value for key, value in nested_dict.items()]
import json
with open('json.json') as j:
d = json.load(j)
print(flatten_json(d, {}, ""))
Error
The list comprehension version throws the following error:
Traceback (most recent call last):
File "p1.py", line 13, in <module>
print(flatten_json(d, {}, ""))
File "p1.py", line 3, in flatten_json
return [flatten_json(nested_dict[key], flattened_dict, "{}_".format(key))
File "p1.py", line 3, in <listcomp>
return [flatten_json(nested_dict[key], flattened_dict, "{}_".format(key))
File "p1.py", line 5, in flatten_json
else value for key, value in nested_dict.items()]
AttributeError: 'float' object has no attribute 'items'
Question
Why is it throwing that error and how to fix it ?
Input
{
"_score": 1.0,
"_index": "sirene_prod",
"_id": "AXSp612eur2DngRir4BH",
"_type": "sirene_prod",
"_source": {
"enseigne": "",
"codpos": {
"cp": "17300",
"bur_distrib": "300",
"depet": "17"
},
"id": "ddf9e5b2aa0099ff6934a3d83b1678f64e27859e377362ef8682a9b1",
"l3_normalisee": "",
"apet700": "10.71C",
"sigle": "",
"siren": "793120569",
"libapen": "Boulangerie et boulangerie-patisserie",
"apen700": "10.71C",
"cedex": "",
"typvoie": "AV",
"numvoie": 33,
"nom": "",
"depet_limit": [
"16",
"24",
"33",
"79",
"85"
],
"libcom": "ROCHEFORT",
"l2_normalisee": {
"text": "",
"nom": "",
"initial": ""
},
"libvoie": "GAMBETTA",
"nic": "00017",
"prenom": "",
"nomen_long": {
"text": "LA PASSION DU PAIN",
"nom": "LA PASSION DU PAIN",
"initial": "LPDP"
},
"indrep": ""
}
}
Output
{'_score': 1.0, '_index': 'sirene_prod', '_id': 'AXSp612eur2DngRir4BH', '_type': 'sirene_prod', '_surce_enseigne': '', 'codpos_cp': '17300', 'codpos_bur_distrib': '300', 'codpos_depet': '17', '_soure_id': 'ddf9e5b2aa0099ff6934a3d83b1678f64e27859e377362ef8682a9b1', '_source_l3_normalisee': '', '_surce_apet700': '10.71C', '_source_sigle': '', '_source_siren': '793120569', '_source_libapen': 'Bouangerie et boulangerie-patisserie', '_source_apen700': '10.71C', '_source_cedex': '', '_source_typvie': 'AV', '_source_numvoie': 33, '_source_nom': '', '_source_depet_limit': ['16', '24', '33', '79' '85'], '_source_libcom': 'ROCHEFORT', 'l2_normalisee_text': '', 'l2_normalisee_nom': '', 'l2_normamalisee_initial': '', '_source_libvoie': 'GAMBETTA', '_source_nic': '00017', '_source_prenom': '', renom': '', 'nomen_long_text': 'LA PASSION DU PAIN', 'nomen_long_nom': 'LA PASSION DU P_long_initiaAIN', 'nomen_long_initial': 'LPDP', '_source_indrep': ''}
Here is an iterative solution, which should be faster than your current, recursive solution:
def flatten_json_iterative(nested):
flattened = {}
stack = [(nested, "")]
push_to_stack = stack.append
pop_from_stack = stack.pop
while stack:
nested_dict, superior_key = pop_from_stack()
for key, value in nested_dict.items():
if isinstance(value, dict):
push_to_stack((value, f"{key}_"))
else:
flattened[f"{superior_key}{key}"] = value
return flattened
In the repl:
In [8]: flatten_json_iterative(data)
Out[8]:
{'_score': 1.0,
'_index': 'sirene_prod',
'_id': 'AXSp612eur2DngRir4BH',
'_type': 'sirene_prod',
'_source_enseigne': '',
'_source_id': 'ddf9e5b2aa0099ff6934a3d83b1678f64e27859e377362ef8682a9b1',
'_source_l3_normalisee': '',
'_source_apet700': '10.71C',
'_source_sigle': '',
'_source_siren': '793120569',
'_source_libapen': 'Boulangerie et boulangerie-patisserie',
'_source_apen700': '10.71C',
'_source_cedex': '',
'_source_typvoie': 'AV',
'_source_numvoie': 33,
'_source_nom': '',
'_source_depet_limit': ['16', '24', '33', '79', '85'],
'_source_libcom': 'ROCHEFORT',
'_source_libvoie': 'GAMBETTA',
'_source_nic': '00017',
'_source_prenom': '',
'_source_indrep': '',
'nomen_long_text': 'LA PASSION DU PAIN',
'nomen_long_nom': 'LA PASSION DU PAIN',
'nomen_long_initial': 'LPDP',
'l2_normalisee_text': '',
'l2_normalisee_nom': '',
'l2_normalisee_initial': '',
'codpos_cp': '17300',
'codpos_bur_distrib': '300',
'codpos_depet': '17'}
BTW:
Your approach wasn't working because you were checking if nested_dict is a dict then recursing on it, if type(nested_dict) is dict instead, you would want to check if the value is a dict, then recurse on it. But that isn't going to help you much. I am not clever enough to figure out a way to use recursion and a, say, dict comprehension to flatten a nested dictionary like this.

Updating only the value n-nested dictionary

I'm trying update update the value of a nested dictionary within a for loop, so it doesn't generate a new dictionary every time, I'm pretty new to traversing nested structures so bear with me. Each value is located in a list:
My list:
id_list = ['asf245', 'kjb456', '235sdg']
My dictionary:
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
Ideally I would append each update dictionary to a dataframe and then update it with the new value:
Ideal output:
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "asf245"}}], "limit": 20}}
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "kjb456"}}], "limit": 20}}
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "235sdg"}}], "limit": 20}}
Where temp gets appended to a dataframe every iteration then gets overwritten with the new value:
I've tried:
import collections
def update(d, u):
for k, v in u.items():
if isinstance(v, collections.Mapping):
d[k] = update(d.get(k, {}), v)
else:
d[k] = v
return d
print(update(temp, 'Apples')) <- "run this through a loop"
But running this through a visualizer I can see that it doesn't go deep enough, and I don't truly have a good understanding of it, if anyone could explain it that would be awesome.
Here. The result of the function is a list of dicts (with modified id)
import copy
def clone_dict(d, ids):
result = []
for id in ids:
clone = copy.deepcopy(d)
clone['ent']['attributes'][0]['ent']['id'] = id
result.append(clone)
return result
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
ids = ['x', 'y', 'z']
print(clone_dict(temp, ids))
output
[{'ent': {'attributes': [{'ent': {'id': 'x'}}], 'type': 'IDN', 'limit': 20}}, {'ent': {'attributes': [{'ent': {'id': 'y'}}], 'type': 'IDN', 'limit': 20}}, {'ent': {'attributes': [{'ent': {'id': 'z'}}], 'type': 'IDN', 'limit': 20}}]
A generic approach below
import copy
def clone_dict(src_dict, values_to_inject, path_elements):
""" Clone a dict N times and replace a nested field
:param src_dict: Used as 'template'
:param values_to_inject: List of values to inject
:param path_elements: List of path elements. Used in dict navigation
:return: A list of cloned modified dicts
"""
result = []
for value in values_to_inject:
clone = copy.deepcopy(src_dict)
temp = clone[path_elements[0]]
for path_element in path_elements[1:-1]:
temp = temp[path_element]
temp[path_elements[-1]] = value
result.append(clone)
return result
src_dict = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
values_to_inject = ['x', 'y', 'z']
path_elements = ['ent', 'attributes', 0, 'ent', 'id']
print(clone_dict(src_dict, values_to_inject, path_elements))
Here is a more generic solution involving recursion. It takes a dictionary to update, the key to update, and the value that you want to update.
def update(to_update, key, val):
for k, v in to_update.items():
if k == key:
to_update[k] = val
else:
if isinstance(v, dict):
update(v, key, val)
elif isinstance(v, list):
for item in v:
if isinstance(item, (dict, list)):
update(item, key, val)
else:
continue
else:
continue
return to_update
for id_ in id_list:
new = update(temp, 'id', id_)
print(new)
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': 'asf245'}}], 'limit': 20}}
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': 'kjb456'}}], 'limit': 20}}
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': '235sdg'}}], 'limit': 20}}

Flattening a dict

I have the following array of dicts (there's only one dict):
[{
'RuntimeInMinutes': '21',
'EpisodeNumber': '21',
'Genres': ['Animation'],
'ReleaseDate': '2005-02-05',
'LanguageOfMetadata': 'EN',
'Languages': [{
'_Key': 'CC',
'Value': ['en']
}, {
'_Key': 'Primary',
'Value': ['EN']
}],
'Products': [{
'URL': 'http://www.hulu.com/watch/217566',
'Rating': 'TV-Y',
'Currency': 'USD',
'SUBSCRIPTION': '0.00',
'_Key': 'US'
}, {
'URL': 'http://www.hulu.com/d/217566',
'Rating': 'TV-Y',
'Currency': 'USD',
'SUBSCRIPTION': '0.00',
'_Key': 'DE'
}],
'ReleaseYear': '2005',
'TVSeriesID': '5638#TVSeries',
'Type': 'TVEpisode',
'Studio': '4K Media'
}]
I would like to flatten the dict as follows:
[{
'RuntimeInMinutes': '21',
'EpisodeNumber': '21',
'Genres': ['Animation'],
'ReleaseDate': '2005-02-05',
'LanguageOfMetadata': 'EN',
'Languages._Key': ['CC', 'Primary'],
'Languages.Value': ['en', 'EN'],
'Products.URL': ['http://www.hulu.com/watch/217566', 'http://www.hulu.com/d/217566'],
'Products.Rating': ['TV-Y', 'TV-Y'],
'Products.Currency': ['USD', 'USD'],
'Products.SUBSCRIPTION': ['0.00', '0.00'],
'Products._Key': ['US', 'DE'],
'ReleaseYear': '2005',
'TVSeriesID': '5638#TVSeries',
'Type': 'TVEpisode',
'Studio': '4K Media'
}]
In other words, anytime a dict is encountered, it need to convert to either a string, number, or list.
What I currently have is something along the lines of the following, which uses a while loop to iterate through all the subpaths of the json.
while True:
for key in copy(keys):
val = get_sub_object_from_path(obj, key)
if isinstance(val, dict):
FLAT_OBJ[key.replace('/', '.')] = val
else:
keys.extend(os.path.join(key, _nextkey) for _nextkey in val.keys())
keys.remove(key)
if (not keys) or (n > 5):
break
else:
n += 1
continue
You can use recursion with a generator:
from collections import defaultdict
_d = [{'RuntimeInMinutes': '21', 'EpisodeNumber': '21', 'Genres': ['Animation'], 'ReleaseDate': '2005-02-05', 'LanguageOfMetadata': 'EN', 'Languages': [{'_Key': 'CC', 'Value': ['en']}, {'_Key': 'Primary', 'Value': ['EN']}], 'Products': [{'URL': 'http://www.hulu.com/watch/217566', 'Rating': 'TV-Y', 'Currency': 'USD', 'SUBSCRIPTION': '0.00', '_Key': 'US'}, {'URL': 'http://www.hulu.com/d/217566', 'Rating': 'TV-Y', 'Currency': 'USD', 'SUBSCRIPTION': '0.00', '_Key': 'DE'}], 'ReleaseYear': '2005', 'TVSeriesID': '5638#TVSeries', 'Type': 'TVEpisode', 'Studio': '4K Media'}]
def get_vals(d, _path = []):
for a, b in getattr(d, 'items', lambda :{})():
if isinstance(b, list) and all(isinstance(i, dict) or isinstance(i, list) for i in b):
for c in b:
yield from get_vals(c, _path+[a])
elif isinstance(b, dict):
yield from get_vals(b, _path+[a])
else:
yield ['.'.join(_path+[a]), b]
results = [i for b in _d for i in get_vals(b)]
_c = defaultdict(list)
for a, b in results:
_c[a].append(b)
result = [{a:list(b) if len(b) > 1 else b[0] for a, b in _c.items()}]
import json
print(json.dumps(result, indent=4))
Output:
[
{
"RuntimeInMinutes": "21",
"EpisodeNumber": "21",
"Genres": [
"Animation"
],
"ReleaseDate": "2005-02-05",
"LanguageOfMetadata": "EN",
"Languages._Key": [
"CC",
"Primary"
],
"Languages.Value": [
[
"en"
],
[
"EN"
]
],
"Products.URL": [
"http://www.hulu.com/watch/217566",
"http://www.hulu.com/d/217566"
],
"Products.Rating": [
"TV-Y",
"TV-Y"
],
"Products.Currency": [
"USD",
"USD"
],
"Products.SUBSCRIPTION": [
"0.00",
"0.00"
],
"Products._Key": [
"US",
"DE"
],
"ReleaseYear": "2005",
"TVSeriesID": "5638#TVSeries",
"Type": "TVEpisode",
"Studio": "4K Media"
}
]
Edit: wrapping solution in outer function:
def flatten_obj(data):
def get_vals(d, _path = []):
for a, b in getattr(d, 'items', lambda :{})():
if isinstance(b, list) and all(isinstance(i, dict) or isinstance(i, list) for i in b):
for c in b:
yield from get_vals(c, _path+[a])
elif isinstance(b, dict):
yield from get_vals(b, _path+[a])
else:
yield ['.'.join(_path+[a]), b]
results = [i for b in data for i in get_vals(b)]
_c = defaultdict(list)
for a, b in results:
_c[a].append(b)
return [{a:list(b) if len(b) > 1 else b[0] for a, b in _c.items()}]
EDIT
This now appears to be fixed:
As #panda-34 correctly points out (+1), the currently accepted
solution loses data, specifically Genres and Languages.Value when
you run the posted code.
Unfortunately, #panda-34's code modifies Genres:
'Genres': 'Animation',
rather than leaving it alone as in the OP's example:
'Genres': ['Animation'],
Below's my solution which attacks the problem a different way. None of the keys in the original data contains a dictionary as a value, only non-containers or lists (e.g. lists of dictionaries). So a primary a list of dictionaries will becomes a dictionary of lists (or just a plain dictionary if there's only one dictionary in the list.) Once we've done that, then any value that's now a dictionary is expanded back into the original data structure:
def flatten(container):
# A list of dictionaries becomes a dictionary of lists (unless only one dictionary in list)
if isinstance(container, list) and all(isinstance(element, dict) for element in container):
new_dictionary = {}
first, *rest = container
for key, value in first.items():
new_dictionary[key] = [flatten(value)] if rest else flatten(value)
for dictionary in rest:
for key, value in dictionary.items():
new_dictionary[key].append(value)
container = new_dictionary
# Any dictionary value that's a dictionary is expanded into original dictionary
if isinstance(container, dict):
new_dictionary = {}
for key, value in container.items():
if isinstance(value, dict):
for sub_key, sub_value in value.items():
new_dictionary[key + "." + sub_key] = sub_value
else:
new_dictionary[key] = value
container = new_dictionary
return container
OUTPUT
{
"RuntimeInMinutes": "21",
"EpisodeNumber": "21",
"Genres": [
"Animation"
],
"ReleaseDate": "2005-02-05",
"LanguageOfMetadata": "EN",
"Languages._Key": [
"CC",
"Primary"
],
"Languages.Value": [
[
"en"
],
[
"EN"
]
],
"Products.URL": [
"http://www.hulu.com/watch/217566",
"http://www.hulu.com/d/217566"
],
"Products.Rating": [
"TV-Y",
"TV-Y"
],
"Products.Currency": [
"USD",
"USD"
],
"Products.SUBSCRIPTION": [
"0.00",
"0.00"
],
"Products._Key": [
"US",
"DE"
],
"ReleaseYear": "2005",
"TVSeriesID": "5638#TVSeries",
"Type": "TVEpisode",
"Studio": "4K Media"
}
But this solution introduces a new apparent inconsistency:
'Languages.Value': ['en', 'EN'],
vs.
"Languages.Value": [["en"], ["EN"]],
However, I believe this is tied up with the Genres inconsistency mentioned earlier and the OP needs to define a consistent resolution.
Ajax1234's answer loses values of 'Genres' and 'Languages.Value'
Here's a bit more generic version:
def flatten_obj(data):
def flatten_item(item, keys):
if isinstance(item, list):
for v in item:
yield from flatten_item(v, keys)
elif isinstance(item, dict):
for k, v in item.items():
yield from flatten_item(v, keys+[k])
else:
yield '.'.join(keys), item
res = []
for item in data:
res_item = defaultdict(list)
for k, v in flatten_item(item, []):
res_item[k].append(v)
res.append({k: (v if len(v) > 1 else v[0]) for k, v in res_item.items()})
return res
P.S. "Genres" value is also flattened. It is either an inconsistency in the OP requirements or a separate problem which is not addressed in this answer.

Categories

Resources