Exclude empty/null values from JSON serialization - python

I am serializing multiple nested dictionaries to JSON using Python with simplejson.
Is there any way to automatically exclude empty/null values?
For example, serialize this:
{
"dict1" : {
"key1" : "value1",
"key2" : None
}
}
to
{
"dict1" : {
"key1" : "value1"
}
}
When using Jackson with Java you can use Inclusion.NON_NULL to do this. Is there a simplejson equivalent?

def del_none(d):
"""
Delete keys with the value ``None`` in a dictionary, recursively.
This alters the input so you may wish to ``copy`` the dict first.
"""
# For Python 3, write `list(d.items())`; `d.items()` won’t work
# For Python 2, write `d.items()`; `d.iteritems()` won’t work
for key, value in list(d.items()):
if value is None:
del d[key]
elif isinstance(value, dict):
del_none(value)
return d # For convenience
Sample usage:
>>> mydict = {'dict1': {'key1': 'value1', 'key2': None}}
>>> print(del_none(mydict.copy()))
{'dict1': {'key1': 'value1'}}
Then you can feed that to json.

My Python3 version of this has the benefit of not changing the input, as well as recursion into dictionaries nested in lists:
def clean_nones(value):
"""
Recursively remove all None values from dictionaries and lists, and returns
the result as a new dictionary or list.
"""
if isinstance(value, list):
return [clean_nones(x) for x in value if x is not None]
elif isinstance(value, dict):
return {
key: clean_nones(val)
for key, val in value.items()
if val is not None
}
else:
return value
For example:
a = {
"a": None,
"b": "notNone",
"c": ["hello", None, "goodbye"],
"d": [
{
"a": "notNone",
"b": None,
"c": ["hello", None, "goodbye"],
},
{
"a": "notNone",
"b": None,
"c": ["hello", None, "goodbye"],
}
]
}
print(clean_nones(a))
results in this:
{
'b': 'notNone',
'c': ['hello', 'goodbye'],
'd': [
{
'a': 'notNone',
'c': ['hello', 'goodbye']
},
{
'a': 'notNone',
'c': ['hello', 'goodbye']
}
]
}

>>> def cleandict(d):
... if not isinstance(d, dict):
... return d
... return dict((k,cleandict(v)) for k,v in d.iteritems() if v is not None)
...
>>> mydict = dict(dict1=dict(key1='value1', key2=None))
>>> print cleandict(mydict)
{'dict1': {'key1': 'value1'}}
>>>
I don't like using del in general, changing the existing dictionary can have subtle effects depending on how they are created. Creating new dictionaries with None removed prevents all side effect.

You can try this approach. In my case (I use python 3), it works well.
def to_json(self):
return json.dumps(self,
default=lambda o: dict((key, value) for key, value in o.__dict__.items() if value),
indent=4,
allow_nan=False)

This solution is correction of the one above from #eric which does not handle list type corectly.
Values in canonical JSON dictionary can be of one of following 3 types:
dictionary
list
value type (string, integer or floating point)
Note: Assumption is that we are dealing here with canonical JSON dictionary which can really contain only above mentioned types. If dictionary contains other types then ones mentioned above (e.g. tuples, custom classes, ...), then this solution won't work as expected.
The essential difference between this solution (below) and the original one from #eric is that list can contain elements of dictionary type from iside of which we want to drop elements with None value.
def cleandict(d):
if isinstance(d, dict):
return {k: cleandict(v) for k, v in d.items() if v is not None}
elif isinstance(d, list):
return [cleandict(v) for v in d]
else:
return d
Note: Please keep in mind that we must NOT remove None elements from the list since it would affect structural integrity of the list data. If some ( or all) of list elements have None value, they shall remain listed in the list structure as they were in order to preserve original structural meaning/integrity of the list.

def excludeNone(d):
for k in list(d):
if k in d:
if type(d[k]) == dict:
excludeNone(d[k])
if not d[k]:
del d[k]

It works for me:
When dictionary has dict/list/tuple values ....
for example it is my object:
dict_obj = {
'inline_keyboard': [
[
{'text': '0-0', 'url': None, 'login_url': None, 'callback_data': '0-0', 'switch_inline_query': None},
{'text': '0-1', 'url': None, 'login_url': None, 'callback_data': '0-1', 'switch_inline_query': None}
],
[
{'text': '1-0', 'url': None, 'login_url': None, 'callback_data': '1-0', 'switch_inline_query': None},
{'text': '1-1', 'url': None, 'login_url': None, 'callback_data': '1-1', 'switch_inline_query': None}
],
[
{'text': '2-0', 'url': None, 'login_url': None, 'callback_data': '2-0', 'switch_inline_query': None}
]
]
}
I wrote this function:
def delete_none_values(obj):
if isinstance(obj, dict):
for k, v in list(obj.items()):
if v is None:
del obj[k]
elif isinstance(v, dict):
delete_none_values(v)
elif isinstance(v, (list, tuple)):
for _ in v:
delete_none_values(_)
elif isinstance(obj, (list, tuple)):
for _ in obj:
delete_none_values(_)
return obj
And then when use this fuction:
from json import dumps
print(
dumps(
delete_none_values(dict_obj.copy()),
indent=2
)
)
output is:
{
"inline_keyboard": [
[
{"text": "0-0", "callback_data": "0-0"},
{"text": "0-1", "callback_data": "0-1"}
],
[
{"text": "1-0", "callback_data": "1-0"},
{"text": "1-1", "callback_data": "1-1"}
],
[
{"text": "2-0", "callback_data": "2-0"}
]
]
}

Could you maybe remain 'url' if it has value in one place and remove it if it none on another place?
'inline_keyboard': [
[
{'text': '0-0', 'url': 'someValue', 'login_url': None, 'callback_data': '0-0', 'switch_inline_query': None},
{'text': '0-1', 'url': None, 'login_url': None, 'callback_data': '0-1', 'switch_inline_query': None}
],
[
{'text': '1-0', 'url': None, 'login_url': None, 'callback_data': '1-0', 'switch_inline_query': None},
{'text': '1-1', 'url': None, 'login_url': None, 'callback_data': '1-1', 'switch_inline_query': None}
],
[
{'text': '2-0', 'url': None, 'login_url': None, 'callback_data': '2-0', 'switch_inline_query': None}
]
]

Related

Search for key inside nested lists inside nested dictionaries

The original post is Go through json line by line including unkown nested arrays and objects
I am trying to search for a specific value in all nested lists and nested dictionaries inside a JSON dictionary. The structure of the dictionary is not always known. The nested dictionary can have a nested list.
The key I'm looking for is Date and should not have String. Example key = Date is True but if key = DateString condition is not met.
Code:
def ConvertTimestamp(my_list_of_dicts: list):
for e in my_list_of_dicts:
# check top level keys whose values are not a list
keys_with_date = [k for k, v in e.items() if 'Date' in k and type(v) and 'String' not in k != list]
for k1 in keys_with_date:
e[k1] = 'found'
# check top level keys whose values are a list
keys_with_lists = [k for k, v in e.items() if type(v) == list]
for k1 in keys_with_lists:
for i, d in enumerate(e[k1]):
for k2, v in d.items():
if 'Date' in k2 and 'String' not in k2:
e[k1][i][k2] = 'found'
return my_list_of_dicts
Data
test_data = [{
"PurchaseOrderID": "aaff50c2-05d5-4943-9a37-421d1b326dc3",
"PurchaseOrderNumber": "PO-0001",
"DateString": "2020-06-04T00:00:00",
"Date": "2020-06-04T02:00:00.000000",
"DeliveryDateString": "2020-06-11T00:00:00",
"DeliveryDate": "2020-06-11T02:00:00.000000",
"DeliveryAddress": "",
"AttentionTo": "",
"Telephone": "",
"DeliveryInstructions": "",
"HasErrors": false,
"IsDiscounted": true,
"Reference": "",
"Type": "PURCHASEORDER",
"CurrencyRate": 1.0,
"CurrencyCode": "EUR",
"Contact": {
"ContactID": "31dcd998-026662967",
"ContactStatus": "ACTIVE",
"Name": "Test",
"FirstName": "",
"LastName": "",
"Addresses": [],
"Phones": [],
"UpdatedDateUTC": "/Date(1591272554130+0000)/",
"ContactGroups": [],
"DefaultCurrency": "EUR",
"ContactPersons": [],
"HasValidationErrors": false
},
"BrandingThemeID": "86a1c878-7b2ed792b224",
"Status": "DELETED",
"LineAmountTypes": "Exclusive",
"SubTotal": 1000.0,
"TotalTax": 0.0,
"Total": 1000.0,
"UpdatedDateUTC": "2020-06-04T12:14:26.527000",
"HasAttachments": false }]
Result should be
[{
"PurchaseOrderID": "aaff50c2-05d5-4943-9a37-421d1b326dc3",
"PurchaseOrderNumber": "PO-0001",
"DateString": "2020-06-04T00:00:00",
"Date": "2020-06-04T02:00:00.000000",
"DeliveryDateString": "2020-06-11T00:00:00",
"DeliveryDate": "2020-06-11T02:00:00.000000",
"DeliveryAddress": "",
"AttentionTo": "",
"Telephone": "",
"DeliveryInstructions": "",
"HasErrors": false,
"IsDiscounted": true,
"Reference": "",
"Type": "PURCHASEORDER",
"CurrencyRate": 1.0,
"CurrencyCode": "EUR",
"Contact": {
"ContactID": "31dcd998-026662967",
"ContactStatus": "ACTIVE",
"Name": "Test",
"FirstName": "",
"LastName": "",
"Addresses": [],
"Phones": [],
"UpdatedDateUTC": "2020-06-03T09:55:30.000000",
"ContactGroups": [],
"DefaultCurrency": "EUR",
"ContactPersons": [],
"HasValidationErrors": false
},
"BrandingThemeID": "86a1c878-7b2ed792b224",
"Status": "DELETED",
"LineAmountTypes": "Exclusive",
"SubTotal": 1000.0,
"TotalTax": 0.0,
"Total": 1000.0,
"UpdatedDateUTC": "2020-06-04T12:14:26.527000",
"HasAttachments": false }]
Since you don't know what the structure of the dictionaries are, it could be under an arbitrary number of levels.
Also, the expected result didn't really show what you want to do with the dictionary once found, so I've just added those dictionaries to a list.
Recursion helps in such cases.
def search_dict(d, results):
for k,v in d.items():
if 'Date' in k and 'String' not in k:
# Do what you want with `d` here.
# Your "Result should be" didn't really explain the
# found part, but if it gets here it means you found it.
# Appended to results as we want to continue searching for more.
# Appending (k,v,d) where d is the dictionary containing
# this key and value, incase you wanted that too.
# Adjust this accordingly.
results.append((k,v,d))
if isinstance(v, dict):
search_dict(v, results)
if isinstance(v, list):
search_list(v, results)
def search_list(l, results):
for item in l:
if not isinstance(item, dict):
continue # don't care about things that aren't dictionaries
if isinstance(item, list):
search_list(item, results)
if isinstance(item, dict):
search_dict(item, results)
def ConvertTimestamp(my_list_of_dicts: list):
results = []
search_list(my_list_of_dicts, results)
return results
Here is how you can recurse through the object, making the updates.
For sake of generality, the recursive function takes two externally defined callables an addition to the object being recursed:
a "key tester" function that should take a key (string) and return a boolean, which is used to decide which keys are to have their values updated
a "replacer" function that should take a value and return the new value
from datetime import date
from pprint import pprint
from copy import deepcopy
import re
def do_replacements(obj, key_tester, replacer):
"""
recursing through the nested list/dict structure,
and wherever key_tester(key) yields True,
use replacer function to generate the new value
"""
if isinstance(obj, dict):
for k, v in obj.items():
if key_tester(k):
obj[k] = replacer(v)
else:
do_replacements(v, key_tester, replacer)
elif isinstance(obj, list):
for item in obj:
do_replacements(item, key_tester, replacer)
def fix_time(ts):
"""
replace the timestamp if it fits a particular pattern
(based on code in original question)
"""
pattern = '\(|\)'
if not re.search(pattern, ts):
return ts
format = '%Y-%m-%dT%H:%M:%S.%f'
ts_utc = re.split(pattern, ts)[1]
ts_utc = ts_utc[:ts_utc.find("+")]
return date.fromtimestamp(float(ts_utc)/1000).strftime(format)
test_data = [{'PurchaseOrderID': 'aaff50c2-05d5-4943-9a37-421d1b326dc3', 'PurchaseOrderNumber': 'PO-0001', 'DateString': '2020-06-04T00:00:00', 'Date': '2020-06-04T02:00:00.000000', 'DeliveryDateString': '2020-06-11T00:00:00', 'DeliveryDate': '2020-06-11T02:00:00.000000', 'DeliveryAddress': '', 'AttentionTo': '', 'Telephone': '', 'DeliveryInstructions': '', 'HasErrors': False, 'IsDiscounted': True, 'Reference': '', 'Type': 'PURCHASEORDER', 'CurrencyRate': 1.0, 'CurrencyCode': 'EUR', 'Contact': {'ContactID': '31dcd998-026662967', 'ContactStatus': 'ACTIVE', 'Name': 'Test', 'FirstName': '', 'LastName': '', 'Addresses': [], 'Phones': [], 'UpdatedDateUTC': '/Date(1591272554130+0000)/', 'ContactGroups': [], 'DefaultCurrency': 'EUR', 'ContactPersons': [], 'HasValidationErrors': False}, 'BrandingThemeID': '86a1c878-7b2ed792b224', 'Status': 'DELETED', 'LineAmountTypes': 'Exclusive', 'SubTotal': 1000.0, 'TotalTax': 0.0, 'Total': 1000.0, 'UpdatedDateUTC': '2020-06-04T12:14:26.527000', 'HasAttachments': False}]
func = lambda k: "Date" in k and "String" not in k
output = deepcopy(test_data)
do_replacements(output, func, fix_time)
pprint(output)
gives:
[{'AttentionTo': '',
'BrandingThemeID': '86a1c878-7b2ed792b224',
'Contact': {'Addresses': [],
'ContactGroups': [],
'ContactID': '31dcd998-026662967',
'ContactPersons': [],
'ContactStatus': 'ACTIVE',
'DefaultCurrency': 'EUR',
'FirstName': '',
'HasValidationErrors': False,
'LastName': '',
'Name': 'Test',
'Phones': [],
'UpdatedDateUTC': '2020-06-04T00:00:00.000000'},
'CurrencyCode': 'EUR',
'CurrencyRate': 1.0,
'Date': '2020-06-04T02:00:00.000000',
'DateString': '2020-06-04T00:00:00',
'DeliveryAddress': '',
'DeliveryDate': '2020-06-11T02:00:00.000000',
'DeliveryDateString': '2020-06-11T00:00:00',
'DeliveryInstructions': '',
'HasAttachments': False,
'HasErrors': False,
'IsDiscounted': True,
'LineAmountTypes': 'Exclusive',
'PurchaseOrderID': 'aaff50c2-05d5-4943-9a37-421d1b326dc3',
'PurchaseOrderNumber': 'PO-0001',
'Reference': '',
'Status': 'DELETED',
'SubTotal': 1000.0,
'Telephone': '',
'Total': 1000.0,
'TotalTax': 0.0,
'Type': 'PURCHASEORDER',
'UpdatedDateUTC': '2020-06-04T12:14:26.527000'}]
(Note: the output shown here is the pretty-printed python object, rather than JSON, although similar.)

Updating only the value n-nested dictionary

I'm trying update update the value of a nested dictionary within a for loop, so it doesn't generate a new dictionary every time, I'm pretty new to traversing nested structures so bear with me. Each value is located in a list:
My list:
id_list = ['asf245', 'kjb456', '235sdg']
My dictionary:
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
Ideally I would append each update dictionary to a dataframe and then update it with the new value:
Ideal output:
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "asf245"}}], "limit": 20}}
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "kjb456"}}], "limit": 20}}
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "235sdg"}}], "limit": 20}}
Where temp gets appended to a dataframe every iteration then gets overwritten with the new value:
I've tried:
import collections
def update(d, u):
for k, v in u.items():
if isinstance(v, collections.Mapping):
d[k] = update(d.get(k, {}), v)
else:
d[k] = v
return d
print(update(temp, 'Apples')) <- "run this through a loop"
But running this through a visualizer I can see that it doesn't go deep enough, and I don't truly have a good understanding of it, if anyone could explain it that would be awesome.
Here. The result of the function is a list of dicts (with modified id)
import copy
def clone_dict(d, ids):
result = []
for id in ids:
clone = copy.deepcopy(d)
clone['ent']['attributes'][0]['ent']['id'] = id
result.append(clone)
return result
temp = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
ids = ['x', 'y', 'z']
print(clone_dict(temp, ids))
output
[{'ent': {'attributes': [{'ent': {'id': 'x'}}], 'type': 'IDN', 'limit': 20}}, {'ent': {'attributes': [{'ent': {'id': 'y'}}], 'type': 'IDN', 'limit': 20}}, {'ent': {'attributes': [{'ent': {'id': 'z'}}], 'type': 'IDN', 'limit': 20}}]
A generic approach below
import copy
def clone_dict(src_dict, values_to_inject, path_elements):
""" Clone a dict N times and replace a nested field
:param src_dict: Used as 'template'
:param values_to_inject: List of values to inject
:param path_elements: List of path elements. Used in dict navigation
:return: A list of cloned modified dicts
"""
result = []
for value in values_to_inject:
clone = copy.deepcopy(src_dict)
temp = clone[path_elements[0]]
for path_element in path_elements[1:-1]:
temp = temp[path_element]
temp[path_elements[-1]] = value
result.append(clone)
return result
src_dict = {"ent": {"type": "IDN", "attributes": [{"ent": {"id": "abc123"}}], "limit": 20}}
values_to_inject = ['x', 'y', 'z']
path_elements = ['ent', 'attributes', 0, 'ent', 'id']
print(clone_dict(src_dict, values_to_inject, path_elements))
Here is a more generic solution involving recursion. It takes a dictionary to update, the key to update, and the value that you want to update.
def update(to_update, key, val):
for k, v in to_update.items():
if k == key:
to_update[k] = val
else:
if isinstance(v, dict):
update(v, key, val)
elif isinstance(v, list):
for item in v:
if isinstance(item, (dict, list)):
update(item, key, val)
else:
continue
else:
continue
return to_update
for id_ in id_list:
new = update(temp, 'id', id_)
print(new)
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': 'asf245'}}], 'limit': 20}}
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': 'kjb456'}}], 'limit': 20}}
{'ent': {'type': 'IDN', 'attributes': [{'ent': {'id': '235sdg'}}], 'limit': 20}}

Flattening a dict

I have the following array of dicts (there's only one dict):
[{
'RuntimeInMinutes': '21',
'EpisodeNumber': '21',
'Genres': ['Animation'],
'ReleaseDate': '2005-02-05',
'LanguageOfMetadata': 'EN',
'Languages': [{
'_Key': 'CC',
'Value': ['en']
}, {
'_Key': 'Primary',
'Value': ['EN']
}],
'Products': [{
'URL': 'http://www.hulu.com/watch/217566',
'Rating': 'TV-Y',
'Currency': 'USD',
'SUBSCRIPTION': '0.00',
'_Key': 'US'
}, {
'URL': 'http://www.hulu.com/d/217566',
'Rating': 'TV-Y',
'Currency': 'USD',
'SUBSCRIPTION': '0.00',
'_Key': 'DE'
}],
'ReleaseYear': '2005',
'TVSeriesID': '5638#TVSeries',
'Type': 'TVEpisode',
'Studio': '4K Media'
}]
I would like to flatten the dict as follows:
[{
'RuntimeInMinutes': '21',
'EpisodeNumber': '21',
'Genres': ['Animation'],
'ReleaseDate': '2005-02-05',
'LanguageOfMetadata': 'EN',
'Languages._Key': ['CC', 'Primary'],
'Languages.Value': ['en', 'EN'],
'Products.URL': ['http://www.hulu.com/watch/217566', 'http://www.hulu.com/d/217566'],
'Products.Rating': ['TV-Y', 'TV-Y'],
'Products.Currency': ['USD', 'USD'],
'Products.SUBSCRIPTION': ['0.00', '0.00'],
'Products._Key': ['US', 'DE'],
'ReleaseYear': '2005',
'TVSeriesID': '5638#TVSeries',
'Type': 'TVEpisode',
'Studio': '4K Media'
}]
In other words, anytime a dict is encountered, it need to convert to either a string, number, or list.
What I currently have is something along the lines of the following, which uses a while loop to iterate through all the subpaths of the json.
while True:
for key in copy(keys):
val = get_sub_object_from_path(obj, key)
if isinstance(val, dict):
FLAT_OBJ[key.replace('/', '.')] = val
else:
keys.extend(os.path.join(key, _nextkey) for _nextkey in val.keys())
keys.remove(key)
if (not keys) or (n > 5):
break
else:
n += 1
continue
You can use recursion with a generator:
from collections import defaultdict
_d = [{'RuntimeInMinutes': '21', 'EpisodeNumber': '21', 'Genres': ['Animation'], 'ReleaseDate': '2005-02-05', 'LanguageOfMetadata': 'EN', 'Languages': [{'_Key': 'CC', 'Value': ['en']}, {'_Key': 'Primary', 'Value': ['EN']}], 'Products': [{'URL': 'http://www.hulu.com/watch/217566', 'Rating': 'TV-Y', 'Currency': 'USD', 'SUBSCRIPTION': '0.00', '_Key': 'US'}, {'URL': 'http://www.hulu.com/d/217566', 'Rating': 'TV-Y', 'Currency': 'USD', 'SUBSCRIPTION': '0.00', '_Key': 'DE'}], 'ReleaseYear': '2005', 'TVSeriesID': '5638#TVSeries', 'Type': 'TVEpisode', 'Studio': '4K Media'}]
def get_vals(d, _path = []):
for a, b in getattr(d, 'items', lambda :{})():
if isinstance(b, list) and all(isinstance(i, dict) or isinstance(i, list) for i in b):
for c in b:
yield from get_vals(c, _path+[a])
elif isinstance(b, dict):
yield from get_vals(b, _path+[a])
else:
yield ['.'.join(_path+[a]), b]
results = [i for b in _d for i in get_vals(b)]
_c = defaultdict(list)
for a, b in results:
_c[a].append(b)
result = [{a:list(b) if len(b) > 1 else b[0] for a, b in _c.items()}]
import json
print(json.dumps(result, indent=4))
Output:
[
{
"RuntimeInMinutes": "21",
"EpisodeNumber": "21",
"Genres": [
"Animation"
],
"ReleaseDate": "2005-02-05",
"LanguageOfMetadata": "EN",
"Languages._Key": [
"CC",
"Primary"
],
"Languages.Value": [
[
"en"
],
[
"EN"
]
],
"Products.URL": [
"http://www.hulu.com/watch/217566",
"http://www.hulu.com/d/217566"
],
"Products.Rating": [
"TV-Y",
"TV-Y"
],
"Products.Currency": [
"USD",
"USD"
],
"Products.SUBSCRIPTION": [
"0.00",
"0.00"
],
"Products._Key": [
"US",
"DE"
],
"ReleaseYear": "2005",
"TVSeriesID": "5638#TVSeries",
"Type": "TVEpisode",
"Studio": "4K Media"
}
]
Edit: wrapping solution in outer function:
def flatten_obj(data):
def get_vals(d, _path = []):
for a, b in getattr(d, 'items', lambda :{})():
if isinstance(b, list) and all(isinstance(i, dict) or isinstance(i, list) for i in b):
for c in b:
yield from get_vals(c, _path+[a])
elif isinstance(b, dict):
yield from get_vals(b, _path+[a])
else:
yield ['.'.join(_path+[a]), b]
results = [i for b in data for i in get_vals(b)]
_c = defaultdict(list)
for a, b in results:
_c[a].append(b)
return [{a:list(b) if len(b) > 1 else b[0] for a, b in _c.items()}]
EDIT
This now appears to be fixed:
As #panda-34 correctly points out (+1), the currently accepted
solution loses data, specifically Genres and Languages.Value when
you run the posted code.
Unfortunately, #panda-34's code modifies Genres:
'Genres': 'Animation',
rather than leaving it alone as in the OP's example:
'Genres': ['Animation'],
Below's my solution which attacks the problem a different way. None of the keys in the original data contains a dictionary as a value, only non-containers or lists (e.g. lists of dictionaries). So a primary a list of dictionaries will becomes a dictionary of lists (or just a plain dictionary if there's only one dictionary in the list.) Once we've done that, then any value that's now a dictionary is expanded back into the original data structure:
def flatten(container):
# A list of dictionaries becomes a dictionary of lists (unless only one dictionary in list)
if isinstance(container, list) and all(isinstance(element, dict) for element in container):
new_dictionary = {}
first, *rest = container
for key, value in first.items():
new_dictionary[key] = [flatten(value)] if rest else flatten(value)
for dictionary in rest:
for key, value in dictionary.items():
new_dictionary[key].append(value)
container = new_dictionary
# Any dictionary value that's a dictionary is expanded into original dictionary
if isinstance(container, dict):
new_dictionary = {}
for key, value in container.items():
if isinstance(value, dict):
for sub_key, sub_value in value.items():
new_dictionary[key + "." + sub_key] = sub_value
else:
new_dictionary[key] = value
container = new_dictionary
return container
OUTPUT
{
"RuntimeInMinutes": "21",
"EpisodeNumber": "21",
"Genres": [
"Animation"
],
"ReleaseDate": "2005-02-05",
"LanguageOfMetadata": "EN",
"Languages._Key": [
"CC",
"Primary"
],
"Languages.Value": [
[
"en"
],
[
"EN"
]
],
"Products.URL": [
"http://www.hulu.com/watch/217566",
"http://www.hulu.com/d/217566"
],
"Products.Rating": [
"TV-Y",
"TV-Y"
],
"Products.Currency": [
"USD",
"USD"
],
"Products.SUBSCRIPTION": [
"0.00",
"0.00"
],
"Products._Key": [
"US",
"DE"
],
"ReleaseYear": "2005",
"TVSeriesID": "5638#TVSeries",
"Type": "TVEpisode",
"Studio": "4K Media"
}
But this solution introduces a new apparent inconsistency:
'Languages.Value': ['en', 'EN'],
vs.
"Languages.Value": [["en"], ["EN"]],
However, I believe this is tied up with the Genres inconsistency mentioned earlier and the OP needs to define a consistent resolution.
Ajax1234's answer loses values of 'Genres' and 'Languages.Value'
Here's a bit more generic version:
def flatten_obj(data):
def flatten_item(item, keys):
if isinstance(item, list):
for v in item:
yield from flatten_item(v, keys)
elif isinstance(item, dict):
for k, v in item.items():
yield from flatten_item(v, keys+[k])
else:
yield '.'.join(keys), item
res = []
for item in data:
res_item = defaultdict(list)
for k, v in flatten_item(item, []):
res_item[k].append(v)
res.append({k: (v if len(v) > 1 else v[0]) for k, v in res_item.items()})
return res
P.S. "Genres" value is also flattened. It is either an inconsistency in the OP requirements or a separate problem which is not addressed in this answer.

Travers through a nested json object and store values- Python

This is a follow up on this question. Question
Also this question is similar but does not solve my problem Question2
I am trying to parse a nested json to get Check how many children a specific location has, I am trying to check if "children:" = None and increment counter to check how many levels down i need to go in order to get the lowest child, or
A more efficient solution would be:
I need to get all the child values into a list and keep going until "children:" = None.
The Json object can increase in the amount of children so we can have multiple level of children, Which can get messy if I want to nest the list and get the values, How could I do it dynamically?
{
'locationId': 'location1',
'name': 'Name',
'type': 'Ward',
'patientId': None,
'children': [{
'locationId': 'Child_location2',
'name': 'Name',
'type': 'Bed',
'patientId': None,
'children': [{
'locationId': 'Child_Child_location3',
'name': 'Name',
'type': 'HospitalGroup',
'patientId': None,
'children': None
}]
}, {
'locationId': 'location4',
'name': 'Name',
'type': 'Hospital',
'patientId': None,
'children': None
}, {
'locationId': 'location5',
'name': 'Name',
'type': 'Bed',
'patientId': None,
'children': None
}, {
'locationId': 'location6',
'name': 'Name',
'type': 'Bed',
'patientId': None,
'children': None
}, {
'locationId': 'location27',
'name': 'Name',
'type': 'Bed',
'patientId': None,
'children': None
}]
}
I tried to do something like this
import requests
def Get_Child(URL, Name):
headers = {
'accept': 'text/plain',
}
response = requests.get(
URL + Name,
headers=headers)
json_data = response.json()
print (json_data)
list = []
for locationId in json_data['locationId']:
list.append(locationId)
for children in locationId['children']:
list.append(children)
but that give me the following error,
for children in locationId['locationId']: TypeError: string indices must be integers
Your code shows append, but you ask for a count. Here is a recursive way to get the number of children in this JSON if I am understanding you correctly:
def get_children(body, c=1):
if not body.get('children'):
c += 1
elif isinstance(body.get('children'), list):
c += 1
for subchild in body.get('children'):
c += 1
get_children(subchild, c)
return c
counts = get_children(your_json_blob)
print(counts)
>>> 7
Edit: I purposely did not use if/else because I don't know if you can have subchildren that are dict rather than list which would mean you would need extra conditions, but that's up to you if that ends up being the case.
I found a solution fro my problem,
The following code will get all the children and append them to a list
class Children():
def Get_All_Children(self,json_input, lookup_key):
if isinstance(json_input, dict):
for k, v in json_input.items():
if k == lookup_key:
yield v
else:
yield from self.Get_All_Children(v, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from self.Get_All_Children(item, lookup_key)
for locations in self.Get_All_Children(self.json_data, 'locationId'):
self.mylist.append(locations)

Trying to make JSON Schema validator in Python to set default values

I'm using a slightly modified version of the code from the JSON Schema FAQ to create a validator that sets default values:
def extend_with_default(validator_class):
validate_properties = validator_class.VALIDATORS["properties"]
def set_defaults(validator, properties, instance, schema):
for property_, subschema in properties.items():
if "default" in subschema:
instance.setdefault(property_, subschema["default"])
for error in validate_properties(
validator, properties, instance, schema,
):
yield error
return validators.extend(
validator_class, {"properties": set_defaults},
)
DefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)
And I have a JSON Schema like so:
{'definitions': {
'obj': {'additionalProperties': False,
'properties': {
'foo': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'bar': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'baz': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'children': {'default': None, 'oneOf': [
{'type': 'null'},
{
'items': {'$ref': '#/definitions/obj'},
'minItems': 1,
'type': 'array'
}
]}},
'required': ['foo', 'bar', 'baz'],
'type': 'object'}},
'oneOf': [
{'$ref': '#/definitions/obj'},
{
'items': {'$ref': '#/definitions/obj'},
'minItems': 1,
'type': 'array'
}
]
}
So basically, there's an object that can have foo/bar/baz fields, and the entire instance can either be one of those objects or a list of them. Additionally, each object can have a list of child objects in the children field.
When I try to run this code against a single object, it works fine, but it fails when I have a list of objects:
In [22]: DefaultValidatingDraft4Validator(schema).validate({'foo': 'hi'})
In [23]: DefaultValidatingDraft4Validator(schema).validate([{'foo': 'hi'}, {'baz': 'bye'}])
...
AttributeError: 'list' object has no attribute 'setdefault'
With the "children" field, I need a way to handle lists at every level of the schema validation. Is there a way to do that properly?
In the validator, the list that is causing the exception, is a valid element.
Changes Needed:
So you need to exclude the list from consideration by changing:
if "default" in subschema:
instance.setdefault(property_, subschema["default"])
to:
if "default" in subschema and not isinstance(instance, list):
instance.setdefault(property_, subschema["default"])
This was all that was needed to get the two test cases to pass.
Code:
from jsonschema import Draft4Validator, validators
def extend_with_default(validator_class):
validate_properties = validator_class.VALIDATORS["properties"]
def set_defaults(validator, properties, instance, schema):
for property_, subschema in properties.items():
if "default" in subschema and not isinstance(instance, list):
instance.setdefault(property_, subschema["default"])
for error in validate_properties(
validator, properties, instance, schema,
):
yield error
return validators.extend(
validator_class, {"properties": set_defaults},
)
FillDefaultValidatingDraft4Validator = extend_with_default(Draft4Validator)
Test Code:
test_schema = {
'definitions': {
'obj': {'additionalProperties': False,
'properties': {
'foo': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'bar': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'baz': {'default': None, 'oneOf': [{'type': 'null'}, {'type': 'string'}]},
'children': {'default': None, 'oneOf': [
{'type': 'null'},
{
'items': {'$ref': '#/definitions/obj'},
'minItems': 1,
'type': 'array'
}
]}
},
'required': ['foo', 'bar', 'baz'],
'type': 'object'}
},
'oneOf': [
{'$ref': '#/definitions/obj'},
{
'items': {'$ref': '#/definitions/obj'},
'minItems': 1,
'type': 'array'
}
]
}
for test_data in ({'foo': 'hi'}, [{'foo': 'hi'}, {'baz': 'bye'}],
[{'children': [{'foo': 'hi'}, {'baz': 'bye'}]}]):
FillDefaultValidatingDraft4Validator(test_schema).validate(test_data)
print(test_data)
Results:
{'foo': 'hi', 'bar': None, 'baz': None, 'children': None}
[
{'foo': 'hi', 'bar': None, 'baz': None, 'children': None},
{'baz': 'bye', 'foo': None, 'bar': None, 'children': None}
]
[
{'children': [
{'foo': 'hi', 'bar': None, 'baz': None, 'children': None},
{'baz': 'bye', 'foo': None, 'bar': None, 'children': None}
], 'foo': None, 'bar': None, 'baz': None}
]
Also you want to check if it is a valid schema to match to add the default, otherwise it'll add default defined in the unmatching schema.
For example, if you're using the oneOf directive, you only want to add the default value from the matching schema in the oneOf list.
This code will do the work:
def extend_validator_with_default(validator_class: jsonschema.protocols.Validator):
validate_properties = validator_class.VALIDATORS["properties"]
def set_defaults(validator, properties, instance, schema):
valid = True
for error in validate_properties(
validator,
properties,
instance,
schema,
):
valid = False
yield error
if valid:
for property, subschema in properties.items():
if "default" in subschema and not isinstance(instance, list):
instance.setdefault(property, subschema["default"])
return jsonschema.validators.extend(validator_class, {"properties": set_defaults})
There are several missing features in the given solutions, e.g. the default values are not validated (so it is possible to set an invalid default value), the schema must contain empty objects as default values in nested schemas, etc. I created a new function for my own needs to handle these cases:
def extend_validator_with_default(validator_class):
"""Extend a validator to automatically set default values during validation."""
_NO_DEFAULT = object()
validate_properties = validator_class.VALIDATORS["properties"]
def set_defaults_and_validate(validator, properties, instance, schema):
drop_if_empty = set()
new_instance = deepcopy(instance)
for prop, subschema in properties.items():
if prop in new_instance:
continue
obj_type = subschema.get("type", "")
default_value = subschema.get("default", _NO_DEFAULT)
if default_value is not _NO_DEFAULT:
new_instance.setdefault(prop, default_value)
elif obj_type == "object":
new_instance.setdefault(prop, {})
drop_if_empty.add(prop)
is_valid = True
for error in validate_properties(
validator,
properties,
new_instance,
schema,
):
is_valid = False
yield error
for prop in drop_if_empty:
instance_prop = new_instance[prop]
if isinstance(instance_prop, Mapping) and len(instance_prop) == 0:
del new_instance[prop]
if is_valid:
instance.update(new_instance)
return validators.extend(
validator_class,
{"properties": set_defaults_and_validate},
)
You can find this function and some tests here:
https://gist.github.com/adrien-berchet/4da364bee20b9d4286f3e38161d4eb72

Categories

Resources