Flatten nested JSON arrays with inherits properties in Python - python

I have a big json/dictionary with different levels of nested json arrays, I would like to flatten it, and also capture the relationship of the structure,
Part of my json looks like:
{
"name": "root",
"type": "all",
"children": [
{
"name": "properties",
"type": "feature",
"children": [
{
"name": "print",
"type": "feature",
"children": [
{
"name": "graphic print",
"type": "feature",
"inherits": true
},
{
"name": "striped print",
"type": "feature",
"inherits": true,
"children": [
{
"name": "pinstriped",
"type": "feature",
"inherits": true
},
{
"name": "light stripe",
"type": "feature",
"inherits": true
},
{
"name": "wide stripe",
"type": "feature",
"inherits": true
}
]
}
]
}
]
},
{
"name": "colours",
"type": "colour",
"children": [
{
"name": "main colours",
"type": "colour",
"children": [
{
"name": "black",
"type": "colour",
"children": [
{
"name": "light black",
"type": "colour",
"inherits": true
},
{
"name": "blue black",
"type": "colour",
"inherits": true
}
]
},
{
"name": "red",
"type": "colour",
"children": [
{
"name": "bright red",
"type": "colour",
"inherits": true
},
{
"name": "light red",
"type": "colour"
}
]
}
]
}
]
},
{
"name": "genders",
"type": "gender",
"children": [
{
"name": "female",
"type": "gender"
},
{
"name": "male",
"type": "gender"
}
]
}
]
}
The depth of nests is not all the same. I
- want all the nodes (values of "name")
- also want all its parents if the node has "Inherit" key of True value.
Something like:
But if there are better ideas on how to store this data, will be happy to accept as well!
Many Thanks!

I think this should do your need
def parse_dict_of_dict(_dict, _parent = '', ret_dict={}):
_name, _children, _inherit = _dict["name"], _dict.get('children', None), _dict.get('inherits', False)
if _children is not None:
if isinstance(_children, list):
for _child in _children:
parse_dict_of_dict(_child, _name+ ', ' + _parent if _inherit else _name , ret_dict)
ret_dict[ _name] = _parent.strip(' ').strip(',') if _inherit else None
return ret_dict

Can you elaborate more on your output?
OR you can use this function to flatten a nested JSON to a simple JSON.
def parse_dict_of_dict(_dict, _str = ''):
ret_dict = {}
for k, v in _dict.iteritems():
if isinstance(v, dict):
ret_dict.update(parse_dict_of_dict(v, _str= _str+k+'_'))
elif isinstance(v, list):
for index, item in enumerate(v):
if isinstance(item, dict):
ret_dict.update(parse_dict_of_dict(item, _str=_str+k+'_%d_'%(index)))
else:
ret_dict.update({k+'_%d'%(index): item})
else:
try:
ret_dict[_str + k] = str(v)
except Exception as e:
ret_dict[_str + k] = unicode.encode(v, errors='ignore')
return ret_dict

Related

Writing resilient recursive code that will return results from a big json file

I have written a recursive code. I want more experienced people to tell me how resillient and fail-safe is my code:
I have a json file (Json file can be as big as 300MB):
[
{
"modules": {
"webpages": []
},
"webpages": {
"ip_addr": {
"value": "127.0.0.1",
"tags": []
},
"http": {
"status": {
"value": "Unavailable",
"tags": []
},
"title": {
"value": "403 Forbidden",
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
"server": {
"value": "Apache",
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
},
"redirects": [],
"robottxt": null
}
},
{
"modules": {
"webpages": []
}
}
]
I want to return value keys where tags are populated.
So I want to ignore:
"status": {
"value": "Unavailable",
"tags": []
},
But I want to return the title and server values. I also want to return ip_addr.value
I have written this code:
def getAllValues(nestedDictionary, firstArray, firstObj, firstUseful):
returnedArray = firstArray
tempValue = firstObj
useful = firstUseful
for key, value in nestedDictionary.items():
ipString = nestedDictionary.get("ip_addr")
if ipString is not None:
ipValue = ipString.get("value")
useful = {"ip_add": ipValue}
if isinstance(value, dict):
temp = {
"Key": key,
"useful": useful,
}
getAllValues(value, returnedArray, temp, useful)
else:
if key == "value":
tempValue["value"] = value
if key == "tags" and isinstance(value, list) and len(value) > 0:
tempValue["tags"] = value
returnedArray.append(tempValue)
return returnedArray
The above code should return:
[
{
"Key": "title",
"value": "403 Forbidden",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Server Code",
"match": "403"
},
{
"category": "Interesting Words",
"match": "Forbidden"
}
]
},
{
"Key": "server",
"value": "Apache",
"useful": { "ip_addr": "127.0.0.1" },
"tags": [
{
"category": "Apache Server",
"match": "Apache"
}
]
}
]
Its a long post, but hopefully, someone can give me some assurance :)

convert list of dir paths into a json object

I am trying to build a py script that converts a list of paths into a json object as below; the output of the script should be structured as below.
json_out is a list of dictionaries that have four elements (1)type (2)name (3)path and (4)children
json_out = [
{
"type": "folder",
"name": "dir1",
"path": "/dir1",
"children": [
{
"type": "folder",
"name": "photos",
"path": "/dir1/photos",
"children": [
{
"type": "file",
"name": "mydir1.pdf",
"path": "/dir1/photos/mydir1.pdf"
},
{
"type": "file",
"name": "yourdir1.pdf",
"path": "/dir1/photos/yourdir1.pdf"
}
]
}
]
},
{
"type": "folder",
"name": "dir2",
"path": "/dir2",
"children": [
{
"type": "folder",
"name": "photos",
"path": "/dir2/photos",
"children": [
{
"type": "file",
"name": "mydir2.pdf",
"path": "/dir2/photos/mydir2.pdf"
},
{
"type": "file",
"name": "yourdir2.pdf",
"path": "/dir2/photos/yourdir2.pdf"
}
]
}
]
}
]
This is what I have so far, but this does not return the correct output structure
def my_fx(paths):
for path in paths:
file_path=path
l=file_path.split('/')[1:]
def gen_json(l=l, d=dict()):
tmp = {}
if not d:
d["name"] = l.pop(-1)
tmp["children"]=d
tmp["name"]=l.pop(-1)
return gen_json(l,tmp) if l else tmp
print(json.dumps(gen_json(l), ensure_ascii=False))
My Input
list_of_paths = [
"dir1/photos/mydir1.pdf",
"dir1/photos/yourdir1.pdf",
"dir2/photos/mydir2.pdf",
"dir2/photos/yourdir2.pdf"
]
My Output
{"children": {"name": "mydir1.pdf"}, "name": "photos"}
{"children": {"name": "yourdir1.pdf"}, "name": "photos"}
{"children": {"name": "mydir2.pdf"}, "name": "photos"}
{"children": {"name": "yourdir2.pdf"}, "name": "photos"}
Thanks in advance

Loop only looks at first row of JSON data

I have a slightly more complex than posted function which loops through JSON data. But the problem should be apparent here.
The for loop only looks through the very first row of the JSON data, which does not have the parameter I'm seeking, and so it stops immediately after parsing the very first row of the JSON data. I would like for it to continue looping through each row in the data until it finds the value_id I pass in, which does exist the JSON data some many rows down.
Is there some simple modification I'm missing here that can make this possible?
def get_values(value_id):
identifier = None
data = api_grabber.get_value_data()
for row in data["data"]:
if row["department"]["segment"]["data"]["id"] == value_id
break
if identifier is None:
return
You have to recursively search the JSON object to find the required key. The code searches all levels to find the matching key. The below code is adapted from this Stack answer.
def get_values(json_input, lookup_key):
if isinstance(json_input, dict):
for k, v in json_input.items():
if k == lookup_key:
yield v
else:
yield from get_values(v, lookup_key)
elif isinstance(json_input, list):
for item in json_input:
yield from get_values(item, lookup_key)
For example,
dict_test = {
"id": "0001",
"type": "donut",
"name": "Cake",
"ppu": 0.55,
"batters":
{
"batter":
[
{ "id": "1001", "type": "Regular" },
{ "id": "1002", "type": "Chocolate" },
{ "id": "1003", "type": "Blueberry" },
{ "id": "1004", "type": "Devil's Food" }
]
},
"topping":
[
{ "id": "5001", "type": "None" },
{ "id": "5002", "type": "Glazed" },
{ "id": "5005", "type": "Sugar" },
{ "id": "5007", "type": "Powdered Sugar" },
{ "id": "5006", "type": "Chocolate with Sprinkles" },
{ "id": "5003", "type": "Chocolate" },
{ "id": "5004", "type": "Maple" }
]
}
print(list(get_values(dict_test, "id")))
Output:
['0001',
'1001',
'1002',
'1003',
'1004',
'5001',
'5002',
'5005',
'5007',
'5006',
'5003',
'5004']
The above output shows the matching values for the key id in all levels.

Recursive list inside dict? - python

I have started learning Python not long ago, and I'm trying to do recursive function for my list. This is the function:
def get_employee(j, field, value):
res = j
for x in res:
if x['Name'] == field and x['Value'] == value:
return res
elif "Properties" not in x:
if x is not None:
continue
elif "Properties" in x:
return get_employee(x['Properties'], field, value)
And this is my JSON:
[{
"Value": "Sales",
"Name": "Department",
"Properties": [{
"Value": "US",
"Name": "Country",
"Properties": [{
"Value": "Employee",
"Name": "EmployeeType",
"Properties": [{
"Value": "Manya Bishter",
"Name": "EmployeeFullName",
"Properties": [{
"Value": 1111,
"Name": "EmployeeID"
},
{
"Value": "Manya",
"Name": "EmployeeFirstName"
},
{
"Value": "Bishter",
"Name": "EmployeeLastName"
}
]
},
{
"Value": "Michael Ort",
"Name": "EmployeeFullName",
"Properties": [{
"Value": 1112,
"Name": "EmployeeID"
},
{
"Value": "Michael",
"Name": "EmployeeFirstName"
},
{
"Value": "Ort",
"Name": "EmployeeLastName"
}
]
}
]
},
{
"Value": "Manager",
"Name": "EmployeeType",
"Properties": [{
"Value": "Nick Fair",
"Name": "EmployeeFullName",
"Properties": [{
"Value": 1113,
"Name": "EmployeeID"
},
{
"Value": "Nick",
"Name": "EmployeeFirstName"
},
{
"Value": "Fair",
"Name": "EmployeeLastName"
}
]
}]
}
]
}]
},
{
"Value": "Marketing",
"Name": "Department",
"Properties": [{
"Value": "US",
"Name": "Country",
"Properties": [{
"Value": "Employee",
"Name": "EmployeeType",
"Properties": [{
"Value": "Tamta Hiresh",
"Name": "EmployeeFullName",
"Properties": [{
"Value": 1121,
"Name": "EmployeeID"
},
{
"Value": "Tamta",
"Name": "EmployeeFirstName"
},
{
"Value": "Hiresh",
"Name": "EmployeeLastName"
}
]
}]
}]
}]
}
]
The function work only on Manya, but nowhere else.
For example, if I do this:
print(get_employee(myjson, "EmployeeFirstName", "Nick"))
It will print:
[{'Value': 1111, 'Name': 'EmployeeID'}, {'Value': 'Manya', 'Name': 'EmployeeFirstName'}, {'Value': 'Bishter', 'Name': 'EmployeeLastName'}]
But for others (like Nick), it will return None.
Can you please help?
Thanks!
Here is the code:
def get_employee(data, field, value):
result = []
def recursor(j, field, value):
res = j
for x in res:
if x['Name'] == field and x['Value'] == value:
result.append(res)
elif "Properties" not in x:
if x is not None:
continue
elif "Properties" in x:
recursor(x['Properties'], field, value)
recursor(data,field,value)
return result
The problem with your recursive function is that, once it hits return get_employee(x['Properties'], field, value); it will stop the outer for loop for x in res: . Thus it will never run on the next item in your list.

Python dictionary from schema

I need to check if all the keys declared in a schema file are present in a dictionary and if they are not, I need to fill them with a default value, of a given type. I need to do this dynamically because the structure can be even more complicated than the one below.
{
"type": "object",
"properties": {
"vid": {
"type": ["null", "integer"]
},
"merged-vids": {
"type": ["null", "array"],
"items": {
"type": ["null", "integer"]
}
},
"portal-id": {
"type": ["null", "integer"]
},
"is-contact": {
"type": ["null", "boolean"]
}
"form-submissions": {
"type": ["null", "array"],
"items": {
"type": ["null", "object"],
"properties": {
"conversion-id": {
"type": ["null", "string"]
},
"timestamp": {
"type": ["null", "string"],
"format": "date-time"
},
"form-id": {
"type": ["null", "string"]
},
"portal-id": {
"type": ["null", "integer"]
},
"page-url": {
"type": ["null", "string"]
},
"title": {
"type": ["null", "string"]
}
}
}
}
}
}
This is an example of dictionary:
{
"vid": 1000,
"portal-id": 2512,
"is-contact": true,
"profile-token": "dummy_profile_token",
"profile-url": "dummy_profile_url",
"form-submissions": [
{
"conversion-id": "127-798",
"timestamp": 1484080167266,
"form-id": "70fd-4b98-14796-777",
"page-url": "https://example.com/landing-page-url",
"title": "A new test form",
"meta-data": []
}
]
}
I am also new to python and this is a bit too much.
This is what I tried, but I cannot figure out what to do.
def get_default(type_object):
if type_object == 'object':
new_dict = {}
elif type_object == 'array':
return []
else:
return ''
def fill_fields_with_empty_str(record, schema):
if isinstance(schema['type'], list):
type_obj = schema['type'][len(schema['type'])-1]
elif isinstance(schema['type'], str):
type_obj = schema['type']
if type_obj == 'object':
new_dict = {}
for key in schema['properties'].keys():
if not record.get(key):
record[key] = get_default(schema['properties'][key]['type'])
new_dict[key] = fill_fields_with_empty_str(record[key], schema['properties'][key])
return new_dict
elif type_obj == 'array':
new_list = []
type_obj = schema["items"]['type']
if len(record) == 0:
record = get_default(type_obj)
for element in schema["items"]:
new_list.append(fill_fields_with_empty_str(record, schema['items']))
return new_list
else:
return ''

Categories

Resources