I try to recursively print sentences from a nested list of lists
I want to obtain a list containing
['big bad dog', 'big fluffy cat', 'small blue happy pony', 'small frog']
Here is my code; it doesn't work...
Am I on the right path, or should I structure my data in another way to achieve my goal?
from pprint import pprint
# Forest of keyword nodes: each node has a 'kw' string and, unless it is a
# leaf, a 'childs' list of further nodes.
dirs = [
    {
        'kw': 'big',
        'childs': [
            {'kw': 'bad', 'childs': [{'kw': 'dog'}]},
            {'kw': 'fluffy', 'childs': [{'kw': 'cat'}]},
        ],
    },
    {
        'kw': 'small',
        'childs': [
            {
                'kw': 'blue',
                'childs': [
                    {'kw': 'happy', 'childs': [{'kw': 'pony'}]},
                ],
            },
            {'kw': 'frog'},
        ],
    },
]
def traverse(d, l):
    """Append the keyword of ``d`` and of every descendant to ``l``.

    Nodes are visited depth-first, parent before children, so the result is
    one flat list of keywords rather than one sentence per leaf.

    Args:
        d: Node dict with a ``'kw'`` entry and an optional ``'childs'`` list.
        l: List that collects the keywords; it is mutated in place.

    Returns:
        The same list object that was passed in as ``l``.
    """
    l.append(d.get('kw'))
    # A missing or empty 'childs' entry means there is nothing to descend into.
    for child in d.get('childs') or ():
        traverse(child, l)
    return l
r = traverse(dirs[0], [])
pprint(r)
As usual, generators work nicely with recursive structures
def traverse(i):
    """Yield one space-separated sentence per leaf of the node forest ``i``.

    Args:
        i: Iterable of node dicts, each with a ``'kw'`` string and an
            optional ``'childs'`` list of further nodes.

    Yields:
        The keywords on the path from a top-level node down to a leaf,
        joined by single spaces.
    """
    for d in i:
        childs = d.get('childs')
        if childs:
            for j in traverse(childs):
                yield d['kw'] + ' ' + j
        else:
            # A leaf contributes only its own keyword; joining it with an
            # empty suffix would leave a trailing space on every sentence.
            yield d['kw']
res = list(traverse(dirs))
In Python3.3, this becomes
def traverse(i):
    """Yield one space-separated sentence per leaf of the node forest ``i``.

    Same contract as the plain-generator version, written with
    ``yield from`` (Python 3.3+).

    Args:
        i: Iterable of node dicts, each with a ``'kw'`` string and an
            optional ``'childs'`` list of further nodes.

    Yields:
        The keywords on the path from a top-level node down to a leaf,
        joined by single spaces.
    """
    for d in i:
        c = d.get('childs')
        if c:
            yield from (d['kw'] + ' ' + j for j in traverse(c))
        else:
            # Yield the leaf keyword on its own so no trailing space is added.
            yield d['kw']
Check out this function:
def traverse(d, l, s=None):
    """Collect one sentence per leaf below node ``d`` into ``l``.

    Args:
        d: Node dict with a ``'kw'`` string and an optional ``'childs'`` list.
        l: List that receives the finished sentences; mutated in place.
        s: Sentence built so far from the ancestors of ``d``, or ``None``
            when ``d`` is the starting node.

    Returns:
        The list ``l``.
    """
    kw = d.get('kw')
    c = d.get('childs')
    # Identity test: only a missing prefix (not an empty one) starts a sentence.
    s = kw if s is None else s + " " + kw
    if c:
        for cc in c:
            traverse(cc, l, s)
    else:
        # Only leaves terminate a sentence.
        l.append(s)
    return l
It's a very small modification to your recursion function:
r = traverse(dirs[0], [])
I would say my solution is pretty simple (change the item lookup for get if you want to handle cases where the key isn't found)
def _traverse(d):
    """Return the sentences for every leaf at or below node ``d``.

    Args:
        d: Node dict with a ``'kw'`` string and an optional ``'childs'`` list.

    Returns:
        List of space-separated sentences, each starting with ``d['kw']``.
    """
    prefix = d['kw']
    children = d.get('childs')
    # Treat a missing and an empty 'childs' list alike: the node is a leaf.
    if not children:
        return [prefix]
    return [
        prefix + ' ' + sentence
        for child in children
        for sentence in _traverse(child)
    ]


def traverse(d):
    """Return the sentences for every leaf in the list of root nodes ``d``."""
    # Flatten in a single pass; summing lists copies the accumulator each time.
    return [sentence for node in d for sentence in _traverse(node)]
Here's a solution to the given problem, with dirs being the json like structure you defined above. It's recursive, it works and it covers edge cases like the top structure being a dictionary.
def traverse(l, al="", comps=None):
    """Collect one comma-separated keyword path per leaf of ``l``.

    Args:
        l: A node dict (with ``"kw"`` and optional ``"childs"``), a list of
            such nodes, or anything else, which marks the end of a path.
        al: Path accumulated so far; keywords are joined with ``", "``.
        comps: List that receives the finished paths. A fresh list is
            created when omitted, so separate calls do not share results.

    Returns:
        The list of finished paths.
    """
    # A list literal as the default would be shared by every call.
    if comps is None:
        comps = []
    if isinstance(l, dict):
        if not al:
            al += l.get("kw")
        else:
            al += ", %s" % l.get("kw")
        # A leaf has no "childs", so None is passed on and ends the path.
        traverse(l.get("childs"), al, comps)
    elif isinstance(l, list):
        for i in l:
            traverse(i, al, comps)
    else:
        comps.append(al)
    return comps
# print is a function in Python 3; the statement form is a syntax error there.
print(traverse(dirs))
Ah, Gnibbler beat me to the generator hat tip. The only differences worth mentioning are " ".join to stitch the sentence together and the * syntax to avoid if tests
def traverse_keys(*dictvalues):
    """Yield the keywords of the given nodes, children before their parent.

    Args:
        *dictvalues: Node dicts with an optional ``'kw'`` entry and an
            optional ``'childs'`` list of further nodes.

    Yields:
        Each truthy ``'kw'`` value; a node's keyword comes after the
        keywords of all of its descendants.
    """
    for node in dictvalues:
        # Unpacking an empty list makes the recursive call yield nothing.
        yield from traverse_keys(*node.get('childs', []))
        keyword = node.get('kw')
        if keyword:
            yield keyword
# Materialise the generator, then flip the children-first order it produces.
tokens = list(traverse_keys(*dirs))
tokens.reverse()
# print is a function in Python 3; the statement form is a syntax error there.
print(" ".join(tokens))
If you don't expect multiple branches in your children, then you can just nest the dictionaries directly - your logic doesn't have a way to choose which branch to follow in the current structure. You could have named branches just by nesting dictionaries:
# Named branches: every key other than 'kw' names a nested node.
{
    'kw': 'dog',
    'big': {'kw': 'scary'},
    'small': {'kw': 'yippy', 'fluffy': {'kw': 'poodle'}},
}
which would make the traversal cleaner
Related
I would like to convert a nested list like this:
["Pie", ["Sugar", "Biscuit", ["Egg"] ], "Cocoa", []]
to a nested dictionary like this:
{ "Pie": { "Sugar": {}, "Biscuit": { "Egg": {} } }, "Cocoa": {} }
with max recursion.
Possible variants of nested list:
["Pie", ["Sugar", "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
["Pie", ["Sugar", ["Biscuit"], "Another something", ["Egg"], "Something", ["Something2"] ], "Cocoa", ["One", ["Nested1"], "Two", ["Nested2"] ]]
INCORRECT variants:
["Pie", [["Sugar"], "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
[["Pie"], ["Sugar", "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
Here is one approach (see comments in the code for details):
l = ["Pie", ["Sugar", "Biscuit", ["Egg"] ], "Cocoa", []]
def to_nested(l):
    """Turn a flat key / child-list sequence into nested dictionaries.

    Every item becomes a key. When the item right after a key is a list,
    that list is converted recursively and used as the key's value;
    otherwise the key maps to an empty dict.

    Args:
        l: List such as ``["Pie", ["Sugar"], "Cocoa", []]``.

    Returns:
        The nested dictionary.
    """
    out = {}
    pos = 0
    total = len(l)
    while pos < total:
        key = l[pos]
        # Peek at the next item; None stands in when the key is the last one.
        following = l[pos + 1] if pos + 1 < total else None
        if isinstance(following, list):
            out[key] = to_nested(following)
            pos += 2  # the list was consumed as this key's value
        else:
            out[key] = {}
            pos += 1
    return out
out = to_nested(l)
output:
{'Pie': {'Sugar': {}, 'Biscuit': {'Egg': {}}}, 'Cocoa': {}}
Simple approach with while loop and recursion. Hope it helps.
I used pytest for quick testing.
import pytest
def convert(test):
    """Convert a flat key / child-list sequence into nested dictionaries.

    Args:
        test: Sequence in which every key may be followed by a list holding
            its children, e.g. ``['Pie', ['Sugar'], 'Cocoa', []]``. A
            non-list iterable is first materialised with ``list()``.

    Returns:
        Dict mapping each key to the conversion of its child list, or to an
        empty dict when the key has no child list.
    """
    if not isinstance(test, list):
        test = list(test)
    result = {}
    i = 0
    while i < len(test):
        key = test[i]
        # Check the bound before peeking: the last key may have no child list.
        if i + 1 < len(test) and isinstance(test[i + 1], list):
            result[key] = convert(test[i + 1])
            i += 2
        else:
            result[key] = {}
            i += 1
    return result


example = [
    'Pie', [
        'Sugar',
        'Biscuit', ['Egg'],
    ],
    'Cocoa', [],
]
wanted = {
    "Pie": {
        "Sugar": {},
        "Biscuit": {"Egg": {}},
    },
    "Cocoa": {},
}
example_convertet = convert(example)


def test_simple_key_value():
    """A single key followed by a one-item child list."""
    simple = ['hello', ['to you']]
    test_convert = convert(simple)
    assert test_convert == {
        'hello': {'to you': {}}
    }


def test_simple_key_with_multiple_values():
    """A child list mixing a plain key with a key that has its own children."""
    simple = ['hello', ['to you', 'and you too', ['lol']]]
    test_convert = convert(simple)
    assert test_convert == {
        'hello': {
            'to you': {},
            'and you too': {'lol': {}},
        }
    }
I am trying to create a complex object based on metadata I have. It is an array of attributes which I am iterating and trying to create a dict. For example below is the array:
[
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
This array should give an output as below:
{
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}
I have written this logic but am unable to get the desired output. It should work for both objects and arrays, given the metadata.
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
# Question code: for each dotted path in source_json, split it into segments
# and try to record them in `parent`.
# NOTE(review): the original indentation is not visible here, so the nesting of
# the statements below has to be confirmed against the original post.
for row in source_json:
propertyNames = row.split('.')
temp = ''
parent = {}
parentArr = []
parentObj = {}
# if len(propertyNames) > 1:
arrLength = len(propertyNames)
# NOTE(review): zip(propertyNames) yields 1-tuples, so `current` is a tuple such
# as ('attributes',). `'[' in current` is therefore a tuple-membership test that
# is never true for these inputs, and the keys stored in `parent` are tuples.
for i, (current) in enumerate(zip(propertyNames)):
# First segment.
if i == 0:
if '[' in current:
parent[current]=parentArr
else:
parent[current] = parentObj
temp = current
# Middle segments. The body is the same as for the first segment: the value is
# stored on `parent` itself, not nested under the previous segment.
if i > 0 and i < arrLength - 1:
if '[' in current:
parent[current] = parentArr
else:
parent[current] = parentObj
temp = current
# Last segment; again the same body as above.
if i == arrLength - 1:
if '[' in current:
parent[current] = parentArr
else:
parent[current] = parentObj
temp = current
# temp[prev][current] = ""
# finalMapping[target] = target
print(parent)
There's a similar question at Convert Dot notation string into nested Python object with Dictionaries and arrays where the accepted answer works for this question, but has unused code paths (e.g. isInArray) and caters to unconventional conversions expected by that question:
❓ "arrOne[0]": "1,2,3" → "arrOne": ["1", "2", "3"] instead of
✅ "arrOne[0]": "1,2,3" → "arrOne": ["1,2,3"] or
✅ "arrOne[0]": "1", "arrOne[1]": "2", "arrOne[2]": "3" → "arrOne": ["1", "2", "3"]
Here's a refined implementation of the branch function:
def branch(tree, path, value):
    """Store ``value`` in ``tree`` at the location described by ``path``.

    Args:
        tree: Dict that is updated in place.
        path: Non-empty sequence of key segments, e.g.
            ``'a.b[0].c'.split('.')``. A segment may carry an array index
            suffix such as ``'items[2]'``.
        value: Value stored at the final segment.

    Returns:
        ``tree``, so that recursive calls can assign the result back.
    """
    key = path[0]
    is_leaf = len(path) == 1
    array_index_match = re.search(r'\[([0-9]+)\]', key)
    if array_index_match:
        # The capture group holds the digits; the whole match is stripped
        # from the key to leave the bare array name.
        array_index = int(array_index_match.group(1))
        key = key.replace(array_index_match.group(0), '')
        items = tree.setdefault(key, [])
        # Pad with empty objects so indices may arrive out of order or with
        # gaps instead of raising IndexError.
        while len(items) <= array_index:
            items.append({})
        items[array_index] = (
            value if is_leaf else branch(items[array_index], path[1:], value)
        )
    elif is_leaf:
        tree[key] = value
    else:
        tree[key] = branch(tree.setdefault(key, {}), path[1:], value)
    return tree
Usage:
# Placeholder stored at every leaf of the generated structure.
VALUE = ''


def create_dict(attributes):
    """Build one nested structure from an iterable of dotted attribute paths.

    Each path is split on ``'.'`` and handed to ``branch`` together with
    ``VALUE``, all against the same dict.

    Args:
        attributes: Iterable of strings such as ``'a.b[0].c'``.

    Returns:
        The dict that was passed to ``branch`` for every path.
    """
    tree = {}
    for dotted_path in attributes:
        segments = dotted_path.split('.')
        branch(tree, segments, VALUE)
    return tree
source_json = [
"itemUniqueId",
"itemDescription",
"manufacturerInfo[0].manufacturer.value",
"manufacturerInfo[0].manufacturerPartNumber",
"attributes.noun.value",
"attributes.modifier.value",
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
]
assert create_dict(source_json) == {
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}
First we iterate over the whole list and store the third-level attributes; after that we can convert this structure into the desired output:
from typing import Dict, List
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def accumulate(source: List) -> Dict:
    """Group third-level attribute names by root and indexed second level.

    Args:
        source: Paths of the form ``'root.entity[idx].attribute'``.

    Returns:
        ``{root: {(entity, idx): {attribute: ""}}}`` where ``idx`` is the
        text between the square brackets, kept as a string.
    """
    accumulator = {}
    for path in source:
        parts = path.split(".")
        per_root = accumulator.setdefault(parts[0], {})
        indexed = parts[1]
        # Split 'entity[idx]' at its last '[' and drop the closing ']'.
        bracket = indexed.rfind('[')
        key = (indexed[:bracket], indexed[bracket + 1:-1])
        per_root.setdefault(key, {})[parts[2]] = ""
    return accumulator
def get_result(accumulated: Dict) -> Dict:
    """Convert the ``(entity, idx)``-keyed mapping into lists per entity.

    Args:
        accumulated: Mapping of the shape
            ``{root: {(entity, idx): {attribute: value}}}``.

    Returns:
        ``{root: {entity: [attributes, ...]}}``. Only attribute dicts with
        exactly three entries are appended; every entity still gets a list.
    """
    result = {}
    for root, indexed_groups in accumulated.items():
        entities = result[root] = {}
        for (entity, _idx), fields in indexed_groups.items():
            bucket = entities.setdefault(entity, [])
            # Entries that do not have all three attributes are left out.
            if len(fields) == 3:
                bucket.append(fields)
    return result
print(get_result(accumulate(source_json)))
The output will be:
{
'attributes':
{
'entityAttributes':
[
{
'attributeName': '',
'attributeValue': '',
'attributeUOM': ''
},
{'attributeName': '',
'attributeValue': '',
'attributeUOM': ''
}
]
}
}
In accumulate function we store 3rd level attributes in Dict with (entityAttributes, 0) ... (entityAttributes, 2) keys.
In get_result function we convert Dict with (entityAttributes, 0) ... (entityAttributes, 2) keys to Dict from string to List.
How about something like this:
import re
import json
source_json = [
"attributes.entityAttributes[0].attributeName",
"attributes.entityAttributes[0].attributeValue",
"attributes.entityAttributes[0].attributeUOM",
"attributes.entityAttributes[1].attributeName",
"attributes.entityAttributes[1].attributeValue",
"attributes.entityAttributes[1].attributeUOM",
"attributes.entityAttributes[2].attributeName"
]
def to_object(source_json):
    """Build nested dicts from dotted / bracketed attribute paths.

    Array indices are kept as string keys (``'0'``, ``'1'``, ...) of a dict
    rather than being turned into list positions.

    Args:
        source_json: Iterable of paths such as ``'a.b[0].c'``.

    Returns:
        Nested dict with an empty string at every leaf.
    """
    def add_attribute(target, attribute_list):
        # Walk (creating as needed) down to the parent of the leaf.
        *parents, leaf = attribute_list
        for name in parents:
            target = target.setdefault(name, {})
        target[leaf] = ''

    target = {}
    for row in source_json:
        # A path that starts or ends with a bracket produces empty tokens
        # when split; drop them so no '' key is created.
        keys = [key for key in re.split(r'[\.\[\]]+', row) if key]
        if keys:
            add_attribute(target, keys)
    return target
print(json.dumps(to_object(source_json), indent=4))
Note that this does not do exactly what you requested: it stores the array as an object with keys '0' ... '2' rather than as a list. This makes it easier to implement and also more stable. What would you expect when the input list is missing the entries for entityAttributes[0]? Should the list include an empty element, or something different? Storing the array as an object also saves space, because the missing element does not have to be included.
None of the answers provided so far strike me as very intuitive. Here's one way
to tackle the problem with three easy-to-understand functions.
Normalize inputs. First we need a function to normalize the inputs strings. Instead of rules-bearing strings like
'foo[0].bar' – where one must understand that integers
in square brackets imply a list – we want a simple tuple
of keys like ('foo', 0, 'bar').
def attribute_to_keys(a):
    """Split an attribute path into a tuple of keys.

    ``'['`` is treated like a ``'.'`` separator and ``']'`` is dropped, so
    ``'foo[0].bar'`` becomes ``('foo', 0, 'bar')``. Parts made up only of
    digits are converted to ``int``.
    """
    keys = []
    for part in a.replace('[', '.').replace(']', '').split('.'):
        keys.append(int(part) if part.isdigit() else part)
    return tuple(keys)
Build a uniform data structure. Second, we need a function to assemble a data structure consisting of dicts
of dicts of dicts ... all the way down.
def assemble_data(attributes):
    """Build a dict-of-dicts from attribute paths and pass it to ``convert``.

    Args:
        attributes: Iterable of path strings understood by
            ``attribute_to_keys``.

    Returns:
        Whatever ``convert`` returns for the assembled structure.
    """
    data = {}
    for attribute in attributes:
        # Descend from the root, creating an empty dict for each new key.
        node = data
        for key in attribute_to_keys(attribute):
            node = node.setdefault(key, {})
    return convert(data)
def convert(d):
    """Return ``d`` unchanged (placeholder implementation)."""
    # Just a placeholder for now.
    return d
Convert the uniform data. Third, we need to implement a real version of the placeholder. Specifically, we
need it to recursively convert the uniform data structure into our ultimate
goal having (a) empty strings at leaf nodes, and (b) lists rather than dicts
whenever the dict keys are all integers. Note that this even fills in empty
list positions with an empty string (a contingency not covered in your problem
description; adjust as needed if you want a different behavior).
def convert(d):
    """Recursively turn the uniform dict-of-dicts into its final shape.

    * A falsy value (empty dict or ``None``) becomes ``''``.
    * A dict whose keys are all ints becomes a list with ``max(key) + 1``
      entries; positions without a key are filled with ``''``.
    * Any other dict keeps its keys, with converted values.
    """
    if not d:
        return ''
    keys_are_indexes = all(isinstance(key, int) for key in d)
    if not keys_are_indexes:
        return {key: convert(value) for key, value in d.items()}
    # d.get() gives None for gaps, which the first branch maps to ''.
    length = max(d) + 1
    return [convert(d.get(position)) for position in range(length)]
You can use a custom builder class which implements __getattr__ and __getitem__ to gradually build the underlying object. This building can then be triggered by using eval on each of the attribute strings (note: eval is not safe for input from untrusted sources).
The following is an example implementation:
class Builder:
    """Records attribute and index accesses so a nested object can be built.

    ``obj`` starts as ``None``; the first attribute access turns it into a
    dict of child builders, the first index access into a list of child
    builders. ``convert`` then renders the recorded tree.
    """

    def __init__(self):
        # None until the first access decides between dict and list.
        self.obj = None

    def __getattr__(self, key):
        """Return the child builder for ``key``, creating it if needed."""
        if self.obj is None:
            self.obj = {}
        children = self.obj
        return children.setdefault(key, Builder())

    def __getitem__(self, index):
        """Return the child builder at ``index``, growing the list if needed."""
        if self.obj is None:
            self.obj = []
        # Number of builders still missing for ``index`` to be valid.
        shortfall = index + 1 - len(self.obj)
        self.obj.extend([Builder() for _ in range(shortfall)])
        return self.obj[index]

    def convert(self):
        """Render the tree: ``''`` for untouched builders, else list/dict."""
        state = self.obj
        if state is None:
            return ''
        if isinstance(state, list):
            return [child.convert() for child in state]
        if isinstance(state, dict):
            return {name: child.convert() for name, child in state.items()}
        assert False
attributes = [
'itemUniqueId',
'itemDescription',
'manufacturerInfo[0].manufacturer.value',
'manufacturerInfo[0].manufacturerPartNumber',
'attributes.noun.value',
'attributes.modifier.value',
'attributes.entityAttributes[0].attributeName',
'attributes.entityAttributes[0].attributeValue',
'attributes.entityAttributes[0].attributeUOM',
'attributes.entityAttributes[1].attributeName',
'attributes.entityAttributes[1].attributeValue',
'attributes.entityAttributes[1].attributeUOM',
]
builder = Builder()
for attr in attributes:
eval(f'builder.{attr}')
result = builder.convert()
import json
print(json.dumps(result, indent=4))
which gives the following output:
{
"itemUniqueId": "",
"itemDescription": "",
"manufacturerInfo": [
{
"manufacturer": {
"value": ""
},
"manufacturerPartNumber": ""
}
],
"attributes": {
"noun": {
"value": ""
},
"modifier": {
"value": ""
},
"entityAttributes": [
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
},
{
"attributeName": "",
"attributeValue": "",
"attributeUOM": ""
}
]
}
}
Given a list of dictionaries:
data = {
"data": [
{
"categoryOptionCombo": {
"id": "A"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "B"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "C"
},
"dataElement": {
"id": "456"
}
}
]
}
I would like to display the dataElement where the count of distinct categoryOptionCombo is larger than 1.
e.g. the result of the function would be an iterable of IDs:
[123]
because the dataElement with id 123 has two different categoryOptionCombos.
# Map each dataElement id to the set of categoryOptionCombo ids seen with it.
tracker = {}
for d in data['data']:
    data_element, coc = d['dataElement']['id'], d['categoryOptionCombo']['id']
    tracker.setdefault(data_element, set()).add(coc)
# Keep the dataElement ids that occur with more than one distinct combo.
too_many = [key for key, value in tracker.items() if len(value) > 1]
How can I iterate the list of dictionaries preferably with a comprehension? This solution above is not pythonic.
One approach:
import collections
# dataElement id -> set of categoryOptionCombo ids seen with it.
counts = collections.defaultdict(set)
for d in data["data"]:
    element_id = d["dataElement"]["id"]
    combo_id = d["categoryOptionCombo"]["id"]
    counts[element_id].add(combo_id)
# Keep the ids that were paired with more than one distinct combo.
res = [k for k, v in counts.items() if len(v) > 1]
print(res)
Output
['123']
This approach creates a dictionary mapping dataElements to the different types of categoryOptionCombo:
defaultdict(<class 'set'>, {'123': {'B', 'A'}, '456': {'C'}})
Almost a one-liner:
# Tally how many entries in data['data'] reference each dataElement id.
# NOTE(review): this counts entries, not distinct categoryOptionCombo ids, so
# an id whose entries repeat the same combo would still be counted more than once.
counts = collections.Counter( d['dataElement']['id'] for d in data['data'] )
print( counts )
Output:
Counter({'123': 2, '456': 1})
No need for sets, you can just remember each data element's first coc or mark it as having 'multiple'.
# dataElement id -> the first combo id seen for it, or 'multiple' once a
# different combo id has shown up.
tracker = {}
for d in data['data']:
    data_element, coc = d['dataElement']['id'], d['categoryOptionCombo']['id']
    remembered = tracker.setdefault(data_element, coc)
    if remembered != coc:
        tracker[data_element] = 'multiple'
too_many = [key for key in tracker if tracker[key] == 'multiple']
(If the string 'multiple' can be a coc id, then use multiple = object() and compare with is).
I know that there are a lot of questions about duplicates but I can't find a solution suitable for me.
I have a json structure like this:
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"Tik",
"eev",
"123",
"r45",
"676",
"121"
]
}
]
}
I want to keep the first value and remove all the other duplicates.
Expected Result
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"123",
"r45",
"676",
"121"
]
}
]
}
I tried using a tmp to check for duplicates but it didn't seem to work. Also I can't find a way to make it json again.
import json
with open('myjson') as access_json:
read_data = json.load(access_json)
tmp = []
tmp2 = []
def get_synonyms():
    """Append every 'name2' entry under read_data['test'] to tmp and tmp2.

    Reads the module-level ``read_data`` and appends to the module-level
    lists ``tmp`` and ``tmp2``; returns nothing.
    """
    ingredients_access = read_data['test']
    for x in ingredients_access:
        for j in x['name2']:
            tmp.append(j)
            # NOTE(review): j has just been appended to tmp, so this test is
            # always true and tmp2 ends up with the same items as tmp.
            if j in tmp:
                tmp2.append(j)
get_synonyms()
print(len(tmp))
print(len(tmp2))
You can use recursion:
def filter_d(d):
    """Return a copy of ``d`` in which each list value appears only once.

    Scalars found inside lists are tracked across the whole structure: the
    first occurrence is kept and later ones are dropped, even when they sit
    in a different list. Dict values that are not dicts or lists are copied
    as they are.
    """
    seen = set()

    def inner(node):
        if isinstance(node, dict):
            cleaned = {}
            for key, value in node.items():
                nested = isinstance(value, (dict, list))
                cleaned[key] = inner(value) if nested else value
            return cleaned
        kept = []
        for item in node:
            if isinstance(item, (dict, list)):
                kept.append(inner(item))
                continue
            if item in seen:
                continue
            kept.append(item)
            seen.add(item)
        return kept

    return inner(d)
import json
print(json.dumps(filter_d(data), indent=4))
Output:
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"123",
"r45",
"676",
"121"
]
}
]
}
You are first adding everything to tmp and then to tmp2 because every value was added to tmp before.
I changed the function a little bit to work for your specific test example:
def get_synonyms():
    """Rebuild read_data['test'] keeping only first occurrences in 'name2'.

    Reads the module-level ``read_data``. A value already seen in an earlier
    (or the same) 'name2' list is skipped.

    Returns:
        ``{'test': [{'name2': [...]}, ...]}`` with one entry per input entry.
    """
    used_values = []
    test_list = []
    for entry in read_data['test']:
        fresh = []
        for name in entry['name2']:
            if name in used_values:
                continue
            fresh.append(name)
            used_values.append(name)
        test_list.append({'name2': fresh})
    return {'test': test_list}
result = get_synonyms()
print(result)
Output:
{'test': [{'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']}, {'name2': ['123', 'r45', '676', '121']}]}
Here's a little hackish answer:
d = {
    'test': [
        {'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']},
        {'name2': ['Tik', 'eev', '123', 'r45', '676', '121']},
    ]
}
# Values already emitted by any earlier 'name2' list.
s = set()
for l in d['test']:
    unique = []
    for v in l['name2']:
        if v not in s:
            s.add(v)
            unique.append(v)
    # Replace the list in place with its not-yet-seen values.
    l['name2'] = unique
Output:
{'test': [{'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']},
{'name2': ['123', 'r45', '676', '121']}]}
This uses a set to track the unique values, and add unique values to set while returning the value back to the list.
I'm trying to create a Python function to convert lists (objects of arrays, in ELK terms) to dictionaries. I found a sample Ruby function which does that, and I'm trying to convert it to a Python function for my own use. I'm having a hard time getting the expected output. The output will be inserted back into Elasticsearch.
Ruby Function - I found in Internet
# Walks the collection h and replaces each Array value with a Hash whose keys
# are the stringified element indexes ("0", "1", ...), then recurses into
# nested Hash and Array values. h is modified in place.
# NOTE(review): the recursive call receives the original Array (value), not
# value_hash, and when h itself is an Array the assignment h[k] is indexed by
# the element k rather than by a position - confirm the behavior for arrays
# nested directly inside arrays.
def arrays_to_hash(h)
  h.each do |k,v|
    # If v is nil, an array is being iterated and the value is k.
    # If v is not nil, a hash is being iterated and the value is v.
    value = v || k
    if value.is_a?(Array)
      # "value" is replaced with "value_hash" later.
      value_hash = {}
      # The block parameter v shadows the outer v inside this block only.
      value.each_with_index do |v, i|
        value_hash[i.to_s] = v
      end
      h[k] = value_hash
    end
    # Descend so that arrays further down are converted as well.
    if value.is_a?(Hash) || value.is_a?(Array)
      arrays_to_hash(value)
    end
  end
end
Python Function - I'm trying - Upon seeing the O/P i can see the first list inside the dictionary is getting converted but the nested list inside that is still present
def array_path(my_dict):
    """Replace list values in ``my_dict`` with index-keyed dicts, in place.

    Dict values are processed recursively. The items of a list are copied
    as they are, so lists nested inside list items are not converted.
    """
    for k,v in my_dict.items():
        if isinstance(v,list):
            print (len(v))
            for i, item in enumerate(v):
                # NOTE(review): my_dict2 is not defined in this function;
                # presumably a module-level dict shared by every call - confirm.
                my_dict2[str(i)] = item
            my_dict[k] = my_dict2
        elif isinstance(v,dict):
            array_path(v)
        else:
            # Re-assigns the value that is already stored under k.
            my_dict[k] = v
Input
{
"foo": "bar",
"test": {
"steps": [
{
"response_time": "100"
},
{
"response_time": "101",
"more_nested": [
{
"hello": "world"
},
{
"hello2": "world2"
}
]
}
]
}
}
**Expected Output**
{
"foo": "bar",
"test": {
"steps": {
"0": {
"response_time": "100"
},
"1": {
"response_time": "101",
"more_nested": {
"0": {
"hello": "world"
},
"1": {
"hello2": "world2"
}
}
}
}
}
}
Current O/P
{'0': {'response_time': '100'},
'1': {'more_nested': [{'hello': 'world'}, {'hello2': 'world2'}],
'response_time': '101'}}
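The original script stopped once it had found a list: it copied the list items without processing them, so a list of dicts (and any lists nested inside those dicts) was never converted. This version recurses into both dicts and lists, and looks OK now: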
def array_path(my_dict):
    """Recursively replace every list with a dict keyed by string index.

    Args:
        my_dict: A dict, a list, or a scalar. Dicts (including subclasses)
            are updated in place; lists are replaced by new dicts whose
            keys are ``'0'``, ``'1'``, ...

    Returns:
        The converted structure: the same dict object for a dict input, a
        new dict for a list input, and the value itself otherwise.
    """
    if isinstance(my_dict, dict):
        # Snapshot the items so values can be re-assigned while looping.
        for k, v in list(my_dict.items()):
            my_dict[k] = array_path(v)
    elif isinstance(my_dict, list):
        return {str(i): array_path(item) for i, item in enumerate(my_dict)}
    return my_dict