I have a list of complex dictionaries like this:
data = [
{
"l_1_k": 1,
"l_1_ch": [
{
"l_2_k": 2,
"l_2_ch": [...more levels]
},
{
"l_2_k": 3,
"l_2_ch": [...more levels]
}
]
},
...more items
]
I'm trying to flatten this structure to a list of rows like this:
list = [
{ "l_1_k": 1, "l_2_k": 2, ... },
{ "l_1_k": 1, "l_2_k": 3, ... },
]
I need this list to build a pandas data frame.
So, I'm doing a recursion for each nesting level, and at the last level I'm trying to append to rows list.
def build_dict(d, row_dict, rows):
    """Copy the scalar values of ``d`` into ``row_dict`` and recurse into lists.

    This is the version the question is about, shown with its indentation
    restored.  The ``rows.append`` at the end runs once for *every* call,
    not only for the deepest one, and it always appends the same
    ``row_dict`` object.

    Args:
        d: The data dictionary at the current nesting level.
        row_dict: The row dictionary being filled in (mutated in place).
        rows: The list that collects rows (mutated in place).
    """
    # d is the data dictionary at each nesting level
    # row_dict is the final row dictionary
    # rows is the final list of rows
    for key, value in d.items():
        if not isinstance(value, list):
            row_dict[key] = value
        else:
            for child in value:
                build_dict(child, row_dict, rows)
    rows.append(row_dict)  # <- How to detect the last recursion and call the append
I'm calling this function like this:
rows = []
for row in data:
build_dict(d=row, row_dict={}, rows=rows)
My question is how to detect the last call of this recursive function if I do not know how many nesting levels there are. With the current code, the row is duplicated at each nesting level.
Or, is there a better approach to obtain the final result?
After looking up some ideas, the solution I have in mind is this:
Declare the following function, taken from here:
def find_depth(d):
    """Return the number of dictionary levels nested inside ``d``.

    A dictionary counts as one level plus the deepest level found among
    its values (an empty dictionary is one level).  Lists add no level of
    their own but are searched, so dictionaries stored inside lists are
    counted.  Any other value has depth 0.

    Args:
        d: A dictionary, a list, or any other value.

    Returns:
        The dictionary nesting depth as an int.
    """
    if isinstance(d, dict):
        return 1 + (max(map(find_depth, d.values())) if d else 0)
    if isinstance(d, list):
        # Look through the list so that dicts nested in lists are counted.
        return max(map(find_depth, d)) if d else 0
    return 0
In your function, increment every time you go deeper as follows:
def build_dict(d, row_dict, rows, depth=0):
    """Copy scalar values of ``d`` into ``row_dict``, tracking nesting depth.

    Intermediate step of the answer: it only threads ``depth`` through the
    recursion and does not append anything to ``rows`` yet.

    Args:
        d: The data dictionary at the current nesting level.
        row_dict: The row dictionary being filled in (mutated in place).
        rows: The list that will collect rows; only passed along here.
        depth: Nesting level of ``d``; 0 for the outermost call and one
            more for each list that is descended into.
    """
    for key, value in d.items():
        if not isinstance(value, list):
            row_dict[key] = value
        else:
            for child in value:
                build_dict(child, row_dict, rows, depth + 1)
Finally, check whether you have reached the maximum depth; if so, append the row at the end of the function. This requires an extra parameter, max_depth, which the caller supplies:
def build_dict(d, row_dict, rows, max_depth, depth=0):
    """Append one flattened row to ``rows`` for each dict found at ``max_depth``.

    Each call works on its own copy of the inherited row, so rows that
    are appended never share state with each other and ``row_dict`` itself
    is not modified.  Scalar values are collected before descending, so
    their position relative to the list-valued key does not matter.

    Args:
        d: The data dictionary at the current nesting level.
        row_dict: Values inherited from the enclosing levels.
        rows: The list that collects finished rows (mutated in place).
        max_depth: Zero-based depth at which rows are emitted; the
            outermost call is depth 0.
        depth: Nesting level of ``d``; callers normally leave the default.
    """
    row = dict(row_dict)
    children = []
    for key, value in d.items():
        if isinstance(value, list):
            children.extend(value)
        else:
            row[key] = value
    for child in children:
        build_dict(child, row, rows, max_depth, depth + 1)
    if depth == max_depth:
        rows.append(row)
Call the function as:
build_dict(d=row, row_dict={}, rows=rows, max_depth=find_depth(data))
Do keep in mind since I don't have a data-set I can use, there might be a syntax error or two in there, but the approach should be fine.
I don't think it is good practice to rely on a mutable default argument in a function signature.
Also, I think that the function in the recursion loop should never be aware of the level it is in. That's the point of the recursion. Instead, you need to think about what the function should return, and when it should exit the recursion loop to climb back to the zeroth level. On the climb back, higher level function calls handle the return value of lower level function calls.
Here is the code that I think will work. I am not sure it is optimal in terms of computing time.
edit: fixed return list of dicts instead of dict only
def build_dict(d):
    """Flatten one nested record into a list of row dictionaries.

    All non-list values of ``d`` go into a single base row.  Every
    list-valued key is treated as a list of child records: each child is
    flattened recursively and the base row is combined with every child
    row.  A list that yields no child rows (for example an empty list)
    leaves the rows built so far unchanged.

    Args:
        d: A dictionary whose list values contain further dictionaries.

    Returns:
        A list of flat dictionaries, one per path from ``d`` down to a
        record without children.
    """
    base_row = {}
    child_lists = []
    for key, value in d.items():
        if isinstance(value, list):
            child_lists.append(value)
        else:
            base_row[key] = value

    rows = [base_row]
    for children in child_lists:
        child_rows = []
        for child in children:
            child_rows.extend(build_dict(child))
        if not child_rows:
            continue
        combined_rows = []
        for row in rows:
            for child_row in child_rows:
                combined = dict(row)
                combined.update(child_row)
                combined_rows.append(combined)
        rows = combined_rows
    return rows
rows = []
for row in data:
rows.extend(build_dict(d=row))
Related
The code:
def main():
    """Show the failing attempt to index a nested dict with a list of keys.

    Kept as the question's example: subscripting a dict with a list raises
    ``TypeError`` because a list is unhashable, so the ``return`` is never
    reached.
    """
    nested_dict = {'A': {'A_1': 'value_1', 'B_1': 'value_2'},
                   'B': 'value_3'}
    access_pattern = ['A', 'B_1']
    new_value = 'value_4'
    # Raises TypeError: a list cannot be used as a dictionary key.
    nested_dict[access_pattern] = new_value
    return nested_dict
Background information:
As can be seen, I have a variable called nested_dict - in reality, it contains hundreds of elements with a different number of sub-elements each (I'm simplifying it for the purpose of the example).
I need to modify the value of some elements inside this dictionary, but it is not predetermined which elements exactly. The specific "path" to the elements that need to be modified will be provided by the access_pattern variable, which will be different every time.
The problem:
I know how to reference the value of the dictionary with this function functools.reduce(dict.get, access_pattern, nested_dict). However, I do not know how to universally modify (regardless of the contained variable type) the value of the access_pattern in the dictionary.
The provided code produces a TypeError that I do not know how to overcome elegantly. I did think of some solution, specified in 4.
Possible solutions:
if len(access_pattern) == 1:
nested_dict[access_pattern[0]] = new_value
elif len(access_pattern) == 2:
nested_dict[access_pattern[0]][access_pattern[1]] = new_value
...
So on for all len()
This just seems VERY inelegant and painful. Is there a more practical way to achieve this?
Make use of recursion
def edit_from_access_pattern(access_pattern, nested_dict, new_value):
    """Set the value found by following ``access_pattern`` into ``nested_dict``.

    Args:
        access_pattern: Sequence of keys; all but the last select nested
            containers, the last names the entry to assign.
        nested_dict: The outermost container (mutated in place).
        new_value: The value to store.

    Returns:
        None.

    Raises:
        IndexError: If ``access_pattern`` is an empty sequence.
        KeyError: If an intermediate key is missing from a dict.
    """
    if len(access_pattern) == 1:
        nested_dict[access_pattern[0]] = new_value
    else:
        # Step one level down and continue with the rest of the path.
        return edit_from_access_pattern(
            access_pattern[1:], nested_dict[access_pattern[0]], new_value
        )
You can use recursion
def set_value(container, key, value):
    """Recursively assign ``value`` at the nested position given by ``key``.

    Args:
        container: The outermost container (mutated in place).
        key: Sequence of keys/indices; all but the last select nested
            containers, the last names the entry to assign.
        value: The value to store.

    Raises:
        IndexError: If ``key`` is an empty sequence.
    """
    if len(key) == 1:
        container[key[0]] = value
    else:
        set_value(container[key[0]], key[1:], value)
but an explicit loop is probably going to be more efficient
def set_value(container, key, value):
    """Assign ``value`` at the nested position given by ``key``, iteratively.

    Args:
        container: The outermost container (mutated in place).
        key: Sequence of keys/indices; all but the last select nested
            containers, the last names the entry to assign.
        value: The value to store.

    Raises:
        IndexError: If ``key`` is an empty sequence.
    """
    # Walk down to the container that holds the final entry.
    for part in key[:-1]:
        container = container[part]
    container[key[-1]] = value
Right now I have a for loop that looks one by one for whether the key value == a variable.
I'm doing this one by one by selecting the [0] and [1] index to get the first two children. There could be up to four children, is there a more efficient way to do this than elif?
# INITIALIZE NEW FILTERED DICTIONARY (RETAINING TOP LEVEL ITEMS)
newdata = OrderedDict({k:v for k,v in data.items() if k in ['stop_id', 'stop_name']})
newdata['mode'] = []
arrivalarray = []
# ITERATE CONDITIONALLY KEEPING NEEDED SECTIONS
for i in data['mode']:
if i['route'][0]['route_name'] == line:
if i['route'][0]['direction'][0]['direction_name'] == direction:
for s in i['route'][0]['direction'][0]['trip']:
arrivalarray.append(s['pre_away'])
elif i['route'][0]['direction'][1]['direction_name'] == direction:
for s in i['route'][0]['direction'][1]['trip']:
arrivalarray.append(s['pre_away'])
Well yes, you could use recursion instead of iteration and that is actually what DFS is.
def traverse_json(json, depth):
    """Collect the non-dict values of a nested dict, down to ``depth`` levels.

    Args:
        json: A dictionary (for example decoded JSON) to walk depth-first.
        depth: How many dictionary levels to visit.  With 0 nothing is
            collected; a negative value never reaches 0, so the whole
            structure is visited.

    Returns:
        A list of the non-dict values found, in traversal order.
    """
    if depth == 0:
        return []
    data = []
    for value in json.values():
        if isinstance(value, dict):
            data += traverse_json(value, depth - 1)
        else:
            data.append(value)
    return data
You could start with the max depth you require.
Once you've loaded the JSON data, it's no longer JSON data. It's just a nested series of Python lists, dicts, strings, etc. As such, you can do what you'd do for any Python data structure, such as use a for loop to iterate over the elements of a list:
# Check every direction of the first route instead of hard-coding [0] and [1].
for d in i['route'][0]['direction']:
    if d['direction_name'] == direction:
        arrivalarray.extend(s['pre_away'] for s in d['trip'])
so I have the following function:
def check(g,s):
    """Look for ``s`` in the value lists of ``g`` (version from the question).

    Kept as asked, with indentation restored: the ``else`` branch returns
    0 as soon as the first inspected element differs from ``s``, so only
    the first element of the first list is ever compared.

    Args:
        g: Dictionary whose values are lists.
        s: The value to look for.

    Returns:
        1 or 0 based on the first element inspected; None when there is
        no element to inspect.
    """
    for keys, value in g.items():
        for w, val in enumerate(g[keys]):
            print(s, g[keys][w], w)
            if(s==g[keys][w]):
                return 1
            else:
                return 0
g is a dictionary with an integer as a key and a list as a value
s is an integer I am looking for in the value lists
And I have the following data (a dictionary with an integer as the key and a list as a value):
d={1.4953487812212205: [1, 1.2228445449938519], 2.23606797749979: [2, 1.4953487812212205], 3.309750919646873: [3, 1.8192720851062585]}
The data is actually much longer but I reduced it for our simplicity.
That check function is looking for a specific value in the list of every key in the dictionary and return a 1 or 0. Now the problem is if I take the code and run it as a standalone (fixed dictionary and just run the check function), the code works fine (if I search 3, it will return a 1).
But if I integrate it to my larger code, the w variable never increments and it also only checks the first dictionary entry (only 1 key) instead of all of them and it never finds 3 (since it never gets there).
What can the problem be? I can't seem to put a finger on it.
Don't return 0 until you have checked all the values:
def check(g,s):
    """Return 1 if ``s`` occurs in any value list of ``g``, otherwise 0.

    Each comparison is printed as it is made.  0 is returned only after
    every list has been searched.

    Args:
        g: Dictionary whose values are lists.
        s: The value to look for.
    """
    for keys, value in g.items():
        for w, val in enumerate(g[keys]):
            print(s, g[keys][w], w)
            if(s==g[keys][w]):
                return 1
    return 0
Since you are iterating over items(), you don't have to re-lookup the values using the keys:
def check(g,s):
    """Return 1 if ``s`` occurs in any value list of ``g``, otherwise 0.

    Uses the values yielded by ``items()`` directly instead of looking
    them up again by key.  Each comparison is printed as it is made.

    Args:
        g: Dictionary whose values are lists.
        s: The value to look for.
    """
    for keys, value in g.items():
        for w, val in enumerate(value):
            print(s, val, w)
            if(s==val):
                return 1
    return 0
If you use any with a nested generator, you can accomplish the same thing (just look at the dict's values() collection, since you don't seem to care what the key is):
def check(g,s):
    """Return 1 if ``s`` occurs in any value list of ``g``, otherwise 0.

    Args:
        g: Dictionary whose values are lists.
        s: The value to look for.
    """
    if any(s in vlist for vlist in g.values()):
        return 1
    else:
        return 0
And since True == 1 and False == 0, this further reduces to:
def check(g,s):
    """Return True if ``s`` occurs in any value list of ``g``, else False.

    Args:
        g: Dictionary whose values are lists.
        s: The value to look for.
    """
    return any(s in vlist for vlist in g.values())
So I have to loop through a list of objects, using some of their values to do computation, and then assign them new values.
Because many of the items in the list will be assigned the same new value, I used a dictionary to hold the list of items that will require the same value. For example:
item_dict = {}
for item in list:
value = item.value
if value not in item_dict:
item_dict[value] = [item]
else:
item_dict[value].append(item)
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
items = item_dict[value]
for item in items:
item.value = new_value
I was thinking about replacing the for item in items loop with a decorator, since all the new_value(s) for that list are the same. For example:
def dec(item):
    """Return a setter that assigns its argument to ``item.value``.

    Args:
        item: Any object that accepts a ``value`` attribute.

    Returns:
        A one-argument function; calling it with ``value`` sets
        ``item.value = value`` and returns None.
    """
    def wrap(value):
        item.value = value
    return wrap
def rec(item, func):
    """Return a setter that updates ``item.value`` and then calls ``func``.

    Wrapping setters this way builds a chain: each call to the returned
    function adds one more nested call, so a chain over many items grows
    the call stack accordingly.

    Args:
        item: Any object that accepts a ``value`` attribute.
        func: A one-argument callable invoked with the same value
            after ``item.value`` has been assigned.

    Returns:
        A one-argument function that returns None.
    """
    def wrap(value):
        item.value = value
        func(value)
    return wrap
item_dict = {}
for item in list:
value = item.value
if value not in item_dict:
item_dict[value] = dec(item)
else:
item_dict[value] = rec(item, item_dict[value])
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
items = item_dict[value]
items(new_value)
Would the decorator fashion be more efficient and how much of a memory impact will it have? Are there any better ways of doing this?
A defaultdict works well here:
from collections import defaultdict
item_dict = defaultdict(list)
for item in value_list:
item_dict[item.value].append(item)
# do some calculations base on values
new_data # some dictionary created by computation
# new data is stored new_data[value] = new_value
for value, new_value in new_data.items():
for item in item_dict[value]:
item.value = new_value
I struggle to think of a way the decorator version could be better - for one thing, you have to worry about the recursion limit.
The get method works well in the first case.
item_dict = {}
for item in list:
item_dict[item.value] = item_dict.get(item.value, []) + [item]
The key to making this work is to use list addition instead of append, as append returns None.
I am trying to find corresponding keys in two different dictionaries. Each has about 600k entries.
Say for example:
myRDP = { 'Actinobacter': 'GATCGA...TCA', 'subtilus sp.': 'ATCGATT...ACT' }
myNames = { 'Actinobacter': '8924342' }
I want to print out the value for Actinobacter (8924342) since it matches a value in myRDP.
The following code works, but is very slow:
for key in myRDP:
for jey in myNames:
if key == jey:
print key, myNames[key]
I've tried the following but it always results in a KeyError:
for key in myRDP:
print myNames[key]
Is there perhaps a function implemented in C for doing this? I've googled around but nothing seems to work.
Thanks.
Use sets, because they have a built-in intersection method which ought to be quick:
myRDP = { 'Actinobacter': 'GATCGA...TCA', 'subtilus sp.': 'ATCGATT...ACT' }
myNames = { 'Actinobacter': '8924342' }
rdpSet = set(myRDP)
namesSet = set(myNames)
for name in rdpSet.intersection(namesSet):
print name, myNames[name]
# Prints: Actinobacter 8924342
You could do this:
for key in myRDP:
if key in myNames:
print key, myNames[key]
Your first attempt was slow because you were comparing every key in myRDP with every key in myNames. In algorithmic jargon, if myRDP has n elements and myNames has m elements, then that algorithm would take O(n×m) operations. For 600k elements each, this is 360,000,000,000 comparisons!
But testing whether a particular element is a key of a dictionary is fast -- in fact, this is one of the defining characteristics of dictionaries. In algorithmic terms, the key in dict test is O(1), or constant-time. So my algorithm will take O(n) time, which is one 600,000th of the time.
in python 3 you can just do
myNames.keys() & myRDP.keys()
for key in myRDP:
name = myNames.get(key, None)
if name:
print key, name
dict.get returns the default value you give it (in this case, None) if the key doesn't exist.
You could start by finding the common keys and then iterating over them. Set operations should be fast because they are implemented in C, at least in modern versions of Python.
common_keys = set(myRDP).intersection(myNames)
for key in common_keys:
print key, myNames[key]
The best and easiest way is simply to use the common set operations (Python 3).
a = {"a": 1, "b":2, "c":3, "d":4}
b = {"t1": 1, "b":2, "e":5, "c":3}
res = a.items() & b.items() # {('b', 2), ('c', 3)} For common Key and Value
res = {i[0]:i[1] for i in res} # In dict format
common_keys = a.keys() & b.keys() # {'b', 'c'}
Cheers!
Use the get method instead:
for key in myRDP:
value = myNames.get(key)
if value != None:
print key, "=", value
You can simply write this code and it will save the common key in a list.
# Membership is tested on the dict itself (a hash lookup); calling .keys()
# inside the condition would rebuild a key list per element on Python 2.
common = [i for i in myRDP if i in myNames]
Copy both dictionaries into one dictionary/array. This makes sense as you have 1:1 related values. Then you need only one search, no comparison loop, and can access the related value directly.
Example Resulting Dictionary/Array:
[Name][Value1][Value2]
[Actinobacter][GATCGA...TCA][8924342]
[XYZbacter][BCABCA...ABC][43594344]
...
Here is my code for doing intersections, unions, differences, and other set operations on dictionaries:
class DictDiffer(object):
    """
    Calculate the difference between two dictionaries as:
    (1) items added
    (2) items removed
    (3) keys same in both but changed values
    (4) keys same in both and unchanged values
    """

    def __init__(self, current_dict, past_dict):
        """Store both dictionaries and precompute their key sets.

        Args:
            current_dict: The newer dictionary.
            past_dict: The older dictionary to compare against.
        """
        self.current_dict, self.past_dict = current_dict, past_dict
        self.set_current, self.set_past = set(current_dict), set(past_dict)
        self.intersect = self.set_current.intersection(self.set_past)

    def added(self):
        """Return the set of keys present only in ``current_dict``."""
        return self.set_current - self.intersect

    def removed(self):
        """Return the set of keys present only in ``past_dict``."""
        return self.set_past - self.intersect

    def changed(self):
        """Return the set of shared keys whose values differ."""
        return {o for o in self.intersect
                if self.past_dict[o] != self.current_dict[o]}

    def unchanged(self):
        """Return the set of shared keys whose values are equal."""
        return {o for o in self.intersect
                if self.past_dict[o] == self.current_dict[o]}
if __name__ == '__main__':
    import unittest

    class TestDictDifferNoChanged(unittest.TestCase):
        """Shared keys carry equal values, so nothing is reported as changed."""

        def setUp(self):
            self.past = dict((k, 2*k) for k in range(5))
            self.current = dict((k, 2*k) for k in range(3,8))
            self.d = DictDiffer(self.current, self.past)

        def testAdded(self):
            self.assertEqual(self.d.added(), set((5,6,7)))

        def testRemoved(self):
            self.assertEqual(self.d.removed(), set((0,1,2)))

        def testChanged(self):
            self.assertEqual(self.d.changed(), set())

        def testUnchanged(self):
            self.assertEqual(self.d.unchanged(), set((3,4)))

    class TestDictDifferNoCUnchanged(unittest.TestCase):
        """Shared keys carry different values, so nothing is reported as unchanged."""

        def setUp(self):
            self.past = dict((k, 2*k) for k in range(5))
            self.current = dict((k, 2*k+1) for k in range(3,8))
            self.d = DictDiffer(self.current, self.past)

        def testAdded(self):
            self.assertEqual(self.d.added(), set((5,6,7)))

        def testRemoved(self):
            self.assertEqual(self.d.removed(), set((0,1,2)))

        def testChanged(self):
            self.assertEqual(self.d.changed(), set((3,4)))

        def testUnchanged(self):
            self.assertEqual(self.d.unchanged(), set())

    unittest.main()
def combine_two_json(json_request, json_request2):
    """Return the entries of ``json_request2`` whose keys are also in ``json_request``.

    Args:
        json_request: Dictionary that supplies the keys to look for.
        json_request2: Dictionary that supplies the values.

    Returns:
        A new dict with the shared keys, in the iteration order of
        ``json_request``, mapped to their values from ``json_request2``.
    """
    return {
        key: json_request2[key]
        for key in json_request
        if key in json_request2
    }