default dict in grouping - python

My code doesn't give me the results I expect, and my assert statement fails.
What might be the problem?
I've tried shifting the 'return d' function one tab right to shorten what the code might do and after printing 'groups1' and 'test_max_item', the output is similar. However the assert function still fails.
scripts = [{'a': 1, 'bnf_name': 'joy', 'items': 3},
{'a': 2, ' bnf_name': 'loss', ' items': 6}, ...]
from collections import defaultdict
def group_by_field(dictionary, grpbykey):
    """Group a sequence of dicts by the value each one holds under a key.

    dictionary : iterable of dicts (the records to group)
    grpbykey   : key whose value decides which group a record goes into

    Returns a defaultdict(list) mapping each distinct value of
    ``record[grpbykey]`` to the list of records carrying that value.
    Raises KeyError if a record does not contain ``grpbykey``.
    """
    groups = defaultdict(list)
    for record in dictionary:
        groups[record[grpbykey]].append(record)
    # The return must come after the loop; returning inside it would stop
    # after the first record.
    return groups
groups1 = group_by_field(scripts, 'bnf_name')
test_max_item = group_by_field(scripts,'bnf_name')
assert test_max_item == groups1
I expected no assert errors.

Your code lacks readability (x100). Anyway, maybe this is what you are looking for:
from collections import defaultdict

scripts = [{'a': 1, 'bnf_name': 'joy', 'items': 3}, {'a': 2, ' bnf_name': 'loss', ' items': 6}]
# NOTE: despite the name, this is a shallow copy of every dict (key -> value),
# not a key/value reversal. dict.items() replaces the Python 2-only iteritems().
reverse_scripts = [{key: val for key, val in d.items()} for d in scripts]


def group_by_field(dictionaries, grpbykey):
    """Collect the values stored under ``grpbykey`` across several dicts.

    dictionaries : iterable of dicts
    grpbykey     : key to look up in each dict

    Returns a defaultdict(list) with at most one entry: ``grpbykey`` mapped
    to the list of values found under it. Dicts lacking the key are skipped.
    """
    d = defaultdict(list)
    for dictionary in dictionaries:
        if grpbykey in dictionary:
            d[grpbykey].append(dictionary[grpbykey])
    return d


groups1 = group_by_field(reverse_scripts, 'bnf_name')
test_max_item = group_by_field(reverse_scripts, 'bnf_name')
assert test_max_item == groups1

Related

merge a dict into another, overwriting values including updating lists and sub-dicts (not overwriting the list itself)

I have a dictionary D which contains default settings for my application. It has a complex hierarchy, such as lists, and more dicts inside those lists (e.g. it might have a list of modules, and within each module there are further dicts, sometimes with more lists and more dicts etc).
I also have a small preferences dictionary P which contains an arbitrary subset of this dict (I'm 100% sure that this is a perfect subset).
I'd like to merge this subset P over the default dictionary D.
I thought D.update(P) would work, but this overwrites the lists.
E.g.
D = {'i': 0, 'j': 1, 'modules': [{'a': 1}, {'b': 2}, {'c': 3}]}
P = {'i': 10, 'modules': [{'c': 30}]}
# dict.update is a shallow merge: the whole 'modules' list is replaced.
D.update(P)
# gives {'i': 10, 'j': 1, 'modules': [{'c': 30}]}
# I'd like {'i': 10, 'j': 1, 'modules': [{'a': 1}, {'b': 2}, {'c': 30}]}
There are a lot of similar posts regarding merging dictionaries in different ways, adding entries etc, but none of them seem to address this exact issue. This seems like a very common task but I couldn't figure out how to do it so I'd appreciate any pointers.
Cheers,
(P.S. I'd also like to maintain the order of all of the lists, as it gets reflected in the GUI)
EDIT:
It seems I wasn't very clear in my explanation. Sorry about that. The example above is a very simple toy example. My actual data (when saved to JSON) is about 50K. The hierarchy goes quite deep and I have dicts inside lists inside dicts inside lists etc. Also the atomic update rule wasn't clear apparently (i.e. 0 to 10 is addition or overwriting?). To be clear the atomic update is overwriting. P overwrites D. It's only dicts and lists of dicts which need to further iterated. (I was hoping the user Preferences overwriting Default settings would help visualise this). I also omitted an important detail in the above toy example, and that is that the dictionaries in the list should be matched not by key name (as is in the example above, i.e. the dict with key 'a' is common to P and D), but by value on a specific key. See new toy example below.
D={'i':'Hello', 'j':'World', 'modules':[{'name':'a', 'val':1}, {'name':'b', 'val':2}, {'name':'c', 'val':3}, {'name':'d', 'val':4}] }
P={'i':'Goodbye', 'modules':[{'name':'a', 'val':10}, {'name':'c', 'val':30}] }
EDIT2:
I've added a solution which seems to work. I was hoping for a more concise pythonic solution, but this does the job for now.
Here is a hack that merges your current two dicts.
I'm aware that it is not the "most pythonic" way to do it, but it can handle dicts like yours and give the desired output.
In my answer, i'm using groupby and zip_longest from itertools module.
Here is my answer:
from itertools import groupby, zip_longest

D = {'i': 0, 'j': 1, 'modules': [{'a': 1}, {'b': 2}, {'c': 3}]}
P = {'i': 10, 'modules': [{'c': 30}]}

# Put the items of both dicts side by side so they can be grouped by key.
sub = list(D.items()) + list(P.items())
final = {}
for k, v in groupby(sorted(sub, key=lambda x: x[0]), lambda x: x[0]):
    bb = list(v)
    if not isinstance(bb[0][1], list):
        # Scalar value: keep the largest candidate. This only picks P's value
        # because P's values happen to be larger in this toy example.
        final[k] = max(bb, key=lambda x: x[1])[1]
    else:
        kk, ff = [], []
        # Interleave the lists from D and P, dropping zip_longest's padding.
        for k_ in zip_longest(*[pair[1] for pair in bb]):
            kk += [j for j in k_ if j is not None]
        # Group the single-key dicts by that key and keep the largest of each.
        for j, m in groupby(sorted(kk, key=lambda x: list(x.keys())[0]), lambda x: list(x.keys())[0]):
            ff += [dict(max([list(item.items()) for item in m], key=lambda x: x))]
        final[k] = ff
print(final)
Output:
{'i': 10, 'j': 1, 'modules': [{'a': 1}, {'b': 2}, {'c': 30}]}
I was hoping for a more pythonic solution (much more concise). Here is a C-like solution (which is more where I come from).
Note: D and P below are very simplified toy examples. In reality they are quite deep with dicts inside lists inside dicts inside lists. This might not cover all cases, but it seems to work with my data (~50KBish when saved to json).
Output:
In [2]: P
Out[2]:
{'i': 'Goodbye',
'modules': [{'name': 'a', 'val': 10}, {'name': 'c', 'val': 30}]}
In [3]: D
Out[3]:
{'i': 'Hello',
'j': 'World',
'modules': [{'name': 'a', 'val': 1},
{'name': 'b', 'val': 2},
{'name': 'c', 'val': 3},
{'name': 'd', 'val': 4}]}
In [4]: merge_dicts_by_name(P, D)
merge_dicts_by_name <type 'dict'> <type 'dict'>
key: .i : Hello overwritten by Goodbye
key: .modules :
merge_dicts_by_name .modules <type 'list'> <type 'list'>
list item: .modules[0]
merge_dicts_by_name .modules[0] <type 'dict'> <type 'dict'>
key: .modules[0].name : a overwritten by a
key: .modules[0].val : 1 overwritten by 10
list item: .modules[1]
merge_dicts_by_name .modules[1] <type 'dict'> <type 'dict'>
key: .modules[1].name : c overwritten by c
key: .modules[1].val : 3 overwritten by 30
In [5]: D
Out[5]:
{'i': 'Goodbye',
'j': 'World',
'modules': [{'name': 'a', 'val': 10},
{'name': 'b', 'val': 2},
{'name': 'c', 'val': 30},
{'name': 'd', 'val': 4}]}
Code:
def merge_dicts_by_name(P, D, id_key='name', root='', depth=0, verbose=True, indent=' '):
    '''
    Merge from dict (or list of dicts) P into D, modifying D in place.
    i.e. can think of D as Default settings, and P as a subset containing user Preferences.

    Any value in P or D can be a dict or a list of dicts,
    in which case the same behaviour applies (through recursion):
        lists are iterated and dicts are matched between P and D
        dicts are matched via an id_key (only at same hierarchy depth / level)
        matching dicts are updated with the same behaviour
        for anything else P overwrites D

    P       : dict or list of dicts (e.g. containing user Preferences, subset of D)
    D       : dict or list of dicts (e.g. Default settings)
    id_key  : the key by which sub-dicts are compared against (e.g. 'name')
    root    : for keeping track of full path during recursion
    depth   : keep track of recursion depth (for indenting)
    verbose : dump progress to console
    indent  : with what to indent (if verbose)

    Returns None. Warnings (key or list item of P not present in D,
    unsupported types) are printed regardless of ``verbose``.
    '''
    # Computed unconditionally: the warnings below use it even when verbose is off.
    indent_full = indent * depth
    if verbose:
        print(indent_full, 'merge_dicts_by_name', root, type(P), type(D))

    if isinstance(P, list):  # D and P are lists of dicts
        assert isinstance(D, list)
        for p_i, p_dict in enumerate(P):  # iterate dicts in P
            path = root + '[' + str(p_i) + ']'
            if verbose:
                print(indent_full, 'list item:', path)
            d_id = p_dict[id_key]  # get name of current dict
            # Find the corresponding dict in D. A default of None keeps a
            # missing match from leaking StopIteration.
            d_dict = next((d for d in D if d[id_key] == d_id), None)
            if d_dict is None:
                print(indent_full, 'Warning: Item {} ({}={!r}) in P not found in D'.format(path, id_key, d_id))
                continue
            merge_dicts_by_name(p_dict, d_dict, id_key=id_key, root=path, depth=depth+1, verbose=verbose, indent=indent)

    elif isinstance(P, dict):
        assert isinstance(D, dict)
        for k in P:
            path = root + '.' + str(k)
            if verbose:
                print(indent_full, 'key:', path, end=' : ')
            if k in D:
                if isinstance(P[k], (dict, list)):
                    if verbose:
                        print()  # finish the 'key:' line before recursing
                    merge_dicts_by_name(P[k], D[k], id_key=id_key, root=path, depth=depth+1, verbose=verbose, indent=indent)
                else:
                    if verbose:
                        print(D[k], 'overwritten by', P[k])
                    D[k] = P[k]
            else:
                print(indent_full, 'Warning: Key {} in P not found in D'.format(path))

    else:
        print(indent_full, "Warning: Don't know what to do with these types", type(P), type(D))

How can I find dict keys for matching values in two dicts?

I have two dictionaries mapping IDs to values. For simplicity, lets say those are the dictionaries:
d_source = {'a': 1, 'b': 2, 'c': 3, '3': 3}
d_target = {'A': 1, 'B': 2, 'C': 3, '1': 1}
As named, the dictionaries are not symmetrical.
I would like to get a dictionary of keys from dictionaries d_source and d_target whose values match. The resulting dictionary would have d_source keys as its own keys, and d_target keys as that keys value (in either a list, tuple or set format).
The expected return value for the above example would be the following dictionary:
{'a': ('1', 'A'),
'b': ('B',),
'c': ('C',),
'3': ('C',)}
There are two somewhat similar questions, but those solutions can't be easily applied to my question.
Some characteristics of the data:
Source would usually be smaller than target. Having roughly few thousand sources (tops) and a magnitude more targets.
Duplicates in the same dict (both d_source and d_target) are not too likely on values.
Matches are expected to be found for (as a rough estimate) no more than 50% of d_source items.
All keys are integers.
What is the best (performance wise) solution to this problem?
Modeling data into other datatypes for improved performance is totally ok, even when using third party libraries (i'm thinking numpy)
All answers have O(n^2) efficiency which isn't very good so I thought of answering myself.
I use 2(source_len) + 2(dict_count)(dict_len) memory and I have O(2n) efficiency which is the best you can get here I believe.
Here you go:
from collections import defaultdict
d_source = {'a': 1, 'b': 2, 'c': 3, '3': 3}
d_target = {'A': 1, 'B': 2, 'C': 3, '1': 1}
def merge_dicts(source_dict, *rest):
    """Map each key of ``source_dict`` to the keys in ``rest`` sharing its value.

    source_dict : dict whose keys become the keys of the result
    rest        : one or more dicts to search for matching values

    Returns a dict ``{source_key: tuple_of_matching_keys}``; a source key
    with no match maps to an empty tuple. The input dicts are not modified.
    """
    flipped_rest = defaultdict(list)
    for d in rest:
        # Walk last-inserted-first so the tuples come out in the same order
        # the former popitem() loop produced, without emptying the caller's dict.
        for k, v in reversed(list(d.items())):
            flipped_rest[v].append(k)
    return {k: tuple(flipped_rest.get(v, ())) for k, v in source_dict.items()}
new_dict = merge_dicts(d_source, d_target)
By the way, I'm using a tuple in order not to link the resulting lists together.
As you've added specifications for the data, here's a closer matching solution:
d_source = {'a': 1, 'b': 2, 'c': 3, '3': 3}
d_target = {'A': 1, 'B': 2, 'C': 3, '1': 1}
def second_merge_dicts(source_dict, *rest):
    """Map each key of ``source_dict`` to the keys in ``rest`` sharing its value.

    Optimized for ~50% source match: values that never occur in
    ``source_dict`` are skipped, which also uses less memory.

    Returns a dict ``{source_key: tuple_of_matching_keys}``; a source key
    with no match maps to an empty tuple. The input dicts are not modified.
    """
    unique_values = set(source_dict.values())
    flipped_rest = defaultdict(list)
    for d in rest:
        # Walk last-inserted-first so the tuples come out in the same order
        # the former popitem() loop produced, without emptying the caller's dict.
        for k, v in reversed(list(d.items())):
            if v in unique_values:
                flipped_rest[v].append(k)
    return {k: tuple(flipped_rest.get(v, ())) for k, v in source_dict.items()}
new_dict = second_merge_dicts(d_source, d_target)
from collections import defaultdict
from pprint import pprint

d_source = {'a': 1, 'b': 2, 'c': 3, '3': 3}
d_target = {'A': 1, 'B': 2, 'C': 3, '1': 1}
d_result = defaultdict(list)
# Compare every source key with every target key. Plain loops are used
# because the body exists only for its side effect on d_result.
for a in d_source:
    for b in d_target:
        if d_source[a] == d_target[b]:
            d_result[a].append(b)
# Convert to a plain dict so the printout is not wrapped in defaultdict(...).
pprint(dict(d_result))
Output:
{'3': ['C'],
'a': ['A', '1'],
'b': ['B'],
'c': ['C']}
Timing results:
from collections import defaultdict
from copy import deepcopy
from random import randint
from timeit import timeit
def Craig_match(source, target):
    """Brute-force match: compare every source key with every target key.

    Returns a defaultdict(list) mapping each source key that has at least
    one match to the list of target keys holding the same value.
    The nested loops visit len(source) * len(target) pairs.
    """
    result = defaultdict(list)
    # Plain loops: the body exists only for its side effect on result.
    for a in source:
        for b in target:
            if source[a] == target[b]:
                result[a].append(b)
    return result
def Bharel_match(source_dict, *rest):
    """Match source keys to keys in ``rest`` through an inverted value index.

    Returns a dict ``{source_key: tuple_of_matching_keys}``; a source key
    with no match maps to an empty tuple. The input dicts are not modified.
    """
    flipped_rest = defaultdict(list)
    for d in rest:
        # Walk last-inserted-first so the tuples come out in the same order
        # the former popitem() loop produced, without emptying the caller's dict.
        for k, v in reversed(list(d.items())):
            flipped_rest[v].append(k)
    return {k: tuple(flipped_rest.get(v, ())) for k, v in source_dict.items()}
def modified_Bharel_match(source_dict, *rest):
    """Match source keys to keys in ``rest`` through a filtered inverted index.

    Optimized for ~50% source match: values that never occur in
    ``source_dict`` are skipped, which also uses less memory.

    Returns a dict ``{source_key: tuple_of_matching_keys}``; a source key
    with no match maps to an empty tuple. The input dicts are not modified.
    """
    unique_values = set(source_dict.values())
    flipped_rest = defaultdict(list)
    for d in rest:
        # Walk last-inserted-first so the tuples come out in the same order
        # the former popitem() loop produced, without emptying the caller's dict.
        for k, v in reversed(list(d.items())):
            if v in unique_values:
                flipped_rest[v].append(k)
    return {k: tuple(flipped_rest.get(v, ())) for k, v in source_dict.items()}
# generate source, target such that:
# a) ~10% duplicate values in source and target
# b) 2000 unique source keys, 20000 unique target keys
# c) a little less than 50% matches source value to target value
# d) numeric keys and values
source = {}
for k in range(2000):
source[k] = randint(0, 1800)
target = {}
for k in range(20000):
if k < 1000:
target[k] = randint(0, 2000)
else:
target[k] = randint(2000, 19000)
best_time = {}
approaches = ('Craig', 'Bharel', 'modified_Bharel')
for a in approaches:
best_time[a] = None
for _ in range(3):
for approach in approaches:
test_source = deepcopy(source)
test_target = deepcopy(target)
statement = 'd=' + approach + '_match(test_source,test_target)'
setup = 'from __main__ import test_source, test_target, ' + approach + '_match'
t = timeit(stmt=statement, setup=setup, number=1)
if not best_time[approach] or (t < best_time[approach]):
best_time[approach] = t
for approach in approaches:
print(approach, ':', '%0.5f' % best_time[approach])
Output:
Craig : 7.29259
Bharel : 0.01587
modified_Bharel : 0.00682
Here is another solution. There are a lot of ways to do this
for key1 in d1:
for key2 in d2:
if d1[key1] == d2[key2]:
stuff
Note that you can use any name for key1 and key2.
This may be "cheating" in some regards, but if you are looking for the matching values of the keys regardless of case, then you might be able to do:
aa = {'a': 1, 'b': 2, 'c': 3}
bb = {'A': 1, 'B': 2, 'd': 3}
# Lower-case bb's keys so they can be compared with aa's keys.
bbl = {k.lower(): v for k, v in bb.items()}
# items() views are set-like, so "&" keeps the (key, value) pairs present in
# both dicts. This requires hashable values.
result = {k: k.upper() for k, v in aa.items() & bbl.items()}
print(result)
Output:
{'a': 'A', 'b': 'B'}
The bbl declaration changes the bb keys into lowercase (it could be either aa, or bb).
* I only tested this on my phone, so just throwing this idea out there I suppose... Also, you've changed your question radically since I began composing my answer, so you get what you get.
It is up to you to determine the best solution. Here is a solution:
def dicts_to_tuples(*dicts):
    """Group the keys of several dicts by the value they share.

    Returns a list of tuples, one per value that is held by more than one
    key across all the given dicts; each tuple lists those keys in the
    order they were encountered.
    """
    result = {}
    for d in dicts:
        for k, v in d.items():
            result.setdefault(v, []).append(k)
    return [tuple(v) for v in result.values() if len(v) > 1]


d1 = {'a': 1, 'b': 2, 'c': 3}
d2 = {'A': 1, 'B': 2}
print(dicts_to_tuples(d1, d2))

make a dict/json from string with duplicate keys Python

I have a string that could be parsed as a JSON or dict object. My string variable looks like this :
my_string_variable = """{
"a":1,
"b":{
"b1":1,
"b2":2
},
"b": {
"b1":3,
"b2":2,
"b4":8
}
}"""
When I do json.loads(my_string_variable), I have a dict but only the second value of the key "b" is kept, which is normal because a dict can't contain duplicate keys.
What would be the best way to have some sort of defaultdict like this :
result = {
"a": 1,
"b": [{"b1": 1, "b2": 2}, {"b1": 3, "b2": 2, "b4": 8}],
}
I have already looked for similar questions but they all deal with dicts or lists as an input and then create defaultdicts to handle the duplicate keys.
In my case I have a string variable and I would want to know if there is a simple way to achieve this.
something like the following can be done.
import json
def join_duplicate_keys(ordered_pairs):
    """``object_pairs_hook`` for json.loads that keeps duplicate keys.

    ordered_pairs : list of (key, value) pairs for one JSON object

    Returns a dict in which a key seen once maps to its value, and a key
    seen several times maps to a list of all its values in document order.
    """
    d = {}
    # Keys whose entry in d is an accumulator list created here. Tracking them
    # explicitly keeps a genuine JSON array value from being mistaken for one.
    duplicated = set()
    for k, v in ordered_pairs:
        if k not in d:
            d[k] = v
        elif k in duplicated:
            d[k].append(v)
        else:
            d[k] = [d[k], v]
            duplicated.add(k)
    return d
raw_post_data = '{"a":1, "b":{"b1":1,"b2":2}, "b": { "b1":3, "b2":2,"b4":8} }'
newdict = json.loads(raw_post_data, object_pairs_hook=join_duplicate_keys)
print (newdict)
Please note that above code depends on value type, if type(d[k]) == list. So if original string itself gives a list then there could be some error handling required to make the code robust.
Accepted answer is perfectly fine. I just wanted to show another approach.
So at first, you dedicate a list for values in order to easily accumulate next values. At the end, you call pop on the lists which have only one item. This means that the list doesn't have duplicate values:
import json
from collections import defaultdict
my_string_variable = '{"a":1, "b":{"b1":1,"b2":2}, "b": { "b1":3, "b2":2,"b4":8} }'
def join_duplicate_keys(ordered_pairs):
    """``object_pairs_hook`` for json.loads that keeps duplicate keys.

    Every value is first collected into a per-key list; keys that ended up
    with a single value are then unwrapped, the rest keep their list.
    """
    d = defaultdict(list)
    for k, v in ordered_pairs:
        d[k].append(v)
    # Indexing instead of pop(): the temporary lists need not be mutated.
    return {k: v[0] if len(v) == 1 else v for k, v in d.items()}
d = json.loads(my_string_variable, object_pairs_hook=join_duplicate_keys)
print(d)
output:
{'a': 1, 'b': [{'b1': 1, 'b2': 2}, {'b1': 3, 'b2': 2, 'b4': 8}]}

python construct nested dict dynamically

I want to create a nested dict in python dynamically. By saying that:
given
tuple1 = ('A','B'),
tuple2 = ('A','C'),
dict = {}
I'd like to have dict like dict = {'A': {'B':1}} after adding tuple1 to dict;
then dict = {'A': {'B' : 1, 'C' : 1}} after adding tuple2 to dict
That's what I have tried, i find the following code to create nested dict recursively. But I'm not sure how to add node dynamically and also increment its value by 1.
def incr_dict(dct, tpl):
    """Wrap ``dct`` in one nested dict per element of ``tpl``.

    The last element of ``tpl`` becomes the innermost key and the first
    element the outermost, e.g. ``incr_dict({}, ('a', 'b'))`` gives
    ``{'a': {'b': {}}}``. An empty ``tpl`` returns ``dct`` unchanged.

    NOTE: this only builds a fresh nesting around ``dct``; it does not add
    a path to an existing nested dict or increment any counter.
    """
    for key in reversed(tpl):
        dct = {key: dct}
    return dct
dct = {}
tpl = ('a', 'b', 'c')
dct = incr_dict(dct, tpl)
print(dct)
At the end of the below code, you will have a dict d which is {'A': {'B': 1, 'C': 1}}; note that the outermost loop isn't strictly necessary, but it saved me some typing in this instance.
tuple1 = ('A', 'B')
tuple2 = ('A', 'C')
d = {}
# Each tuple is an (outer key, inner key) pair; count how often each occurs.
for k, v in (tuple1, tuple2):
    inner = d.setdefault(k, {})  # create the inner dict on first sight of k
    inner[v] = inner.get(v, 0) + 1

Convert a list into a nested dictionary

For example I have
x = ['a','b','c']
I need to convert it to:
y['a']['b']['c'] = ''
Is that possible?
For the background, I have a config file which contains dotted notation that points to a place in some json data. I'd like to use the dotted notation string to access that specific data in the json file. For example, in the config:
path_to_data = "user.name.first_name"
I'd like my script to recognize that as:
json_data["user"]["name"]["first_name"]
so I can get the value of the first_name field. I converted the original string into a list, and now I don't know how to convert it to a nested dict.
EDIT: There is an existing data structure that I need to apply the dict with. Let's say:
m = {'a': {'b': {'c': 'lolcat'}}}
so that
m['a']['b']['c']
gives me 'lolcat'. If I get the right dictionary structure (as some of the replies did), I would still need to apply this to the existing dictionary 'm'.
So, again, I get this from a config file:
c = 'a.b.c'
That I converted to a list, thinking this will make things easier:
x = ['a','b','c']
Now I have a json-like data structure:
m = {'a': {'b': {'c': 'lolcat'}}}
So the nested dict generated from 'x' should be able to traverse 'm' so that
m['a']['b']['c']
gets me the cat.
from functools import reduce  # reduce is not a builtin in Python 3

li = ['a', 'b', 'c']
# Fold from the innermost value outwards: '' -> {'c': ''} -> {'b': {...}} -> ...
d = reduce(lambda x, y: {y: x}, reversed(li + ['']))
print(d)
print(d['a']['b']['c'])
I guess you also want to include a value in the end. This works for that too:
def get_value(d, l):
    """Return the value found by following the keys in ``l`` through ``d``.

    d : nested mapping
    l : non-empty sequence of keys, outermost first

    Raises IndexError if ``l`` is empty and KeyError if a key is missing.
    """
    node = d[l[0]]
    for key in l[1:]:
        node = node[key]
    return node
def add_keys(d, l, c=None):
    """Store ``c`` in ``d`` under the nested key path ``l``, in place.

    d : dict to modify
    l : non-empty sequence of keys, outermost first
    c : value to store at the end of the path (default None)

    Intermediate dicts are created as needed. An existing nested dict on
    the path is reused, so paths sharing a prefix accumulate; a non-dict
    value in the way is replaced by a new dict.
    Raises IndexError if ``l`` is empty.
    """
    for key in l[:-1]:
        child = d.get(key)
        if not isinstance(child, dict):
            child = d[key] = {}
        d = child
    d[l[-1]] = c
def main():
    """Demo: build a nested dict from two key paths and read the values back."""
    d = {}
    l1 = ['a', 'b', 'c', 'd']
    c1 = 'letters'
    l2 = [42, "42", (42,)]
    c2 = 42
    add_keys(d, l1, c1)
    print(d)
    add_keys(d, l2, c2)
    print(d)
    # Read both values back through the same paths.
    print(get_value(d, l1))
    print(get_value(d, l2))


if __name__ == '__main__':
    main()
It prints:
{'a': {'b': {'c': {'d': 'letters'}}}}
{'a': {'b': {'c': {'d': 'letters'}}}, 42: {'42': {(42,): 42}}}
letters
42
So it surely works. Recursion for the win.
>>> x = ['a','b','c']
>>> y={}
>>> y[x[-1]]=""
>>> x.pop(-1)
'c'
>>> for i in x[::-1]:
... y={i:y}
...
>>> y
{'a': {'b': {'c': ''}}}
>>> y['a']['b']['c']
''
This will work.
#!/usr/bin/python2
from __future__ import print_function
x = ['a','b','c']
def ltod(l):
    """Turn a list of keys into a chain of nested dicts.

    ``ltod(['a', 'b'])`` returns ``{'a': {'b': {}}}``; the innermost value
    is an empty dict. An empty list gives ``{}``. ``l`` is not modified.
    """
    rv = d = {}
    # Iterate rather than l.pop(0), which would empty the caller's list.
    for key in l:
        d[key] = {}
        d = d[key]
    return rv
d = ltod(x)
print(d)
print(d["a"]["b"]["c"])
d["a"]["b"]["c"] = "text"
print(d["a"]["b"]["c"])
Outputs:
{'a': {'b': {'c': {}}}}
{}
text
Find below sample that is not very beautiful but quite simple:
path_to_data = "user.name.first_name"
keys = path_to_data.split('.')
t = []
for key in keys[::-1]:  # innermost key first
    if not t:
        t.append({key: {}})
    else:
        t[-1] = {key: t[-1]}
# t[0] will contain your dictionary
A general solution would be to use collections.defaultdict to create a nested dictionary. Then override __setitem__ for whatever behavior you'd like. This example will do the string parsing as well.
from collections import defaultdict
class nesteddict(defaultdict):
    """defaultdict whose missing keys become nested ``nesteddict`` instances.

    Assigning to a dotted string key walks (and creates) the intermediate
    levels, so ``nd['a.b.c'] = v`` is equivalent to ``nd['a']['b']['c'] = v``.
    Non-string keys are stored as-is at the current level.
    """

    def __init__(self):
        super().__init__(nesteddict)

    def __setitem__(self, key, value):
        if not isinstance(key, str):
            # Only strings can carry a dotted path.
            super().__setitem__(key, value)
            return
        parts = key.split('.')
        node = self
        for part in parts[:-1]:
            node = node[part]  # missing levels are created by the default factory
        # Call the base method on the target node so the key is not split again.
        defaultdict.__setitem__(node, parts[-1], value)
nd = nesteddict()
nd['a.b.c'] = 'lolcat'
assert nd['a']['b']['c'] == 'lolcat'

Categories

Resources