Python - Find duplicates in list and group them by key

I have a list of python dicts like this:
[{
'id': 1,
'name': 'name1'
}, {
'id': 2,
'name': 'name2'
}, {
'id': 3,
'name': 'name1'
}]
What I want to do is to create a new list of dictionaries, containing only the ones that have the key 'name' duplicated, and group them.
[{
'id1': 1,
'id2': 3,
'name': 'name1'
}]
The first list is an SQL query output and I need to delete the rows that have the key 'name' duplicated, keeping only one.
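Since the stated end goal is deleting the duplicate rows and keeping only one per name, that step can be sketched on its own with a dict keyed by name (a minimal sketch, assuming the sample rows above):

```python
rows = [{'id': 1, 'name': 'name1'},
        {'id': 2, 'name': 'name2'},
        {'id': 3, 'name': 'name1'}]

# Keep the first row seen for each name; later duplicates are dropped.
seen = {}
for row in rows:
    seen.setdefault(row['name'], row)

deduped = list(seen.values())
print(deduped)  # [{'id': 1, 'name': 'name1'}, {'id': 2, 'name': 'name2'}]
```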

You can use itertools.groupby:
import itertools
d = [{'id': 1, 'name': 'name1'}, {'id': 2, 'name': 'name2'}, {'id': 3, 'name': 'name1'}]
new_data = [(name, list(grp)) for name, grp in itertools.groupby(sorted(d, key=lambda x: x['name']), key=lambda x: x['name'])]
final_dicts = [{'name': name, **{f'id{i}': item['id'] for i, item in enumerate(items, 1)}}
               for name, items in new_data if len(items) > 1]
Output:
[{'name': 'name1', 'id1': 1, 'id2': 3}]

I suggest the following solution, which is quite easy to read and understand:
from collections import defaultdict
ds = [{'id': 1, 'name': 'name1'},
      {'id': 2, 'name': 'name2'},
      {'id': 3, 'name': 'name1'}]
newd = defaultdict(list)
for d in ds:
    newd[d['name']].append(d['id'])
# Here newd is {'name1': [1, 3], 'name2': [2]}
result = []
for k, v in newd.items():
    if len(v) > 1:
        d = {f'id{i}': ident for i, ident in enumerate(v, 1)}
        d['name'] = k
        result.append(d)
print(result)  # [{'id1': 1, 'id2': 3, 'name': 'name1'}]

You can use collections.Counter:
from collections import Counter
from operator import itemgetter
l = [{'id': 1, 'name': 'name1'}, {'id': 2, 'name': 'name2'}, {'id': 3, 'name': 'name1'}]
print([
    {'name': n, **{'id%d' % i: d['id']
                   for i, d in enumerate([d for d in l if d['name'] == n], 1)}}
    for n, c in Counter(map(itemgetter('name'), l)).items() if c > 1
])
This outputs:
[{'name': 'name1', 'id1': 1, 'id2': 3}]

Related

How to merge two list of dictionaries based on a value

I have two lists of dictionaries, let's say:
a = [{'id': 1, 'name': 'a'}]
b = [{'id': 1, 'city': 'b'}]
I want a list that merges every dictionary in both lists that shares the same id. In this example I expect:
a = [{'id': 1, 'name': 'a', 'city': 'b'}]
Is there any cleaner way of doing it than one for loop nested inside the other?
Thanks
You can keep track of the ids with another dict (or defaultdict to make things simpler). Then update the items in that dict as you iterate. In the end the dict's values will have your list.
from collections import defaultdict
d = defaultdict(dict)
a = [{'id': 1, 'name': 'a'}, {'id': 3, 'name': 'a'}]
b = [{'id': 1, 'city': 'b'}, {'id': 2, 'city': 'c'}, {'id': 3, 'city': 'd'}]
for item in a + b:
    d[item['id']].update(item)
list(d.values())
# [{'id': 1, 'name': 'a', 'city': 'b'},
# {'id': 3, 'name': 'a', 'city': 'd'},
# {'id': 2, 'city': 'c'}]
Note this will overwrite duplicate values other than id, so if you have two items with id: 1 and two different cities, you will only keep the last city.
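If those overwritten values actually matter, one way around it is to collect every value for a key into a list instead of overwriting. A sketch of that variant (the a/b sample data here is hypothetical, with two cities for the same id):

```python
from collections import defaultdict

a = [{'id': 1, 'name': 'a'}]
b = [{'id': 1, 'city': 'b'}, {'id': 1, 'city': 'x'}]

merged = defaultdict(dict)
for item in a + b:
    entry = merged[item['id']]
    for key, value in item.items():
        if key == 'id':
            entry['id'] = value
        else:
            # Collect every value for the key instead of overwriting.
            entry.setdefault(key, []).append(value)

print(list(merged.values()))
# [{'id': 1, 'name': ['a'], 'city': ['b', 'x']}]
```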
One way to do this is to make a dictionary, mapping the identifier that you want to use (id in this case) to a dictionary of merged results.
#!/usr/bin/env python3
import collections

def merge_on_key(list_of_dictionaries, key, result):
    for d in list_of_dictionaries:
        assert key in d
        result[d[key]].update(d)

a = [{'id': 1, 'name': 'a'}]
b = [{'id': 1, 'city': 'b'}, {'id': 2, 'color': 'blue'}]
print('a', a)
print('b', b)
c = collections.defaultdict(dict)
merge_on_key(a, 'id', c)
merge_on_key(b, 'id', c)
print('merged results in dictionary with id 1', c[1])
That returns:
merged results in dictionary with id 1 {'id': 1, 'name': 'a', 'city': 'b'}
You can use map and a lambda function in conjunction with the update method for dictionaries, like this:
a = [{'id': 1, 'name': 'a'}, {'id': 2, 'name': 'a'}, {'id': 3, 'name': 'k'}]
b = [{'id': 1, 'city': 'b'}, {'id': 2, 'city': 'c'}, {'id': 4, 'city': 'cm'}]
a.extend(list(map(lambda x,y: y if x.get('id') != y.get('id') else x.update(y), a, b)))
a = list(filter(None, a))
a will now become a list containing dictionaries of merged values like this (note this relies on a and b being aligned by position, since map pairs the elements index by index):
[{'id': 1, 'name': 'a', 'city': 'b'},
{'id': 2, 'name': 'a', 'city': 'c'},
{'id': 3, 'name': 'k'},
{'id': 4, 'city': 'cm'}]
from collections import defaultdict
from operator import itemgetter

l1 = [{'id': 1, 'City': 'Calcutta'}, {'id': 3, 'Country': 'Germany'}]
l2 = [{'id': 1, 'Country': 'India'}, {'id': 2, 'City': 'Delhi'}, {'id': 3, 'City': 'Berlin'}]

def merge1(l1, l2):
    d = defaultdict(dict)
    for l in (l1, l2):
        for innerdict1 in l:
            d[innerdict1['id']].update(innerdict1)
    l4 = sorted(d.values(), key=itemgetter("id"))
    print(l4)
    return l4

merge1(l1, l2)
"""
[{'id': 1, 'City': 'Calcutta', 'Country': 'India'}, {'id': 2, 'City': 'Delhi'}, {'id': 3, 'Country': 'Germany', 'City': 'Berlin'}]
"""

Finding differences of two list of dictionaries in Python [duplicate]

I have two lists of dictionaries like below:
prev = [
{ 'id': 0, 'name': 'a' },
{ 'id': 1, 'name': 'b' },
{ 'id': 2, 'name': 'c' }
]
current = [
{ 'id': 1, 'name': 'b' },
{ 'id': 2, 'name': 'c' },
{ 'id': 3, 'name': 'e' },
{ 'id': 4, 'name': 'f' }
]
I want to get the difference of them, the result should be like below:
result = [
{ 'id': 3, 'name': 'e' },
{ 'id': 4, 'name': 'f' }
]
Only the difference of those two should appear in the result list. My solution is like below:
common = []
for c in current:
    for p in prev:
        if c['name'] == p['name']:
            common.append(c)
print(common)
I'm trying to find the common items between the two lists and then subtract them from the current list, but I don't know how to handle it. If I'm using the wrong procedure, is there another way I can find the diff between these two?
I tried to search a lot, but all the results I found only compare two lists of integers, while in my case I have lists of dictionaries.
Also note that the id key is just for telling the items apart; let's compare by name. I want to remove the common items from current and keep the rest in current, i.e. I don't need name: a and name: b from the prev list.
Simple
From the data you posted, you can compare the whole dicts, so just find dicts in current that are not in prev:
new = [d for d in current if d not in prev]
print(new) # -> [{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
Complex
If your real-world data might have differing ids, the solution needs to get more complex.
Since only the names are important, make a set of common names. Then you can loop over the dicts and check whether the name is in the common set.
prev = [{'id': 0, 'name': 'a'}, {'id': 1, 'name': 'b'}, {'id': 2, 'name': 'c'}]
current = [{'id': 1, 'name': 'b'}, {'id': 2, 'name': 'c'}, {'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
prev_names, current_names = [{d['name'] for d in x} for x in (prev, current)] # [{'c', 'b', 'a'}, {'c', 'b', 'f', 'e'}]
common_names = prev_names & current_names # {'b', 'c'}
new = [d for d in current if d['name'] not in common_names]
print(new) # -> [{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
This is also easy to adapt to getting names in prev that are not common:
old = [d for d in prev if d['name'] not in common_names]
print(old) # -> [{'id': 0, 'name': 'a'}]
This will do the job
prev = [ { 'id': 0, 'name': 'a' }, { 'id': 1, 'name': 'b' }, { 'id': 2, 'name': 'c' } ]
current = [ { 'id': 1, 'name': 'b' }, { 'id': 2, 'name': 'c' }, { 'id': 3, 'name': 'e' }, { 'id': 4, 'name': 'f' } ]
common = []
for c in current:
    if not any(c['id'] == p['id'] and c['name'] == p['name'] for p in prev):
        common.append(c)
print(common)
any returns True if any element of the iterable is true, and False if the iterable is empty.
Also, as @wjandrea noted in the comments, this
new = [c for c in current if c not in prev]
is also a fair and nice answer. But note that it only works when comparing the whole dicts.
If I understood correctly, you want only the items that appear in current and did not appear in prev.
Something like this should work
prev_names = set(map(lambda x: x['name'], prev))
new_items = [item for item in current if item['name'] not in prev_names]
new_items # [{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
Code
import itertools
list(itertools.filterfalse(lambda x: x in prev, current))
Output:
[{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
Based on all of the answers here is a little benchmark
import timeit
import itertools
prev = [{'id': 0, 'name': 'a'}, {'id': 1, 'name': 'b'}, {'id': 2, 'name': 'c'}]
current = [{'id': 1, 'name': 'b'}, {'id': 2, 'name': 'c'}, {'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
def f1():
    prev_names = set(map(lambda x: x['name'], prev))
    new_items = [item for item in current if item['name'] not in prev_names]
    return new_items

def f2():
    common = []
    for c in current:
        if not any(c['id'] == p['id'] and c['name'] == p['name'] for p in prev):
            common.append(c)
    return common

def f3():
    return list(itertools.filterfalse(lambda x: x in prev, current))
print(f1())
print(timeit.timeit("f1()", setup="from __main__ import f1"))
print(f2())
print(timeit.timeit("f2()", setup="from __main__ import f2"))
print(f3())
print(timeit.timeit("f3()", setup="from __main__ import f3"))
[{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
0.8235890520736575
[{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
2.0767332719406113
[{'id': 3, 'name': 'e'}, {'id': 4, 'name': 'f'}]
0.864271447993815

from list of dicts to list of lists of dicts with same values

I have list of dicts:
dict_list = [{'Id': 0, 'UserID': 1, 'Name': 'John'},
{'Id': 1, 'UserID': 2, 'Name': 'Martin'},
{'Id': 2, 'UserID': 1, 'Name': 'Rob'},
{'Id': 3, 'UserID': 1, 'Name': 'Neil'},
{'Id': 4, 'UserID': 2, 'Name': 'Bill'}]
How can I make a list of lists of dicts, grouped by the key UserID?
So I want to group dicts that share the same UserID value into lists.
I expect something like this:
[[{'Id': 0,'UserID': 1, 'Name': 'John'},
{'Id': 2,'UserID': 1, 'Name': 'Rob'},
{'Id': 3,'UserID': 1, 'Name': 'Neil'}],
[{'Id': 1,'UserID': 2, 'Name': 'Martin'},
{'Id': 4,'UserID': 2, 'Name': 'Bill'}]]
First sort the dict_list based on UserID and then use itertools.groupby to group the results based on UserID
>>> from itertools import groupby
>>> from pprint import pprint
>>> key = lambda d: d['UserID']
>>> res = [list(grp) for _, grp in groupby(sorted(dict_list, key=key), key)]
>>>
>>> pprint(res)
[[{'Id': 0, 'Name': 'John', 'UserID': 1},
{'Id': 2, 'Name': 'Rob', 'UserID': 1},
{'Id': 3, 'Name': 'Neil', 'UserID': 1}],
[{'Id': 1, 'Name': 'Martin', 'UserID': 2},
{'Id': 4, 'Name': 'Bill', 'UserID': 2}]]
It's also possible to use a set of the IDs together with a list comprehension, like this:
dict_list = [{'Id': 0, 'UserID': 1, 'Name': 'John'},
{'Id': 1, 'UserID': 2, 'Name': 'Martin'},
{'Id': 2, 'UserID': 1, 'Name': 'Rob'},
{'Id': 3, 'UserID': 1, 'Name': 'Neil'},
{'Id': 4, 'UserID': 2, 'Name': 'Bill'}]
user_ids = {x['UserID'] for x in dict_list}
result_list = []
for user_id in user_ids:
    user_id_list = [x for x in dict_list if x['UserID'] == user_id]
    result_list.append(user_id_list)
print(result_list)
from itertools import groupby
dict_list = [{'Id': 0, 'UserID': 1, 'Name': 'John'},
{'Id': 1, 'UserID': 2, 'Name': 'Martin'},
{'Id': 2, 'UserID': 1, 'Name': 'Rob'},
{'Id': 3, 'UserID': 1, 'Name': 'Neil'},
{'Id': 4, 'UserID': 2, 'Name': 'Bill'}]
res =[list(group) for _,group in groupby(sorted(dict_list, key=lambda f: f['UserID']), lambda f: f['UserID'])]
print(res)

merge list of dictionaries which have lists in them

My current list:
my_list = [
{'id': 1, 'val': [6]},
{'id': 2, 'val': [7]},
{'id': 3, 'val': [8]},
{'id': 2, 'val': [9]},
{'id': 1, 'val': [10]},
]
Desired output:
my_list = [
{'id': 1, 'val': [6, 10]},
{'id': 2, 'val': [7, 9]},
{'id': 3, 'val': [8]},
]
What I tried so far:
my_new_list = []
id_set = set()
for d in my_list:
    if d['id'] not in id_set:
        id_set.add(d['id'])
        temp = {'id': d['id'], 'val': d['val']}
        my_new_list.append(temp)
    else:
        # loop over the new list, find the dict that already has d['id'],
        # and update it by appending the value; but this is not efficient
        pass
Is there any more efficient approach, or maybe some built-in function I'm not aware of?
PS: Order is important!
.setdefault() is your friend:
(We use collections.OrderedDict to remember the order in which keys were first inserted; on Python 3.7+ a plain dict preserves insertion order too.)
>>> import collections
>>> result = collections.OrderedDict()
>>> for d in my_list:
...     result.setdefault(d["id"], []).extend(d["val"])
...
>>> lst = []
>>> for k, v in result.items():
...     lst.append({"id": k, "val": v})
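On Python 3.7 and later, where a plain dict preserves insertion order, the same setdefault pattern works without OrderedDict. A sketch with the question's sample data:

```python
my_list = [
    {'id': 1, 'val': [6]},
    {'id': 2, 'val': [7]},
    {'id': 3, 'val': [8]},
    {'id': 2, 'val': [9]},
    {'id': 1, 'val': [10]},
]

# Group the vals by id; insertion order of the ids is preserved.
grouped = {}
for d in my_list:
    grouped.setdefault(d['id'], []).extend(d['val'])

result = [{'id': k, 'val': v} for k, v in grouped.items()]
print(result)
# [{'id': 1, 'val': [6, 10]}, {'id': 2, 'val': [7, 9]}, {'id': 3, 'val': [8]}]
```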
Same approach as ozgur, but using collections.defaultdict:
>>> from collections import defaultdict
>>> d = defaultdict(list)
>>> for dd in my_list:
...     d[dd['id']].extend(dd['val'])
...
>>> d
defaultdict(<class 'list'>, {1: [6, 10], 2: [7, 9], 3: [8]})
>>>
>>> lst = []
>>> for k, v in d.items():
...     lst.append({'id': k, 'val': v})
...
>>> lst
[{'id': 1, 'val': [6, 10]}, {'id': 2, 'val': [7, 9]}, {'id': 3, 'val': [8]}]
>>>
You can use itertools.groupby in order to sort and group the original list by 'id' and accumulate the 'val' for each group:
from itertools import groupby
key_fnc = lambda d: d['id']
result = [
{'id': k, 'val': sum([d['val'] for d in g], [])}
for k, g in groupby(sorted(my_list, key=key_fnc), key=key_fnc)
]
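One caveat: sum(..., []) copies the accumulated list on every step, so it is quadratic in the total number of values. itertools.chain.from_iterable does the same concatenation in linear time; a sketch of the same groupby approach using it:

```python
from itertools import chain, groupby

my_list = [
    {'id': 1, 'val': [6]},
    {'id': 2, 'val': [7]},
    {'id': 3, 'val': [8]},
    {'id': 2, 'val': [9]},
    {'id': 1, 'val': [10]},
]

key_fnc = lambda d: d['id']
result = [
    # Flatten each group's val lists without the quadratic re-copying of sum().
    {'id': k, 'val': list(chain.from_iterable(d['val'] for d in g))}
    for k, g in groupby(sorted(my_list, key=key_fnc), key=key_fnc)
]
print(result)
# [{'id': 1, 'val': [6, 10]}, {'id': 2, 'val': [7, 9]}, {'id': 3, 'val': [8]}]
```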

Create a new list of dicts in common between n lists of dicts?

I have an unknown number of lists of product results as dictionary entries that all have the same keys. I'd like to generate a new list of products that appear in all of the old lists.
'what products are available in all cities?'
given:
list1 = [{'id': 1, 'name': 'bat', 'price': 20.00}, {'id': 2, 'name': 'ball', 'price': 12.00}, {'id': 3, 'name': 'brick', 'price': 19.00}]
list2 = [{'id': 1, 'name': 'bat', 'price': 18.00}, {'id': 3, 'name': 'brick', 'price': 11.00}, {'id': 2, 'name': 'ball', 'price': 17.00}]
list3 = [{'id': 1, 'name': 'bat', 'price': 16.00}, {'id': 4, 'name': 'boat', 'price': 10.00}, {'id': 3, 'name': 'brick', 'price': 15.00}]
list4 = [{'id': 1, 'name': 'bat', 'price': 14.00}, {'id': 2, 'name': 'ball', 'price': 9.00}, {'id': 3, 'name': 'brick', 'price': 13.00}]
list...
I want a list of dicts in which the 'id' exists in all of the old lists:
result_list = [{'id': 1, 'name': 'bat'}, {'id': 3, 'name': 'brick'}]
The values that aren't constant for a given 'id' can be discarded, but the values that are the same for a given 'id' must be in the results list.
If I know how many lists I've got, I can do:
results_list = []
for d in list1:
    if any(d['id'] == other['id'] for other in list2):
        if any(d['id'] == other['id'] for other in list3):
            if any(d['id'] == other['id'] for other in list4):
                results_list.append(d)
How can I do this if I don't know how many lists I've got?
Put the ids into sets and then take the intersection of the sets.
list1 = [{'id': 1, 'name': 'steve'}, {'id': 2, 'name': 'john'}, {'id': 3, 'name': 'mary'}]
list2 = [{'id': 1, 'name': 'jake'}, {'id': 3, 'name': 'tara'}, {'id': 2, 'name': 'bill'}]
list3 = [{'id': 1, 'name': 'peter'}, {'id': 4, 'name': 'rick'}, {'id': 3, 'name': 'marci'}]
list4 = [{'id': 1, 'name': 'susan'}, {'id': 2, 'name': 'evan'}, {'id': 3, 'name': 'tom'}]
lists = [list1, list2, list3, list4]
sets = [set(x['id'] for x in lst) for lst in lists]
intersection = set.intersection(*sets)
print(intersection)
Result:
{1, 3}
Note that we call the class method set.intersection rather than the instance method set().intersection, since the latter takes intersections of its arguments with the empty set set(), and of course the intersection of anything with the empty set is empty.
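The difference is easy to demonstrate:

```python
sets = [{1, 2, 3}, {1, 3, 4}]

# Class method: intersects exactly the sets you pass in.
print(set.intersection(*sets))    # {1, 3}

# Instance method on an empty set: the empty set joins the
# intersection, so the result is always empty.
print(set().intersection(*sets))  # set()
```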
If you want to turn this back into a list of dicts, you can do:
result = [{'id': i, 'name': None} for i in intersection]
print(result)
Result:
[{'id': 1, 'name': None}, {'id': 3, 'name': None}]
Now, if you also want to hold onto those attributes which are the same for all instances of a given id, you'll want to do something like this:
list1 = [{'id': 1, 'name': 'bat', 'price': 20.00}, {'id': 2, 'name': 'ball', 'price': 12.00}, {'id': 3, 'name': 'brick', 'price': 19.00}]
list2 = [{'id': 1, 'name': 'bat', 'price': 18.00}, {'id': 3, 'name': 'brick', 'price': 11.00}, {'id': 2, 'name': 'ball', 'price': 17.00}]
list3 = [{'id': 1, 'name': 'bat', 'price': 16.00}, {'id': 4, 'name': 'boat', 'price': 10.00}, {'id': 3, 'name': 'brick', 'price': 15.00}]
list4 = [{'id': 1, 'name': 'bat', 'price': 14.00}, {'id': 2, 'name': 'ball', 'price': 9.00}, {'id': 3, 'name': 'brick', 'price': 13.00}]
lists = [list1, list2, list3, list4]
sets = [set(x['id'] for x in lst) for lst in lists]
intersection = set.intersection(*sets)
all_keys = set(lists[0][0].keys())
result = []
for ident in intersection:
    res = [dic for lst in lists
           for dic in lst
           if dic['id'] == ident]
    replicated_keys = []
    for key in all_keys:
        if len(set(dic[key] for dic in res)) == 1:
            replicated_keys.append(key)
    result.append({key: res[0][key] for key in replicated_keys})
print(result)
Result:
[{'id': 1, 'name': 'bat'}, {'id': 3, 'name': 'brick'}]
What we do here is:
Look at each id in intersection and grab each dict corresponding to that id.
Find which keys have the same value in all of those dicts (one of which is guaranteed to be id).
Put those key-value pairs into result
This code assumes that:
Each dict in list1, list2, ... will have the same keys. If this assumption is false, let me know - it shouldn't be difficult to relax.
