Merge two dictionaries based on similarity excluding a key - python

I have the following three dictionaries in an array:
items = [
{
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['en'])
},
{
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['fr'])
},
{
'FirstName': 'Bob',
'LastName': 'Jones',
'Language': set(['en'])
} ]
I want to merge these dictionaries whenever two of them are identical apart from a specified key, combining the values of that key. For example, merging on the "Language" key would turn the array into the following:
[ {
'FirstName': 'David',
'LastName': 'Smith',
'Language': set(['en','fr'])
},{
'FirstName': 'Bob',
'LastName': 'Jones',
'Language': set(['en'])
} ]
Here is what I'm currently doing:
from copy import deepcopy
def _merge_items_on_field(items, field):
'''Given an array of dicts, merge the
dicts together if they are the same except for the 'field'.
If merging dicts, add the unique values of that field together.'''
items = deepcopy(items)
items_merged_on_field = []
for num, item in enumerate(items):
# Remove that key/value from the dict
field_value = item.pop(field)
# Get an array of items *without* that field to compare against
items_without_field = deepcopy(items_merged_on_field)
map(lambda d: d.pop(field), items_without_field)
# If the dict item is found ("else"), add the fields together
# If not ("except"), then add in the dict item to the array
try:
index = items_without_field.index(item)
except ValueError:
item[field] = field_value
items_merged_on_field.append(item)
else:
items_merged_on_field[index][field] = items_merged_on_field[index][field].union(field_value)
return items_merged_on_field
>>> items = [{'LastName': 'Smith', 'Language': set(['en']), 'FirstName': 'David'}, {'LastName': 'Smith', 'Language': set(['fr']), 'FirstName': 'David'}, {'LastName': 'Jones', 'Language': set(['en']), 'FirstName': 'Bob'}]
>>> _merge_items_on_field(items, 'Language')
[{'LastName': 'Smith', 'Language': set(['fr', 'en']), 'FirstName': 'David'}, {'LastName': 'Jones', 'Language': set(['en']), 'FirstName': 'Bob'}]
This seems a bit complicated -- is there a better way to do this?

There are a couple of ways of doing this. The most painless method to my knowledge utilises the pandas library—in particular, a groupby + apply.
import pandas as pd
merged = (
pd.DataFrame(items)
.groupby(['FirstName', 'LastName'], sort=False)
.Language
.apply(lambda x: set.union(*x))
.reset_index()
.to_dict(orient='records')
)
print(merged)
[
{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}},
{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}
]
The other method (that I mentioned) uses itertools.groupby, but seeing as you have 30 columns to group on, I'd just recommend sticking to pandas.
If you want to turn this into a function,
def merge(items, field):
    """Merge records that agree on every column except ``field``.

    The records are loaded into a DataFrame and grouped on all columns other
    than ``field``; the ``field`` values of each group are combined with
    ``set.union``.

    Args:
        items: Records accepted by ``pd.DataFrame`` (e.g. a list of dicts).
        field: Name of the column whose set values are unioned per group.

    Returns:
        A list of dicts, one per group, as produced by
        ``to_dict(orient='records')``.
    """
    frame = pd.DataFrame(items)
    # Every column except ``field`` takes part in the grouping key.
    group_keys = frame.columns.difference([field]).tolist()
    grouped = frame.groupby(group_keys, sort=False)[field]
    unions = grouped.apply(lambda values: set.union(*values))
    return unions.reset_index().to_dict(orient='records')
merged = merge(items, 'Language')
print(merged)
[
{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}},
{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}
]

You can use itertools.groupby:
import itertools


def _first_name(record):
    # Grouping key shared by sorted() and groupby().
    return record['FirstName']


d = [{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en'}}, {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'fr'}}, {'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}]

# groupby() only merges adjacent items, so the list is sorted by the same key
# first.  v is a list of [first_name, records_with_that_first_name] pairs.
v = [[name, list(group)] for name, group in itertools.groupby(sorted(d, key=_first_name), key=_first_name)]

final_dict = []
for first, records in v:
    last_names = [record['LastName'] for record in records]
    distinct_last_names = set(last_names)
    final_dict.append({
        'FirstName': first,
        # A single shared last name stays a plain value; several distinct
        # last names are reported as a list.
        'LastName': last_names[0] if len(distinct_last_names) == 1 else list(distinct_last_names),
        # Union the complete sets so that no language is dropped when a
        # record carries more than one.
        'Language': set().union(*(record['Language'] for record in records)),
    })
Output:
[{'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}, {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en', 'fr'}}]

If pandas is not an option:
from itertools import groupby
from functools import reduce
arr = [
{'FirstName': 'David', 'LastName': 'Smith', 'Language': set(['en'])},
{'FirstName': 'David', 'LastName': 'Smith', 'Language': set(['fr'])},
{'FirstName': 'David', 'LastName': 'Jones', 'Language': set(['sp'])}
]
def reduce_field(items, field, op=set.union, sort=False):
    """Collapse runs of dicts that match on everything except ``field``.

    Consecutive items whose remaining (key, value) pairs are equal, in the
    same order, form one group; the ``field`` values of a group are folded
    together with ``op``.  Because grouping is done with
    ``itertools.groupby``, matching items that are not adjacent stay
    separate unless ``sort`` is true.

    Args:
        items: Iterable of dicts, each containing ``field``.
        field: Key excluded from the comparison and reduced per group.
        op: Binary function used to combine two ``field`` values.
        sort: If true, sort the items by their remaining pairs first so that
            all matching items become adjacent.

    Returns:
        A list with one dict per group.
    """
    def _without_field(record):
        return tuple(pair for pair in record.items() if pair[0] != field)

    ordered = sorted(items, key=_without_field) if sort else items
    merged = []
    for group_key, members in groupby(ordered, key=_without_field):
        combined = dict(group_key)
        combined[field] = reduce(op, [member[field] for member in members])
        merged.append(combined)
    return merged
reduce_field(arr, 'Language')

You can try it manually :
# Maps (FirstName, LastName) -> merged record.
new_dict = {}
d = [{'FirstName': 'David', 'LastName': 'Smith', 'Language': {'en'}},
     {'FirstName': 'David', 'LastName': 'Smith', 'Language': {'fr'}},
     {'FirstName': 'Bob', 'LastName': 'Jones', 'Language': {'en'}}]
for i in d:
    key = (i['FirstName'], i['LastName'])
    if key not in new_dict:
        # Store a copy (with its own language set) so that later merges do
        # not modify the records in d.
        new_dict[key] = dict(i, Language=set(i['Language']))
    else:
        new_dict[key]['Language'].update(i['Language'])
print(new_dict.values())
output:
# dict_values([{'FirstName': 'Bob',
# 'LastName': 'Jones',
# 'Language': {'en'}},
# {'FirstName': 'David',
# 'LastName': 'Smith',
# 'Language': {'fr', 'en'}}])

Related

How to loop through a list of dictionary and extract those with the same 'name' and 'school' into a new list while getting their other values in it

I have this list of dictionaries, and I would like to merge those that have exactly the same 'name' and 'school' into a single entry in a new list, with their 'age' values collected into a list. The remaining dictionaries, which have no match, should just be added to the new list unchanged.
Here is an example of the list of dictionary
[{'name': 'Jane', 'age':12, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'SMU'},{'name': 'Jane', 'age':14, 'school': 'SIT'}, {'name': 'Jane', 'age':16, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'NUS'}]
and I would like it to make it into something like this..
[{'name': 'Jane', 'age': [12,14,16], 'school': 'SIT'}, {'name': 'John', 'age': 13, 'school': 'SMU'}, {'name': 'John', 'age':13, 'school': 'NUS'}]
using Python.. please help!
tried using counter, loops but still can't get it to work..
You could use itertools.groupby().
Example:
import itertools
from pprint import pprint

data = [{'name': 'Jane', 'age':12, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'SMU'},{'name': 'Jane', 'age':14, 'school': 'SIT'}, {'name': 'Jane', 'age':16, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'NUS'}]
keyfunc = lambda x: (x["name"], x["school"])

# groupby() only groups adjacent records, so order them by the same key first
# (this sorts data in place).
data.sort(key=keyfunc)

# One entry per (name, school) pair, with the ages of its records collected
# into a list.
output = [
    {
        "name": name,
        "school": school,
        "age": [member["age"] for member in members],
    }
    for (name, school), members in itertools.groupby(data, key=keyfunc)
]
pprint(output)
The output is:
[{'age': [12, 14, 16], 'name': 'Jane', 'school': 'SIT'},
{'age': [13], 'name': 'John', 'school': 'NUS'},
{'age': [13], 'name': 'John', 'school': 'SMU'}]
If you wish to go with the solution based on a buffer dictionary, please check out the dict.setdefault() method.
Example:
# Maps (name, school) -> list of ages seen for that pair.
buffer = {}
for record in data:
    group = (record["name"], record["school"])
    # setdefault() returns the existing list, or inserts and returns a new one.
    ages = buffer.setdefault(group, [])
    ages.append(record["age"])
For reference:
https://docs.python.org/3/library/itertools.html#itertools.groupby
https://docs.python.org/3/library/stdtypes.html#dict.setdefault
x = [{'name': 'Jane', 'age':12, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'SMU'},{'name': 'Jane', 'age':14, 'school': 'SIT'}, {'name': 'Jane', 'age':16, 'school': 'SIT'}, {'name': 'John', 'age':13, 'school': 'NUS'}]
# Maps (name, school) -> merged record.  Both fields are part of the key so
# that people who share a name but attend different schools stay separate.
new_x = {}
for r in x:
    key = (r['name'], r['school'])
    if key in new_x:
        entry = new_x[key]
        # The age stays a scalar until a second record for the same key
        # shows up; from then on it is a list of distinct ages.
        if not isinstance(entry['age'], list):
            entry['age'] = [entry['age']]
        if r['age'] not in entry['age']:
            entry['age'].append(r['age'])
    else:
        new_x[key] = {'age': r['age'], 'school': r['school'], 'name': r['name']}
z = list(new_x.values())
Here is a universal solution to your problem. Only name and school are considered "special". All other keys, like age are converted to list when a new value has to be added.
l = [
    {"name": "Jane", "age": 12, "school": "SIT"},
    {"name": "John", "age": 13, "school": "SMU"},
    {"name": "Jane", "age": 14, "school": "SIT"},
    {"name": "Jane", "age": 16, "school": "SIT"},
    {"name": "John", "age": 13, "school": "NUS"},
]
# Maps (name, school) -> merged record.  A tuple key cannot collide the way a
# "name-school" string can when either part itself contains a dash.
r = {}
for x in l:
    key = (x["name"], x["school"])
    if key not in r:
        # Keep a copy so that merging later records never modifies l.
        r[key] = dict(x)
        continue
    merged = r[key]
    for k, v in x.items():
        if k in ("name", "school"):
            continue
        if k not in merged:
            merged[k] = v
        elif isinstance(merged[k], list):
            merged[k].append(v)
        else:
            # Second value for this key: switch from a scalar to a list.
            merged[k] = [merged[k], v]
result = list(r.values())

How to best iterate through a list within a dictionary?

I have a dictionary as follows:
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }
I have tried nested loops, but I'm not sure if that's the best approach.
As an output I need a list of dictionaries for each score and subject :
result1 = { 'name':'Test', 'lastName': 'Test', 'score': '1', 'subjects': 'te'}
result2 = { 'name':'Test', 'lastName': 'Test', 'score': '2', 'subjects': 're'}
How to best iterate through the lists and create such dictionary? The number of scores and subjects will always match.
Any help would be appreciated.
Here is a function which unzips your dictionary.
We first use next to find some list value in the dictionary, its length is the expected output size.
Note that this will fail if the dictionary contains no list at all.
def unzip_dict(d):
    """Split a dict holding parallel lists into one dict per list position.

    The length of the first list value found determines how many dicts are
    produced.  In the i-th output dict every list value is replaced by its
    i-th element; non-list values are copied unchanged.

    Args:
        d: Dict with at least one list value.

    Returns:
        A list of dicts, one per position of the first list value.

    Raises:
        ValueError: If ``d`` contains no list value.
        IndexError: If another list value is shorter than the first one.
    """
    lists = [value for value in d.values() if isinstance(value, list)]
    if not lists:
        # Fail with a clear message instead of leaking StopIteration from next().
        raise ValueError("unzip_dict() needs at least one list value to unzip")
    length = len(lists[0])
    return [
        {k: v[i] if isinstance(v, list) else v for k, v in d.items()}
        for i in range(length)
    ]
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te', 're']}
print(unzip_dict(a))
Output
[{'lastName': 'Test', 'name': 'Test', 'scores': '1', 'subjects': 'te'},
{'lastName': 'Test', 'name': 'Test', 'scores': '2', 'subjects': 're'}]
Try this:
# setup data
a = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }
# Create one dictionary for every (score, subject) combination.  The name
# fields are read from `a` instead of being hard-coded, so the snippet keeps
# working when the input changes.
out_list = [
    {'name': a['name'], 'lastName': a['lastName'], 'scores': value, 'subjects': item}
    for value in a['scores']
    for item in a['subjects']
]
Output:
{'name': 'Test', 'lastName': 'Test', 'scores': '1', 'subjects': 'te'}
{'name': 'Test', 'lastName': 'Test', 'scores': '1', 'subjects': 're'}
{'name': 'Test', 'lastName': 'Test', 'scores': '2', 'subjects': 'te'}
{'name': 'Test', 'lastName': 'Test', 'scores': '2', 'subjects': 're'}
You don't need nested for loops, a single for loop is sufficient:
def foo(a):
    """Return one dict per score, paired with the subject at the same index.

    Args:
        a: Dict with 'name', 'lastName' and the parallel sequences 'scores'
            and 'subjects'.

    Returns:
        A list of dicts with the keys 'name', 'lastName', 'score' and
        'subject'.
    """
    return [
        {
            'name': a['name'],
            'lastName': a['lastName'],
            'score': score,
            'subject': a['subjects'][position],
        }
        for position, score in enumerate(a['scores'])
    ]
Output:
[{'name': 'Test', 'lastName': 'Test', 'score': '1', 'subject': 'te'},
{'name': 'Test', 'lastName': 'Test', 'score': '2', 'subject': 're'}]
you can try this:
# res1 takes the first score/subject, res2 the second; every other entry of
# `a` is copied into both results unchanged.
res1 = {}
res2 = {}
for k, v in a.items():
    if k in ("scores", "subjects"):
        first, second = v[0], v[1]
    else:
        first = second = v
    res1[k] = first
    res2[k] = second
print(res1)
print(res2)
You can also take a look at defaultdict; I think it would help you with your task.
You can utilize zip to attach scores and subjects to each other and then add it to a new list.
d = {'name': 'Test', 'lastName': 'Test', 'scores': ['1', '2'], 'subjects': ['te','re'] }
# Fields shared by every result.
template = {'name': d['name'], 'lastName': d['lastName']}
res = []
for subject, score in zip(d['subjects'], d['scores']):
    # Build a fresh dict for each pair.  Updating and appending `template`
    # itself would leave every entry of `res` referring to the same dict,
    # which would hold only the last pair.
    res.append(dict(template, subjects=subject, scores=score))
print(res)

Removing a dictionary from a nested dictionary [duplicate]

This question already has answers here:
How do I remove the first item from a list?
(12 answers)
Closed 5 years ago.
I wanna remove a dictionary from a nested dictionary and I don't know how.
From this dictionary:
dict = {
'user': [
{
'firstName': 'john',
'lastName': 'doe',
'movieList': []
},
{
'firstName': 'sarah',
'lastName': 'doe',
'movieList': []
},
{
'firstName': 'john',
'lastName': 'smith',
'movieList': []
},
{
'firstName': 'sarah',
'lastName': 'smith',
'movieList': []
}
], 'movie': []
}
I want to remove:
{
'firstName': 'john',
'lastName': 'doe',
'movieList': []
}
which has the index 0
I tried using delete but i get this error:
dict['user'][userId] TypeError: list indices must be integers or slices, not str
First, I wouldn't name the dict "dict", use "d" or something else.
dict['user'].pop(0)
I would write a comment, but my reputation is too low. It seems like your variable userId might be a string instead of an integer. If this is the case, try converting userId to an int:
userId = int(userId)
See this question for more info on del pop and remove Difference between del, remove and pop on lists
If you do not want to use the dictionary you are removing, I would use del, as below:
d = {'user': [{'firstName': 'john', 'lastName': 'doe', 'movieList': []},
{'firstName': 'sarah', 'lastName': 'doe', 'movieList': []},
{'firstName': 'john', 'lastName': 'smith', 'movieList': []},
{'firstName': 'sarah', 'lastName': 'smith', 'movieList': []}], 'movie': []}
del d['user'][0]
The [0] deletes the first element (index 0) of the list stored as the value of the 'user' key.
If you are repeatedly deleting the first index of your list and your list is long, consider using a deque() instead, which has fast pops from either end:
https://docs.python.org/3/library/collections.html#collections.deque
Also, don't call your variables the same name as their types, aka don't call your dictionary variable dict
You can try this:
dict1 = {'user': [{'firstName': 'john', 'lastName': 'doe', 'movieList': []}, {'firstName': 'sarah', 'lastName': 'doe', 'movieList': []}, {'firstName': 'john', 'lastName': 'smith', 'movieList': []}, {'firstName': 'sarah', 'lastName': 'smith', 'movieList': []}], 'movie': []}
# Build a new dict in which only the 'user' list loses its first element;
# every other value (e.g. 'movie') is carried over untouched.  dict1 itself
# is not modified.
new_dict = {key: value[1:] if key == 'user' else value for key, value in dict1.items()}
Output:
{'movie': [], 'user': [{'lastName': 'doe', 'movieList': [], 'firstName': 'sarah'}, {'lastName': 'smith', 'movieList': [], 'firstName': 'john'}, {'lastName': 'smith', 'movieList': [], 'firstName': 'sarah'}]}

Create a list of dictionaries from a list of keys and multiple lists of values

My solution
keys = ['FirstName', 'LastName', 'ID']
name1 = ['Michael', 'Jordan', '224567']
name2 = ['Kyle', 'Hynes', '294007']
name3 = ['Josef', 'Jones', '391107']
# Pair each row of values with the shared keys, one dict per row.
dictList = [dict(zip(keys, row)) for row in (name1, name2, name3)]
Works fine, but is there any other solution, because I will have at least 20000 names, so I am looking how to improve this.
Place all your "name" sublists into the parent list names. Then you can easily use list comprehension:
keys = ['FirstName', 'LastName', 'ID']
names = [
    ['Michael', 'Jordan', '224567'],
    ['Kyle', 'Hynes', '294007'],
    ['Josef', 'Jones', '391107'],
]
# zip() pairs every key with the value at the same position of a row;
# dict() turns those pairs into one record per row.
dictList = [dict(zip(keys, row)) for row in names]
print(dictList)
The output:
[{'FirstName': 'Michael', 'LastName': 'Jordan', 'ID': '224567'}, {'FirstName': 'Kyle', 'LastName': 'Hynes', 'ID': '294007'}, {'FirstName': 'Josef', 'LastName': 'Jones', 'ID': '391107'}]
Do you really need a dictionary? Why not just use a namedtuple:
>>> from collections import namedtuple
>>> Employee = namedtuple('Employee', 'FirstName, LastName, ID')
>>> names_list = [['Michael', 'Jordan', '224567'], ['Kyle', 'Hynes', '294007'], ['Josef', 'Jones', '391107']]
>>> employee_list = map(Employee._make, names_list)
>>> employee_list[0].FirstName
'Michael'
>>> pprint(employee_list)
[Employee(FirstName='Michael', LastName='Jordan', ID='224567'),
Employee(FirstName='Kyle', LastName='Hynes', ID='294007'),
Employee(FirstName='Josef', LastName='Jones', ID='391107')]
pandas makes this too easy.
import pandas as pd
keys = ['FirstName', 'LastName', 'ID']
name1 = ['Michael', 'Jordan', '224567']
name2 = ['Kyle', 'Hynes', '294007']
name3 = ['Josef', 'Jones', '391107']
doc_list = [name1,name2,name3]
df = pd.DataFrame(doc_list,columns = keys)
So you'll have a DataFrame like this:
FirstName LastName ID
0 Michael Jordan 224567
1 Kyle Hynes 294007
2 Josef Jones 391107
If your names are already in a file, read_csv would be better.
# Column names are supplied through `names`; `header` expects row number(s),
# not labels.  Omit names=keys if the CSV file already has a header row.
pd.read_csv("file_name.csv", names=keys)
You should append your dictionaries to the list inside a loop, like this:
In [1152]: names = [name1, name2, name3]
In [1153]: d = []
In [1154]: for name in names:
...: d.append(dict(zip(keys, name)))
...:
In [1155]: d
Out[1155]:
[{'FirstName': 'Michael', 'ID': '224567', 'LastName': 'Jordan'},
{'FirstName': 'Kyle', 'ID': '294007', 'LastName': 'Hynes'},
{'FirstName': 'Josef', 'ID': '391107', 'LastName': 'Jones'}]
Or, if you prefer, a list comprehension:
In [1160]: d = [dict(zip(keys, name)) for name in names]
In [1161]: d
Out[1161]:
[{'FirstName': 'Michael', 'ID': '224567', 'LastName': 'Jordan'},
{'FirstName': 'Kyle', 'ID': '294007', 'LastName': 'Hynes'},
{'FirstName': 'Josef', 'ID': '391107', 'LastName': 'Jones'}]

Python: convert 2 lists to dict and repeat the keys with each set of data

Learning about Python's zip function. I can do this
list_keys = ['fname','lname','dob']
list_data = ['bob','smith','12121950']
keys_and_data = dict(zip(list_keys,list_data))
print keys_and_data
#output
{'lname': 'smith', 'dob': '12121950', 'fname': 'bob'}
Now I am trying to figure out how to get this to work
list_keys = ['fname','lname','dob']
list_data = [['bob','smith','12121950'],['john','jones','10101940']]
keys_and_data = ??
print keys_and_data
#output
{['lname': 'smith', 'dob': '12121950', 'fname': 'bob'],
['lname': 'jones', 'dob': '10101940', 'fname': 'john']}
How do I get python to repeat the keys?
Loop over list_data in a list comprehension:
keys_and_data = [dict(zip(list_keys, data)) for data in list_data]
Demo:
>>> list_keys = ['fname','lname','dob']
>>> list_data = [['bob','smith','12121950'],['john','jones','10101940']]
>>> [dict(zip(list_keys, data)) for data in list_data]
[{'lname': 'smith', 'dob': '12121950', 'fname': 'bob'}, {'lname': 'jones', 'dob': '10101940', 'fname': 'john'}]

Categories

Resources