Remove duplicate values in different Json Lists python - python

I know that there are a lot of questions about duplicates but I can't find a solution suitable for me.
I have a json structure like this:
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"Tik",
"eev",
"123",
"r45",
"676",
"121"
]
}
]
}
I want to keep the first occurrence of each value and remove every later duplicate, even when the duplicate appears in a different list.
Expected Result
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"123",
"r45",
"676",
"121"
]
}
]
}
I tried using a tmp to check for duplicates but it didn't seem to work. Also I can't find a way to make it json again.
import json
with open('myjson') as access_json:
read_data = json.load(access_json)
tmp = []
tmp2 = []
def get_synonyms():
ingredients_access = read_data['test']
for x in ingredients_access:
for j in x['name2']:
tmp.append(j)
if j in tmp:
tmp2.append(j)
get_synonyms()
print(len(tmp))
print(len(tmp2))

You can use recursion:
def filter_d(d):
    """Return a copy of *d* with repeated list values removed.

    The structure is walked in iteration order. A non-container value found
    inside any list is kept only the first time it is met anywhere in the
    structure; later occurrences are dropped, even when they sit in a
    different list. Values stored directly under a dict key are copied
    through unfiltered. The input object itself is not modified.
    """
    seen = set()  # list values already emitted, shared by the whole walk

    def inner(node):
        if isinstance(node, dict):
            return {
                key: inner(value) if isinstance(value, (dict, list)) else value
                for key, value in node.items()
            }
        kept = []
        for item in node:
            if isinstance(item, (dict, list)):
                kept.append(inner(item))
            elif item not in seen:
                seen.add(item)
                kept.append(item)
        return kept

    return inner(d)
import json
print(json.dumps(filter_d(data), indent=4))
Output:
{
"test": [
{
"name2": [
"Tik",
"eev",
"asdv",
"asdfa",
"sadf",
"Nick"
]
},
{
"name2": [
"123",
"r45",
"676",
"121"
]
}
]
}

You append each value to tmp before checking `if j in tmp`, so the check is always true and every value ends up in tmp2 as well.
I changed the function a little bit to work for your specific test example:
def get_synonyms(data=None):
    """Drop values from each ``name2`` list that were already kept earlier.

    Args:
        data: mapping with a ``'test'`` list whose items each hold a
            ``'name2'`` list. When omitted, the module-level ``read_data``
            is used, as in the original zero-argument call.

    Returns:
        A new ``{'test': [{'name2': [...]}, ...]}`` structure in which each
        value appears only at its first position; the input is not modified.
        Values must be hashable because they are tracked in a set.
    """
    if data is None:
        data = read_data
    used_values = set()  # set gives O(1) membership tests
    test_list = []
    for entry in data['test']:
        inner_tmp = []
        for value in entry['name2']:
            if value not in used_values:
                used_values.add(value)
                inner_tmp.append(value)
        test_list.append({'name2': inner_tmp})
    return {'test': test_list}
result = get_synonyms()
print(result)
Output:
{'test': [{'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']}, {'name2': ['123', 'r45', '676', '121']}]}

Here's a little hackish answer:
d = {'test': [{'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']},
              {'name2': ['Tik', 'eev', '123', 'r45', '676', '121']}]}
s = set()  # every value that has already been kept in some list
for entry in d['test']:
    # (v, s.add(v))[0] evaluates to v while recording it in s as a side
    # effect, so a repeat later in the same list is filtered out as well.
    entry['name2'] = [(v, s.add(v))[0] for v in entry['name2'] if v not in s]
Output:
{'test': [{'name2': ['Tik', 'eev', 'asdv', 'asdfa', 'sadf', 'Nick']},
{'name2': ['123', 'r45', '676', '121']}]}
This uses a set to track the values seen so far: each new value is added to the set as a side effect while the value itself is returned into the rebuilt list.

Related

Python 3.x Convert nested list of elements into nested dictionary with max recursion

I would like to convert a nested list like this:
["Pie", ["Sugar", "Biscuit", ["Egg"] ], "Cocoa", []]
to a nested dictionary like this:
{ "Pie": { "Sugar": {}, "Biscuit": { "Egg": {} } }, "Cocoa": {} }
with max recursion.
Possible variants of nested list:
["Pie", ["Sugar", "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
["Pie", ["Sugar", ["Biscuit"], "Another something", ["Egg"], "Something", ["Something2"] ], "Cocoa", ["One", ["Nested1"], "Two", ["Nested2"] ]]
INCORRECT variants:
["Pie", [["Sugar"], "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
[["Pie"], ["Sugar", "Biscuit", ["Egg"], "Something", ["Something2"] ], "Cocoa", []]
Here is one approach (see comments in the code for details):
l = ["Pie", ["Sugar", "Biscuit", ["Egg"] ], "Cocoa", []]
def to_nested(l):
    """Convert a flat ``[key, [children], key, ...]`` list into nested dicts.

    Each non-skipped item becomes a key. If the item right after it is a
    list, that list is converted recursively and used as the key's value;
    otherwise the value is an empty dict.
    """
    out = {}
    skip = False
    for i, e in enumerate(l):  # enumerate to keep track of position
        if skip:  # we already used this item as value, skip it
            skip = False
            continue
        # ensure we have a next item and that it is a list
        if i + 1 < len(l) and isinstance(l[i + 1], list):
            skip = True  # flag item to be skipped as key
            out[e] = to_nested(l[i + 1])
        else:  # add a default empty dictionary as value
            out[e] = {}
    return out
out = to_nested(l)
output:
{'Pie': {'Sugar': {}, 'Biscuit': {'Egg': {}}}, 'Cocoa': {}}
Simple approach with while loop and recursion. Hope it helps.
I used pytest for quick testing.
import pytest
def convert(test):
    """Convert a flat ``[key, [children], key, ...]`` sequence to nested dicts.

    A key that is directly followed by a list gets that list, converted
    recursively, as its value. Any other key, including the last item of the
    sequence, gets an empty dict.
    """
    result = {}
    if not isinstance(test, list):
        test = list(test)
    i = 0
    while i < len(test):
        key = test[i]
        # Check the bound first: a trailing key has no successor to inspect.
        if i + 1 < len(test) and isinstance(test[i + 1], list):
            result[key] = convert(test[i + 1])
            i += 2
        else:
            result[key] = {}
            i += 1
    return result
example = [
'Pie', [
'Sugar',
'Biscuit', ['Egg']
],
'Cocoa', []
]
wanted = {
"Pie": {
"Sugar": {},
"Biscuit": { "Egg": {} } },
"Cocoa": {}
}
example_convertet = convert(example)
def test_simple_key_value():
simple = ['hello', ['to you']]
test_convert = convert(simple)
assert test_convert == {
'hello': {'to you':{}}
}
def test_simple_key_with_multiple_values():
simple = ['hello', ['to you', 'and you too', ['lol']]]
test_convert = convert(simple)
assert test_convert == {
'hello' : {
'to you': {},
'and you too': {'lol':{}}}
}

Count number of objects in list of dictionary where a key's value is more than 1

Given a list of dictionaries:
data = {
"data": [
{
"categoryOptionCombo": {
"id": "A"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "B"
},
"dataElement": {
"id": "123"
}
},
{
"categoryOptionCombo": {
"id": "C"
},
"dataElement": {
"id": "456"
}
}
]
}
I would like to display the dataElement where the count of distinct categoryOptionCombo is larger than 1.
e.g. the result of the function would be an iterable of IDs:
[123]
because the dataElement with id 123 has two different categoryOptionCombos.
tracker = {}
for d in data['data']:
data_element = d['dataElement']['id']
coc = d['categoryOptionCombo']['id']
if data_element not in tracker:
tracker[data_element] = set()
tracker[data_element].add(coc)
too_many = [key for key,value in tracker.items() if len(value) > 1]
How can I iterate the list of dictionaries preferably with a comprehension? This solution above is not pythonic.
One approach:
import collections

# Map each dataElement id to the set of distinct categoryOptionCombo ids
# seen with it; the set discards repeated combos automatically.
counts = collections.defaultdict(set)
for d in data["data"]:
    counts[d["dataElement"]["id"]].add(d["categoryOptionCombo"]["id"])
res = [k for k, v in counts.items() if len(v) > 1]
print(res)
Output
['123']
This approach creates a dictionary mapping dataElements to the different types of categoryOptionCombo:
defaultdict(<class 'set'>, {'123': {'B', 'A'}, '456': {'C'}})
Almost a one-liner:
# Collapse the rows to unique (dataElement, categoryOptionCombo) pairs first,
# so a combo repeated for the same dataElement is counted only once.
pairs = {(d['dataElement']['id'], d['categoryOptionCombo']['id']) for d in data['data']}
counts = collections.Counter(data_element for data_element, _ in pairs)
print(counts)
Output:
Counter({'123': 2, '456': 1})
No need for sets, you can just remember each data element's first coc or mark it as having 'multiple'.
# tracker maps a dataElement id to the first categoryOptionCombo id seen for
# it, or to the marker string 'multiple' once a different combo shows up.
tracker = {}
for d in data['data']:
    data_element = d['dataElement']['id']
    coc = d['categoryOptionCombo']['id']
    # setdefault stores coc on first sight and returns the stored value.
    if tracker.setdefault(data_element, coc) != coc:
        tracker[data_element] = 'multiple'
too_many = [key for key, value in tracker.items() if value == 'multiple']
(If the string 'multiple' can be a coc id, then use multiple = object() and compare with is).

How to convert from lists to json in python?

The required output of my nested loops is json, how to get there?
The input list structure looks like list = [[name, version, id],[name, version, id], ...]
list_1 = [
['mipl-abnd','v1.0.2','eelp234'],
['mipl-avfd','v1.1.5','32fv423'],
['mipl-awsd','v9.0.2','234eelp'],
['mipl-tgfd','v3.0.0','124fdge'],
['mipl-hrdss','v1.0.2','543rfd3'],
['mipl-oldss','v1.0.2','eelp234']]
list_2 = [
['mipl-abnd','v1.0.2','eelp234'],
['mipl-avfd','v1.1.6','3254323'],
['mipl-awsd','v9.0.2','234eelp'],
['mipl-tgfd','v3.0.0','124fdge'],
['mipl-hrdss','v1.0.2','543rfd3'],
['mipl-newss','v1.0.2','eelp234']]
This is the code I used to get a final list:
def get_difference(l1,l2):
l1 = get_ordered_list(file1.read())
l2 = get_ordered_list(file2.read())
d1 = {k:[v1,v2] for k,v1,v2 in l1}
d2 = {k:[v1,v2] for k,v1,v2 in l2}
result = []
for k,v in d2.items():
if k in d1:
v1 = d1[k]
if v1[0] != v[0]:
result.append({k,v1[0],v[0], v1[1],v[1]})
else:
result.append({k,'new',v[0],'new', v[1]})
for k,v in d1.items():
if k not in d2:
result.append({k,v[0],'deprecated', v[1], 'deprecated'})
res_json = json.dumps(result)
return res_json
Current Output :
result = [['mipl-avfd', 'v1.1.5', 'v1.1.6','32fv423', '3254323'], ['mipl-oldss','v1.0.2', 'deprecated','eelp234', 'deprecated'], ['mipl-newss', 'new','v1.0.2','new', 'eelp234']]
Required Output(I want to write it to an easily readable JSON which can be later made into a table) :
{diff = {"name" : "mipl-avfd",
"old-version" : "v1.1.5",
"new-version" : "v1.1.6",
"old-id" : "32fv423",
"new-id" : "3254323"
},
{"name" : "mipl-oldss",
"old-version" : "v1.0.2",
"new-version" : "deprecated",
"old-id" : "eelp234",
"new-id" : "deprecated"
},
{"name" : "mipl-newss",
"old-version" : "new",
"new-version" : "v1.0.2",
"old-id" : "eelp234",
"new-id" : "new"
}
}
I hope I understand your question correctly. You have an "old" list_1 and a "new" list_2, and you want to build a flat list describing how the versions changed (I assume list_1 holds the old versions):
import json
from itertools import groupby
list_1 = [
['mipl-abnd','v1.0.2','eelp234'],
['mipl-avfd','v1.1.5','32fv423'],
['mipl-awsd','v9.0.2','234eelp'],
['mipl-tgfd','v3.0.0','124fdge'],
['mipl-hrdss','v1.0.2','543rfd3'],
['mipl-oldss','v1.0.2','eelp234']]
list_2 = [
['mipl-abnd','v1.0.2','eelp234'],
['mipl-avfd','v1.1.6','3254323'],
['mipl-awsd','v9.0.2','234eelp'],
['mipl-tgfd','v3.0.0','124fdge'],
['mipl-hrdss','v1.0.2','543rfd3'],
['mipl-newss','v1.0.2','eelp234']]
# sorted() is stable, so within one name the list_1 row (old) stays ahead of
# the list_2 row (new) after sorting the concatenation by name.
s = sorted(list_1 + list_2, key=lambda k: k[0])
out = []
for v, g in groupby(s, lambda k: k[0]):
    g = list(g)
    if len(g) == 2:
        # Name present in both lists: g[0] is the old row, g[1] the new one.
        out.append({
            'name': v,
            'old-version': g[0][1],
            'new-version': g[1][1],
            'old-id': g[0][2],
            'new-id': g[1][2],
        })
    elif g[0] in list_1:
        # Name only present in the old list.
        out.append({
            'name': v,
            'old-version': g[0][1],
            'new-version': 'deprecated',
            'old-id': g[0][2],
            'new-id': 'deprecated',
        })
    else:
        # Name only present in the new list.
        out.append({
            'name': v,
            'old-version': 'new',
            'new-version': g[0][1],
            'old-id': 'new',
            'new-id': g[0][2],
        })
print(json.dumps(out, indent=4))
Prints:
[
{
"name": "mipl-abnd",
"old-version": "v1.0.2",
"new-version": "v1.0.2",
"old-id": "eelp234",
"new-id": "eelp234"
},
{
"name": "mipl-avfd",
"old-version": "v1.1.5",
"new-version": "v1.1.6",
"old-id": "32fv423",
"new-id": "3254323"
},
{
"name": "mipl-awsd",
"old-version": "v9.0.2",
"new-version": "v9.0.2",
"old-id": "234eelp",
"new-id": "234eelp"
},
{
"name": "mipl-hrdss",
"old-version": "v1.0.2",
"new-version": "v1.0.2",
"old-id": "543rfd3",
"new-id": "543rfd3"
},
{
"name": "mipl-newss",
"old-version": "new",
"new-version": "v1.0.2",
"old-id": "new",
"new-id": "eelp234"
},
{
"name": "mipl-oldss",
"old-version": "v1.0.2",
"new-version": "deprecated",
"old-id": "eelp234",
"new-id": "deprecated"
},
{
"name": "mipl-tgfd",
"old-version": "v3.0.0",
"new-version": "v3.0.0",
"old-id": "124fdge",
"new-id": "124fdge"
}
]
What you call JSON is not valid JSON. Also, JSON is a string format - what you actually want is a list of dicts. You don't have to dump it into a JSON string.
Why do you give l1 and l2 as arguments to the function when you overwrite them in the first lines?
file1 and file2 are not defined in the function. Also, for reading files you should use with to properly close the file.
First, you need to declare the keys (labels) somewhere:
# One label per position of the 5-item rows zipped against it; zip() stops at
# the shorter sequence, so a missing label would mislabel and drop columns.
keys = ["name", "old-version", "new-version", "old-id", "new-id"]
Then, instead of appending a list, you append a dict.
Thankfully, dicts can be easily made from lists of tuples - and we can merge keys and your current lists into lists of tuples easily, e.g.:
dict(zip(keys, [k,v1[0],v[0], v1[1],v[1]]))
So it now looks like this:
for k, v in d2.items():
    if k in d1:
        v1 = d1[k]
        # Only report names whose version actually changed.
        if v1[0] != v[0]:
            result.append(dict(zip(keys, [k, v1[0], v[0], v1[1], v[1]])))
    else:
        # Name exists only in the new data.
        result.append(dict(zip(keys, [k, 'new', v[0], 'new', v[1]])))
for k, v in d1.items():
    if k not in d2:
        # Name exists only in the old data.
        result.append(dict(zip(keys, [k, v[0], 'deprecated', v[1], 'deprecated'])))

How to sort all lists in a deeply nested dictionary in python?

I want to sort all lists within a deeply nested dictionary. It is basically a JSON object with deep nesting of dictionaries within lists and lists within dictionaries. All I want to do is walk through all dictionary keys down to the leaf nodes and sort every list that I encounter on the way. In other words, any list found directly in, or anywhere deep inside, the given dictionary should be sorted, and the same dictionary with all lists sorted should be returned.
I tried doing recursion on the dict object to pass any dict object encountered to the recursion method and sorting the lists when encountered. But they fail to produce results when there is a dict inside a list and then another list inside that dict object.
Sample JSON below:
my_json = {
a: {
b: {
c: [
{
d: [
{ f: 'some_string' }
]
},
{
e: {
g: [
h: 'another string'
]
}
}
]
}
}
z: [
b: {
c: [
{
d: [
{ f: 'some_string1' }
]
},
{
e: {
g: [
h: 'another string1'
]
}
}
]
},
x: {
c: [
{
d: [
{ f: 'some_string2' }
]
},
{
e: {
g: [
h: 'another string2'
]
}
}
]
}
]
}
def gen_dict_extract(input_dict):
result_obj = input_dict;
if hasattr(var, 'iteritems'):
for k, v in var.iteritems():
if isinstance(v, dict):
for result in gen_dict_extract(v):
yield result
elif isinstance(v, list):
v.sort();
for d in v:
for result in gen_dict_extract(d):
yield result
The output expectation is just to have all lists sorted irrespective of where they lie. I am even okay with sorting every item in the dictionary but list sorting is what I require.
Taking a smaller example here to explain the output:
old_json = {
'x': [
{
'z': {
'y': ['agsd', 'xef', 'sdsd', 'erer']
}
},
{
's': {
'f': 'ererer',
'd': [5, 6, 2, 3, 1]
}
}
]
}
new_json = {
'x': [
{
's': {
'f': 'ererer',
'd': [1, 2, 3, 5, 6]
}
},
{
'z': {
'y': ['agsd', 'erer', 'sdsd','xef']
}
}
]
}
Something like above.
If you want the output to be a different dictionary (i.e. not sorting the original), the function should be written like this:
def sortedDeep(d):
    """Return a sorted copy of *d*; the original object is left untouched.

    Lists are rebuilt with ``sorted()`` after their items have been processed
    recursively, and dicts are rebuilt with their keys in sorted order.
    Anything else is returned as is. List items are compared with their
    default ordering, so a list that contains dicts raises ``TypeError``.
    """
    if isinstance(d, list):
        return sorted(sortedDeep(v) for v in d)
    if isinstance(d, dict):
        return {k: sortedDeep(d[k]) for k in sorted(d)}
    return d
This way you can use sortedDeep() the same way you would use the built-in sorted() function:
new_json = sortedDeep(old_json)
[EDIT] Improved version that will also sort lists of dictionaries (or list of lists) based on the smallest key/value of the embedded object:
def sortedDeep(d):
    """Return a sorted copy of *d*, including lists of dicts or of lists.

    Lists are sorted after their items have been processed recursively and
    dicts are rebuilt with their keys in sorted order; the input is not
    modified. Inside a list, a dict is ordered by the tuple of its keys, a
    list by the tuple of its items, and any other value by itself.
    """
    def makeTuple(v):
        # Unpacking a dict yields its keys, so dicts are compared by key.
        return (*v,) if isinstance(v, (list, dict)) else (v,)

    if isinstance(d, list):
        return sorted(map(sortedDeep, d), key=makeTuple)
    if isinstance(d, dict):
        return {k: sortedDeep(d[k]) for k in sorted(d)}
    return d
I believe the code snippet here will do the job for sorting nested dictionaries.
def nested_sort(d:dict):
    """Sort, in place, every list found at any depth inside *d*.

    Dicts and lists are walked recursively, including dicts that sit inside
    lists. Each list is sorted after its items have been processed. Within a
    list, a dict is ordered by the tuple of its keys, a nested list by the
    tuple of its items, and any other value by itself. Nothing is returned.
    """
    def sort_key(item):
        # Dicts are not orderable themselves, so compare them by their keys.
        return (*item,) if isinstance(item, (dict, list)) else (item,)

    def walk(node):
        if isinstance(node, dict):
            for child in node.values():
                walk(child)
        elif isinstance(node, list):
            # Sort the children first so nested lists are already ordered
            # when they are used as sort keys.
            for child in node:
                walk(child)
            node.sort(key=sort_key)

    walk(d)
However, I cannot test the code because the example you gave is not in legal JSON format or a legal python dictionary.

Walk recursively through childs of a list in Python

I try to recursively print sentences from a nested list of lists
I want to obtain a list containing
['big bad dog', 'big fluffy cat', 'small blue happy pony', 'small frog']
Here is my code, it don't work...
Am I on the right path, or should I structure my data in another way to achieve my goal?
from pprint import pprint
dirs = [
{
'kw': 'big',
'childs': [
{
'kw': 'bad',
'childs': [
{
'kw': 'dog'
}
]
},
{
'kw': 'fluffy',
'childs': [
{
'kw': 'cat'
}
]
}
]
},
{
'kw': 'small',
'childs': [
{
'kw': 'blue',
'childs': [
{
'kw': 'happy',
'childs': [
{
'kw': 'pony'
}
]
}
]
},
{
'kw': 'frog'
}
]
},
]
def traverse(d, l):
kw = d.get('kw')
c = d.get('childs')
l.append(kw)
if c:
for cc in c:
l = traverse(cc, l)
return l
r = traverse(dirs[0], [])
pprint(r)
As usual, generators work nicely with recursive structures
def traverse(i):
    """Yield one space-joined sentence per root-to-leaf path.

    *i* is an iterable of dicts. Each dict has a ``'kw'`` word and may have
    a ``'childs'`` list of further dicts of the same shape.
    """
    for d in i:
        childs = d.get('childs')
        if childs:
            for j in traverse(childs):
                yield d['kw'] + ' ' + j
        else:
            # Leaf: yield the bare word so no trailing space is appended.
            yield d['kw']
res = list(traverse(dirs))
In Python3.3, this becomes
def traverse(i):
    """Yield one space-joined sentence per root-to-leaf path.

    *i* is an iterable of dicts. Each dict has a ``'kw'`` word and may have
    a ``'childs'`` list of further dicts of the same shape.
    """
    for d in i:
        c = d.get('childs')
        if c:
            yield from (d['kw'] + ' ' + j for j in traverse(c))
        else:
            # Leaf: yield the bare word so no trailing space is appended.
            yield d['kw']
Check out this function:
def traverse(d, l, s=None):
    """Append to *l* one sentence per leaf reachable from dict *d*.

    Args:
        d: dict with a ``'kw'`` word and an optional ``'childs'`` list.
        l: list that collects the finished sentences; it is mutated.
        s: sentence built from the ancestors so far, ``None`` at the root.

    Returns:
        The list *l*.
    """
    kw = d.get('kw')
    c = d.get('childs')
    s = kw if s is None else s + " " + kw
    if c:
        for cc in c:
            l = traverse(cc, l, s)
    else:
        # No children: the accumulated sentence is complete.
        l.append(s)
    return l
It's a very small modification to your recursive function:
r = traverse(dirs[0], [])
I would say my solution is pretty simple (change the item lookup for get if you want to handle cases where the key isn't found)
def _traverse(d):
    """Return the space-joined sentences for every leaf below dict *d*."""
    prefix = d['kw']
    children = d.get('childs')
    # A missing or empty child list both mean that d itself is a leaf.
    if not children:
        return [prefix]
    results = []
    for subdict in children:
        results.extend(prefix + ' ' + item for item in _traverse(subdict))
    return results


def traverse(d):
    """Return the sentences of every top-level dict in *d* as one flat list."""
    return [sentence for subdict in d for sentence in _traverse(subdict)]
Here's a solution to the given problem, with dirs being the json like structure you defined above. It's recursive, it works and it covers edge cases like the top structure being a dictionary.
def traverse(l, al="", comps=None):
    """Collect one ``", "``-joined string per leaf of the nested structure.

    Args:
        l: a dict with ``"kw"`` / ``"childs"`` entries, a list of such dicts,
            or any other value, which ends the current path.
        al: the path accumulated so far.
        comps: list that receives the finished paths. A fresh list is
            created for each top-level call when it is omitted.

    Returns:
        The list of collected paths.
    """
    # A default of [] would be shared between calls and keep growing.
    if comps is None:
        comps = []
    if isinstance(l, dict):
        if not al:
            al += l.get("kw")
        else:
            al += ", %s" % l.get("kw")
        traverse(l.get("childs"), al, comps)
    elif isinstance(l, list):
        for i in l:
            traverse(i, al, comps)
    else:
        # Reached past a leaf (its "childs" lookup gave a non-container).
        comps.append(al)
    return comps
print(traverse(dirs))
Ah, Gnibbler beat me to the generator hat tip. The only differences worth mentioning are the use of " ".join to stitch the sentence together and the `*` unpacking syntax to avoid if tests.
def traverse_keys(*dictvalues):
    """Yield the ``'kw'`` words of the given dicts, children before parents.

    Each positional argument is a dict with an optional ``'childs'`` list.
    The words of all its descendants are yielded first, then its own
    ``'kw'`` if that is present and truthy.
    """
    for dictval in dictvalues:
        yield from traverse_keys(*dictval.get('childs', []))
        kw = dictval.get('kw')
        if kw:
            yield kw
tokens = list(traverse_keys(*dirs))
# Words arrive children-first, so reverse them to read from the top down.
tokens.reverse()
print(" ".join(tokens))
If you don't expect multiple branches in your children, then you can just nest the dictionaries directly - your logic doesn't have a way to choose which branch to follow in the current structure. You could have named branches just by nesting dictionaries:
{ 'kw': 'dog'
'big': { 'kw': 'scary' }
'small': { 'kw': 'yippy', 'fluffy': { 'kw': 'poodle'} }
}
which would make the traversal cleaner

Categories

Resources