Grouping duplicate objects into an array of values

Grouping duplicate objects into an array of values - python

Need to group the set of objects with duplicate value
Input
{
"errors": [
{
"check_no": "1.4",
"security": "IAM"
},
{
"check_no": "1.3",
"security": "SLM"
},
{
"check_no": "1.4",
"security": "EKM"
}
]
}
Here the check_no inside the array has 1.4, 1.3, 1.4. I need to group them into an array with an additional key as shown below
Output
{
"errors": [
{
"check_no": "1.4",
"values": [
{
"security": "IAM"
},
{
"security": "EKM"
}
]
},
{
"check_no": "1.3",
"values": [
{
"security": "SLM"
}
]
}
]
}

First you need to collect the "security": "EKM" per "check_no", for that a defaultdict is very suitable, then rebuild the result
from collections import defaultdict
grouped_check_no = defaultdict(list)
for error in v['errors']:
grouped_check_no[error['check_no']].append({"security": error['security']})
result = {'errors': [{"check_no": k, "values": v} for k, v in grouped_check_no.items()]}
grouped_check_no is
{'1.4': [{'security': 'IAM'}, {'security': 'EKM'}], '1.3': [{'security': 'SLM'}]}

Related

Filter nested python dict by value

I have a python dictionary, where I don't exactly know, how deeply nested it is, but here is an example of such:
{
"name":"a_struct",
"type":"int",
"data":{
"type":"struct",
"elements":[
{
"data":[
{
"name":"test1",
"data_id":0,
"type":"uint8",
"wire_type":0,
"data":0
},
{
"name":"test2",
"data_id":2,
"type":"uint32",
"wire_type":2,
"data":0
},
{
"name":"test3",
"data_id":3,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
},
{
"name":"test4",
"data_id":4,
"type":"uint32",
"wire_type":2,
"data":0
},
{
"name":"test5",
"data_id":5,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
}
]
}
]
}
}
My goal is to filter out each dictionary that does not contains values ["test1", "test3", "test5"] by the name key. This shall be applicable to various deeply nested dictionaries.
So in that case, the result shall be a filtered dictionary:
{
"name":"a_struct",
"type":"int",
"data":{
"type":"struct",
"elements":[
{
"data":[
{
"name":"test1",
"data_id":0,
"type":"uint8",
"wire_type":0,
"data":0
},
{
"name":"test3",
"data_id":3,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
},
{
"name":"test5",
"data_id":5,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
}
]
}
]
}
}
I tried to use the dpath lib (https://pypi.org/project/dpath/), by providing a filter criteria like so:
def afilter(x):
if isinstance(x, dict):
if "name" in x:
if x["name"] in ["test1", "test3", "test5"]:
return True
else:
return False
else:
return False
result = dpath.util.search(my_dict, "**", afilter=afilter)
But I get a wrong result, so every other key, has been filtered out, which is not what I want:
{
"data":{
"elements":[
{
"data":[
{
"name":"test1",
"data_id":0,
"type":"uint8",
"wire_type":0,
"data":0
},
null,
{
"name":"test3",
"data_id":3,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
},
null,
{
"name":"test5",
"data_id":5,
"type":"int",
"wire_type":4,
"data":{
"type":"uint32",
"elements":[
]
}
}
]
}
]
}
}
How to get this right?
PS: I'm not forced to use the dpath lib. So, the solution might be written in pure python.

You can recursively process your dictionary while filtering unneeded records:
def delete_keys(data, keys_to_keep):
res = {}
for k, v in data.items():
if isinstance(v, dict):
res[k] = delete_keys(v, keys_to_keep)
elif isinstance(v, list):
if k == "data":
res[k] = [delete_keys(obj, keys_to_keep) for obj in v if obj.get('name') in keys_to_keep]
else:
res[k] = [delete_keys(obj, keys_to_keep) for obj in v]
else:
res[k] = v
return res
keys_to_keep = {'test1', 'test3', 'test5'}
print(delete_keys(data, keys_to_keep))
For your input, it gives:
{
"name": "a_struct",
"type": "int",
"data": {
"type": "struct",
"elements": [
{
"data": [
{
"name": "test1",
"data_id": 0,
"type": "uint8",
"wire_type": 0,
"data": 0,
},
{
"name": "test3",
"data_id": 3,
"type": "int",
"wire_type": 4,
"data": {"type": "uint32", "elements": []},
},
{
"name": "test5",
"data_id": 5,
"type": "int",
"wire_type": 4,
"data": {"type": "uint32", "elements": []},
},
]
}
],
},
}

using pymongo,I want to retrieve only values in hint's subject from hints array if the hint type is free&active and then list the values into a list

As title suggests, I have this document structure:
{
"_id":ObjectId("61e53553ac31665894ebf6bc"),
"questionID":"8",
"questionContent":"find it",
"questoinAnswer":"it's here",
"questionStatus":"active",
"questionImage":"some image",
"hints":[
{
"hintID":"1",
"hintSubject":"in you pucket",
"hintContent":"bla bla bla",
"hintType":"private",
"hintStatus":"Active",
"time":"2022-01-23 11:02:41.976391"
},
{
"hintID":"2",
"hintSubject":"red sea",
"hintContent":"bla bla bla",
"hintMedia":"some media",
"hintType":"puplic",
"hintStatus":"Active",
"time":"2022-01-23 11:05:47.567226"
}
]
}
I want to retrieve only the values of hintSubject if the hintType is free and hintStatus is active and put it into a list

Use the below aggregation query where list of hintSubject is stored in hintSubject key in root dictionary key.
from pymongo import MongoClient
c = MongoClient()
db = c["db_name"]
col = db["sample_collection"]
for x in col.aggregate([
{
"$addFields": {
"hintSubject": {
"$reduce": {
"input": "$hints",
"initialValue": [],
"in": {
"$concatArrays": [
"$$value",
{
"$cond": {
"if": {
"$and": [
{
"$eq": [
"$$this.hintType",
"free"
]
},
{
"$eq": [
"$$this.hintStatus",
"Active"
]
},
]
},
"then": [
"$$this.hintSubject"
],
"else": [],
},
},
],
},
},
}
}
}
])
print(x["hintSubject"])
Mongo Playground Sample Execution

How to filter elements of array in mongodb

i have a document in mongodb:
{
"company": "npcompany",
"department": [
{
"name": "it",
"employeeIds": [
"emp1",
"emp2",
"emp3"
]
},
{
"name": "economy",
"employeeIds": [
"emp1",
"emp3",
"emp4"
]
}
]
}
I want to find "emp4". In this case i want to get "economy" department data only. If i found "emp1" then i want to get "npcompany" and "economy" datas. How can i do it in mongodb (or pymongo)?

play
db.collection.aggregate([ //As you need to fetch all matching array elements, reshape them
{
$unwind: "$department"
},
{
"$match": {//look for match
"department.employeeIds": "emp4"
}
},
{
$group: {//regroup them
"_id": "$_id",
data: {
"$push": "$$ROOT"
}
}
}
])

Set difference between unions of specific subfields

I have a large collection that can be modeled more or less as the one created by the following code:
import string
from random import randint, random, choice
documents = []
for i in range(100):
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': "CDE",
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'GEF',
'number': i*random()*50 - 30 })
for i in range(10): # add some unique values for sample_id 'ABC'
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "55" + letters,
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
collection.insert_many(documents)
I am trying to retrieve the unique hgvs_id's that occur within documents that have a specific sample_id (ABC here) but not in documents containing the other two. Usually, there will be many more sample_id than just three.
It sounds pretty simple, but so far I have been unsuccessful. Given the size of the collection I'm working with (~30GB), I've been trying to use the aggregate framework as follows:
sample_1 = collection.aggregate(
[
{'$group':
{
'_id': '$hgvs_id',
#'sample_id' : {"addToSet": '$hgvs_id'},
'matchedDocuments':
{'$push':
{
'id': '$_id',
'sample_name': "$sample_id",
'hgvs_ids': "$hgvs_id"
}
},
}
},
{'$match': {
"$and": [
{'matchedDocuments': {"$elemMatch": {'sample_name': 'ABC'}}},
# Some other operation????
]
}
}
]) #, allowDiskUse=True) may be needed
This returns (understandably) all the hgvs_id's having sample_id equal ABC. Any leads would be more than appreciated.

If it's the only sample_id in the "set" of grouped values then the $size will be one:
With MongoDB 3.4 you can use $in in combination:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$in": [ "ABC", "$samples" ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Otherwise use $setIntersection which is just a little longer in syntax:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$eq": [ { "$size": { "$setIntersection": [ "$samples", ["ABC"] ] } }, 1 ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Or probably in the simplest form for all versions supporting aggregation anyway:
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$match": {
"$and": [{ "samples": "ABC" },{ "samples": { "$size": 1 } }]
}}
]
The same principle applies to any number of arguments in that the "set" produced much much the size of the arguments given as well as containing the specific value.

How can I convert my JSON into the format required to make a D3 sunburst diagram?

I have the following JSON data:
{
"data": {
"databis": {
"dataexit": {
"databis2": {
"1250": { }
}
},
"datanode": {
"20544": { }
}
}
}
}
I want to use it to generate a D3 sunburst diagram, but that requires a different data format:
{
"name": "data",
"children": [
{
"name": "databis",
"children": [
{
"name": "dataexit",
"children": [
{
"name": "databis2",
"size": "1250"
}
]
},
{
"name": "datanode",
"size": "20544"
}
]
}
]
}
How can I do this with Python? I think I need to use a recursive function, but I don't know where to start.

You could use recursive solution with function that takes name and dictionary as parameter. For every item in given dict it calls itself again to generate list of children which look like this: {'name': 'name here', 'children': []}.
Then it will check for special case where there's only one child which has key children with value of empty list. In that case dict which has given parameter as a name and child name as size is returned. In all other cases function returns dict with name and children.
import json
data = {
"data": {
"databis": {
"dataexit": {
"databis2": {
"1250": { }
}
},
"datanode": {
"20544": { }
}
}
}
}
def helper(name, d):
# Collect all children
children = [helper(*x) for x in d.items()]
# Return dict containing size in case only child looks like this:
# {'name': '1250', 'children': []}
# Note that get is used to so that cases where child already has size
# instead of children work correctly
if len(children) == 1 and children[0].get('children') == []:
return {'name': name, 'size': children[0]['name']}
# Normal case where returned dict has children
return {'name': name, 'children': [helper(*x) for x in d.items()]}
def transform(d):
return helper(*next(iter(d.items())))
print(json.dumps(transform(data), indent=4))
Output:
{
"name": "data",
"children": [
{
"name": "databis",
"children": [
{
"name": "dataexit",
"children": [
{
"name": "databis2",
"size": "1250"
}
]
},
{
"name": "datanode",
"size": "20544"
}
]
}
]
}

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Grouping duplicate objects into an array of values - python

Related

Filter nested python dict by value

using pymongo,I want to retrieve only values in hint's subject from hints array if the hint type is free&active and then list the values into a list

How to filter elements of array in mongodb

Set difference between unions of specific subfields

How can I convert my JSON into the format required to make a D3 sunburst diagram?

Categories

Resources