I have a large collection that can be modeled more or less as the one created by the following code:
import string
from random import randint, random, choice
# Build a synthetic data set: 100 rounds of three documents (one per
# sample_id) that share a random letter, followed by ten extra documents
# for sample 'ABC' only.
documents = []
for i in range(100):
    # string.ascii_letters is available on both Python 2 and Python 3
    # (string.letters no longer exists on Python 3); the first 15
    # characters are the lowercase letters 'a'..'o'.
    letters = choice(string.ascii_letters[0:15])
    for sample_id in ("CDE", "ABC", "GEF"):
        documents.append({'hgvs_id': "".join([str(randint(0, 9)), letters]),
                          'sample_id': sample_id,
                          'number': i*random()*50 - 30})

for i in range(10):  # add some unique values for sample_id 'ABC'
    letters = choice(string.ascii_letters[0:15])
    # The "55" prefix is two digits, whereas the ids generated above start
    # with a single digit, so these ids cannot collide with the other samples.
    documents.append({'hgvs_id': "55" + letters,
                      'sample_id': 'ABC',
                      'number': i*random()*50 - 30})

# NOTE: `collection` is not defined in this snippet; it is assumed to be an
# existing pymongo collection object.
collection.insert_many(documents)
I am trying to retrieve the unique hgvs_id's that occur within documents that have a specific sample_id (ABC here) but not in documents containing the other two. Usually, there will be many more sample_id than just three.
It sounds pretty simple, but so far I have been unsuccessful. Given the size of the collection I'm working with (~30GB), I've been trying to use the aggregate framework as follows:
# Attempted pipeline: group every document by hgvs_id while collecting the
# contributing documents, then keep only the groups that contain at least
# one document whose sample is 'ABC'.
# NOTE: the $match stage below only checks that 'ABC' is present; nothing
# in it excludes groups that also contain other sample_ids.
sample_1 = collection.aggregate(
[
# Stage 1: one output document per distinct hgvs_id.
{'$group':
{
'_id': '$hgvs_id',
#'sample_id' : {"addToSet": '$hgvs_id'},
'matchedDocuments':
# Keep the _id, sample_id and hgvs_id of every document in the group.
{'$push':
{
'id': '$_id',
'sample_name': "$sample_id",
'hgvs_ids': "$hgvs_id"
}
},
}
},
# Stage 2: keep groups having at least one element with sample_name 'ABC'.
{'$match': {
"$and": [
{'matchedDocuments': {"$elemMatch": {'sample_name': 'ABC'}}},
# Some other operation????
]
}
}
]) #, allowDiskUse=True) may be needed
This returns (understandably) all the hgvs_id's having sample_id equal ABC. Any leads would be more than appreciated.
If it's the only sample_id in the "set" of grouped values then the $size will be one:
With MongoDB 3.4 you can use $in in combination:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$in": [ "ABC", "$samples" ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Otherwise use $setIntersection which is just a little longer in syntax:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$eq": [ { "$size": { "$setIntersection": [ "$samples", ["ABC"] ] } }, 1 ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Or probably in the simplest form for all versions supporting aggregation anyway:
[
  { "$group": {
    "_id": "$hgvs_id",
    "samples": { "$addToSet": "$sample_id" }
  }},
  { "$match": {
    "$and": [{ "samples": "ABC" },{ "samples": { "$size": 1 } }]
  }}
]
The same principle applies to any number of arguments: the "set" produced must match the size of the arguments given as well as contain the specific values.
Related
As title suggests, I have this document structure:
{
"_id":ObjectId("61e53553ac31665894ebf6bc"),
"questionID":"8",
"questionContent":"find it",
"questoinAnswer":"it's here",
"questionStatus":"active",
"questionImage":"some image",
"hints":[
{
"hintID":"1",
"hintSubject":"in you pucket",
"hintContent":"bla bla bla",
"hintType":"private",
"hintStatus":"Active",
"time":"2022-01-23 11:02:41.976391"
},
{
"hintID":"2",
"hintSubject":"red sea",
"hintContent":"bla bla bla",
"hintMedia":"some media",
"hintType":"puplic",
"hintStatus":"Active",
"time":"2022-01-23 11:05:47.567226"
}
]
}
I want to retrieve only the values of hintSubject for hints whose hintType is free and whose hintStatus is active, and put them into a list.
Use the aggregation query below; it stores the list of matching hintSubject values under the hintSubject key at the root of each result document.
from pymongo import MongoClient
# Open a client with the default connection settings and select the
# collection that holds the question documents.
c = MongoClient()
db = c["db_name"]
col = db["sample_collection"]

# Add a root-level "hintSubject" list to every document. $reduce walks the
# "hints" array and appends a hint's hintSubject to the accumulator only
# when that hint has hintType "free" and hintStatus "Active"; every other
# hint contributes an empty list.
pipeline = [
    {
        "$addFields": {
            "hintSubject": {
                "$reduce": {
                    "input": "$hints",
                    "initialValue": [],
                    "in": {
                        "$concatArrays": [
                            "$$value",
                            {
                                "$cond": {
                                    "if": {
                                        "$and": [
                                            {"$eq": ["$$this.hintType", "free"]},
                                            {"$eq": ["$$this.hintStatus", "Active"]},
                                        ]
                                    },
                                    "then": ["$$this.hintSubject"],
                                    "else": [],
                                },
                            },
                        ],
                    },
                },
            }
        }
    }
]

# The `for` header needs its trailing colon and an indented body to be
# valid Python.
for x in col.aggregate(pipeline):
    print(x["hintSubject"])
Mongo Playground Sample Execution
Need to group the set of objects with duplicate value
Input
{
"errors": [
{
"check_no": "1.4",
"security": "IAM"
},
{
"check_no": "1.3",
"security": "SLM"
},
{
"check_no": "1.4",
"security": "EKM"
}
]
}
Here the check_no inside the array has 1.4, 1.3, 1.4. I need to group them into an array with an additional key as shown below
Output
{
"errors": [
{
"check_no": "1.4",
"values": [
{
"security": "IAM"
},
{
"security": "EKM"
}
]
},
{
"check_no": "1.3",
"values": [
{
"security": "SLM"
}
]
}
]
}
First you need to collect the "security" values per "check_no"; a defaultdict is very suitable for that. Then rebuild the result:
from collections import defaultdict
# Collect the "security" value of every error under its "check_no".
# NOTE: `v` is not defined in this snippet; it is assumed to hold the parsed
# input document, i.e. the dict with the "errors" list.
grouped_check_no = defaultdict(list)
for error in v['errors']:
    grouped_check_no[error['check_no']].append({"security": error['security']})

# Rebuild the requested shape: one entry per distinct check_no carrying the
# grouped values. The comprehension uses its own names so the input `v` is
# never shadowed (or, on Python 2, rebound).
result = {
    'errors': [
        {"check_no": check_no, "values": securities}
        for check_no, securities in grouped_check_no.items()
    ]
}
grouped_check_no is
{'1.4': [{'security': 'IAM'}, {'security': 'EKM'}], '1.3': [{'security': 'SLM'}]}
I have 2 mongo db collections, 'Contacts' and 'Messages'. Both collections share the phone number field(Primary/Foreign Key relation in SQL).
Contacts collection has this field as follows:
{
"phone": "+192******",
"name": "test"
}
and Messages as follows:
{
"Tel": "tel:+192******"
}
I want to aggregate the 2 collections such that I can have this nested document:
"text": "text sent by user",
"contact": {
"phone": "+192******",
"name": "test"
}
So far, I have tried the following aggregation but it doesn't work:
# Attempted join of messages to contacts through a pipeline-style $lookup.
# NOTE(review): per the sample documents, messages carry "Tel" and contacts
# carry "phone", yet `let` reads "$phone" from the message and the
# sub-pipeline reads "$Tel" from the contact -- the two sides look swapped.
cursor = messages_client.aggregate([{
'$lookup':
{
'from': "contacts",
# Variables in `let` are evaluated against the outer (message) document.
'let': { 'phone': "$phone"},
# The sub-pipeline runs against documents of the "contacts" collection.
'pipeline': [
# Skip the first 4 characters of "Tel" and keep the rest of the string.
{ '$addFields': { 'phone_number': { "$substr": [ "$Tel", 4, -1 ] }}},
{'$match': { "$expr": { '$eq': [ '$phone_number', '$$phone']}}}
],
'as': 'contact'
}}
], allowDiskUse=True)
Could someone kindly help me? I'm using pymongo and Python3 if that is helpful.
Found some help from the $indexOfCP operator, for anyone with a similar problem.
# Join each message to the contacts whose phone number occurs inside the
# message's "Tel" value (e.g. "tel:+192..." contains "+192...").
cursor = messages_client.aggregate([{
    '$lookup': {
        'from': "contacts",
        # `let` is evaluated against the message document, so it must read
        # the message's own field ("Tel") and expose it as $$tel.
        'let': {'tel': "$Tel"},
        'pipeline': [
            # Inside the sub-pipeline, "$phone" is the contact's field.
            # $indexOfCP returns -1 when the substring is not found, so a
            # result greater than -1 means the phone number is contained
            # in the message's "Tel" string.
            {'$match': {"$expr": {'$gt': [{"$indexOfCP": ["$$tel", "$phone"]}, -1]}}}
        ],
        'as': 'contact'
    }}
], allowDiskUse=True)
Try this:
// Attach the matching contact to every message by comparing the contact's
// "phone" with the message's "Tel" value minus its 4-character prefix.
db.messages.aggregate([
// Stage 1: derive phone_number by skipping the first 4 characters of "Tel"
// (the "tel:" prefix in the sample data); a negative length keeps the rest
// of the string.
{
$addFields: {
'phone_number': { "$substr": ["$Tel", 4, -1] }
}
},
// Stage 2: look up contacts whose "phone" equals the derived phone_number.
{
$lookup: {
from: "contacts",
// Expose the message's phone_number to the sub-pipeline as $$phone.
let: { "phone": "$phone_number" },
pipeline: [
{
$match: {
$expr: { $eq: ["$phone", "$$phone"] }
}
}
],
as: "contact"
}
},
// Stage 3: turn the "contact" array into a single embedded document
// (messages without a matching contact are dropped by $unwind).
{ $unwind: "$contact" }
]);
i have a document in mongodb:
{
"company": "npcompany",
"department": [
{
"name": "it",
"employeeIds": [
"emp1",
"emp2",
"emp3"
]
},
{
"name": "economy",
"employeeIds": [
"emp1",
"emp3",
"emp4"
]
}
]
}
I want to find "emp4". In this case I want to get the "economy" department data only. If I search for "emp1", then I want to get the "npcompany" and "economy" data. How can I do it in MongoDB (or pymongo)?
play
// Return, per original document, only the departments that list the
// requested employee id.
db.collection.aggregate([ // every matching array element is needed, so flatten the array first
{
// Emit one document per element of the "department" array.
$unwind: "$department"
},
{
"$match": {// keep only the departments whose employeeIds contain "emp4"
"department.employeeIds": "emp4"
}
},
{
$group: {// regroup the surviving departments by their original document _id
"_id": "$_id",
data: {
// Each pushed element is the whole unwound document (company plus one department).
"$push": "$$ROOT"
}
}
}
])
I have an aggregate query via mongoengine:
# Two-stage pipeline: keep documents that have a "groups.1" entry and whose
# "groups" share at least one element with `my_groups`.
Foo.objects.aggregate(
    # "groups.1" addresses index 1, i.e. a second element is required
    # (assuming `groups` is an array -- TODO confirm against the model).
    {"$match": {"groups.1": {"$exists": True}}},
    {
        "$redact": {
            "$cond": [
                # Size of the intersection between the document's groups
                # and my_groups must be at least 1.
                {"$gte": [{"$size": {"$setIntersection": ["$groups", my_groups]}}, 1]},
                "$$KEEP",
                "$$PRUNE",
            ]
        }
    },
)
But, the results of the query are not enough.
I need to find all documents that match this query OR other queries.
How should I do that?
Thank you!