The input list of dictionaries is shown below, with sample lists of length 2 and 3. Depending on the length of the list, the query needs to be generated dynamically.
a = [{'data': 'abc'}, {'prod': 'def'}]
if len(a) = 2:
#below query has to generate
"query": {
"bool": {
"should": [
"query_string": {
"query": "*abc*",
"fields": [
"query_string": {
"query": "*def*",
"fields": [
a = [{'data': 'abc'}, {'prod': 'def'},{'email': '#gmail'}]
if len(a) = 3
#below is the query
"query": {
"bool": {
"should": [
"query_string": {
"query": "*abc*",
"fields": [
"query_string": {
"query": "*def*",
"fields": [
"query_string": {
"query": "*#gmail.com*",
"fields": [
Basically, as dictionaries keep being added to the list, a clause such as {"query_string": {"query": "*#gmail.com*","fields": ["email"]}} should keep being added to the query as well.
Using a simple iteration.
# Each entry of `a` is a single-key dict mapping a field name to a search term.
a = [{'data': 'abc'}, {'prod': 'def'}]

# Start from an empty bool/should query; one "query_string" clause is added
# per (field, term) pair, so the query grows with the length of `a`.
result = {"query": {"bool": {"should": []}}}
should_clauses = result["query"]["bool"]["should"]

for item in a:
    for k, v in item.items():
        should_clauses.append({
            "query_string": {
                # Wrap the term in wildcards so it matches as a substring.
                "query": f"*{v}*",
                "fields": [k],
            }
        })
{'query': {'bool': {'should': [{'query_string': {'fields': ['data'],
'query': '*abc*'}},
{'query_string': {'fields': ['prod'],
'query': '*def*'}}]}}}
I have a large collection that can be modeled more or less as the one created by the following code:
import string
from random import randint, random, choice
# Build a synthetic collection: each document has an 'hgvs_id' (a digit or
# "55" followed by one letter), a 'sample_id' and a random 'number'.
documents = []

# `string.letters` only exists in Python 2; `string.ascii_letters` works on
# both Python 2 and 3. Only the first 15 letters are used as the pool.
letter_pool = string.ascii_letters[0:15]

for i in range(100):
    letters = choice(letter_pool)
    # One document per sample_id, all sharing the same letter but each
    # drawing its own random digit and number.
    for sample_id in ('CDE', 'ABC', 'GEF'):
        documents.append({'hgvs_id': "".join([str(randint(0, 9)), letters]),
                          'sample_id': sample_id,
                          'number': i*random()*50 - 30})

for i in range(10):  # add some unique values for sample_id 'ABC'
    letters = choice(letter_pool)
    # The "55" prefix cannot collide with the single-digit ids generated above.
    documents.append({'hgvs_id': "55" + letters,
                      'sample_id': 'ABC',
                      'number': i*random()*50 - 30})
I am trying to retrieve the unique hgvs_id's that occur within documents that have a specific sample_id (ABC here) but not in documents containing the other two. Usually, there will be many more sample_id than just three.
It sounds pretty simple, but so far I have been unsuccessful. Given the size of the collection I'm working with (~30GB), I've been trying to use the aggregate framework as follows:
sample_1 = collection.aggregate(
'_id': '$hgvs_id',
#'sample_id' : {"addToSet": '$hgvs_id'},
'id': '$_id',
'sample_name': "$sample_id",
'hgvs_ids': "$hgvs_id"
{'$match': {
"$and": [
{'matchedDocuments': {"$elemMatch": {'sample_name': 'ABC'}}},
# Some other operation????
]) #, allowDiskUse=True) may be needed
This returns (understandably) all the hgvs_id's having sample_id equal ABC. Any leads would be more than appreciated.
If it's the only sample_id in the "set" of grouped values then the $size will be one:
With MongoDB 3.4 you can use $in in combination:
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$in": [ "ABC", "$samples" ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
"then": "$$KEEP",
"else": "$$PRUNE"
Otherwise use $setIntersection which is just a little longer in syntax:
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$eq": [ { "$size": { "$setIntersection": [ "$samples", ["ABC"] ] } }, 1 ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
"then": "$$KEEP",
"else": "$$PRUNE"
Or probably in the simplest form for all versions supporting aggregation anyway:
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
{ "$match": {
"$and": [{ "samples": "ABC" },{ "samples": { "$size": 1 } }]
The same principle applies to any number of arguments, in that the "set" produced must match the size of the arguments given as well as contain the specific value.
test_cursor = db.command({
"aggregate": "New_layout",
"pipeline": [
{ "$match": { "$and": [
{ "FIRST_DATE": { "$gte": new_date } },
{ "CHAIN_ID": { "$ne": "" } }
] } },
{ "$unwind": { "path": "$ENTERS", "includeArrayIndex": "Date" } },
{ "$project": {
"_id": 0,
"ZIP": "$ZIP",
"ZIP3": "$ZIP3",
"DATE": "$Date",
} }
"allowDiskUse": bool(1),
"cursor": {}
The contents of the cursor are as below :-
[u'cursor', u'ok', u'waitedMS'] .
However with an $out statement, the output collection has the expected contents.
I am running pymongo v3.2.2 and mongo 3.2. I was told this problem occurs with v3.0 or earlier, but I have not been able to figure out why it happens here.
You should use aggregate() instead of command().
test_cursor = db.New_layout.aggregate([
{ "$match": { "$and": [
{ "FIRST_DATE": { "$gte": new_date } },
{ "CHAIN_ID": { "$ne": "" } }
] } },
{ "$unwind": { "path": "$ENTERS", "includeArrayIndex": "Date" } },
{ "$project": {
"_id": 0,
"ZIP": "$ZIP",
"ZIP3": "$ZIP3",
"DATE": "$Date",
} }
I'm new to NoSQL databases and I chose MongoDB as my first one. I made an aggregation pipeline to show the data that I want; here are my document samples:
Document sample from Users Collection
"_id": 9,
"name": "Sample Name",
"email": "email#example.com",
"password": "password hash"
Document sample from Pages Collection (this one doesn't really matter)
"_id": 42,
"name": "Product Name",
"description": "Product Description",
"user_id": 8,
"rating_categories": [{
"_id": 114,
"name": "Build Quality"
}, {
"_id": 115,
"name": "Price"
}, {
"_id": 116,
"name": "Feature"
}, {
"_id": 117,
"name": "Comfort"
}, {
"_id": 118,
"name": "Switch"
Document sample from Reviews Collection
"_id": 10,
"page_id": 42, #ID reference from pages collection
"user_id": 8, #ID reference from users collection
"review": "The review of the product",
"ratings": [{
"_id": 114, #ID Reference from pages collection of what rating category it is
"rating": 5
}, {
"_id": 115,
}, {
"_id": 116,
"rating": 5
}, {
"_id": 117,
"rating": 3
}, {
"_id": 118,
"rating": 4
"created": "1582825968963", #Date Object
"votes": {
"downvotes": [],
"upvotes": [9] #IDs of users who upvote this review
I want to get reviews by page_id, which can be accessed through the API I made. Here's the expected result from the aggregation:
"_id": 10, #Review of the ID
"created": "Thu, 27 Feb 2020 17:52:48 GMT",
"downvote_count": 0, #Length of votes.downvotes from reviews collection
"page_id": 42, #Page ID
"ratings": [ #Stores what rate at what rating category id
"_id": 114,
"rating": 5
"_id": 115,
"rating": 4
"_id": 116,
"rating": 5
"_id": 117,
"rating": 3
"_id": 118,
"rating": 4
"review": "The Review",
"upvote_count": 0, #Length of votes.upvotes from reviews collection
"user": { #User who reviewed
"_id": 8, #User ID
"downvote_count": 0, #How many downvotes this user receive from all of the user's reviews
"name": "Sample Name", #Username
"review_count": 1, #How many reviews the user made
"upvote_count": 1 #How many upvotes this user receive from all of the user's reviews
"vote_state": 0 #Determining vote state from the user (who requested to the API) for this review, 0 for no vote, -1 for downvote, 1 for upvote
Here's the pipeline of the aggregation for reviews collection that i made for the result above:
user_id = 9
page_id = 42
pipeline = [
{"$group": {
"_id": {"user_id":"$user_id", "page_id": "$page_id"},
"review_id": {"$last": "$_id"},
"page_id": {"$last": "$page_id"},
"user_id" : {"$last": "$user_id"},
"ratings": {"$last": "$ratings"},
"review": {"$last": "$review"},
"created": {"$last": "$created"},
"votes": {"$last": "$votes"},
"upvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.upvotes", False]},
{"$size": "$votes.upvotes"},
"downvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.downvotes", False]},
{"$size": "$votes.downvotes"},
{"$lookup": {
"from": "users",
"localField": "user_id",
"foreignField": "_id",
"as": "user"
{"$unwind": "$user"},
{"$lookup": {
"from": "reviews",
"localField": "user._id",
"foreignField": "user_id",
"as": "user.reviews"
"_id": "$review_id",
"user.review_count": {"$size": "$user.reviews"},
"user.upvote_count": {"$sum":{
"in":{"$cond": [
{"$ifNull": ["$$this.votes.upvotes", False]},
{"$size": "$$this.votes.upvotes"},
"user.downvote_count": {"$sum":{
"in":{"$cond": [
{"$ifNull": ["$$this.votes.downvotes", False]},
{"$size": "$$this.votes.downvotes"},
"vote_state": {"$switch": {
"branches": [
{"case": { "$and" : [
{"$ifNull": ["$votes.upvotes", False]},
{"$in": [user_id, "$votes.upvotes"]}
]}, "then": 1
{"case": { "$and" : [
{"$ifNull": ["$votes.downvotes", False]},
{"$in": [user_id, "$votes.downvotes"]}
]}, "then": -1
"default": 0
"user.password": 0,
"user.email": 0,
"user_id": 0,
"review_id" : 0,
"votes": 0,
"user.reviews": 0
{"$sort": {"created": -1}},
{"$match": {"page_id": page_id}},
Note: A user can make multiple reviews for the same page_id, but only the latest will be shown.
I'm using pymongo, by the way; that's why the operators are in quotation marks.
My questions are:
Is there any room to optimize my aggregation pipeline?
Is it considered good practice to run multiple small aggregations to get data like the above, or is it always better to have one big aggregation (or as few as possible) to get the data that I want?
As you can see, every time I want to access votes.upvotes or votes.downvotes from a document in the reviews collection, I check whether the field is null or not. That's because the fields votes.upvotes and votes.downvotes aren't created when a user makes a review; instead, they are created when a user votes on that review. Should I create empty votes.upvotes and votes.downvotes fields when a user makes a review and remove the $ifNull? Will that increase the performance of the aggregation?
Check if this aggregation has better performance.
Create these indexes if you don't have already:
db.reviews.create_index([("page_id", 1)])
Note: We can improve the performance even further by avoiding the second $lookup on reviews.
$match: {
page_id: page_id
$addFields: {
request_user_id: user_id
$group: {
_id: {
page_id: "$page_id",
user_id: "$user_id",
request_user_id: "$request_user_id"
data: {
$push: "$$ROOT"
$lookup: {
"from": "users",
"let": {
root_user_id: "$_id.user_id"
"pipeline": [
$match: {
$expr: {
$eq: [
$lookup: {
"from": "reviews",
"let": {
root_user_id: "$$root_user_id"
"pipeline": [
$match: {
$expr: {
$eq: [
$project: {
user_id: 1,
downvote_count: {
$size: "$votes.downvotes"
upvote_count: {
$size: "$votes.upvotes"
$group: {
_id: null,
review_count: {
$sum: {
$cond: [
$eq: [
upvote_count: {
$sum: "$upvote_count"
downvote_count: {
$sum: "$downvote_count"
$unset: "_id"
"as": "stats"
$project: {
tmp: {
$mergeObjects: [
_id: "$_id",
name: "$name"
$arrayElemAt: [
$replaceWith: "$tmp"
"as": "user"
$addFields: {
first: {
$mergeObjects: [
$arrayElemAt: [
user: {
$arrayElemAt: [
created: {
$toDate: {
$toLong: {
$arrayElemAt: [
downvote_count: {
$reduce: {
input: "$data.votes.downvotes",
initialValue: 0,
in: {
$add: [
$size: "$$this"
upvote_count: {
$reduce: {
input: "$data.votes.upvotes",
initialValue: 0,
in: {
$add: [
$size: "$$this"
vote_state: {
$cond: [
$gt: [
$size: {
$filter: {
input: "$data.votes.upvotes",
cond: {
$in: [
$cond: [
$gt: [
$size: {
$filter: {
input: "$data.votes.downvotes",
cond: {
$in: [
$unset: [
$replaceWith: "$first"
"$sort": {
"created": -1