MongoDB Aggregation - Creating variable for $sum - python

Sample input:
{
"students":[
{
"name" : "John",
"semesters":[
{
"semester": "fall",
"grades": [
{"EXAM_1" : 25},
{"EXAM_2" : 45},
{"EXAM_3" : 22}
]
},
{
"semester": "winter",
"grades": [
{"EXAM_1" : 85},
{"EXAM_2" : 32},
{"EXAM_3" : 17}
]
}
]
},{
"name" : "Abraham",
"semesters":[
{
"semester": "fall",
"grades": [
{"EXAM_1" : 5},
{"EXAM_2" : 91},
{"EXAM_3" : 51}
]
},
{
"semester": "winter",
"grades": [
{"EXAM_1" : 55},
{"EXAM_2" : 62},
{"EXAM_3" : 17}
]
}
]
},{
"name" : "Zach",
"semesters":[
{
"semester": "spring",
"grades": [
{"EXAM_1" : 18},
{"EXAM_2" : 19},
{"EXAM_3" : 26}
]
},
{
"semester": "winter",
"grades": [
{"EXAM_1" : 100},
{"EXAM_2" : 94},
{"EXAM_3" : 45}
]
}
]
}
]
}
So this is what I have so far
data = await db.userstats.aggregate([
{ "$unwind": "$students.semesters" },
{ "$unwind": "$students.semesters.fall" },
{ "$unwind": f"$students.semesters.fall.grades" },
{
{ "$sum": [
{"$match" : { "$students.semesters.fall.grades" : "EXAM_3" } },
{"$multiply": [2, {"$match" : { "$students.semesters.fall.grades" : "EXAM_1" } }]}
]
}
},
{
"$project": {
"name" : "$name",
"character" : "$students.semesters.fall",
"exam_name" : "$students.semesters.fall.grades",
"exam_value" : "2*exam 1 + exam 3"
}
},
{ "$sort": { "exam_value": -1 }},
{ '$limit' : 30 }
]).to_list(length=None)
print(data)
I've been trying to implement a calculation performed on exam grades for each student in a data sample and comparing it to other students. I am stuck on how to properly perform the calculation. The basic rundown is that I need the output to be sorted calculations of
2*exam 1 + exam3.
I understand that $sum cannot be used in the pipeline stage, but I am unaware of how to use the $match command within the $sum operator.
Sample output:
{name: John, calculated_exam_grade: 202, 'semester':'winter'},
{name: Abraham, calculated_exam_grade: 101, 'semester':'fall'},
{name: John, calculated_exam_grade: 95, 'semester':'fall'},
etc...

Based on the expected result provided, the query is very similar to the one in the link I posted in the comment.
$unwind - Deconstruct students array.
$unwind - Deconstruct student.semesters array.
$project - Decorate output documents with the calculation for the calculated_exam_grade field.
$sort
$limit
db.collection.aggregate([
{
"$unwind": "$students"
},
{
"$unwind": "$students.semesters"
},
{
"$project": {
_id: 0,
"name": "$students.name",
"semester": "$students.semesters.semester",
"calculated_exam_grade": {
$sum: [
{
"$multiply": [
2,
{
$sum: [
"$students.semesters.grades.EXAM_1"
]
}
]
},
{
$sum: [
"$students.semesters.grades.EXAM_3"
]
}
]
}
}
},
{
"$sort": {
"calculated_exam_grade": -1
}
},
{
"$limit": 30
}
])
Sample Mongo Playground

Related

Mongodb aggregation with $or nested clauses

I have mongodb documents like this:
{
"_id" : ObjectId("5d35ba501545d248c383871f"),
"key1" : 1,
"currentTime" : ISODate("2019-07-18T19:41:54.000Z"),
"iState" : "START - 1",
"errGyro" : -4.0,
"states" : [
{
"ts" : 3,
"accY" : -165.877227783203,
"gyroZ" : 8.2994499206543,
},
{
"ts" : 4,
"accY" : -15.843573,
"gyroZ" : 12.434643,
},
{
"ts" : 3,
"accY" : 121.32667,
"gyroZ" : 98.45566,
}
]
}
I want to return all the states objects and the parent document where "ts" is 3 or 5.
I tried this query at first:
db.getCollection('logs').find(
{"states" :
{ "$elemMatch" : { "$or":[
{ "ts":
{ "$eq" : 3}
},
{ "ts":
{ "$eq" : 5}
}
]
}
}
},{"states.$":1 })
But this returns only the first "state" document where the "eq" occurred.
How can I return all the matching documents?
Thank you.
You can use aggregation pipelines
db.getCollection('logs').aggregate([
{
$unwind: "$states"
},
{
$match: {
$or: [
{ "states.ts": 3 },
{ "states.ts": 5 },
]
}
},
{
$group: {
_id: "$_id",
"key1": { $first: "key1" },
"currentTime": { $first: "currentTime" },
"iState": { $first: "$iState" },
"errGyro": { $first: "$errGyro" },
states: { $push: "$states" }
}
}
])
As $elemMatch returns only the first matching element of array, you have to use aggregation to achieve your goal. Here's the query :
db.collection.aggregate([
{
$match: {
$or: [
{
"states.ts": {
$eq: 3
}
},
{
"states.ts": {
$eq: 5
}
}
]
}
},
{
$project: {
states: 1
}
},
{
$unwind: "$states"
},
{
$match: {
$or: [
{
"states.ts": {
$eq: 3
}
},
{
"states.ts": {
$eq: 5
}
}
]
}
},
{
$group: {
_id: "$_id",
states: {
$push: "$states"
}
}
}
])
The first $match and $project stages are there for query optimization and memory savings, in case many documents are returned.

Count all occurrences per field in one index

I want to count how many entries i have for each field in my elasticsearch DB for one index. I have tried with the code below, but this only returns the total number of entries. I'm working in Python.
What I have tried so far:
qry = {
"aggs": {
"field": {
"terms" : {"field": "field"}
}
}, "size": 0
}
r = es.search(body=qry,
index="webhose_english")
My current result:
Out[64]:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
'aggregations': {'field': {'buckets': [],
'doc_count_error_upper_bound': 0,
'sum_other_doc_count': 0}},
'hits': {'hits': [], 'max_score': 0.0, 'total': 4519134},
'timed_out': False,
'took': 16}
And I would ideally have something like:
{'field_1': 321,
'field_2': 231,
'field_3': 132}
This information used to be part of the _field_stats API, but it has been removed in 6.0. So you are on the right track, you will need an aggregation. I think value_count is the one you need and for good measure I've added global as well, so we know how many documents are there in total.
Three sample docs:
PUT foo/_doc/1
{
"foo": "bar"
}
PUT foo/_doc/2
{
"foo": "bar",
"bar": "bar"
}
PUT foo/_doc/3
{
"foo": "bar",
"bar": "bar",
"baz": "bar"
}
Aggregation (I'm not sure if there might be a shorter version of this especially with many fields):
GET foo/_search
{
"aggs": {
"count_fields": {
"global": {},
"aggs": {
"count_foo": {
"value_count": {
"field": "foo.keyword"
}
},
"count_bar": {
"value_count": {
"field": "bar.keyword"
}
},
"count_baz": {
"value_count": {
"field": "baz.keyword"
}
}
}
}
},
"size": 0
}
Result:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"count_fields" : {
"doc_count" : 3,
"count_foo" : {
"value" : 3
},
"count_bar" : {
"value" : 2
},
"count_baz" : {
"value" : 1
}
}
}
}
I did it by iterating over the following query and then collecting the "total" values in a dictionary:
qry = {
"query": {
"exists": {
"field": "fields_to_iterate"
}
}
}

Obtain records based on matching key value pairs and comparing date in Python

I have a following collection in MongoDB:
{
"_id" : ObjectId("5bbc86e5c16a27f1e1bd39f8"),
"name" : "swetha",
"nameId" : 123,
"source" : "Blore",
"sourceId" : 10,
"LastUpdate" : "10-Oct-2018"
}
{
"_id" : ObjectId("5bbc86e5c16a27f1e1bd39f9"),
"name" : "swetha",
"nameId" : 123,
"source" : "Mlore",
"sourceId" : "11",
"LastUpdate" : "11-Oct-2018"
}
{
"_id" : ObjectId("5bbc86e5c16a27f1e1bd39fa"),
"name" : "swathi",
"nameId" : 124,
"source" : "Mlore",
"sourceId" : "11",
"LastUpdate" : "9-Oct-2018"
}
I am a beginner to Python and want to compare the 'LastUpdate' between the above records based on matching 'name' or 'nameId' and want to push the record with latest date to another collection. E.g. name:'Swetha' is same in first two records. So compare 'LastUpdate' between them and output the record with latest date.
I have written following code to read data records from MongoDB and to print. I didn't understand how to compare records within a same key and compare their timestamp though I referred few resources on Google.
import json
import pandas as pd
from pymongo import MongoClient
try:
client = MongoClient()
print("Connected successfully!!!")
except:
print("Could not connect to MongoDB")
# database
db = client.conn
collection = db.contactReg
df = collection.find()
for row in df:
print(row)
Links that are ref
Is there a better way to compare dictionary values
https://gis.stackexchange.com/questions/87276/how-to-compare-values-from-a-column-in-attribute-table-with-values-in-dictionary
Comparing two dictionaries and printing key value pair in python and few more.
I think what you need is an aggregation. This might look big but once you get the hang out of mongo aggregations you'll get comfortable.
df = collection.aggregate([
{
"$project": {
"_id": 0,
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1,
"LastUpdateArray": {
"$split": [
"$LastUpdate",
"-"
]
}
}
},
{
"$project": {
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1,
"LastUpdateArray": 1,
"LastUpdateMonth": {
"$arrayElemAt": [
"$LastUpdateArray",
1
]
}
}
},
{
"$project": {
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1,
"Year": {
"$arrayElemAt": [
"$LastUpdateArray",
2
]
},
"Date": {
"$arrayElemAt": [
"$LastUpdateArray",
0
]
},
"Month": {
"$switch": {
"branches": [
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Jan"
]
},
"then": "01"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Feb"
]
},
"then": "02"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Mar"
]
},
"then": "03"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Apr"
]
},
"then": "04"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"May"
]
},
"then": "05"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Jun"
]
},
"then": "06"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Jul"
]
},
"then": "07"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Aug"
]
},
"then": "08"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Sep"
]
},
"then": "09"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Oct"
]
},
"then": "10"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Nov"
]
},
"then": "11"
},
{
"case": {
"$eq": [
"$LastUpdateMonth",
"Dec"
]
},
"then": "12"
}
],
"default": "01"
}
}
}
},
{
"$project": {
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1,
"Year": 1,
"Date": 1,
"Month": 1,
"DateString": {
"$concat": [
"$Year",
"-",
"$Month",
"-",
"$Date"
]
}
}
},
{
"$project": {
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1,
"Date": {
"$dateFromString": {
"dateString": "$DateString"
}
}
}
},
{
"$sort": {
"Date": -1
}
},
{
"$group": {
"_id": "$name",
"name": {
"$first": "$name"
},
"nameId": {
"$first": "$nameId"
},
"source": {
"$first": "$source"
},
"sourceId": {
"$first": "$sourceId"
},
"LastUpdate": {
"$first": "$LastUpdate"
},
"Date": {
"$first": "$Date"
}
}
},
{
"$project": {
"name": 1,
"nameId": 1,
"source": 1,
"sourceId": 1,
"LastUpdate": 1
}
}
])
The first 5 steps of aggregation, I tried to convert it into a date and then sort descending by date. In group by I grouped with name and took the first that comes with that name.
Hope this helps.
I'm assuming what you need is duplicate records and I'm taking the first one that comes. Reference : https://stackoverflow.com/a/26985011/7630071
df = collection.aggregate([
{
"$group": {
"_id": "$name",
"count": {
"$sum": 1
},
"data": {
"$push": {
"nameId": "$nameId",
"source": "$source",
"sourceId": "$sourceId",
"LastUpdate": "$LastUpdate"
}
}
}
},
{
"$match": {
"_id": {
"$ne": null
},
"count": {
"$gt": 1
}
}
}
])

Set difference between unions of specific subfields

I have a large collection that can be modeled more or less as the one created by the following code:
import string
from random import randint, random, choice
documents = []
for i in range(100):
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': "CDE",
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
documents.append({'hgvs_id': "".join([str(randint(0,9)), letters]),
'sample_id': 'GEF',
'number': i*random()*50 - 30 })
for i in range(10): # add some unique values for sample_id 'ABC'
letters = choice(string.letters[0:15])
documents.append({'hgvs_id': "55" + letters,
'sample_id': 'ABC',
'number': i*random()*50 - 30 })
collection.insert_many(documents)
I am trying to retrieve the unique hgvs_id's that occur within documents that have a specific sample_id (ABC here) but not in documents containing the other two. Usually, there will be many more sample_id than just three.
It sounds pretty simple, but so far I have been unsuccessful. Given the size of the collection I'm working with (~30GB), I've been trying to use the aggregate framework as follows:
sample_1 = collection.aggregate(
[
{'$group':
{
'_id': '$hgvs_id',
#'sample_id' : {"addToSet": '$hgvs_id'},
'matchedDocuments':
{'$push':
{
'id': '$_id',
'sample_name': "$sample_id",
'hgvs_ids': "$hgvs_id"
}
},
}
},
{'$match': {
"$and": [
{'matchedDocuments': {"$elemMatch": {'sample_name': 'ABC'}}},
# Some other operation????
]
}
}
]) #, allowDiskUse=True) may be needed
This returns (understandably) all the hgvs_id's having sample_id equal ABC. Any leads would be more than appreciated.
If it's the only sample_id in the "set" of grouped values then the $size will be one:
With MongoDB 3.4 you can use $in in combination:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$in": [ "ABC", "$samples" ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Otherwise use $setIntersection which is just a little longer in syntax:
[
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$redact": {
"$cond": {
"if": {
"$and": [
{ "$eq": [ { "$size": { "$setIntersection": [ "$samples", ["ABC"] ] } }, 1 ] },
{ "$eq": [ { "$size": "$samples" }, 1 ] }
]
},
"then": "$$KEEP",
"else": "$$PRUNE"
}
}}
]
Or probably in the simplest form for all versions supporting aggregation anyway:
{ "$group": {
"_id": "$hgvs_id",
"samples": { "$addToSet": "$sample_id" }
}},
{ "$match": {
"$and": [{ "samples": "ABC" },{ "samples": { "$size": 1 } }]
}}
]
The same principle applies to any number of arguments: the size of the "set" produced must match the number of arguments given, and the set must contain the specific values.

Return certain fields in mongodb

I am trying to return just the arrays that meet my criteria. Here is what i have:
{
"_id": 1,
"awardAmount": 20000,
"url": "www.url.com",
"numAwards": 2,
"award": "Faculty Research Grant",
"Type": "faculty",
"Applicants": [
{
"preAwards": "NO1",
"Name": "Omar1",
"School": "SCSU1",
"citizenship": "YES1",
"budget": 1,
"Advisor": "Dr. DaPonte1",
"Major": "CSC1",
"appId": 100,
"Research": "Test data entry1",
"Time": "12 months1",
"URL": "www.url.com",
"Evaluators": [
{
"abstractScore": 11,
"evalNum": 1,
"goalsObjectivesScore": 11
},
{
"abstractScore": 22,
"evalNum": 2,
"goalsObjectivesScore": 22
}
]
},
{
"preAwards": "NO2",
"citizenship": "YES2",
"Major": "CSC2",
"Time": "12 months2",
"budget": 2,
"URL": "www.2.com",
"appId": 200,
"Advisor": "Dr. DaPonte2",
"Name": "Omar2",
"Research": "Test data entry2",
"School": "SCSU2",
"url": "www.2.com"
},
{
"preAwards": "NO3",
"citizenship": "YES3",
"Major": "CSC3",
"Time": "12 months3",
"budget": 3,
"URL": "www.3.com",
"appId": 300,
"Advisor": "Dr. DaPonte3",
"Name": "Omar3",
"Research": "Test data entry3",
"School": "SCSU3",
"url": "www.3.com",
"Evaluators": [
{
"abstractScore": 454,
"evalNum": 1,
"goalsObjectivesScore": 4546
}
]
}
]
}
I want to get back just the applicants that don't have Evaluators fields.
{
"_id": 1,
"awardAmount": 20000,
"url": "www.url.com",
"numAwards": 2,
"award": "Faculty Research Grant",
"Type": "faculty",
"Applicants": [
{
"preAwards": "NO2",
"citizenship": "YES2",
"Major": "CSC2",
"Time": "12 months2",
"budget": 2,
"URL": "www.2.com",
"appId": 200,
"Advisor": "Dr. DaPonte2",
"Name": "Omar2",
"Research": "Test data entry2",
"School": "SCSU2",
"url": "www.2.com"
}
]
}
This is just an example of one document. I want all the Applicants with no Evaluators fields in all documents.
Using aggregation with pymongo
col.aggregate([{"$unwind": "$Applicants"}, {"$match" : {"Applicants.Evaluators": {"$exists": False}}}]))
Output
{'ok': 1.0,
'result': [{'Applicants': {'Advisor': 'Dr. DaPonte2',
'Major': 'CSC2',
'Name': 'Omar2',
'Research': 'Test data entry2',
'School': 'SCSU2',
'Time': '12 months2',
'URL': 'www.2.com',
'appId': 200,
'budget': 2,
'citizenship': 'YES2',
'preAwards': 'NO2',
'url': 'www.2.com'},
'Type': 'faculty',
'_id': 1,
'award': 'Faculty Research Grant',
'awardAmount': 20000,
'numAwards': 2,
'url': 'www.url.com'}]}
In mongo shell you can do this:
db.test.find(
{
Applicants : { $elemMatch : { "Evaluators" : { $exists : 0 } }}
},
{
"_id" : 1,
"awardAmount" : 1,
"url" : 1,
"numAwards" : 2,
"award" : 1,
"Type" : 1,
'Applicants.$' : 1,
});
One problem is that the above query returns only one Applicant with no Evaluators in it; the complete solution can be achieved via aggregation:
db.test.aggregate(
[
{ $match : { Applicants : { $elemMatch : { "Evaluators" : { $exists : 0 } } } } },
{ $unwind : "$Applicants" },
{ $match : { "Applicants.Evaluators" : { $exists : 0 } } },
{
$group :
{
_id : '$_id',
Applicants : { $push : '$Applicants' },
awardAmount : { $first : '$awardAmount' } ,
url : { $first : '$url' } ,
numAwards : { $first : '$numAwards' } ,
award : { $first : '$award' } ,
Type : { $first : '$Type' } ,
}
}
]
)
If I understand your question correctly I would suggest using the aggregation pipeline to $unwind the documents on your 'Applicants' field. You can then filter the resulting documents using $match to remove the documents where 'Evaluators' exist then $group them back together using $first and $push. Hope this is of some help.

Categories

Resources