Building a histogram from MongoDB with Pymongo - python

I'm trying to create a histogram of MongoDB documents in the following format:
{
"_id":1
"Properties":[
{
"type": "a"
},
{
"type": "d"
}
]
}
{
"_id":2
"Properties":[
{
"type": "c"
},
{
"type": "a"
}
]
}
{
"_id":3
"Properties":[
{
"type": "c"
},
{
"type": "d"
}
]
}
The output in this example should be:
a = 2
c = 2
d = 2
My workaround at the moment includes querying the entire collection with:
collection.find({})
And afterwards traversing and accumulating the data with a python dictionary.
I'm sure that there's a better way to do this in the MongoDB query itself, can I achieve this data in a single query as I suspect?
Note that I don't know which "types" I may find before the query is performed.

In this case, you can use MongoDB aggregation.
More about Aggregation: https://docs.mongodb.org/manual/core/aggregation-introduction/
db.collection.aggregate([
{ $unwind : "$Properties" },
{ $group: { _id: "$Properties.type", count: { $sum: 1 } } }
]);
Output:
{
"result" : [
{
"_id" : "c",
"count" : 2.0000000000000000
},
{
"_id" : "d",
"count" : 2.0000000000000000
},
{
"_id" : "a",
"count" : 2.0000000000000000
}
],
"ok" : 1.0000000000000000
}
In Python:
from pymongo import MongoClient
if __name__ == '__main__':
db = MongoClient().test
pipeline = [
{ "$unwind" : "$Properties" },
{ "$group": { "_id": "$Properties.type", "count": { "$sum": 1 } } }
]
print list(db.collection.aggregate(pipeline))
Output:
[{u'count': 2, u'_id': u'c'}, {u'count': 2, u'_id': u'd'}, {u'count': 2, u'_id': u'a'}]

Not sure if this fits your scenario, but you can count each type separately, like:
count_a = collection.find({'Properties.type':'a'}).count()
count_b = collection.find({'Properties.type':'b'}).count()
count_c = collection.find({'Properties.type':'c'}).count()
If you do not know the type in advance, you can store it in a variable once you know it and then simply do something like:
mistery_type = 'assign the misery type in var when you know it'
mistery_type_count = collection.find({'Properties.type': mistery_type}).count()

Related

Aggregation function for Counting of Duplicates in a field based on duplicate items in another field

I am using mongoengine as the ORM with a Flask application. The model class is defined like this:
class MyData(db.Document):
task_id = db.StringField(max_length=50, required=True)
url = db.URLField(max_length=500,required=True,unique=True)
organization = db.StringField(max_length=250,required=True)
val = db.StringField(max_length=50, required=True)
The field organization can be repeating and I want to get the count of duplicates with respect to values in another field. For example if the data in mongodb is like
[{"task_id":"as4d2rds5","url":"https:example1.com","organization":"Avengers","val":"null"},
{"task_id":"rfre43fed","url":"https:example1.com","organization":"Avengers","val":"valid"},
{"task_id":"uyje3dsxs","url":"https:example2.com","organization":"Metro","val":"valid"},
{"task_id":"ghs563vt6","url":"https:example1.com","organization":"Avengers","val":"invalid"},
{"task_id":"erf6egy64","url":"https:example2.com","organization":"Metro","val":"null"}]
Then I am querying all the objects using
data = MyData.objects()
I want a response like
[{"url":"https:example1.com","Avengers":{"valid":1,"null":1,"invalid":1}},{"url":"https:example2.com",Metro":{"valid":1,"null":1,"invalid":0}}]
I tried like
db.collection.aggregate([
{
"$group": {
"_id": "$organization",
"count": [
{
"null": {
"$sum": 1
},
"valid": {
"$sum": 1
},
"invalid": {
"$sum": 1
}
}
]
}
}
])
but I am getting an error
The field 'count' must be an accumulator object
Maybe something like this:
db.collection.aggregate([
{
"$group": {
"_id": {
k: "$organization",
v: "$val"
},
"cnt": {
$sum: 1
}
}
},
{
$project: {
_id: 0,
k: "$_id.k",
o: {
k: "$_id.v",
v: "$cnt"
}
}
},
{
$group: {
_id: "$k",
v: {
$push: "$o"
}
}
},
{
$addFields: {
v: {
"$arrayToObject": "$v"
}
}
},
{
$project: {
_id: 0,
new: [
{
k: "$_id",
v: "$v"
}
]
}
},
{
"$addFields": {
"new": {
"$arrayToObject": "$new"
}
}
},
{
"$replaceRoot": {
"newRoot": "$new"
}
}
])
Explained:
Group to count
Project for arrayToObject
Group to join the values
arrayToObject one more time
project additionally
arrayToObject to form the final object
project one more time
replaceRoot to move the object to root.
P.S.
Please note that this solution does not show values that are missing for an organization; if you need the missing values, an additional mapping / $mergeObjects step needs to be added.
playground1
Option with missing values ( if possible values are fixed to null,valid,invalid) :
Just replace the $addFields stage that builds v (the one applying $arrayToObject to "$v") with:
{
$addFields: {
v: {
"$mergeObjects": [
{
"null": 0,
valid: 0,
invalid: 0
},
{
"$arrayToObject": "$v"
}
]
}
}
}
playground2
++url:
playground3

MongoDB - How to aggregate data for each record

I have some stored data like this:
{
"_id" : 1,
"serverAddresses" : {
"name" : "0.0.0.0:8000",
"name2": "0.0.0.0:8001"
}
}
I need aggregated data to this:
[
{
"gameId": "1",
"name": "name1",
"url": "0.0.0.0:8000"
},
{
"gameId": "1",
"name": "name2",
"url": "0.0.0.0:8001"
}
]
What is the solution without using for loop?
$project - Add an addresses field by converting $serverAddresses to a (key-value) array.
$unwind - Deconstruct the addresses field into multiple documents.
$replaceRoot - Decorate the output document based on (2).
db.collection.aggregate([
{
"$project": {
"addresses": {
"$objectToArray": "$serverAddresses"
}
}
},
{
$unwind: "$addresses"
},
{
"$replaceRoot": {
"newRoot": {
gameId: "$_id",
name: "$addresses.k",
address: "$addresses.v"
}
}
}
])
Sample Mongo Playground

PyMongo not returning results on aggregation

I'm a total beginner in PyMongo. I'm trying to find activities that are registered multiple times. This code is returning an empty list. Could you please help me in finding the mistake:
rows = self.db.Activity.aggregate( [
{ '$group':{
"_id":
{
"user_id": "$user_id",
"transportation_mode": "$transportation_mode",
"start_date_time": "$start_date_time",
"end_date_time": "$end_date_time"
},
"count": {'$sum':1}
}
},
{'$match':
{ "count": { '$gt': 1 } }
},
{'$project':
{"_id":0,
"user_id":"_id.user_id",
"transportation_mode":"_id.transportation_mode",
"start_date_time":"_id.start_date_time",
"end_date_time":"_id.end_date_time",
"count": 1
}
}
]
)
5 rows from db:
{ "_id" : 0, "user_id" : "000", "start_date_time" : "2008-10-23 02:53:04", "end_date_time" : "2008-10-23 11:11:12" }
{ "_id" : 1, "user_id" : "000", "start_date_time" : "2008-10-24 02:09:59", "end_date_time" : "2008-10-24 02:47:06" }
{ "_id" : 2, "user_id" : "000", "start_date_time" : "2008-10-26 13:44:07", "end_date_time" : "2008-10-26 15:04:07" }
{ "_id" : 3, "user_id" : "000", "start_date_time" : "2008-10-27 11:54:49", "end_date_time" : "2008-10-27 12:05:54" }
{ "_id" : 4, "user_id" : "000", "start_date_time" : "2008-10-28 00:38:26", "end_date_time" : "2008-10-28 05:03:42" }
Thank you
When you pass _id: 0 in the $project stage, it will not project the sub-objects even if they are projected in the follow up, since the rule is overwritten.
Try the below $project stage.
{
'$project': {
"user_id":"_id.user_id",
"transportation_mode":"_id.transportation_mode",
"start_date_time":"_id.start_date_time",
"end_date_time":"_id.end_date_time",
"count": 1
}
}
rows = self.db.Activity.aggregate( [
{
'$group':{
"_id": {
"user_id": "$user_id",
"transportation_mode": "$transportation_mode",
"start_date_time": "$start_date_time",
"end_date_time": "$end_date_time"
},
"count": {'$sum':1}
}
},
{
'$match':{
"count": { '$gt': 1 }
}
},
{
'$project': {
"user_id":"_id.user_id",
"transportation_mode":"_id.transportation_mode",
"start_date_time":"_id.start_date_time",
"end_date_time":"_id.end_date_time",
"count": 1,
}
}
])
Your group criteria is likely too narrow.
The $group stage will create a separate output document for each distinct value of the _id field. The pipeline in the question will only include two input documents in the same group if they have exactly the same value in all four of those fields.
In order for a count to be greater than 1, there must exist 2 documents with the same user, mode, and exactly the same start and end.
In the sample data you show, there are no two documents that would be in the same group, so all of the output documents from the $group stage would have a count of 1, and therefore none of them satisfy the $match, and the return is an empty list.

Basic request to mongodb with pymongo

I need to get all objects inside "posts" that have "published: true"
with pymongo. I've tried already so many variants but all I can do:
for elt in db[collection].find({}, {"posts"}):
print(elt)
And it'll show all "posts". I've tried something like this:
for elt in db[collection].find({}, {"posts", {"published": {"$eq": True}}}):
print(elt)
But it doesn't work. Help, I'm trying for 3 days already =\
What you want to be doing is to use the aggregate $filter like so:
db[collection].aggregate([
{
"$match": { // only fetch documents with such posts
"posts.published": {"$eq": True}
}
},
{
"$project": {
"posts": {
"$filter": {
"input": "$posts",
"as": "post",
"cond": {"$eq": ["$$post.published", True]}
}
}
}
}
])
Note that the current structure returned will be:
[
{posts: [post1, post2]},
{posts: [post3, post4]}
]
If you want to retrieve it as a list of posts you'll need to add an $unwind stage to flatten the array.
The query options are quite limited: you can do it with $elemMatch (projection) or with the $ operator, but both of these return only the first post that matches the condition, which is not what you want.
------- EDIT --------
Realizing posts is actually an object and not an array, you'll have to turn the object to an array, iterate over to filter and then restore the structure like so:
db.collection.aggregate([
{
$project: {
"posts": {
"$arrayToObject": {
$filter: {
input: {
"$objectToArray": "$posts"
},
as: "post",
cond: {
$eq: [
"$$post.v.published",
true
]
}
}
}
}
}
}
])
Mongo Playground
I assumed that your document looks like this:
{
"_id" : ObjectId("5f8570f8afdefd2cfe7473a7"),
"posts" : {
"a" : {
"p" : false,
"name" : "abhishek"
},
"k" : {
"p" : true,
"name" : "jack"
},
"c" : {
"p" : true,
"name" : "abhinav"
}
}}
You can try the following query but the result format will be a bit different, adding that for clarification,
db.getCollection('temp2').aggregate([
{
$project: {
subPost: { $objectToArray: "$posts" }
}
},
{
'$unwind' : '$subPost'
},
{
'$match' : {'subPost.v.p':true}
},
{
'$group': {_id:'$_id', subPosts: { $push: { subPost: "$subPost"} }}
}
])
result format,
{
"_id" : ObjectId("5f8570f8afdefd2cfe7473a7"),
"subPosts" : [
{
"subPost" : {
"k" : "k",
"v" : {
"p" : true,
"name" : "jack"
}
}
},
{
"subPost" : {
"k" : "c",
"v" : {
"p" : true,
"name" : "abhinav"
}
}
}
]
}

Elastic Search composite grouping with range

Consider the following documents in my Elasticsearch index. I want to group the documents based on rank, but any rank below 1000 must be displayed individually and anything above 1000 must be grouped together. How do I achieve this using a composite aggregation? I am new to this, and I am using composite because I want to use the after key function to allow pagination.
Documents
{
rank : 200,
name:abcd,
score1 :100,
score2:200
},
{
rank 300,
name:abcd,
score1:100,
score2:200
}
Expected Result:
{
key:{
rank:101
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
}
{
key:{
rank:1000-*
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
},
{
key:{
rank:300
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
}
######## QUery that I tried
{
"query":{"match_all":{}},
"aggs":{
"_scores":{
"composite"{
"sources":[
{"_rank":{"terms":{"field":"rank"}}}
]
}
},
"aggs":{
"_ranks":{
"field":"rank:[
{"to":1000},
{"from":1000}
]
}
"_score1": {"sum": {"field": "score1"}}
"_score2": {"sum": {"field": "score2"}}
}
}
}
From what I understand, you want to
Group the aggregations whose value is below 1000 rank to their own buckets
Group the aggregations whose value is 1000 and above to a single bucket with key 1000-*
And for each bucket, calculate the sum of _score1 of all documents in that bucket
Similarly calculate the sum of _score2 of all buckets
For this scenario, you can simply make use of Terms Aggregation as I've mentioned in below answer.
I've mentioned sample mapping, sample documents, query and response so that you'll have clarity on what's happening.
Mapping:
PUT my_sample_index
{
"mappings": {
"properties": {
"rank":{
"type": "integer"
},
"name":{
"type": "keyword"
},
"_score1": {
"type":"integer"
},
"_score2":{
"type": "integer"
}
}
}
}
Sample Documents:
POST my_sample_index/_doc/1
{
"rank": 100,
"name": "john",
"_score1": 100,
"_score2": 100
}
POST my_sample_index/_doc/2
{
"rank": 1001, <--- Rank > 1000
"name": "constantine",
"_score1": 200,
"_score2": 200
}
POST my_sample_index/_doc/3
{
"rank": 200,
"name": "bruce",
"_score1": 100,
"_score2": 100
}
POST my_sample_index/_doc/4
{
"rank": 2001, <--- Rank > 1000
"name": "arthur",
"_score1": 200,
"_score2": 200
}
Aggregation Query:
POST my_sample_index/_search
{
"size":0,
"aggs": {
"_score": {
"terms": {
"script": {
"source": """
if(doc['rank'].value < 1000){
return doc['rank'];
}else
return '1000-*';
"""
}
},
"aggs":{
"_score1_sum":{
"sum": {
"field": "_score1"
}
},
"_score2_sum":{
"sum":{
"field": "_score2"
}
}
}
}
}
}
Note that I've used a scripted Terms Aggregation, where I've put my logic in the script. I believe the logic is self-explanatory once you go through it.
Response:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"_score" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1000-*", <---- Note this
"doc_count" : 2, <---- Note this
"_score2_sum" : {
"value" : 400.0
},
"_score1_sum" : {
"value" : 400.0
}
},
{
"key" : "100",
"doc_count" : 1,
"_score2_sum" : {
"value" : 100.0
},
"_score1_sum" : {
"value" : 100.0
}
},
{
"key" : "200",
"doc_count" : 1,
"_score2_sum" : {
"value" : 100.0
},
"_score1_sum" : {
"value" : 100.0
}
}
]
}
}
}
Note that there are two keys having rank > 1000, both of their scores for _score1 and _score2 sum to 400, which is what is expected.
Let me know if this helps!

Categories

Resources