Elasticsearch query - print certain field based on other field - python

My goal is to find the maximum value of one field and print another field from the document that contains it.
My query so far:
{
"fields": ["text"], //NOT WORKING
"query": {
"query_string": {
"query": "_type:bmw AND _exists_:car_type",
"analyze_wildcard": True
}
},
"size": 0,
"aggs": {
"2": {
"terms": {
"field": "compound",
"size": 5,
"order": {
"2-orderAgg": "desc"
}
},
"aggs": {
"2-orderAgg": {
"max": {
"field": "compound"
}
}
}
}
}
}
Result is
'buckets': [{'doc_count': 1, '2-orderAgg': {'value': 0.8442}, 'key': 0.8442}, {'doc_count': 1, '2-orderAgg': {'value': 0.7777}, 'key': 0.7777}, {'doc_count': 1, '2-orderAgg': {'value': 0.7579}, 'key': 0.7579}, {'doc_count': 1, '2-orderAgg': {'value': 0.6476}, 'key': 0.6476}, {'doc_count': 1, '2-orderAgg': {'value': 0.6369}, 'key': 0.6369}]
Now I need to print the text field of the document whose compound value is 0.8442, and so on for the other buckets. Thank you for your advice.

I achieved this with a small workaround. It's not pretty, but in the end I got what I wanted.
First I used the response from the first query. Then I took every key from its buckets and ran a new query for each one to find the id of the matching document.
{
"size": 0,
"query": {
"query_string": {
"analyze_wildcard": True,
"query": "_type:bmw AND compound:"+str(0.8442)+" AND _exists_:car_type"
}
},
"aggs": {
"3": {
"terms": {
"field": "id_str",
"size": 20,
"order": {
"_count": "desc"
}
}
}
}
}
Then I iterate through the response and search for each document by this id field:
# For every bucket produced by the previous `terms` aggregation on id_str,
# fetch the matching document and print its `text` field.
# NOTE(review): `res1` is assumed to be the list of aggregation buckets
# (each a dict with a 'key' entry) — confirm against how it is built.
for bucket in res1:
    res3 = es.search(index='indexname', body={
        "size": 1,  # only one document is needed per id
        "query": {
            "bool": {
                "must": [
                    {
                        "match": {
                            "id_str": bucket['key']
                        }
                    }
                ]
            }
        }
    })
    for hit in res3['hits']['hits']:
        print(hit['_source']['text'])
now result is
Diamond stitch leather is a great addition to any custom vehicle. Prices start from 2k! #bmw i8 getting under car...
which is text what I wanted.

Related

groupby query on joined collection in flask mongoDB

I am relatively new to MongoDB and am currently stuck on this problem: for a specific user, I have to retrieve the number of reports (the count of reports made by that user) together with the user's name (name), the last reported time (the time of the most recently reported post) and the last reason (report_description).
I have been stuck on this for two days now, so any help will be appreciated.
reported posts collection
{
"created_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"updated_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"post_id": {
"$oid": "61955ac35b3475f1d9759255"
},
"user_id": 2,
"report_type": "this is test",
"report_description": "this"
}
Post collection
{
"created_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"updated_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"user_id": 8,
"privacy_type": "public",
"post_type": "POST",
"post": "Om Sai Ram",
"total_like": 7,
"total_comment": 0,
"total_share": 0,
"image_url_list": [{
"image_url": "post_images/user-8/a31e39334987463bb9faa964391a935e.jpg",
"image_ratio": "1"
}],
"video_url_list": [],
"tag_list": [],
"is_hidden": false
}
User collection
{
"name": "sathish",
"user_id": 1,
"device_id": "faTOi3aVTjyQnBPFz0L7xm:APA91bHNLE9anWYrKWfwoHgmGWL2BlbWqgiVjU5iy7JooWxu26Atk9yZFxVnNp2OF1IXrXm4I6HdVJPGukEppQjSiUPdMoQ64KbOt78rpctxnYWPWliLrdxc9o1VdKL0DGYwE7Y6hx1H",
"user_name": "sathishkumar",
"updated_at": {
"$date": "2021-11-17T19:13:52.668Z"
},
"profile_picture_url": "1"
}
flask_snip.py
flagged_posts = mb.db_report.aggregate([{
'$group':{
'_id':'$user_id',
}
}])
The expected output should be a list, e.g.:
[
{
'user_id':1,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
{
'user_id':2,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of last_reported_post',
'post_link':'someurl',
},
{
'user_id':3,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
]
Starting from the reported collection, you can $group to get the last_reason and last_reported_time. Then, perform a $lookup to user collection to get the name.
// Summarise reports per reporting user: number of reports, time and reason of
// the most recent report, and the user's name from the `user` collection.
db.reported.aggregate([
  // Newest first, so that $first inside $group picks the latest report.
  {
    "$sort": {
      "updated_at": -1
    }
  },
  {
    "$group": {
      "_id": "$user_id",
      "last_reported_time": {
        "$first": "$updated_at"
      },
      "last_reason": {
        "$first": "$report_description"
      },
      "no_of_reports": {
        "$sum": 1
      }
    }
  },
  {
    "$lookup": {
      "from": "user",
      "localField": "_id",
      "foreignField": "user_id",
      "as": "userLookup"
    }
  },
  // Flatten the single-element lookup array; groups whose user_id has no
  // matching user document are dropped here.
  {
    "$unwind": "$userLookup"
  },
  {
    "$project": {
      // The grouping key is re-exposed as user_id, so hide the duplicate _id.
      "_id": 0,
      "user_id": "$_id",
      // The question asks for the user's `name` field, which the user
      // document stores separately from `user_name`.
      "name": "$userLookup.name",
      "no_of_reports": 1,
      "last_reported_time": 1,
      "last_reason": 1
    }
  }
])
Here is the Mongo playground for your reference.

MongoDB aggregation to join 2 fields not working

I have 2 mongo db collections, 'Contacts' and 'Messages'. Both collections share the phone number field(Primary/Foreign Key relation in SQL).
Contacts collection has this field as follows:
{
"phone": "+192******",
"name": "test"
}
and Messages as follows:
{
"Tel": "tel:+192******"
}
I want to aggregate the 2 collections such that I can have this nested document:
"text": "text sent by user",
"contact": {
"phone": "+192******",
"name": "test"
}
So far, I have tried the following aggregation but it doesn't work:
cursor = messages_client.aggregate([{
'$lookup':
{
'from': "contacts",
'let': { 'phone': "$phone"},
'pipeline': [
{ '$addFields': { 'phone_number': { "$substr": [ "$Tel", 4, -1 ] }}},
{'$match': { "$expr": { '$eq': [ '$phone_number', '$$phone']}}}
],
'as': 'contact'
}}
], allowDiskUse=True)
Could someone kindly help me? I'm using pymongo and Python3 if that is helpful.
I found some help in the $indexOfCP operator; here it is for anyone with a similar problem.
# Attach to each message the contact whose phone number occurs inside the
# message's "Tel" value (e.g. "tel:+192..." contains "+192...").
cursor = messages_client.aggregate([{
    '$lookup': {
        'from': "contacts",
        # `let` is evaluated against the message document, so it must expose
        # the message's "Tel" field (messages have no "phone" field).
        'let': {'tel': "$Tel"},
        'pipeline': [
            # Inside the sub-pipeline, "$phone" refers to the contact document.
            # $indexOfCP returns -1 when the phone is not a substring of Tel.
            {'$match': {"$expr": {'$gt': [{"$indexOfCP": ["$$tel", "$phone"]}, -1]}}}
        ],
        'as': 'contact'
    }}
], allowDiskUse=True)
Try this:
// Join every message to its contact by phone number.
db.messages.aggregate([
{
// Derive the bare number from "Tel" by skipping its first 4 characters
// (the "tel:" prefix in the sample data); a length of -1 makes $substr
// return the rest of the string.
$addFields: {
'phone_number': { "$substr": ["$Tel", 4, -1] }
}
},
{
$lookup: {
from: "contacts",
// Expose the derived number to the sub-pipeline as $$phone.
let: { "phone": "$phone_number" },
pipeline: [
{
$match: {
// "$phone" is the contact's field, "$$phone" the message's derived number.
$expr: { $eq: ["$phone", "$$phone"] }
}
}
],
as: "contact"
}
},
// Turn the "contact" array into a single embedded document; messages
// without a matching contact are dropped by this stage.
{ $unwind: "$contact" }
]);

Count all occurrences per field in one index

I want to count how many entries I have for each field in one index of my Elasticsearch DB. I tried the code below, but it only returns the total number of entries. I'm working in Python.
What I have tried so far:
qry = {
"aggs": {
"field": {
"terms" : {"field": "field"}
}
}, "size": 0
}
r = es.search(body=qry,
index="webhose_english")
My current result:
Out[64]:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
'aggregations': {'field': {'buckets': [],
'doc_count_error_upper_bound': 0,
'sum_other_doc_count': 0}},
'hits': {'hits': [], 'max_score': 0.0, 'total': 4519134},
'timed_out': False,
'took': 16}
And I would ideally have something like:
{'field_1': 321,
'field_2': 231,
'field_3': 132}
This information used to be part of the _field_stats API, but it has been removed in 6.0. So you are on the right track, you will need an aggregation. I think value_count is the one you need and for good measure I've added global as well, so we know how many documents are there in total.
Three sample docs:
PUT foo/_doc/1
{
"foo": "bar"
}
PUT foo/_doc/2
{
"foo": "bar",
"bar": "bar"
}
PUT foo/_doc/3
{
"foo": "bar",
"bar": "bar",
"baz": "bar"
}
Aggregation (I'm not sure if there might be a shorter version of this especially with many fields):
GET foo/_search
{
"aggs": {
"count_fields": {
"global": {},
"aggs": {
"count_foo": {
"value_count": {
"field": "foo.keyword"
}
},
"count_bar": {
"value_count": {
"field": "bar.keyword"
}
},
"count_baz": {
"value_count": {
"field": "baz.keyword"
}
}
}
}
},
"size": 0
}
Result:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"count_fields" : {
"doc_count" : 3,
"count_foo" : {
"value" : 3
},
"count_bar" : {
"value" : 2
},
"count_baz" : {
"value" : 1
}
}
}
}
I did it by iterating over the following query and then collecting the "total" values in a dictionary:
qry = {
"query": {
"exists": {
"field": "fields_to_iterate"
}
}
}

How to perform multiple aggregation on an object in Elasticsearch using Python?

I want to perform date histogram query on my Elasticsearch data which is of the format:
datetime,field_obj and field_obj has three fields in it: a,b,c
Alongside date histogram aggregation, I want to find the average of field_obj i.e avg(field_a), avg(field_b), avg(field_c) also.
I tried working it out like this:
res = es.search(index="demo",body={"from": 0, "size": 0, "query":
{"match_all": {}}, "aggs": {
"date_avg": {
"date_histogram": {"field": "datetime","interval": "year"},
"aggs": {"avg_a": {"avg": {"field": "field.a"}}},
"aggs": {"avg_b": {"avg": {"field": "field.b"}}},
"aggs": {"avg_c": {"avg": {"field": "field.c"}}},
}}
})
However, this query only yields an average of field_c. All the other averages are getting overridden.
Good start! You need to do it like this and it will work:
# Date histogram on `datetime` with the average of field.a, field.b and
# field.c computed inside every bucket.
res = es.search(index="demo",body={
"from": 0,
# No hits are requested; only the aggregation results are of interest.
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"date_avg": {
"date_histogram": {
"field": "datetime",
# NOTE(review): on Elasticsearch 7.2+ `interval` is deprecated in favour
# of `calendar_interval` — confirm against the cluster version in use.
"interval": "year"
},
# All sub-aggregations must live in ONE "aggs" object: repeating the
# "aggs" key in a Python dict keeps only the last occurrence.
"aggs": {
"avg_a": {
"avg": {
"field": "field.a"
}
},
"avg_b": {
"avg": {
"field": "field.b"
}
},
"avg_c": {
"avg": {
"field": "field.c"
}
}
}
}
}
})

Optimizing MongoDB Aggregation Pipeline (Group, Lookup, Match)

I'm new to NoSQL databases, and I chose MongoDB as my first one. I wrote an aggregation pipeline to show the data that I want; here are my sample documents:
Document sample from Users Collection
{
"_id": 9,
"name": "Sample Name",
"email": "email#example.com",
"password": "password hash"
}
Document sample from Pages Collection (this one doesn't really matter)
{
"_id": 42,
"name": "Product Name",
"description": "Product Description",
"user_id": 8,
"rating_categories": [{
"_id": 114,
"name": "Build Quality"
}, {
"_id": 115,
"name": "Price"
}, {
"_id": 116,
"name": "Feature"
}, {
"_id": 117,
"name": "Comfort"
}, {
"_id": 118,
"name": "Switch"
}]
}
Document sample from Reviews Collection
{
"_id": 10,
"page_id": 42, #ID reference from pages collection
"user_id": 8, #ID reference from users collection
"review": "The review of the product",
"ratings": [{
"_id": 114, #ID Reference from pages collection of what rating category it is
"rating": 5
}, {
"_id": 115,
"rating":4
}, {
"_id": 116,
"rating": 5
}, {
"_id": 117,
"rating": 3
}, {
"_id": 118,
"rating": 4
}],
"created": "1582825968963", #Date Object
"votes": {
"downvotes": [],
"upvotes": [9] #IDs of users who upvote this review
}
}
I want to get reviews by page_id which can be accessed from the API i made, here's the expected result from the aggregation:
[
{
"_id": 10, #Review of the ID
"created": "Thu, 27 Feb 2020 17:52:48 GMT",
"downvote_count": 0, #Length of votes.downvotes from reviews collection
"page_id": 42, #Page ID
"ratings": [ #Stores what rate at what rating category id
{
"_id": 114,
"rating": 5
},
{
"_id": 115,
"rating": 4
},
{
"_id": 116,
"rating": 5
},
{
"_id": 117,
"rating": 3
},
{
"_id": 118,
"rating": 4
}
],
"review": "The Review",
"upvote_count": 0, #Length of votes.upvotes from reviews collection
"user": { #User who reviewed
"_id": 8, #User ID
"downvote_count": 0, #How many downvotes this user receive from all of the user's reviews
"name": "Sample Name", #Username
"review_count": 1, #How many reviews the user made
"upvote_count": 1 #How many upvotes this user receive from all of the user's reviews
},
"vote_state": 0 #Determining vote state from the user (who requested to the API) for this review, 0 for no vote, -1 for downvote, 1 for upvote
},
...
]
Here's the pipeline of the aggregation for reviews collection that i made for the result above:
user_id = 9
page_id = 42
pipeline = [
{"$group": {
"_id": {"user_id":"$user_id", "page_id": "$page_id"},
"review_id": {"$last": "$_id"},
"page_id": {"$last": "$page_id"},
"user_id" : {"$last": "$user_id"},
"ratings": {"$last": "$ratings"},
"review": {"$last": "$review"},
"created": {"$last": "$created"},
"votes": {"$last": "$votes"},
"upvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.upvotes", False]},
{"$size": "$votes.upvotes"},
0
]}
},
"downvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.downvotes", False]},
{"$size": "$votes.downvotes"},
0
]}
}}},
{"$lookup": {
"from": "users",
"localField": "user_id",
"foreignField": "_id",
"as": "user"
}},
{"$unwind": "$user"},
{"$lookup": {
"from": "reviews",
"localField": "user._id",
"foreignField": "user_id",
"as": "user.reviews"
}},
{"$addFields":{
"_id": "$review_id",
"user.review_count": {"$size": "$user.reviews"},
"user.upvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.upvotes", False]},
{"$size": "$$this.votes.upvotes"},
0
]}
}
}},
"user.downvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.downvotes", False]},
{"$size": "$$this.votes.downvotes"},
0
]}
}
}},
"vote_state": {"$switch": {
"branches": [
{"case": { "$and" : [
{"$ifNull": ["$votes.upvotes", False]},
{"$in": [user_id, "$votes.upvotes"]}
]}, "then": 1
},
{"case": { "$and" : [
{"$ifNull": ["$votes.downvotes", False]},
{"$in": [user_id, "$votes.downvotes"]}
]}, "then": -1
},
],
"default": 0
}},
}},
{"$project":{
"user.password": 0,
"user.email": 0,
"user_id": 0,
"review_id" : 0,
"votes": 0,
"user.reviews": 0
}},
{"$sort": {"created": -1}},
{"$match": {"page_id": page_id}},
]
Note: A user can write multiple reviews for the same page_id, but only the latest one will be shown.
I'm using pymongo, by the way; that's why the operators are in quotation marks.
My questions are:
Is there any room to optimize my aggregation pipeline?
Is it considered good practice to run several small aggregations to get data like the above, or is it always better to have one big aggregation (or as few as possible) to get the data that I want?
As you can see, every time I access votes.upvotes or votes.downvotes on a document in the reviews collection, I check whether the field is null. That's because votes.upvotes and votes.downvotes are not created when a user writes a review; they are created when a user votes on that review. Should I create empty votes.upvotes and votes.downvotes fields when a user writes a review and remove the $ifNull? Will that improve the performance of the aggregation?
Thanks
Check whether this aggregation performs better.
Create this index if you don't already have it:
db.reviews.create_index([("page_id", 1)])
Note: Performance can be improved even further by avoiding the second $lookup on reviews.
db.reviews.aggregate([
{
$match: {
page_id: page_id
}
},
{
$addFields: {
request_user_id: user_id
}
},
{
$group: {
_id: {
page_id: "$page_id",
user_id: "$user_id",
request_user_id: "$request_user_id"
},
data: {
$push: "$$ROOT"
}
}
},
{
$lookup: {
"from": "users",
"let": {
root_user_id: "$_id.user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$_id"
]
}
}
},
{
$lookup: {
"from": "reviews",
"let": {
root_user_id: "$$root_user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$user_id"
]
}
}
},
{
$project: {
user_id: 1,
downvote_count: {
$size: "$votes.downvotes"
},
upvote_count: {
$size: "$votes.upvotes"
}
}
},
{
$group: {
_id: null,
review_count: {
$sum: {
$cond: [
{
$eq: [
"$$root_user_id",
"$user_id"
]
},
1,
0
]
}
},
upvote_count: {
$sum: "$upvote_count"
},
downvote_count: {
$sum: "$downvote_count"
}
}
},
{
$unset: "_id"
}
],
"as": "stats"
}
},
{
$project: {
tmp: {
$mergeObjects: [
{
_id: "$_id",
name: "$name"
},
{
$arrayElemAt: [
"$stats",
0
]
}
]
}
}
},
{
$replaceWith: "$tmp"
}
],
"as": "user"
}
},
{
$addFields: {
first: {
$mergeObjects: [
"$$ROOT",
{
$arrayElemAt: [
"$data",
0
]
},
{
user: {
$arrayElemAt: [
"$user",
0
]
},
created: {
$toDate: {
$toLong: {
$arrayElemAt: [
"$data.created",
0
]
}
}
},
downvote_count: {
$reduce: {
input: "$data.votes.downvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
upvote_count: {
$reduce: {
input: "$data.votes.upvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
vote_state: {
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.upvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
1,
{
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.downvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
-1,
0
]
}
]
}
}
]
}
}
},
{
$unset: [
"first.data",
"first.votes",
"first.user_id",
"first.request_user_id"
]
},
{
$replaceWith: "$first"
},
{
"$sort": {
"created": -1
}
}
])
MongoPlayground

Categories

Resources