Elasticsearch query - print certain field based on other field

Elasticsearch query - print certain field based on other field - python

My goal is to find the maximum value of one field and print another field from the document that contains it.
My query so far:
{
"fields": ["text"], //NOT WORKING
"query": {
"query_string": {
"query": "_type:bmw AND _exists_:car_type",
"analyze_wildcard": True
}
},
"size": 0,
"aggs": {
"2": {
"terms": {
"field": "compound",
"size": 5,
"order": {
"2-orderAgg": "desc"
}
},
"aggs": {
"2-orderAgg": {
"max": {
"field": "compound"
}
}
}
}
}
}
Result is
'buckets': [{'doc_count': 1, '2-orderAgg': {'value': 0.8442}, 'key': 0.8442}, {'doc_count': 1, '2-orderAgg': {'value': 0.7777}, 'key': 0.7777}, {'doc_count': 1, '2-orderAgg': {'value': 0.7579}, 'key': 0.7579}, {'doc_count': 1, '2-orderAgg': {'value': 0.6476}, 'key': 0.6476}, {'doc_count': 1, '2-orderAgg': {'value': 0.6369}, 'key': 0.6369}]
Now I need to print the text field of the document that contains the compound value 0.8442, and so on for the other buckets. Thank you for your advice.

I achieved this with a small workaround. It's not pretty, but in the end I got what I wanted.
First I used the response from the first query. Then I took all the keys from those bucket dictionaries and, for each key, ran a new query to find the id of the matching document.
{
"size": 0,
"query": {
"query_string": {
"analyze_wildcard": True,
"query": "_type:bmw AND compound:"+str(0.8442)+" AND _exists_:car_type"
}
},
"aggs": {
"3": {
"terms": {
"field": "id_str",
"size": 20,
"order": {
"_count": "desc"
}
}
}
}
}
Then iterate through the response and search for each document by this id field:
for y in res1:
res3 = es.search(index='indexname', body={
"size" : 1,
"query": {
"bool": {
"must": [
{
"match": {
"id_str": y['key']
}
}
]
}
}
})
for x in res3['hits']['hits']:
print (x['_source']['text'])
now result is
Diamond stitch leather is a great addition to any custom vehicle. Prices start from 2k! #bmw i8 getting under car...
which is the text I wanted.

Related

groupby query on joined collection in flask mongoDB

I am currently stuck on this problem. I am relatively new to MongoDB, and for each user I have to retrieve the number of reports (the count of reports made by that user) together with the user's name (name), the last reported time (the time of the last reported post) and the last reason (report_description).
I have been stuck on this for 2 days now; any help will be appreciated.
reported posts collection
{
"created_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"updated_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"post_id": {
"$oid": "61955ac35b3475f1d9759255"
},
"user_id": 2,
"report_type": "this is test",
"report_description": "this"
}
Post collection
{
"created_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"updated_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"user_id": 8,
"privacy_type": "public",
"post_type": "POST",
"post": "Om Sai Ram",
"total_like": 7,
"total_comment": 0,
"total_share": 0,
"image_url_list": [{
"image_url": "post_images/user-8/a31e39334987463bb9faa964391a935e.jpg",
"image_ratio": "1"
}],
"video_url_list": [],
"tag_list": [],
"is_hidden": false
}
User collection
{
"name": "sathish",
"user_id": 1,
"device_id": "faTOi3aVTjyQnBPFz0L7xm:APA91bHNLE9anWYrKWfwoHgmGWL2BlbWqgiVjU5iy7JooWxu26Atk9yZFxVnNp2OF1IXrXm4I6HdVJPGukEppQjSiUPdMoQ64KbOt78rpctxnYWPWliLrdxc9o1VdKL0DGYwE7Y6hx1H",
"user_name": "sathishkumar",
"updated_at": {
"$date": "2021-11-17T19:13:52.668Z"
},
"profile_picture_url": "1"
}
flask_snip.py
flagged_posts = mb.db_report.aggregate([{
'$group':{
'_id':'$user_id',
}
}])
The expected output should be a list, e.g.
[
{
'user_id':1,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
{
'user_id':2,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of last_reported_post',
'post_link':'someurl',
},
{
'user_id':3,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
]

Starting from the reported collection, you can $group to get the last_reason and last_reported_time. Then, perform a $lookup to user collection to get the name.
db.reported.aggregate([
  // Newest reports first, so that $first in the $group below picks each
  // user's most recent report.
  {
    "$sort": {
      "updated_at": -1
    }
  },
  // One result per reporting user: latest report time, latest reason and
  // the total number of reports filed by that user.
  {
    "$group": {
      "_id": "$user_id",
      "last_reported_time": {
        "$first": "$updated_at"
      },
      "last_reason": {
        "$first": "$report_description"
      },
      "no_of_reports": {
        "$sum": 1
      }
    }
  },
  // Join the user document whose user_id equals the group key.
  {
    "$lookup": {
      "from": "user",
      "localField": "_id",
      "foreignField": "user_id",
      "as": "userLookup"
    }
  },
  // $lookup yields an array; flatten it to a single embedded document.
  // Groups without a matching user document are dropped by this stage.
  {
    "$unwind": "$userLookup"
  },
  {
    "$project": {
      "user_id": "$_id",
      // The question asks for the user's `name` field ("sathish"), not the
      // login handle stored in `user_name` ("sathishkumar").
      "name": "$userLookup.name",
      "no_of_reports": 1,
      "last_reported_time": 1,
      "last_reason": 1
    }
  }
])
Here is the Mongo playground for your reference.

MongoDB aggregation to join 2 fields not working

I have 2 mongo db collections, 'Contacts' and 'Messages'. Both collections share the phone number field(Primary/Foreign Key relation in SQL).
Contacts collection has this field as follows:
{
"phone": "+192******",
"name": "test"
}
and Messages as follows:
{
"Tel": "tel:+192******"
}
I want to aggregate the 2 collections such that I can have this nested document:
"text": "text sent by user",
"contact": {
"phone": "+192******",
"name": "test"
}
So far, I have tried the following aggregation but it doesn't work:
cursor = messages_client.aggregate([{
'$lookup':
{
'from': "contacts",
'let': { 'phone': "$phone"},
'pipeline': [
{ '$addFields': { 'phone_number': { "$substr": [ "$Tel", 4, -1 ] }}},
{'$match': { "$expr": { '$eq': [ '$phone_number', '$$phone']}}}
],
'as': 'contact'
}}
], allowDiskUse=True)
Could someone kindly help me? I'm using pymongo and Python3 if that is helpful.

I found some help in the $indexOfCP operator, for anyone with a similar problem.
# Attach to every message the contact(s) whose phone number occurs inside the
# message's `Tel` value (e.g. "tel:+192..." contains "+192...").
cursor = messages_client.aggregate([{
    '$lookup':
        {
            'from': "contacts",
            # `let` is evaluated against the *message* document, and the sample
            # message stores its number in `Tel` (it has no `phone` field).
            'let': {'tel': "$Tel"},
            'pipeline': [
                # Inside the sub-pipeline plain `$field` paths refer to the
                # *contact* document, so the contact's `phone` is searched for
                # within the message's `$$tel`; -1 means "not found".
                # NOTE(review): assumes every contact has a string `phone` - confirm.
                {'$match': {"$expr": {'$gt': [{"$indexOfCP": ["$$tel", "$phone"]}, -1]}}}
            ],
            'as': 'contact'
        }}
], allowDiskUse=True)

Try this:
db.messages.aggregate([
{
$addFields: {
'phone_number': { "$substr": ["$Tel", 4, -1] }
}
},
{
$lookup: {
from: "contacts",
let: { "phone": "$phone_number" },
pipeline: [
{
$match: {
$expr: { $eq: ["$phone", "$$phone"] }
}
}
],
as: "contact"
}
},
{ $unwind: "$contact" }
]);

Count all occurrences per field in one index

I want to count how many entries I have for each field in one index of my Elasticsearch DB. I have tried the code below, but it only returns the total number of entries. I'm working in Python.
What I have tried so far:
qry = {
"aggs": {
"field": {
"terms" : {"field": "field"}
}
}, "size": 0
}
r = es.search(body=qry,
index="webhose_english")
My current result:
Out[64]:
{'_shards': {'failed': 0, 'skipped': 0, 'successful': 5, 'total': 5},
'aggregations': {'field': {'buckets': [],
'doc_count_error_upper_bound': 0,
'sum_other_doc_count': 0}},
'hits': {'hits': [], 'max_score': 0.0, 'total': 4519134},
'timed_out': False,
'took': 16}
And I would ideally have something like:
{'field_1': 321,
'field_2': 231,
'field_3': 132}

This information used to be part of the _field_stats API, but it has been removed in 6.0. So you are on the right track, you will need an aggregation. I think value_count is the one you need and for good measure I've added global as well, so we know how many documents are there in total.
Three sample docs:
PUT foo/_doc/1
{
"foo": "bar"
}
PUT foo/_doc/2
{
"foo": "bar",
"bar": "bar"
}
PUT foo/_doc/3
{
"foo": "bar",
"bar": "bar",
"baz": "bar"
}
Aggregation (I'm not sure if there might be a shorter version of this especially with many fields):
GET foo/_search
{
"aggs": {
"count_fields": {
"global": {},
"aggs": {
"count_foo": {
"value_count": {
"field": "foo.keyword"
}
},
"count_bar": {
"value_count": {
"field": "bar.keyword"
}
},
"count_baz": {
"value_count": {
"field": "baz.keyword"
}
}
}
}
},
"size": 0
}
Result:
{
"took" : 16,
"timed_out" : false,
"_shards" : {
"total" : 5,
"successful" : 5,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : 3,
"max_score" : 0.0,
"hits" : [ ]
},
"aggregations" : {
"count_fields" : {
"doc_count" : 3,
"count_foo" : {
"value" : 3
},
"count_bar" : {
"value" : 2
},
"count_baz" : {
"value" : 1
}
}
}
}

I did it by iterating over the following query and then collecting the "total" values in a dictionary:
qry = {
"query": {
"exists": {
"field": "fields_to_iterate"
}
}
}

How to perform multiple aggregation on an object in Elasticsearch using Python?

I want to perform date histogram query on my Elasticsearch data which is of the format:
datetime,field_obj and field_obj has three fields in it: a,b,c
Alongside date histogram aggregation, I want to find the average of field_obj i.e avg(field_a), avg(field_b), avg(field_c) also.
I tried working it out like this:
res = es.search(index="demo",body={"from": 0, "size": 0, "query":
{"match_all": {}}, "aggs": {
"date_avg": {
"date_histogram": {"field": "datetime","interval": "year"},
"aggs": {"avg_a": {"avg": {"field": "field.a"}}},
"aggs": {"avg_b": {"avg": {"field": "field.b"}}},
"aggs": {"avg_c": {"avg": {"field": "field.c"}}},
}}
})
However, this query only yields an average of field_c. All the other averages are getting overridden.

Good start! You need to do it like this and it will work:
res = es.search(index="demo",body={
"from": 0,
"size": 0,
"query": {
"match_all": {}
},
"aggs": {
"date_avg": {
"date_histogram": {
"field": "datetime",
"interval": "year"
},
"aggs": {
"avg_a": {
"avg": {
"field": "field.a"
}
},
"avg_b": {
"avg": {
"field": "field.b"
}
},
"avg_c": {
"avg": {
"field": "field.c"
}
}
}
}
}
})

Optimizing MongoDB Aggregation Pipeline (Group, Lookup, Match)

I'm new to NoSQL databases and I chose MongoDB as my first one. I made an aggregation pipeline to show the data that I want; here are my document samples:
Document sample from Users Collection
{
"_id": 9,
"name": "Sample Name",
"email": "email@example.com",
"password": "password hash"
}
Document sample from Pages Collection (this one doesn't really matter)
{
"_id": 42,
"name": "Product Name",
"description": "Product Description",
"user_id": 8,
"rating_categories": [{
"_id": 114,
"name": "Build Quality"
}, {
"_id": 115,
"name": "Price"
}, {
"_id": 116,
"name": "Feature"
}, {
"_id": 117,
"name": "Comfort"
}, {
"_id": 118,
"name": "Switch"
}]
}
Document sample from Reviews Collection
{
"_id": 10,
"page_id": 42, #ID reference from pages collection
"user_id": 8, #ID reference from users collection
"review": "The review of the product",
"ratings": [{
"_id": 114, #ID Reference from pages collection of what rating category it is
"rating": 5
}, {
"_id": 115,
"rating":4
}, {
"_id": 116,
"rating": 5
}, {
"_id": 117,
"rating": 3
}, {
"_id": 118,
"rating": 4
}],
"created": "1582825968963", #Date Object
"votes": {
"downvotes": [],
"upvotes": [9] #IDs of users who upvote this review
}
}
I want to get reviews by page_id which can be accessed from the API i made, here's the expected result from the aggregation:
[
{
"_id": 10, #Review of the ID
"created": "Thu, 27 Feb 2020 17:52:48 GMT",
"downvote_count": 0, #Length of votes.downvotes from reviews collection
"page_id": 42, #Page ID
"ratings": [ #Stores what rate at what rating category id
{
"_id": 114,
"rating": 5
},
{
"_id": 115,
"rating": 4
},
{
"_id": 116,
"rating": 5
},
{
"_id": 117,
"rating": 3
},
{
"_id": 118,
"rating": 4
}
],
"review": "The Review",
"upvote_count": 0, #Length of votes.upvotes from reviews collection
"user": { #User who reviewed
"_id": 8, #User ID
"downvote_count": 0, #How many downvotes this user receive from all of the user's reviews
"name": "Sample Name", #Username
"review_count": 1, #How many reviews the user made
"upvote_count": 1 #How many upvotes this user receive from all of the user's reviews
},
"vote_state": 0 #Determining vote state from the user (who requested to the API) for this review, 0 for no vote, -1 for downvote, 1 for upvote
},
...
]
Here's the pipeline of the aggregation for reviews collection that i made for the result above:
user_id = 9
page_id = 42
pipeline = [
{"$group": {
"_id": {"user_id":"$user_id", "page_id": "$page_id"},
"review_id": {"$last": "$_id"},
"page_id": {"$last": "$page_id"},
"user_id" : {"$last": "$user_id"},
"ratings": {"$last": "$ratings"},
"review": {"$last": "$review"},
"created": {"$last": "$created"},
"votes": {"$last": "$votes"},
"upvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.upvotes", False]},
{"$size": "$votes.upvotes"},
0
]}
},
"downvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.downvotes", False]},
{"$size": "$votes.downvotes"},
0
]}
}}},
{"$lookup": {
"from": "users",
"localField": "user_id",
"foreignField": "_id",
"as": "user"
}},
{"$unwind": "$user"},
{"$lookup": {
"from": "reviews",
"localField": "user._id",
"foreignField": "user_id",
"as": "user.reviews"
}},
{"$addFields":{
"_id": "$review_id",
"user.review_count": {"$size": "$user.reviews"},
"user.upvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.upvotes", False]},
{"$size": "$$this.votes.upvotes"},
0
]}
}
}},
"user.downvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.downvotes", False]},
{"$size": "$$this.votes.downvotes"},
0
]}
}
}},
"vote_state": {"$switch": {
"branches": [
{"case": { "$and" : [
{"$ifNull": ["$votes.upvotes", False]},
{"$in": [user_id, "$votes.upvotes"]}
]}, "then": 1
},
{"case": { "$and" : [
{"$ifNull": ["$votes.downvotes", False]},
{"$in": [user_id, "$votes.downvotes"]}
]}, "then": -1
},
],
"default": 0
}},
}},
{"$project":{
"user.password": 0,
"user.email": 0,
"user_id": 0,
"review_id" : 0,
"votes": 0,
"user.reviews": 0
}},
{"$sort": {"created": -1}},
{"$match": {"page_id": page_id}},
]
Note: A user can make multiple reviews for the same page_id, but only the latest one will be shown.
I'm using pymongo, by the way; that's why the operators are in quotation marks.
My questions are:
Is there any room to optimize my aggregation pipeline?
Is it considered good practice to run multiple small aggregations to get data like the above, or is it always better to have one big aggregation (or as few as possible) to get the data that I want?
As you can see, every time I want to access votes.upvotes or votes.downvotes from a document in the reviews collection, I check whether the field is null or not. That's because the fields votes.upvotes and votes.downvotes aren't created when a user makes a review; instead they are created when a user votes on that review. Should I create empty votes.upvotes and votes.downvotes fields when a user makes a review and remove the $ifNull? Will that increase the performance of the aggregation?
Thanks

Check if this aggregation has better performance.
Create these indexes if you don't have already:
db.reviews.create_index([("page_id", 1)])
Note: The performance could be improved even further by avoiding the second $lookup on reviews.
db.reviews.aggregate([
{
$match: {
page_id: page_id
}
},
{
$addFields: {
request_user_id: user_id
}
},
{
$group: {
_id: {
page_id: "$page_id",
user_id: "$user_id",
request_user_id: "$request_user_id"
},
data: {
$push: "$$ROOT"
}
}
},
{
$lookup: {
"from": "users",
"let": {
root_user_id: "$_id.user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$_id"
]
}
}
},
{
$lookup: {
"from": "reviews",
"let": {
root_user_id: "$$root_user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$user_id"
]
}
}
},
{
$project: {
user_id: 1,
downvote_count: {
$size: "$votes.downvotes"
},
upvote_count: {
$size: "$votes.upvotes"
}
}
},
{
$group: {
_id: null,
review_count: {
$sum: {
$cond: [
{
$eq: [
"$$root_user_id",
"$user_id"
]
},
1,
0
]
}
},
upvote_count: {
$sum: "$upvote_count"
},
downvote_count: {
$sum: "$downvote_count"
}
}
},
{
$unset: "_id"
}
],
"as": "stats"
}
},
{
$project: {
tmp: {
$mergeObjects: [
{
_id: "$_id",
name: "$name"
},
{
$arrayElemAt: [
"$stats",
0
]
}
]
}
}
},
{
$replaceWith: "$tmp"
}
],
"as": "user"
}
},
{
$addFields: {
first: {
$mergeObjects: [
"$$ROOT",
{
$arrayElemAt: [
"$data",
0
]
},
{
user: {
$arrayElemAt: [
"$user",
0
]
},
created: {
$toDate: {
$toLong: {
$arrayElemAt: [
"$data.created",
0
]
}
}
},
downvote_count: {
$reduce: {
input: "$data.votes.downvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
upvote_count: {
$reduce: {
input: "$data.votes.upvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
vote_state: {
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.upvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
1,
{
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.downvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
-1,
0
]
}
]
}
}
]
}
}
},
{
$unset: [
"first.data",
"first.votes",
"first.user_id",
"first.request_user_id"
]
},
{
$replaceWith: "$first"
},
{
"$sort": {
"created": -1
}
}
])
MongoPlayground

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Elasticsearch query - print certain field based on other field - python

Related

groupby query on joined collection in flask mongoDB

MongoDB aggregation to join 2 fields not working

Count all occurrences per field in one index

How to perform multiple aggregation on an object in Elasticsearch using Python?

Optimizing MongoDB Aggregation Pipeline (Group, Lookup, Match)

Categories

Resources