Elasticsearch - match_phrase query sort by int - python

I have 'device' type documents, which I search by model using following query (using Flask & Elasticsearch as an api):
match handset
query = {
"query": {
"match_phrase": {
"model": model_name
}
},
"track_scores": True,
"size": 1,
"sort":
[
{"_score": {"order": "desc"}},
{"model": {"order": "asc"}}
]
}
device = es.search(body=query, doc_type='device')
That returns single device with 'model' closest to requested (model_name).
Example list of devices:
[{ "id":482,
"memory":"16",
"model":"iPhone 5s 16GB" },
{ "id":483,
"memory":"32",
"model":"iPhone 5s 32GB" },
{ "id":484,
"memory":"16",
"model":"iPhone 5c 16GB" },
{ "id":486,
"memory":"64",
"model":"iPhone 6 64GB" },
{ "id":485,
"memory":"32",
"model":"iPhone 6 32GB" }]
How can I change it so it return device with the lowest memory?
>>> query.query.match_phrase.model = 'iPhone 5s'
>>> device = es.search(body=query, doc_type='device')
{ "id":482,
"memory":"16",
"model":"iPhone 5s 16GB" }
>>> query.query.match_phrase.model = 'iPhone 6'
>>> device = es.search(body=query, doc_type='device')
{ "id":485,
"memory":"32",
"model":"iPhone 6 32GB" }
Any clues highly appreciated.

I would change the type of the "memory" field to "integer" in your mapping, and index the data appropriately, then it's easy to get the result you want.
So, with a mapping like this:
PUT /test_index
{
"mappings": {
"doc": {
"_id": {
"path": "id"
},
"properties": {
"id": {
"type": "integer"
},
"memory": {
"type": "integer"
},
"model": {
"type": "string"
}
}
}
}
}
and these documents indexed:
POST /test_index/doc/_bulk
{"index":{}}
{"id":482,"memory":16,"model":"iPhone 5s 16GB"}
{"index":{}}
{"id":483,"memory":32,"model":"iPhone 5s 32GB"}
{"index":{"_id":1}}
{"id":484,"memory":16,"model":"iPhone 5c 16GB"}
{"index":{}}
{"id":486,"memory":64,"model":"iPhone 6 64GB"}
{"index":{}}
{"id":485,"memory":32,"model":"iPhone 6 32GB"}
{"index":{}}
You can query like this to get the lowest memory hit on "iPhone 5s":
POST /test_index/_search
{
"query": {
"match": {
"model": {
"query": "iPhone 5s",
"operator": "and"
}
}
},
"sort": [
{
"memory": {
"order": "asc"
}
}
],
"size": 1
}
...
{
"took": 2,
"timed_out": false,
"_shards": {
"total": 1,
"successful": 1,
"failed": 0
},
"hits": {
"total": 2,
"max_score": null,
"hits": [
{
"_index": "test_index",
"_type": "doc",
"_id": "482",
"_score": null,
"_source": {
"id": 482,
"memory": 16,
"model": "iPhone 5s 16GB"
},
"sort": [
16
]
}
]
}
}
Here's the code I used:
http://sense.qbox.io/gist/8441d7379485e03a75fdbaa9ae0bf9748098be33

Related

cant do case insensitive search in elastic search

I'm new to elastic search and trying to do this query right.
So I'm having a document like this:
{
"id": 1,
"name": "Văn Hiến"
}
I want to get that document in 3 cases:
1/ User input is: "v" or "h" or "i",...
2/ User input is: "Văn" or "văn" or "hiến",...
3/ User input is: "va" or "van" or "van hi",...
I'm currently can search for case 1 and 2, but not case 3, where the user input don't have the 'tonal' of the Vietnamese language
This is my query, I'm using Python:
query = {
"bool": {
"should": [
{
"match": {
"name": name.lower()
}
},
{
"wildcard": {
"name": {
"value": f"*{name.lower()}*"
}
}
}
]
}
}
Can anyone help me with this? Any helps will be apperciated
Use the lowercase_filter and mapping_character_filter functions in your mapping.
the following mapping and query will work for all the three usecases you mentioned
Mapping Example:
{
"settings": {
"analysis": {
"analyzer": {
"my_analyzer": {
"type": "custom",
"tokenizer": "my_tokenizer",
"filter": [
"lowercase"
],
"char_filter": [
"my_mappings_char_filter"
]
}
},
"char_filter": {
"my_mappings_char_filter": {
"type": "mapping",
"mappings": [
"ă => a",
"ế => e"
]
}
},
"tokenizer": {
"my_tokenizer": {
"type": "ngram",
"min_gram": 1,
"max_gram": 10,
"token_chars": [
"letter"
]
}
}
},
"max_ngram_diff" : "9"
},
"mappings": {
"properties": {
"name": {
"type": "text",
"analyzer": "my_analyzer",
"fields": {
"facet": {
"type": "keyword",
"ignore_above": 256
}
}
}
}
}
}
Example Query:
{
"query" : {
"query_string" :{
"query":"van hi",
"type": "best_fields",
"default_field": "name"
}
}
}

groupby query on joined collection in flask mongoDB

I am currently stuck in this problem, i am relatively new to MongoDB, and i have to retrieve number of reports(count of reports done by users ) for a specific user with his name(name), last reported time(time of last reported post), last reason(report_description) ,
i am stuck here since 2 days now, help will be appreciated .
reported posts collection
{
"created_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"updated_at": {
"$date": "2021-12-21T18:45:27.489Z"
},
"post_id": {
"$oid": "61955ac35b3475f1d9759255"
},
"user_id": 2,
"report_type": "this is test",
"report_description": "this"
}
Post collection
{
"created_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"updated_at": {
"$date": "2021-11-17T19:24:53.484Z"
},
"user_id": 8,
"privacy_type": "public",
"post_type": "POST",
"post": "Om Sai Ram",
"total_like": 7,
"total_comment": 0,
"total_share": 0,
"image_url_list": [{
"image_url": "post_images/user-8/a31e39334987463bb9faa964391a935e.jpg",
"image_ratio": "1"
}],
"video_url_list": [],
"tag_list": [],
"is_hidden": false
}
User collection
{
"name": "sathish",
"user_id": 1,
"device_id": "faTOi3aVTjyQnBPFz0L7xm:APA91bHNLE9anWYrKWfwoHgmGWL2BlbWqgiVjU5iy7JooWxu26Atk9yZFxVnNp2OF1IXrXm4I6HdVJPGukEppQjSiUPdMoQ64KbOt78rpctxnYWPWliLrdxc9o1VdKL0DGYwE7Y6hx1H",
"user_name": "sathishkumar",
"updated_at": {
"$date": "2021-11-17T19:13:52.668Z"
},
"profile_picture_url": "1"
}
flask_snip.py
flagged_posts = mb.db_report.aggregate([{
'$group':{
'_id':'$user_id',
}
}])
expected out should be list e.g
[
{
'user_id':1,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
{
'user_id':2,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of last_reported_post',
'post_link':'someurl',
},
{
'user_id':3,
'name' :'somename',
'no_of_reports':30,
'last_reported_time':sometime,
'reason':'reason_of lastreported_post',
'post_link':'someurl',
},
]
Starting from the reported collection, you can $group to get the last_reason and last_reported_time. Then, perform a $lookup to user collection to get the name.
db.reported.aggregate([
{
"$sort": {
updated_at: -1
}
},
{
"$group": {
"_id": "$user_id",
"last_reported_time": {
"$first": "$updated_at"
},
"last_reason": {
"$first": "$report_description"
},
"no_of_reports": {
$sum: 1
}
}
},
{
"$lookup": {
"from": "user",
"localField": "_id",
"foreignField": "user_id",
"as": "userLookup"
}
},
{
"$unwind": "$userLookup"
},
{
"$project": {
"user_id": "$_id",
"name": "$userLookup.user_name",
"no_of_reports": 1,
"last_reported_time": 1,
"last_reason": 1
}
}
])
Here is the Mongo playground for your reference.

Access nested objects in Elasticsearch using a script

I'm trying to use data from ElasticSearch 6 results in setting up the scoring for my results.
Part of my mapping looks like:
{
"properties": {
"annotation_date": {
"type": "date"
},
"annotation_date_time": {
"type": "date"
},
"annotations": {
"properties": {
"details": {
"type": "nested",
"properties": {
"filter": {
"type": "text",
"fielddata": True,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"bucket": {
"type": "text",
"fielddata": True,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"keyword": {
"type": "text",
"fielddata": True,
"fields": {
"keyword": {
"type": "keyword",
"ignore_above": 256
}
}
},
"frequency": {
"type": "long",
}
}
}
}
}
}
}
Example part of a document JSON:
"annotations": {
"details": [
{
"filter": "filter_A",
"bucket": "bucket_A",
"keyword": "keyword_A",
"frequency": 6
},
{
"filter": "filter_B",
"bucket": "bucket_B",
"keyword": "keyword_B",
"frequency": 7
}
]
I want to use the the frequency of my annotation.details if it hits a certain 'bucket', which I try to do with the following:
GET my_index/_search
{
"size": 10000,
"query": {
"function_score": {
"query": {
"match": { "title": "<search term>" }
},
"script_score": {
"script": {
"lang": "painless",
"source": """
int score = 0;
for (int i = 0; i < doc['annotations.details.filter'].length; i++){
if (doc['annotations.details.filter'][i].keyword == "bucket_A"){
score += doc['annotations.details.frequency'][i].value;
}
}
return score;
"""
}
}
}
}
}
Ultimately, this would mean that in this specific situation a score is expected of 6. If it would have hit on more buckets, the score is incremented with the frequency it hit on.
You should use bool,must with range and gt
example
GET /_search
{
"query": {
"nested" : {
"path" : "obj1",
"score_mode" : "avg",
"query" : {
"bool" : {
"must" : [
{ "match" : {"obj1.name" : "blue"} },
{ "range" : {"obj1.count" : {"gt" : 5}} }
]
}
}
}
}
}

Google Assistant API V2 userstorage

I'm attempting to develop a Google Action with the Dialogflow v2 API
My function saves a value to userstorage as follows
def save_value(value):
res = {
"fulfillmentText": "Set value to {}".format(int(value)),
"payload": {
"google": {
"userStorage": str(value)
}
}
}
print ("Saved value")
response = jsonify(res)
return response
And I get the following back from testing in Dialogflow
{
"fulfillmentText": "Set value to 36237269",
"payload": {
"google": {
"userStorage": "36237269"
}
}
}
This works for the duration of session, I am able to use this in later intents via
value = request_json['originalRequest']['data']['user']['userStorage']
However, the data is only stored for one session - if I invoke my action again, there is nothing saved.
Is this the correct way of using userstorage? Has anyone successfully used it with Python?
Failed "response"
{
"responseMetadata": {
"status": {
"code": 10,
"message": "Failed to parse Dialogflow response into AppResponse because of empty speech response",
"details": [
{
"#type": "type.googleapis.com/google.protobuf.Value",
"value": "{\"id\":\"816605a7-f7e0-4d37-a490-c84ff63fb7dd\",\"timestamp\":\"2018-11-08T17:18:49.422Z\",\"lang\":\"en-us\",\"result\":{},\"alternateResult\":{},\"status\":{\"code\":206,\"errorType\":\"partial_content\",\"errorDetails\":\"Webhook call failed. Error: 500 Internal Server Error\"},\"sessionId\":\"ABwppHHai3qsY2WPZWezmh9Q_bUF45aD51GbQ81sUDF7iSrRLA2m8KFgZ1ZYavnCv3fAckW1tcoJdydZTXQY5Nw\"}"
}
]
}
}
}
Working "response"
{
"conversationToken": "[]",
"finalResponse": {
"richResponse": {
"items": [
{
"simpleResponse": {
"textToSpeech": "Text"
}
}
]
}
},
"responseMetadata": {
"status": {
"message": "Success (200)"
},
"queryMatchInfo": {
"queryMatched": true,
"intent": "047ad9d9-0180-47f9-88bd-e5ffc8936c08"
}
},
"userStorage": "36237269"
}
Working "Request"
{
"user": {
"userId": "ABwppHE5H0FKrXKk8PjJyzZJ12OSMQcjxuT2NnfPAgLvai0UsfWEoYE8R_L8qLQdqY29sOnsZhQhE5G4XVVXiGs",
"locale": "en-US",
"lastSeen": "2018-11-08T17:18:16Z",
"userStorage": "36237269"
},
"conversation": {
"conversationId": "ABwppHE6BwK2zIBKxHA8hc9uBGumVgKbbNGHhRVFz7O6yrxxa1WJ_xtKNqhesj3EwNCVlestM-bF6tDWzZhqUXE",
"type": "ACTIVE",
"conversationToken": "[]"
},
"inputs": [
{
"intent": "actions.intent.TEXT",
"rawInputs": [
{
"inputType": "KEYBOARD",
"query": "when is the next bus"
}
],
"arguments": [
{
"name": "text",
"rawText": "when is the next bus",
"textValue": "when is the next bus"
}
]
}
],
"surface": {
"capabilities": [
{
"name": "actions.capability.MEDIA_RESPONSE_AUDIO"
},
{
"name": "actions.capability.WEB_BROWSER"
},
{
"name": "actions.capability.AUDIO_OUTPUT"
},
{
"name": "actions.capability.SCREEN_OUTPUT"
}
]
},
"isInSandbox": true,
"availableSurfaces": [
{
"capabilities": [
{
"name": "actions.capability.WEB_BROWSER"
},
{
"name": "actions.capability.AUDIO_OUTPUT"
},
{
"name": "actions.capability.SCREEN_OUTPUT"
}
]
}
],
"requestType": "SIMULATOR"
}
Failed "request"
{
"user": {
"userId": "ABwppHE5H0FKrXKk8PjJyzZJ12OSMQcjxuT2NnfPAgLvai0UsfWEoYE8R_L8qLQdqY29sOnsZhQhE5G4XVVXiGs",
"locale": "en-US",
"lastSeen": "2018-11-08T17:18:41Z"
},
"conversation": {
"conversationId": "ABwppHHai3qsY2WPZWezmh9Q_bUF45aD51GbQ81sUDF7iSrRLA2m8KFgZ1ZYavnCv3fAckW1tcoJdydZTXQY5Nw",
"type": "ACTIVE",
"conversationToken": "[]"
},
"inputs": [
{
"intent": "actions.intent.TEXT",
"rawInputs": [
{
"inputType": "KEYBOARD",
"query": "when is the next bus"
}
],
"arguments": [
{
"name": "text",
"rawText": "when is the next bus",
"textValue": "when is the next bus"
}
]
}
],
"surface": {
"capabilities": [
{
"name": "actions.capability.WEB_BROWSER"
},
{
"name": "actions.capability.MEDIA_RESPONSE_AUDIO"
},
{
"name": "actions.capability.SCREEN_OUTPUT"
},
{
"name": "actions.capability.AUDIO_OUTPUT"
}
]
},
"isInSandbox": true,
"availableSurfaces": [
{
"capabilities": [
{
"name": "actions.capability.WEB_BROWSER"
},
{
"name": "actions.capability.SCREEN_OUTPUT"
},
{
"name": "actions.capability.AUDIO_OUTPUT"
}
]
}
],
"requestType": "SIMULATOR"
}

Optimizing MongoDB Aggregation Pipeline (Group, Lookup, Match)

I'm new on NoSQL Database and i choose MongoDB as my first NoSQL Database. I made an aggregation pipeline to shows the data that i want, here's my document sample:
Document sample from Users Collection
{
"_id": 9,
"name": "Sample Name",
"email": "email#example.com",
"password": "password hash"
}
Document sample from Pages Collection (this one doesn't really matter)
{
"_id": 42,
"name": "Product Name",
"description": "Product Description",
"user_id": 8,
"rating_categories": [{
"_id": 114,
"name": "Build Quality"
}, {
"_id": 115,
"name": "Price"
}, {
"_id": 116,
"name": "Feature"
}, {
"_id": 117,
"name": "Comfort"
}, {
"_id": 118,
"name": "Switch"
}]
}
Document sample from Reviews Collection
{
"_id": 10,
"page_id": 42, #ID reference from pages collection
"user_id": 8, #ID reference from users collection
"review": "The review of the product",
"ratings": [{
"_id": 114, #ID Reference from pages collection of what rating category it is
"rating": 5
}, {
"_id": 115,
"rating":4
}, {
"_id": 116,
"rating": 5
}, {
"_id": 117,
"rating": 3
}, {
"_id": 118,
"rating": 4
}],
"created": "1582825968963", #Date Object
"votes": {
"downvotes": [],
"upvotes": [9] #IDs of users who upvote this review
}
}
I want to get reviews by page_id which can be accessed from the API i made, here's the expected result from the aggregation:
[
{
"_id": 10, #Review of the ID
"created": "Thu, 27 Feb 2020 17:52:48 GMT",
"downvote_count": 0, #Length of votes.downvotes from reviews collection
"page_id": 42, #Page ID
"ratings": [ #Stores what rate at what rating category id
{
"_id": 114,
"rating": 5
},
{
"_id": 115,
"rating": 4
},
{
"_id": 116,
"rating": 5
},
{
"_id": 117,
"rating": 3
},
{
"_id": 118,
"rating": 4
}
],
"review": "The Review",
"upvote_count": 0, #Length of votes.upvotes from reviews collection
"user": { #User who reviewed
"_id": 8, #User ID
"downvote_count": 0, #How many downvotes this user receive from all of the user's reviews
"name": "Sample Name", #Username
"review_count": 1, #How many reviews the user made
"upvote_count": 1 #How many upvotes this user receive from all of the user's reviews
},
"vote_state": 0 #Determining vote state from the user (who requested to the API) for this review, 0 for no vote, -1 for downvote, 1 for upvote
},
...
]
Here's the pipeline of the aggregation for reviews collection that i made for the result above:
user_id = 9
page_id = 42
pipeline = [
{"$group": {
"_id": {"user_id":"$user_id", "page_id": "$page_id"},
"review_id": {"$last": "$_id"},
"page_id": {"$last": "$page_id"},
"user_id" : {"$last": "$user_id"},
"ratings": {"$last": "$ratings"},
"review": {"$last": "$review"},
"created": {"$last": "$created"},
"votes": {"$last": "$votes"},
"upvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.upvotes", False]},
{"$size": "$votes.upvotes"},
0
]}
},
"downvote_count": {"$sum":
{"$cond": [
{"$ifNull": ["$votes.downvotes", False]},
{"$size": "$votes.downvotes"},
0
]}
}}},
{"$lookup": {
"from": "users",
"localField": "user_id",
"foreignField": "_id",
"as": "user"
}},
{"$unwind": "$user"},
{"$lookup": {
"from": "reviews",
"localField": "user._id",
"foreignField": "user_id",
"as": "user.reviews"
}},
{"$addFields":{
"_id": "$review_id",
"user.review_count": {"$size": "$user.reviews"},
"user.upvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.upvotes", False]},
{"$size": "$$this.votes.upvotes"},
0
]}
}
}},
"user.downvote_count": {"$sum":{
"$map":{
"input":"$user.reviews",
"in":{"$cond": [
{"$ifNull": ["$$this.votes.downvotes", False]},
{"$size": "$$this.votes.downvotes"},
0
]}
}
}},
"vote_state": {"$switch": {
"branches": [
{"case": { "$and" : [
{"$ifNull": ["$votes.upvotes", False]},
{"$in": [user_id, "$votes.upvotes"]}
]}, "then": 1
},
{"case": { "$and" : [
{"$ifNull": ["$votes.downvotes", False]},
{"$in": [user_id, "$votes.downvotes"]}
]}, "then": -1
},
],
"default": 0
}},
}},
{"$project":{
"user.password": 0,
"user.email": 0,
"user_id": 0,
"review_id" : 0,
"votes": 0,
"user.reviews": 0
}},
{"$sort": {"created": -1}},
{"$match": {"page_id": page_id}},
]
Note: User can make multiple reviews for same page_id, but only the latest will be shown
I'm using pymongo btw, that's why operators have quotation mark
My questions are:
Is there any room to optimize my aggregation pipeline?
Is it considered as a good practice to have multiple small aggregate execution to get datas like above, or its always better to have 1 big aggregation (or as less as possible) to get the data that i want?
As you can see, every time i want to access votes.upvotes or votes.downvotes from a document on review collection, i checked whether the field is null or not, that's because the field votes.upvotes and votes.downvotes isn't being made when user make a review, instead it's being made when an user gives a vote to that review. Should i make an empty field on votes.upvotes and votes.downvotes when user make a review and remove the $ifNull? Will that increase the performance of the aggregation?
Thanks
Check if this aggregation has better performance.
Create these indexes if you don't have already:
db.reviews.create_index([("page_id", 1)])
Note: We can improve even more the performance avoiding $lookup reviews again.
db.reviews.aggregate([
{
$match: {
page_id: page_id
}
},
{
$addFields: {
request_user_id: user_id
}
},
{
$group: {
_id: {
page_id: "$page_id",
user_id: "$user_id",
request_user_id: "$request_user_id"
},
data: {
$push: "$$ROOT"
}
}
},
{
$lookup: {
"from": "users",
"let": {
root_user_id: "$_id.user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$_id"
]
}
}
},
{
$lookup: {
"from": "reviews",
"let": {
root_user_id: "$$root_user_id"
},
"pipeline": [
{
$match: {
$expr: {
$eq: [
"$$root_user_id",
"$user_id"
]
}
}
},
{
$project: {
user_id: 1,
downvote_count: {
$size: "$votes.downvotes"
},
upvote_count: {
$size: "$votes.upvotes"
}
}
},
{
$group: {
_id: null,
review_count: {
$sum: {
$cond: [
{
$eq: [
"$$root_user_id",
"$user_id"
]
},
1,
0
]
}
},
upvote_count: {
$sum: "$upvote_count"
},
downvote_count: {
$sum: "$downvote_count"
}
}
},
{
$unset: "_id"
}
],
"as": "stats"
}
},
{
$project: {
tmp: {
$mergeObjects: [
{
_id: "$_id",
name: "$name"
},
{
$arrayElemAt: [
"$stats",
0
]
}
]
}
}
},
{
$replaceWith: "$tmp"
}
],
"as": "user"
}
},
{
$addFields: {
first: {
$mergeObjects: [
"$$ROOT",
{
$arrayElemAt: [
"$data",
0
]
},
{
user: {
$arrayElemAt: [
"$user",
0
]
},
created: {
$toDate: {
$toLong: {
$arrayElemAt: [
"$data.created",
0
]
}
}
},
downvote_count: {
$reduce: {
input: "$data.votes.downvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
upvote_count: {
$reduce: {
input: "$data.votes.upvotes",
initialValue: 0,
in: {
$add: [
"$$value",
{
$size: "$$this"
}
]
}
}
},
vote_state: {
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.upvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
1,
{
$cond: [
{
$gt: [
{
$size: {
$filter: {
input: "$data.votes.downvotes",
cond: {
$in: [
"$_id.request_user_id",
"$$this"
]
}
}
}
},
0
]
},
-1,
0
]
}
]
}
}
]
}
}
},
{
$unset: [
"first.data",
"first.votes",
"first.user_id",
"first.request_user_id"
]
},
{
$replaceWith: "$first"
},
{
"$sort": {
"created": -1
}
}
])
MongoPlayground

Categories

Resources