Search a list of comma separated strings on elasticsearch - python

I have elasticsearch configured for my django project.
Elasticsearch index has two fields user_id and address, my goal is to search a list of comma separated addresses on elasticsearch.
Example:
i have this list of addresses ["abc", "def","ghi","jkl","mno"] and i want to search them on elasticsearch in one hit, the result i'm expecting for the above list is ["abc", "def","ghi"] if these three addresses "abc", "def" and "ghi" (individually) exist on elasticsearch in address field.

Ingest data
POST test_foki/_doc
{
"user_id": 1,
"address": "abc"
}
POST test_foki/_doc
{
"user_id": 2,
"address": "def"
}
POST test_foki/_doc
{
"user_id": 3,
"address": "ghi"
}
If you want to do exact matches then you can use a terms query to filter up by an array of addresses.
Request
We use a filter because we don't care about the score on exact matches (a document either matches or it doesn't)
POST test_foki/_search
{
"query": {
"bool": {
"filter": [
{
"terms": {
"address.keyword": [
"abc",
"def",
"ghi",
"jkl",
"mno"
]
}
}
]
}
}
}
Response
{
"took" : 1,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 3,
"relation" : "eq"
},
"max_score" : 1.0,
"hits" : [
{
"_index" : "test_foki",
"_type" : "_doc",
"_id" : "YzkL4HcBv0SJscHMrZB8",
"_score" : 1.0,
"_source" : {
"user_id" : 1,
"address" : "abc"
}
},
{
"_index" : "test_foki",
"_type" : "_doc",
"_id" : "ZDkL4HcBv0SJscHMsZAx",
"_score" : 1.0,
"_source" : {
"user_id" : 2,
"address" : "def"
}
},
{
"_index" : "test_foki",
"_type" : "_doc",
"_id" : "ZTkL4HcBv0SJscHMtpAd",
"_score" : 1.0,
"_source" : {
"user_id" : 3,
"address" : "ghi"
}
}
]
}
}
If you want to do full text searches you will have to do a boolean query
POST test_foki/_search
{
"query": {
"bool": {
"should": [
{
"match": {
"address": "abc"
}
},
{
"match": {
"address": "def"
}
},
{
"match": {
"address": "ghi"
}
},
{
"match": {
"address": "jkl"
}
},
{
"match": {
"address": "mno"
}
}
]
}
}
}
This produces the same Lucene query: address:abc address:def address:ghi address:jkl address:mno
POST test_foki/_search
{
"query": {
"match": {
"address": "abc def ghi jkl mno"
}
}
}

Related

How to retrieve elasticsearch data from index based on timestamp?

I want to retrieve data from elasticsearch based on timestamp. The timestamp is in epoch_millis and I tried to retrieve the data like this:
{
"query": {
"bool": {
"must":[
{
"range": {
"TimeStamp": {
"gte": "1632844180",
"lte": "1635436180"
}
}
}
]
}
},
"size": 10
}
But the response is this:
{
"took" : 0,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 0,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
}
}
How can I retrieve data for a given period of time from a certain index?
The data looks like this:
{
"_index" : "my-index",
"_type" : "_doc",
"_id" : "zWpMNXcBTeKmGB84eksSD",
"_score" : 1.0,
"_source" : {
"Source" : "Market",
"Category" : "electronics",
"Value" : 20,
"Price" : 45.6468,
"Currency" : "EUR",
"TimeStamp" : 1611506922000 }
Also, the result has 10,000 hits when using _search on the index. How could I access the other entries (more than 10,000 results) and be able to choose the desired timestamp interval?
For your first question, assume that you have the mappings like this:
{
"mappings": {
"properties": {
"Source": {
"type": "keyword"
},
"Category": {
"type": "keyword"
},
"Value": {
"type": "integer"
},
"Price": {
"type": "float"
},
"Currency": {
"type": "keyword"
},
"TimeStamp": {
"type": "date"
}
}
}
}
Then I indexed 2 sample documents (1 is yours above, but the timestamp is definitely not in your range):
[{
"Source": "Market",
"Category": "electronics",
"Value": 30,
"Price": 55.6468,
"Currency": "EUR",
"TimeStamp": 1633844180000
},
{
"Source": "Market",
"Category": "electronics",
"Value": 20,
"Price": 45.6468,
"Currency": "EUR",
"TimeStamp": 1611506922000
}]
If you really need to query using the range above, you will first need to convert your TimeStamp field to seconds (/1000), then query based on that field:
{
"runtime_mappings": {
"secondTimeStamp": {
"type": "long",
"script": "emit(doc['TimeStamp'].value.millis/1000);"
}
},
"query": {
"bool": {
"must": [
{
"range": {
"secondTimeStamp": {
"gte": 1632844180,
"lte": 1635436180
}
}
}
]
}
},
"size": 10
}
Then you will get the first document.
About your second question, by default, Elasticsearch's max_result_window is only 10000. You can increase this limit by updating the settings, but it will increase the memory usage.
PUT /index/_settings
{
"index.max_result_window": 999999
}
You should use the search_after API instead.

Query subdocument in MongoDB

I have the following Mongo Document:
{
"_id" : ObjectId("5ea0576c2671f799e1ad65db"),
"model" : "Volkswagen",
"make" : "Volkswagen",
"year" : 1969,
"mileage" : 15000.0,
"vi_number" : "ba37d9b98fe940dba100fdf0330c30f0",
"engine" : {
"horsepower" : 400,
"liters" : 5.0,
"mpg" : 20.0,
"serial_number" : "a9ff4120-ee34-4588-af1a-77f62457531f"
},
"service_history" : [
{
"date" : ISODate("2020-04-22T12:10:55.622Z"),
"description" : "Cleaning",
"price" : 150.0,
"customer_rating" : 5
},
{
"date" : ISODate("2020-04-22T12:14:53.439Z"),
"description" : "Flat tire",
"price" : 100.0,
"customer_rating" : 2
},
{
"date" : ISODate("2020-04-22T12:30:54.071Z"),
"description" : "Waxing",
"price" : 77.0,
"customer_rating" : 4
}
]
}
I created the following query in Python and it returns all subdocuments:
cars = Car.objects().filter(service_history__customer_rating__lte=2)
When I run the following query directly in Mongo:
db.getCollection('cars').find({'service_history.customer_rating': {$lte: 2}})
It returns again all the subdocuments instead of just one.
I checked the documentation and everything seems correct.
db.getCollection('cars').find({'service_history.customer_rating': {$lte: 2}})
edit:
What happens here is it checks for the customer_rating and since it finds a match it returns the Document. The whole thing that is returned is the document.
PS: Sorry for the confusion earlier.
Please look at the following query; hope this will help you. You need to use MongoDB aggregations to filter with the $lte operator.
db.collection.aggregate([
{
$unwind: "$service_history"
},
{
$match: {
"_id": ObjectId("5ea0576c2671f799e1ad65db")
}
},
{
$match: {
$expr: {
$lte: [
"$service_history.customer_rating",
2
]
}
}
}
])
This would be the sample data you would be getting if you would use the above query.
[
{
"_id": ObjectId("5ea0576c2671f799e1ad65db"),
"engine": {
"horsepower": 400,
"liters": 5,
"mpg": 20,
"serial_number": "a9ff4120-ee34-4588-af1a-77f62457531f"
},
"make": "Volkswagen",
"mileage": 15000,
"model": "Volkswagen",
"service_history": {
"customer_rating": 2,
"date": ISODate("2020-04-22T12:14:53.439Z"),
"description": "Flat tire",
"price": 100
},
"vi_number": "ba37d9b98fe940dba100fdf0330c30f0",
"year": 1969
}
]
Updated query
db.collection.aggregate([
{
$unwind: "$service_history"
},
{
$match: {
$expr: {
$lte: [
"$service_history.customer_rating",
2
]
}
}
}
])
You can refer to this code.

How to fetch unique score search result from Elastic Search

I am trying to display unique 'description' rows from Elasticsearch, i.e. to fetch only one row out of the many duplicate rows sharing the same description. I don't want to aggregate, as I need information from the other columns as well. The code below is what I am trying, but it is not working out.
res = esconnection.search(index='data', body={
# "query": {
# "match": {"description": query_input}
# },
# "size": 30
"query": {
"multi_match": {
"description": query_input
}
},
"aggs": {
"top-descriptions": {
"terms": {
"field": "description"
},
"aggs": {
"top_description_hits": {
"top_hits": {
"sort": [
{
"_score": {
"order": "desc"
}
}
],
"size": 1
}
}
}
}
}
})
return res["hits"]["hits"]
Field collapsing can be used to group documents on a field
Allows to collapse search results based on field values. The
collapsing is done by selecting only the top sorted document per
collapse key. For instance the query below retrieves the best tweet
for each user and sorts them by number of likes.
Data
[
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "P1lTjHEBF99yL6wF31iA",
"_score" : 1.0,
"_source" : {
"description" : "brown fox"
}
},
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "QFlTjHEBF99yL6wF8liO",
"_score" : 1.0,
"_source" : {
"description" : "brown fox"
}
},
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "QVlTjHEBF99yL6wF91gU",
"_score" : 1.0,
"_source" : {
"description" : "brown fox"
}
},
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "QllUjHEBF99yL6wFFFh5",
"_score" : 1.0,
"_source" : {
"description" : "brown dog"
}
},
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "Q1lUjHEBF99yL6wFGFhQ",
"_score" : 1.0,
"_source" : {
"description" : "brown dog"
}
}
]
I have three documents with description as "brown fox" and two documents with description as "brown dog"
Query:
{
"query": {
"match": {
"description": {
"query": "brown"
}
}
},
"collapse": {
"field": "description.keyword" --> notice keyword
}
}
Result:
"hits" : [
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "P1lTjHEBF99yL6wF31iA",
"_score" : 0.087011375,
"_source" : {
"description" : "brown fox"
},
"fields" : {
"description.keyword" : [
"brown fox"
]
}
},
{
"_index" : "index4",
"_type" : "_doc",
"_id" : "QllUjHEBF99yL6wFFFh5",
"_score" : 0.087011375,
"_source" : {
"description" : "brown dog"
},
"fields" : {
"description.keyword" : [
"brown dog"
]
}
}
]
Only 2 documents are returned.
Field collapsing also offers features like "inner_hits" if you want to see the documents within a group, and "sort" lets you decide which document is shown for each group.

Elastic Search composite grouping with range

Consider the following documents in my Elasticsearch index. I want to group the documents based on rank, but any rank below 1000 must be displayed individually, and anything above 1000 must be grouped. How do I achieve this using a composite aggregation? I am new to this, and I am using composite because I want to use the after-key function to allow pagination.
Documents
{
rank : 200,
name:abcd,
score1 :100,
score2:200
},
{
rank 300,
name:abcd,
score1:100,
score2:200
}
Expected Result:
{
key:{
rank:101
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
}
{
key:{
rank:1000-*
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
},
{
key:{
rank:300
},
doc_count:1,
_score1: {value:3123}
_score2 : {value :3323}
}
######## Query that I tried
{
"query":{"match_all":{}},
"aggs":{
"_scores":{
"composite"{
"sources":[
{"_rank":{"terms":{"field":"rank"}}}
]
}
},
"aggs":{
"_ranks":{
"field":"rank:[
{"to":1000},
{"from":1000}
]
}
"_score1": {"sum": {"field": "score1"}}
"_score2": {"sum": {"field": "score2"}}
}
}
}
From what I understand, you want to
Group the aggregations whose value is below 1000 rank to their own buckets
Group the aggregations whose value is 1000 and above to a single bucket with key 1000-*
And for each buckets, calculate the sum of _score1 of all buckets
Similarly calculate the sum of _score2 of all buckets
For this scenario, you can simply make use of Terms Aggregation as I've mentioned in below answer.
I've mentioned sample mapping, sample documents, query and response so that you'll have clarity on what's happening.
Mapping:
PUT my_sample_index
{
"mappings": {
"properties": {
"rank":{
"type": "integer"
},
"name":{
"type": "keyword"
},
"_score1": {
"type":"integer"
},
"_score2":{
"type": "integer"
}
}
}
}
Sample Documents:
POST my_sample_index/_doc/1
{
"rank": 100,
"name": "john",
"_score1": 100,
"_score2": 100
}
POST my_sample_index/_doc/2
{
"rank": 1001, <--- Rank > 1000
"name": "constantine",
"_score1": 200,
"_score2": 200
}
POST my_sample_index/_doc/3
{
"rank": 200,
"name": "bruce",
"_score1": 100,
"_score2": 100
}
POST my_sample_index/_doc/4
{
"rank": 2001, <--- Rank > 1000
"name": "arthur",
"_score1": 200,
"_score2": 200
}
Aggregation Query:
POST my_sample_index/_search
{
"size":0,
"aggs": {
"_score": {
"terms": {
"script": {
"source": """
if(doc['rank'].value < 1000){
return doc['rank'];
}else
return '1000-*';
"""
}
},
"aggs":{
"_score1_sum":{
"sum": {
"field": "_score1"
}
},
"_score2_sum":{
"sum":{
"field": "_score2"
}
}
}
}
}
}
Note that I've used a scripted terms aggregation, where I've put my logic in the script. I believe the logic is self-explanatory once you go through it.
Response:
{
"took" : 5,
"timed_out" : false,
"_shards" : {
"total" : 1,
"successful" : 1,
"skipped" : 0,
"failed" : 0
},
"hits" : {
"total" : {
"value" : 4,
"relation" : "eq"
},
"max_score" : null,
"hits" : [ ]
},
"aggregations" : {
"_score" : {
"doc_count_error_upper_bound" : 0,
"sum_other_doc_count" : 0,
"buckets" : [
{
"key" : "1000-*", <---- Note this
"doc_count" : 2, <---- Note this
"_score2_sum" : {
"value" : 400.0
},
"_score1_sum" : {
"value" : 400.0
}
},
{
"key" : "100",
"doc_count" : 1,
"_score2_sum" : {
"value" : 100.0
},
"_score1_sum" : {
"value" : 100.0
}
},
{
"key" : "200",
"doc_count" : 1,
"_score2_sum" : {
"value" : 100.0
},
"_score1_sum" : {
"value" : 100.0
}
}
]
}
}
}
Note that there are two keys having rank > 1000, both of their scores for _score1 and _score2 sum to 400, which is what is expected.
Let me know if this helps!

Mongodb aggregation with $or nested clauses

I have mongodb documents like this:
{
"_id" : ObjectId("5d35ba501545d248c383871f"),
"key1" : 1,
"currentTime" : ISODate("2019-07-18T19:41:54.000Z"),
"iState" : "START - 1",
"errGyro" : -4.0,
"states" : [
{
"ts" : 3,
"accY" : -165.877227783203,
"gyroZ" : 8.2994499206543,
},
{
"ts" : 4,
"accY" : -15.843573,
"gyroZ" : 12.434643,
},
{
"ts" : 3,
"accY" : 121.32667,
"gyroZ" : 98.45566,
}
]
}
I want to return all the states objects and the parent document where "ts" is 3 or 5.
I tried this query at first:
db.getCollection('logs').find(
{"states" :
{ "$elemMatch" : { "$or":[
{ "ts":
{ "$eq" : 3}
},
{ "ts":
{ "$eq" : 5}
}
]
}
}
},{"states.$":1 })
But this returns only the first "state" document where the "eq" occurred.
How can I return all the matching documents?
Thank you.
You can use aggregation pipelines
db.getCollection('logs').aggregate([
{
$unwind: "$states"
},
{
$match: {
$or: [
{ "states.ts": 3 },
{ "states.ts": 5 },
]
}
},
{
$group: {
_id: "$_id",
"key1": { $first: "key1" },
"currentTime": { $first: "currentTime" },
"iState": { $first: "$iState" },
"errGyro": { $first: "$errGyro" },
states: { $push: "$states" }
}
}
])
As $elemMatch returns only the first matching element of the array, you have to use aggregation to achieve your goal. Here's the query:
db.collection.aggregate([
{
$match: {
$or: [
{
"states.ts": {
$eq: 3
}
},
{
"states.ts": {
$eq: 5
}
}
]
}
},
{
$project: {
states: 1
}
},
{
$unwind: "$states"
},
{
$match: {
$or: [
{
"states.ts": {
$eq: 3
}
},
{
"states.ts": {
$eq: 5
}
}
]
}
},
{
$group: {
_id: "$_id",
states: {
$push: "$states"
}
}
}
])
First $match and $project stages are here for query optimization and memory save, if many documents are returned.

Categories

Resources