Count total in ES - Python

If I have the below data in ES and DynamoDB, how can I count the total books_borrwoed using a query?
user
  user_id: 1, name: "ABC", books_borrwoed: 5, unit_price: 10
  user_id: 2, name: "dead", books_borrwoed: 5, unit_price: 10
I can't use a loop, as there can be more than 10k records in a month.

You can use a sum aggregation to get the sum of the books_borrwoed field:
{
  "size": 0,
  "aggs": {
    "total_books": {
      "sum": {
        "field": "books_borrwoed"
      }
    }
  }
}
The search result will be:
"aggregations": {
"total_books": {
"value": 10.0
}
}
Update 1:
If you need to multiply the number of books borrowed by a fixed price of 5, you can use a script with the sum aggregation:
{
  "size": 0,
  "aggs": {
    "total_books": {
      "sum": {
        "script": {
          "lang": "painless",
          "inline": "doc['books_borrwoed'].value * 5"
        }
      }
    }
  }
}
The search result will be:
"aggregations": {
"total_books": {
"value": 50.0
}
}
And if you want to take the unit price from your data instead, you can use the below query:
{
  "size": 0,
  "aggs": {
    "total_books": {
      "sum": {
        "script": {
          "lang": "painless",
          "inline": "doc['books_borrwoed'].value * doc['unit_price'].value"
        }
      }
    }
  }
}
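From Python, the same aggregation can be run with the official client; here is a minimal sketch (the index name "users" and the local connection are assumptions, not from the question):

from elasticsearch import Elasticsearch

es = Elasticsearch("http://localhost:9200")  # assumed local cluster

body = {
    "size": 0,
    "aggs": {
        "total_books": {
            "sum": {"field": "books_borrwoed"}
        }
    }
}

# The sum is computed server-side, so no client-side loop over 10k+
# records is needed.
result = es.search(index="users", body=body)  # index name is hypothetical
print(result["aggregations"]["total_books"]["value"])  # -> 10.0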

Related

Elasticsearch, getting error when trying to query on time

I have documents with a timestamp in the following format:
2022-11-17T17:16:26.397Z
I am trying to get all documents from each day between two dates, restricted on each day to a time window, let's say 11:05 to 15:05.
This is my query:
"query": {
"bool": {
"filter": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": "2022-11-01",
"lte": "2022-11-30"
}
}
}, {
"script": {
"script": {
"source": "doc.timestamp.getHourOfDay() >= params.min && doc.timestamp.getHourOfDay() <= params.max",
"params": {
"min": 11,
"max": 15
}
}
}
}
]
}
}
}
}
}
EDIT @rabbitbt: I ran your query on two different documents.
Okay, after lots of testing with your query I found that it throws a runtime error whenever the timestamp includes a 0 directly after the T.
For example:
"timestamp": "2022-11-07T01:04:39.357551"
Any idea how I can change the query to fix this?
Thanks for all the help. In the end I got it working by replacing this line in my original query:
"source": "doc.timestamp.getHourOfDay() >= params.min && doc.timestamp.getHourOfDay() <= params.max",
to
source": "doc['timestamp'].value.getHour() >= params.min &&
doc['timestamp'].value.getHour() <= params.max",
My suggestion:
POST bar/_doc
{
  "date": "2022-11-14T11:12:46"
}
Python code
doc = {"date": "2022-11-17T11:16:26.397Z"}
response_index = get_client_es().index(index="bar", body=doc, refresh="wait_for")
print(response_index)
query = {
"query": {
"bool": {
"filter": [
{
"range": {
"date": {
"gte": "2022-11-01",
"lte": "2022-11-30"
}
}
},
{
"script": {
"script": {
"lang": "painless",
"source": """
def targetDate = doc['date'].value;
def targetMinute = targetDate.getMinute();
if(targetDate.getMinute() < 10)
{
targetMinute = "0" + targetDate.getMinute();
}
def timeFrom = LocalTime.parse(params.timeFrom);
def timeTo = LocalTime.parse(params.timeTo);
def target = LocalTime.parse(targetDate.getHour().toString()
+ ":"+ targetMinute);
if(target.isBefore(timeTo) && target.isAfter(timeFrom)) {
return true;
}
""",
"params": {
"timeFrom": "10:30",
"timeTo": "15:13"
}
}
}
}
]
}
}
}
result = get_client_es().search(index="bar", body=query)
print(result)
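For completeness, here is what the asker's final working filter might look like when run from Python (a sketch reusing the get_client_es() helper; the index name is an assumption). Using getHour() avoids the string parsing, and with it the zero-padding problem, entirely:

query = {
    "query": {
        "bool": {
            "filter": [
                {"range": {"timestamp": {"gte": "2022-11-01", "lte": "2022-11-30"}}},
                {
                    "script": {
                        "script": {
                            "lang": "painless",
                            "source": "doc['timestamp'].value.getHour() >= params.min && doc['timestamp'].value.getHour() <= params.max",
                            "params": {"min": 11, "max": 15}
                        }
                    }
                }
            ]
        }
    }
}
result = get_client_es().search(index="my_index", body=query)  # index assumed
print(result["hits"]["total"])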

How to perform a sum aggregation on the n latest documents after sorting records (descending)? elastic

So, I want the latest 30 documents between 20/4 and 20/6, and to perform a sum aggregation on the duration_seconds field of those 30 latest docs. We tried multiple aggregations for this, like top_hits and terms for sorting, but then we got the sum of all docs between 20/4 and 20/6.
"size": 1,
"query": {
"bool": {
"must": [
{
"range": {
"create_datetime": {
"gte": "2022-04-20",
"lte": "2022-06-20"
}
}
}
]
}
},
"sort": [
{
"create_datetime": {
"order": "desc"
}
}
],
"aggs": {
"videosession": {
"sampler": {
"shard_size":30
},
"aggs": {
"sum_duration_seconds": {
"sum": {
"field": "duration_seconds"
}
}
}
}
}
}```
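Reading the sum back in Python would look like this sketch (the es client and index name are assumptions). One caveat worth noting: a sampler aggregation keeps the top shard_size documents per shard by score, so on a multi-shard index this approximates "the latest 30" rather than giving an exact, date-sorted cut.

# `body` is the query above; the index name is hypothetical.
result = es.search(index="videosessions", body=body)
total = result["aggregations"]["videosession"]["sum_duration_seconds"]["value"]
print(total)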

Access nested objects in Elasticsearch using a script

I'm trying to use data from Elasticsearch 6 results to set up the scoring of my results.
Part of my mapping looks like:
{
  "properties": {
    "annotation_date": {
      "type": "date"
    },
    "annotation_date_time": {
      "type": "date"
    },
    "annotations": {
      "properties": {
        "details": {
          "type": "nested",
          "properties": {
            "filter": {
              "type": "text",
              "fielddata": True,
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "bucket": {
              "type": "text",
              "fielddata": True,
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "keyword": {
              "type": "text",
              "fielddata": True,
              "fields": {
                "keyword": {
                  "type": "keyword",
                  "ignore_above": 256
                }
              }
            },
            "frequency": {
              "type": "long"
            }
          }
        }
      }
    }
  }
}
Example part of a document JSON:
"annotations": {
"details": [
{
"filter": "filter_A",
"bucket": "bucket_A",
"keyword": "keyword_A",
"frequency": 6
},
{
"filter": "filter_B",
"bucket": "bucket_B",
"keyword": "keyword_B",
"frequency": 7
}
]
I want to use the frequency from my annotations.details if it hits a certain 'bucket', which I try to do with the following:
GET my_index/_search
{
  "size": 10000,
  "query": {
    "function_score": {
      "query": {
        "match": { "title": "<search term>" }
      },
      "script_score": {
        "script": {
          "lang": "painless",
          "source": """
            int score = 0;
            for (int i = 0; i < doc['annotations.details.filter'].length; i++) {
              if (doc['annotations.details.filter'][i].keyword == "bucket_A") {
                score += doc['annotations.details.frequency'][i].value;
              }
            }
            return score;
          """
        }
      }
    }
  }
}
Ultimately, this would mean that in this specific situation a score of 6 is expected. If it had hit more buckets, the score would be incremented by the frequency of each hit.
You should use a nested query, with bool/must combining match and range with gt.
Example:
GET /_search
{
  "query": {
    "nested": {
      "path": "obj1",
      "score_mode": "avg",
      "query": {
        "bool": {
          "must": [
            { "match": { "obj1.name": "blue" } },
            { "range": { "obj1.count": { "gt": 5 } } }
          ]
        }
      }
    }
  }
}
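Adapted to the mapping in the question, one possible shape (a sketch under assumptions, not the original answer's code) is a nested query with score_mode: sum wrapping a function_score, so each matching detail contributes its frequency to the document score:

# Sketch (assumption): score = sum of `frequency` over nested details
# whose bucket matches. boost_mode "replace" makes each nested hit's
# score equal to the field_value_factor (the frequency) instead of
# multiplying it with the match score.
query = {
    "size": 10000,
    "query": {
        "nested": {
            "path": "annotations.details",
            "score_mode": "sum",
            "query": {
                "function_score": {
                    "query": {"match": {"annotations.details.bucket": "bucket_A"}},
                    "field_value_factor": {
                        "field": "annotations.details.frequency",
                        "missing": 0
                    },
                    "boost_mode": "replace"
                }
            }
        }
    }
}
result = es.search(index="my_index", body=query)  # es client assumed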

elasticsearch.exceptions.TransportError: TransportError 503: Data too large

I'm trying to get a response from ES by hitting it from Python code, but it is showing the below error:
elasticsearch.exceptions.TransportError: TransportError(503, u'search_phase_execution_exception', u'[request] Data too large, data for [<agg [POSCodeModifier]>] would be [623327280/594.4mb], which is larger than the limit of [623326003/594.4mb]')
If I run the same query from Kibana I get the results, but from Python I get this error. I'm using an aggregation in my code. Can someone explain whether I need to set some properties, or how to optimise the query?
Below is the structure of the request I'm sending. If I set the start and end date more than 5 days apart it gives me the error; otherwise I get the results.
unmtchd_ESdata = es.search(index='cstore_new', body={
    'size': 0,
    "aggs": {
        "filtered": {
            "filter": {
                "bool": {
                    "must_not": [
                        {"match": {"CSPAccountNo": store_id}}
                    ],
                    "must": [
                        {
                            "range": {
                                "ReportDate": {"gte": start_dt, "lte": end_dt}
                            }
                        }
                    ]
                }
            },
            "aggs": {
                "POSCode": {
                    "terms": {"field": "POSCode", "size": 10000},
                    "aggs": {
                        "POSCodeModifier": {
                            "terms": {"field": "POSCodeModifier", "size": 10000},
                            "aggs": {
                                "CSP": {
                                    "terms": {"field": "CSPAccountNo", "size": 10000},
                                    "aggs": {
                                        "per_stock": {
                                            "date_histogram": {
                                                "field": "ReportDate",
                                                "interval": "week",
                                                "format": "yyyy-MM-dd",
                                                "min_doc_count": 0,
                                                "extended_bounds": {
                                                    "min": start_dt,
                                                    "max": end_dt
                                                }
                                            },
                                            "aggs": {
                                                "avg_week_qty_sales": {
                                                    "sum": {"field": "TotalCount"}
                                                }
                                            }
                                        },
                                        "market_week_metrics": {
                                            "extended_stats_bucket": {
                                                "buckets_path": "per_stock>avg_week_qty_sales"
                                            }
                                        }
                                    }
                                }
                            }
                        }
                    }
                }
            }
        }
    }
}, request_timeout=1000)
Edit 1:
The result variables needed from the Elasticsearch response:
buckets = unmtchd_ESdata['aggregations']['filtered']['POSCode']['buckets']
for bucket in buckets:
    metrics = bucket['POSCodeModifier']['buckets'][0]['CSP']['buckets'][0]['market_week_metrics']
    list6.append(metrics['avg'])
    list7.append(bucket['key'])
    list8.append(metrics['max'] - metrics['min'])
    list9.append(metrics['max'])
    list10.append(metrics['min'])
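No accepted fix appears in this thread, but since the request succeeds for ranges of up to 5 days, one workaround is to split the date range into small windows and query each window separately. The sketch below is only that, a sketch under that assumption: the market_week_metrics stats would then describe each window on its own and still need combining client-side. (The error itself comes from the request circuit breaker; raising indices.breaker.request.limit or shrinking the terms sizes are the other usual levers.)

from datetime import date, timedelta

def date_windows(start, end, days=5):
    # Yield consecutive (window_start, window_end) pairs covering [start, end].
    cur = start
    while cur <= end:
        nxt = min(cur + timedelta(days=days - 1), end)
        yield cur, nxt
        cur = nxt + timedelta(days=1)

# Hypothetical usage: one aggregation request per 5-day window.
for win_start, win_end in date_windows(date(2022, 4, 1), date(2022, 6, 30)):
    # Re-run the es.search(...) above with
    # gte=win_start.isoformat(), lte=win_end.isoformat()
    pass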

Elasticsearch query - print certain field based on other field

My goal is to find the max value in one field and print another field from the document where it was found.
My query so far:
{
  "fields": ["text"], // NOT WORKING
  "query": {
    "query_string": {
      "query": "_type:bmw AND _exists_:car_type",
      "analyze_wildcard": True
    }
  },
  "size": 0,
  "aggs": {
    "2": {
      "terms": {
        "field": "compound",
        "size": 5,
        "order": {
          "2-orderAgg": "desc"
        }
      },
      "aggs": {
        "2-orderAgg": {
          "max": {
            "field": "compound"
          }
        }
      }
    }
  }
}
The result is:
'buckets': [{'doc_count': 1, '2-orderAgg': {'value': 0.8442}, 'key': 0.8442}, {'doc_count': 1, '2-orderAgg': {'value': 0.7777}, 'key': 0.7777}, {'doc_count': 1, '2-orderAgg': {'value': 0.7579}, 'key': 0.7579}, {'doc_count': 1, '2-orderAgg': {'value': 0.6476}, 'key': 0.6476}, {'doc_count': 1, '2-orderAgg': {'value': 0.6369}, 'key': 0.6369}]
Now I need to print the text field of the document that contains the compound value 0.8442, and so on. Thank you for your advice.
I achieved this with a small workaround. It's not pretty, but in the end I get what I wanted.
First I used the response from the first query. Then I grabbed all the keys from that dictionary and performed a new query to find each document's id:
{
  "size": 0,
  "query": {
    "query_string": {
      "analyze_wildcard": True,
      "query": "_type:bmw AND compound:"+str(0.8442)+" AND _exists_:car_type"
    }
  },
  "aggs": {
    "3": {
      "terms": {
        "field": "id_str",
        "size": 20,
        "order": {
          "_count": "desc"
        }
      }
    }
  }
}
Then I iterate through the response and search for each document by this id field:
for y in res1:
    res3 = es.search(index='indexname', body={
        "size": 1,
        "query": {
            "bool": {
                "must": [
                    {"match": {"id_str": y['key']}}
                ]
            }
        }
    })
    for x in res3['hits']['hits']:
        print(x['_source']['text'])
Now the result is:
Diamond stitch leather is a great addition to any custom vehicle. Prices start from 2k! #bmw i8 getting under car...
which is the text I wanted.
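A simpler route, offered here as a sketch rather than the poster's method, is to skip the aggregation entirely: sort by compound descending, return the top hits, and read text straight from _source.

res = es.search(index='indexname', body={
    "size": 5,  # top 5 compound values, like the terms agg above
    "_source": ["text", "compound"],
    "query": {
        "query_string": {
            "query": "_type:bmw AND _exists_:car_type",
            "analyze_wildcard": True
        }
    },
    "sort": [{"compound": {"order": "desc"}}]
})
for hit in res['hits']['hits']:
    print(hit['_source']['compound'], hit['_source']['text'])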
