reindex elasticsearch api timeout with large size of document [duplicate] - python

This question already has answers here:
Elasticsearch reindex error - client request timeout
(2 answers)
Closed 3 years ago.
I am re-indexing an index from Python, but the amount of data is large (6 GB) and the reindex takes about 60 minutes, so the API call times out.
Code:
def Reindex(src, dest):
    """Copy documents older than 15 days from ``src`` into ``dest``.

    Issues a blocking reindex request through the ``es`` client that is
    defined outside this function and waits for it to complete.

    Args:
        src: Name of the index to read documents from.
        dest: Name of the index to write documents to.

    Returns:
        The response returned by ``es.reindex`` on success, or ``None`` if
        the request raised (for example because the client timed out).
    """
    import logging

    query = {
        "source": {
            "index": src,
            # Only documents whose UTC_date is earlier than the start of the
            # day 15 days ago are copied.
            "query": {"range": {"UTC_date": {"lt": "now-15d/d"}}},
        },
        "dest": {"index": dest},
    }
    try:
        # The client gives up after 300 seconds; a reindex that runs longer
        # than that surfaces here as an exception.
        return es.reindex(query, wait_for_completion=True, request_timeout=300)
    except Exception:
        # Best effort: do not crash the caller, but never hide the failure.
        logging.getLogger(__name__).exception(
            "Reindex from %s to %s failed", src, dest
        )
        return None

I found a solution: because I am reindexing 6 GB, it takes more time, so I increased the timeout and now it works.
def Reindex(src, dest):
    """Reindex documents older than one day from ``src`` into ``dest``.

    Runs a blocking reindex through the ``es`` client defined outside this
    function, then writes a JSON status record to ``access_logger``. When
    every matched document was either created or updated in ``dest``, the
    run is marked as a success and ``Delete(src)`` is called.

    Args:
        src: Name of the index to read documents from.
        dest: Name of the index to write documents to.

    Returns:
        None. The outcome is reported through ``access_logger``.
    """
    print("[X] START Reindex")
    query = {
        "source": {
            "index": src,
            # Only documents whose UTC_date is earlier than the start of
            # yesterday are copied.
            "query": {"range": {"UTC_date": {"lt": "now-1d/d"}}},
        },
        "dest": {"index": dest},
    }

    try:
        # A generous client timeout: reindexing several GB takes far longer
        # than the client's default. Version conflicts do not abort the run.
        result = es.reindex(
            query,
            wait_for_completion=True,
            request_timeout=10000,
            conflicts="proceed",
        )
    except Exception as exc:
        # Record the failure instead of letting it disappear silently.
        failure = {
            "Timestamp": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
            "status": "failure",
            "error": str(exc),
        }
        access_logger.info(json.dumps(failure))
        return

    print(result)
    log_dict = {
        "total": result["total"],
        "created": result["created"],
        "updated": result["updated"],
        "Timestamp": datetime.datetime.now().strftime("%Y-%m-%dT%H:%M:%S"),
    }
    if log_dict["total"] == log_dict["created"] + log_dict["updated"]:
        log_dict["status"] = "success"
        # Delete is defined elsewhere; it is only invoked once every
        # matched document is accounted for in the destination index.
        Delete(src)
    else:
        log_dict["status"] = "failure"
    access_logger.info(json.dumps(log_dict))

Related

Elasticsearch: getting an error when trying to query on time of day

I have documents with timestamp of following format:
2022-11-17T17:16:26.397Z
I am trying to get all documents between two dates, restricted on each of those days to a time window of, let's say, 11:05 to 15:05.
This is my query:
"query": {
"bool": {
"filter": {
"bool": {
"must": [
{
"range": {
"timestamp": {
"gte": "2022-11-01",
"lte": "2022-11-30"
}
}
}, {
"script": {
"script": {
"source": "doc.timestamp.getHourOfDay() >= params.min && doc.timestamp.getHourOfDay() <= params.max",
"params": {
"min": 11,
"max": 15
}
}
}
}
]
}
}
}
}
}
EDIT (@rabbitbt): I ran your query on two different documents.
After a lot of testing with your query, I found out that it gives a runtime error whenever the hour in the timestamp starts with a 0, i.e. there is a 0 directly after the T.
For example
"timestamp": "2022-11-07T01:04:39.357551"
Any idea how i can change the query to fix this?
Thanks for all the help, in the end i got it working by replacing the line in my original query:
"source": "doc.timestamp.getHourOfDay() >= params.min && doc.timestamp.getHourOfDay() <= params.max",
to
"source": "doc['timestamp'].value.getHour() >= params.min && doc['timestamp'].value.getHour() <= params.max",
My suggestion:
POST bar/_doc
{
"date":"2022-11-14T11:12:46"
}
Python code
# Index a sample document; refresh="wait_for" is passed so the call returns
# only once the document can be found by the search below.
doc = {"date": "2022-11-17T11:16:26.397Z"}
response_index = get_client_es().index(index="bar", body=doc, refresh="wait_for")
print(response_index)

# Match documents whose date lies in November 2022 AND whose time of day
# (hour and minute only) is strictly between timeFrom and timeTo.
query = {
    "query": {
        "bool": {
            "filter": [
                {
                    "range": {
                        "date": {
                            "gte": "2022-11-01",
                            "lte": "2022-11-30"
                        }
                    }
                },
                {
                    "script": {
                        "script": {
                            "lang": "painless",
                            # The time of day is built with LocalTime.of()
                            # rather than by parsing a hand-made "H:MM"
                            # string: parsing fails for hours below 10
                            # because the hour is not zero-padded. The script
                            # also returns a boolean on every path.
                            "source": """
                                def targetDate = doc['date'].value;
                                def timeFrom = LocalTime.parse(params.timeFrom);
                                def timeTo = LocalTime.parse(params.timeTo);
                                def target = LocalTime.of(targetDate.getHour(), targetDate.getMinute());
                                return target.isAfter(timeFrom) && target.isBefore(timeTo);
                            """,
                            "params": {
                                "timeFrom": "10:30",
                                "timeTo": "15:13"
                            }
                        }
                    }
                }
            ]
        }
    }
}
result = get_client_es().search(index="bar", body=query)
print(result)

Aggregation query fails using ElasticSearch Python client

Here is an aggregation query that works as expected when I run it in Dev Tools against Elasticsearch:
# Aggregation-only request: documents are bucketed by "identiferid", each
# bucket is split into one-day histogram buckets on "endTime", and two
# scripted sums are computed per day. "size": 0 suppresses the hit list.
search_query = {
    "aggs": {
        "SHAID": {
            "terms": {
                "field": "identiferid",
                # Buckets are ordered by the "sort" sub-aggregation below.
                "order": {
                    "sort": "desc"
                },
                # "size": 100000
            },
            "aggs": {
                "update": {
                    "date_histogram": {
                        "field": "endTime",
                        "calendar_interval": "1d"
                    },
                    "aggs": {
                        "update1": {
                            "sum": {
                                "script": {
                                    "lang": "painless",
                                    # Prefer att, fall back to att2.
                                    "source": """
                                        if (doc['distanceIndex.att'].size()!=0) {
                                            return doc['distanceIndex.att'].value;
                                        }
                                        else {
                                            if (doc['distanceIndex.att2'].size()!=0) {
                                                return doc['distanceIndex.att2'].value;
                                            }
                                            return null;
                                        }
                                    """
                                }
                            }
                        },
                        "update2": {
                            "sum": {
                                "script": {
                                    "lang": "painless",
                                    # Prefer att3, fall back to att4. The
                                    # guard checks the same field (att4) whose
                                    # value is then returned.
                                    "source": """
                                        if (doc['distanceIndex.att3'].size()!=0) {
                                            return doc['distanceIndex.att3'].value;
                                        }
                                        else {
                                            if (doc['distanceIndex.att4'].size()!=0) {
                                                return doc['distanceIndex.att4'].value;
                                            }
                                            return null;
                                        }
                                    """
                                }
                            }
                        },
                    }
                },
                "sort": {
                    "sum": {
                        "field": "time2"
                    }
                }
            }
        }
    },
    "size": 0,
    "query": {
        "bool": {
            "filter": [
                {
                    "match_all": {}
                },
                {
                    "range": {
                        "endTime": {
                            "gte": "2021-11-01T00:00:00Z",
                            "lt": "2021-11-03T00:00:00Z"
                        }
                    }
                }
            ]
        }
    }
}
When I attempt to execute this aggregation using the Python ElasticSearch client (https://elasticsearch-py.readthedocs.io/en/v7.15.1/) I receive the exception :
exception search() got multiple values for keyword argument 'size'
If I remove the attribute :
"size": 0,
From the query then the exception is not thrown but the aggregation does not run as "size": 0, is required for an aggregation.
Is there a different query format I should use for performing aggregations using the Python ElasticSearch client ?
Update :
Here is code used to invoke the query :
import elasticsearch
from elasticsearch import Elasticsearch, helpers
es_client = Elasticsearch(
["https://test-elastic.com"],
scheme="https",
port=443,
http_auth=("test-user", "test-password"),
maxsize=400,
timeout=120,
max_retries=10,
retry_on_timeout=True
)
query_response = helpers.scan(client=es_client,
query=search_query,
index="test_index",
clear_scroll=False,
request_timeout=1500)
rows = []
try:
for row in query_response:
rows.append(row)
except Exception as e:
print('exception' , e)
Using es_client :
es_client.search(index="test_index", query=search_query)
results in error :
/opt/oss/conda3/lib/python3.7/site-packages/elasticsearch/connection/base.py in _raise_error(self, status_code, raw_data)
336
337 raise HTTP_EXCEPTIONS.get(status_code, TransportError)(
--> 338 status_code, error_message, additional_info
339 )
340
RequestError: RequestError(400, 'parsing_exception', 'unknown query [aggs]')
Is aggs valid for search api ?
helpers.scan is, quoting the documentation, a
"Simple abstraction on top of the scroll() api - a simple iterator that yields all hits as returned by underlying scroll requests."
It's meant to iterate through large result sets and comes with a default keyword argument of size=1000, which is why passing "size" inside the query as well raises the "multiple values" exception.
To run an aggregation, use the es_client.search() method directly, passing in your query as body, and including "size": 0 in the query should be fine.

Query an elasticsearch index by an attribute, with a given range?

I want to query my index so that it matches whenever a particular attribute shows up called sitename, but I want all the data from a certain time range. I thought it might be something of the below but unsure:
{
"query": {
"range": {
"timestamp": {
"gte": "now-1h/h",
"lt": "now/h"
}
},
"match": {"sitename" : "HARB00ZAF0" }
}
}
You're almost there, but you need to leverage the bool queries
{
"query": {
"bool": {
"filter": [
{
"range": {
"timestamp": {
"gte": "now-1h/h",
"lt": "now/h"
}
}
}
],
"must": [
{
"match": {
"sitename": "HARB00ZAF0"
}
}
]
}
}
}

How to check for key(last_password_changed_at) exists in index if not exists, Update the `field' in the accounts index to current date using python

Currently I am working on an API in Python that checks password validity (users are required to change their password every 60 days). It needs to check whether the key last_password_changed_at exists in the accounts index. If it exists, I take its value and compare it with today's datetime to get the number of days elapsed, check the condition (<= 60 or > 60), and return a should_update flag with a Boolean value. If it does not exist, I update `last_password_changed_at` in the accounts index to the current date and time. This is my code.
from elasticsearch import Elasticsearch
client = Elasticsearch([{'host': 'localhost', }])
profile = client.search(
index='accounts',
doc_type='accounts',
scroll='5m',
size=100,
body={
"query": {
"match": {
"username": request_object.username_or_email
}
}
})
import datetime
current_datetime = datetime.datetime.now()
for info in profile['hits']['hits']:
if (info['_source']['last_password_changed_at']):
format_last_password_changed_at = datetime.datetime.strptime((info['_source']['last_password_changed_at']),'%Y-%m-%dT%H:%M:%S.%f')
days_diff = (current_datetime - format_last_password_changed_at).days
# if (days_diff == 0) or (days_diff <= 60):
if (days_diff > 60):
last_password_changed_at = format_last_password_changed_at
return jsonify({"should_update": bool(days_diff), "last_password_update": last_password_changed_at })
else:
# elif (days_diff == 0) or (days_diff <= 60):
last_password_changed_at = format_last_password_changed_at
return jsonify({"should_update": bool(days_diff), "last_password_update": last_password_changed_at })
if (info['_source']['last_password_changed_at'] not in info):
print('if field not exists, creating the field and updating last_password_changed_at to current datetime')
updating_current_datetime = info.update(
body={
"script": {
"source": "ctx._source.last_password_changed_at+= last_password_changed_at",
"params": {
"last_password_changed_at": current_datetime
},
},
"query": {
"match": {
"_id": info['_id']
}
}
}
)
# i['_source']['last_password_changed_at'] = current_datetime.strftime('%Y-%m-%d %H:%M:%S.%f')
return jsonify({"should_update": 'false', "last_password_update": updating_current_datetime})
**Expected output 1:**(If >60 days)
{
"should_update": true,
"last_password_update": "2021-01-03T08:32:36.054082",
}
**Expected output 2:**(If <=60 days)
{
"should_update": False,
"last_password_update": "2021-01-03T08:32:36.054082",
}
Here is my account index
{
"settings": {
},
"mappings": {
"accounts": {
"dynamic": false,
"properties": {
"username": {
"type": "text",
"fields": {
"raw": {
"type": "keyword",
}
}
},
"password": {
"type": "keyword"
},
"email": {
"type": "keyword",
"fields": {
"search": {
"type": "text",
}
}
},
"last_password_changed_at": {
"type": "date"
}
}
}
}
}
The above code works for user accounts that have the last_password_changed_at field, but it does not work for accounts that don't have it; in other words, only the first `if` condition ever works.
Error:
if (info['_source']['last_password_changed_at']):
KeyError: 'last_password_changed_at'
please help me guys
# Report, for every hit, whether the document carries a
# last_password_changed_at value.
for info in profile['hits']['hits']:
    item = info['_source']
    # dict.get() yields None for a missing key, whereas indexing with
    # item['last_password_changed_at'] raises KeyError for such documents.
    last_password_changed_details = item.get('last_password_changed_at')
    if last_password_changed_details:
        print('I have a field')
    else:
        print('i dont have a field')

Get elasticsearch documents older than a certain age in minutes

Some of my documents have a field, set if they have been individually queried before, that holds a Unix timestamp:
"timelock": 1,561,081,724.254
Some documents don't have this field if they've never been individually queried. I would like a query that only returns documents that either DO NOT have the field, or have the field but the difference between its timestamp and the current time is greater than 10 minutes (600 seconds).
documents = es.search(index='index', size=10000, body={
"query": {
"bool": {
"must": [
{
"match_all": {}
},
],
"filter": [],
"should": [],
"must_not": [
]
}
}})
So I guess in pseudo-code I'd do it like:
if 'timelock' exists:
if current_time - 'timelock' > 600:
include in query
else:
exclude from query
else:
include in query
I'm using the python module for ES.
Why not simply using date math ?
{
"query": {
"bool": {
"minimum_should_match": 1,
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "timelock"
}
}
]
}
},
{
"range": {
"timelock": {
"lt": "now-10m"
}
}
}
]
}
}
}
I'm not familiar with Python syntax, but what I can suggest in pseudo-code is to use the logic below:
compare_timestamp = current_timestamp - 600
if 'timelock' exists:
if timelock < compare_timestamp:
include document
else:
exclude document
else:
include document
You can easily compute compare_timestamp in your Python script. This value can then be used in the Elasticsearch query below:
{
"query": {
"bool": {
"should": [
{
"bool": {
"must_not": [
{
"exists": {
"field": "timelock"
}
}
]
}
},
{
"range": {
"timelock": {
"lt": compare_timestamp
}
}
}
]
}
}
}

Categories

Resources