Pytest with moto: change the status of an Athena query using the backend - python

I am using moto to test AWS functionality in my codebase. One of the issues I have run into is that when testing Athena, the query status stays in "QUEUED" indefinitely, causing the test to fail or time out.
Here is the method to be tested:
import time
import boto3
class Athena:
CLIENT = boto3.client("athena")
class QueryError(Exception):
"""A class for exceptions related to queries."""
@classmethod
def execute_query(cls, query, result_location, check_status=True,
time_limit=10):
"""
Execute a query in Athena.
"""
_result_configuration = {"OutputLocation": result_location}
_kwargs = {"QueryString": query, "ResultConfiguration":
_result_configuration}
response = cls.CLIENT.start_query_execution(**_kwargs)
query_id = response["QueryExecutionId"]
if check_status:
old_time = time.time()
while True:
status = cls.CLIENT.get_query_execution(
QueryExecutionId=query_id)
status = status["QueryExecution"]["Status"]["State"]
if status in ["SUCCEEDED", "FAILED", "CANCELLED"]:
if status == "FAILED":
raise cls.QueryError("error")
break
time.sleep(0.2) # 200ms
if time.time() - old_time > time_limit and status == "QUEUED":
raise cls.QueryError("time limit reached")
return query_id
Here is the fixture passed into the test:
from moto.s3 import mock_s3
import boto3
@pytest.fixture
def s3():
with mock_s3():
s3 = boto3.client("s3")
yield s3
Here is the test (keep in mind you need to replace "from x" with an import from the module that contains the method above):
import uuid
import boto3
import pytest
from moto.athena import mock_athena
from moto.s3 import mock_s3
@mock_s3
@mock_athena
def test_execute_query_check(s3):
from x import Athena
"""
Test for 'execute_query' (with status check)
"""
CLIENT = s3
bucket_name = "pytest." + str(uuid.uuid4())
# Bucket creation
bucket_config = {"LocationConstraint": "us-east-2"}
CLIENT.create_bucket(Bucket=bucket_name,
CreateBucketConfiguration=bucket_config)
waiter = CLIENT.get_waiter("bucket_exists")
waiter.wait(Bucket=bucket_name)
s3_location = f"s3://{bucket_name}/"
query = "SELECT current_date, current_time;"
query_id = Athena.execute_query(query, s3_location,
check_status=True)
assert query_id
This test fails because moto does not move the status of the query past "QUEUED", and the test expects the state to change; otherwise it raises an exception.
I would like to be able to do something like:
from moto.athena import athena_backends
athena_backends['us-east-2'].job_flows[query_id].state = "SUCCEEDED"
as was suggested in this issue: https://github.com/spulec/moto/issues/380
However, the "job_flows" attribute does not seem to exist anymore on the moto MapReduce (EMR) backend, and I can't find a method to explicitly change it.
Ideally this would be able to happen somewhere in the test to manually change the state of the query to simulate how it would be with actual resources.

State can be accessed and changed as follows:
athena_backends['us-east-2'].executions.get(query_id).status
Sample code snippet
from moto.athena import athena_backends
query = "SELECT stuff"
location = "s3://bucket-name/prefix/"
database = "database"
# Start Query
exec_id = self.client.start_query_execution(
QueryString=query,
QueryExecutionContext={"Database": database},
ResultConfiguration={"OutputLocation": location},
)["QueryExecutionId"]
athena_backends['us-east-2'].executions.get(exec_id).status = "CANCELLED"
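Putting those pieces together, here is a minimal self-contained sketch of forcing the state. It is only a sketch: it assumes a moto version where athena_backends is keyed by region, as in the snippet above; newer moto releases key the backends by account id and then region, and moto 5 replaces the per-service decorators with mock_aws.
import boto3
from moto.athena import athena_backends, mock_athena

@mock_athena
def test_force_query_state():
    client = boto3.client("athena", region_name="us-east-2")
    query_id = client.start_query_execution(
        QueryString="SELECT 1",
        ResultConfiguration={"OutputLocation": "s3://some-bucket/prefix/"},
    )["QueryExecutionId"]
    # flip the execution state on moto's in-memory backend
    athena_backends["us-east-2"].executions.get(query_id).status = "SUCCEEDED"
    state = client.get_query_execution(QueryExecutionId=query_id)
    assert state["QueryExecution"]["Status"]["State"] == "SUCCEEDED"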

It seems to me that moto only returns QUEUED for start_query_execution; you can take a look at the source code here.
Another approach is using unittest.mock, and then you can do something like:
cls.CLIENT = mock.Mock()
cls.CLIENT.start_query_execution.side_effect = [
'QUEUED',
'SUCCEEDED'
]
So the first time cls.CLIENT.start_query_execution(..) is called, it will report that the query is queued, but the second time it will report that it succeeded, so you can test both execution paths.
Also, with moto you won't be able to test all the cases, because apart from the QUEUED status, you can only set the query status to CANCELLED, as you can see here.
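Since the status in the execute_query method above actually comes from get_query_execution rather than start_query_execution, a minimal sketch of that unittest.mock idea applied to the Athena class could look like this (the response dicts contain only the keys the method reads, which is a simplifying assumption):
from unittest import mock

def test_execute_query_with_mocked_client():
    # Athena is the class from the question ("from x import Athena")
    fake_client = mock.Mock()
    fake_client.start_query_execution.return_value = {"QueryExecutionId": "fake-id"}
    # the first poll reports QUEUED, the second reports SUCCEEDED
    fake_client.get_query_execution.side_effect = [
        {"QueryExecution": {"Status": {"State": "QUEUED"}}},
        {"QueryExecution": {"Status": {"State": "SUCCEEDED"}}},
    ]
    with mock.patch.object(Athena, "CLIENT", fake_client):
        query_id = Athena.execute_query("SELECT 1", "s3://bucket/prefix/")
    assert query_id == "fake-id"
    assert fake_client.get_query_execution.call_count == 2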

Related

Why do I open the same database with SQLAlchemy but get different results, and how can I fix it?

I wrote some tests with pytest; I want to test creating a user and sending the email with the POST method.
With some debugging, I know the issue is that I open two in-memory databases, but they are the same SessionLocal() database.
So how can I fix this? I tried db.flush(), but it doesn't work.
This is the POST method code:
@router.post("/", response_model=schemas.User)
def create_user(
*,
db: Session = Depends(deps.get_db), #the get_db is SessionLocal()
user_in: schemas.UserCreate,
current_user: models.User = Depends(deps.get_current_active_superuser),
) -> Any:
"""
Create new user.
"""
user = crud.user.get_by_email(db, email=user_in.email)
if user:
raise HTTPException(
status_code=400,
detail="The user with this username already exists in the system.",
)
user = crud.user.create(db, obj_in=user_in)
print("====post====")
print(db.query(models.User).count())
print(db)
if settings.EMAILS_ENABLED and user_in.email:
send_new_account_email(
email_to=user_in.email, username=user_in.email, password=user_in.password
)
return user
and the test code is:
def test_create_user_new_email(
client: TestClient, superuser_token_headers: dict, db: Session # db is SessionLocal()
) -> None:
username = random_email()
password = random_lower_string()
data = {"email": username, "password": password}
r = client.post(
f"{settings.API_V1_STR}/users/", headers=superuser_token_headers, json=data,
)
assert 200 <= r.status_code < 300
created_user = r.json()
print("====test====")
print(db.query(User).count())
print(db)
user = crud.user.get_by_email(db, email=username)
assert user
assert user.email == created_user["email"]
and the test result is
> assert user
E assert None
====post====
320
<sqlalchemy.orm.session.Session object at 0x7f0a9f660910>
====test====
319
<sqlalchemy.orm.session.Session object at 0x7f0aa09c4d60>
Your code does not provide enough information to help you; the key issues are probably in what is hidden and only explained by your comments.
It also seems like you are confusing SQLAlchemy sessions and databases. If you are not familiar with these concepts, I highly recommend having a look at the SQLAlchemy documentation.
But, looking at your code structure, it seems like you are using FastAPI.
Then, if you want to test SQLAlchemy with pytest, I recommend using pytest fixtures with SQL transactions.
Here is my suggestion on how to implement such a test. I'll suppose that you want to run the test on your actual database and not create a new database especially for the tests. This implementation is heavily based on this GitHub gist (the author made a "feel free to use" statement, so I suppose he is ok with me copying his code here):
# test.py
import pytest
from sqlalchemy import create_engine
from sqlalchemy.orm import Session
from fastapi.testclient import TestClient
from myapp.models import BaseModel
from myapp.main import app # import your fastapi app
from myapp.database import get_db # import the dependency
client = TestClient(app)
# scope="session" means that the engine will last for the whole test session
@pytest.fixture(scope="session")
def engine():
return create_engine("postgresql://localhost/test_database")
# at the end of the test session, drop the created metadata using a fixture with yield
@pytest.fixture(scope="session")
def tables(engine):
BaseModel.metadata.create_all(engine)
yield
BaseModel.metadata.drop_all(engine)
# here scope="function" (the default), so each time a test finishes, the database is cleaned
@pytest.fixture
def dbsession(engine, tables):
"""Returns an sqlalchemy session, and after the test tears down everything properly."""
connection = engine.connect()
# begin the nested transaction
transaction = connection.begin()
# use the connection with the already started transaction
session = Session(bind=connection)
yield session
session.close()
# roll back the broader transaction
transaction.rollback()
# put back the connection to the connection pool
connection.close()
## end of the gist.github code
@pytest.fixture
def db_fastapi(dbsession):
def override_get_db():
db = dbsession
try:
yield db
finally:
db.close()
client.app.dependency_overrides[get_db] = override_get_db
yield dbsession
# Now you can run your test
def test_create_user_new_email(db_fastapi):
username = random_email()
# ...

Google Cloud Functions randomly retrying on success

I have a Google Cloud Function triggered by a Pub/Sub topic. The doc states messages are acknowledged when the function ends with success.
link
But randomly, the function retries (same execution ID) exactly 10 minutes after execution. That is the Pub/Sub ack max timeout.
I also tried to get the message ID and acknowledge it programmatically in the function code, but the Pub/Sub API responds that there is no message to ack with that ID.
In StackDriver monitoring, I see some messages not being acknowledged.
Here is my code: main.py
import base64
import logging
import traceback
from google.api_core import exceptions
from google.cloud import bigquery, error_reporting, firestore, pubsub
from sql_runner.runner import orchestrator
logging.getLogger().setLevel(logging.INFO)
def main(event, context):
bigquery_client = bigquery.Client()
firestore_client = firestore.Client()
publisher_client = pubsub.PublisherClient()
subscriber_client = pubsub.SubscriberClient()
logging.info(
'event=%s',
event
)
logging.info(
'context=%s',
context
)
try:
query_id = base64.b64decode(event.get('data',b'')).decode('utf-8')
logging.info(
'query_id=%s',
query_id
)
# inject dependencies
orchestrator(
query_id,
bigquery_client,
firestore_client,
publisher_client
)
sub_path = (context.resource['name']
.replace('topics', 'subscriptions')
.replace('function-sql-runner', 'gcf-sql-runner-europe-west1-function-sql-runner')
)
# explicitly ack the message to avoid duplicate invocations
try:
subscriber_client.acknowledge(
sub_path,
[context.event_id] # message_id to ack
)
logging.warning(
'message_id %s acknowledged (FORCED)',
context.event_id
)
except exceptions.InvalidArgument as err:
# google.api_core.exceptions.InvalidArgument: 400 You have passed an invalid ack ID to the service (ack_id=982967258971474).
logging.info(
'message_id %s already acknowledged',
context.event_id
)
logging.debug(err)
except Exception as err:
# catch all exceptions and log to prevent cold boot
# report with error_reporting
error_reporting.Client().report_exception()
logging.critical(
'Internal error : %s -> %s',
str(err),
traceback.format_exc()
)
if __name__ == '__main__': # for testing
from collections import namedtuple # use namedtuple to avoid Class creation
Context = namedtuple('Context', 'event_id resource')
context = Context('666', {'name': 'projects/my-dev/topics/function-sql-runner'})
script_to_start = b' ' # launch the 1st script
script_to_start = b'060-cartes.sql'
main(
event={"data": base64.b64encode(script_to_start)},
context=context
)
Here is my code: runner.py
import logging
import os
from retry import retry
PROJECT_ID = os.getenv('GCLOUD_PROJECT') or 'my-dev'
def orchestrator(query_id, bigquery_client, firestore_client, publisher_client):
"""
if query_id empty, start the first sql script
else, call the given query_id.
Anyway, call the next script.
If the sql script is the last, no call
retrieve SQL queries from FireStore
run queries on BigQuery
"""
docs_refs = [
doc_ref.get() for doc_ref in
firestore_client.collection(u'sql_scripts').list_documents()
]
sorted_queries = sorted(docs_refs, key=lambda x: x.id)
if not bool(query_id.strip()) : # first execution
current_index = 0
else:
# find the query to run
query_ids = [ query_doc.id for query_doc in sorted_queries]
current_index = query_ids.index(query_id)
query_doc = sorted_queries[current_index]
bigquery_client.query(
query_doc.to_dict()['request'], # sql query
).result()
logging.info(
'Query %s executed',
query_doc.id
)
# exit if the current query is the last
if len(sorted_queries) == current_index + 1:
logging.info('All scripts were executed.')
return
next_query_id = sorted_queries[current_index+1].id.encode('utf-8')
publish(publisher_client, next_query_id)
@retry(tries=5)
def publish(publisher_client, next_query_id):
"""
send a message in pubsub to call the next query
this mechanism allows running one sql script per Function instance
so as to not exceed the 9min deadline limit
"""
logging.info('Calling next query %s', next_query_id)
future = publisher_client.publish(
topic='projects/{}/topics/function-sql-runner'.format(PROJECT_ID),
data=next_query_id
)
# ensure publish is successful
message_id = future.result()
logging.info('Published message_id = %s', message_id)
It looks like the Pub/Sub message is not acked on success.
I do not think I have background activity in my code.
My question: why is my function randomly retrying even on success?
Cloud Functions does not guarantee that your functions will run exactly once. According to the documentation, background functions, including pubsub functions, are given an at-least-once guarantee:
Background functions are invoked at least once. This is because of the
asynchronous nature of handling events, in which there is no caller
that waits for the response. The system might, in rare circumstances,
invoke a background function more than once in order to ensure
delivery of the event. If a background function invocation fails with
an error, it will not be invoked again unless retries on failure are
enabled for that function.
Your code will need to expect that it could possibly receive an event more than once. As such, your code should be idempotent:
To make sure that your function behaves correctly on retried execution
attempts, you should make it idempotent by implementing it so that an
event results in the desired results (and side effects) even if it is
delivered multiple times. In the case of HTTP functions, this also
means returning the desired value even if the caller retries calls to
the HTTP function endpoint. See Retrying Background Functions for more
information on how to make your function idempotent.
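For example, since the function above already has a Firestore client, one hedged way to make it idempotent is to record each event id and skip duplicates. This is only a sketch: the 'processed_events' collection name is an arbitrary choice, and it relies on Firestore's create() raising AlreadyExists when the document already exists.
from google.api_core import exceptions
from google.cloud import firestore

def already_processed(firestore_client, event_id):
    """Return True if this Pub/Sub event id was handled before, else record it."""
    doc_ref = firestore_client.collection(u'processed_events').document(event_id)
    try:
        doc_ref.create({u'processed_at': firestore.SERVER_TIMESTAMP})
        return False
    except exceptions.AlreadyExists:
        return True

# inside main(), before calling orchestrator():
# if already_processed(firestore_client, context.event_id):
#     logging.info('event %s already processed, skipping', context.event_id)
#     return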

Flask Celery task locking

I am using Flask with Celery and I am trying to lock a specific task so that it can only be run one at a time. The Celery docs give an example of doing this: Celery docs, Ensuring a task is only executed one at a time. The example given was for Django; however, I am using Flask. I have done my best to convert it to work with Flask, but I still see that myTask1, which has the lock, can be run multiple times.
One thing that is not clear to me is whether I am using the cache correctly; I have never used it before, so all of it is new to me. One thing from the docs that is mentioned but not explained is this:
Doc Notes:
In order for this to work correctly you need to be using a cache backend where the .add operation is atomic. memcached is known to work well for this purpose.
I'm not really sure what that means. Should I be using the cache in conjunction with a database, and if so, how would I do that? I am using MongoDB. In my code I just have this setup for the cache, cache = Cache(app, config={'CACHE_TYPE': 'simple'}), as that is what was mentioned in the Flask-Cache docs (Flask-Cache Docs).
Another thing that is not clear to me is whether there is anything different I need to do, as I am calling myTask1 from within my Flask route task1.
Here is an example of my code that I am using.
from flask import (Flask, render_template, flash, redirect,
url_for, session, logging, request, g, render_template_string, jsonify)
from flask_caching import Cache
from contextlib import contextmanager
from celery import Celery
from Flask_celery import make_celery
from celery.result import AsyncResult
from celery.utils.log import get_task_logger
from celery.five import monotonic
from flask_pymongo import PyMongo
from hashlib import md5
import pymongo
import time
app = Flask(__name__)
cache = Cache(app, config={'CACHE_TYPE': 'simple'})
app.config['SECRET_KEY']= 'super secret key for me123456789987654321'
######################
# MONGODB SETUP
#####################
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_DBNAME'] = 'celery-test-db'
app.config["MONGO_URI"] = 'mongodb://localhost:27017/celery-test-db'
mongo = PyMongo(app)
##############################
# CELERY ARGUMENTS
##############################
app.config['CELERY_BROKER_URL'] = 'amqp://localhost//'
app.config['CELERY_RESULT_BACKEND'] = 'mongodb://localhost:27017/celery-test-db'
app.config['CELERY_RESULT_BACKEND'] = 'mongodb'
app.config['CELERY_MONGODB_BACKEND_SETTINGS'] = {
"host": "localhost",
"port": 27017,
"database": "celery-test-db",
"taskmeta_collection": "celery_jobs",
}
app.config['CELERY_TASK_SERIALIZER'] = 'json'
celery = Celery('task',broker='mongodb://localhost:27017/jobs')
celery = make_celery(app)
LOCK_EXPIRE = 60 * 2 # Lock expires in 2 minutes
@contextmanager
def memcache_lock(lock_id, oid):
timeout_at = monotonic() + LOCK_EXPIRE - 3
# cache.add fails if the key already exists
status = cache.add(lock_id, oid, LOCK_EXPIRE)
try:
yield status
finally:
# memcache delete is very slow, but we have to use it to take
# advantage of using add() for atomic locking
if monotonic() < timeout_at and status:
# don't release the lock if we exceeded the timeout
# to lessen the chance of releasing an expired lock
# owned by someone else
# also don't release the lock if we didn't acquire it
cache.delete(lock_id)
@celery.task(bind=True, name='app.myTask1')
def myTask1(self):
self.update_state(state='IN TASK')
lock_id = self.name
with memcache_lock(lock_id, self.app.oid) as acquired:
if acquired:
# do work if we got the lock
print('acquired is {}'.format(acquired))
self.update_state(state='DOING WORK')
time.sleep(90)
return 'result'
# otherwise, the lock was already in use
raise self.retry(countdown=60) # redeliver message to the queue, so the work can be done later
@celery.task(bind=True, name='app.myTask2')
def myTask2(self):
print('you are in task2')
self.update_state(state='STARTING')
time.sleep(120)
print('task2 done')
@app.route('/', methods=['GET', 'POST'])
def index():
return render_template('index.html')
@app.route('/task1', methods=['GET', 'POST'])
def task1():
print('running task1')
result = myTask1.delay()
# get async task id
taskResult = AsyncResult(result.task_id)
# push async taskid into db collection job_task_id
mongo.db.job_task_id.insert({'taskid': str(taskResult), 'TaskName': 'task1'})
return render_template('task1.html')
@app.route('/task2', methods=['GET', 'POST'])
def task2():
print('running task2')
result = myTask2.delay()
# get async task id
taskResult = AsyncResult(result.task_id)
# push async taskid into db collection job_task_id
mongo.db.job_task_id.insert({'taskid': str(taskResult), 'TaskName': 'task2'})
return render_template('task2.html')
@app.route('/status', methods=['GET', 'POST'])
def status():
taskid_list = []
task_state_list = []
TaskName_list = []
allAsyncData = mongo.db.job_task_id.find()
for doc in allAsyncData:
try:
taskid_list.append(doc['taskid'])
except:
print('error with db connection in asyncJobStatus')
TaskName_list.append(doc['TaskName'])
# PASS TASK ID TO ASYNC RESULT TO GET TASK RESULT FOR THAT SPECIFIC TASK
for item in taskid_list:
try:
task_state_list.append(myTask1.AsyncResult(item).state)
except:
task_state_list.append('UNKNOWN')
return render_template('status.html', data_list=zip(task_state_list, TaskName_list))
Final Working Code
from flask import (Flask, render_template, flash, redirect,
url_for, session, logging, request, g, render_template_string, jsonify)
from flask_caching import Cache
from contextlib import contextmanager
from celery import Celery
from Flask_celery import make_celery
from celery.result import AsyncResult
from celery.utils.log import get_task_logger
from celery.five import monotonic
from flask_pymongo import PyMongo
from hashlib import md5
import pymongo
import time
import redis
from flask_redis import FlaskRedis
app = Flask(__name__)
# ADDING REDIS
redis_store = FlaskRedis(app)
# POINTING CACHE_TYPE TO REDIS
cache = Cache(app, config={'CACHE_TYPE': 'redis'})
app.config['SECRET_KEY']= 'super secret key for me123456789987654321'
######################
# MONGODB SETUP
#####################
app.config['MONGO_HOST'] = 'localhost'
app.config['MONGO_DBNAME'] = 'celery-test-db'
app.config["MONGO_URI"] = 'mongodb://localhost:27017/celery-test-db'
mongo = PyMongo(app)
##############################
# CELERY ARGUMENTS
##############################
# CELERY USING REDIS
app.config['CELERY_BROKER_URL'] = 'redis://localhost:6379/0'
app.config['CELERY_RESULT_BACKEND'] = 'mongodb://localhost:27017/celery-test-db'
app.config['CELERY_RESULT_BACKEND'] = 'mongodb'
app.config['CELERY_MONGODB_BACKEND_SETTINGS'] = {
"host": "localhost",
"port": 27017,
"database": "celery-test-db",
"taskmeta_collection": "celery_jobs",
}
app.config['CELERY_TASK_SERIALIZER'] = 'json'
celery = Celery('task',broker='mongodb://localhost:27017/jobs')
celery = make_celery(app)
LOCK_EXPIRE = 60 * 2 # Lock expires in 2 minutes
@contextmanager
def memcache_lock(lock_id, oid):
timeout_at = monotonic() + LOCK_EXPIRE - 3
print('in memcache_lock and timeout_at is {}'.format(timeout_at))
# cache.add fails if the key already exists
status = cache.add(lock_id, oid, LOCK_EXPIRE)
try:
yield status
print('memcache_lock and status is {}'.format(status))
finally:
# memcache delete is very slow, but we have to use it to take
# advantage of using add() for atomic locking
if monotonic() < timeout_at and status:
# don't release the lock if we exceeded the timeout
# to lessen the chance of releasing an expired lock
# owned by someone else
# also don't release the lock if we didn't acquire it
cache.delete(lock_id)
@celery.task(bind=True, name='app.myTask1')
def myTask1(self):
self.update_state(state='IN TASK')
print('dir is {} '.format(dir(self)))
lock_id = self.name
print('lock_id is {}'.format(lock_id))
with memcache_lock(lock_id, self.app.oid) as acquired:
print('in memcache_lock and lock_id is {} self.app.oid is {} and acquired is {}'.format(lock_id, self.app.oid, acquired))
if acquired:
# do work if we got the lock
print('acquired is {}'.format(acquired))
self.update_state(state='DOING WORK')
time.sleep(90)
return 'result'
# otherwise, the lock was already in use
raise self.retry(countdown=60) # redeliver message to the queue, so the work can be done later
@celery.task(bind=True, name='app.myTask2')
def myTask2(self):
print('you are in task2')
self.update_state(state='STARTING')
time.sleep(120)
print('task2 done')
@app.route('/', methods=['GET', 'POST'])
def index():
return render_template('index.html')
@app.route('/task1', methods=['GET', 'POST'])
def task1():
print('running task1')
result = myTask1.delay()
# get async task id
taskResult = AsyncResult(result.task_id)
# push async taskid into db collection job_task_id
mongo.db.job_task_id.insert({'taskid': str(taskResult), 'TaskName': 'myTask1'})
return render_template('task1.html')
@app.route('/task2', methods=['GET', 'POST'])
def task2():
print('running task2')
result = myTask2.delay()
# get async task id
taskResult = AsyncResult(result.task_id)
# push async taskid into db collection job_task_id
mongo.db.job_task_id.insert({'taskid': str(taskResult), 'TaskName': 'task2'})
return render_template('task2.html')
@app.route('/status', methods=['GET', 'POST'])
def status():
taskid_list = []
task_state_list = []
TaskName_list = []
allAsyncData = mongo.db.job_task_id.find()
for doc in allAsyncData:
try:
taskid_list.append(doc['taskid'])
except:
print('error with db connection in asyncJobStatus')
TaskName_list.append(doc['TaskName'])
# PASS TASK ID TO ASYNC RESULT TO GET TASK RESULT FOR THAT SPECIFIC TASK
for item in taskid_list:
try:
task_state_list.append(myTask1.AsyncResult(item).state)
except:
task_state_list.append('UNKNOWN')
return render_template('status.html', data_list=zip(task_state_list, TaskName_list))
if __name__ == '__main__':
app.secret_key = 'super secret key for me123456789987654321'
app.run(port=1234, host='localhost')
Here is also a screenshot: you can see that I ran myTask1 two times and myTask2 a single time. Now I have the expected behavior for myTask1: it will be run by a single worker, and if another worker attempts to pick it up, it will just keep retrying based on whatever I define.
In your question, you point out this warning from the Celery example you used:
In order for this to work correctly you need to be using a cache backend where the .add operation is atomic. memcached is known to work well for this purpose.
And you mention that you don't really understand what this means. Indeed, the code you show demonstrates that you've not heeded that warning, because your code uses an inappropriate backend.
Consider this code:
with memcache_lock(lock_id, self.app.oid) as acquired:
if acquired:
# do some work
What you want here is for acquired to be true only for one thread at a time. If two threads enter the with block at the same time, only one should "win" and have acquired be true. This thread that has acquired true can then proceed with its work, and the other thread has to skip doing the work and try again later to acquire the lock. In order to ensure that only one thread can have acquired true, .add must be atomic.
Here's some pseudo code of what .add(key, value) does:
1. if <key> is already in the cache:
2. return False
3. else:
4. set the cache so that <key> has the value <value>
5. return True
If the execution of .add is not atomic, this could happen if two threads A and B execute .add("foo", "bar"). Assume an empty cache at the start.
Thread A executes 1. if "foo" is already in the cache and finds that "foo" is not in the cache, and jumps to line 3 but the thread scheduler switches control to thread B.
Thread B also executes 1. if "foo" is already in the cache, and also finds that "foo" is not in the cache. So it jumps to line 3 and then executes line 4 and 5 which sets the key "foo" to the value "bar" and the call returns True.
Eventually, the scheduler gives control back to Thread A, which continues executing 3, 4, 5 and also sets the key "foo" to the value "bar" and also returns True.
What you have here is two .add calls that return True. If these .add calls are made within memcache_lock, this means that two threads can have acquired be true. So two threads could do work at the same time, and your memcache_lock is not doing what it should be doing, which is to allow only one thread to work at a time.
You are not using a cache that ensures that .add is atomic. You initialize it like this:
cache = Cache(app, config={'CACHE_TYPE': 'simple'})
The simple backend is scoped to a single process, has no thread-safety, and has an .add operation which is not atomic. (This does not involve Mongo at all, by the way. If you wanted your cache to be backed by Mongo, you'd have to specify a backend specifically made to send data to a Mongo database.)
So you have to switch to another backend, one that guarantees that .add is atomic. You could follow the lead of the Celery example and use the memcached backend, which does have an atomic .add operation. I don't use Flask, but I've done essentially what you are doing with Django and Celery, and used the Redis backend successfully to provide the kind of locking you're using here.
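For example, a minimal sketch of pointing Flask-Caching at a shared backend whose add() is atomic, instead of the 'simple' backend (the config key names are the ones documented by Flask-Caching, but double-check them against the version you have installed):
from flask import Flask
from flask_caching import Cache

app = Flask(__name__)

# Redis-backed cache: add() is atomic and the lock is shared by every worker process
cache = Cache(app, config={
    'CACHE_TYPE': 'redis',
    'CACHE_REDIS_URL': 'redis://localhost:6379/1',
})

# or, following the Celery docs example, memcached:
# cache = Cache(app, config={
#     'CACHE_TYPE': 'memcached',
#     'CACHE_MEMCACHED_SERVERS': ['127.0.0.1:11211'],
# })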
I also found this to be a surprisingly hard problem. Inspired mainly by Sebastian's work on implementing a distributed locking algorithm in redis I wrote up a decorator function.
A key point to bear in mind about this approach is that we lock tasks at the level of the task's argument space, e.g. we allow multiple game update/process order tasks to run concurrently, but only one per game. That's what argument_signature achieves in the code below. You can see documentation on how we use this in our stack at this gist:
import base64
from contextlib import contextmanager
import json
import pickle as pkl
import uuid
from backend.config import Config
from redis import StrictRedis
from redis_cache import RedisCache
from redlock import Redlock
rds = StrictRedis(Config.REDIS_HOST, decode_responses=True, charset="utf-8")
rds_cache = StrictRedis(Config.REDIS_HOST, decode_responses=False, charset="utf-8")
redis_cache = RedisCache(redis_client=rds_cache, prefix="rc", serializer=pkl.dumps, deserializer=pkl.loads)
dlm = Redlock([{"host": Config.REDIS_HOST}])
TASK_LOCK_MSG = "Task execution skipped -- another task already has the lock"
DEFAULT_ASSET_EXPIRATION = 8 * 24 * 60 * 60 # by default keep cached values around for 8 days
DEFAULT_CACHE_EXPIRATION = 1 * 24 * 60 * 60 # we can keep cached values around for a shorter period of time
REMOVE_ONLY_IF_OWNER_SCRIPT = """
if redis.call("get",KEYS[1]) == ARGV[1] then
return redis.call("del",KEYS[1])
else
return 0
end
"""
@contextmanager
def redis_lock(lock_name, expires=60):
# https://breadcrumbscollector.tech/what-is-celery-beat-and-how-to-use-it-part-2-patterns-and-caveats/
random_value = str(uuid.uuid4())
lock_acquired = bool(
rds.set(lock_name, random_value, ex=expires, nx=True)
)
yield lock_acquired
if lock_acquired:
rds.eval(REMOVE_ONLY_IF_OWNER_SCRIPT, 1, lock_name, random_value)
def argument_signature(*args, **kwargs):
arg_list = [str(x) for x in args]
kwarg_list = [f"{str(k)}:{str(v)}" for k, v in kwargs.items()]
return base64.b64encode(f"{'_'.join(arg_list)}-{'_'.join(kwarg_list)}".encode()).decode()
def task_lock(func=None, main_key="", timeout=None):
def _dec(run_func):
def _caller(*args, **kwargs):
with redis_lock(f"{main_key}_{argument_signature(*args, **kwargs)}", timeout) as acquired:
if not acquired:
return TASK_LOCK_MSG
return run_func(*args, **kwargs)
return _caller
return _dec(func) if func is not None else _dec
Implementation in our task definitions file:
@celery.task(name="async_test_task_lock")
@task_lock(main_key="async_test_task_lock", timeout=UPDATE_GAME_DATA_TIMEOUT)
def async_test_task_lock(game_id):
print(f"processing game_id {game_id}")
time.sleep(TASK_LOCK_TEST_SLEEP)
How we test against a local celery cluster:
from backend.tasks.definitions import async_test_task_lock, TASK_LOCK_TEST_SLEEP
from backend.tasks.redis_handlers import rds, TASK_LOCK_MSG
class TestTaskLocking(TestCase):
def test_task_locking(self):
rds.flushall()
res1 = async_test_task_lock.delay(3)
res2 = async_test_task_lock.delay(5)
self.assertFalse(res1.ready())
self.assertFalse(res2.ready())
res3 = async_test_task_lock.delay(5)
res4 = async_test_task_lock.delay(5)
self.assertEqual(res3.get(), TASK_LOCK_MSG)
self.assertEqual(res4.get(), TASK_LOCK_MSG)
time.sleep(TASK_LOCK_TEST_SLEEP)
res5 = async_test_task_lock.delay(3)
self.assertFalse(res5.ready())
(as a goodie there's also a quick example of how to setup a redis_cache)
With this setup, you should still expect to see workers receiving the task, since the lock is checked inside of the task itself. The only difference will be that the work won't be performed if the lock is acquired by another worker.
In the example given in the docs, this is the desired behavior; if a lock already exists, the task will simply do nothing and finish as successful. What you want is slightly different; you want the work to be queued up instead of ignored.
In order to get the desired effect, you would need to make sure that the task will be picked up by a worker and performed some time in the future. One way to accomplish this would be with retrying.
@task(bind=True, name='my-task')
def my_task(self):
lock_id = self.name
with memcache_lock(lock_id, self.app.oid) as acquired:
if acquired:
# do work if we got the lock
print('acquired is {}'.format(acquired))
return 'result'
# otherwise, the lock was already in use
raise self.retry(countdown=60) # redeliver message to the queue, so the work can be done later

boto3 check if Athena database exists

I'm making a script that creates a database in AWS Athena and then creates tables for that database. Today the DB creation was taking ages, so the tables being created referred to a DB that doesn't exist. Is there a way to check if a DB is already created in Athena using boto3?
This is the part that created the db:
client = boto3.client('athena')
client.start_query_execution(
QueryString='create database {}'.format('db_name'),
ResultConfiguration=config
)
# -*- coding: utf-8 -*-
import logging
import os
from time import sleep
import boto3
import pandas as pd
from backports.tempfile import TemporaryDirectory
logger = logging.getLogger(__name__)
class AthenaQueryFailed(Exception):
pass
class Athena(object):
S3_TEMP_BUCKET = "please-replace-with-your-bucket"
def __init__(self, bucket=S3_TEMP_BUCKET):
self.bucket = bucket
self.client = boto3.Session().client("athena")
def execute_query_in_athena(self, query, output_s3_directory, database="csv_dumps"):
""" Useful when client executes a query in Athena and want result in the given `s3_directory`
:param query: Query to be executed in Athena
:param output_s3_directory: s3 path in which client want results to be stored
:return: s3 path
"""
response = self.client.start_query_execution(
QueryString=query,
QueryExecutionContext={"Database": database},
ResultConfiguration={"OutputLocation": output_s3_directory},
)
query_execution_id = response["QueryExecutionId"]
filename = "{filename}.csv".format(filename=response["QueryExecutionId"])
s3_result_path = os.path.join(output_s3_directory, filename)
logger.info(
"Query query_execution_id <<{query_execution_id}>>, result_s3path <<{s3path}>>".format(
query_execution_id=query_execution_id, s3path=s3_result_path
)
)
self.wait_for_query_to_complete(query_execution_id)
return s3_result_path
def wait_for_query_to_complete(self, query_execution_id):
is_query_running = True
backoff_time = 10
while is_query_running:
response = self.__get_query_status_response(query_execution_id)
status = response["QueryExecution"]["Status"][
"State"
] # possible responses: QUEUED | RUNNING | SUCCEEDED | FAILED | CANCELLED
if status == "SUCCEEDED":
is_query_running = False
elif status in ["CANCELED", "FAILED"]:
raise AthenaQueryFailed(status)
elif status in ["QUEUED", "RUNNING"]:
logger.info("Backing off for {} seconds.".format(backoff_time))
sleep(backoff_time)
else:
raise AthenaQueryFailed(status)
def __get_query_status_response(self, query_execution_id):
response = self.client.get_query_execution(QueryExecutionId=query_execution_id)
return response
As pointed out in the other answer, the Athena waiter is still not implemented.
I use this lightweight Athena client to do the query; it returns the S3 path of the result when the query is completed.
The waiter functions for Athena are not implemented yet: Athena Waiter
See: Support AWS Athena waiter feature for a possible workaround until it is implemented in Boto3. This is how it is implemented in AWS CLI.
while True:
stats = self.athena.get_query_execution(execution_id)
status = stats['QueryExecution']['Status']['State']
if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
break
time.sleep(0.2)
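To address the original question (checking whether the database already exists) more directly, recent versions of the Athena API expose ListDatabases on the boto3 client. The sketch below assumes a reasonably new boto3 and the default AwsDataCatalog catalog; on older versions you would instead run a SHOW DATABASES query and read its results.
import boto3

def athena_database_exists(name, catalog="AwsDataCatalog", region="us-east-1"):
    """Return True if an Athena database with this name exists in the catalog."""
    client = boto3.client("athena", region_name=region)
    kwargs = {"CatalogName": catalog}
    while True:
        response = client.list_databases(**kwargs)
        if any(db["Name"] == name for db in response.get("DatabaseList", [])):
            return True
        token = response.get("NextToken")
        if not token:
            return False
        kwargs["NextToken"] = token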

Mocking boto3 S3 client method Python

I'm trying to mock a single method from the boto3 S3 client object to throw an exception, but I need all other methods for this class to work as normal.
This is so I can test a single exception case when an error occurs performing an upload_part_copy.
1st Attempt
import boto3
from mock import patch
with patch('botocore.client.S3.upload_part_copy', side_effect=Exception('Error Uploading')) as mock:
client = boto3.client('s3')
# Should return actual result
o = client.get_object(Bucket='my-bucket', Key='my-key')
# Should return mocked exception
e = client.upload_part_copy()
However this gives the following error:
ImportError: No module named S3
2nd Attempt
After looking at the botocore.client.py source code I found that it is doing something clever and the method upload_part_copy does not exist as a normal method. It seems to call BaseClient._make_api_call instead, so I tried to mock that:
import boto3
from mock import patch
with patch('botocore.client.BaseClient._make_api_call', side_effect=Exception('Error Uploading')) as mock:
client = boto3.client('s3')
# Should return actual result
o = client.get_object(Bucket='my-bucket', Key='my-key')
# Should return mocked exception
e = client.upload_part_copy()
This throws an exception... but on the get_object which I want to avoid.
Any ideas about how I can only throw the exception on the upload_part_copy method?
Botocore has a client stubber you can use for just this purpose: docs.
Here's an example of putting an error in:
import boto3
from botocore.stub import Stubber
client = boto3.client('s3')
stubber = Stubber(client)
stubber.add_client_error('upload_part_copy')
stubber.activate()
# Will raise a ClientError
client.upload_part_copy()
Here's an example of putting a normal response in. Additionally, the stubber can now be used as a context manager. It's important to note that the stubber will verify, so far as it is able, that your provided response matches what the service will actually return. This isn't perfect, but it will protect you from inserting total nonsense responses.
import boto3
from botocore.stub import Stubber
client = boto3.client('s3')
stubber = Stubber(client)
list_buckets_response = {
"Owner": {
"DisplayName": "name",
"ID": "EXAMPLE123"
},
"Buckets": [{
"CreationDate": "2016-05-25T16:55:48.000Z",
"Name": "foo"
}]
}
expected_params = {}
stubber.add_response('list_buckets', list_buckets_response, expected_params)
with stubber:
response = client.list_buckets()
assert response == list_buckets_response
As soon as I posted on here I managed to come up with a solution. Here it is, hope it helps :)
import botocore
from botocore.exceptions import ClientError
from mock import patch
import boto3
orig = botocore.client.BaseClient._make_api_call
def mock_make_api_call(self, operation_name, kwarg):
if operation_name == 'UploadPartCopy':
parsed_response = {'Error': {'Code': '500', 'Message': 'Error Uploading'}}
raise ClientError(parsed_response, operation_name)
return orig(self, operation_name, kwarg)
with patch('botocore.client.BaseClient._make_api_call', new=mock_make_api_call):
client = boto3.client('s3')
# Should return actual result
o = client.get_object(Bucket='my-bucket', Key='my-key')
# Should return mocked exception
e = client.upload_part_copy()
Jordan Philips also posted a great solution using the botocore.stub.Stubber class. While it is a cleaner solution, I was unable to mock specific operations.
If you don't want to use either moto or the botocore stubber (the stubber does not prevent HTTP requests being made to AWS API endpoints it seems), you can use the more verbose unittest.mock way:
foo/bar.py
import boto3
def my_bar_function():
client = boto3.client('s3')
buckets = client.list_buckets()
...
bar_test.py
import unittest
from unittest import mock
from foo import bar
class MyTest(unittest.TestCase):
@mock.patch('foo.bar.boto3.client')
def test_that_bar_works(self, mock_s3_client):
bar.my_bar_function()
self.assertTrue(mock_s3_client.return_value.list_buckets.call_count == 1)
Here's an example of a simple Python unittest that can be used to fake the client = boto3.client('ec2') API call...
import unittest
from unittest import mock
import boto3
class MyAWSModule():
def __init__(self):
client = boto3.client('ec2')
tags = client.describe_tags(DryRun=False)
class TestMyAWSModule(unittest.TestCase):
#mock.patch("boto3.client.describe_tags")
#mock.patch("boto3.client")
def test_open_file_with_existing_file(self, mock_boto_client, mock_describe_tags):
mock_describe_tags.return_value = mock_get_tags_response
my_aws_module = MyAWSModule()
mock_boto_client.assert_call_once('ec2')
mock_describe_tags.assert_call_once_with(DryRun=False)
mock_get_tags_response = {
'Tags': [
{
'ResourceId': 'string',
'ResourceType': 'customer-gateway',
'Key': 'string',
'Value': 'string'
},
],
'NextToken': 'string'
}
hopefully that helps.
What about simply using moto?
It comes with a very handy decorator:
from moto import mock_s3
@mock_s3
def test_my_model_save():
pass
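The test body above is just a stub; a slightly fuller sketch, closely following moto's own README example and assuming a moto version where mock_s3 still exists (moto 5 replaces the per-service decorators with mock_aws):
import boto3
from moto import mock_s3

@mock_s3
def test_my_model_save():
    # everything inside the decorated test talks to moto's fake S3, not AWS
    conn = boto3.client("s3", region_name="us-east-1")
    conn.create_bucket(Bucket="mybucket")
    conn.put_object(Bucket="mybucket", Key="steve", Body=b"is awesome")
    body = conn.get_object(Bucket="mybucket", Key="steve")["Body"].read()
    assert body == b"is awesome"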
I had to mock boto3 client for some integration testing and it was a bit painful! The problem that I had is that moto does not support KMS very well, yet I did not want to rewrite my own mock for the S3 buckets. So I created this morph of all of the answers. Also it works globally which is pretty cool!
I have it setup with 2 files.
First one is aws_mock.py. For the KMS mocking I got some predefined responses that came from live boto3 client.
from unittest.mock import MagicMock
import boto3
from moto import mock_s3
# `create_key` response
create_resp = { ... }
# `generate_data_key` response
generate_resp = { ... }
# `decrypt` response
decrypt_resp = { ... }
def client(*args, **kwargs):
if args[0] == 's3':
s3_mock = mock_s3()
s3_mock.start()
mock_client = boto3.client(*args, **kwargs)
else:
mock_client = boto3.client(*args, **kwargs)
if args[0] == 'kms':
mock_client.create_key = MagicMock(return_value=create_resp)
mock_client.generate_data_key = MagicMock(return_value=generate_resp)
mock_client.decrypt = MagicMock(return_value=decrypt_resp)
return mock_client
Second one is the actual test module. Let's call it test_my_module.py. I've omitted the code of my_module, as well as the functions under test. Let's call those the foo and bar functions.
from unittest.mock import patch
import aws_mock
import my_module
@patch('my_module.boto3')
def test_my_module(boto3):
# Some prep work for the mock mode
boto3.client = aws_mock.client
conn = boto3.client('s3')
conn.create_bucket(Bucket='my-bucket')
# Actual testing
resp = my_module.foo()
assert(resp == 'Valid')
resp = my_module.bar()
assert(resp != 'Not Valid')
# Etc, etc, etc...
One more thing: I'm not sure if this has been fixed, but I found that moto was not happy unless you set some environment variables like credentials and region. They don't have to be actual credentials, but they do need to be set. There is a chance it might be fixed by the time you read this! But here is some code in case you do need it, shell code this time:
export AWS_ACCESS_KEY_ID='foo'
export AWS_SECRET_ACCESS_KEY='bar'
export AWS_DEFAULT_REGION='us-east-1'
I know it is probably not the prettiest piece of code but if you are looking for something universal it should work pretty well!
Here is my solution for patching a boto client used in the bowels of my project, with pytest fixtures. I'm only using 'mturk' in my project.
The trick for me was to create my own client, and then patch boto3.client with a function that returns that pre-created client.
@pytest.fixture(scope='session')
def patched_boto_client():
my_client = boto3.client('mturk')
def my_client_func(*args, **kwargs):
return my_client
with patch('bowels.of.project.other_module.boto3.client', my_client_func):
yield my_client_func
def test_create_hit(patched_boto_client):
client = patched_boto_client()
stubber = Stubber(client)
stubber.add_response('create_hit_type', {'my_response':'is_great'})
stubber.add_response('create_hit_with_hit_type', {'my_other_response':'is_greater'})
stubber.activate()
import bowels.of.project # this module imports `other_module`
bowels.of.project.create_hit_function_that_calls_a_function_in_other_module_which_invokes_boto3_dot_client_at_some_point()
I also define another fixture that sets up dummy AWS creds so that boto doesn't accidentally pick up some other set of credentials on the system. I literally set 'foo' and 'bar' as my creds for testing -- that's not a redaction.
It's important that the AWS_PROFILE env var be unset, because otherwise boto will go looking for that profile.
@pytest.fixture(scope='session')
def setup_env():
os.environ['AWS_ACCESS_KEY_ID'] = 'foo'
os.environ['AWS_SECRET_ACCESS_KEY'] = 'bar'
os.environ.pop('AWS_PROFILE', None)
And then I specify setup_env as a pytest usefixtures entry so that it gets used for every test run.
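For reference, a hedged sketch of what that wiring can look like, either per test module with pytestmark or project-wide via the usefixtures ini option:
# in a test module: apply setup_env to every test defined in that module
import pytest

pytestmark = pytest.mark.usefixtures("setup_env")

# or project-wide, in pytest.ini / setup.cfg:
# [pytest]
# usefixtures = setup_env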
I had a slightly different use case where the client is set up during a setup() method in a Class, as it does a few things such as listing things from the AWS service it's talking to (Connect, in my case). Lots of the above approaches weren't quite working, so here's my working version for future Googlers.
In order to get everything to work properly, I had to do this:
In the class under test (src/flow_manager.py):
class FlowManager:
client: botocore.client.BaseClient
def setup(self):
self.client = boto3.client('connect')
def set_instance(self):
response = self.client.list_instances()
... do stuff ....
In the test file (tests/unit/test_flow_manager.py):
@mock.patch('src.flow_manager.boto3.client')
def test_set_instance(self, mock_client):
expected = 'bar'
instance_list = {'alias': 'foo', 'id': 'bar'}
mock_client.list_instances.return_value = instance_list
actual = flow_manager.FlowManager("", "", "", "", 'foo')
actual.client = mock_client
actual.set_instance()
self.assertEqual(expected, actual.instance_id)
I've truncated the code to the relevant bits for this answer.
