boto3 check if Athena database exists - python

I'm making a script that creates a database in AWS Athena and then creates tables for that database. Today the DB creation was taking ages, so the tables being created referred to a DB that doesn't exist. Is there a way to check whether a DB already exists in Athena using boto3?
This is the part that creates the DB:
client = boto3.client('athena')
client.start_query_execution(
    QueryString='create database {}'.format('db_name'),
    ResultConfiguration=config
)
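If the database lives in the AWS Glue Data Catalog (the default catalog Athena uses), one option is to ask Glue whether it exists before issuing the CREATE DATABASE. This is only a sketch under that assumption; 'db_name', client, and config are reused from the snippet above:
import boto3

glue = boto3.client('glue')

def database_exists(name):
    """Return True if the Athena/Glue database already exists."""
    try:
        glue.get_database(Name=name)
        return True
    except glue.exceptions.EntityNotFoundException:
        return False

if not database_exists('db_name'):
    client.start_query_execution(
        QueryString='create database {}'.format('db_name'),
        ResultConfiguration=config,
    )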

# -*- coding: utf-8 -*-
import logging
import os
from time import sleep

import boto3
import pandas as pd
from backports.tempfile import TemporaryDirectory

logger = logging.getLogger(__name__)


class AthenaQueryFailed(Exception):
    pass


class Athena(object):
    S3_TEMP_BUCKET = "please-replace-with-your-bucket"

    def __init__(self, bucket=S3_TEMP_BUCKET):
        self.bucket = bucket
        self.client = boto3.Session().client("athena")

    def execute_query_in_athena(self, query, output_s3_directory, database="csv_dumps"):
        """Execute a query in Athena and store the result in the given `output_s3_directory`.

        :param query: Query to be executed in Athena
        :param output_s3_directory: s3 path in which the client wants results to be stored
        :return: s3 path of the result file
        """
        response = self.client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={"Database": database},
            ResultConfiguration={"OutputLocation": output_s3_directory},
        )
        query_execution_id = response["QueryExecutionId"]
        filename = "{filename}.csv".format(filename=query_execution_id)
        s3_result_path = os.path.join(output_s3_directory, filename)
        logger.info(
            "Query query_execution_id <<{query_execution_id}>>, result_s3path <<{s3path}>>".format(
                query_execution_id=query_execution_id, s3path=s3_result_path
            )
        )
        self.wait_for_query_to_complete(query_execution_id)
        return s3_result_path

    def wait_for_query_to_complete(self, query_execution_id):
        is_query_running = True
        backoff_time = 10
        while is_query_running:
            response = self.__get_query_status_response(query_execution_id)
            status = response["QueryExecution"]["Status"][
                "State"
            ]  # possible states: QUEUED | RUNNING | SUCCEEDED | FAILED | CANCELLED
            if status == "SUCCEEDED":
                is_query_running = False
            elif status in ["CANCELLED", "FAILED"]:
                raise AthenaQueryFailed(status)
            elif status in ["QUEUED", "RUNNING"]:
                logger.info("Backing off for {} seconds.".format(backoff_time))
                sleep(backoff_time)
            else:
                raise AthenaQueryFailed(status)

    def __get_query_status_response(self, query_execution_id):
        return self.client.get_query_execution(QueryExecutionId=query_execution_id)
As pointed out in the other answer, an Athena waiter is still not implemented in boto3.
I use this lightweight Athena client to run the query; it returns the S3 path of the result once the query has completed.
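For the original question, a minimal usage sketch (the bucket name and output prefix are placeholders): run SHOW DATABASES through the helper and check whether the database name appears in the result file it points to. Note that Athena may store SHOW/DDL results as .txt rather than .csv, so the filename the helper builds might need adjusting for metadata queries.
athena = Athena(bucket="please-replace-with-your-bucket")
# Runs the query and blocks until it completes (or raises AthenaQueryFailed).
result_path = athena.execute_query_in_athena(
    query="SHOW DATABASES",
    output_s3_directory="s3://please-replace-with-your-bucket/athena-results/",
)
print(result_path)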

The waiter functions for Athena are not implemented yet: Athena Waiter.
See Support AWS Athena waiter feature for a possible workaround until it is implemented in Boto3. This is how it is implemented in the AWS CLI:
while True:
    stats = self.athena.get_query_execution(QueryExecutionId=execution_id)
    status = stats['QueryExecution']['Status']['State']
    if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
        break
    time.sleep(0.2)
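Applied to the original question, the same loop can be used to block until the CREATE DATABASE query finishes before any CREATE TABLE statements are issued. A minimal sketch, reusing client and config from the question (the sleep interval is arbitrary):
import time

# Start the CREATE DATABASE query and remember its execution id.
execution_id = client.start_query_execution(
    QueryString='create database {}'.format('db_name'),
    ResultConfiguration=config,
)['QueryExecutionId']

# Poll until the query reaches a terminal state.
while True:
    state = client.get_query_execution(
        QueryExecutionId=execution_id
    )['QueryExecution']['Status']['State']
    if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
        break
    time.sleep(0.2)

# Only issue the CREATE TABLE statements once state == 'SUCCEEDED'.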

Related

Does my lambda function go inside my main python script?

I don't know how to write a Lambda. Here is my main_script.py, which executes two stored procedures: it inserts records every day, then finds the difference between yesterday's and today's records and writes them to a table.
import logging
import pymysql as pm
import os
import json


class className:
    env = None
    config = None

    def __init__(self, env_filename):
        self.env = env_filename
        self.config = self.get_config()

    def get_config(self):
        with open(self.env) as file_in:
            return json.load(file_in)

    def DB_connection(self):
        config = className.get_config(self)
        username = config["exceptions"]["database-secrets"]["aws_secret_username"]
        password = config["exceptions"]["database-secrets"]["aws_secret_password"]
        host = config["exceptions"]["database-secrets"]["aws_secret_host"]
        port = config["exceptions"]["database-secrets"]["aws_secret_port"]
        database = config["exceptions"]["database-secrets"]["aws_secret_db"]
        return pm.connect(
            user=username,
            password=password,
            host=host,
            port=port,
            database=database
        )

    def run_all(self):
        def test_function(self):
            test_function_INSERT_QUERY = "CALL sp_test_insert();"
            test_function_EXCEPTIONS_QUERY = "CALL sp_test_exceptions();"
            test = self.config["exceptions"]["functions"]["test_function"]
            if test:
                with self.DB_connection() as cnxn:
                    with cnxn.cursor() as cur:
                        try:
                            cur.execute(test_function_INSERT_QUERY)
                            print("test_function_INSERT_QUERY insertion query ran successfully, {} records updated.".format(cur.rowcount))
                            cur.execute(test_function_EXCEPTIONS_QUERY)
                            print("test_function_EXCEPTIONS_QUERY exceptions query ran successfully, {} exceptions updated.".format(cur.rowcount))
                        except pm.Error as e:
                            print(f"Error: {e}")
                        except Exception as e:
                            logging.exception(e)
                        else:
                            cnxn.commit()

        test_function(self)


def main():
    cwd = os.getcwd()
    vfc = (cwd + "\_config" + ".json")
    ve = className(vfc)
    ve.run_all()


if __name__ == "__main__":
    main()
Would I write my lambda_handler function inside my script above or have it as a separate script?
def lambda_handler(event, context):
    # some code
I would treat lambda_handler(event, context) as the equivalent of main(), except that you do not need the if __name__ ... clause, because you never run a Lambda function from the console.
You would also need to use the boto3 library to abstract away AWS services and their functions. Have a look at the tutorial to get started.
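As a sketch of that idea (the config path here is an assumption; in practice it might come from an environment variable, the deployment package, or Parameter Store), the handler simply plays the role of main():
import os

def lambda_handler(event, context):
    # Equivalent of main(): build the class and run the daily procedures.
    config_path = os.environ.get("CONFIG_PATH", "_config.json")
    ve = className(config_path)
    ve.run_all()
    return {"status": "ok"}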
As the first order of business, I would move the DB credentials out of the file system and into a secure data store. You can of course configure Lambda environment variables, but Systems Manager Parameter Store is more secure and very easy to call from code, e.g.:
import boto3

ssm = boto3.client('ssm', region_name='us-east-1')


def lambda_handler(event, context):
    password = ssm.get_parameters(Names=['/pathto/password'], WithDecryption=True)['Parameters'][0]['Value']
    return {"password": password}
There is a more advanced option, Secrets Manager, which for a little money will even rotate passwords for you (because it is fully integrated with the Relational Database Service).
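A minimal Secrets Manager sketch, assuming the credentials are stored as a JSON secret (the secret name and region are placeholders):
import json
import boto3

secrets = boto3.client('secretsmanager', region_name='us-east-1')

def get_db_credentials(secret_name='my-db-secret'):
    # Fetch and decode the JSON secret, e.g. {"username": "...", "password": "..."}
    response = secrets.get_secret_value(SecretId=secret_name)
    return json.loads(response['SecretString'])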

How do I trigger a SQL file using the psql command from the ephemeral storage of a Lambda function in Python

[Screenshot: connection and downloaded file output]
Goal: access SQL files from S3 in Lambda and execute them (S3 -> Lambda -> RDS instance).
a. Integration between the function, the database, and S3 -> DONE
a1. Download the .sql file from the S3 bucket and write it to the /tmp storage of the Lambda function. -> DONE
b1. Import a Python library or create a Lambda layer to include the relevant libraries/dependencies in the Lambda function so it can run the psql command to execute the SQL file,
or
b2. download the file, convert it to a string, and pass the string as a parameter to the 'ExecuteSql' API call, which allows you to run one or more SQL statements.
c. Once the SQL files execute successfully, work out how to export the generated .csv, .txt, .html, and TAB files to the S3 output path.
So far I have integrated the function, S3, and RDS; I am able to view the output of a table (to test the connection) and to download the SQL file from the S3 path to the Lambda function's ephemeral storage (/tmp).
Now I am looking for how to execute the downloaded SQL file from /tmp of the Lambda function using the psql command, or how to convert the file to a string and pass that string to the 'ExecuteSql' API, which allows you to run one or more SQL statements (see the sketch after the code below). Please share any ways to achieve this.
Please refer to the Python code below, which I am using in the Lambda function.
from dataclasses import dataclass
import psycopg2
from psycopg2.extras import RealDictCursor
import json
from datetime import datetime
import csv
import boto3
from botocore.exceptions import ClientError
import os


def get_secret():
    secret_name = "baardsnonprod-qa2db"
    region_name = "us-east-1"
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )
    secret = client.get_secret_value(
        SecretId=secret_name
    )
    secret_dict = json.loads(secret['SecretString'])
    return secret_dict


def download_sql_from_s3(sql_filename):
    s3 = boto3.resource('s3')
    bucket = 'baa-non-prod-baa-assets'
    key = "RDS-batch/Reports/" + sql_filename
    local_path = '/tmp/' + sql_filename
    response = s3.Bucket(bucket).download_file(key, local_path)
    return "file successfully downloaded"


def lambda_handler(event, context):
    secret_dict = get_secret()
    print(secret_dict)
    hostname = secret_dict['host']
    portnumber = secret_dict['port']
    databasename = secret_dict['database']
    username = secret_dict['username']
    passwd = secret_dict['password']
    print(hostname, portnumber, databasename, username, passwd)
    conn = psycopg2.connect(host=hostname, port=portnumber, database=databasename, user=username, password=passwd)
    cur = conn.cursor(cursor_factory=RealDictCursor)
    cur.execute("SELECT * FROM PROFILE")
    results = cur.fetchall()
    json_result = json.dumps(results, default=str)
    print(json_result)
    status = download_sql_from_s3("ConsumerPopularFIErrors.sql")
    # file_status = os.path.isfile('/tmp/ConsumerPopularFIErrors.sql')
    print(status)
    # print(file_status)
    # with open('/tmp/ConsumerPopularFIErrors.sql') as file:
    #     content = file.readlines()
    #     for line in content:
    #         print(line)

# lambda_handler()
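As a sketch of option b2 from the question (it assumes the downloaded file contains semicolon-separated SQL statements and reuses the psycopg2 connection opened in lambda_handler, rather than the RDS Data API):
def execute_sql_file(cur, path):
    # Read the file downloaded to /tmp and run each statement in turn.
    with open(path) as f:
        sql = f.read()
    for statement in sql.split(';'):
        if statement.strip():
            cur.execute(statement)

# e.g. inside lambda_handler, after download_sql_from_s3(...):
#   execute_sql_file(cur, '/tmp/ConsumerPopularFIErrors.sql')
#   conn.commit()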

Python-Bottle Internal Server Error 500 when uploading to an S3 bucket and creating a PostgreSQL entry in RDS

I'm exploring AWS at the moment and have a problem with a little project of my own.
I have a Python Bottle app on an EC2 instance, an S3 bucket for uploads, and an RDS instance (PostgreSQL).
I have a script which lets me upload images (jpg and png).
Now I want to have an entry in my database for every upload I do.
The connection is established and all rules allow traffic from the EC2 instance to the S3 bucket, to the RDS instance, and back.
After uploading, I normally get a message that it worked, but after including code in the Python script to write to and read from the database, it just shows Internal Server Error 500.
Before, it also put the upload in my S3 bucket, but now it puts it into my EC2 instance directory.
It also creates a directory "user_uploads" which actually should point to the S3 bucket, like it did before I included the code to read/write to the DB.
Can someone help me?
CREATE SCHEMA bottletube;
SET SCHEMA 'bottletube';

CREATE TABLE IF NOT EXISTS image_uploads
(
    id int GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    url VARCHAR(20) NOT NULL,
    category VARCHAR(64)
);
#!/usr/bin/python3
import time
import os
import uuid
import psycopg2
import requests
from bottle import route, run, template, request
from boto3 import resource

BUCKET_NAME = 'python-bottle-hw'  # Replace with your bucket name
SAVE_PATH = 'user_uploads'


@route('/home')
@route('/')
def home():
    # SQL Query goes here later, now dummy data only
    # Read entries from database
    items = []
    cursor.execute('SELECT * FROM image_uploads ORDER BY id')
    for record in cursor.fetchall():
        items.append({'id': record[0], 'filename': record[1], 'category': record[2]})
    return template('home.tpl', name='BoTube Home', items=items)


@route('/upload', method='GET')
def do_upload_get():
    return template('upload.tpl', name='Upload Image')


@route('/upload', method='POST')
def do_upload_post():
    category = request.forms.get('category')
    upload = request.files.get('file_upload')
    # Check for errors
    error_messages = []
    if not upload:
        error_messages.append('Please upload a file.')
    if not category:
        error_messages.append('Please enter a category.')
    try:
        name, ext = os.path.splitext(upload.filename)
        if ext not in ('.png', '.jpg', '.jpeg'):
            error_messages.append('File Type not allowed.')
    except:
        error_messages.append('Unknown error.')
    if error_messages:
        return template('upload.tpl', name='Upload Image', error_messages=error_messages)
    # Save to SAVE_PATH directory
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)
    save_filename = f'{name}_{time.strftime("%Y%m%d-%H%M%S")}{ext}'
    with open(f'{SAVE_PATH}{save_filename}', 'wb') as open_file:
        open_file.write(upload.file.read())
    if ext == '.png':
        content_type = 'image/png'
    else:
        content_type = 'image/jpeg'
    # Upload to S3
    data = open(SAVE_PATH + save_filename, 'rb')
    s3_resource.Bucket(BUCKET_NAME).put_object(Key=f'user_uploads/{save_filename}',
                                               Body=data,
                                               Metadata={'Content-Type': content_type},
                                               ACL='public-read')
    # Write to DB
    cursor.execute(f"INSERT INTO image_uploads (url, category) VALUES ('user_uploads/{save_filename}', '{category}');")
    connection.commit()
    # Return template
    return template('upload_success.tpl', name='Upload Image')


if __name__ == '__main__':
    # Connect to DB
    connection = psycopg2.connect(user="postgres", host="endpoint of my database", password="here would be my password", database="bottletube")
    cursor = connection.cursor()
    cursor.execute("SET SCHEMA 'bottletube';")
    connection.commit()
    # Connect to S3
    s3_resource = resource('s3', region_name='us-east-1')
    # Needs to be customized
    # run(host='your_public_dns_name',
    run(host=requests.get('http://169.254.169.254/latest/meta-data/public-hostname').text,
        port=80)
So this is my code. I know, never include passwords in code; I know how AWS Secrets Manager works, but I wanted to check that the function works before I do that.

Capture Athena Query Execution Details using Lambda

Under CloudWatch, I created a rule to capture Athena Query State Change events that will (1) write a log to a log group and (2) trigger a Lambda function that captures the Athena query execution details and pipes them to an S3 bucket. Point 2 fails, as no Athena query execution details are piped to the S3 bucket. Below is the Lambda function I used:
import json
import boto3
from botocore.config import Config

my_config = Config(region_name='<my_region>')

print('Loading function')


def lambda_handler(event, context):
    print("Received event: " + json.dumps(event))
    print("QueryID: " + event['id'])
    # get query statistics
    client = boto3.client('athena', config=my_config)
    queries = client.get_query_execution(QueryExecutionId=event['detail']['QueryExecutionId'])
    del queries['QueryExecution']['Status']
    # saving the query statistics to s3
    s3 = boto3.resource('s3')
    object = s3.Object('<s3_bucket_path>', 'query_statistics_json/' + event['detail']['QueryExecutionId'])
    object.put(Body=str(queries['QueryExecution']))
    return 0
I used this AWS Documentation as reference:
https://docs.aws.amazon.com/athena/latest/ug/control-limits.html
The Body should be binary data:
object.put(Body=some_binary_data)
Maybe you can write str(queries['QueryExecution']) to a txt file in the Lambda's /tmp directory and upload that file. For example:
content = "String content to write to a new S3 file"
s3.Object('my-bucket-name', '/tmp/newfile.txt').put(Body=content)
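Alternatively, a sketch that skips the temporary file and serializes the execution details to JSON bytes before uploading (names are reused from the question's handler; the bucket is still a placeholder):
# Serialize the execution details and upload them directly as bytes.
body = json.dumps(queries['QueryExecution'], default=str).encode('utf-8')
s3.Object('<s3_bucket_path>',
          'query_statistics_json/' + event['detail']['QueryExecutionId']).put(Body=body)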
It's just an indentation problem; after line 11, everything should be indented...

Pytest with Moto: change the status of an Athena query using the backend

I am using moto to test AWS functionality in my codebase. One of the issues I have run into is that when testing Athena, the query status stays in "QUEUED" indefinitely, causing the test to fail or time out.
Here is the method to be tested:
import time

import boto3


class Athena:
    CLIENT = boto3.client("athena")

    class QueryError(Exception):
        """A class for exceptions related to queries."""

    @classmethod
    def execute_query(cls, query, result_location, check_status=True, time_limit=10):
        """Execute a query in Athena."""
        _result_configuration = {"OutputLocation": result_location}
        _kwargs = {"QueryString": query, "ResultConfiguration": _result_configuration}
        response = cls.CLIENT.start_query_execution(**_kwargs)
        query_id = response["QueryExecutionId"]
        if check_status:
            old_time = time.time()
            while True:
                status = cls.CLIENT.get_query_execution(QueryExecutionId=query_id)
                status = status["QueryExecution"]["Status"]["State"]
                if status in ["SUCCEEDED", "FAILED", "CANCELLED"]:
                    if status == "FAILED":
                        raise cls.QueryError("error")
                    break
                time.sleep(0.2)  # 200ms
                if time.time() - old_time > time_limit and status == "QUEUED":
                    raise cls.QueryError("time limit reached")
        return query_id
Here is the fixture passed into the test
import boto3
import pytest
from moto.s3 import mock_s3


@pytest.fixture
def s3():
    with mock_s3():
        s3 = boto3.client("s3")
        yield s3
Here is the test (keep in mind you need to change "from x" to the module containing the method above):
import uuid

import boto3
import pytest
from moto.athena import mock_athena
from moto.s3 import mock_s3


@mock_s3
@mock_athena
def test_execute_query_check(s3):
    from x import Athena
    """
    Test for 'execute_query' (with status check)
    """
    CLIENT = s3
    bucket_name = "pytest." + str(uuid.uuid4())
    # Bucket creation
    bucket_config = {"LocationConstraint": "us-east-2"}
    CLIENT.create_bucket(Bucket=bucket_name, CreateBucketConfiguration=bucket_config)
    waiter = CLIENT.get_waiter("bucket_exists")
    waiter.wait(Bucket=bucket_name)
    s3_location = f"s3://{bucket_name}/"
    query = "SELECT current_date, current_time;"
    query_id = Athena.execute_query(query, s3_location, check_status=True)
    assert query_id
This test fails because moto does not move the status of the query past "QUEUED", while the test expects the state to change; otherwise it raises an exception.
I would like to be able to do something like:
from moto.athena import athena_backends
athena_backends['us-east-2'].job_flows[query_id].state = "SUCCEEDED"
as was suggested in this issue: https://github.com/spulec/moto/issues/380
However, the "job_flows" attribute no longer seems to exist on the moto backend, and I can't find a method to change it explicitly.
Ideally this would be able to happen somewhere in the test to manually change the state of the query to simulate how it would be with actual resources.
State can be accessed and changed as follows:
athena_backends['us-east-2'].executions.get(query_id).status
Sample code snippet
from moto.athena import athena_backends

query = "SELECT stuff"
location = "s3://bucket-name/prefix/"
database = "database"

# Start query
exex_id = self.client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={"Database": database},
    ResultConfiguration={"OutputLocation": location},
)["QueryExecutionId"]

athena_backends['us-west-2'].executions.get(exex_id).status = "CANCELLED"
It seems to me that moto only returns QUEUED for start_query_execution; you can take a look at the source code here.
Another approach is to use from unittest import mock, and then you can do something like:
cls.CLIENT = mock.Mock()
cls.CLIENT.start_query_execution.side_effect = [
    'QUEUED',
    'SUCCEEDED'
]
So then, the first time cls.CLIENT.start_query_execution(..) is called it will report that the query is queued, and the second time it will report that it succeeded, so you will be able to test both execution paths.
Also, with moto you won't be able to test all the cases, because apart from the QUEUED status, you can only set the query status to CANCELLED, as you can see here.
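A sketch of the unittest.mock idea applied to the execute_query method above; since that method reads the state from get_query_execution, that is the call stubbed here (the query, location, and import of Athena are assumptions):
from unittest import mock

# Replace the class-level boto3 client with a mock.
Athena.CLIENT = mock.Mock()
Athena.CLIENT.start_query_execution.return_value = {"QueryExecutionId": "fake-id"}
# First poll reports QUEUED, second reports SUCCEEDED, so the status loop exits.
Athena.CLIENT.get_query_execution.side_effect = [
    {"QueryExecution": {"Status": {"State": "QUEUED"}}},
    {"QueryExecution": {"Status": {"State": "SUCCEEDED"}}},
]

query_id = Athena.execute_query("SELECT 1", "s3://bucket-name/prefix/", check_status=True)
assert query_id == "fake-id"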
