How do I trigger a SQL file using the psql command from the ephemeral storage of a Lambda function in Python

[Screenshot: connection and downloaded file output]
Goal: access SQL files from S3 in Lambda and execute them.
S3 -> Lambda -> RDS instance
a. Integration between function, database and S3 -> DONE
a1. Download the .sql file from the S3 bucket and write the file to the /tmp storage of the Lambda function -> DONE
b1. Import a Python library or create a Lambda layer to include the relevant libraries/dependencies in the Lambda function so it can run the psql command to execute the SQL file.
or
b2. Download the file, convert it to a string, and pass the string as a parameter to an 'ExecuteSql' API call, which allows you to run one or more SQL statements.
c. Once we are able to successfully execute the SQL files, check how to export the generated .csv, .txt, .html and TAB files to the S3 output path.
So far I have integrated the function, S3 and RDS, and I am able to view the output of a table (to test the connection) and download the SQL file from the S3 path to the ephemeral storage /tmp of the Lambda function.
Now I am looking for how to execute the downloaded SQL file from /tmp of the Lambda function using the psql command, or how to convert the file to a string and pass the string as a parameter to an 'ExecuteSql' API that allows running one or more SQL statements. Please share any ways to achieve this (a sketch of the string-based approach is shown after the code below).
Please refer to the Python code below, which I am using in the Lambda function.
from dataclasses import dataclass
import psycopg2
from psycopg2.extras import RealDictCursor
import json
from datetime import datetime
import csv
import boto3
from botocore.exceptions import ClientError
import os


def get_secret():
    secret_name = "baardsnonprod-qa2db"
    region_name = "us-east-1"
    # Create a Secrets Manager client
    session = boto3.session.Session()
    client = session.client(
        service_name='secretsmanager',
        region_name=region_name
    )
    secret = client.get_secret_value(
        SecretId=secret_name
    )
    secret_dict = json.loads(secret['SecretString'])
    return secret_dict


def download_sql_from_s3(sql_filename):
    s3 = boto3.resource('s3')
    bucket = 'baa-non-prod-baa-assets'
    key = "RDS-batch/Reports/" + sql_filename
    local_path = '/tmp/' + sql_filename
    response = s3.Bucket(bucket).download_file(key, local_path)
    return "file successfully downloaded"


def lambda_handler(event, context):
    secret_dict = get_secret()
    print(secret_dict)
    hostname = secret_dict['host']
    portnumber = secret_dict['port']
    databasename = secret_dict['database']
    username = secret_dict['username']
    passwd = secret_dict['password']
    print(hostname, portnumber, databasename, username, passwd)
    conn = psycopg2.connect(host=hostname, port=portnumber, database=databasename, user=username, password=passwd)
    cur = conn.cursor(cursor_factory=RealDictCursor)
    cur.execute("SELECT * FROM PROFILE")
    results = cur.fetchall()
    json_result = json.dumps(results, default=str)
    print(json_result)
    status = download_sql_from_s3("ConsumerPopularFIErrors.sql")
    # file_status = os.path.isfile('/tmp/ConsumerPopularFIErrors.sql')
    print(status)
    # print(file_status)
    # with open('/tmp/ConsumerPopularFIErrors.sql') as file:
    #     content = file.readlines()
    #     for line in content:
    #         print(line)

# lambda_handler()
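If the target is a plain RDS PostgreSQL instance (where the Data API is generally not available), one way to get the effect of option b2 without bundling the psql binary in a layer or container image is to read the downloaded file from /tmp and run its contents through the psycopg2 connection the handler already opens. A minimal sketch, assuming the file contains plain semicolon-separated SQL statements (the helper name is illustrative, not from the original post):

def execute_sql_file(conn, local_path):
    # Hypothetical helper: read the downloaded file from /tmp and run it
    # through the existing psycopg2 connection instead of shelling out to psql.
    with open(local_path) as f:
        sql_text = f.read()
    with conn.cursor() as cur:
        # psycopg2 can send multiple semicolon-separated statements in one execute()
        cur.execute(sql_text)
    conn.commit()

# Inside lambda_handler, after download_sql_from_s3(...):
# execute_sql_file(conn, '/tmp/ConsumerPopularFIErrors.sql')

If the final statement in the file returns rows, they can be fetched with cur.fetchall() and written to a CSV in /tmp before uploading it back to the S3 output path (step c).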

Related

Python-Bottle internal Server Error 500 when uploading to S3 bucket and creating an PostgreSQL entry in RDS

So I am exploring AWS at the moment and have a problem with a little project of my own.
I have a Python Bottle app on an EC2 instance, an S3 bucket for uploads and an RDS instance (PostgreSQL).
I have a script which lets me upload images (jpg and png).
Now I want to have an entry in my database for every upload I do.
The connection is established and all rules allow traffic from the EC2 instance to the S3 bucket, to the RDS instance and back.
After uploading I normally get a message that it worked, but now, after including code in the Python script to write to and read from the database, it just shows Internal Server Error 500.
Before, it also put the upload in my S3 bucket, but now it puts it into my EC2 instance directory and creates a directory "user_uploads" which should actually point to the S3 bucket, like it did before including the code to read/write to the DB.
Can someone help me?
`
CREATE SCHEMA bottletube;
SET SCHEMA 'bottletube';
CREATE TABLE IF NOT EXISTS image_uploads
(
    id int GENERATED ALWAYS AS IDENTITY PRIMARY KEY,
    url VARCHAR(20) NOT NULL,
    category VARCHAR(64)
);
`
`
#!/usr/bin/python3
import time
import os
import uuid
import psycopg2
import requests
from bottle import route, run, template, request
from boto3 import resource

BUCKET_NAME = 'python-bottle-hw'  # Replace with your bucket name
SAVE_PATH = 'user_uploads'


@route('/home')
@route('/')
def home():
    # SQL Query goes here later, now dummy data only
    # Read Entries from database
    items = []
    cursor.execute('SELECT * FROM image_uploads ORDER BY id')
    for record in cursor.fetchall():
        items.append({'id': record[0], 'filename': record[1], 'category': record[2]})
    return template('home.tpl', name='BoTube Home', items=items)


@route('/upload', method='GET')
def do_upload_get():
    return template('upload.tpl', name='Upload Image')


@route('/upload', method='POST')
def do_upload_post():
    category = request.forms.get('category')
    upload = request.files.get('file_upload')
    # Check for errors
    error_messages = []
    if not upload:
        error_messages.append('Please upload a file.')
    if not category:
        error_messages.append('Please enter a category.')
    try:
        name, ext = os.path.splitext(upload.filename)
        if ext not in ('.png', '.jpg', '.jpeg'):
            error_messages.append('File Type not allowed.')
    except:
        error_messages.append('Unknown error.')
    if error_messages:
        return template('upload.tpl', name='Upload Image', error_messages=error_messages)
    # Save to SAVE_PATH directory
    if not os.path.exists(SAVE_PATH):
        os.makedirs(SAVE_PATH)
    save_filename = f'{name}_{time.strftime("%Y%m%d-%H%M%S")}{ext}'
    with open(f'{SAVE_PATH}{save_filename}', 'wb') as open_file:
        open_file.write(upload.file.read())
    if ext == '.png':
        content_type = 'image/png'
    else:
        content_type = 'image/jpeg'
    # Upload to S3
    data = open(SAVE_PATH + save_filename, 'rb')
    s3_resource.Bucket(BUCKET_NAME).put_object(Key=f'user_uploads/{save_filename}',
                                               Body=data,
                                               Metadata={'Content-Type': content_type},
                                               ACL='public-read')
    # Write to DB
    cursor.execute(f"INSERT INTO image_uploads (url, category) VALUES ('user_uploads/{save_filename}', '{category}');")
    connection.commit()
    # Return template
    return template('upload_success.tpl', name='Upload Image')


if __name__ == '__main__':
    # Connect to DB
    connection = psycopg2.connect(user="postgres", host="endpoint of my database", password="here would be my password", database="bottletube")
    cursor = connection.cursor()
    cursor.execute("SET SCHEMA 'bottletube';")
    connection.commit()
    # Connect to S3
    s3_resource = resource('s3', region_name='us-east-1')
    # Needs to be customized
    # run(host='your_public_dns_name',
    run(host=requests.get('http://169.254.169.254/latest/meta-data/public-hostname').text,
        port=80)
`
So this is my code. I know one should never include passwords in code; I know how AWS Secrets Manager works, but I wanted to check that the function works before I do that.
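A side note on the write-to-DB step (independent of the 500 error): building the INSERT with an f-string splices user input straight into the SQL text. psycopg2 supports parameterized queries, which is the safer pattern. A minimal sketch, reusing the cursor and variable names from the code above:

# Same INSERT as above, but letting psycopg2 handle quoting/escaping of the values.
cursor.execute(
    "INSERT INTO image_uploads (url, category) VALUES (%s, %s);",
    (f'user_uploads/{save_filename}', category),
)
connection.commit()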

Capture Athena Query Execution Details using Lambda

Under CloudWatch, I created a rule to capture the Athena Query State Change Event that will (1) write a log to a log group and (2) trigger a Lambda function that captures the Athena query execution details and pipes them to an S3 bucket. Point 2 fails: no Athena query execution details are piped to the S3 bucket. Below is the Lambda function I used:
import json
import boto3
from botocore.config import Config

my_config = Config(
    region_name='<my_region>')

print('Loading function')

def lambda_handler(event, context):
    print("Received event: " + json.dumps(event))
    print("QueryID: " + event['id'])
    # get query statistics
    client = boto3.client('athena', config=my_config)
    queries = client.get_query_execution(QueryExecutionId=event['detail']['QueryExecutionId'])
    del queries['QueryExecution']['Status']
    # saving the query statistics to s3
    s3 = boto3.resource('s3')
    object = s3.Object('<s3_bucket_path>', 'query_statistics_json/' + event['detail']['QueryExecutionId'])
    object.put(Body=str(queries['QueryExecution']))
    return 0
I used this AWS Documentation as reference:
https://docs.aws.amazon.com/athena/latest/ug/control-limits.html
The body should be of type binary data:
object.put(Body=some binary data)
Maybe you can write str(queries['QueryExecution']) to a txt file in the Lambda's /tmp directory and upload it:
content = "String content to write to a new S3 file"
s3.Object('my-bucket-name', '/tmp/newfile.txt').put(Body=content)
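A minimal sketch of that suggestion, reusing the s3 resource and event structure from the question; the bucket name and object key are placeholders, and the '.json' suffix is just illustrative:

import json

# Serialize the query execution details and stage them in Lambda's /tmp storage.
local_path = '/tmp/query_statistics.json'
with open(local_path, 'w') as f:
    json.dump(queries['QueryExecution'], f, default=str)

# Upload the staged file to S3; note the object key should not start with '/'.
s3.meta.client.upload_file(
    local_path,
    '<s3_bucket_path>',
    'query_statistics_json/' + event['detail']['QueryExecutionId'] + '.json',
)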
It's just an indentation problem: after line 11, everything should be indented (as in the corrected listing above).

How to move a blob data to Snowflake thru Python

I am trying to move the data from ADLS blob to Snowflake table.
I am able to do the same with UI.
Steps followed for UI :
Generated the following SAS token :
sp=rl&st=2021-06-01T05:45:37Z&se=2021-06-01T13:45:37Z&spr=https&sv=2020-02-10&sr=c&sig=rYYY4o%2YY3jj%2XXXXXAB%2Bo8ygrtyAVCnPOxomlOc%3D
Able to load the table with the above token in Snowflake Web UI :
copy into FIRST_LEVEL.MOVIES
from 'azure://adlsedmadifpoc.blob.core.windows.net/airflow-dif/raw-area/'
credentials=(azure_sas_token='sp=rl&st=2021-06-01T05:45:37Z&se=2021-06-01T13:45:37Z&spr=https&sv=2020-02-10&sr=c&sig=rYYY4o%2YY3jj%2XXXXXAB%2Bo8ygrtyAVCnPOxomlOc%3D')
FORCE = TRUE file_format = (TYPE = CSV);
I am trying to do the same with Python :
from azure.storage.blob import BlobServiceClient, generate_blob_sas, BlobSasPermissions
from datetime import datetime, timedelta
import snowflake.connector

def generate_sas_token(file_name):
    sas = generate_blob_sas(account_name="xxxx",
                            account_key="p5V2GELxxxxQ4tVgLdj9inKwwYWlAnYpKtGHAg==",
                            container_name="airflow-dif",
                            blob_name=file_name,
                            permission=BlobSasPermissions(read=True),
                            expiry=datetime.utcnow() + timedelta(hours=2))
    print(sas)
    return sas

sas = generate_sas_token("raw-area/moviesDB.csv")
# Connection string
conn = snowflake.connector.connect(user='xx', password='xx#123', account='xx.southeast-asia.azure', database='xx')
# Create cursor
cur = conn.cursor()
cur.execute(
    f"copy into FIRST_LEVEL.MOVIES FROM 'azure://xxx.blob.core.windows.net/airflow-dif/raw-area/moviesDB.csv' credentials=(azure_sas_token='{sas}') file_format = (TYPE = CSV) ;")
cur.execute(f" Commit ;")
# Execute SQL statement
cur.close()
conn.close()
SAS token generated in the code :
se=2021-06-01T07%3A42%3A11Z&sp=rt&sv=2020-06-12&sr=b&sig=ZhZMPSI%yyyyAPTqqE0%3D
I am unable to include the List permission while generating the SAS token through Python.
I am facing the below error :
cursor=cursor,
snowflake.connector.errors.ProgrammingError: 091003 (22000): Failure using stage area. Cause: [Server failed to authenticate the request. Make sure the value of Authorization header is formed correctly including the signature. (Status Code: 403; Error Code: AuthenticationFailed)]
I might have a list of CSV files in that folder in the future.
Any help appreciated. Thanks.
The following code worked:
from azure.storage.blob import generate_container_sas, ContainerSasPermissions
from datetime import datetime, timedelta
import snowflake.connector

def get_sas_token():
    container_sas_token = generate_container_sas(
        account_name='XX',
        account_key='p5V2GEL3AqGuPMMYXXXQ4tVgLdj9inKwwYWlAnYpKtGHAg==',
        container_name='airflow-dif',
        permission=ContainerSasPermissions(read=True, list=True),
        expiry=datetime.utcnow() + timedelta(hours=1)
    )
    print(container_sas_token)
    return container_sas_token

sas = get_sas_token()
# Connection string
conn = snowflake.connector.connect(user='XX', password='XX#123', account='XX.southeast-asia.azure', database='XX')
# Create cursor
cur = conn.cursor()
cur.execute(
    f"copy into FIRST_LEVEL.MOVIES FROM 'azure://XX.blob.core.windows.net/airflow-dif/raw-area/' credentials=(azure_sas_token='{sas}') FORCE = TRUE file_format = (TYPE = CSV) ;")
print(cur.fetchone())
cur.execute(f" Commit ;")
# Execute SQL statement
cur.close()
conn.close()
Thank you Gaurav for your inputs.

Unable to read data from AWS Glue Database/Tables using Python

My requirement is to use a Python script to read data from an AWS Glue database into a dataframe. When I researched, I found the library "awswrangler". I'm using the below code to connect and read data:
import awswrangler as wr
import boto3

profile_name = 'aws_profile_dev'
REGION = 'us-east-1'
# Retrieving credentials to connect to AWS
ACCESS_KEY_ID, SECRET_ACCESS_KEY, SESSION_TOKEN = get_profile_credentials(profile_name)
session = boto3.session.Session(
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_ACCESS_KEY,
    aws_session_token=SESSION_TOKEN
)
my_df = wr.athena.read_sql_table(table='mytable_1', database='shared_db', boto3_session=session)
However, when I run the above code, I get the following error: "ValueError: year 0 is out of range".
Alternatively, I tried using another library, "pyathena". The code I'm trying to use is:
from pyathena import connect
import pandas as pd

conn = connect(aws_access_key_id=ACCESS_KEY_ID,
               aws_secret_access_key=SECRET_ACCESS_KEY,
               aws_session_token=SESSION_TOKEN,
               s3_staging_dir='s3://my-sample-bucket/',
               region_name='us-east-1')
df = pd.read_sql("select * from AwsDataCatalog.shared_db.mytable_1 limit 1000", conn)
Using this, I'm able to retrieve data, but it works only if I use a limit. That is, if I run the query without a limit, e.g. "select * from AwsDataCatalog.shared_db.mytable_1", it gives the error: ValueError: year 0 is out of range.
Weird behavior: for example, if I run:
df = pd.read_sql("select * from AwsDataCatalog.shared_db.mytable_1 limit 1200", conn)
sometimes it gives the same error, and if I simply reduce the limit value and run it (for example with limit 1199), and later run it again with limit 1200, it works. But this doesn't work if I try to read more than ~1300 rows. I have a total of 2002 rows in the table, and I need to read the entire table.
Please help! Thank you!
Use the following code in Python to get the data you are looking for:
import boto3

query = "SELECT * from table_name"
s3_resource = boto3.resource("s3")
s3_client = boto3.client('s3')
DATABASE = 'database_name'
output = 's3://output-bucket/output-folder'
athena_client = boto3.client('athena')

# Execution
response = athena_client.start_query_execution(
    QueryString=query,
    QueryExecutionContext={
        'Database': DATABASE
    },
    ResultConfiguration={
        'OutputLocation': output,
    }
)
queryId = response['QueryExecutionId']
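The snippet above only starts the query; to get rows back into a dataframe you still have to wait for it to finish and then read the result file that Athena writes to the output location. A minimal sketch of that follow-up, reusing the names from the snippet above and assuming pandas with s3fs is available (all bucket/prefix names are illustrative):

import time
import pandas as pd

def wait_for_query(athena_client, query_id, poll_seconds=2):
    # Poll Athena until the query reaches a terminal state.
    while True:
        response = athena_client.get_query_execution(QueryExecutionId=query_id)
        state = response['QueryExecution']['Status']['State']
        if state in ('SUCCEEDED', 'FAILED', 'CANCELLED'):
            return state
        time.sleep(poll_seconds)

state = wait_for_query(athena_client, queryId)
if state == 'SUCCEEDED':
    # Athena writes the result as <QueryExecutionId>.csv under the output location.
    result_path = f"{output}/{queryId}.csv"
    df = pd.read_csv(result_path)  # reading s3:// paths requires s3fs
    print(df.head())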
I have found a way using awswrangler to query data directly from Athena into a pandas dataframe on your local machine. This doesn't require us to provide an output location on S3.
import awswrangler as wr
import boto3

profile_name = 'Dev-AWS'
REGION = 'us-east-1'
# This automatically retrieves credentials from your AWS credentials file after you run `aws configure` on the command line
ACCESS_KEY_ID, SECRET_ACCESS_KEY, SESSION_TOKEN = get_profile_credentials(profile_name)
session = boto3.session.Session(
    aws_access_key_id=ACCESS_KEY_ID,
    aws_secret_access_key=SECRET_ACCESS_KEY,
    aws_session_token=SESSION_TOKEN
)
wr.athena.read_sql_query("select * from table_name", database="db_name", boto3_session=session)
Alternatively, if you don't want to query Athena but want to read an entire Glue table, you can use:
my_df = wr.athena.read_sql_table(table= 'my_table', database= 'my_db', boto3_session=session)

boto3 check if Athena database exists

I'm making a script that creates a database in AWS Athena and then creates tables for that database. Today the DB creation was taking ages, so the tables being created referred to a DB that doesn't exist. Is there a way to check whether a DB has already been created in Athena using boto3?
This is the part that creates the DB:
client = boto3.client('athena')
client.start_query_execution(
    QueryString='create database {}'.format('db_name'),
    ResultConfiguration=config
)
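One way to answer the existence check directly (a sketch, not taken from the answers below): since Athena databases are backed by the Glue Data Catalog, you can ask Glue whether the database exists before creating tables. The helper name and database name are illustrative.

import boto3
from botocore.exceptions import ClientError

def athena_database_exists(database_name):
    # Athena databases live in the Glue Data Catalog, so Glue can be queried directly.
    glue = boto3.client('glue')
    try:
        glue.get_database(Name=database_name)
        return True
    except ClientError as e:
        if e.response['Error']['Code'] == 'EntityNotFoundException':
            return False
        raise

if not athena_database_exists('db_name'):
    # create the database here, then wait/poll for it before creating tables
    pass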
# -*- coding: utf-8 -*-
import logging
import os
from time import sleep

import boto3
import pandas as pd
from backports.tempfile import TemporaryDirectory

logger = logging.getLogger(__name__)


class AthenaQueryFailed(Exception):
    pass


class Athena(object):
    S3_TEMP_BUCKET = "please-replace-with-your-bucket"

    def __init__(self, bucket=S3_TEMP_BUCKET):
        self.bucket = bucket
        self.client = boto3.Session().client("athena")

    def execute_query_in_athena(self, query, output_s3_directory, database="csv_dumps"):
        """ Useful when client executes a query in Athena and want result in the given `s3_directory`

        :param query: Query to be executed in Athena
        :param output_s3_directory: s3 path in which client want results to be stored
        :return: s3 path
        """
        response = self.client.start_query_execution(
            QueryString=query,
            QueryExecutionContext={"Database": database},
            ResultConfiguration={"OutputLocation": output_s3_directory},
        )
        query_execution_id = response["QueryExecutionId"]
        filename = "{filename}.csv".format(filename=response["QueryExecutionId"])
        s3_result_path = os.path.join(output_s3_directory, filename)
        logger.info(
            "Query query_execution_id <<{query_execution_id}>>, result_s3path <<{s3path}>>".format(
                query_execution_id=query_execution_id, s3path=s3_result_path
            )
        )
        self.wait_for_query_to_complete(query_execution_id)
        return s3_result_path

    def wait_for_query_to_complete(self, query_execution_id):
        is_query_running = True
        backoff_time = 10
        while is_query_running:
            response = self.__get_query_status_response(query_execution_id)
            status = response["QueryExecution"]["Status"][
                "State"
            ]  # possible responses: QUEUED | RUNNING | SUCCEEDED | FAILED | CANCELLED
            if status == "SUCCEEDED":
                is_query_running = False
            elif status in ["CANCELED", "FAILED"]:
                raise AthenaQueryFailed(status)
            elif status in ["QUEUED", "RUNNING"]:
                logger.info("Backing off for {} seconds.".format(backoff_time))
                sleep(backoff_time)
            else:
                raise AthenaQueryFailed(status)

    def __get_query_status_response(self, query_execution_id):
        response = self.client.get_query_execution(QueryExecutionId=query_execution_id)
        return response
As pointed out in the other answer, the Athena waiter is still not implemented.
I use this lightweight Athena client to run the query; it returns the S3 path of the result once the query has completed.
The waiter functions for Athena are not implemented yet: Athena Waiter.
See Support AWS Athena waiter feature for a possible workaround until it is implemented in Boto3. This is how it is implemented in the AWS CLI:
while True:
    stats = self.athena.get_query_execution(execution_id)
    status = stats['QueryExecution']['Status']['State']
    if status in ['SUCCEEDED', 'FAILED', 'CANCELLED']:
        break
    time.sleep(0.2)
