I have the below code, and want to get it to return a dataframe properly. The polling logic works, but the dataframe doesn't seem to get created/returned. Right now it just returns None when called.
import boto3
import pandas as pd
import io
import re
import time
AK='mykey'
SAK='mysecret'
params = {
    'region': 'us-west-2',
    'database': 'default',
    'bucket': 'my-bucket',
    'path': 'dailyreport',
    'query': 'SELECT * FROM v_daily_report LIMIT 100'
}
session = boto3.Session(aws_access_key_id=AK,aws_secret_access_key=SAK)
def athena_query(client, params):
    response = client.start_query_execution(
        QueryString=params["query"],
        QueryExecutionContext={
            'Database': params['database']
        },
        ResultConfiguration={
            'OutputLocation': 's3://' + params['bucket'] + '/' + params['path']
        }
    )
    return response

def athena_to_s3(session, params, max_execution=5):
    client = session.client('athena', region_name=params["region"])
    execution = athena_query(client, params)
    execution_id = execution['QueryExecutionId']
    df = poll_status(execution_id, client)
    return df

def poll_status(_id, client):
    '''
    poll query status
    '''
    result = client.get_query_execution(
        QueryExecutionId=_id
    )
    state = result['QueryExecution']['Status']['State']
    if state == 'SUCCEEDED':
        print(state)
        print(str(result))
        s3_key = 's3://' + params['bucket'] + '/' + params['path'] + '/' + _id + '.csv'
        print(s3_key)
        df = pd.read_csv(s3_key)
        return df
    elif state == 'QUEUED':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'RUNNING':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'FAILED':
        return result
    else:
        print(state)
        raise Exception
df_data = athena_to_s3(session, params)
print(df_data)
I plan to move the dataframe load out of the polling function, but I'm just trying to get it to work as is right now.
I recommend taking a look at AWS Wrangler instead of the traditional boto3 Athena API. It is a newer, more purpose-built interface to data on AWS, including Athena queries, and it offers more functionality:
import awswrangler as wr
df = wr.pandas.read_sql_athena(
sql="select * from table",
database="database"
)
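(A side note, not part of the original answer: in awswrangler 1.x and later the same read lives under the athena module rather than wr.pandas, so on a current install the equivalent call would look roughly like this.)
import awswrangler as wr

# awswrangler >= 1.0 moved the Athena reader; older releases use wr.pandas.read_sql_athena
df = wr.athena.read_sql_query(
    sql="select * from table",
    database="database"
)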
Thanks to RagePwn's comment, PyAthena is also worth checking out as an alternative to boto3 for querying Athena.
If it is returning None, one cause is that state == 'FAILED'; you need to investigate the reason it failed, which is reported in 'StateChangeReason'. Note also that the 'QUEUED' and 'RUNNING' branches call poll_status recursively but never return the recursive call's result, so the outer call falls through and returns None even when the query eventually succeeds. For reference, the get_query_execution response has this shape:
{
    'QueryExecution': {
        'QueryExecutionId': 'string',
        'Query': 'string',
        'StatementType': 'DDL'|'DML'|'UTILITY',
        'ResultConfiguration': {
            'OutputLocation': 'string',
            'EncryptionConfiguration': {
                'EncryptionOption': 'SSE_S3'|'SSE_KMS'|'CSE_KMS',
                'KmsKey': 'string'
            }
        },
        'QueryExecutionContext': {
            'Database': 'string'
        },
        'Status': {
            'State': 'QUEUED'|'RUNNING'|'SUCCEEDED'|'FAILED'|'CANCELLED',
            'StateChangeReason': 'string',
            'SubmissionDateTime': datetime(2015, 1, 1),
            'CompletionDateTime': datetime(2015, 1, 1)
        },
        'Statistics': {
            'EngineExecutionTimeInMillis': 123,
            'DataScannedInBytes': 123,
            'DataManifestLocation': 'string',
            'TotalExecutionTimeInMillis': 123,
            'QueryQueueTimeInMillis': 123,
            'QueryPlanningTimeInMillis': 123,
            'ServiceProcessingTimeInMillis': 123
        },
        'WorkGroup': 'string'
    }
}
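For what it's worth, a minimal sketch of an iterative poll loop that keeps the return value; it reuses the question's module-level params, pd and time, and the 1-second sleep, and a real version would also cap the number of attempts:
def poll_status(_id, client):
    # Loop instead of recursing so the dataframe is actually returned to the caller
    while True:
        result = client.get_query_execution(QueryExecutionId=_id)
        state = result['QueryExecution']['Status']['State']
        if state == 'SUCCEEDED':
            s3_key = 's3://' + params['bucket'] + '/' + params['path'] + '/' + _id + '.csv'
            return pd.read_csv(s3_key)
        elif state in ('QUEUED', 'RUNNING'):
            time.sleep(1)
        else:
            # FAILED or CANCELLED; StateChangeReason carries the cause
            raise Exception(result['QueryExecution']['Status'].get('StateChangeReason', state))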
Just to elaborate on RagePwn's suggestion of using PyAthena, that's what I ultimately did as well. For some reason AWS Wrangler choked on me and couldn't handle the JSON that was being returned from S3. Here's the code snippet that worked for me, based on PyAthena's PyPI page:
import os
from pyathena import connect
from pyathena.util import as_pandas
aws_access_key_id = os.getenv('ATHENA_ACCESS_KEY')
aws_secret_access_key = os.getenv('ATHENA_SECRET_KEY')
region_name = os.getenv('ATHENA_REGION_NAME')
staging_bucket_dir = os.getenv('ATHENA_STAGING_BUCKET')
cursor = connect(aws_access_key_id=aws_access_key_id,
                 aws_secret_access_key=aws_secret_access_key,
                 region_name=region_name,
                 s3_staging_dir=staging_bucket_dir,
                 ).cursor()
cursor.execute(sql)
df = as_pandas(cursor)
The above assumes you have defined the following environment variables:
ATHENA_ACCESS_KEY: the AWS access key id for your AWS account
ATHENA_SECRET_KEY: the AWS secret key
ATHENA_REGION_NAME: the AWS region name
ATHENA_STAGING_BUCKET: a bucket in the same account that has the correct access settings (explanation of which is outside the scope of this answer)
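One thing the snippet leaves implicit is the sql variable passed to cursor.execute; it is just your query string, for example (reusing the query from the question):
sql = "SELECT * FROM v_daily_report LIMIT 100"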
Related
Recently I've been having some trouble with the X-Flashbots-Signature header when sending a request to the Flashbots Goerli endpoint.
My Python code looks like this:
import requests
import json
import secrets
from eth_account import Account, messages
from web3 import Web3
from math import ceil
rpcUrl = GOERLI_RPC_NODE_PROVIDER
web3 = Web3(Web3.HTTPProvider(rpcUrl))
publicKey = ETH_PUBLIC_KEY
privateKey = ETH_PRIVATE_KEY
contractAddress = GOERLI_TEST_CONTRACT # Goerli test contract
data = CONTRACT_DATA # Contract data to execute
signed = []
for _ in range(2):
    nonce = web3.eth.getTransactionCount(publicKey, 'pending')
    checksumAddress = Web3.toChecksumAddress(contractAddress)
    checksumPublic = Web3.toChecksumAddress(publicKey)
    tx = {
        'nonce': nonce,
        'to': checksumAddress,
        'from': checksumPublic,
        'value': 0,
        'gasPrice': web3.toWei(200, 'gwei'),
        'data': data
    }
    gas = web3.eth.estimateGas(tx)
    tx['gas'] = ceil(gas + gas * .1)
    signed_tx = web3.eth.account.signTransaction(tx, privateKey)
    signed.append(Web3.toHex(signed_tx.rawTransaction))

dt = {
    'jsonrpc': '2.0',
    'method': 'eth_sendBundle',
    'params': [
        {
            'txs': [
                signed[0], signed[1]  # Signed txs with web3.eth.account.signTransaction
            ],
            'blockNumber': web3.eth.block_number + 1,
            'minTimestamp': '0x0',
            'maxTimestamp': '0x0',
            'revertingTxHashes': []
        }
    ],
    'id': 1337
}

pvk = secrets.token_hex(32)
pbk = Account.from_key(pvk).address
body = json.dumps(dt)
message = messages.encode_defunct(text=Web3.keccak(text=body).hex())
signature = pbk + ':' + Account.sign_message(message, pvk).signature.hex()
hd = {
    'Content-Type': 'application/json',
    'X-Flashbots-Signature': signature,
}
res = requests.post('https://relay-goerli.flashbots.net/', headers=hd, data=body)
print(res.text)
This code is a modified version of code taken straight from the flashbots docs: https://docs.flashbots.net/flashbots-auction/searchers/advanced/rpc-endpoint/#authentication
Upon running this code I get an internal server error response. At first, I thought the problem might be fixed by replacing text=Web3.keccak(text=body).hex() with hexstr=Web3.keccak(text=body).hex() or primitive=Web3.keccak(text=body), as per the definition of messages.encode_defunct: https://eth-account.readthedocs.io/en/stable/eth_account.html#eth_account.messages.encode_defunct. But after making this replacement, I got the error signer address does not equal expected. This is very confusing, especially because I have recovered the signer from the message and signature myself and the public key does match. But whenever I send it to the Flashbots endpoint, I am left with this error.
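For reference, a minimal sketch of that local recovery check, reusing the body, pvk and pbk built above (my addition, not part of the original question):
from eth_account import Account, messages
from web3 import Web3

# Sign and recover the same digest-as-text message used for the header
message = messages.encode_defunct(text=Web3.keccak(text=body).hex())
sig = Account.sign_message(message, pvk).signature
assert Account.recover_message(message, signature=sig) == pbk  # recovered signer matches pbk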
Any ideas would be greatly appreciated.
I am a bit new to DynamoDB.
Below is the error I get when trying to get the max id of my DynamoDB table in a Python Lambda function, following the instructions in the Stack Overflow post linked below:
Dynamodb max value
An error occurred (ValidationException) when calling the Query operation: Invalid KeyConditionExpression: The expression can not be empty;\"}"
See my Lambda function code below:
import json
import boto3
TABLE_NAME = 'user-profiles'
dynamo_DB = boto3.resource('dynamodb')
def lambda_handler(event, context):
    user_id = event['user_id']
    email = event['email']
    bvn = event['bvn']
    password = event['password']
    phone = event['phone']
    gender = event['gender']
    output = ''
    if len(user_id) > 1 and len(password) > 5:
        try:
            table = dynamo_DB.Table(TABLE_NAME)
            values = list(table.query(
                KeyConditionExpression='',
                ScanIndexForward=False,
                Limit=1
            ))
            max_id = values[0]['id']
            new_id = max_id + 1
            Item = {
                'id': str(new_id),
                'profile-id': str(new_id),
                'user_id': user_id,
                'email': email,
                'bvn': bvn,
                'password': password,
                'phone': phone,
                'gender': gender
            }
            table.put_item(Item=Item)
            output += 'Data Inserted To Dynamodb Successfully'
        except Exception as e:
            output += 'error with dynamo registration ' + str(e)
            # print(output)
    else:
        output += 'invalid user or password entered, this is ' \
                  'what i received:\nusername: ' \
                  + str(user_id) + '\npassword: ' + str(password)
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": output,
        }),
    }
    # print(output)
You cannot query with an empty KeyConditionExpression; if you need to read all the records in the table you have to use scan instead. Note that scan does not support ScanIndexForward, so you cannot have it order the records for you.
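A minimal sketch of that approach, reading everything with scan (paginating via LastEvaluatedKey) and taking the max id client-side; attribute names follow the question, and this gets slow and expensive as the table grows:
items = []
scan_kwargs = {}
while True:
    page = table.scan(**scan_kwargs)
    items.extend(page['Items'])
    if 'LastEvaluatedKey' not in page:
        break
    scan_kwargs['ExclusiveStartKey'] = page['LastEvaluatedKey']

max_id = max(int(item['id']) for item in items) if items else 0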
That said, it seems like you're trying to implement primary-key auto-incrementation. A word of warning: this approach is fragile, because you can easily hit a race condition.
What I would suggest:
I guess you are using id as the primary key (aka partition key); that's okay. What I would do is upsert an extra record in the table that holds an increment value:
increment = table.update_item(
    Key={'id': 'increment'},
    UpdateExpression='ADD #increment :increment',
    ExpressionAttributeNames={'#increment': 'increment'},
    ExpressionAttributeValues={':increment': 1},
    ReturnValues='UPDATED_NEW',
)
new_id = increment['Attributes']['increment']
This query updates the record with id 'increment' and stores the newly incremented number in it; on the very first call the record is created with increment: 1, and subsequent calls keep incrementing it. ReturnValues='UPDATED_NEW' makes the call return the value after the update, so you get the new id back.
Put this code in place of the block where you query for the last record,
so your code would look like:
import json
import boto3
TABLE_NAME = 'user-profiles'
dynamo_DB = boto3.resource('dynamodb')
def lambda_handler(event, context):
    user_id = event['user_id']
    email = event['email']
    bvn = event['bvn']
    password = event['password']
    phone = event['phone']
    gender = event['gender']
    output = ''
    if len(user_id) > 1 and len(password) > 5:
        try:
            table = dynamo_DB.Table(TABLE_NAME)
            increment = table.update_item(
                Key={'id': 'increment'},
                UpdateExpression='ADD #increment :increment',
                ExpressionAttributeNames={'#increment': 'increment'},
                ExpressionAttributeValues={':increment': 1},
                ReturnValues='UPDATED_NEW',
            )
            new_id = increment['Attributes']['increment']
            Item = {
                'id': str(new_id),
                'profile-id': str(new_id),
                'user_id': user_id,
                'email': email,
                'bvn': bvn,
                'password': password,
                'phone': phone,
                'gender': gender
            }
            table.put_item(Item=Item)
            output += 'Data Inserted To Dynamodb Successfully'
        except Exception as e:
            output += 'error with dynamo registration ' + str(e)
            # print(output)
    else:
        output += 'invalid user or password entered, this is ' \
                  'what i received:\nusername: ' \
                  + str(user_id) + '\npassword: ' + str(password)
    return {
        "statusCode": 200,
        "body": json.dumps({
            "message": output,
        }),
    }
    # print(output)
and you're good.
Extra thoughts:
And to be 100% sure that there is no race condition on the incrementation, you can implement a locking mechanism this way: before incrementing, put an extra record with id 'lock' and a lock attribute set to any value, using ConditionExpression='attribute_not_exists(lock)'. Then perform the increment, and finally release the lock by deleting the 'lock' record. While that record exists, a second attempt to take the lock fails the condition (the lock attribute already exists) and throws a ConditionalCheckFailedException, which you can catch to tell the user the record is locked (or handle however you like).
Here is an example (in JavaScript, sorry):
module.exports.DynamoDbClient = class DynamoDbClient {
  constructor(tableName) {
    this.dynamoDb = new DynamoDB.DocumentClient();
    this.tableName = tableName;
  }

  async increment() {
    await this.lock();
    const {Attributes: {increment}} = await this.dynamoDb.update({
      TableName: this.tableName,
      Key: {id: 'increment'},
      UpdateExpression: 'ADD #increment :increment',
      ExpressionAttributeNames: {'#increment': 'increment'},
      ExpressionAttributeValues: {':increment': 1},
      ReturnValues: 'UPDATED_NEW',
    }).promise();
    await this.unlock();
    return increment;
  }

  async lock(key) {
    try {
      await this.dynamoDb.put({
        TableName: this.tableName,
        Item: {id: 'lock', _lock: true},
        ConditionExpression: 'attribute_not_exists(#lock)',
        ExpressionAttributeNames: {'#lock': '_lock'},
      }).promise();
    } catch (error) {
      if (error.code === 'ConditionalCheckFailedException') {
        throw new LockError(`Key is locked.`);
      }
      throw error;
    }
  }

  unlock() {
    return this.delete({id: 'lock'});
  }

  async delete(key) {
    await this.dynamoDb.delete({
      TableName: this.tableName,
      Key: key,
    }).promise();
  }
}
// usage
const client = new DynamoDbClient('table');
const newId = await client.increment();
...
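A rough Python/boto3 equivalent of the same idea, for the Lambda above (a sketch, not tested; RuntimeError stands in for the JavaScript LockError, and the table layout matches the answer):
from botocore.exceptions import ClientError

def acquire_lock(table):
    # Fails with ConditionalCheckFailedException if the 'lock' record already exists
    try:
        table.put_item(
            Item={'id': 'lock', '_lock': True},
            ConditionExpression='attribute_not_exists(#lock)',
            ExpressionAttributeNames={'#lock': '_lock'},
        )
    except ClientError as e:
        if e.response['Error']['Code'] == 'ConditionalCheckFailedException':
            raise RuntimeError('Key is locked.')
        raise

def release_lock(table):
    table.delete_item(Key={'id': 'lock'})

def next_id(table):
    acquire_lock(table)
    try:
        result = table.update_item(
            Key={'id': 'increment'},
            UpdateExpression='ADD #increment :increment',
            ExpressionAttributeNames={'#increment': 'increment'},
            ExpressionAttributeValues={':increment': 1},
            ReturnValues='UPDATED_NEW',
        )
        return result['Attributes']['increment']
    finally:
        release_lock(table)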
I've set my pipeline to invoke an AWS Lambda function. After running for 30 minutes it shows the error:
The AWS Lambda function cloudfront-invalidation failed to return a
result. Check the function to verify that it has permission to call
the PutJobSuccessResult action and that it made a call to
PutJobSuccessResult.
The Lambda role has permission to call PutJobSuccessResult, and the CodePipeline service role has permission to invoke Lambda functions.
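For context, the relevant statement on the Lambda role looks roughly like this (a sketch; these two actions do not support resource-level scoping, hence the wildcard):
{
    "Effect": "Allow",
    "Action": [
        "codepipeline:PutJobSuccessResult",
        "codepipeline:PutJobFailureResult"
    ],
    "Resource": "*"
}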
Here is my lambda code:
import boto3
import time
def lambda_handler(context, event):
    sts_connection = boto3.client('sts')
    acct_b = sts_connection.assume_role(
        RoleArn="arn:aws:iam::1234567890:role/AssumeRole",
        RoleSessionName="cross_acct_lambda"
    )
    ACCESS_KEY = acct_b['Credentials']['AccessKeyId']
    SECRET_KEY = acct_b['Credentials']['SecretAccessKey']
    SESSION_TOKEN = acct_b['Credentials']['SessionToken']
    client = boto3.client(
        'cloudfront',
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
        aws_session_token=SESSION_TOKEN,
    )
    response = client.create_invalidation(
        DistributionId='ABC',
        InvalidationBatch={
            'Paths': {
                'Quantity': 1,
                'Items': [
                    '/*',
                ]
            },
            'CallerReference': str(time.time()).replace(".", "")
        }
    )
    invalidation_id = response['Invalidation']['Id']
    print("Invalidation created successfully with Id: " + invalidation_id)
    pipeline = boto3.client('codepipeline')
    response = pipeline.put_job_success_result(
        jobId=event['CodePipeline.job']['id']
    )
    return response
Issue resolved. The original handler had its arguments swapped (lambda_handler(context, event) instead of lambda_handler(event, context)), so the CodePipeline job id could never be read from the event. Updated Lambda below, which also reports failures back to CodePipeline:
import boto3
import time
import json
import logging
def lambda_handler(event, context):
    sts_connection = boto3.client('sts')
    acct_b = sts_connection.assume_role(
        RoleArn="arn:aws:iam::123456789:role/CloudfrontAssumeRole",
        RoleSessionName="cross_acct_lambda"
    )
    ACCESS_KEY = acct_b['Credentials']['AccessKeyId']
    SECRET_KEY = acct_b['Credentials']['SecretAccessKey']
    SESSION_TOKEN = acct_b['Credentials']['SessionToken']
    client = boto3.client(
        'cloudfront',
        aws_access_key_id=ACCESS_KEY,
        aws_secret_access_key=SECRET_KEY,
        aws_session_token=SESSION_TOKEN,
    )
    response = client.create_invalidation(
        DistributionId='ABCD',
        InvalidationBatch={
            'Paths': {
                'Quantity': 1,
                'Items': [
                    '/*',
                ]
            },
            'CallerReference': str(time.time()).replace(".", "")
        }
    )
    invalidation_id = response['Invalidation']['Id']
    print("Invalidation created successfully with Id: " + invalidation_id)

    logger = logging.getLogger()
    logger.setLevel(logging.INFO)
    logger.debug(json.dumps(event))

    codepipeline = boto3.client('codepipeline')
    job_id = event['CodePipeline.job']['id']
    try:
        logger.info('Success!')
        response = codepipeline.put_job_success_result(jobId=job_id)
        logger.debug(response)
    except Exception as error:
        logger.exception(error)
        response = codepipeline.put_job_failure_result(
            jobId=job_id,
            failureDetails={
                'type': 'JobFailed',
                'message': f'{error.__class__.__name__}: {str(error)}'
            }
        )
        logger.debug(response)
Hello, I'm trying to generate a CSV from DynamoDB to S3 using a Lambda function. The thing is, I just get an empty file on S3. Please help!
import csv
import boto3
import json
dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('ReporteTelefonica')
def lambda_handler(event, context):
    AWS_BUCKET_NAME = 'reportetelefonica'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(AWS_BUCKET_NAME)
    path = 'test.csv'
    try:
        response = db.scan()
        myFile = open(path, 'w')
        for i in response['Items']:
            csv.register_dialect('myDialect', delimiter=',', quoting=csv.QUOTE_NONE)
            with myFile:
                writer = csv.writer(myFile, dialect='myDialect')
                writer.writerows(i)
            print(i)
    except:
        print("error")
    bucket.put_object(
        ACL='public-read-write',
        ContentType='application/csv',
        Key=path,
        # Body=json.dumps(i),
    )
    # print("here")
    body = {
        "uploaded": "true",
        "bucket": AWS_BUCKET_NAME,
        "path": path,
    }
    # print("then here")
    return {
        "statusCode": 200,
        "body": json.dumps(body)
    }
I'm kind of a noob at this, so what should I modify to make a complete scan of the table and write the values to the CSV on S3?
Here's a working lambda that will do the job.
import boto3
import json
import os
import pandas as pd
TABLE_NAME = os.environ.get("DDB_TABLE_NAME")
OUTPUT_BUCKET = os.environ.get("BUCKET_NAME")
TEMP_FILENAME = '/tmp/export.csv'
OUTPUT_KEY = 'export.csv'
s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)
def lambda_handler(event, context):
    response = table.scan()
    df = pd.DataFrame(response['Items'])
    df.to_csv(TEMP_FILENAME, index=False, header=True)

    # Upload temp file to S3
    s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)

    return {
        'statusCode': 200,
        'headers': {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Credentials": True,
            "content-type": "application/json"
        },
        'body': json.dumps('OK')
    }
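One caveat (my addition, not part of the original answer): a single scan call returns at most 1 MB of data, so for larger tables you would paginate with LastEvaluatedKey before building the dataframe, roughly like this:
items = []
response = table.scan()
items.extend(response['Items'])
# Keep scanning until DynamoDB stops returning a pagination key
while 'LastEvaluatedKey' in response:
    response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
    items.extend(response['Items'])
df = pd.DataFrame(items)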
You either have to close the file after you have finished writing the CSV records and then reopen it for reading and pass it to the put_object method.
Alternatively, you open the file for reading and writing, and after writing you seek back to position 0 so that the put_object method reads from the start.
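A sketch of the second variant against the question's code (it writes to /tmp, the only writable path in Lambda, and supplies the Body the original put_object call was missing):
with open('/tmp/test.csv', 'w+') as myFile:
    writer = csv.writer(myFile)
    for i in response['Items']:
        writer.writerow(list(i.values()))  # one CSV row per item
    myFile.seek(0)  # rewind so the upload reads what was just written
    bucket.put_object(
        ACL='public-read-write',
        ContentType='application/csv',
        Key='test.csv',
        Body=myFile.read(),
    )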
I am getting this error while executing an AWS Lambda function (Python 3.7) that fetches QuickSight dashboard versions. Thanks in advance!
errorMessage: "Unable to marshal response: Object of type datetime is not JSON serializable",
errorType : "Runtime.MarshalError"
Code:
import boto3
import time
import sys
client = boto3.client('quicksight')
def lambda_handler(event, context):
    response = client.list_dashboard_versions(AwsAccountId='11111', DashboardId='2222', MaxResults=10)
    return response
A quick fix could be:
import boto3
import time
import sys
import json
client = boto3.client('quicksight')
def lambda_handler(event, context):
    response = client.list_dashboard_versions(AwsAccountId='11111', DashboardId='2222', MaxResults=10)
    return json.dumps(response, default=str)
Looking at https://boto3.amazonaws.com/v1/documentation/api/1.14.8/reference/services/quicksight.html#QuickSight.Client.list_dashboard_versions the return value looks like this:
{
    'DashboardVersionSummaryList': [
        {
            'Arn': 'string',
            'CreatedTime': datetime(2015, 1, 1),
            'VersionNumber': 123,
            'Status': 'CREATION_IN_PROGRESS'|'CREATION_SUCCESSFUL'|'CREATION_FAILED'|'UPDATE_IN_PROGRESS'|'UPDATE_SUCCESSFUL'|'UPDATE_FAILED',
            'SourceEntityArn': 'string',
            'Description': 'string'
        },
    ],
    'NextToken': 'string',
    'Status': 123,
    'RequestId': 'string'
}
As you can see, CreatedTime is returned as a datetime. If you want to return this as JSON, you should transform that value.
I was struggling with this today with a method that also returns a datetime.
In my case 'JoinedTimestamp': datetime(2015, 1, 1) resulted in the same 'Unable to marshal response' error.
If you don't need the CreatedTime value you might as well remove it from the response as:
for account in list_accounts_response["Accounts"]:
    if "JoinedTimestamp" in account:
        del account["JoinedTimestamp"]
To follow up on Joseph Lane's answer, transforming this value could be something along the lines of:
for account in list_accounts_response["Accounts"]:
    if "JoinedTimestamp" in account:
        account["JoinedTimestamp"] = str(account["JoinedTimestamp"])