I have set up a trigger/Lambda to upload into DynamoDB, however I get the following error when uploading. Not sure what is going wrong.
So far I have just created a blank DynamoDB table with the primary key "PlayerWeekID" as a string, but nothing else. Is this an issue because DynamoDB isn't reading in the data types? Do I need to specify these in the Lambda or set them up in DynamoDB before running the code?
Update:
This is the Python code:
#change dataframe to json
sdl_fpl_data = dffinal.to_json(orient='records', lines=True)
s3 = boto3.resource('s3')
obj = s3.Object('bucket-name','sdl_fpl_data.json')
obj.put(Body=json.dumps(sdl_fpl_data))
Lambda:
import boto3
import json
s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    json_file_name = event['Records'][0]['s3']['object']['key']
    json_object = s3_client.get_object(Bucket=bucket, Key=json_file_name)
    jsonFileReader = json_object['Body'].read()
    jsonDict = json.loads(jsonFileReader)
    table = dynamodb.Table('my-table')
    table.put_item(Item=jsonDict)
[ERROR] ParamValidationError: Parameter validation failed:
Invalid type for parameter Item, value:
{
"GW": "GW1",
"OR": "2,149,169",
"GWP": 66,
"PB": 3,
"TM": 0,
"TC": 0,
"£": 100,
"Manager": "XXXXX",
"Team Name": "XXXXXX",
"Player_Number": "372",
"TP": 66,
"PlayerWeekID": "372GW1"
}
, type: <class 'str'>, valid types: <class 'dict'>
Traceback (most recent call last):
  File "/var/task/lambda_function.py", line 16, in lambda_handler
    table.put_item(Item=jsonDict)
Output of jsonDict
Can you share the output of your variable jsonDict?
DynamoDB needs a JSON object (a dict) as the payload: {}.
From what I understand, it looks like you're passing a JSON string (the error reports <class 'str'>) or a list of records rather than a single dict.
Ensure you are saving an object which contains the key attributes of your table and you should have no issue.
Working example:
import boto3

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test')

item = {
    "GW": "GW1",
    "OR": "2,149,169",
    "GWP": 66,
    "PB": 3,
    "TM": 0,
    "TC": 0,
    "£": 100,
    "Manager": "XXXXX",
    "Team Name": "XXXXXX",
    "Player_Number": "372",
    "TP": 66,
    "PlayerWeekID": "372GW1"
}

try:
    res = table.put_item(Item=item)
    print(res)
except Exception as e:
    print(e)
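One thing worth noting: dffinal.to_json(orient='records', lines=True) produces newline-delimited JSON (one record per line), and wrapping that string in json.dumps again stores it as a single JSON string, which is likely why the Lambda ends up with a str instead of a dict. Below is a minimal sketch, not the original poster's code, of a Lambda that parses that line-per-record format and writes each record separately; it assumes the extra json.dumps is dropped on the upload side and reuses the table name from the question.

import json
from decimal import Decimal

import boto3

s3_client = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

def lambda_handler(event, context):
    bucket = event['Records'][0]['s3']['bucket']['name']
    key = event['Records'][0]['s3']['object']['key']

    # File written with to_json(lines=True): newline-delimited JSON,
    # i.e. one record (dict) per line.
    body = s3_client.get_object(Bucket=bucket, Key=key)['Body'].read().decode('utf-8')

    table = dynamodb.Table('my-table')
    with table.batch_writer() as batch:
        for line in body.splitlines():
            if not line.strip():
                continue
            # parse_float=Decimal keeps DynamoDB happy if any values are floats
            record = json.loads(line, parse_float=Decimal)
            batch.put_item(Item=record)

table.batch_writer() groups the writes automatically; for a handful of records a plain table.put_item(Item=record) per line works just as well.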
Related
Requirement: read data from S3 to pass into an API.
Error: "error": {"code": "ModelStateInvalid", "message": "The request has exceeded the maximum number of validation errors.", "target": "HttpRequest"}
When I pass the data directly in the code as the document below, it works fine:
def create_doc(self, client):
    self.n_docs = int(self.n_docs)
    document = {'addresses': {'SingleLocation': {'city': 'ABC',
                                                 'country': 'US',
                                                 'line1': 'Main',
                                                 'postalCode': '00000',
                                                 'region': 'CA'
                                                 }
                              },
                'commit': False,
                }
    response = client.cr_transc(document)
    jsn = response.json()
But when the data lives in a file in S3 and I read it from there, it throws the error above:
def create_doc(self, client):
    self.n_docs = int(self.n_docs)
    document = self.data_from_s3()
    response = client.cr_transc(document)
    jsn = response.json()

def data_from_s3(self):
    s3 = S3Hook()
    data = s3.read_key(bucket_name=self.bucket_name, key=self.data_key)
    return data
The link below is for the read_key method in Airflow:
https://airflow.apache.org/docs/apache-airflow/1.10.6/_modules/airflow/hooks/S3_hook.html#S3Hook
Checking the source code:
def read_key(self, key, bucket_name=None):
    """
    Reads a key from S3

    :param key: S3 key that will point to the file
    :type key: str
    :param bucket_name: Name of the bucket in which the file is stored
    :type bucket_name: str
    """
    obj = self.get_key(key, bucket_name)
    return obj.get()['Body'].read().decode('utf-8')
This returns a str. You might need to use the json module to convert it:
import json

def create_doc(self, client):
    self.n_docs = int(self.n_docs)
    document = json.loads(self.data_from_s3())  # <----- convert here
    response = client.cr_transc(document)
    jsn = response.json()

def data_from_s3(self):
    s3 = S3Hook()
    data = s3.read_key(bucket_name=self.bucket_name, key=self.data_key)
    return data
I am trying to insert a large CSV file (5M records) into DynamoDB using dynamodb_client.batch_write_item().
When I insert using dynamodb_client.put_item() it works fine, but I need to be able to use it with batch_write_item() too.
Here is my code snippet for a few records (more than 1):
import json
import boto3
import csv
import pandas as pd
from datetime import datetime

roleARN = 'arn:aws:iam::123:role/xyz_role'
boto3.setup_default_session(profile_name='test_profile')
client = boto3.client('sts')
response = client.assume_role(RoleArn=roleARN,
                              RoleSessionName='RoleSessionName',
                              DurationSeconds=1800)

dynamodb_client = boto3.client('dynamodb', region_name='ap-south-1',
                               aws_access_key_id=response['Credentials']['AccessKeyId'],
                               aws_secret_access_key=response['Credentials']['SecretAccessKey'],
                               aws_session_token=response['Credentials']['SessionToken'])

# Fetching time for population
current_time = datetime.utcnow().isoformat()[:-3] + 'Z'

def convert_csv_to_json_list(file):
    items = []
    with open(file) as csvfile:
        reader = csv.DictReader(csvfile)
        for row in reader:
            data = {}
            data['col1'] = row['col1']
            data['col2'] = int(row['col2'])
            data['col3'] = int(row['col3'])
            data['Row_Created'] = current_time
            data['col4'] = row['col4']
            data['col5'] = int(row['col5'])
            data['Row_Updated'] = current_time
            items.append(data)
    return items

def batch_write(items):
    table = "sample_table"
    # writing batch
    try:
        print(type(items))
        dynamodb_client.batch_write_item(RequestItems={
            table: [{'PutRequest':
                     {
                         'Item': items
                     }}]
        })
        print(f'resource, specify all types : write succeeded.')
    except Exception as e:
        print(f'resource, specify all types : write failed: {e}')

inp_file = "sample_mapping.csv"
json_data = convert_csv_to_json_list(inp_file)
batch_write(json_data)
I keep getting:
<class 'list'>
resource, specify all types : write failed: Parameter validation failed:
Invalid type for parameter RequestItems.sample_table[0][{'col1': 'abc', 'col2': 59, 'col3': 0
, 'Row_Created': '2021-10-08T04:36:04.787Z', 'col4': 'dfrwfr', 'col5': 1, 'Row_Updated': '2021-10-08T04:36:04.787Z'}, {'col1': 'sffr', 'col2': 45, 'col3': 0
, 'Row_Created': '2021-10-08T04:36:04.787Z', 'col4': 'gty7u', 'col5': 1, 'Row_Updated': '2021-10-08T04:36:04.787Z'}], type: <class 'list'>, valid types: <class
'dict'>
Can someone help me with where I am going wrong with the batch insertion? I tried looking up the documentation too.
Each item should be in a separate PutRequest key.
RequestItems = {
    table: [
        {'PutRequest': {'Item': {}}},
        {'PutRequest': {'Item': {}}}
    ]
}
There are certain limitations with batch_write_item, such as the fact that no more than 25 items can be included in a single request.
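As a minimal sketch of how the items list could be chunked so that each item gets its own PutRequest, reusing the dynamodb_client and sample_table from the question. Note that the low-level client also expects values in DynamoDB's typed format, which boto3's TypeSerializer can produce; the UnprocessedItems handling here is deliberately simplistic.

from boto3.dynamodb.types import TypeSerializer

serializer = TypeSerializer()

def batch_write(items, table="sample_table", chunk_size=25):
    # batch_write_item accepts at most 25 put/delete requests per call
    for start in range(0, len(items), chunk_size):
        chunk = items[start:start + chunk_size]
        request_items = {
            table: [
                # one PutRequest per item, values serialized to e.g. {'S': 'abc'}, {'N': '59'}
                {'PutRequest': {'Item': {k: serializer.serialize(v) for k, v in item.items()}}}
                for item in chunk
            ]
        }
        response = dynamodb_client.batch_write_item(RequestItems=request_items)
        # anything the service could not process should be retried
        unprocessed = response.get('UnprocessedItems', {})
        if unprocessed:
            dynamodb_client.batch_write_item(RequestItems=unprocessed)

Alternatively, the higher-level boto3.resource('dynamodb').Table('sample_table').batch_writer() accepts plain Python dicts and handles the 25-item batching and unprocessed-item retries for you.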
Code is below
import boto3

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test')

def lambda_handler(event, context):
    response = table.update_item(
        Key={
            'id': "100",
            'name': "David"
        })
I have created a DynamoDB table test; my primary key is id, which is a string.
In DynamoDB my table's value for id 100 is John, and I need to update it to David. Above is the code. Why is this error being thrown?
Full error is below
"errorMessage": "An error occurred (ValidationException) when calling the UpdateItem operation: The document path provided in the update expression is invalid for update",
"errorType": "ClientError",
I tried the code below:
import boto3

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('test')

def lambda_handler(event, context):
    response = table.update_item(
        Key={
            'id': '100'
        },
        UpdateExpression='SET name = :val1',
        ExpressionAttributeValues={
            ':val1': 'David'
        })
Adding one more table to replicate the case.
Putting the item: Output >> Success.
First create the table newTable in DynamoDB:
import boto3

def lambda_handler(event, context):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('newTable')
    response = table.put_item(
        Item={
            'username': 'Ac',
            'first_name': 'DEF',
            'last_name': 'FHI',
            'age': 10,
            'account': 'GOld'
        })
How do I get the item? Output >> Error
import boto3

def lambda_handler(event, context):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('newTable')
    response = table.get_item(
        Key={
            'username': 'Ac'
        }
    )
    print(response)
Error >> Response:
"errorMessage": "An error occurred (ValidationException) when calling the GetItem operation: The provided key element does not match the schema",
"errorType": "ClientError",
Answer to the second one:
get_item and update_item need the exact key of the item, not batches, so you also need to provide the corresponding sort key.
Courtesy #Sairsreenivas
import boto3

def lambda_handler(event, context):
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table('newTable')

    # response = table.put_item(
    #     Item={
    #         'username': 'Ac',
    #         'first_name': 'DEF',
    #         'last_name': 'GH',
    #         'age': 10,
    #         'account': 'GOld'
    #     })

    # try:
    #     response = table.get_item(Key={'username': 'Mak'})
    # except Exception as e:
    #     print(e.response['Error']['Message'])
    # else:
    #     return response['Item']
    # item = response['Item']
    # print(item)

    # Get item
    response = table.get_item(Key={'username': 'Ac', 'last_name': 'GH'})
    print(response['Item'])

    table.update_item(
        Key={
            'username': 'Ac', 'last_name': 'GH'
        },
        UpdateExpression='SET age = :value1',
        ExpressionAttributeValues={
            ':value1': 20
        }
    )

    print("After update \n")
    response = table.get_item(Key={'username': 'Ac', 'last_name': 'GH'})
    print(response['Item'])
I have the below code, and want to get it to return a dataframe properly. The polling logic works, but the dataframe doesn't seem to get created/returned. Right now it just returns None when called.
import boto3
import pandas as pd
import io
import re
import time
AK = 'mykey'
SAK = 'mysecret'

params = {
    'region': 'us-west-2',
    'database': 'default',
    'bucket': 'my-bucket',
    'path': 'dailyreport',
    'query': 'SELECT * FROM v_daily_report LIMIT 100'
}

session = boto3.Session(aws_access_key_id=AK, aws_secret_access_key=SAK)
def athena_query(client, params):
    response = client.start_query_execution(
        QueryString=params["query"],
        QueryExecutionContext={
            'Database': params['database']
        },
        ResultConfiguration={
            'OutputLocation': 's3://' + params['bucket'] + '/' + params['path']
        }
    )
    return response

def athena_to_s3(session, params, max_execution=5):
    client = session.client('athena', region_name=params["region"])
    execution = athena_query(client, params)
    execution_id = execution['QueryExecutionId']
    df = poll_status(execution_id, client)
    return df

def poll_status(_id, client):
    '''
    poll query status
    '''
    result = client.get_query_execution(
        QueryExecutionId=_id
    )
    state = result['QueryExecution']['Status']['State']

    if state == 'SUCCEEDED':
        print(state)
        print(str(result))
        s3_key = 's3://' + params['bucket'] + '/' + params['path'] + '/' + _id + '.csv'
        print(s3_key)
        df = pd.read_csv(s3_key)
        return df
    elif state == 'QUEUED':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'RUNNING':
        print(state)
        print(str(result))
        time.sleep(1)
        poll_status(_id, client)
    elif state == 'FAILED':
        return result
    else:
        print(state)
        raise Exception

df_data = athena_to_s3(session, params)
print(df_data)
I plan to move the dataframe load out of the polling function, but I'm just trying to get it to work as-is for now.
I recommend you take a look at AWS Wrangler instead of using the traditional boto3 Athena API. It is a newer, more specific interface to all things data in AWS, including queries to Athena, and it gives you more functionality.
import awswrangler as wr

df = wr.pandas.read_sql_athena(
    sql="select * from table",
    database="database"
)
Thanks to #RagePwn's comment, it is also worth checking PyAthena as an alternative to the boto3 option for querying Athena.
If it is returning None, then it is because state == 'FAILED'. You need to investigate the reason it failed, which may be in 'StateChangeReason' of the get_query_execution response; the response syntax and a small sketch for surfacing it follow.
{
    'QueryExecution': {
        'QueryExecutionId': 'string',
        'Query': 'string',
        'StatementType': 'DDL'|'DML'|'UTILITY',
        'ResultConfiguration': {
            'OutputLocation': 'string',
            'EncryptionConfiguration': {
                'EncryptionOption': 'SSE_S3'|'SSE_KMS'|'CSE_KMS',
                'KmsKey': 'string'
            }
        },
        'QueryExecutionContext': {
            'Database': 'string'
        },
        'Status': {
            'State': 'QUEUED'|'RUNNING'|'SUCCEEDED'|'FAILED'|'CANCELLED',
            'StateChangeReason': 'string',
            'SubmissionDateTime': datetime(2015, 1, 1),
            'CompletionDateTime': datetime(2015, 1, 1)
        },
        'Statistics': {
            'EngineExecutionTimeInMillis': 123,
            'DataScannedInBytes': 123,
            'DataManifestLocation': 'string',
            'TotalExecutionTimeInMillis': 123,
            'QueryQueueTimeInMillis': 123,
            'QueryPlanningTimeInMillis': 123,
            'ServiceProcessingTimeInMillis': 123
        },
        'WorkGroup': 'string'
    }
}
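A minimal sketch (with a hypothetical helper name, reusing the Athena client from the question's code) of how that reason could be pulled out of the response:

def failure_reason(execution_id, client):
    # Pull the Status block from get_query_execution and report why the query failed.
    status = client.get_query_execution(
        QueryExecutionId=execution_id
    )['QueryExecution']['Status']

    if status['State'] == 'FAILED':
        # StateChangeReason carries Athena's human-readable error message
        return status.get('StateChangeReason', 'no reason reported')
    return None

For example, print(failure_reason(execution_id, client)) after the query finishes would show Athena's error message when the state is FAILED.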
Just to elaborate on RagePwn's answer of using PyAthena - that's what I ultimately did as well. For some reason AWS Wrangler choked on me and couldn't handle the JSON that was being returned from S3. Here's the code snippet that worked for me, based on PyAthena's PyPI page:
import os
from pyathena import connect
from pyathena.util import as_pandas

aws_access_key_id = os.getenv('ATHENA_ACCESS_KEY')
aws_secret_access_key = os.getenv('ATHENA_SECRET_KEY')
region_name = os.getenv('ATHENA_REGION_NAME')
staging_bucket_dir = os.getenv('ATHENA_STAGING_BUCKET')

cursor = connect(aws_access_key_id=aws_access_key_id,
                 aws_secret_access_key=aws_secret_access_key,
                 region_name=region_name,
                 s3_staging_dir=staging_bucket_dir,
                 ).cursor()
cursor.execute(sql)
df = as_pandas(cursor)
The above assumes you have defined the following environment variables (an example of setting them for local testing follows the list):
ATHENA_ACCESS_KEY: the AWS access key id for your AWS account
ATHENA_SECRET_KEY: the AWS secret key
ATHENA_REGION_NAME: the AWS region name
ATHENA_STAGING_BUCKET: a bucket in the same account that has the correct access settings (explanation of which is outside the scope of this answer)
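For local testing, one option is to set these in the environment before the snippet runs; the values below are placeholders for illustration only:

import os

# placeholder values for illustration only
os.environ['ATHENA_ACCESS_KEY'] = 'my-access-key-id'
os.environ['ATHENA_SECRET_KEY'] = 'my-secret-key'
os.environ['ATHENA_REGION_NAME'] = 'us-west-2'
os.environ['ATHENA_STAGING_BUCKET'] = 's3://my-athena-staging-bucket/results/'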
I need to read a CSV file from an S3 bucket and insert each row into DynamoDB.
def load_users_dynamodb():
    s3 = boto3.client('s3')
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table("test")

    obj = s3.get_object(Bucket='test-app-config', Key='extract_Users.csv')
    #return obj
    data = obj['Body'].read().split('\n')
    #return json.dumps(data)

    with table.batch_writer() as batch:
        for row in data:
            batch.put_item(Item={
                'registration': row.split(',')[0],
                'name': row.split(',')[1],
                'role': row.split(',')[2],
                'company': row.split(',')[3],
                'hiredcompany': row.split(',')[4],
                'region': row.split(',')[5]
            })
    return 'OK'
I'm getting an exception and I can't proceed:
Response:
{
    "errorMessage": "a bytes-like object is required, not 'str'",
    "errorType": "TypeError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 10, in lambda_handler\n    'body': load_users_dynamodb()\n",
        "  File \"/var/task/lambda_function.py\", line 21, in load_users_dynamodb\n    data = obj['Body'].read().split('\\n')\n"
    ]
}
Can someone help me please? o/
Your issue is related to decoding the object returned from S3. You need to read the file as CSV.
Take a look at the following code snippet:
import boto3
import csv

s3 = boto3.client('s3')

def lambda_handler(event, context):
    obj = s3.get_object(Bucket='Bucket_Name', Key='File_Name.csv')
    data = obj['Body'].read().decode('utf-8').splitlines()
    lines = csv.reader(data)
    headers = next(lines)
    print('headers: %s' % (headers))
    for line in lines:
        print(line)
Output :
Dummy csv.
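To connect this back to the original goal of inserting each row into DynamoDB, here is a minimal sketch that combines the CSV parsing above with the table.batch_writer() from the question; the bucket, key, and table names simply reuse the question's placeholders:

import boto3
import csv

s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')

def load_users_dynamodb():
    table = dynamodb.Table('test')
    obj = s3.get_object(Bucket='test-app-config', Key='extract_Users.csv')

    # decode the bytes first, then let csv.reader handle the splitting
    data = obj['Body'].read().decode('utf-8').splitlines()
    lines = csv.reader(data)
    next(lines)  # skip the header row

    with table.batch_writer() as batch:
        for row in lines:
            batch.put_item(Item={
                'registration': row[0],
                'name': row[1],
                'role': row[2],
                'company': row[3],
                'hiredcompany': row[4],
                'region': row[5]
            })
    return 'OK'

batch_writer takes care of grouping the put_item calls into batches and retrying unprocessed items.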