I have the AWS Lambda code below, which monitors FSx for ONTAP file systems. It works fine on its own, but as soon as I integrate it with DynamoDB it fails with the error "The provided key element does not match the schema."
As a first-time user of DynamoDB, I would appreciate some guidance on this.
Code:
import json
import os
import boto3
from datetime import datetime, timedelta
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
def lambda_handler(event, context):
    fsx = boto3.client('fsx')
    cloudwatch = boto3.client('cloudwatch')
    ses = boto3.client('ses')
    region_name = os.environ['AWS_REGION']
    dynamodb = boto3.resource('dynamodb', region_name=region_name)
    dbtable = dynamodb.Table('FsxNMonitorFsx')
    now = datetime.utcnow()
    start_time = (now - timedelta(minutes=5)).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = now.strftime('%Y-%m-%dT%H:%M:%SZ')
    table = []
    result = []
    next_token = None
    while True:
        if next_token:
            response = fsx.describe_file_systems(NextToken=next_token)
        else:
            response = fsx.describe_file_systems()
        for filesystem in response.get('FileSystems'):
            filesystem_id = filesystem.get('FileSystemId')
            table.append(filesystem_id)
        next_token = response.get('NextToken')
        if not next_token:
            break
    try:
        # Create the DynamoDB table if it does not exist
        dbtable = dynamodb.create_table(
            TableName='FsxNMonitorFsx',
            KeySchema=[
                {
                    'AttributeName': filesystem_id,
                    'KeyType': 'HASH'
                },
                {
                    'AttributeName': 'alert_sent',
                    'KeyType': 'RANGE'
                }
            ],
            AttributeDefinitions=[
                {
                    'AttributeName': filesystem_id,
                    'AttributeType': 'S'
                },
                {
                    'AttributeName': 'alert_sent',
                    'AttributeType': 'B'
                }
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )
        # Wait for the table to be created
        dbtable.meta.client.get_waiter('table_exists').wait(TableName='FsxNMonitorFsx')
    except ClientError as e:
        if e.response['Error']['Code'] != 'ResourceInUseException':
            raise
    # Code to retrieve metric data and check if alert needs to be sent
    for filesystem_id in table:
        response = cloudwatch.get_metric_data(
            MetricDataQueries=[
                {
                    'Id': 'm1',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageCapacity',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                },
                {
                    'Id': 'm2',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageUsed',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                }
            ],
            StartTime=start_time,
            EndTime=end_time
        )
        storage_capacity = response['MetricDataResults'][0]['Values']
        storage_used = response['MetricDataResults'][1]['Values']
        if storage_capacity:
            storage_capacity = storage_capacity[0]
        else:
            storage_capacity = None
        if storage_used:
            storage_used = storage_used[0]
        else:
            storage_used = None
        if storage_capacity and storage_used:
            percent_used = (storage_used / storage_capacity) * 100
        else:
            percent_used = None
        ######################################################################
        ### Check if an alert has already been sent for this filesystem_id ###
        ######################################################################
        response = dbtable.get_item(
            Key={'filesystem_id': filesystem_id}
        )
        if 'Item' in response:
            alert_sent = response['Item']['alert_sent']
        else:
            alert_sent = False
        # Send alert if storage usage exceeds threshold and no alert has been sent yet
        if percent_used > 80 and not alert_sent:
            email_body = "Dear Team,<br><br> Please Find the FSx ONTAP FileSystem Alert Report Below for the {} region.".format(region)
            email_body += "<br></br>"
            email_body += "<table>"
            email_body += "<tr>"
            email_body += "<th style='text-align: left'>FileSystemId</th>"
            email_body += "<th style='text-align: right'>Used %</th>"
            email_body += "</tr>"
            for fs in result:
                if fs['percent_used'] > 80:
                    email_body += "<tr>"
                    email_body += "<td style='text-align: left'>" + fs['filesystem_id'] + "</td>"
                    email_body += "<td style='text-align: right; color:red;'>" + str(round(fs['percent_used'], 2)) + "%</td>"
                    email_body += "</tr>"
            email_body += "</table>"
            email_body += "<br></br>"
            email_body += "Sincerely,<br>AWS FSx Alert Team"
            email_subject = "FSx ONTAP FileSystem Alert Report - {}".format(region)
            ses.send_email(
                Source='test#example.com',
                Destination={
                    'ToAddresses': ['test#example.com'],
                },
                Message={
                    'Subject': {'Data': email_subject},
                    'Body': {'Html': {'Data': email_body}}
                }
            )
            dbtable.update_item(
                TableName='FsxNMonitorFsx',
                Key={'filesystem_id': {'S': filesystem_id}},
                UpdateExpression='SET alert_sent = :val',
                ExpressionAttributeValues={':val': {'BOOL': True}}
            )
    return {
        'statusCode': 200,
        'body': json.dumps('Email sent!')
    }
Result without using DB:
FileSystemId Used %
fs-0c700005a823f755c 87.95%
fs-074999ef7111b8315 84.51%
Execution Error:
[ERROR] ClientError: An error occurred (ValidationException) when calling the GetItem operation: The provided key element does not match the schema
Code edit based on the feedback:
import os
import boto3, json
from datetime import datetime, timedelta
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
fsx = boto3.client('fsx')
cloudwatch = boto3.client('cloudwatch')
ses = boto3.client('ses')
region_name = os.environ['AWS_REGION']
dynamodb = boto3.resource('dynamodb', region_name=region_name)
def lambda_handler(event, context):
    now = datetime.utcnow()
    start_time = (now - timedelta(minutes=5)).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = now.strftime('%Y-%m-%dT%H:%M:%SZ')
    table = []
    result = []
    next_token = None
    while True:
        if next_token:
            response = fsx.describe_file_systems(NextToken=next_token)
        else:
            response = fsx.describe_file_systems()
        for filesystem in response.get('FileSystems'):
            filesystem_id = filesystem.get('FileSystemId')
            table.append(filesystem_id)
        next_token = response.get('NextToken')
        if not next_token:
            break
    try:
        # Create the DynamoDB table if it does not exist
        dbtable = dynamodb.Table('FsxNMonitorFsx')
        dbtable = dynamodb.create_table(
            TableName='FsxNMonitorFsx',
            KeySchema=[
                {'AttributeName': 'filesystem_id', 'KeyType': 'HASH'}
            ],
            AttributeDefinitions=[
                {'AttributeName': 'filesystem_id', 'AttributeType': 'S'}
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )
        # Wait for the table to be created
        dbtable.meta.client.get_waiter(
            'table_exists').wait(TableName='FsxNMonitorFsx')
    except ClientError as e:
        if e.response['Error']['Code'] != 'ResourceInUseException':
            raise
    # Code to retrieve metric data and check if alert needs to be sent
    for filesystem_id in table:
        response = cloudwatch.get_metric_data(
            MetricDataQueries=[
                {
                    'Id': 'm1',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageCapacity',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                },
                {
                    'Id': 'm2',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageUsed',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                }
            ],
            StartTime=start_time,
            EndTime=end_time
        )
        storage_capacity = response['MetricDataResults'][0]['Values']
        storage_used = response['MetricDataResults'][1]['Values']
        if storage_capacity:
            storage_capacity = storage_capacity[0]
        else:
            storage_capacity = None
        if storage_used:
            storage_used = storage_used[0]
        else:
            storage_used = None
        if storage_capacity and storage_used:
            percent_used = (storage_used / storage_capacity) * 100
        else:
            percent_used = None
        ######################################################################
        ### Check if an alert has already been sent for this filesystem_id ###
        ######################################################################
        response = dbtable.get_item(
            Key={'filesystem_id': filesystem_id}
        )
        if 'Item' in response:
            alert_sent = response['Item']['alert_sent']
        else:
            alert_sent = False
        # Send alert if storage usage exceeds threshold and no alert has been sent yet
        if percent_used > 80 and not alert_sent:
            email_body = "Dear Team,<br><br> Please Find the FSx ONTAP FileSystem Alert Report Below for the {} region.".format(
                region_name)
            email_body += "<br></br>"
            email_body += "<table>"
            email_body += "<tr>"
            email_body += "<th style='text-align: left'>FileSystemId</th>"
            email_body += "<th style='text-align: right'>Used %</th>"
            email_body += "</tr>"
            for fs in result:
                if fs['percent_used'] > 80:
                    email_body += "<tr>"
                    email_body += "<td style='text-align: left'>" + \
                        fs['filesystem_id'] + "</td>"
                    email_body += "<td style='text-align: right; color:red;'>" + \
                        str(round(fs['percent_used'], 2)) + "%</td>"
                    email_body += "</tr>"
            email_body += "</table>"
            email_body += "<br></br>"
            email_body += "Sincerely,<br>AWS FSx Alert Team"
            email_subject = "FSx ONTAP FileSystem Alert Report - {}".format(
                region_name)
            ses.send_email(
                Source='test#example.com',
                Destination={
                    'ToAddresses': ['test#example.com'],
                },
                Message={
                    'Subject': {'Data': email_subject},
                    'Body': {'Html': {'Data': email_body}}
                }
            )
            dbtable.put_item(
                Item={
                    'filesystem_id': filesystem_id,
                    'alert_sent': now.strftime('%Y-%m-%d %H:%M:%S')
                }
            )
    return {
        'statusCode': 200,
        'body': json.dumps('Email sent!')
    }
The above doesn't throw any error, but it sends an empty e-mail and also leaves the DB empty; I'm a bit lost.
There is another problem in your Lambda function as well.
You are creating the table with the value of the variable filesystem_id. You almost certainly want the partition key to be the attribute name 'filesystem_id', not whatever value that variable happens to hold:
dbtable = dynamodb.create_table(
    TableName='FsxNMonitorFsx',
    KeySchema=[
        {
            'AttributeName': 'filesystem_id',
            'KeyType': 'HASH'
        },
        {
            'AttributeName': 'alert_sent',
            'KeyType': 'RANGE'
        }
    ],
    AttributeDefinitions=[
        {
            'AttributeName': 'filesystem_id',
            'AttributeType': 'S'
        },
        {
            'AttributeName': 'alert_sent',
            'AttributeType': 'B'
        }
    ],
    ProvisionedThroughput={
        'ReadCapacityUnits': 10,
        'WriteCapacityUnits': 10
    }
)
Also, with this schema you cannot use get_item with only the hash key; you need to use query if you want to fetch data by filesystem_id alone.
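For example, a minimal sketch of such a query with the boto3 Table resource (assuming the two-key table above, with dbtable being the Table resource from earlier in the thread):

from boto3.dynamodb.conditions import Key

# query by partition key only; this works even though the table also has a sort key
response = dbtable.query(
    KeyConditionExpression=Key('filesystem_id').eq(filesystem_id)
)
items = response.get('Items', [])  # zero or more items for this filesystem_id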
UPDATE LAMBDA CODE
import os
import boto3
import json
from datetime import datetime, timedelta
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
fsx = boto3.client('fsx')
cloudwatch = boto3.client('cloudwatch')
ses = boto3.client('ses')
region_name = os.environ['AWS_REGION']
dynamodb = boto3.resource('dynamodb', region_name=region_name)
dbtable = dynamodb.Table('FsxNMonitorFsx')
def lambda_handler(event, context):
    now = datetime.utcnow()
    start_time = (now - timedelta(minutes=5)).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = now.strftime('%Y-%m-%dT%H:%M:%SZ')
    filesystem_ids = []
    result = []
    next_token = None
    # get all filesystem_ids
    while True:
        if next_token:
            response = fsx.describe_file_systems(NextToken=next_token)
        else:
            response = fsx.describe_file_systems()
        for filesystem in response.get('FileSystems'):
            filesystem_id = filesystem.get('FileSystemId')
            filesystem_ids.append(filesystem_id)
        next_token = response.get('NextToken')
        if not next_token:
            break
    # Create the table if it does not exist.
    # (This Lambda is not a good place to create the table; I would prefer
    # that you create it outside of this function.)
    try:
        dbtable = dynamodb.create_table(
            TableName='FsxNMonitorFsx',
            KeySchema=[
                {'AttributeName': 'filesystem_id', 'KeyType': 'HASH'}
            ],
            AttributeDefinitions=[
                {'AttributeName': 'filesystem_id', 'AttributeType': 'S'}
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )
        # Wait for the table to be created
        dbtable.meta.client.get_waiter(
            'table_exists').wait(TableName='FsxNMonitorFsx')
    except ClientError as e:
        if e.response['Error']['Code'] != 'ResourceInUseException':
            raise
    # Retrieve metric data and check if an alert needs to be sent
    for filesystem_id in filesystem_ids:
        response = cloudwatch.get_metric_data(
            MetricDataQueries=[
                {
                    'Id': 'm1',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageCapacity',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                },
                {
                    'Id': 'm2',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageUsed',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                }
            ],
            StartTime=start_time,
            EndTime=end_time
        )
        storage_capacity = response['MetricDataResults'][0]['Values']
        storage_used = response['MetricDataResults'][1]['Values']
        if storage_capacity:
            storage_capacity = storage_capacity[0]
        else:
            storage_capacity = None
        if storage_used:
            storage_used = storage_used[0]
        else:
            storage_used = None
        if storage_capacity and storage_used:
            percent_used = (storage_used / storage_capacity) * 100
        else:
            percent_used = None
        ######################################################################
        ### Check if an alert has already been sent for this filesystem_id ###
        ######################################################################
        response = dbtable.get_item(Key={'filesystem_id': filesystem_id})
        if 'Item' in response:
            alert_sent = response['Item']['alert_sent']
        else:
            alert_sent = False
        # Remember file systems whose usage exceeds the threshold and which
        # have not been alerted on yet (guard against missing metric data)
        if percent_used is not None and percent_used > 80 and not alert_sent:
            result.append({'filesystem_id': filesystem_id, 'percent_used': percent_used})
    # Build and send a single report e-mail after the loop, and only if there
    # is something to report (this is what prevents empty e-mails)
    if result:
        header = f"""
        Dear Team,<br><br> Please Find the FSx ONTAP FileSystem Alert Report Below for the {region_name} region.
        <br></br>
        <table>
        <tr>
        <th style='text-align: left'>FileSystemId</th>
        <th style='text-align: right'>Used %</th>
        </tr>
        """
        body = ""
        for fs in result:
            body += f"""
            <tr>
            <td style='text-align: left'>{fs['filesystem_id']}</td>
            <td style='text-align: right; color:red;'>{str(round(fs['percent_used'], 2))}%</td>
            </tr>
            """
        footer = f"""</table>
        <br></br>
        Sincerely,<br>AWS FSx Alert Team
        FSx ONTAP FileSystem Alert Report - {region_name}
        """
        email_body = header + body + footer
        ses.send_email(
            Source='test#example.com',
            Destination={
                'ToAddresses': ['test#example.com'],
            },
            Message={
                'Subject': {'Data': "Email Subject"},
                'Body': {'Html': {'Data': email_body}}
            }
        )
        # Record that an alert has been sent for each reported file system
        for fs in result:
            filesystem_id = fs['filesystem_id']
            dbtable.put_item(
                Item={
                    'filesystem_id': filesystem_id,
                    'alert_sent': True
                }
            )
    return {
        'statusCode': 200,
        'body': json.dumps('Email sent!')
    }
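One optional refinement on top of the code above (a sketch, not part of the original answer): clear the flag when usage drops back below the threshold, so the next breach alerts again. Inside the per-file-system loop, after the threshold check, something like:

        elif percent_used is not None and percent_used <= 80 and alert_sent:
            # usage recovered; reset the flag so a future breach alerts again
            dbtable.put_item(Item={'filesystem_id': filesystem_id,
                                   'alert_sent': False})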
You are setting your table with a Partition Key and Sort Key, but your GetItem only indicates the Partition Key. You can do one of two things:
Supply Sort Key also
response = dbtable.get_item(
    Key={
        'filesystem_id': filesystem_id,
        'alert_sent': alert_value
    }
)
Use Query
Note: this option can return multiple items if more than one item exists for a given filesystem_id.
response = dbtable.query(
    KeyConditionExpression='#id = :id',
    ExpressionAttributeValues={':id': filesystem_id},
    ExpressionAttributeNames={'#id': 'filesystem_id'}
)
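A hedged usage note: with the boto3 Table resource, the matches come back under the 'Items' key, so reading the flag might look like:

items = response.get('Items', [])
alert_sent = items[0].get('alert_sent') if items else False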
Table Creation
filesystem_id should be the literal string 'filesystem_id', not your variable's value.
KeySchema=[
    {
        'AttributeName': 'filesystem_id',
        'KeyType': 'HASH'
    },
    {
        'AttributeName': 'alert_sent',
        'KeyType': 'RANGE'
    }
],
Lambda clients
Clients should be created outside of the request handler, so they are reused across warm invocations:
import json
import os
import boto3
from datetime import datetime, timedelta
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
fsx = boto3.client('fsx')
cloudwatch = boto3.client('cloudwatch')
ses = boto3.client('ses')
region_name = os.environ['AWS_REGION']
dynamodb = boto3.resource('dynamodb', region_name=region_name)
dbtable = dynamodb.Table('FsxNMonitorFsx')
def lambda_handler(event, context):
Code
import os
import boto3, json
from datetime import datetime, timedelta
from boto3.dynamodb.conditions import Key
from botocore.exceptions import ClientError
fsx = boto3.client('fsx')
cloudwatch = boto3.client('cloudwatch')
ses = boto3.client('ses')
region_name = os.environ['AWS_REGION']
dynamodb = boto3.resource('dynamodb', region_name=region_name)
dbtable = dynamodb.Table('FsxNMonitorFsx')
def lambda_handler(event, context):
    now = datetime.utcnow()
    start_time = (now - timedelta(minutes=5)).strftime('%Y-%m-%dT%H:%M:%SZ')
    end_time = now.strftime('%Y-%m-%dT%H:%M:%SZ')
    table = []
    result = []
    next_token = None
    while True:
        if next_token:
            response = fsx.describe_file_systems(NextToken=next_token)
        else:
            response = fsx.describe_file_systems()
        for filesystem in response.get('FileSystems'):
            filesystem_id = filesystem.get('FileSystemId')
            table.append(filesystem_id)
        next_token = response.get('NextToken')
        if not next_token:
            break
    try:
        # Create the DynamoDB table if it does not exist
        dbtable = dynamodb.create_table(
            TableName='FsxNMonitorFsx',
            KeySchema=[
                {'AttributeName': 'filesystem_id', 'KeyType': 'HASH'}
            ],
            AttributeDefinitions=[
                {'AttributeName': 'filesystem_id', 'AttributeType': 'S'}
            ],
            ProvisionedThroughput={
                'ReadCapacityUnits': 10,
                'WriteCapacityUnits': 10
            }
        )
        # Wait for the table to be created
        dbtable.meta.client.get_waiter(
            'table_exists').wait(TableName='FsxNMonitorFsx')
    except ClientError as e:
        if e.response['Error']['Code'] != 'ResourceInUseException':
            raise
    # Code to retrieve metric data and check if alert needs to be sent
    for filesystem_id in table:
        response = cloudwatch.get_metric_data(
            MetricDataQueries=[
                {
                    'Id': 'm1',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageCapacity',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                },
                {
                    'Id': 'm2',
                    'MetricStat': {
                        'Metric': {
                            'Namespace': 'AWS/FSx',
                            'MetricName': 'StorageUsed',
                            'Dimensions': [
                                {'Name': 'FileSystemId', 'Value': filesystem_id},
                                {'Name': 'StorageTier', 'Value': 'SSD'},
                                {'Name': 'DataType', 'Value': 'All'}
                            ]
                        },
                        'Period': 60,
                        'Stat': 'Sum'
                    },
                    'ReturnData': True
                }
            ],
            StartTime=start_time,
            EndTime=end_time
        )
        storage_capacity = response['MetricDataResults'][0]['Values']
        storage_used = response['MetricDataResults'][1]['Values']
        if storage_capacity:
            storage_capacity = storage_capacity[0]
        else:
            storage_capacity = None
        if storage_used:
            storage_used = storage_used[0]
        else:
            storage_used = None
        if storage_capacity and storage_used:
            percent_used = (storage_used / storage_capacity) * 100
        else:
            percent_used = None
        ######################################################################
        ### Check if an alert has already been sent for this filesystem_id ###
        ######################################################################
        response = dbtable.get_item(
            Key={'filesystem_id': filesystem_id}
        )
        if 'Item' in response:
            alert_sent = response['Item']['alert_sent']
        else:
            alert_sent = False
        # Send alert if storage usage exceeds threshold and no alert has been sent yet
        if percent_used is not None and percent_used > 80 and not alert_sent:
            # remember this file system so it shows up in the report rows below
            result.append({'filesystem_id': filesystem_id,
                           'percent_used': percent_used})
            email_body = "Dear Team,<br><br> Please Find the FSx ONTAP FileSystem Alert Report Below for the {} region.".format(
                region_name)
            email_body += "<br></br>"
            email_body += "<table>"
            email_body += "<tr>"
            email_body += "<th style='text-align: left'>FileSystemId</th>"
            email_body += "<th style='text-align: right'>Used %</th>"
            email_body += "</tr>"
            for fs in result:
                if fs['percent_used'] > 80:
                    email_body += "<tr>"
                    email_body += "<td style='text-align: left'>" + \
                        fs['filesystem_id'] + "</td>"
                    email_body += "<td style='text-align: right; color:red;'>" + \
                        str(round(fs['percent_used'], 2)) + "%</td>"
                    email_body += "</tr>"
            email_body += "</table>"
            email_body += "<br></br>"
            email_body += "Sincerely,<br>AWS FSx Alert Team"
            email_subject = "FSx ONTAP FileSystem Alert Report - {}".format(
                region_name)
            ses.send_email(
                Source='test#example.com',
                Destination={
                    'ToAddresses': ['test#example.com'],
                },
                Message={
                    'Subject': {'Data': email_subject},
                    'Body': {'Html': {'Data': email_body}}
                }
            )
            # With the Table resource, keys and values are plain Python values,
            # not the low-level {'S': ...} / {'BOOL': ...} format
            dbtable.update_item(
                Key={'filesystem_id': filesystem_id},
                UpdateExpression='SET alert_sent = :val',
                ExpressionAttributeValues={':val': True}
            )
    return {
        'statusCode': 200,
        'body': json.dumps('Email sent!')
    }
I am trying to follow the tutorial within the documentation:
https://docs.aws.amazon.com/amazondynamodb/latest/developerguide/GettingStarted.Python.01.html
as such:
import boto3

def create_movie_table(dynamodb=None):
    if not dynamodb:
        dynamodb = boto3.resource('dynamodb',
                                  aws_access_key_id="anything",
                                  aws_secret_access_key="anything",
                                  region_name='us-east-2',
                                  endpoint_url="http://localhost:8000")
    table = dynamodb.create_table(
        TableName='DailyMovers',
        KeySchema=[
            {
                'AttributeName': 'date',
                'KeyType': 'HASH'  # Partition key
            },
            {
                'AttributeName': 'type',
                'KeyType': 'RANGE'  # Sort key
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': 'date',
                'AttributeType': 'S'
            },
            {
                'AttributeName': 'type',
                'AttributeType': 'S'
            },
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        }
    )
    return table

if __name__ == '__main__':
    create_movie_table()
However, I keep running into a connection error on localhost:8000.
I have installed boto3 properly.
I am not quite sure what I am doing wrong.
Thank you.
The version of DynamoDB that runs from localhost for testing is a separate download from Amazon which requires Java to run. I don't believe it's part of any other package, including the AWS CLI. You can find instructions for it at Setting Up DynamoDB Local (Downloadable Version).
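As a rough sketch, once the jar is downloaded and Java is installed, something like the following should bring it up and let the tutorial code connect (the java command and the -sharedDb flag follow the linked setup guide; adjust paths to your download location):

# Start DynamoDB Local from the directory you extracted it to:
#   java -Djava.library.path=./DynamoDBLocal_lib -jar DynamoDBLocal.jar -sharedDb
# Then verify the endpoint is reachable before running create_movie_table():
import boto3

dynamodb = boto3.resource('dynamodb',
                          aws_access_key_id="anything",
                          aws_secret_access_key="anything",
                          region_name='us-east-2',
                          endpoint_url="http://localhost:8000")
print(list(dynamodb.tables.all()))  # empty list on a fresh local instance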
I have an existing DynamoDB table, and I want to write some Python code to append an attribute (of type List) to the table. Here is what I tried:
users.put_item(
    Item={
        "new_attribute": []
    }
)
But this didn't work. I looked everywhere online but couldn't find anything; I know I must be missing something basic. Any help?
Here is a full example which works. Note that put_item must include the table's key and replaces the entire item, which is why your attempt failed; to add or append to an attribute on an existing item you want update_item.
### Simulating an Insert and Update to a List

# Create table
import boto3
from botocore.exceptions import ClientError  # needed for the except clause below

dynamodb = boto3.resource('dynamodb')
try:
    table = dynamodb.create_table(
        TableName='Test_list',
        KeySchema=[
            {
                'AttributeName': '_id',
                'KeyType': 'HASH'  # Partition key
            }
        ],
        AttributeDefinitions=[
            {
                'AttributeName': '_id',
                'AttributeType': 'N'
            }
        ],
        ProvisionedThroughput={
            'ReadCapacityUnits': 5,
            'WriteCapacityUnits': 5
        }
    )
except ClientError as e:
    if e.response['Error']['Code']:
        print(e.response['Error']['Message'])
        print(e.response)

## Add a record with a list
table = dynamodb.Table('Test_list')
ll = ['one', 'two']
resp = table.put_item(
    Item={
        '_id': 1,
        'mylist': ll
    }
)

# Update the list
new_ll = ['three', 'four']
response = table.update_item(
    Key={
        '_id': 1
    },
    UpdateExpression="SET #l = list_append(#l, :vals)",
    ExpressionAttributeNames={
        "#l": 'mylist'
    },
    ExpressionAttributeValues={
        ":vals": new_ll
    }
)

# fetch the record to verify
resp = table.get_item(Key={'_id': 1})
print(resp['Item'])
You will see the output:
{'_id': Decimal('1'), 'mylist': ['one', 'two', 'three', 'four']}
import boto3

dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table('<your-ddb-table-name>')

table.update_item(
    Key={
        'PK': '<pk>',
        'SK': '<sk>'
    },
    UpdateExpression='SET new_attribute = :list',
    ExpressionAttributeValues={
        ':list': []
    }
)
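If the list attribute may not exist yet on the item, list_append alone fails; a common pattern (a sketch, assuming the Test_list table from the first answer) seeds it with if_not_exists:

response = table.update_item(
    Key={'_id': 1},
    # start from an empty list when the attribute is missing, then append
    UpdateExpression="SET #l = list_append(if_not_exists(#l, :empty), :vals)",
    ExpressionAttributeNames={"#l": "mylist"},
    ExpressionAttributeValues={":empty": [], ":vals": ['five']}
)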
The data looks like this:
The expected JSON format is like this:
[
    {
        "DataExtractName": "SalesDataExtract",
        "BusinessName": {
            "InvoiceDate": {
                "SourceSystem": {
                    "MYSQL": "Invc_Dt",
                    "CSV": "Invc_Date"
                },
                "DataType": {
                    "MYSQL": "varchar",
                    "CSV": "string"
                }
            },
            "Description": {
                "SourceSystem": {
                    "MYSQL": "Prod_Desc",
                    "CSV": "Prod_Descr"
                },
                "DataType": {
                    "MYSQL": "varchar",
                    "CSV": "string"
                }
            }
        }
    },
    {
        "DataExtractName": "DateDataExtract",
        "BusinessName": {
            "InvoiceDate": {
                "SourceSystem": {
                    "MYSQL": "Date"
                },
                "DataType": {
                    "MYSQL": "varchar"
                }
            }
        }
    }
]
How do I achieve this using Python dataframes? Or do I need to write a script to shape the data like this?
Note
I've tried using df.to_json and df.to_dict.
With so many nested structures, you should use marshmallow. It is built with your use case in mind. Please check out the excellent documentation: https://marshmallow.readthedocs.io/en/stable/ . All you need is the basic usage.
It is a lot of code, but better to be explicit than clever. I am sure a shorter solution exists, but it would probably be unmaintainable. Also, I had to build your dataframe myself; please provide it in a data format next time.
import pandas as pd
import marshmallow as ma

# build test data
df = pd.DataFrame.from_records([
    ['InvoiceDate', 'MYSQL', 'Invc_Dt', 'varchar', 'SalesDataExtract'],
    ['InvoiceDate', 'CSV', 'Invc_Date', 'string', 'SalesDataExtract'],
    ['Description', 'MYSQL', 'Prod_Descr', 'varchar', 'SalesDataExtract'],
    ['Description', 'CSV', 'Prod_Descr', 'string', 'SalesDataExtract'],
    ['InvoiceDate', 'MYSQL', 'Date', 'varchar', 'DateDataExtract'],
])
df.columns = ['BusinessName', 'SourceSystem', 'FunctionalName', 'DataType', 'DataExtractName']

# define marshmallow schemas
class SourceSystemTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class DataTypeSchema(ma.Schema):
    MYSQL = ma.fields.String()
    CSV = ma.fields.String()

class InvoiceDateSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class DescriptionSchema(ma.Schema):
    SourceSystem = ma.fields.Nested(SourceSystemTypeSchema())
    DataType = ma.fields.Nested(DataTypeSchema())

class BusinessNameSchema(ma.Schema):
    InvoiceDate = ma.fields.Nested(InvoiceDateSchema())
    Description = ma.fields.Nested(DescriptionSchema())

class DataSchema(ma.Schema):
    DataExtractName = ma.fields.String()
    BusinessName = ma.fields.Nested(BusinessNameSchema())

# building json
result = []
mask_business_name_invoicedate = df.BusinessName == 'InvoiceDate'
mask_business_name_description = df.BusinessName == 'Description'
for data_extract_name in set(df['DataExtractName'].to_list()):
    mask_data_extract_name = df.DataExtractName == data_extract_name
    # you need these two helper dfs to get the dictionaries
    df_source_system = df[mask_data_extract_name & mask_business_name_invoicedate].set_index('SourceSystem').to_dict(orient='dict')
    df_description = df[mask_data_extract_name & mask_business_name_description].set_index('SourceSystem').to_dict(orient='dict')
    # all dictionaries are defined, so you can use your schemas
    source_system_type = SourceSystemTypeSchema().dump(df_source_system['FunctionalName'])
    data_type = DataTypeSchema().dump(df_source_system['DataType'])
    source_system = SourceSystemTypeSchema().dump(df_description['FunctionalName'])
    invoice_date = InvoiceDateSchema().dump({'SourceSystemType': source_system_type, 'DataType': data_type})
    description = DescriptionSchema().dump({'SourceSystem': source_system, 'DataType': data_type})
    business_name = BusinessNameSchema().dump({'InvoiceDate': invoice_date, 'Description': description})
    data = DataSchema().dump({'DataExtractName': data_extract_name, 'BusinessName': business_name})
    # end result
    result.append(data)
Now,
ma.pprint(result)
returns
[{'BusinessName': {'Description': {'DataType': {'CSV': 'string',
                                                'MYSQL': 'varchar'},
                                   'SourceSystem': {'CSV': 'Prod_Descr',
                                                    'MYSQL': 'Prod_Descr'}},
                   'InvoiceDate': {'DataType': {'CSV': 'string',
                                                'MYSQL': 'varchar'}}},
  'DataExtractName': 'SalesDataExtract'},
 {'BusinessName': {'Description': {'DataType': {'MYSQL': 'varchar'},
                                   'SourceSystem': {}},
                   'InvoiceDate': {'DataType': {'MYSQL': 'varchar'}}},
  'DataExtractName': 'DateDataExtract'}]
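For what it's worth, a shorter marshmallow-free sketch with plain pandas groupby (assuming the same dataframe and column names as built above) produces a similar nested structure:

import pandas as pd

# assumes df has columns BusinessName, SourceSystem, FunctionalName,
# DataType, DataExtractName as in the answer above
result = []
for extract_name, g in df.groupby('DataExtractName'):
    business = {}
    for business_name, bg in g.groupby('BusinessName'):
        business[business_name] = {
            # one entry per source system: system name -> functional name
            'SourceSystem': dict(zip(bg['SourceSystem'], bg['FunctionalName'])),
            # one entry per source system: system name -> data type
            'DataType': dict(zip(bg['SourceSystem'], bg['DataType'])),
        }
    result.append({'DataExtractName': extract_name, 'BusinessName': business})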
I want to execute a spark-submit job on an AWS EMR cluster based on a file-upload event on S3. I am using an AWS Lambda function to capture the event, but I have no idea how to submit a spark-submit job to the EMR cluster from the Lambda function.
Most of the answers I found talk about adding a step to the EMR cluster, but I do not know whether an added step can run "spark-submit --with args".
You can; I had to do the same thing last week!
Using boto3 for Python (other languages definitely have a similar solution), you can either start a cluster with the step defined, or attach a step to an already running cluster.
Defining the cluster with the step
import boto3

def lambda_handler(event, context):
    conn = boto3.client("emr")
    cluster_id = conn.run_job_flow(
        Name='ClusterName',
        ServiceRole='EMR_DefaultRole',
        JobFlowRole='EMR_EC2_DefaultRole',
        VisibleToAllUsers=True,
        LogUri='s3n://some-log-uri/elasticmapreduce/',
        ReleaseLabel='emr-5.8.0',
        Instances={
            'InstanceGroups': [
                {
                    'Name': 'Master nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'MASTER',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 1,
                },
                {
                    'Name': 'Slave nodes',
                    'Market': 'ON_DEMAND',
                    'InstanceRole': 'CORE',
                    'InstanceType': 'm3.xlarge',
                    'InstanceCount': 2,
                }
            ],
            'Ec2KeyName': 'key-name',
            'KeepJobFlowAliveWhenNoSteps': False,
            'TerminationProtected': False
        },
        Applications=[{
            'Name': 'Spark'
        }],
        Configurations=[{
            "Classification": "spark-env",
            "Properties": {},
            "Configurations": [{
                "Classification": "export",
                "Properties": {
                    "PYSPARK_PYTHON": "python35",
                    "PYSPARK_DRIVER_PYTHON": "python35"
                }
            }]
        }],
        BootstrapActions=[{
            'Name': 'Install',
            'ScriptBootstrapAction': {
                'Path': 's3://path/to/bootstrap.script'
            }
        }],
        Steps=[{
            'Name': 'StepName',
            'ActionOnFailure': 'TERMINATE_CLUSTER',
            'HadoopJarStep': {
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': [
                    "/usr/bin/spark-submit", "--deploy-mode", "cluster",
                    's3://path/to/code.file', '-i', 'input_arg',
                    '-o', 'output_arg'
                ]
            }
        }],
    )
    return "Started cluster {}".format(cluster_id)
Attaching a step to an already running cluster
As per here
import sys
import time

import boto3

def lambda_handler(event, context):
    conn = boto3.client("emr")
    # chooses the first cluster which is Running or Waiting
    # possibly can also choose by name or already have the cluster id
    clusters = conn.list_clusters()
    # choose the correct cluster
    clusters = [c["Id"] for c in clusters["Clusters"]
                if c["Status"]["State"] in ["RUNNING", "WAITING"]]
    if not clusters:
        sys.stderr.write("No valid clusters\n")
        sys.exit(1)
    # take the first relevant cluster
    cluster_id = clusters[0]
    # code location on your emr master node
    CODE_DIR = "/home/hadoop/code/"
    # spark configuration example
    step_args = ["/usr/bin/spark-submit", "--spark-conf", "your-configuration",
                 CODE_DIR + "your_file.py", '--your-parameters', 'parameters']
    step = {"Name": "what_you_do-" + time.strftime("%Y%m%d-%H:%M"),
            'ActionOnFailure': 'CONTINUE',
            'HadoopJarStep': {
                'Jar': 's3n://elasticmapreduce/libs/script-runner/script-runner.jar',
                'Args': step_args
            }}
    action = conn.add_job_flow_steps(JobFlowId=cluster_id, Steps=[step])
    return "Added step: %s" % (action)
If you want to execute a Spark jar, here is AWS Lambda Python code that submits it to the cluster through the Livy REST endpoint (port 8998) on the EMR master node, rather than shelling out to spark-submit:
from botocore.vendored import requests
import json

def lambda_handler(event, context):
    headers = {"content-type": "application/json"}
    url = 'http://ip-address.ec2.internal:8998/batches'
    payload = {
        # 'file' takes the main application jar; dependency jars go in 'jars'
        'file': 's3://Bucket/Orchestration/SparkCode.jar',
        'jars': ['s3://Bucket/Orchestration/RedshiftJDBC41.jar',
                 's3://Bucket/Orchestration/mysql-connector-java-8.0.12.jar'],
        'className': 'Main Class Name',
        'args': [event.get('rootPath')]
    }
    res = requests.post(url, data=json.dumps(payload), headers=headers, verify=False)
    json_data = json.loads(res.text)
    return json_data.get('id')
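A hedged follow-up sketch: Livy also exposes GET /batches/{id}, so the returned id can be polled to track the job (re-using the same host as above; names here are illustrative):

def get_batch_state(batch_id):
    # states reported by Livy include 'starting', 'running', 'success', 'dead'
    res = requests.get('http://ip-address.ec2.internal:8998/batches/{}'.format(batch_id),
                       headers={"content-type": "application/json"})
    return json.loads(res.text).get('state')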