Export DynamoDB to CSV on S3 with Lambda function (Python)

Hello, I'm trying to generate a CSV from DynamoDB to S3 using a Lambda function. The problem is that I just get an empty file on S3. Please help!
import csv
import boto3
import json

dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('ReporteTelefonica')

def lambda_handler(event, context):
    AWS_BUCKET_NAME = 'reportetelefonica'
    s3 = boto3.resource('s3')
    bucket = s3.Bucket(AWS_BUCKET_NAME)
    path = 'test.csv'
    try:
        response = db.scan()
        myFile = open(path, 'w')
        for i in response['Items']:
            csv.register_dialect('myDialect', delimiter=',', quoting=csv.QUOTE_NONE)
            with myFile:
                writer = csv.writer(myFile, dialect='myDialect')
                writer.writerows(i)
            print(i)
    except:
        print("error")
    bucket.put_object(
        ACL='public-read-write',
        ContentType='application/csv',
        Key=path,
        # Body=json.dumps(i),
    )
    # print("here")
    body = {
        "uploaded": "true",
        "bucket": AWS_BUCKET_NAME,
        "path": path,
    }
    # print("then here")
    return {
        "statusCode": 200,
        "body": json.dumps(body)
    }
I'm kind of a noob at this, so what should I modify to successfully scan the whole table and write its values to the CSV on S3?

Here's a working lambda that will do the job.
import boto3
import json
import os
import pandas as pd

TABLE_NAME = os.environ.get("DDB_TABLE_NAME")
OUTPUT_BUCKET = os.environ.get("BUCKET_NAME")
TEMP_FILENAME = '/tmp/export.csv'
OUTPUT_KEY = 'export.csv'

s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)

def lambda_handler(event, context):
    response = table.scan()
    df = pd.DataFrame(response['Items'])
    df.to_csv(TEMP_FILENAME, index=False, header=True)

    # Upload temp file to S3
    s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)

    return {
        'statusCode': 200,
        'headers': {
            "Access-Control-Allow-Origin": "*",
            "Access-Control-Allow-Credentials": True,
            "content-type": "application/json"
        },
        'body': json.dumps('OK')
    }
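One caveat with the handler above: a single table.scan() call returns at most 1 MB of data, so for larger tables you need to keep scanning with LastEvaluatedKey. A minimal sketch of a paginated scan you could drop in (hypothetical helper name, same table resource as above):

def scan_full_table(table):
    # Keep scanning until DynamoDB stops returning LastEvaluatedKey
    items = []
    response = table.scan()
    items.extend(response['Items'])
    while 'LastEvaluatedKey' in response:
        response = table.scan(ExclusiveStartKey=response['LastEvaluatedKey'])
        items.extend(response['Items'])
    return items

Then build the DataFrame with df = pd.DataFrame(scan_full_table(table)) instead of a single scan.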

You either have to close the file after you have finished writing the CSV records and then reopen it for reading and pass it to the put_object method.
Alternatively, open the file for reading and writing, and after writing seek back to position 0 so that put_object reads from the start.
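A minimal sketch of the second approach, applied to the original handler (note that inside Lambda only /tmp is writable, so the path is changed accordingly):

path = '/tmp/test.csv'
response = db.scan()
with open(path, 'w+', newline='') as myFile:
    writer = csv.writer(myFile)
    for item in response['Items']:
        writer.writerow(list(item.values()))
    myFile.seek(0)  # rewind so the upload reads from the start
    bucket.put_object(
        ACL='public-read-write',
        ContentType='application/csv',
        Key='test.csv',
        Body=myFile.read(),
    )

Writing one row per item is what actually puts the values into the file; writer.writerows(i) on a dict iterates over its keys, not its values.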

Related

Upload csv via API gateway to S3

I am trying to set up an AWS API Gateway that can receive a POST request and upload a CSV file to S3. Ideally, I would like to make some transformations to the file before uploading it to S3 (renaming and formatting some columns to normalize their names across different uploads).
So far, I have set up my API Gateway to receive the request and send it to an AWS Lambda. I use Lambda proxy integration. The triggered lambda is as follows:
import logging
import pandas as pd
import boto3

logger = logging.getLogger()
logger.setLevel(logging.INFO)
s3 = boto3.client("s3")

def handler(event, context):
    logger.info(f"Event: {event}")
    df = pd.read_csv(event['body']['file'])
    logger.info(f"df1: {df}")

    # Provided parameters
    try:
        code = event['body']['code']
    except KeyError:
        logger.info('Code not provided')
        code = 'Code'
    try:
        date = event['body']['date']
    except KeyError:
        logger.info('Date not provided')
        date = 'Date'
    try:
        debit = event['body']['debit']
    except KeyError:
        logger.info('Debit not provided')
        debit = 'Debit'
    try:
        credit = event['body']['credit']
    except KeyError:
        logger.info('Credit not provided')
        credit = 'Credit'
    try:
        id = event['body']['id']
    except KeyError:
        logger.info('Id not provided')
        id = '001'

    df.rename(columns={code: 'Code', date: 'Date', credit: 'Credit', debit: 'Debit'})
    df.to_csv(f's3://bucket/{id}/file.csv', line_terminator='\n', sep=';', date_format='%Y-%m-%d %H:%M:%S')

    return {
        'statusCode': 200,
        'headers': {
            'Content-Type': 'text/csv',
            'Access-Control-Allow-Origin': '*'
        },
        'body': {
            'uploaded': True
        },
        'isBase64Encoded': False
    }
To test this API, I use the following function:
import requests

csv_file = open("file.csv", 'rb')
headers = {"x-api-key": "xxx", "Content-Type": "text/csv"}
url = "https://xxx.execute-api.xxx.amazonaws.com/xxx"
body = {
    "file": csv_file,
    "code": "my_code"
}
# files = {
#     "file": ("file.csv", open('file.csv', 'r'), 'text/csv')
# }
r = requests.post(url=url, headers=headers, data=body)
print(r.text)
The output is {"message": "Internal server error"}, and if I look in CloudWatch logs, I see that the event is encoded this way:
'body': 'file=%EF%BB%BFCol1%3BCol2%3BCol3%3BCol4%0D%0A&file=11%3B12%3B13%3B14%3B%0D%0A&file=21%3B22%3B23%3B24%3B...'
It looks like the body is encoded and passed row by row into different "file" fields. For a file with about 5000 rows I get the error OSError: [Errno 36] File name too long when trying to read it.
Is there another way to proceed in order to get a full dataset that I can transform into a pandas dataframe?
I have also seen suggestions with multipart/form-data, using files=files in the request, or using the csv library, but I keep getting similar errors.
Thank you
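For what it's worth, one way around the form-encoding problem (a sketch, not from this thread) is to base64-encode the file on the client and send it inside a JSON body; with Lambda proxy integration, event['body'] arrives as a string that you can json.loads and then decode:

# client side (hypothetical)
import base64
import requests

with open("file.csv", "rb") as f:
    payload = {"file": base64.b64encode(f.read()).decode("utf-8"), "code": "my_code"}
headers = {"x-api-key": "xxx", "Content-Type": "application/json"}
r = requests.post(url="https://xxx.execute-api.xxx.amazonaws.com/xxx", headers=headers, json=payload)

# Lambda side (hypothetical)
import base64
import io
import json
import pandas as pd

def handler(event, context):
    body = json.loads(event['body'])  # proxy integration delivers the body as a string
    df = pd.read_csv(io.BytesIO(base64.b64decode(body['file'])))
    # ... rename columns and write to S3 as before ...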

Using AWS Lambda to convert JSON files stored in S3 Bucket to CSV

I'm new to Lambda and Python and I've faced an issue with my Lambda function.
I have several JSON files stored in an S3 bucket, and I wish to convert all of them to CSV format.
I was referring to the Lambda function posted in this tutorial: https://sysadmins.co.za/convert-csv-to-json-files-with-aws-lambda-and-s3-events/
Lambda function:
import json
import csv
import boto3
import os
import datetime as dt

s3 = boto3.client('s3')

def lambda_handler(event, context):
    datestamp = dt.datetime.now().strftime("%Y/%m/%d")
    timestamp = dt.datetime.now().strftime("%s")

    filename_json = "/tmp/file_{ts}.json".format(ts=timestamp)
    filename_csv = "/tmp/file_{ts}.csv".format(ts=timestamp)
    keyname_s3 = "uploads/output/{ds}/{ts}.json".format(ds=datestamp, ts=timestamp)

    json_data = []

    for record in event['Records']:
        bucket_name = record['s3']['bucket']['name']
        key_name = record['s3']['object']['key']
        s3_object = s3.get_object(Bucket=bucket_name, Key=key_name)
        data = s3_object['Body'].read()
        contents = data.decode('utf-8')

        with open(filename_csv, 'a') as csv_data:
            csv_data.write(contents)

    with open(filename_csv) as csv_data:
        csv_reader = csv.DictReader(csv_data)
        for csv_row in csv_reader:
            json_data.append(csv_row)

    with open(filename_json, 'w') as json_file:
        json_file.write(json.dumps(json_data))

    with open(filename_json, 'r') as json_file_contents:
        response = s3.put_object(Bucket=bucket_name, Key=keyname_s3, Body=json_file_contents.read())

    os.remove(filename_csv)
    os.remove(filename_json)

    return {
        'statusCode': 200,
        'body': json.dumps('CSV converted to JSON and available at: {bucket}/{key}'.format(bucket=bucket_name, key=keyname_s3))
    }
I want to achieve a similar outcome using Lambda, but from JSON to CSV instead. How may I go about doing this?
I'd suggest having a look at the convtools library:
from convtools import conversion as c
from convtools.contrib.tables import Table
import json

# input.json
"""
{
    "records": [
        {"a": 1, "b": "c"},
        {"a": 2, "b": "d"},
        {"a": 3, "b": "e"}
    ]
}
"""

with open("input.json") as f:
    input_data = json.load(f)

Table.from_rows(input_data["records"]).into_csv("output.csv")

# output.csv
"""
a,b
1,c
2,d
3,e
"""

How to access information from a config.json file in a Python file?

The problem is that I'm unable to access the information from the config.json file in my Python file.
I have provided the JSON data and the Python code below.
I have tried everything in the requests module. I can get the response without the config file, but I need it to work with the config file.
The following is the JSON file:
{
    "api_data": {
        "request_url": "https://newapi.zivame.com/api/v1/catalog/list",
        "post_data": {"category_ids": "948", "limit": "10000"},
        "my_headers": {"Content-Type": "application/json"}
    },
    "redshift": {
        "host": "XXX.XXXX.XXX",
        "user": "XXXX",
        "password": "XXXXXXXX",
        "port": 8080,
        "db": "XXXX"
    },
    "s3": {
        "access_key": "XXXXXXXXX",
        "secret_key": "XXXXXXXXXX",
        "region": "XX-XXXXX-1",
        "path": "XXXXXXXXXXXX/XXX",
        "table": "XXXXXX",
        "bucket": "XXXX",
        "file": "XXXXXX",
        "copy_column": "XXX",
        "local_path": "XXXXX"
    },
    "csv_file": {
        "promo_zivame": ""
    }
}
And this is the program:
#!/usr/bin/python
import json
import psycopg2
import requests
import os

BASE_PATH = os.path.dirname(os.path.realpath(__file__))

with open(BASE_PATH + '/config.json') as json_data_file:
    data = json.load(json_data_file)

#api_config = data['api_data']
#redshift = data['redshift']
s3_config = data['s3']
#x = print(api_config.get('request_url'))

class ApiResponse:
    #api response
    def api_data(self, api_config):
        print("starting api_data")
        try:
            self.ApiResponse = requests.post(api_config['request_url'], api_config['post_data'], api_config['my_headers'])
            data_1 = self.ApiResponse
            #data = json.dump(self.ApiResponse)
            print("API Result Response")
            print(())
            print(self.ApiResponse)
            return (self.ApiResponse)
        except Exception:
            print("response not found")
            return False

    def redshift_connect(self, redshift):
        try:
            # Amazon Redshift connect string
            self.con = psycopg2.connect(
                host=redshift['host'],
                user=redshift['user'],
                port=redshift['port'],
                password=redshift['password'],
                dbname=redshift['db'])
            print(self.con)
            return self.con
        except Exception:
            print("Error in Redshift connection")
            return False

def main():
    c1 = ApiResponse()
    api_config = data['api_data']
    redshift = data['redshift']
    c1.api_data(api_config)
    c1.api_data(data)
    c1.redshift_connect(redshift)

if __name__ == '__main__':
    main()
The third argument to requests.post() is json. To provide headers, you need to pass the argument by name explicitly, as #JustinEzequiel suggested. See the requests docs here: 2.python-requests.org/en/v1.1.0/user/quickstart/#custom-headers
requests.post(api_config['request_url'], json=api_config['post_data'], headers=api_config['my_headers'])
Borrowing code from https://stackoverflow.com/a/16696317/5386938
import requests

api_config = {
    "request_url": "https://newapi.zivame.com/api/v1/catalog/list",
    "post_data": {"category_ids": "948", "limit": "10000"},
    "my_headers": {"Content-Type": "application/json"}
}

local_filename = 'the_response.json'
with requests.post(api_config['request_url'], json=api_config['post_data'], headers=api_config['my_headers'], stream=True) as r:
    r.raise_for_status()
    with open(local_filename, 'wb') as f:
        for chunk in r.iter_content(chunk_size=8192):
            if chunk:  # filter out keep-alive new chunks
                f.write(chunk)
This saves the response into a file ('the_response.json') that you can then pass around. Note the stream=True passed to requests.post.
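Applied to the ApiResponse class from the question, the api_data method would then look roughly like this (a sketch, same config structure as above):

class ApiResponse:
    def api_data(self, api_config):
        print("starting api_data")
        try:
            # post_data goes in json=, headers go in headers= (not positionally)
            self.ApiResponse = requests.post(
                api_config['request_url'],
                json=api_config['post_data'],
                headers=api_config['my_headers'])
            print("API Result Response")
            print(self.ApiResponse)
            return self.ApiResponse
        except Exception:
            print("response not found")
            return False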

Fail to get csv from S3 and convert it with Python

I need to read a CSV file from an S3 bucket and insert each row into DynamoDB.
def load_users_dynamodb():
    s3 = boto3.client('s3')
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table("test")

    obj = s3.get_object(Bucket='test-app-config', Key='extract_Users.csv')
    #return obj
    data = obj['Body'].read().split('\n')
    #return json.dumps(data)

    with table.batch_writer() as batch:
        for row in data:
            batch.put_item(Item={
                'registration': row.split(',')[0],
                'name': row.split(',')[1],
                'role': row.split(',')[2],
                'company': row.split(',')[3],
                'hiredcompany': row.split(',')[4],
                'region': row.split(',')[5]
            })
    return 'OK'
I'm getting an exception and can't proceed:
Response:
{
    "errorMessage": "a bytes-like object is required, not 'str'",
    "errorType": "TypeError",
    "stackTrace": [
        "  File \"/var/task/lambda_function.py\", line 10, in lambda_handler\n    'body': load_users_dynamodb()\n",
        "  File \"/var/task/lambda_function.py\", line 21, in load_users_dynamodb\n    data = obj['Body'].read().split('\\n')\n"
    ]
}
Can someone help me, please? o/
Your issue is related to decoding the object returned from S3. You need to read the file as CSV.
Take a look at the following code snippet:
import boto3
import csv

s3 = boto3.client('s3')

def lambda_handler(event, context):
    obj = s3.get_object(Bucket='Bucket_Name', Key='File_Name.csv')
    data = obj['Body'].read().decode('utf-8').splitlines()
    lines = csv.reader(data)
    headers = next(lines)
    print('headers: %s' % (headers))
    for line in lines:
        print(line)
Output: the header list followed by each row of a dummy CSV.
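Putting that together with the function from the question, here is a sketch of the full load with the decoding fix (same bucket, key, table, and column order as above):

import csv
import boto3

def load_users_dynamodb():
    s3 = boto3.client('s3')
    dynamodb = boto3.resource('dynamodb')
    table = dynamodb.Table("test")

    obj = s3.get_object(Bucket='test-app-config', Key='extract_Users.csv')
    # decode the bytes, then let csv.reader split rows and columns
    rows = csv.reader(obj['Body'].read().decode('utf-8').splitlines())

    with table.batch_writer() as batch:
        for row in rows:
            if len(row) < 6:
                continue  # skip blank or malformed lines
            batch.put_item(Item={
                'registration': row[0],
                'name': row[1],
                'role': row[2],
                'company': row[3],
                'hiredcompany': row[4],
                'region': row[5]
            })
    return 'OK'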

Python Post Request - Getting 415 Error When Sending Files via Outlook API

I've been having some trouble sending files via Python's requests module. I can send emails without attachments just fine, but as soon as I try to add a files parameter, the call fails and I get a 415 error.
I've looked through the site and found out it was maybe because I wasn't sending the content type of the files when building that array of data, so I altered it to look up the content type with mimetypes; still 415.
Following this thread: python requests file upload, I made a couple more edits, but still 415.
The error message says:
"A supported MIME type could not be found that matches the content type of the response. None of the supported type(s)"
Then lists a bunch of json types e.g: "'application/json;odata.metadata=minimal;odata.streaming=true;IEEE754Compatible=false"
then says:
"matches the content type 'multipart/form-data; boundary=0e5485079df745cf0d07777a88aeb8fd'"
Which of course makes me think I'm still not handling the content type correctly somewhere.
Can anyone see where I'm going wrong in my code?
Thanks!
Here's the function:
def send_email(access_token):
    import requests
    import json
    import pandas as pd
    import mimetypes

    url = "https://outlook.office.com/api/v2.0/me/sendmail"
    headers = {
        'Authorization': 'Bearer ' + access_token,
    }

    data = {}
    data['Message'] = {
        'Subject': "Test",
        'Body': {
            'ContentType': 'Text',
            'Content': 'This is a test'
        },
        'ToRecipients': [
            {
                'EmailAddress': {
                    'Address': 'MY TEST EMAIL ADDRESS'
                }
            }
        ]
    }
    data['SaveToSentItems'] = "true"

    json_data = json.dumps(data)
    #need to convert the above json_data to dict, otherwise it won't work
    json_data = json.loads(json_data)

    ###ATTACHMENT WORK
    file_list = ['test_files/test.xlsx', 'test_files/test.docx']
    files = {}
    pos = 1
    for file in file_list:
        x = file.split('/') #separate the file name from the file path
        files['file' + str(pos)] = ( #give the file a unique name
            x[1], #actual filename
            open(file, 'rb'), #open the file
            mimetypes.MimeTypes().guess_type(file)[0] #add in the content type
        )
        pos += 1 #increase the naming iteration
    #print(files)

    r = requests.post(url, headers=headers, json=json_data, files=files)
    print("")
    print(r)
    print("")
    print(r.text)
I've figured it out! I took a look at the Outlook API documentation and realised I should be adding attachments as base64-encoded entries within the message JSON, not within the requests.post call. Here's my working example:
import requests
import json
import pandas as pd
import mimetypes
import base64

url = "https://outlook.office.com/api/v2.0/me/sendmail"
headers = {
    'Authorization': 'Bearer ' + access_token,
}

Attachments = []
file_list = ['test_files/image.png', 'test_files/test.xlsx']
for file in file_list:
    x = file.split('/') #split the file path so we can get its name
    filename = x[1] #get the filename
    content = open(file, 'rb') #load the content

    #encode the file into bytes then turn those bytes into a string
    encoded_string = ''
    with open(file, "rb") as image_file:
        encoded_string = base64.b64encode(image_file.read())
        encoded_string = encoded_string.decode("utf-8")

    #append the file to the attachments list
    Attachments.append({
        "#odata.type": "#Microsoft.OutlookServices.FileAttachment",
        "Name": filename,
        "ContentBytes": encoded_string
    })

data = {}
data['Message'] = {
    'Subject': "Test",
    'Body': {
        'ContentType': 'Text',
        'Content': 'This is a test'
    },
    'ToRecipients': [
        {
            'EmailAddress': {
                'Address': 'EMAIL_ADDRESS'
            }
        }
    ],
    "Attachments": Attachments
}
data['SaveToSentItems'] = "true"

json_data = json.dumps(data)
json_data = json.loads(json_data)

r = requests.post(url, headers=headers, json=json_data)
print(r)
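Note that this snippet assumes access_token is already in scope (in the first version it was passed in as send_email(access_token)), and the pandas import is left over from the earlier attempt; it isn't used by the attachment logic.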
