Hello im trying to generate a CSV from dynamoDB to S3 using lambda function. the thing is I just get an empty file on s3. Please your help!
import csv
import boto3
import json
dynamodb = boto3.resource('dynamodb')
db = dynamodb.Table('ReporteTelefonica')
def lambda_handler(event, context):
AWS_BUCKET_NAME = 'reportetelefonica'
s3 = boto3.resource('s3')
bucket = s3.Bucket(AWS_BUCKET_NAME)
path = 'test.csv'
try:
response = db.scan()
myFile = open(path, 'w')
for i in response['Items']:
csv.register_dialect('myDialect', delimiter=',', quoting=csv.QUOTE_NONE)
with myFile:
writer = csv.writer(myFile, dialect='myDialect')
writer.writerows(i)
print(i)
except :
print("error")
bucket.put_object(
ACL='public-read-write',
ContentType='application/csv',
Key=path,
# Body=json.dumps(i),
)
# print("here")
body = {
"uploaded": "true",
"bucket": AWS_BUCKET_NAME,
"path": path,
}
# print("then here")
return {
"statusCode": 200,
"body": json.dumps(body)
}
I'm kind of noob on this, so I was wondering what should I modify to successfully make a complete scan of the table and write the values on the CSV on S3???
Here's a working lambda that will do the job.
import boto3
import json
import os
import pandas as pd
TABLE_NAME = os.environ.get("DDB_TABLE_NAME")
OUTPUT_BUCKET = os.environ.get("BUCKET_NAME")
TEMP_FILENAME = '/tmp/export.csv'
OUTPUT_KEY = 'export.csv'
s3_resource = boto3.resource('s3')
dynamodb_resource = boto3.resource('dynamodb')
table = dynamodb_resource.Table(TABLE_NAME)
def lambda_handler(event, context):
response = table.scan()
df = pd.DataFrame(response['Items'])
df.to_csv(TEMP_FILENAME, index=False, header=True)
# Upload temp file to S3
s3_resource.Bucket(OUTPUT_BUCKET).upload_file(TEMP_FILENAME, OUTPUT_KEY)
return {
'statusCode': 200,
'headers': {
"Access-Control-Allow-Origin": "*",
"Access-Control-Allow-Credentials": True,
"content-type": "application/json"
},
'body': json.dumps('OK')
}
You either have to close the file after you finished writing the cvs records and then reopen for reading and pass to the put_obkect method.
Alternatively you open the file for reading and writing and after writing you seek to position 0 so that the put_object method reads from the start.
Related
I am trying to set up an AWS API Gateway that could receive a POST request an upload a csv file to S3. Ideally, I would like to make some transformations to the file before uploading it to S3 (renaming and formatting some columns to normalize their names accross different uploads).
So far, I have set up my API Gateway to receive the request and send it to an AWS Lambda. I use Lambda proxy integration. The triggered lambda is as follows:
import logging
import pandas as pd
import boto3
logger = logging.getLogger()
logger.setLevel(logging.INFO)
s3 = boto3.client("s3")
def handler(event, context):
logger.info(f"Event: {event}")
df = pd.read_csv(event['body']['file'])
logger.info(f"df1: {df}")
# Provided parameters
try:
code = event['body']['code']
except KeyError:
logger.info('Code not provided')
code = 'Code'
try:
date = event['body']['date']
except KeyError:
logger.info('Date not provided')
date = 'Date'
try:
debit = event['body']['debit']
except KeyError:
logger.info('Debit not provided')
debit = 'Debit'
try:
credit = event['body']['credit']
except KeyError:
logger.info('Credit not provided')
credit = 'Credit'
try:
id= event['body']['id']
except KeyError:
logger.info('Id not provided')
id = '001'
df.rename(columns={code: 'Code', date: 'Date', credit: 'Credit', debit: 'Debit'})
df.to_csv(f's3://bucket/{id}/file.csv', line_terminator='\n', sep = ';', date_format='%Y-%m-%d %H:%M:%S')
return {
'statusCode': 200,
'headers': {
'Content-Type': 'text/csv',
'Access-Control-Allow-Origin': '*'
},
'body': {
'uploaded': True
},
'isBase64Encoded': False
}
To test this API, I use the following function:
import requests
csv_file = open("file.csv", 'rb')
headers = {"x-api-key": "xxx", "Content-Type":"text/csv"}
url = "https://xxx.execute-api.xxx.amazonaws.com/xxx"
body = {
"file": csv_file,
"code": "my_code"
}
# files = {
# "file": ("file.csv", open('file.csv', 'r'), 'text/csv')
# }
r = requests.post(url=url, headers=headers, data=body)
print(r.text)
The output is {"message": "Internal server error"}, and if I look in CloudWatch logs, I see that the event is encoded this way:
'body': 'file=%EF%BB%BFCol1%3BCol2%3BCol3%3BCol4%0D%0A&file=11%3B12%3B13%3B14%3B%0D%0A&file=21%3B22%3B23%3B24%3B...'
It looks like the body is encoded and passed row by row into different "file" fields. For a file with about 5000 rows I get the error OSError: [Errno 36] File name too long when trying to read it.
Is there another way to proceed in order to get a full dataset that I can transform into a pandas dataframe?
I have also seen suggestions with multipart/form-data, using files=files in the request or using csv library but I keep getting similar errors.
Thank you
I'm new to Lambda and Python and I've faced an issue with my Lambda function.
I have several JSON files stored in a S3 bucket, and I wish to convert all JSON files to CSV format.
As I was referring to the Lambda function posted in this tutorial: https://sysadmins.co.za/convert-csv-to-json-files-with-aws-lambda-and-s3-events/
Lambda function:
import json
import csv
import boto3
import os
import datetime as dt
s3 = boto3.client('s3')
def lambda_handler(event, context):
datestamp = dt.datetime.now().strftime("%Y/%m/%d")
timestamp = dt.datetime.now().strftime("%s")
filename_json = "/tmp/file_{ts}.json".format(ts=timestamp)
filename_csv = "/tmp/file_{ts}.csv".format(ts=timestamp)
keyname_s3 = "uploads/output/{ds}/{ts}.json".format(ds=datestamp, ts=timestamp)
json_data = []
for record in event['Records']:
bucket_name = record['s3']['bucket']['name']
key_name = record['s3']['object']['key']
s3_object = s3.get_object(Bucket=bucket_name, Key=key_name)
data = s3_object['Body'].read()
contents = data.decode('utf-8')
with open(filename_csv, 'a') as csv_data:
csv_data.write(contents)
with open(filename_csv) as csv_data:
csv_reader = csv.DictReader(csv_data)
for csv_row in csv_reader:
json_data.append(csv_row)
with open(filename_json, 'w') as json_file:
json_file.write(json.dumps(json_data))
with open(filename_json, 'r') as json_file_contents:
response = s3.put_object(Bucket=bucket_name, Key=keyname_s3, Body=json_file_contents.read())
os.remove(filename_csv)
os.remove(filename_json)
return {
'statusCode': 200,
'body': json.dumps('CSV converted to JSON and available at: {bucket}/{key}'.format(bucket=bucket_name,key=keyname_s3))
}
I want to achieve a similar outcome using Lambda, but from JSON to CSV instead. How may I go about doing this?
I'd suggest having a look at convtools library:
from convtools import conversion as c
from convtools.contrib.tables import Table
import json
# input.json
"""
{
"records": [
{"a": 1, "b": "c"},
{"a": 2, "b": "d"},
{"a": 3, "b": "e"}
]
}
"""
with open("input.json") as f:
input_data = json.load(f)
Table.from_rows(input_data["records"]).into_csv("output.csv")
# output.csv
"""
a,b
1,c
2,d
3,e
"""
The problem is I'm unable to access the information from config.json file to my python file
I have provided the JSON data and python code bellow
I have tried everything in the request module
but I can access the response without the config file but,
I need with config file
The following is a json file
{
"api_data": {
"request_url": "https://newapi.zivame.com/api/v1/catalog/list",
"post_data" : {"category_ids" : "948",
"limit" : "10000"},
"my_headers":{"Content-Type": "application/json"}
},
"redshift":{
"host":"XXX.XXXX.XXX",
"user":"XXXX",
"password":"XXXXXXXX",
"port": 8080,
"db":"XXXX"
},
"s3":{
"access_key":"XXXXXXXXX",
"secret_key":"XXXXXXXXXX",
"region":"XX-XXXXX-1",
"path":"XXXXXXXXXXXX/XXX",
"table":"XXXXXX",
"bucket":"XXXX",
"file": "XXXXXX",
"copy_column": "XXX",
"local_path": "XXXXX"
},
"csv_file": {
"promo_zivame": ""
}
}
and this is the program
#!/usr/bin/python
import json
import psycopg2
import requests
import os
BASE_PATH = os.path.dirname(os.path.realpath(__file__))
with open(BASE_PATH+'/config.json') as json_data_file:
data = json.load(json_data_file)
#api_config = data['api_data']
#redshift = data['redshift']
s3_config = data['s3']
#x = print(api_config.get('request_url'))
class ApiResponse:
#api response
def api_data(self, api_config):
print("starting api_data")
try:
self.ApiResponse = requests.post(api_config['request_url'], api_config['post_data'], api_config['my_headers'])
data_1 = self.ApiResponse
#data = json.dump(self.ApiResponse)
print("API Result Response")
print(())
print(self.ApiResponse)
return (self.ApiResponse)
except Exception:
print("response not found")
return False
def redshift_connect(self, redshift):
try:
# Amazon Redshift connect string
self.con = psycopg2.connect(
host=redshift['host'],
user=redshift['user'],
port=redshift['port'],
password=redshift['password'],
dbname=redshift['db'])
print(self.con)
return self.con
except Exception:
print("Error in Redshift connection")
return False
def main():
c1 = ApiResponse()
api_config = data['api_data']
redshift = data['redshift']
c1.api_data(api_config)
c1.api_data(data)
c1.redshift_connect(redshift)
if __name__=='__main__':
main()
Third argument to requests.post() is json. To provide headers, you need to use the name of the argument explicitly as #JustinEzequiel suggested. See the requests doc here: 2.python-requests.org/en/v1.1.0/user/quickstart/#custom-headers
requests.post(api_config['request_url'], json=api_config['post_data'], headers=api_config['my_headers'])
Borrowing code from https://stackoverflow.com/a/16696317/5386938
import requests
api_config = {
"request_url": "https://newapi.zivame.com/api/v1/catalog/list",
"post_data" : {"category_ids" : "948", "limit" : "10000"},
"my_headers":{"Content-Type": "application/json"}
}
local_filename = 'the_response.json'
with requests.post(api_config['request_url'], json=api_config['post_data'], headers=api_config['my_headers'], stream=True) as r:
r.raise_for_status()
with open(local_filename, 'wb') as f:
for chunk in r.iter_content(chunk_size=8192):
if chunk: # filter out keep-alive new chunks
f.write(chunk)
saves the response into a file ('the_response.json') you can then pass around. Note the stream=True passed to requests.post
I need to read csv file from s3 bucket and insert each row on dynamoDB
def load_users_dynamodb():
s3 = boto3.client('s3')
dynamodb = boto3.resource('dynamodb')
table = dynamodb.Table("test")
obj = s3.get_object(Bucket='test-app-config', Key='extract_Users.csv')
#return obj
data = obj['Body'].read().split('\n')
#return json.dumps(data)
with table.batch_writer() as batch:
for row in data:
batch.put_item(Item={
'registration': row.split(',')[0],
'name': row.split(',')[1],
'role': row.split(',')[2],
'company': row.split(',')[3],
'hiredcompany': row.split(',')[4],
'region': row.split(',')[5]
})
return 'OK'
im getting exception and I can't proceed:
Response:
{
"errorMessage": "a bytes-like object is required, not 'str'",
"errorType": "TypeError",
"stackTrace": [
" File \"/var/task/lambda_function.py\", line 10, in lambda_handler\n 'body': load_users_dynamodb()\n",
" File \"/var/task/lambda_function.py\", line 21, in load_users_dynamodb\n data = obj['Body'].read().split('\\n')\n"
]
}
Someone can help me please? o/
Your issue related to decoding the object return from s3.You need to read the file as csv.
Take a look at the following code snippet:
import boto3
import csv
s3 = boto3.client('s3')
def lambda_handler(event, context):
obj = s3.get_object(Bucket='Bucket_Name', Key='File_Name.csv')
data = obj['Body'].read().decode('utf-8').splitlines()
lines = csv.reader(data)
headers = next(lines)
print('headers: %s' %(headers))
for line in lines:
print(line)
Output :
Dummy csv.
I've been having some trouble sending files via python's rest module. I can send emails without attachments just fine but as soon as I try and add a files parameter, the call fails and I get a 415 error.
I've looked through the site and found out it was maybe because I wasn't sending the content type of the files when building that array of data so altered it to query the content type with mimetypes; still 415.
This thread: python requests file upload made a couple of more edits but still 415.
The error message says:
"A supported MIME type could not be found that matches the content type of the response. None of the supported type(s)"
Then lists a bunch of json types e.g: "'application/json;odata.metadata=minimal;odata.streaming=true;IEEE754Compatible=false"
then says:
"matches the content type 'multipart/form-data; boundary=0e5485079df745cf0d07777a88aeb8fd'"
Which of course makes me think I'm still not handling the content type correctly somewhere.
Can anyone see where I'm going wrong in my code?
Thanks!
Here's the function:
def send_email(access_token):
import requests
import json
import pandas as pd
import mimetypes
url = "https://outlook.office.com/api/v2.0/me/sendmail"
headers = {
'Authorization': 'Bearer '+access_token,
}
data = {}
data['Message'] = {
'Subject': "Test",
'Body': {
'ContentType': 'Text',
'Content': 'This is a test'
},
'ToRecipients': [
{
'EmailAddress':{
'Address': 'MY TEST EMAIL ADDRESS'
}
}
]
}
data['SaveToSentItems'] = "true"
json_data = json.dumps(data)
#need to convert the above json_data to dict, otherwise it won't work
json_data = json.loads(json_data)
###ATTACHMENT WORK
file_list = ['test_files/test.xlsx', 'test_files/test.docx']
files = {}
pos = 1
for file in file_list:
x = file.split('/') #seperate file name from file path
files['file'+str(pos)] = ( #give the file a unique name
x[1], #actual filename
open(file,'rb'), #open the file
mimetypes.MimeTypes().guess_type(file)[0] #add in the contents type
)
pos += 1 #increase the naming iteration
#print(files)
r = requests.post(url, headers=headers, json=json_data, files=files)
print("")
print(r)
print("")
print(r.text)
I've figured it out! Took a look at the outlook API documentation and realised I should be adding attachments as encoded lists within the message Json, not within the request.post function. Here's my working example:
import requests
import json
import pandas as pd
import mimetypes
import base64
url = "https://outlook.office.com/api/v2.0/me/sendmail"
headers = {
'Authorization': 'Bearer '+access_token,
}
Attachments = []
file_list = ['test_files/image.png', 'test_files/test.xlsx']
for file in file_list:
x = file.split('/') #file the file path so we can get it's na,e
filename = x[1] #get the filename
content = open(file,'rb') #load the content
#encode the file into bytes then turn those bytes into a string
encoded_string = ''
with open(file, "rb") as image_file:
encoded_string = base64.b64encode(image_file.read())
encoded_string = encoded_string.decode("utf-8")
#append the file to the attachments list
Attachments.append({
"#odata.type": "#Microsoft.OutlookServices.FileAttachment",
"Name": filename,
"ContentBytes": encoded_string
})
data = {}
data['Message'] = {
'Subject': "Test",
'Body': {
'ContentType': 'Text',
'Content': 'This is a test'
},
'ToRecipients': [
{
'EmailAddress':{
'Address': 'EMAIL_ADDRESS'
}
}
],
"Attachments": Attachments
}
data['SaveToSentItems'] = "true"
json_data = json.dumps(data)
json_data = json.loads(json_data)
r = requests.post(url, headers=headers, json=json_data)
print(r)