Send xls file to S3 in python

I created a .xls file in my Python program that I'm trying to push to S3 with boto3.
My code:
just_models = [{"key": "1"}, {"key2": 2}]
df_just_models = pd.DataFrame(just_models)

# Notice desired file extension.
just_models = df_just_models.to_excel('../file.xls')

def save_just_models():
    # Save to S3
    date_file = datetime.now().strftime("%Y-%m-%d")
    s3 = boto3.resource('s3')
    obj = s3.Object('directory', 'indice_na/just_models' + date_file + '_indice_na.xls')
    obj.put(Body=just_models)

save_just_models()
My error:
Traceback (most recent call last):
File "indicena.py", line 11985, in <module>
save_just_models()
File "indicena.py", line 11984, in save_just_models
obj.put(Body=just_models)
File "/home/bolgi/.local/lib/python3.8/site-packages/boto3/resources/factory.py", line 520, in do_action
response = action(self, *args, **kwargs)
File "/home/bolgi/.local/lib/python3.8/site-packages/boto3/resources/action.py", line 83, in __call__
response = getattr(parent.meta.client, operation_name)(*args, **params)
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 598, in _make_api_call
request_dict = self._convert_to_request_dict(
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 646, in _convert_to_request_dict
request_dict = self._serializer.serialize_to_request(
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/validate.py", line 297, in serialize_to_request
raise ParamValidationError(report=report.generate_report())
botocore.exceptions.ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value: None, type: <class 'NoneType'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object
The error comes from obj.put(), but I don't know exactly how to resolve it.

Yes, you can write directly to S3 from memory; there is no need to save the xls file to your local hard drive first. Write the workbook to an io.BytesIO buffer instead of a file, then send the buffer's contents to S3, as shown below. Also, the engine pandas uses by default for the legacy .xls format (xlwt) is deprecated, so you may want to use a newer engine such as xlsxwriter:
import io
from datetime import datetime

import boto3
import pandas as pd

just_models = [{"key": "1"}, {"key2": 2}]
df_just_models = pd.DataFrame(just_models)

# Build the workbook in an in-memory buffer instead of a file on disk.
mem_file = io.BytesIO()
df_just_models.to_excel(mem_file, engine='xlsxwriter')

def save_just_models():
    # Save to S3
    date_file = datetime.now().strftime("%Y-%m-%d")
    s3 = boto3.resource('s3')
    obj = s3.Object('directory', 'indice_na/just_models' + date_file + '_indice_na.xls')
    obj.put(Body=mem_file.getvalue())

save_just_models()
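If you prefer the low-level client to the resource interface, here is a minimal alternative sketch under the same assumptions (the bucket name 'directory' and key prefix are taken from the question); upload_fileobj reads from the current position, hence the seek(0):
import io
from datetime import datetime

import boto3
import pandas as pd

df = pd.DataFrame([{"key": "1"}, {"key2": 2}])

buffer = io.BytesIO()
df.to_excel(buffer, engine='xlsxwriter')
buffer.seek(0)  # rewind so the whole workbook is read from the start

date_file = datetime.now().strftime("%Y-%m-%d")
s3_client = boto3.client('s3')
s3_client.upload_fileobj(
    buffer,
    'directory',  # bucket name, as in the question
    'indice_na/just_models' + date_file + '_indice_na.xls',
)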

Related

Newer version of botocore breaks integration test

Imagine the following functions, which should upload and copy something to S3:
class TestAttachments(TestCase):
    # YAML is the only serializer supporting binary
    @override_env(AWS_DEFAULT_REGION='eu-west-1')  # So the test doesn't fail depending on env. vars
    @my_vcr.use_cassette(serializer='yaml')
    def test_copy_attachments_to_sent_folder(self):
        with self.assertRaises(
            CopyAttachmentException,
            msg="Failed to copy attachments to sent folder for attachment URL: http://example.com/foo.jpg"
        ) as cm:
            copy_attachments_to_sent_folder(["http://example.com/foo.jpg"])
        self.assertEqual(cm.exception.__cause__.__class__, InvalidAttachmentURL)

        TEST_UUIDS = ["uuid_0"]
        with patch.object(uuid, 'uuid4', side_effect=TEST_UUIDS):
            result = copy_attachments_to_sent_folder([
                f"https://{settings.MESSAGE_ATTACHMENTS_S3_BUCKET}.s3.amazonaws.com/attachments/Test+video.mov"
            ])
        self.assertEqual(
            [AttachmentMetadata(
                s3_key=f"attachments/sent/{TEST_UUIDS[0]}/video/quicktime/Test video.mov",
                filename="Test video.mov",
                mime_type="video/quicktime",
                size=178653,
            )],
            result
        )
It should test the following function:
def copy_attachments_to_sent_folder(urls: List[str]) -> List[AttachmentMetadata]:
    # Copy the attachment to the sent folder in parallel
    with futures.ThreadPoolExecutor(max_workers=4) as executor:
        urls_by_future = {executor.submit(copy_attachment_to_sent_folder, url): url for url in urls}
        results_by_url = {}
        for future in futures.as_completed(urls_by_future.keys()):
            try:
                results_by_url[urls_by_future[future]] = future.result()
            except Exception as e:
                raise CopyAttachmentException(
                    f"Failed to copy attachments to sent folder for attachment URL: {urls_by_future[future]}"
                ) from e
    # The futures can complete out-of-order, so we need to re-order them to match the original order here
    return [results_by_url[url] for url in urls]
That function, in turn, uses this one internally:
def copy_attachment_to_sent_folder(url: str) -> AttachmentMetadata:
    aws_session = attachments_aws_session()
    s3_client = aws_session.client("s3")

    parse_result = urlparse(url, allow_fragments=False)
    # Allow both with/without AWS region in hostname
    attachment_hostname_regex = fr"{settings.MESSAGE_ATTACHMENTS_S3_BUCKET}\.s3\.(.+?\.)?amazonaws\.com"
    if not (
        parse_result.hostname is not None
        and re.fullmatch(attachment_hostname_regex, parse_result.hostname) is not None
        and parse_result.scheme == "https"
    ):
        raise InvalidAttachmentURL(f"URL {url} is not a valid attachment URL")

    path = unquote_plus(parse_result.path)
    key = path.lstrip("/")
    _, _, filename = key.rpartition("/")

    object_metadata = get_s3_object_metadata(
        aws_session=aws_session,
        object_bucket=settings.MESSAGE_ATTACHMENTS_S3_BUCKET,
        object_key=key,
    )

    s3_resource = aws_session.resource("s3")
    # Place images in their own path so that provider1 only has access to these files
    destination_key = f"attachments/sent/{uuid.uuid4()}/{object_metadata.mime_type}/{filename}"
    try:
        # Copy to sent attachments folder and set content-type
        response = s3_resource.Object(settings.MESSAGE_ATTACHMENTS_S3_BUCKET, destination_key).copy_from(
            # Tell S3 to set cache header to "Forever"
            Expires=datetime(2100, 1, 1),
            CopySource={"Bucket": settings.MESSAGE_ATTACHMENTS_S3_BUCKET, "Key": key},
            ACL="private",
            MetadataDirective="REPLACE",
            # Set mime type on the destination file
            ContentType=object_metadata.mime_type,
            # Only copy it if the user did not modify the file since we fetched it to detect mimetype :)
            # S3 allows overwriting of the files. A user could potentially overwrite
            # the already uploaded file with a file of a different type after they upload it and after we detect the
            # mimetype, but before we copy it to the destination. Setting CopySourceIfUnmodifiedSince prevents that.
            CopySourceIfUnmodifiedSince=object_metadata.last_modified,
        )
    except ClientError as e:
        raise CopyAttachmentException from e

    if response.get("CopyObjectResult", {}).get("ETag", None) is None:
        raise CopyAttachmentException(
            f"Copy of object '{key}' to '{destination_key}' was not successful. Response: {response}"
        )

    return AttachmentMetadata(
        s3_key=destination_key,
        filename=filename,
        mime_type=object_metadata.mime_type,
        size=object_metadata.byte_size,
    )
This works super fine on botocore==1.23.46
But as soon as we upgrade it, we get presented with the following error:
Error
Traceback (most recent call last):
File "/tl/test/messaging/attachments.py", line 105, in copy_attachments_to_sent_folder
results_by_url[urls_by_future[future]] = future.result()
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/concurrent/futures/_base.py", line 439, in result
return self.__get_result()
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/concurrent/futures/_base.py", line 391, in __get_result
raise self._exception
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/concurrent/futures/thread.py", line 58, in run
result = self.fn(*self.args, **self.kwargs)
File "/tl/test/messaging/attachments.py", line 64, in copy_attachment_to_sent_folder
destination_key = f"attachments/sent/{uuid.uuid4()}/{object_metadata.mime_type}/{filename}"
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/unittest/mock.py", line 1104, in __call__
return self._mock_call(*args, **kwargs)
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/unittest/mock.py", line 1108, in _mock_call
return self._execute_mock_call(*args, **kwargs)
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/unittest/mock.py", line 1165, in _execute_mock_call
result = next(effect)
StopIteration
The above exception was the direct cause of the following exception:
Traceback (most recent call last):
File "/tl/.pyenv/versions/3.10.2/lib/python3.10/contextlib.py", line 79, in inner
return func(*args, **kwds)
File "/tl/test/venv/lib/python3.10/site-packages/vcr/cassette.py", line 100, in __call__
return type(self)(self.cls, args_getter)._execute_function(function, args, kwargs)
File "/tl/test/venv/lib/python3.10/site-packages/vcr/cassette.py", line 114, in _execute_function
return self._handle_function(fn=handle_function)
File "/tl/test/venv/lib/python3.10/site-packages/vcr/cassette.py", line 138, in _handle_function
return fn(cassette)
File "/tl/test/venv/lib/python3.10/site-packages/vcr/cassette.py", line 107, in handle_function
return function(*args, **kwargs)
File "/tl/test/messaging/test_attachments.py", line 27, in test_copy_attachments_to_sent_folder
result = copy_attachments_to_sent_folder([
File "/tl/test/messaging/attachments.py", line 107, in copy_attachments_to_sent_folder
raise CopyAttachmentException(
messaging.attachments.CopyAttachmentException: Failed to copy attachments to sent folder for attachment URL: https://test-message-attachments.s3.amazonaws.com/attachments/Test+video.mov
I assume it has to do with destination_key = f"attachments/sent/{uuid.uuid4()}/{object_metadata.mime_type}/{filename}", but I am not sure what I am doing wrong. And why exactly does it work on botocore==1.23.46 and not after?
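What the traceback does show is that the patched uuid.uuid4 raised StopIteration from result = next(effect): the single-item side_effect list was exhausted, meaning uuid.uuid4 was called more than once while the global patch was active. One plausible (unverified) explanation for the version sensitivity is that the newer botocore calls uuid.uuid4 internally during the request. A minimal, illustrative sketch of one way to make the patch tolerant of such extra calls, not taken from the original test, is to chain the deterministic test values with real UUIDs so the side_effect never runs dry:
import itertools
import uuid
from unittest.mock import patch

TEST_UUIDS = ["uuid_0"]
real_uuid4 = uuid.uuid4  # keep a reference to the unpatched implementation

# Serve the deterministic test values first, then fall back to real UUIDs for
# any extra callers (e.g. library internals), so next() never raises StopIteration.
uuid_values = itertools.chain(TEST_UUIDS, (real_uuid4() for _ in itertools.count()))

with patch.object(uuid, "uuid4", side_effect=uuid_values):
    print(uuid.uuid4())  # -> "uuid_0", the value the expected s3_key is built from
    print(uuid.uuid4())  # -> an ordinary UUID instead of StopIteration
The first call still returns the deterministic value the assertion relies on; any additional calls simply get ordinary UUIDs.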

python2.7 throwing error when calling bigquery api

I am using google-api-python-client to insert a JSON record into BigQuery, and when I try to unit test the method with Python's unittest, I get an error at exactly this line.
The code is as follows:
def write_to_bigquery(self, timeseries, metadata):
    response = {}
    json_msg_list = []
    stats = {}
    if not timeseries or "points" not in timeseries:
        logging.debug("No timeseries data to write to BigQuery")
        msgs_written = 0
        metadata["msg_without_timeseries"] = 1
        error_msg_cnt = 0
    else:
        rows = build_rows(timeseries, metadata)
        print("rows", rows)  # This gets printed
        bigquery = build('bigquery', 'v2', cache_discovery=False)
        print("after rows", rows)  # Control does not reach here
        body = {
            "kind": "bigquery#tableDataInsertAllRequest",
            "skipInvalidRows": "false",
            "rows": json_row_list
        }
        logging.debug('body: {}'.format(json.dumps(body, sort_keys=True, indent=4)))
        response = bigquery.tabledata().insertAll(
            projectId=app_identity.get_application_id(),
            datasetId=config.BIGQUERY_DATASET,
            tableId=config.BIGQUERY_STATS_TABLE,
            body=body
        ).execute()
        logging.debug("BigQuery said... = {}".format(response))
and this is the error I get
Traceback (most recent call last):
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1535, in __call__
rv = self.handle_exception(request, response, e)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1529, in __call__
rv = self.router.dispatch(request, response)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1278, in default_dispatcher
return route.handler_adapter(request, response)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 1102, in __call__
return handler.dispatch()
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 572, in dispatch
return self.handle_exception(e, self.app.debug)
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 570, in dispatch
return method(*args, **kwargs)
File "main.py", line 422, in post
File "/home/barumugham/.local/lib/python2.7/site-packages/webapp2.py", line 570, in dispatch
return method(*args, **kwargs)
File "main.py", line 422, in post
self.write_to_bigquery(data, metadata)
File "main.py", line 296, in write_to_bigquery
bigquery = build('bigquery', 'v2', cache_discovery=False)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/discovery.py", line 258, in build
adc_key_path=adc_key_path,
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_helpers.py", line 134, in positional_wrapper
return wrapped(*args, **kwargs)
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/discovery.py", line 423, in build_from_document
credentials = _auth.default_credentials()
File "/usr/local/lib/python2.7/dist-packages/googleapiclient/_auth.py", line 44, in default_credentials
credentials, project_id = checker()
File "/usr/local/lib/python2.7/dist-packages/google/auth/_default.py", line 186, in _get_gae_credentials
project_id = app_engine.get_project_id()
File "/usr/local/lib/python2.7/dist-packages/google/auth/app_engine.py", line 77, in get_project_id
return app_identity.get_application_id()
File "/usr/lib/google-cloud-sdk/platform/google_appengine/google/appengine/api/app_identity/app_identity.py", line 455, in get_application_id
_, domain_name, display_app_id = _ParseFullAppId(full_app_id)
File "/usr/lib/google-cloud-sdk/platform/google_appengine/google/appengine/api/app_identity/app_identity.py", line 436, in _ParseFullAppId
psep = app_id.find(_PARTITION_SEPARATOR)
AttributeError: 'NoneType' object has no attribute 'find'
I am new to Python and BigQuery, so any help is appreciated. Thanks.
I would recommend using the BigQuery Python SDK.
For that, you first need to install it in your Python environment. You can do that by running:
pip install google-cloud-bigquery
After that, you can use code like this to insert JSON records into your table:
from google.cloud import bigquery
# Construct a BigQuery client object.
client = bigquery.Client()
table_id = "project_id.dataset.table"
# Your JSON keys must correspond to your table column names
json_list = [{"your": "json", "data":"here"},{"your": "json", "data":"here"},{"your": "json", "data":"here"}, ...]
# Get table reference
table = client.get_table(table_id)
rows_to_insert = json_list
# Insert the data into your table
errors = client.insert_rows(table, rows_to_insert)
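One hedged addition to the snippet above: insert_rows returns a list of per-row errors (empty when everything was inserted), so it is worth inspecting rather than discarding:
if errors:
    # Each entry identifies the rejected row and why BigQuery refused it.
    for error in errors:
        print(error)
else:
    print("All rows were inserted successfully.")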
Finally, I'd like to point out that Python 2 is already deprecated. If possible, update to Python 3.

Convert a list of dicts to CSV and attach to email using SendGrid

I'm trying to retrieve some data and mail it out from within a Google Cloud Function using SendGrid.
I tried converting the data, a list of dictionaries (all with the same flat structure), to CSV as detailed here, but this fails because the filesystem is read-only.
To counter this, I use io.StringIO() to store the CSV in-memory.
However, I get the following error/stack trace during execution:
Traceback (most recent call last):
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 383, in run_background_function
    _function_handler.invoke_user_function(event_object)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 217, in invoke_user_function
    return call_user_function(request_or_event)
  File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py", line 214, in call_user_function
    event_context.Context(**request_or_event.context))
  File "/user_code/main.py", line 267, in session_updated
    summarize_session(sessionid, doctorid)
  File "/user_code/main.py", line 375, in summarize_session
    send_email([docemail], data)
  File "/user_code/main.py", line 328, in send_email
    response = sg.send(message)
  File "/env/local/lib/python3.7/site-packages/sendgrid/sendgrid.py", line 98, in send
    response = self.client.mail.send.post(request_body=message.get())
  File "/env/local/lib/python3.7/site-packages/python_http_client/client.py", line 251, in http_request
    data = json.dumps(request_body).encode('utf-8')
  File "/opt/python3.7/lib/python3.7/json/__init__.py", line 231, in dumps
    return _default_encoder.encode(obj)
  File "/opt/python3.7/lib/python3.7/json/encoder.py", line 199, in encode
    chunks = self.iterencode(o, _one_shot=True)
  File "/opt/python3.7/lib/python3.7/json/encoder.py", line 257, in iterencode
    return _iterencode(o, 0)
  File "/opt/python3.7/lib/python3.7/json/encoder.py", line 179, in default
    raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type StringIO is not JSON serializable
The code is as follows:
def send_email(to_emails, datadict):
    message = Mail(
        from_email='info@domain.com',
        #to_emails= to_emails,
        to_emails=['dude@domain.com'],
        subject='Summary of your session',
        html_content='<strong>Data Summary</strong>\
        <p>This email is a summary of your session. Please check the attachment for details. </p>')
    sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY'))
    keys = datadict[0].keys()
    try:
        output_file = io.StringIO()
        #output_file = io.BytesIO() # results in a typeerror
        # https://stackoverflow.com/questions/34283178/typeerror-a-bytes-like-object-is-required-not-str-in-python-and-csv
        dict_writer = csv.DictWriter(output_file, keys)
        dict_writer.writeheader()
        dict_writer.writerows(datadict)
        print("Attaching")
        message.attachment = [
            Attachment(FileContent(output_file),
                       FileType('text/csv'),
                       FileName('sessiondata.csv'),
                       Disposition('inline'),
                       ContentId('SessionData')),
        ]
    except Exception as e:
        print("Exception:")
        print(e)
        return
    response = sg.send(message)
How do I convert the list of dictionaries to a CSV and attach it to an email without opening a physical file on the filesystem?
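The traceback points at the attachment content: FileContent is handed the StringIO object itself, which the SendGrid client then tries to JSON-serialize. A minimal sketch of a fix, assuming the sendgrid helper classes already used in the question, is to pass the buffer's base64-encoded text instead of the buffer:
import base64
import csv
import io

from sendgrid.helpers.mail import (
    Attachment, ContentId, Disposition, FileContent, FileName, FileType)

def build_csv_attachment(datadict):
    # Write the CSV into memory instead of onto the read-only filesystem.
    output_file = io.StringIO()
    dict_writer = csv.DictWriter(output_file, datadict[0].keys())
    dict_writer.writeheader()
    dict_writer.writerows(datadict)

    # The SendGrid helpers expect the attachment content as a base64-encoded
    # string, not a file-like object.
    encoded = base64.b64encode(output_file.getvalue().encode('utf-8')).decode('utf-8')
    return Attachment(
        file_content=FileContent(encoded),
        file_name=FileName('sessiondata.csv'),
        file_type=FileType('text/csv'),
        disposition=Disposition('inline'),
        content_id=ContentId('SessionData'),
    )
message.attachment = [build_csv_attachment(datadict)] would then replace the attachment block inside the try clause above.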

How to read the uploaded excel file in AWS Lambda?

I am uploading an excel file from Postman and trying to read it in AWS lambda with pandas. How can I do it?
I have tried reading the bytes from the API Gateway event with cgi.parse_multipart. I was able to read CSV files successfully, but not xlsx files.
from cgi import parse_header, parse_multipart
import pandas as pd
from io import BytesIO, StringIO
import json

def get_data(event):
    c_type, c_data = parse_header(event['headers']['content-type'])
    c_data['boundary'] = bytes(c_data['boundary'], "utf-8")
    assert c_type == 'multipart/form-data'
    body = event['body']
    body = bytes(body, 'utf-8')
    form_data = parse_multipart(BytesIO(body), c_data)
    data = form_data['file'][0]
    s = str(data, 'utf-8')
    d = StringIO(s)
    # df = pd.read_csv(d)
    df = pd.read_excel(d)
    print(df)

def run(event, context):
    output = {}
    output['statusCode'] = 200
    output['body'] = json.dumps(get_data(event))
    return output
While trying to read the xlsx file, I get the following error:
Traceback (most recent call last):
File "/var/task/upload_test.py", line 108, in run
output['body'] = json.dumps(get_data(event))
File "/var/task/upload_test.py", line 52, in get_data
df=pd.read_excel(d)
File "/opt/python/lib/python3.6/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/opt/python/lib/python3.6/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 350, in read_excel
io = ExcelFile(io, engine=engine)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 653, in __init__
self._reader = self._engines[engine](self._io)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 422, in __init__
self.book = xlrd.open_workbook(file_contents=data)
File "/var/task/xlrd/__init__.py", line 157, in open_workbook
ragged_rows=ragged_rows,
File "/var/task/xlrd/book.py", line 92, in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File "/var/task/xlrd/book.py", line 1274, in getbof
opcode = self.get2bytes()
File "/var/task/xlrd/book.py", line 675, in get2bytes
return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo)
TypeError: unsupported operand type(s) for <<: 'str' and 'int'
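For what it is worth, a hedged reading of the traceback: the multipart payload is decoded to text and wrapped in StringIO, but pd.read_excel needs the raw workbook bytes, and xlrd chokes on the str it receives. A small sketch of the adjustment, keeping the rest of get_data() as in the question:
from io import BytesIO

import pandas as pd

def read_uploaded_excel(form_data):
    # form_data['file'][0] holds the raw bytes of the uploaded workbook,
    # as returned by cgi.parse_multipart in get_data() above.
    data = form_data['file'][0]
    # Keep the payload as bytes; decoding it to str is what breaks xlrd.
    return pd.read_excel(BytesIO(data))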

Python Decompressing gzip csv in pandas csv reader

The following code works in Python 3 but fails in Python 2:
r = requests.get("http://api.bitcoincharts.com/v1/csv/coinbaseUSD.csv.gz", stream=True)
decompressed_file = gzip.GzipFile(fileobj=r.raw)
data = pd.read_csv(decompressed_file, sep=',')
data.columns = ["timestamp", "price" , "volume"] # set df col headers
return data
The error I get in Python2 is the following:
TypeError: 'int' object has no attribute '__getitem__'
The error is on the line where I set data equal to pd.read_csv(...)
Seems to be a pandas error to me
Stacktrace:
Traceback (most recent call last):
File "fetch.py", line 51, in <module>
print(f.get_historical())
File "fetch.py", line 36, in get_historical
data = pd.read_csv(f, sep=',')
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 709, in parser_f
return _read(filepath_or_buffer, kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 449, in _read
parser = TextFileReader(filepath_or_buffer, **kwds)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 818, in __init__
self._make_engine(self.engine)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 1049, in _make_engine
self._engine = CParserWrapper(self.f, **self.options)
File "/usr/local/lib/python2.7/dist-packages/pandas/io/parsers.py", line 1695, in __init__
self._reader = parsers.TextReader(src, **kwds)
File "pandas/_libs/parsers.pyx", line 562, in pandas._libs.parsers.TextReader.__cinit__
File "pandas/_libs/parsers.pyx", line 760, in pandas._libs.parsers.TextReader._get_header
File "pandas/_libs/parsers.pyx", line 965, in pandas._libs.parsers.TextReader._tokenize_rows
File "pandas/_libs/parsers.pyx", line 2197, in pandas._libs.parsers.raise_parser_error
io.UnsupportedOperation: seek
The issue from the traceback you posted is related to the fact that the Response object's raw attribute is a file-like object that does not support the .seek method that typical file objects support. However, when ingesting the file object with pd.read_csv, pandas (in python2) seems to be making use of the seek method of the provided file object.
You can confirm that the returned response's raw data is not seekable by calling r.raw.seekable(), which should normally return False.
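For instance, a quick check along those lines (a sketch, reusing the same kind of request as in the question):
import requests

r = requests.get("http://api.bitcoincharts.com/v1/csv/aqoinEUR.csv.gz", stream=True)
print(r.raw.seekable())  # expected: False, the raw stream cannot seek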
The way to circumvent this issue may be to wrap the returned data into an io.BytesIO object as follows:
import gzip
import io
import pandas as pd
import requests
# file_url = "http://api.bitcoincharts.com/v1/csv/coinbaseUSD.csv.gz"
file_url = "http://api.bitcoincharts.com/v1/csv/aqoinEUR.csv.gz"
r = requests.get(file_url, stream=True)
dfile = gzip.GzipFile(fileobj=io.BytesIO(r.raw.read()))
data = pd.read_csv(dfile, sep=',')
print(data)
            0     1    2
0  1314964052  2.60  0.4
1  1316277154  3.75  0.5
2  1316300526  4.00  4.0
3  1316300612  3.80  1.0
4  1316300622  3.75  1.5
As you can see, I used a smaller file from the directory of files available. You can switch this to your desired file.
In any case, io.BytesIO(r.raw.read()) should be seekable, and therefore should help avoid the io.UnsupportedOperation exception you are encountering.
As for the TypeError exception, it does not occur with this snippet of code.
I hope this helps.
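As a side note not in the original answer (and dependent on your pandas version), recent pandas releases can usually fetch and decompress the file in one call, since read_csv accepts URLs and infers gzip compression from the .gz suffix:
import pandas as pd

# Hedged sketch for newer pandas: read_csv fetches the URL and handles the gzip itself.
data = pd.read_csv("http://api.bitcoincharts.com/v1/csv/aqoinEUR.csv.gz", sep=',')
data.columns = ["timestamp", "price", "volume"]  # set df col headers, as in the question
print(data.head())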
