I am uploading an excel file from Postman and trying to read it in AWS lambda with pandas. How can I do it?
I have tried reading the bytes from API gateway event with 'cgi.parse_multipart'. I was able to read the csv files successfully but not xlsx files.
from cgi import parse_header, parse_multipart
import pandas as pd
from io import BytesIO, StringIO
import json
def get_data(event):
c_type, c_data = parse_header(event['headers']['content-type'])
c_data['boundary'] = bytes(c_data['boundary'], "utf-8")
assert c_type == 'multipart/form-data'
body = event['body']
body = bytes(body, 'utf-8')
form_data = parse_multipart(BytesIO(body), c_data)
data = form_data['file'][0]
s=str(data,'utf-8')
d = StringIO(s)
# df=pd.read_csv(d)
df=pd.read_excel(d)
print(df)
def run(event, context):
output = {}
output['statusCode'] = 200
output['body'] = json.dumps(get_data(event))
return output
While trying to read xlsx file, I get the following error:
Traceback (most recent call last):
File "/var/task/upload_test.py", line 108, in run
output['body'] = json.dumps(get_data(event))
File "/var/task/upload_test.py", line 52, in get_data
df=pd.read_excel(d)
File "/opt/python/lib/python3.6/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/opt/python/lib/python3.6/site-packages/pandas/util/_decorators.py", line 188, in wrapper
return func(*args, **kwargs)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 350, in read_excel
io = ExcelFile(io, engine=engine)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 653, in __init__
self._reader = self._engines[engine](self._io)
File "/opt/python/lib/python3.6/site-packages/pandas/io/excel.py", line 422, in __init__
self.book = xlrd.open_workbook(file_contents=data)
File "/var/task/xlrd/__init__.py", line 157, in open_workbook
ragged_rows=ragged_rows,
File "/var/task/xlrd/book.py", line 92, in open_workbook_xls
biff_version = bk.getbof(XL_WORKBOOK_GLOBALS)
File "/var/task/xlrd/book.py", line 1274, in getbof
opcode = self.get2bytes()
File "/var/task/xlrd/book.py", line 675, in get2bytes
return (BYTES_ORD(hi) << 8) | BYTES_ORD(lo)
TypeError: unsupported operand type(s) for <<: 'str' and 'int'
Related
import dask.bag as db
class Converter():
def __init__(self,input,output):
"""Converter constructor"""
self.input = input
self.output = output
#staticmethod
def large_file_reader(file_path: str):
"""
File reader
"""
temp_data = db.read_avro(file_path)
data = temp_data.to_dataframe()
# just to check able to by read properly
print(data.head(6))
return data
#staticmethod
def large_file_writer(data, file_path: str) -> bool:
"""
File writer
"""
data.compute().to_csv(file_path, index=False)
def large_file_processor(self):
"Read then write"
input_file_path =self.input
output_file_path =self.output
data = Converter.large_file_reader(input_file_path)
Converter.large_file_writer(data=data, file_path=output_file_path)
if __name__ == "__main__":
c = Converter("/Users/csv_to_avro_new.avro", "/Users/test_avro_new.csv")
c.large_file_processor()
Traceback (most recent call last):
File "/Users/PycharmProjects/ms--py/new.py", line 41, in <module>
c.large_file_processor()
File "/Users/PycharmProjects/ms--py/new.py", line 36, in large_file_processor
Converter.large_file_writer(data=data, file_path=output_file_path)
File "/Users/PycharmProjects/ms--py/new.py", line 28, in large_file_writer
data.compute().to_csv(file_path, index=False)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/base.py", line 315, in compute
(result,) = compute(self, traverse=False, **kwargs)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/base.py", line 600, in compute
results = schedule(dsk, keys, **kwargs)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/threaded.py", line 89, in get
results = get_async(
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 511, in get_async
raise_exception(exc, tb)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 319, in reraise
raise exc
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/local.py", line 224, in execute_task
result = _execute_task(task, data)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in <genexpr>
return func(*(_execute_task(a, cache) for a in args))
File "/Users/adavsandeep/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/core.py", line 119, in _execute_task
return func(*(_execute_task(a, cache) for a in args))
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/dask/bag/avro.py", line 150, in read_chunk
chunk = read_block(f, off, l, head["sync"])
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/utils.py", line 244, in read_block
found_start_delim = seek_delimiter(f, delimiter, 2**16)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/utils.py", line 187, in seek_delimiter
current = file.read(blocksize)
File "/Users/PycharmProjects/data-ingest/lib/python3.10/site-packages/fsspec/implementations/local.py", line 337, in read
return self.f.read(*args, **kwargs)
ValueError: read of closed file
Process finished with exit code 1
`Added the error traceback.check it once.Thanks.
Error
ValueError: read of closed file
Tried convert to some other formats like csv to json in the same way it was working
But not able to convert avro to csv, avro to json and avro to parquet
My file size is more than 2GB .Thats the reason I am using dask.
Thanks in advance .
`
Hi everybody I have a problem uploading a excel file with Pandas
I have taken the file in archive, if I uploaded it directly it gaves me an error. If I cope and paste the excel file there is no problem.
The code is very easy:
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\pandas.xlsx")
and this is the error:
Traceback (most recent call last):
File "C:/Users/obett/PycharmProjects/pythonProject6/main.py", line 24, in <module>
data = pd.read_excel(r"C:\Users\obett\Desktop\Corporate Governance\Aida_Export_67.xlsx")
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\util\_decorators.py", line 299, in wrapper
return func(*args, **kwargs)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 344, in read_excel
data = io.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 1170, in parse
return self._reader.parse(
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_base.py", line 492, in parse
data = self.get_sheet_data(sheet, convert_float)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in get_sheet_data
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 549, in <listcomp>
converted_row = [self._convert_cell(cell, convert_float) for cell in row]
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\pandas\io\excel\_openpyxl.py", line 514, in _convert_cell
elif cell.is_date:
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 101, in is_date
return Cell.is_date.__get__(self)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\cell.py", line 256, in is_date
self.data_type == 'n' and is_date_format(self.number_format)
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 66, in number_format
_id = self.style_array.numFmtId
File "C:\Users\obett\PycharmProjects\pythonProject6\venv\lib\site-packages\openpyxl\cell\read_only.py", line 56, in style_array
return self.parent.parent._cell_styles[self._style_id]
IndexError: list index out of range
Thank you very much
I created a .xls file in my python program that I try to push to S3 with boto.
My code:
just_models = [{"key": "1"}, {"key2": 2}]
df_just_models = pd.DataFrame(just_models)
# Notice desired file extension.
just_models = df_just_models.to_excel('../file.xls')
def save_just_models():
# Save to S3
date_file = datetime.now().strftime("%Y-%m-%d")
s3 = boto3.resource('s3')
obj = s3.Object('directory', 'indice_na/just_models' + date_file + '_indice_na.xls')
obj.put(Body=just_models)
save_just_models()
My error:
Traceback (most recent call last):
File "indicena.py", line 11985, in <module>
save_just_models()
File "indicena.py", line 11984, in save_just_models
obj.put(Body=just_models)
File "/home/bolgi/.local/lib/python3.8/site-packages/boto3/resources/factory.py", line 520, in do_action
response = action(self, *args, **kwargs)
File "/home/bolgi/.local/lib/python3.8/site-packages/boto3/resources/action.py", line 83, in __call__
response = getattr(parent.meta.client, operation_name)(*args, **params)
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 316, in _api_call
return self._make_api_call(operation_name, kwargs)
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 598, in _make_api_call
request_dict = self._convert_to_request_dict(
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/client.py", line 646, in _convert_to_request_dict
request_dict = self._serializer.serialize_to_request(
File "/home/bolgi/.local/lib/python3.8/site-packages/botocore/validate.py", line 297, in serialize_to_request
raise ParamValidationError(report=report.generate_report())
botocore.exceptions.ParamValidationError: Parameter validation failed:
Invalid type for parameter Body, value: None, type: <class 'NoneType'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object
The error came from the obj.put() but I don't know exactly how to resolve it.
Yes, you can write directly to S3 from memory. No need to save the xls file on your local hard drive. This is possible as you can write to BytesIO instead of the file, and the send the BytesIO to S3 as shown below. Also default pandas xlsx writer is deprecated, so you may consider using newer one such as xlsxwriter:
import io
import pandas as pd
just_models = [{"key": "1"}, {"key2": 2}]
df_just_models = pd.DataFrame(just_models)
mem_file = io.BytesIO();
df_just_models.to_excel(mem_file, engine='xlsxwriter')
def save_just_models():
# Save to S3
date_file = datetime.now().strftime("%Y-%m-%d")
s3 = boto3.resource('s3')
obj = s3.Object('directory', 'indice_na/just_models' + date_file + '_indice_na.xls')
obj.put(Body=mem_file.getvalue())
save_just_models()
I am trying to read a cloud storage file into a Pandas dataframe locally and then load it into a Big Query table using Python but I get the following error:
google.auth.exceptions.RefreshError: ('invalid_grant: Bad Request', '{\n "error": "invalid_grant",\n "error_description": "Bad Request"\n}')
My code is:
import pandas as pd
from google.cloud import storage
from google.oauth2 import service_account
import gcsfs
scopes=["https://www.googleapis.com/auth/bigquery","https://www.googleapis.com/auth/cloud-platform"]
file='<<filename>>.json'
credentials = service_account.Credentials.from_service_account_file(file, scopes=scopes)
def run(file):
client = storage.Client.from_service_account_json(file)
bucket = client.bucket('my_bucket')
blobs = bucket.list_blobs()
list_temp_raw = []
for file in blobs:
filename = file.name
temp = pd.read_csv('gs://<<>my_bucket>>/' + filename, encoding='utf-8')
list_temp_raw.append(temp)
df = pd.concat(list_temp_raw, encoding='utf-8')
df.to_gbq('<<dataset>>.<<table>>',if_exists='append',credentials=credentials)
run(file)
It is the temp = pd.read_csv('gs://<<>my_bucket>>/' + filename, encoding='utf-8') causing the error:
Traceback (most recent call last):
File "test-load-bq.py", line 23, in <module>
run(file)
File "test-load-bq.py", line 18, in run
temp = pd.read_csv('gs://project-test/' + filename, encoding='utf-8')
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 676, in parser_f
return _read(filepath_or_buffer, kwds)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/pandas/io/parsers.py", line 431, in _read
filepath_or_buffer, encoding, compression
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/pandas/io/common.py", line 192, in get_filepath_or_buffer
filepath_or_buffer, encoding=encoding, compression=compression, mode=mode
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/pandas/io/gcs.py", line 17, in get_filepath_or_buffer
filepath_or_buffer = fs.open(filepath_or_buffer, mode)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/spec.py", line 936, in open
**kwargs
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 1287, in _open
**kwargs,
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 1412, in __init__
**kwargs,
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/spec.py", line 1257, in __init__
self.details = fs.info(path)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/asyn.py", line 121, in wrapper
return maybe_sync(func, self, *args, **kwargs)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/asyn.py", line 100, in maybe_sync
return sync(loop, func, *args, **kwargs)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/asyn.py", line 71, in sync
raise exc.with_traceback(tb)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/fsspec/asyn.py", line 55, in f
result[0] = await future
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 825, in _info
return await self._get_object(path)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 607, in _get_object
res = await self._call("GET", "b/{}/o/{}", bucket, key, json_out=True)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 503, in _call
self.maybe_refresh()
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/gcsfs/core.py", line 404, in maybe_refresh
self.credentials.refresh(req)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/google/oauth2/credentials.py", line 198, in refresh
self._scopes,
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/google/oauth2/_client.py", line 248, in refresh_grant
response_data = _token_endpoint_request(request, token_uri, body)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/google/oauth2/_client.py", line 124, in _token_endpoint_request
_handle_error_response(response_body)
File "/Users/willcharles/Documents/project/venv/lib/python3.6/site-packages/google/oauth2/_client.py", line 60, in _handle_error_response
raise exceptions.RefreshError(error_details, response_body)
google.auth.exceptions.RefreshError: ('invalid_grant: Bad Request', '{\n "error": "invalid_grant",\n "error_description": "Bad Request"\n}')
I have given my service account the following roles:
Storage Admin & BigQuery Admin
Any ideas why the error is happening?
I'm trying to retrieve some data and mail it out from within a Google Cloud Function using SendGrid
I tried convert the data as a list of dictionaries (with the same, flat, structure) to CSV as detailed here but this fails as the filesystem is read only.
To counter this, I use io.StringIO() to store the CSV in-memory.
However, I get the following error/stack trace during execution:
Traceback (most recent call last):
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py",
line 383, in run_background_function_function_handler.invoke_user_function(event_object)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py",
line 217, in invoke_user_function return call_user_function(request_or_event)
File "/env/local/lib/python3.7/site-packages/google/cloud/functions/worker.py",
line 214, in call_user_function event_context.Context(**request_or_event.context))
File "/user_code/main.py", line 267, in session_updated summarize_session(sessionid, doctorid) File "/user_code/main.py", line 375, in summarize_session send_email([docemail], data)
File "/user_code/main.py", line 328, in send_email response = sg.send(message)
File "/env/local/lib/python3.7/site-packages/sendgrid/sendgrid.py",
line 98, in send response = self.client.mail.send.post(request_body=message.get())
File "/env/local/lib/python3.7/site-packages/python_http_client/client.py",
line 251, in http_request data = json.dumps(request_body).encode('utf-8')
File "/opt/python3.7/lib/python3.7/json/__init__.py",
line 231, in dumps return _default_encoder.encode(obj)
File "/opt/python3.7/lib/python3.7/json/encoder.py",
line 199, in encode chunks = self.iterencode(o, _one_shot=True)
File "/opt/python3.7/lib/python3.7/json/encoder.py",
line 257, in iterencode return _iterencode(o, 0)
File "/opt/python3.7/lib/python3.7/json/encoder.py",
line 179, in default raise TypeError(f'Object of type {o.__class__.__name__} '
TypeError: Object of type StringIO is not JSON serializable
The code is as follows:
def send_email(to_emails, datadict):
message = Mail(
from_email='info#domain.com',
#to_emails= to_emails,
to_emails= ['dude#domain.com'],
subject='Summary of your session',
html_content='<strong>Data Summary</strong>\
<p>This email is a summary of your session. Please check the attachment for details. </p>')
sg = SendGridAPIClient(os.environ.get('SENDGRID_API_KEY'))
keys = datadict[0].keys()
try:
output_file = io.StringIO()
#output_file = io.BytesIO() # results in a typeerror
# https://stackoverflow.com/questions/34283178/typeerror-a-bytes-like-object-is-required-not-str-in-python-and-csv
dict_writer = csv.DictWriter(output_file, keys)
dict_writer.writeheader()
dict_writer.writerows(datadict)
print("Attaching")
message.attachment = [
Attachment(FileContent(output_file),
FileType('text/csv'),
FileName('sessiondata.csv'),
Disposition('inline'),
ContentId('SessionData')),
]
except Exception as e:
print("Exception:")
print(e)
return
response = sg.send(message)
How do I convert the list of dictionaries to a CSV and attach it to an email without opening a physical file on the filesystem?