Streaming tarfile from AWS Glue job to S3 file shows up empty - python

When I run this Glue job function, I get the folder structure, but the actual CSV is empty (0 bytes) after extracting, even though the compressed tar.gz shows 108 bytes. What is wrong?
import sys
import os
import tarfile
import boto3
import io

from awsglue.utils import getResolvedOptions

args = getResolvedOptions(sys.argv, ['INPUT_FILE_PATH', 'PROCESS_DATE', 'TARGET_BUCKET', 'TARGET_PATH', 'S3_RAW_BUCKET'])

s3 = boto3.resource('s3')

def createTarFileStream(s3RawBucket, srcBucketFilePath, targetBucketName, targetBucketFilePath, processDate, fileExt='csv'):
    bucket = s3.Bucket(s3RawBucket)
    filesCollection = bucket.objects.filter(Prefix=srcBucketFilePath).all()
    archive = io.BytesIO()
    with tarfile.open(name=None, mode='w|gz', fileobj=archive) as tar:
        for file in filesCollection:
            info = tarfile.TarInfo(name=os.path.basename(file.key))
            tar.addfile(tarinfo=info, fileobj=file.get()['Body'].read())
    archive.seek(0)
    s3.Object(targetBucketName, f"{targetBucketFilePath}/process_date={processDate}.tar.gz").upload_fileobj(archive)
    archive.close()
    return

createTarFileStream(args["S3_RAW_BUCKET"], args["INPUT_FILE_PATH"], args["TARGET_BUCKET"], args["TARGET_PATH"], args["PROCESS_DATE"])
UPDATE: I have found the solution. I had to set the size of the content on the TarInfo object in order to keep the tar from being empty. Here is the final working code; also note the headers required to upload gzip to S3:
with tarfile.open(tfile, "w:gz") as archive:
    for file in filesCollection:
        with io.BytesIO(file.get()['Body'].read()) as f:
            info = tarfile.TarInfo(os.path.basename(file.key))
            f.seek(0, io.SEEK_END)
            info.size = f.tell()
            f.seek(0, io.SEEK_SET)
            archive.addfile(info, f)

with open(tfile, 'rb') as t:
    s3.Object(targetBucketName, f"{targetBucketFilePath}/{tfile}.tar.gz").upload_fileobj(t, {'ContentType': 'text/plain', 'ContentEncoding': 'gzip'})

Related

Stream download S3 files, zip them, and stream the zip file back to S3 - Python

People upload files to an S3 bucket, and I need to be able to programmatically zip certain files.
I am doing this using Fargate, and a lot of the time the files that need to be zipped total over 300GB in aggregate.
Therefore, it is important that the files are streamed from S3 and the zip file is streamed back to S3, as there is not enough disk space or memory to hold everything at once.
I have found two answers here on StackOverflow, but neither has worked, and I have not been able to figure out why after trying to troubleshoot.
The first is:
from io import RawIOBase
from zipfile import ZipFile
from zipfile import ZipInfo
from zipfile import ZIP_DEFLATED
import boto3

session = boto3.Session(aws_access_key_id='x', aws_secret_access_key='x', region_name='us-east-2')
s3 = boto3.client('s3')
bucket_name = 'x'

class UnseekableStream(RawIOBase):
    def __init__(self):
        self._buffer = b''

    def writable(self):
        return True

    def write(self, b):
        if self.closed:
            raise ValueError('The stream was closed!')
        self._buffer += b
        return len(b)

    def get(self):
        chunk = self._buffer
        self._buffer = b''
        return chunk

def zipfile_generator(path, stream):
    with ZipFile(stream, mode='w') as zip_archive:
        z_info = ZipInfo.from_file(path)
        z_info.compress_type = ZIP_DEFLATED
        with open(path, 'rb') as entry, zip_archive.open(z_info, mode='w') as dest:
            for chunk in iter(lambda: entry.read(16384), b''):
                dest.write(chunk)
                yield stream.get()
    yield stream.get()

items_to_zip = ['file1.jpg', 'file2.jpg', 'file3.jpg']

stream = UnseekableStream()
with open("test.zip", "wb") as f:
    for file in items_to_zip:
        obj = s3.get_object(Bucket=bucket_name, Key=file)
        for i in zipfile_generator(obj.get(obj), stream):
            f.write(i)
            f.flush()
stream.close()
f.close()
This one gives me an error saying:
for i in zipfile_generator(obj.get(obj), stream):
TypeError: unhashable type: 'dict'
The second is:
import boto3
import smart_open
from smart_open import s3

session = boto3.Session()
source_bucket_name = "x"
bucket = session.resource('s3').Bucket(source_bucket_name)
prefix = "xx"  # s3 prefix for the files under a "folder"
output_path = "s3://xx/streamedzip.zip"

with smart_open.open(output_path, 'wb') as fout:
    for key, content in s3.iter_bucket(source_bucket_name, prefix=prefix):
        fout.write(content)
This one uploads a file back to S3 but it appears to be a corrupted zip file.
I am lost as to where to go from here.
Much thanks
For the second approach, you have to use another context manager for the ZipFile:
import zipfile

with smart_open.open(output_path, 'wb') as fout:
    with zipfile.ZipFile(fout, 'w') as zip:
        for key, content in s3.iter_bucket(source_bucket_name, prefix=prefix):
            zip.writestr(key, content)
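As for the first approach: the TypeError comes from obj.get(obj) — get_object already returns a dict, and zipfile_generator expects a local path anyway (it calls ZipInfo.from_file and open on it). Below is a rough, untested sketch of how that generator could be reshaped to read straight from the S3 streaming body instead, keeping a single ZipFile open across all entries so the archive stays valid. It reuses s3, bucket_name, items_to_zip and UnseekableStream from the question; the chunk size is arbitrary:

import datetime
from zipfile import ZipFile, ZipInfo, ZIP_DEFLATED

def zip_s3_objects(keys, stream):
    # One ZipFile for every entry; each entry is streamed from S3 in chunks.
    with ZipFile(stream, mode='w') as zip_archive:
        for key in keys:
            body = s3.get_object(Bucket=bucket_name, Key=key)['Body']
            z_info = ZipInfo(key, date_time=datetime.datetime.now().timetuple()[:6])
            z_info.compress_type = ZIP_DEFLATED
            # force_zip64 because individual objects may exceed 4 GB
            with zip_archive.open(z_info, mode='w', force_zip64=True) as dest:
                for chunk in body.iter_chunks(chunk_size=16384):
                    dest.write(chunk)
                    yield stream.get()
    yield stream.get()   # flush the central directory written on close

stream = UnseekableStream()
with open("test.zip", "wb") as f:
    for piece in zip_s3_objects(items_to_zip, stream):
        f.write(piece)

The same pieces could be fed to a multipart upload instead of a local file when the target is S3.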

Creating an AWS lambda function to split pdf files in a s3 bucket

I want to write an AWS Lambda function that:
Takes a PDF file from an S3 bucket -> splits the PDF file -> stores the split files back to an S3 bucket.
I am using the PyPDF2 module, so I need to know how I can use it in an AWS Lambda function as well.
The code to split pdf files:
import os
from PyPDF2 import PdfFileReader, PdfFileWriter

pdf_file_path = 'filename.pdf'
file_base_name = pdf_file_path.replace('.pdf', '')
output_folder_path = os.path.join(os.getcwd(), 'output')

pdf = PdfFileReader(pdf_file_path)

for page_num in range(pdf.numPages):
    pdfWriter = PdfFileWriter()
    pdfWriter.addPage(pdf.getPage(page_num))
    with open(os.path.join(output_folder_path, '{0}_Page{1}.pdf'.format(file_base_name, page_num+1)), 'wb') as f:
        pdfWriter.write(f)
        f.close()
What should my Lambda function be for this (the code)?
Your Lambda code needs to look something like this. In this case I am reading an S3 file using boto3. You pass arguments to your Lambda function in the event.
import boto3

from content_reader_lambda.pdf import reader

def read_pdf_from_bucket(event, context):
    bucket_name = event['bucket_name']
    file_name = event['file_name']
    s3 = boto3.resource('s3')
    obj = s3.Object(bucket_name, file_name)
    s3_file = obj.get()['Body'].read()
    return reader.pdf_as_text(s3_file, 'pdf')
I am using pymupdf to read the PDF and return text like this.
import fitz  # PyMuPDF

def pdf_as_text(file_stream, filetype):
    text = ''
    with fitz.open(stream=file_stream, filetype=filetype) as doc:
        for page in doc:
            # Sort reads the text in display/reading order. https://pymupdf.readthedocs.io/en/latest/page.html#Page.get_textpage
            text += page.get_text('text', sort=True)
    return text
You can replace that with your code and use boto3 to write your PDF back to S3.
Deploying your lambda to AWS together with the third party libraries that you use is a whole different topic. For that I suggest using layers. Smaller libraries are a lot easier to deploy given AWS size limits.
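If the goal is the original split-and-store flow rather than text extraction, a handler along the same lines might look roughly like this (an untested sketch; the event keys, the output/ prefix and the use of pypdf's newer PdfReader/PdfWriter instead of PdfFileReader/PdfFileWriter are all assumptions):

import io
import os

import boto3
from pypdf import PdfReader, PdfWriter

s3 = boto3.resource('s3')

def split_pdf_handler(event, context):
    bucket_name = event['bucket_name']   # assumed event shape
    file_name = event['file_name']
    base_name = os.path.splitext(os.path.basename(file_name))[0]

    source = s3.Object(bucket_name, file_name).get()['Body'].read()
    reader = PdfReader(io.BytesIO(source))

    written = []
    for page_num, page in enumerate(reader.pages, start=1):
        writer = PdfWriter()
        writer.add_page(page)
        out = io.BytesIO()
        writer.write(out)
        out.seek(0)
        key = f"output/{base_name}_Page{page_num}.pdf"
        s3.Object(bucket_name, key).upload_fileobj(out)
        written.append(key)
    return {'pages': written}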
pypdf can work with filestreams (docs):
Reading:
from io import BytesIO

from pypdf import PdfReader, PdfWriter

# Prepare example
with open("example.pdf", "rb") as fh:
    bytes_stream = BytesIO(fh.read())

# Read from bytes_stream
reader = PdfReader(bytes_stream)

# Write to bytes_stream
writer = PdfWriter()
with BytesIO() as bytes_stream:
    writer.write(bytes_stream)
Writing:
from io import BytesIO

import boto3
from pypdf import PdfReader, PdfWriter

reader = PdfReader(BytesIO(raw_bytes_data))
writer = PdfWriter()

# Add all pages to the writer
for page in reader.pages:
    writer.add_page(page)

# Add a password to the new PDF
writer.encrypt("my-secret-password")

# Save the new PDF to a file
with BytesIO() as bytes_stream:
    writer.write(bytes_stream)
    bytes_stream.seek(0)
    s3 = boto3.client("s3")
    s3.write_get_object_response(
        Body=bytes_stream, RequestRoute=request_route, RequestToken=request_token
    )
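Note that write_get_object_response only applies when the code runs behind S3 Object Lambda (that is where request_route and request_token come from). For an ordinary Lambda writing back to a regular bucket, the same buffer can simply be uploaded with put_object; the bucket and key below are placeholders:

with BytesIO() as bytes_stream:
    writer.write(bytes_stream)
    bytes_stream.seek(0)
    s3 = boto3.client("s3")
    # upload the in-memory PDF as a normal S3 object
    s3.put_object(Bucket="your-bucket", Key="encrypted/output.pdf", Body=bytes_stream.read())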

How to stream a large gzipped .tsv file from s3, process it, and write back to a new file on s3?

I have a large file s3://my-bucket/in.tsv.gz that I would like to load and process, then write its processed version back to an S3 output file s3://my-bucket/out.tsv.gz.
How do I stream in.tsv.gz directly from S3 without loading the whole file into memory (it cannot fit in memory)?
How do I write the processed gzipped stream directly to S3?
In the following code, I show how I was thinking of loading the gzipped input dataframe from S3, and how I would write the .tsv if it were located locally (bucket_dir_local = './').
import pandas as pd
import s3fs
import os
import gzip
import csv
import io

bucket_dir = 's3://my-bucket/annotations/'
df = pd.read_csv(os.path.join(bucket_dir, 'in.tsv.gz'), sep='\t', compression="gzip")

bucket_dir_local = './'
# not sure how to do it with an s3 path
with gzip.open(os.path.join(bucket_dir_local, 'out.tsv.gz'), "w") as f:
    with io.TextIOWrapper(f, encoding='utf-8') as wrapper:
        w = csv.DictWriter(wrapper, fieldnames=['test', 'testing'], extrasaction="ignore")
        w.writeheader()
        for index, row in df.iterrows():
            my_dict = {"test": index, "testing": row[6]}
            w.writerow(my_dict)
Edit: smart_open looks like the way to go.
Here is a dummy example to read a file from s3 and write it back to s3 using smart_open
from smart_open import open
import os

bucket_dir = "s3://my-bucket/annotations/"

with open(os.path.join(bucket_dir, "in.tsv.gz"), "rb") as fin:
    with open(os.path.join(bucket_dir, "out.tsv.gz"), "wb") as fout:
        for line in fin:
            l = [i.strip() for i in line.decode().split("\t")]
            string = "\t".join(l) + "\n"
            fout.write(string.encode())
For downloading the file, you can stream the S3 object directly in Python. I'd recommend reading that entire post, but here are some key lines from it:
import boto3
import gzip

s3 = boto3.client('s3', aws_access_key_id='mykey', aws_secret_access_key='mysecret')  # your authentication may vary
obj = s3.get_object(Bucket='my-bucket', Key='my/precious/object')

body = obj['Body']
with gzip.open(body, 'rt') as gf:
    for ln in gf:
        process(ln)
Unfortunately, S3 doesn't support true streaming input, but this SO answer has an implementation that chunks out the file and sends each chunk up to S3. While not a "true stream", it will let you upload large files without needing to keep the entire thing in memory.
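For completeness, here is a bare-bones sketch of that chunked-upload idea using boto3's multipart upload calls (every part except the last must be at least 5 MB; produce_compressed_chunks is a stand-in for whatever yields your gzipped output, and the bucket/key are placeholders):

import boto3

s3 = boto3.client('s3')
bucket, key = 'my-bucket', 'out.tsv.gz'
PART_SIZE = 8 * 1024 * 1024   # parts must be >= 5 MB, except the last one

mpu = s3.create_multipart_upload(Bucket=bucket, Key=key)
parts, part_number = [], 1
buffer = b''

def flush(data, part_number):
    # upload one part and record its ETag for the completion call
    resp = s3.upload_part(Bucket=bucket, Key=key, PartNumber=part_number,
                          UploadId=mpu['UploadId'], Body=data)
    return {'ETag': resp['ETag'], 'PartNumber': part_number}

for chunk in produce_compressed_chunks():   # stand-in for your gzip-producing generator
    buffer += chunk
    while len(buffer) >= PART_SIZE:
        parts.append(flush(buffer[:PART_SIZE], part_number))
        buffer, part_number = buffer[PART_SIZE:], part_number + 1

if buffer:
    parts.append(flush(buffer, part_number))

s3.complete_multipart_upload(Bucket=bucket, Key=key, UploadId=mpu['UploadId'],
                             MultipartUpload={'Parts': parts})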

upload a dataframe as a zipped csv directly to s3 without saving it on the local machine

How can I upload a data frame as a zipped csv into an S3 bucket without saving it on my local machine first?
I have the connection to that bucket already running using:
self.s3_output = S3(bucket_name='test-bucket', bucket_subfolder='')
We can make a file-like object with BytesIO and zipfile from the standard library.
# 3.7
from io import BytesIO
import zipfile

# .to_csv returns a string when called with no args
s = df.to_csv()

# keep a handle on the buffer so it can be uploaded afterwards
buffer = BytesIO()
with zipfile.ZipFile(buffer, mode="w") as z:
    z.writestr("df.csv", s)

# rewind before uploading
buffer.seek(0)
You'll want to refer to upload_fileobj in order to customize how the upload behaves.
yourclass.s3_output.upload_fileobj(buffer, ...)
This works equally well for zip and gz:
import boto3
import gzip
import pandas as pd
from io import BytesIO, TextIOWrapper

s3_client = boto3.client(
    service_name="s3",
    endpoint_url=your_endpoint_url,
    aws_access_key_id=your_access_key,
    aws_secret_access_key=your_secret_key
)

# Your file name inside the archive
your_filename = "test.csv"
s3_path = "path/to/your/s3/compressed/file/test.csv.gz"
bucket = "your_bucket"

df = your_df

gz_buffer = BytesIO()
with gzip.GzipFile(
        filename=your_filename,
        mode='w',
        fileobj=gz_buffer) as gz_file:
    df.to_csv(TextIOWrapper(gz_file, 'utf8'), index=False)

s3_client.put_object(
    Bucket=bucket, Key=s3_path, Body=gz_buffer.getvalue()
)
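The zip variant under the same assumptions would look roughly like this, swapping GzipFile for ZipFile and writing the CSV as a named member of the archive:

import zipfile
from io import BytesIO

zip_buffer = BytesIO()
with zipfile.ZipFile(zip_buffer, mode='w', compression=zipfile.ZIP_DEFLATED) as zf:
    # write the dataframe as a named member inside the archive
    zf.writestr(your_filename, df.to_csv(index=False))

s3_client.put_object(
    Bucket=bucket, Key="path/to/your/s3/compressed/file/test.zip", Body=zip_buffer.getvalue()
)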

How to load a pickle file from S3 to use in AWS Lambda?

I am currently trying to load a pickled file from S3 into AWS Lambda and store it in a list (the pickle is a list).
Here is my code:
import pickle
import boto3

s3 = boto3.resource('s3')

with open('oldscreenurls.pkl', 'rb') as data:
    old_list = s3.Bucket("pythonpickles").download_fileobj("oldscreenurls.pkl", data)
I get the following error even though the file exists:
FileNotFoundError: [Errno 2] No such file or directory: 'oldscreenurls.pkl'
Any ideas?
Super simple solution
import pickle
import boto3
s3 = boto3.resource('s3')
my_pickle = pickle.loads(s3.Bucket("bucket_name").Object("key_to_pickle.pickle").get()['Body'].read())
As shown in the documentation for download_fileobj, you need to open the file in binary write mode and save to the file first. Once the file is downloaded, you can open it for reading and unpickle.
import pickle
import boto3

s3 = boto3.resource('s3')
with open('oldscreenurls.pkl', 'wb') as data:
    s3.Bucket("pythonpickles").download_fileobj("oldscreenurls.pkl", data)

with open('oldscreenurls.pkl', 'rb') as data:
    old_list = pickle.load(data)
download_fileobj takes the name of an object in S3 plus a handle to a local file, and saves the contents of that object to the file. There is also a version of this function called download_file that takes a filename instead of an open file handle and handles opening it for you.
In this case it would probably be better to use S3Client.get_object though, to avoid having to write and then immediately read a file. You could also write to an in-memory BytesIO object, which acts like a file but doesn't actually touch a disk. That would look something like this:
import pickle
import boto3
from io import BytesIO

s3 = boto3.resource('s3')
with BytesIO() as data:
    s3.Bucket("pythonpickles").download_fileobj("oldscreenurls.pkl", data)
    data.seek(0)  # move back to the beginning after writing
    old_list = pickle.load(data)
This is the easiest solution. You can load the data without even downloading the file locally, using S3FileSystem:
import pickle
from s3fs.core import S3FileSystem

s3_file = S3FileSystem()
data = pickle.load(s3_file.open('{}/{}'.format(bucket_name, file_path)))
In my implementation, the S3 file is read and unpickled like this:
import pickle
import boto3
name = img_url.split('/')[::-1][0]
folder = 'media'
file_name = f'{folder}/{name}'
bucket_name = bucket_name
s3 = boto3.client('s3', aws_access_key_id=aws_access_key_id,aws_secret_access_key=aws_secret_access_key)
response = s3.get_object(Bucket=bucket_name, Key=file_name)
body = response['Body'].read()
data = pickle.loads(body)
