I am currently trying to save a word cloud generated in my AWS Lambda function to my S3 bucket. My code executes but gives the following error:
"errorMessage": "Parameter validation failed:\nInvalid type for parameter Body, value: <wordcloud.wordcloud.WordCloud object at 0x7f36643fce10>, type: <class 'wordcloud.wordcloud.WordCloud'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object"
I have looked online and can't seem to find the cause. Do I need to convert the plot to bytes to be able to store it in S3 like this?
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import boto3

def generate_word_cloud(text):
    wordcloud = WordCloud(width=3000,
                          height=2000,
                          random_state=1,
                          background_color='salmon',
                          colormap='Pastel1',
                          collocations=False,
                          stopwords=STOPWORDS).generate(text)
    save_to_bucket(wordcloud)

def save_to_bucket(wordcloud):
    # Save the word cloud image to an S3 bucket
    BUCKET_NAME = "1706224-tweets"
    FILE_NAME = "wordcloud.png"
    s3 = boto3.resource('s3')
    object = s3.Object(BUCKET_NAME, FILE_NAME)
    object.put(Body=wordcloud)

text = "cat cat cat dog dog dog test test hello one two three four five"
generate_word_cloud(text)
In the end I used Lambda's /tmp storage to store the image temporarily, then uploaded the image afterwards:
wordcloud.to_file("/tmp/"+FILE_NAME)
s3 = boto3.client('s3')
s3.upload_file("/tmp/"+FILE_NAME,BUCKET_NAME,FILE_NAME)
You could convert the PIL image to a byte array and use that. For instance, in your case you could do the following in your save_to_bucket method:
import io
from PIL import Image

def save_to_bucket(wordcloud):
    # Save the word cloud image to an S3 bucket
    BUCKET_NAME = "1706224-tweets"
    FILE_NAME = "wordcloud.png"
    s3 = boto3.resource('s3')
    object = s3.Object(BUCKET_NAME, FILE_NAME)
    # Here you convert the PIL image that WordCloud generates to a byte array
    image_byte = image_to_byte_array(wordcloud.to_image())
    object.put(Body=image_byte)

def image_to_byte_array(image: Image.Image, format: str = 'png'):
    result = io.BytesIO()
    image.save(result, format=format)
    result = result.getvalue()
    return result
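Alternatively, since S3 accepts file-like objects as well as bytes, you could pass a BytesIO buffer straight to upload_fileobj and skip getvalue(). A minimal sketch of that variant, reusing the bucket and key names from the question:

import io
import boto3

def save_to_bucket(wordcloud):
    # Render the word cloud to an in-memory PNG buffer instead of a temp file
    buffer = io.BytesIO()
    wordcloud.to_image().save(buffer, format='PNG')
    buffer.seek(0)  # rewind so the upload reads from the start

    # upload_fileobj accepts any file-like object
    s3 = boto3.client('s3')
    s3.upload_fileobj(buffer, "1706224-tweets", "wordcloud.png")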
Related
I am trying to crop a PDF and save it to S3 with the same name using Lambda. I am getting an error because the data type I am passing is a fitz.fitz.Page.
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO

OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]

def lambda_handler(event, context):
    textract = boto3.client("textract")
    if event:
        file_obj = event["Records"][0]
        bucketname = str(file_obj["s3"]["bucket"]["name"])
        filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))

        doc = fitz.open()
        s3 = boto3.resource('s3')
        obj = s3.Object(bucketname, filename)
        fs = obj.get()['Body'].read()
        pdf = fitz.open("pdf", stream=BytesIO(fs))
        #pdf.close()
        rect = fitz.Rect(0.0, 0.0, 595.0, 842.0)
        #page = pdf[0]
        page1 = doc.new_page(width=rect.width,  # new page with ...
                             height=rect.height)
        page1.show_pdf_page(rect, pdf, 0)
        print(type(doc))
        print(type(page1))
        s3.Bucket(bucketname).put_object(Key=filename, Body=page1)
This is happening because the page1 object is a fitz.fitz.Page, while the Body parameter of S3's put_object expects bytes.
To solve the issue, you can use the write function of the new PDF (doc), whose output is in bytes format, which you can then pass to S3.
# Save the file first.
new_bytes = doc.write()
s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)
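Note that on more recent PyMuPDF releases, Document.write() has been deprecated in favour of Document.tobytes(); whether that applies depends on your installed version, so treat this variant as an assumption to verify against your setup:

# tobytes() replaces write() on newer PyMuPDF versions (check your version)
new_bytes = doc.tobytes()
s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)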
For some reason, the doc.write() method didn't return a bytes object for me as stated above. Here's an additional way to create a new doc, convert it to bytes using BytesIO, and then save it to S3 as a PDF:
import boto3
import fitz
from io import BytesIO

client = boto3.client("s3")

# create new doc object
single_page = fitz.open()

# insert a page from original_pdf_doc (opened elsewhere in your code)
single_page.insert_pdf(
    original_pdf_doc, from_page=from_page_num, to_page=to_page_num
)

# Use BytesIO and the .write() method to save to a bytes object
bytes_ = BytesIO(single_page.write())

# Upload the bytes object!
client.put_object(Body=bytes_, Bucket=bucket, Key=key)
I wanted to use the Cloud Vision API to detect labels in roughly 40K photographs and download the results as CSV files. I uploaded the photos to Cloud Storage and used the following code, but the error below occurred. I asked a person who uses Python in his job, but he cannot deal with this error. Can you help me fix it?
TypeError: Invalid constructor input for BatchAnnotateImagesRequest: [{'image': source {
image_uri: "gs://bucket/image-path.jpg"
}
, 'features': [{'type': <Type.LABEL_DETECTION: 4>}]}]
The code I used:
from google.cloud import vision
from google.cloud import storage
from google.cloud.vision_v1 import ImageAnnotatorClient
from google.cloud.vision_v1 import types
import os
import json
import numpy as np

os.environ["GOOGLE_APPLICATION_CREDENTIALS"] = 'C://file-path.json'
#(created in step 1)

# Get GCS bucket
storage_client = storage.Client()
bucket = storage_client.bucket('bucket_name')
image_paths = []
for blob in list(bucket.list_blobs()):
    image_paths.append("gs://bucket_name/" + blob.name)

# We can send a maximum of 16 images per request.
start = 0
end = 16
label_output = []

for i in range(int(np.floor(len(image_paths)/16))+1):
    requests = []
    client = vision.ImageAnnotatorClient()
    for image_path in image_paths[start:end]:
        image = types.Image()
        image.source.image_uri = image_path
        requests.append({'image': image, 'features': [{'type': vision.Feature.Type.LABEL_DETECTION}]})
    response = client.batch_annotate_images(requests)
    for image_path, i in zip(image_paths[start:end], response.responses):
        labels = [{label.description: label.score} for label in i.label_annotations]
        labels = {k: v for d in labels for k, v in d.items()}
        filename = os.path.basename(image_path)
        l = {'filename': filename, 'labels': labels}
        label_output.append(l)
    start = start + 16
    end = end + 16

# export results to CSV file
for l in label_output:
    print('"' + l['filename'] + '";', end='')
    for label in l["labels"]:
        print('"' + label + '";"' + str(l["labels"][label]) + '";', end='')
    print("")
batch_annotate_images() is not receiving the contents of requests properly. To fix this, just assign your requests variable explicitly to the requests parameter of batch_annotate_images():
response = client.batch_annotate_images(requests=requests)
See batch_annotate_images() for reference. Also, if you are planning to update your Vision API client to 2.3.1, you might encounter errors on features; see this reference for the updated usage of its parameters.
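To illustrate, here is a minimal sketch of the inner loop with the keyword argument applied. It uses the 2.x-style vision.Feature and vision.AnnotateImageRequest constructors, which is an assumption about your installed client version, and the image_paths list is just a stand-in for your own slice:

from google.cloud import vision

# Stand-in URIs; substitute your own image_paths[start:end] slice
image_paths = ["gs://bucket_name/example1.jpg", "gs://bucket_name/example2.jpg"]

client = vision.ImageAnnotatorClient()
requests = []
for image_path in image_paths:
    image = vision.Image(source=vision.ImageSource(image_uri=image_path))
    # In 2.x clients the feature type field is exposed as type_
    feature = vision.Feature(type_=vision.Feature.Type.LABEL_DETECTION)
    requests.append(vision.AnnotateImageRequest(image=image, features=[feature]))

# The key fix: pass the list via the keyword argument
response = client.batch_annotate_images(requests=requests)
for res in response.responses:
    for label in res.label_annotations:
        print(label.description, label.score)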
I am making a simple Flask API for uploading an image, doing some processing on it, and then storing it in the database as binary. Then I want to download it using the send_file() function, but when I pass the image as bytes it gives me this error:
return send_file(BytesIO.read(image.data), attachment_filename='f.jpg', as_attachment=True)
TypeError: descriptor 'read' requires a '_io.BytesIO' object but received a 'bytes'
My code for uploading an image is as follows:
@app.route('/upload', methods=['POST'])
def upload():
    images = request.files.getlist('uploadImages')
    n = 0
    for image in images:
        fileName = image.filename.split('.')[0]
        fileFormat = image.filename.split('.')[1]
        imageRead = image.read()
        img = BytesIO(imageRead)
        with graph.as_default():
            caption = generate_caption_from_file(img)
        newImage = imageDescription(name=fileName, format=fileFormat, description=caption,
                                    data=imageRead)
        db.session.add(newImage)
        db.session.commit()
        n = n + 1
    return str(n) + ' Image has been saved successfully'
And my code for downloading an image:
@app.route('/download/<int:id>')
def download(id):
    image = imageDescription.query.get_or_404(id)
    return send_file(BytesIO.read(image.data), attachment_filename='f.jpg', as_attachment=True)
Can anyone help, please?
It seems you are confused about io.BytesIO. Let's look at some examples of using BytesIO.
>>> from io import BytesIO
>>> inp = BytesIO(b'Hello World')
>>> inp
<_io.BytesIO object at 0x7ff2a71ecb30>
>>> inp.read()   # read the bytes stream for the first time
b'Hello World'
>>> inp.read()   # now it is positioned at the end, so it doesn't return anything
b''
>>> inp.seek(0)  # position it back to the beginning
0
>>> BytesIO.read(inp)  # same as inp.read(); returns the bytes stream
b'Hello World'
>>> inp.seek(0)
0
>>> inp.read(4)  # read only up to four bytes of the stream
b'Hell'
This should give you an idea of how read on BytesIO works. I think what you need to do is this.
return send_file(
    BytesIO(image.data),
    mimetype='image/jpg',
    as_attachment=True,
    attachment_filename='f.jpg'
)
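If you are on Flask 2.0 or later, note that send_file's attachment_filename parameter was renamed to download_name; the equivalent call (assuming a newer Flask) would be:

return send_file(
    BytesIO(image.data),
    mimetype='image/jpg',
    as_attachment=True,
    download_name='f.jpg'  # replaces attachment_filename in Flask >= 2.0
)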
I am building a gallery for my Flask website and am having trouble generating thumbnails in my CMS. I have a function that takes the uploaded images (stored in a list), duplicates the list, and uses zip() to iterate over both lists: I make thumbnails from one list and upload the full images from the other, sending both to AWS S3. The issue is that the thumbnails are created, but the full images are uploaded with sizes of 0 bytes. I also have a function that renames the images, and that works, so I assume I am doing something right.
Why am I getting files of 0 bytes? How does that happen? What is 'emptying' the file?
@application.route("/upload", methods=['GET', 'POST'])
@login_required
def upload():
    form = UploadForm()
    uploaded_files = request.files.getlist("photo")
    conn = boto3.client(
        's3',
        region_name="region",
        aws_access_key_id='aws_access_key_id',
        aws_secret_access_key='aws_secret_access_key'
    )
    bucket_name = "bucket"
    if form.validate_on_submit():
        i = 1
        thumbs_list = list(uploaded_files)
        for z, w in zip(thumbs_list, uploaded_files):
            x = photo_rename(w)
            thumb = save_thumbnail(z)
            conn.upload_fileobj(x.stream, bucket_name, 'gallery/fulls/'+w.filename)
            conn.upload_fileobj(thumb, bucket_name, 'gallery/thumbs/'+w.filename)
            form_commit = Gallery(event=form.event.data,
                                  date=form.date.data,
                                  image_order=i,
                                  image_file_fl=w.filename,
                                  image_file_th=w.filename
                                  )
            db.session.add(form_commit)
            i += 1
        db.session.commit()
    return '', 204
I was able to get both uploads to work by using io.BytesIO() in my for loop:
for z, w in zip(thumbs_list, uploaded_files):
    x = photo_rename(w)
    c = Image.open(x)
    in_mem_file = io.BytesIO()
    c.save(in_mem_file, "JPEG")
    in_mem_file.seek(0)
    thumb = save_thumbnail(z)
    conn.upload_fileobj(in_mem_file, bucket_name, 'gallery/fulls/'+w.filename)
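As for why the originals came out as 0 bytes: thumbs_list = list(uploaded_files) copies the list but not the underlying FileStorage objects, so z and w refer to the same stream. Once save_thumbnail (or photo_rename) has read that stream, its position sits at the end, and the following upload_fileobj reads nothing. A likely minimal fix is simply to rewind the stream before uploading; a sketch under that assumption, using the question's own photo_rename and save_thumbnail helpers:

for z, w in zip(thumbs_list, uploaded_files):
    x = photo_rename(w)
    thumb = save_thumbnail(z)  # this read consumes the shared stream

    x.stream.seek(0)           # rewind before uploading the full image
    conn.upload_fileobj(x.stream, bucket_name, 'gallery/fulls/' + w.filename)
    conn.upload_fileobj(thumb, bucket_name, 'gallery/thumbs/' + w.filename)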
I recently started using Google's Vision API. I am trying to annotate a batch of images and therefore followed the 'batch image annotation offline' guide from their documentation.
However, it is not clear to me how I can annotate MULTIPLE images with one API call. Let's say I have stored 10 images in my Google Cloud bucket. How can I annotate all these images at once and store the results in one JSON file? Right now, I wrote a program that calls their example function and it works, but to put it simply, why can't I say: 'Look in this folder and annotate all images in it.'?
Thanks in advance.
from batch_image_labeling import sample_async_batch_annotate_images

counter = 0
for file in os.listdir('my_directory'):
    filename = file
    sample_async_batch_annotate_images('gs://my_bucket/{}'.format(filename), 'gs://my_bucket/{}'.format(counter))
    counter += 1
from google.cloud import vision_v1
from google.cloud.vision_v1 import enums
import six

def sample_async_batch_annotate_images(input_image_uri, output_uri):
    """Perform async batch image annotation"""
    client = vision_v1.ImageAnnotatorClient()

    if isinstance(input_image_uri, six.binary_type):
        input_image_uri = input_image_uri.decode('utf-8')
    if isinstance(output_uri, six.binary_type):
        output_uri = output_uri.decode('utf-8')

    source = {'image_uri': input_image_uri}
    image = {'source': source}
    type_ = enums.Feature.Type.LABEL_DETECTION
    features_element = {'type': type_}
    type_2 = enums.Feature.Type.IMAGE_PROPERTIES
    features_element_2 = {'type': type_2}
    features = [features_element, features_element_2]
    requests_element = {'image': image, 'features': features}
    requests = [requests_element]
    gcs_destination = {'uri': output_uri}

    # The max number of responses to output in each JSON file
    batch_size = 2
    output_config = {'gcs_destination': gcs_destination, 'batch_size': batch_size}

    operation = client.async_batch_annotate_images(requests, output_config)

    print('Waiting for operation to complete...')
    response = operation.result()

    # The output is written to GCS with the provided output_uri as prefix
    gcs_output_uri = response.output_config.gcs_destination.uri
    print('Output written to GCS with prefix: {}'.format(gcs_output_uri))
It's somewhat unclear from that example, but your call to async_batch_annotate_images takes a requests parameter which is a list of multiple requests. So you can do something like this:
from google.cloud import vision_v1
from google.cloud.vision_v1 import enums
import os
import six

def generate_request(input_image_uri):
    if isinstance(input_image_uri, six.binary_type):
        input_image_uri = input_image_uri.decode('utf-8')

    source = {'image_uri': input_image_uri}
    image = {'source': source}
    type_ = enums.Feature.Type.LABEL_DETECTION
    features_element = {'type': type_}
    type_2 = enums.Feature.Type.IMAGE_PROPERTIES
    features_element_2 = {'type': type_2}
    features = [features_element, features_element_2]
    requests_element = {'image': image, 'features': features}
    return requests_element

def sample_async_batch_annotate_images(input_uri, output_uri):
    """Perform async batch image annotation"""
    client = vision_v1.ImageAnnotatorClient()

    if isinstance(output_uri, six.binary_type):
        output_uri = output_uri.decode('utf-8')

    requests = [
        generate_request(input_uri.format(filename))
        for filename in os.listdir('my_directory')
    ]

    gcs_destination = {'uri': output_uri}

    # The max number of responses to output in each JSON file
    batch_size = 1
    output_config = {'gcs_destination': gcs_destination, 'batch_size': batch_size}

    operation = client.async_batch_annotate_images(requests, output_config)

    print('Waiting for operation to complete...')
    response = operation.result()

    # The output is written to GCS with the provided output_uri as prefix
    gcs_output_uri = response.output_config.gcs_destination.uri
    print('Output written to GCS with prefix: {}'.format(gcs_output_uri))

sample_async_batch_annotate_images('gs://my_bucket/{}', 'gs://my_bucket/results')
This can annotate up to 2,000 images in a single request. The only downside is that you can only specify a single output_uri as a destination, so you won't be able to use counter to put each result in a separate file, but you can set batch_size = 1 to ensure each response is written separately if this is what you want.
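If you later want to pull those per-image results back together (for example to build the CSV mentioned earlier), something along these lines should work; the bucket name, prefix, and JSON field names are assumptions based on the async output format, so adjust them to your run:

import json
from google.cloud import storage

# Hypothetical bucket/prefix matching the output_uri used above
storage_client = storage.Client()
bucket = storage_client.bucket('my_bucket')

all_responses = []
for blob in bucket.list_blobs(prefix='results'):
    # Each output file is a JSON document with a "responses" list
    data = json.loads(blob.download_as_bytes())
    all_responses.extend(data.get('responses', []))

for res in all_responses:
    for label in res.get('labelAnnotations', []):
        print(label.get('description'), label.get('score'))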