Saving a pymupdf fitz object to s3 as a pdf - python

I am trying to crop a pdf and save it to s3 with same name using lambda. I am getting error on the data type being a fitz.fitz.page
import os
import json
import boto3
from urllib.parse import unquote_plus
import fitz, sys
from io import BytesIO
OUTPUT_BUCKET_NAME = os.environ["OUTPUT_BUCKET_NAME"]
OUTPUT_S3_PREFIX = os.environ["OUTPUT_S3_PREFIX"]
SNS_TOPIC_ARN = os.environ["SNS_TOPIC_ARN"]
SNS_ROLE_ARN = os.environ["SNS_ROLE_ARN"]
def lambda_handler(event, context):
textract = boto3.client("textract")
if event:
file_obj = event["Records"][0]
bucketname = str(file_obj["s3"]["bucket"]["name"])
filename = unquote_plus(str(file_obj["s3"]["object"]["key"]))
doc = fitz.open()
s3 = boto3.resource('s3')
obj = s3.Object(bucketname, filename)
fs = obj.get()['Body'].read()
pdf=fitz.open("pdf", stream=BytesIO(fs))
#pdf.close()
rect=fitz.Rect(0.0, 0.0, 595.0, 842.0)
#page = pdf[0]
page1 = doc.new_page(width = rect.width, # new page with ...
height = rect.height)
page1.show_pdf_page(rect, pdf, 0)
print(type(doc))
print(type(page1))
s3.Bucket(bucketname).put_object(Key=filename, Body=page1)

This is happening because the page1 object is defined using fitz.fitz.page and the type expected by S3 put object is bytes.
In order to solve the issue, you can use the write function of the new PDF (doc) and get the output of it which is in bytes format that you could pass to S3 then.
# Save fil first.
new_bytes = doc.write()
s3.Bucket(bucketname).put_object(Key=filename, Body=new_bytes)

For some reason for me the doc.write() method didn't return a bytes object like stated above. Here's an additional way to create a new doc, convert to bytes using BytesIO, and then save it to s3 as a pdf:
import fitz
from io import BytesIO
client = boto3.client("s3")
# create new doc object
single_page = fitz.open()
# insert a page from original_pdf_doc
single_page.insert_pdf(
original_pdf_doc, from_page=from_page_num, to_page=to_page_num
)
# Use BytesIO and .write() method to save to a bytes object
bytes_ = BytesIO(single_page.write())
# Upload the bytes object!
client.put_object(Body=bytes_, Bucket=bucket, Key=key)

Related

Google Vision API problem with batch annotations

I wanted to use Cloud Vision API to detect labels from ca. 40K photographs and download the results as CSV files. I uploaded photos into the cloud storage and used the following code, but the error occured. I asked a person who uses python in his job but he cannot deal with this error. Can you help mi with fixing it?
TypeError: Invalid constructor input for BatchAnnotateImagesRequest: [{'image': source {
image_uri: "gs://bucket/image-path.jpg"
}
, 'features': [{'type': <Type.LABEL_DETECTION: 4>}]}]
The code I used:
from google.cloud import
from google.cloud import storage
from google.cloud.vision_v1 import ImageAnnotatorClient
from google.cloud.vision_v1 import types
import os
import json
import numpy as np
os.environ["GOOGLE_APPLICATION_CREDENTIALS"]='C://file-path.json'
#(created in step 1)
# Get GCS bucket
storage_client = storage.Client()
bucket = storage_client.bucket('bucket_name')
image_paths = []
for blob in list(bucket.list_blobs()):
image_paths.append("gs://bucket_name/"+blob.name)
# We can send a maximum of 16 images per request.
start = 0
end = 16
label_output = []
for i in range(int(np.floor(len(image_paths)/16))+1):
requests = []
client = vision.ImageAnnotatorClient()
for image_path in image_paths[start:end]:
image = types.Image()
image.source.image_uri = image_path
requests.append({'image': image,'features': [{'type': vision.Feature.Type.LABEL_DETECTION}]})
response = client.batch_annotate_images(requests)
for image_path, i in zip(image_paths[start:end], response.responses):
labels = [{label.description: label.score} for label in i.label_annotations]
labels = {k: v for d in labels for k, v in d.items()}
filename = os.path.basename(image_path)
l = {'filename': filename, 'labels': labels}
label_output.append(l)
start = start+16
end = end+16
#export results to CSV file
for l in label_output:
print('"' + label_output[l]['filename'] + '";', end = '')
for label in label_output[l]["labels"]:
print('"' + label + '";"' + label_output[l][label] + '";', end = '')
print("")
batch_annotate_images() is not getting the contents of requests properly. To fix this, just assign your variable requests explicitly to the parameter requests of batch_annotate_images().
response = client.batch_annotate_images(requests=requests)
See batch_annotate_images() for reference. Also if you are planning to update your Vision API to 2.3.1, you might encounter errors on features: see this reference for the updated usage of its parameters.

saving wordcloud plot from AWS lambda to S3 bucket

I am currently trying to save a word cloud I am generating in my AWS lambda function to my s3 bucket, my code executes gut gives "errorMessage": "Parameter validation failed:\nInvalid type for parameter Body, value: <wordcloud.wordcloud.WordCloud object at 0x7f36643fce10>, type: <class 'wordcloud.wordcloud.WordCloud'>, valid types: <class 'bytes'>, <class 'bytearray'>, file-like object",
as an error, I have looked online and cant seem to find the cause, do I need to to convert the plot to bytes to be able to store it in S3 like this?
from wordcloud import WordCloud, STOPWORDS
import matplotlib.pyplot as plt
import boto3
text = "cat cat cat dog dog dog test test hello one two three four five"
generate_word_cloud(text)
def generate_word_cloud(text):
wordcloud = WordCloud(width = 3000,
height = 2000,
random_state=1,
background_color='salmon',
colormap='Pastel1',
collocations=False,
stopwords = STOPWORDS).generate(text)
save_to_bucket(wordcloud)
def save_to_bucket(wordcloud):
#Save tweet list to an s3 bucket
BUCKET_NAME = "1706224-tweets"
FILE_NAME = "wordcloud.png"
s3 = boto3.resource('s3')
object = s3.Object(BUCKET_NAME, FILE_NAME)
object.put(Body=wordcloud)
In the end I used the /tmp storage from lambda to store the image temporarily, then uploaded the image after
wordcloud.to_file("/tmp/"+FILE_NAME)
s3 = boto3.client('s3')
s3.upload_file("/tmp/"+FILE_NAME,BUCKET_NAME,FILE_NAME)
you could convert the PIL image to a byte array and use that, for instance in your case you could do the following in your save_to_bucket method
def save_to_bucket(wordcloud):
#Save tweet list to an s3 bucket
BUCKET_NAME = "1706224-tweets"
FILE_NAME = "wordcloud.png"
s3 = boto3.resource('s3')
object = s3.Object(BUCKET_NAME, FILE_NAME)
// here you convert the PIL image that generate wordcloud to byte array
image_byte = image_to_byte_array(wordcloud.to_image())
object.put(Body=image_byte)
def image_to_byte_array(image: Image, format: str = 'png'):
result = io.BytesIO()
image.save(result, format=format)
result = result.getvalue()
return result

How to convert Bytes object to _io.BytesIO python?

I am making a simple flask API for uploading an image and do some progresses then store it in the data base as binary, then i want to download it by using send_file() function but, when i am passing an image like a bytes it gives me an error:
return send_file(BytesIO.read(image.data), attachment_filename='f.jpg', as_attachment=True) TypeError: descriptor
'read' requires a '_io.BytesIO' object but received a 'bytes'
and My code for upload an image as follow:
#app.route('/upload', methods=['POST'])
def upload():
images = request.files.getlist('uploadImages')
n = 0
for image in images:
fileName = image.filename.split('.')[0]
fileFormat = image.filename.split('.')[1]
imageRead = image.read()
img = BytesIO(imageRead)
with graph.as_default():
caption = generate_caption_from_file(img)
newImage = imageDescription(name=fileName, format=fileFormat, description=caption,
data=imageRead)
db.session.add(newImage)
db.session.commit()
n = n + 1
return str(n) + ' Image has been saved successfully'
And my code for downloading an image:
#app.route('/download/<int:id>')
def download(id):
image = imageDescription.query.get_or_404(id)
return send_file(BytesIO.read(image.data), attachment_filename='f.jpg', as_attachment=True)
any one can help please???
It seems you are confused io.BytesIO. Let's look at some examples of using BytesIO.
>>> from io import BytesIO
>>> inp_b = BytesIO(b'Hello World', )
>>> inp_b
<_io.BytesIO object at 0x7ff2a71ecb30>
>>> inp.read() # read the bytes stream for first time
b'Hello World'
>>> inp.read() # now it is positioned at the end so doesn't give anything.
b''
>>> inp.seek(0) # position it back to begin
>>> BytesIO.read(inp) # This is same as above and prints bytes stream
b'Hello World'
>>> inp.seek(0)
>>> inp.read(4) # Just read upto four bytes of stream.
>>> b'Hell'
This should give you an idea of how read on BytesIO works. I think what you need to do is this.
return send_file(
BytesIO(image.data),
mimetype='image/jpg',
as_attachment=True,
attachment_filename='f.jpg'
)

Python requests base64 image

I am using requests to get the image from remote URL. Since the images will always be 16x16, I want to convert them to base64, so that I can embed them later to use in HTML img tag.
import requests
import base64
response = requests.get(url).content
print(response)
b = base64.b64encode(response)
src = "data:image/png;base64," + b
The output for response is:
response = b'GIF89a\x80\x00\x80\x00\xc4\x1f\x00\xff\xff\xff\x00\x00\x00\xff\x00\x00\xff\x88\x88"""\xffff\...
The HTML part is:
<img src="{{src}}"/>
But the image is not displayed.
How can I properly base-64 encode the response?
I think it's just
import base64
import requests
response = requests.get(url)
uri = ("data:" +
response.headers['Content-Type'] + ";" +
"base64," + base64.b64encode(response.content))
Assuming content-type is set.
This worked for me:
import base64
import requests
response = requests.get(url)
uri = ("data:" +
response.headers['Content-Type'] + ";" +
"base64," + base64.b64encode(response.content).decode("utf-8"))
You may use the base64 package.
import requests
import base64
response = requests.get(url).content
print(response)
b64response = base64.b64encode(response)
print b64response
Here's my code to send/receive images over Http requests, encoded with base64
Send Request:
# Read Image
image_data = cv2.imread(image_path)
# Convert numpy array To PIL image
pil_detection_img = Image.fromarray(cv2.cvtColor(img_detections, cv2.COLOR_BGR2RGB))
# Convert PIL image to bytes
buffered_detection = BytesIO()
# Save Buffered Bytes
pil_detection_img.save(buffered_detection, format='PNG')
# Base 64 encode bytes data
# result : bytes
base64_detection = base64.b64encode(buffered_detection.getvalue())
# Decode this bytes to text
# result : string (utf-8)
base64_detection = base64_detection.decode('utf-8')
base64_plate = base64_plate.decode('utf-8')
data = {
"cam_id": "10415",
"detecion_image": base64_detection,
}
Recieve Request
content = request.json
encoded_image = content['image']
decoded_image = base64.b64decode(encoded_image)
out_image = open('image_name', 'wb')
out_image.write(decoded_image)

How do you convert a PIL `Image` to a Django `File`?

I'm trying to convert an UploadedFile to a PIL Image object to thumbnail it, and then convert the PIL Image object that my thumbnail function returns back into a File object. How can I do this?
The way to do this without having to write back to the filesystem, and then bring the file back into memory via an open call, is to make use of StringIO and Django InMemoryUploadedFile. Here is a quick sample on how you might do this. This assumes that you already have a thumbnailed image named 'thumb':
import StringIO
from django.core.files.uploadedfile import InMemoryUploadedFile
# Create a file-like object to write thumb data (thumb data previously created
# using PIL, and stored in variable 'thumb')
thumb_io = StringIO.StringIO()
thumb.save(thumb_io, format='JPEG')
# Create a new Django file-like object to be used in models as ImageField using
# InMemoryUploadedFile. If you look at the source in Django, a
# SimpleUploadedFile is essentially instantiated similarly to what is shown here
thumb_file = InMemoryUploadedFile(thumb_io, None, 'foo.jpg', 'image/jpeg',
thumb_io.len, None)
# Once you have a Django file-like object, you may assign it to your ImageField
# and save.
...
Let me know if you need more clarification. I have this working in my project right now, uploading to S3 using django-storages. This took me the better part of a day to properly find the solution here.
I've had to do this in a few steps, imagejpeg() in php requires a similar process. Not to say theres no way to keep things in memory, but this method gives you a file reference to both the original image and thumb (usually a good idea in case you have to go back and change your thumb size).
save the file
open it from filesystem with PIL,
save to a temp directory with PIL,
then open as a Django file for this to work.
Model:
class YourModel(Model):
img = models.ImageField(upload_to='photos')
thumb = models.ImageField(upload_to='thumbs')
Usage:
#in upload code
uploaded = request.FILES['photo']
from django.core.files.base import ContentFile
file_content = ContentFile(uploaded.read())
new_file = YourModel()
#1 - get it into the DB and file system so we know the real path
new_file.img.save(str(new_file.id) + '.jpg', file_content)
new_file.save()
from PIL import Image
import os.path
#2, open it from the location django stuck it
thumb = Image.open(new_file.img.path)
thumb.thumbnail(100, 100)
#make tmp filename based on id of the model
filename = str(new_file.id)
#3. save the thumbnail to a temp dir
temp_image = open(os.path.join('/tmp',filename), 'w')
thumb.save(temp_image, 'JPEG')
#4. read the temp file back into a File
from django.core.files import File
thumb_data = open(os.path.join('/tmp',filename), 'r')
thumb_file = File(thumb_data)
new_file.thumb.save(str(new_file.id) + '.jpg', thumb_file)
This is actual working example for python 3.5 and django 1.10
in views.py:
from io import BytesIO
from django.core.files.base import ContentFile
from django.core.files.uploadedfile import InMemoryUploadedFile
def pill(image_io):
im = Image.open(image_io)
ltrb_border = (0, 0, 0, 10)
im_with_border = ImageOps.expand(im, border=ltrb_border, fill='white')
buffer = BytesIO()
im_with_border.save(fp=buffer, format='JPEG')
buff_val = buffer.getvalue()
return ContentFile(buff_val)
def save_img(request)
if request.POST:
new_record = AddNewRecordForm(request.POST, request.FILES)
pillow_image = pill(request.FILES['image'])
image_file = InMemoryUploadedFile(pillow_image, None, 'foo.jpg', 'image/jpeg', pillow_image.tell, None)
request.FILES['image'] = image_file # really need rewrite img in POST for success form validation
new_record.image = request.FILES['image']
new_record.save()
return redirect(...)
Putting together comments and updates for Python 3+
from io import BytesIO
from django.core.files.base import ContentFile
import requests
# Read a file in
r = request.get(image_url)
image = r.content
scr = Image.open(BytesIO(image))
# Perform an image operation like resize:
width, height = scr.size
new_width = 320
new_height = int(new_width * height / width)
img = scr.resize((new_width, new_height))
# Get the Django file object
thumb_io = BytesIO()
img.save(thumb_io, format='JPEG')
photo_smaller = ContentFile(thumb_io.getvalue())
To complete for those who, like me, want to couple it with Django's FileSystemStorage:
(What I do here is upload an image, resize it to 2 dimensions and save both files.
utils.py
def resize_and_save(file):
size = 1024, 1024
thumbnail_size = 300, 300
uploaded_file_url = getURLforFile(file, size, MEDIA_ROOT)
uploaded_thumbnail_url = getURLforFile(file, thumbnail_size, THUMBNAIL_ROOT)
return [uploaded_file_url, uploaded_thumbnail_url]
def getURLforFile(file, size, location):
img = Image.open(file)
img.thumbnail(size, Image.ANTIALIAS)
thumb_io = BytesIO()
img.save(thumb_io, format='JPEG')
thumb_file = InMemoryUploadedFile(thumb_io, None, file.name, 'image/jpeg', thumb_io.tell, None)
fs = FileSystemStorage(location=location)
filename = fs.save(file.name, thumb_file)
return fs.url(filename)
In views.py
if request.FILES:
fl, thumbnail = resize_and_save(request.FILES['avatar'])
#delete old profile picture before saving new one
try:
os.remove(BASE_DIR + user.userprofile.avatarURL)
except Exception as e:
pass
user.userprofile.avatarURL = fl
user.userprofile.thumbnailURL = thumbnail
user.userprofile.save()
Here is an app that can do that: django-smartfields
from django.db import models
from smartfields import fields
from smartfields.dependencies import FileDependency
from smartfields.processors import ImageProcessor
class ImageModel(models.Model):
image = fields.ImageField(dependencies=[
FileDependency(processor=ImageProcessor(
scale={'max_width': 150, 'max_height': 150}))
])
Make sure to pass keep_orphans=True to the field, if you want to keep old files, otherwise they are cleaned up upon replacement.
For those using django-storages/-redux to store the image file on S3, here's the path I took (the example below creates a thumbnail of an existing image):
from PIL import Image
import StringIO
from django.core.files.storage import default_storage
try:
# example 1: use a local file
image = Image.open('my_image.jpg')
# example 2: use a model's ImageField
image = Image.open(my_model_instance.image_field)
image.thumbnail((300, 200))
except IOError:
pass # handle exception
thumb_buffer = StringIO.StringIO()
image.save(thumb_buffer, format=image.format)
s3_thumb = default_storage.open('my_new_300x200_image.jpg', 'w')
s3_thumb.write(thumb_buffer.getvalue())
s3_thumb.close()

Categories

Resources