Output image data using PIL pillow - python

Can someone explain to me why these two snippets produce two different outputs?
a.
import urllib
# Python 2 only: urllib.urlopen() does not exist in Python 3 (it moved to urllib.request.urlopen()).
txt = urllib.urlopen("image.jpeg").read()
# Print whatever was read from the file, unmodified.
print(txt)
b.
from PIL import Image
import requests
from io import BytesIO
import io
img = Image.open('image.jpeg')
# img.save("./test.jpeg", "JPEG")  # <- this saves a correct file
with io.BytesIO() as output:
    # save() re-encodes the decoded image as a new JPEG stream, so the bytes
    # written here are not expected to match the bytes of the original file.
    img.save(output, format="JPEG")
    # Read the buffer before the with block closes it.
    contents = output.getvalue()
print(contents)  # <- this prints something totally different ???

Related

PILLOW throws `OSError: cannot identify image file <_io.BytesIO object at 0x08B3B060>`

I'm trying to extract text from CAPTCHA pictures. The idea is to use lxml to get the image data from the form. The image data is prepended with a header that defines the data type. I'm guessing the CAPTCHA picture is a PNG image encoded in Base64. The image data is decoded from Base64 into its original binary form, and the binary data is then wrapped in BytesIO before it is passed to the PIL.Image class.
Here is the snippet's first section.
import lxml.html
import urllib.request as urllib2
import pprint
import http.cookiejar as cookielib
from io import BytesIO
import lxml.html
from PIL import Image
import pytesseract
def parse_form(html):
    """Collect the named ``<input>`` fields found inside forms in *html*.

    Args:
        html: The page markup, as accepted by ``lxml.html.fromstring``.

    Returns:
        A dict mapping each input's ``name`` attribute to its ``value``
        attribute. Inputs without a (non-empty) ``name`` are skipped.
    """
    tree = lxml.html.fromstring(html)
    data = {}
    for e in tree.cssselect('form input'):
        # Look the name up once; unnamed inputs are not submitted with the form.
        name = e.get('name')
        if name:
            data[name] = e.get('value')
    return data
REGISTER_URL = 'http://tracuunnt.gdt.gov.vn/tcnnt/mstdn.jsp'
# Route requests through an opener that stores cookies in this jar.
cj = cookielib.CookieJar()
opener = urllib2.build_opener(urllib2.HTTPCookieProcessor(cj))
# Download the page, then pull the form's named input fields out of it.
html = opener.open(REGISTER_URL).read()
form = parse_form(html)
Here, this function raises OSError: cannot identify image file <_io.BytesIO object at 0x08B3B060>:
def get_captcha(html):
    """Extract the image embedded in *html* and open it with PIL.

    Reads the ``src`` attribute of the first ``div img`` element, keeps the
    part after the first ``-``, decodes it as base64 and passes the result to
    ``Image.open`` wrapped in a ``BytesIO``.

    Args:
        html: The page markup, as accepted by ``lxml.html.fromstring``.

    Returns:
        The object returned by ``Image.open``.
    """
    tree = lxml.html.fromstring(html)
    img_data = tree.cssselect('div img')[0].get('src')
    # Drop everything up to and including the first '-' of the src value.
    img_data = img_data.partition('-')[-1]
    # NOTE(review): .decode('base64') is the Python 2 spelling and does not
    # work on Python 3 (use base64.b64decode there). It is left as posted
    # because this line is what the question is about.
    binary_img_data = img_data.decode('base64')
    # Image.open needs a file-like object, so wrap the raw bytes.
    file_like = BytesIO(binary_img_data)
    img = Image.open(file_like)
    return img
img = get_captcha(html)
I suspect the problem is the binary_img_data variable. I've tried to read up on decoding, encoding, the PIL docs, and binary data to find out how PIL can read a web-based image such as a CAPTCHA, but found nothing helpful.
To decode the base64 string, try the following:
from base64 import b64decode
binary_img_data = b64decode(img_data)
The method your code uses (img_data.decode('base64')) was valid in Python 2, but will not work in Python 3.
Totally overlooked the solution at the beginning. Pillow couldn't read the image from the binary data produced by that logic, so I simply fetched the image with requests.get(), took the response's content (the image in binary form), and had Pillow open it on the fly through BytesIO().
import lxml.html
import urllib.request as urllib2
from io import BytesIO
import lxml.html
from PIL import Image
# NOTE(review): `tree` is not defined in this snippet — presumably the parsed page (lxml.html.fromstring(html)); confirm.
img_data = tree.cssselect('div img')[0].get('src')
# Prefix the src value with the site's host to build the image URL.
img_link = 'http://tracuunnt.gdt.gov.vn'+ img_data
# NOTE(review): `requests` is used here but is not among the imports listed for this snippet.
response = requests.get(img_link)
# Wrap the downloaded bytes in BytesIO so Image.open receives a file-like object.
img = Image.open(BytesIO(response.content))

Storing image from StringIO to a file creates a distorted image

I stored an image to StringIO from PIL. When I store it to a file from stringIO, it doesn't produce the original image.
Code:
from PIL import Image
from cStringIO import StringIO
# Save the image into an in-memory buffer, then copy that buffer to a file.
buff=StringIO()
img = Image.open("test.jpg")
img.save(buff,format='JPEG')
#img=img.crop((1,1,100,100))
# Rewind so the following read() starts at the beginning of the buffer.
buff.seek(0)
#Produces a distorted image
# NOTE: the file is opened in text mode ("w") here; this is the code as
# posted in the question and is the behaviour being asked about.
with open("vv.jpg", "w") as handle:
    handle.write(buff.read())
Original Image is below
Output image is below
What is wrong with the above code?
You need to use BytesIO and not StringIO.
Also the destination file has to be opened in binary mode using "wb"
Here is code that works (cStringIO is replaced with io)
from PIL import Image
from io import BytesIO
# Save the image into an in-memory bytes buffer, then copy it to a file.
buff = BytesIO()
img = Image.open('test.jpg')
img.save(buff, format='JPEG')
# img = img.crop((1, 1, 100, 100))
# Rewind so the following read() starts at the beginning of the buffer.
buff.seek(0)
# Binary mode ("wb") writes the JPEG bytes unchanged, so the saved image is
# no longer distorted.
with open('vv.jpg', "wb") as handle:
    handle.write(buff.read())

Converting a remote PDF's pages to temporary images for OCR

I have a remote PDF file that I need to read page by page and keep passing each to an OCR which will give me its OCR text.
import pytesseract
from pyPdf import PdfFileWriter, PdfFileReader
import cStringIO
from wand.image import Image
import urllib2
import tempfile
import pytesseract
from PIL import Image
# Read the whole PDF into memory and hand it to pyPdf through a file-like buffer.
remoteFile = urllib2.urlopen(urllib2.Request("file:///home/user/Documents/TestDocs/test.pdf")).read()
memoryFile = cStringIO.StringIO(remoteFile)
pdfFile = PdfFileReader(memoryFile)
# Walk the document one page at a time.
for pageNum in xrange(pdfFile.getNumPages()):
    currentPage = pdfFile.getPage(pageNum)
    ## somehow convert currentPage to wand type
    ## image and then pass to tesseract-api
    ##
    ## TEMP_IMAGE = some conversion to temp file
    ## pytesseract.image_to_string(Image.open(TEMP_IMAGE))
memoryFile.close()
I thought of using cStringIO or tempfile but I cannot figure out how to use them for this purpose.
How can I solve this issue?
There are a couple of options for doing this. The more compatible way, given the code you supplied, is to store the images temporarily in that directory and then delete them after reading the text using pytesseract. I create a wand-type image to extract each image from the PDF individually, then convert it to a PIL-type image for pytesseract. Here's the code I used for this, with the detected text being written to an array 'text' where each element corresponds to an image in the original PDF. I also updated some of your imports to make it compatible with Python 3 (cStringIO -> io and urllib2 -> urllib.request).
import PyPDF2
import os
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io
# Path of the scratch image each page is rendered to before OCR.
# NOTE(review): the snippet as posted never defined this name; any writable
# image path works — adjust to taste.
tempFile_Location = 'temp_page.png'

# Read the PDF once so PyPDF2 can report how many pages it has.
with urllib.request.urlopen('file:///home/user/Documents/TestDocs/test.pdf') as response:
    pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

# One OCR result per page, in page order.
text = []
for p in range(pdf_im.getNumPages()):
    # The "[p]" suffix selects a single page of the PDF.
    with Image(filename='file:///home/user/Documents/TestDocs/test.pdf' + '[' + str(p) + ']') as img:
        with Image(image = img) as converted: #Need second with to convert SingleImage object from wand to Image
            converted.save(filename=tempFile_Location)
        text.append(pytesseract.image_to_string(PILImage.open(tempFile_Location)))
        # Remove the scratch file so nothing is left behind after the loop.
        os.remove(tempFile_Location)
Alternatively, if you want to avoid creating and deleting a temporary file for each image you can use numpy and OpenCV to extract the image as a blob, convert it to a numpy array and then turn it into a PIL image for pytesseract to perform OCR on (reference)
import PyPDF2
import os
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io
import numpy as np
import cv2
# Read the PDF once so PyPDF2 can report how many pages it has.
with urllib.request.urlopen('file:///home/user/Documents/TestDocs/test.pdf') as response:
    pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

# One OCR result per page, in page order.
text = []
for p in range(pdf_im.getNumPages()):
    # The "[p]" suffix selects a single page of the PDF.
    with Image(filename=('file:///home/user/Documents/TestDocs/test.pdf') + '[' + str(p) + ']') as img:
        # NOTE(review): make_blob() is called without a format argument;
        # confirm the blob is in an image format cv2.imdecode can read.
        img_buffer=np.asarray(bytearray(img.make_blob()), dtype=np.uint8)
        retval = cv2.imdecode(img_buffer, cv2.IMREAD_GRAYSCALE)
        # Hand the decoded array to pytesseract as a PIL image; no temp file needed.
        text.append(pytesseract.image_to_string(PILImage.fromarray(retval)))

python Image PIL to binary Hex

from PIL import Image
from PIL import ImageDraw
from PIL import ImageFont
import urllib.request
import io
import binascii
# Download hex-encoded image data and turn it back into raw bytes.
data = urllib.request.urlopen('http://pastebin.ca/raw/2311595').read()
r_data = binascii.unhexlify(data)
stream = io.BytesIO(r_data)
img = Image.open(stream)
# Draw a line of green text onto the image.
draw = ImageDraw.Draw(img)
font = ImageFont.truetype("arial.ttf",14)
draw.text((0, 220),"This is a test11",(0,255,0),font=font)
draw = ImageDraw.Draw(img)
# NOTE: img is a PIL image object, not a file path, so open() rejects it;
# this is the failing line the question asks about and is kept as posted.
with open(img,'rb') as in_file: #error on here invalid file:
    hex_data = in_file.read()
# Unhexlify the data.
bin_data = binascii.unhexlify(bytes(hex_data))
print(bin_data)
Question
I am converting hex to an image and drawing text on the image; after that I want to convert the image back to binary hex, but I have a problem at the line with open(img,'rb') as in_file:. How do I convert img to hex?
The img object needs to be saved again; write it to another BytesIO object:
output = io.BytesIO()
img.save(output, format='JPEG')
then get the written data with the .getvalue() method:
hex_data = output.getvalue()
The PIL-for-python-3 landscape is rather muddled at the moment. The Pillow fork looks to be the best, maintained version out there at the moment. It includes fixes that make saving to a BytesIO object work. If you run into a io.UnsupportedOperation: fileno exception using the above code, you have a version that was not yet fixed, in which case you'll have to resort to using a temporary file instead.

How do I read image data from a URL?

What I'm trying to do is fairly simple when we're dealing with a local file, but the problem comes when I try to do this with a remote URL.
Basically, I'm trying to create a PIL image object from a file pulled from a URL. Sure, I could always just fetch the URL and store it in a temp file, then open it into an image object, but that feels very inefficient.
Here's what I have:
Image.open(urlopen(url))
It flakes out complaining that seek() isn't available, so then I tried this:
Image.open(urlopen(url).read())
But that didn't work either. Is there a Better Way to do this, or is writing to a temporary file the accepted way of doing this sort of thing?
In Python3 the StringIO and cStringIO modules are gone.
In Python3 you should use:
from PIL import Image
import requests
from io import BytesIO
response = requests.get(url)
img = Image.open(BytesIO(response.content))
Using a StringIO
import urllib, cStringIO
file = cStringIO.StringIO(urllib.urlopen(URL).read())
img = Image.open(file)
The following works for Python 3:
from PIL import Image
import requests
im = Image.open(requests.get(url, stream=True).raw)
References:
https://github.com/python-pillow/Pillow/pull/1151
https://github.com/python-pillow/Pillow/blob/master/CHANGES.rst#280-2015-04-01
Using requests:
from PIL import Image
import requests
from StringIO import StringIO
response = requests.get(url)
img = Image.open(StringIO(response.content))
Python 3
from urllib.request import urlopen
from PIL import Image
img = Image.open(urlopen(url))
img
Jupyter Notebook and IPython
import IPython
url = 'https://newevolutiondesigns.com/images/freebies/colorful-background-14.jpg'
IPython.display.Image(url, width = 250)
Unlike other methods, this method also works in a for loop!
Use StringIO to turn the read string into a file-like object:
from StringIO import StringIO
from PIL import Image
import urllib
# NOTE(review): this mixes the Python 2 `StringIO` module with the Python 3 `urllib.request` API;
# on Python 3 the equivalent needs io.BytesIO and `import urllib.request`.
Image.open(StringIO(urllib.request.urlopen(url).read()))
For those doing some sklearn/numpy post processing (i.e. Deep learning) you can wrap the PIL object with np.array(). This might save you from having to Google it like I did:
from PIL import Image
import requests
import numpy as np
from StringIO import StringIO
response = requests.get(url)
img = np.array(Image.open(StringIO(response.content)))
The arguably recommended way to do image input/output these days is to use the dedicated package ImageIO. Image data can be read directly from a URL with one simple line of code:
from imageio import imread
image = imread('https://cdn.sstatic.net/Sites/stackoverflow/img/logo.png')
Many answers on this page predate the release of that package and therefore do not mention it. ImageIO started out as a component of the Scikit-Image toolkit. It supports a number of scientific formats on top of the ones provided by the popular image-processing library Pillow. It wraps it all in a clean API solely focused on image input/output. In fact, SciPy removed its own image reader/writer in favor of ImageIO.
select the image in chrome, right click on it, click on Copy image address, paste it into a str variable (my_url) to read the image:
import shutil
import requests
my_url = 'https://www.washingtonian.com/wp-content/uploads/2017/06/6-30-17-goat-yoga-congressional-cemetery-1-994x559.jpg'
# stream=True defers the body download so it can be copied from response.raw.
response = requests.get(my_url, stream=True)
# Copy the response body straight into the file, in binary mode.
with open('my_image.png', 'wb') as file:
    shutil.copyfileobj(response.raw, file)
del response
open it;
from PIL import Image
img = Image.open('my_image.png')
img.show()
Manually wrapping in BytesIO is no longer needed since PIL >= 2.8.0. Just use Image.open(response.raw)
Adding on top of Vinícius's comment:
You should pass stream=True as noted https://requests.readthedocs.io/en/master/user/quickstart/#raw-response-content
So
img = Image.open(requests.get(url, stream=True).raw)
USE urllib.request.urlretrieve() AND PIL.Image.open() TO DOWNLOAD AND READ IMAGE DATA :
import requests
import urllib.request
# `import PIL` alone does not load the Image submodule; import it explicitly
# so that PIL.Image.open is guaranteed to be available.
import PIL.Image
# Download the image to a local file, then open that file with PIL.
urllib.request.urlretrieve("https://i.imgur.com/ExdKOOz.png", "sample.png")
img = PIL.Image.open("sample.png")
img.show()
Or call requests.get(url) with url as the address of the file to download via a GET request. Call io.BytesIO(obj) with obj as the content of the response to wrap the raw bytes in an in-memory file-like object. To load the image data, call PIL.Image.open(bytes_obj) with bytes_obj as that BytesIO object:
import io
response = requests.get("https://i.imgur.com/ExdKOOz.png")
image_bytes = io.BytesIO(response.content)
img = PIL.Image.open(image_bytes)
img.show()
from PIL import Image
import cv2
import numpy as np
import requests
image=Image.open(requests.get("https://previews.123rf.com/images/darrenwhi/darrenwhi1310/darrenwhi131000024/24022179-photo-of-many-cars-with-one-a-different-color.jpg", stream=True).raw)
#image =resize((420,250))
image_array=np.array(image)
image
To directly get image as numpy array without using PIL
import requests, io
import matplotlib.pyplot as plt
response = requests.get(url).content
img = plt.imread(io.BytesIO(response), format='JPG')
plt.imshow(img)

Categories

Resources