Extract text from PNG images using Python tesseract - python

Recently, I took a project. Converting a scanned PDF to searchable PDF/word using Python tesseract.
After a few attempts, I was able to convert the scanned PDF to PNG image files, but now I'm stuck. Could anyone please help me convert the PNG files to a searchable Word/PDF document? My code is attached below.
Please find the attached image for reference.
import os
import sys

from PIL import Image
import pytesseract
from pytesseract import image_to_string

lib_path = r'_______'  # site-package
pop_path = r'_______'  # poppler dlls
# Make the extra site-packages directory importable before pdf2image is loaded.
sys.path.insert(0, lib_path)

from pdf2image import convert_from_path

pdf_path = r'_______'  # path of the scanned PDF
img_path = r'_______'  # image output path

# Render every page of the PDF to a PIL image.
images = convert_from_path(pdf_path=pdf_path, dpi=500, poppler_path=pop_path)

for idx, pg in enumerate(images):
    # Include the page index in the file name; without it every page would be
    # written to the same file and only the last page would survive.
    pg.save(img_path + 'PDF_Page_' + str(idx) + '.png', "PNG")
    print('{} page converted'.format(str(idx)))
try:
    from PIL import Image
except ImportError:
    # Fall back to a top-level ``Image`` module when the PIL package
    # namespace is not available.
    import Image
import pytesseract


def ocr_core(images):
    """Return the text that Tesseract recognises in a single image.

    ``images`` is handed to ``Image.open`` and the opened image is passed to
    ``pytesseract.image_to_string``; the resulting string is returned.
    """
    text = pytesseract.image_to_string(Image.open(images))
    return text


print(ocr_core("image path/imagename"))
That's all I've written so far. I now get multiple ".PNG" images, but I am only able to convert one PNG image to text.
How to convert all the images and save it in CSV/word?

from PIL import Image
from pdf2image import convert_from_path
import pytesseract
import os
import sys

pdf_file_path = '_______'  # your file path
output = '_____'  # where you want to save and file name

# Render each PDF page and store it as image_1.jpg, image_2.jpg, ...
pages = convert_from_path(pdf_file_path, dpi=500)
image_names = []
for counter, page in enumerate(pages, start=1):
    idx = "image_" + str(counter) + ".jpg"  # or ".png"
    page.save(idx, 'JPEG')
    image_names.append(idx)

# OCR the saved pages in page order and write their text to the output file.
# Iterating over the recorded names guarantees each page is read exactly once.
with open(output, "w") as f:
    for idx in image_names:
        text = str(pytesseract.image_to_string(Image.open(idx)))
        f.write(text)

Related

PDF to Image and downloading it to a specific folder using Wand Python

I am trying to convert all the pages of a PDF to images and save them to a specific working directory.
The code is:
from wand.image import Image
from wand.image import Image as wi

# Open the PDF with resolution=300 and convert it to JPEG.
source_pdf = wi(filename="work.pdf", resolution=300)
jpeg_pages = source_pdf.convert("jpeg")

# Wrap every entry of the sequence in its own image and save it under a
# numbered file name (numbering starts at 1).
for page_number, frame in enumerate(jpeg_pages.sequence, start=1):
    single_page = wi(image=frame)
    single_page.save(filename=r"C:\Users\...\work" + str(page_number) + ".jpg")
As you can see, I am converting each page to JPG format and then trying to save the pages in that folder. But for some reason, it is not working.
If instead of the second last line, I try:
from wand.image import Image as wi

# Open the PDF with resolution=300 and convert it to JPEG.
source_pdf = wi(filename="work.pdf", resolution=300)
jpeg_pages = source_pdf.convert("jpeg")

# Save each entry of the sequence under a bare, numbered file name
# (numbering starts at 1); no directory is given.
for page_number, frame in enumerate(jpeg_pages.sequence, start=1):
    single_page = wi(image=frame)
    #page.save(filename=r"C:\Users\...\work" + str(i) + ".jpg")
    single_page.save(filename=str(page_number) + ".jpg")
then it saves successfully but in the folder C:\Users\Me.
How can I save them in the working directory?
Try this...
import os
from wand.image import Image as wi

with wi(filename="work.pdf", resolution=300) as pdf:
    pdf.scene = 1
    # The %02d placeholder is passed through in the file name; presumably the
    # writer expands it per page (work01.jpg, work02.jpg, ...) starting at the
    # scene number set above — confirm against the Wand documentation.
    pdf.save(filename=os.path.join(os.getcwd(), "work%02d.jpg"))
Wand should also support pathlib, or other classes that implement the __fspath__() interface.

I want to convert pdf to image using pdf2image in python on Mac OS X

I want to convert pdf to image using pdf2image in python on Mac OS X.
from pathlib import Path

from pdf2image import convert_from_path, convert_from_bytes
from pdf2image.exceptions import (
    PDFInfoNotInstalledError,
    PDFPageCountError,
    PDFSyntaxError
)

# define pdf path
pdf_path = Path(".")
# convert pdf to image(1200dpi)
images = convert_from_path(str(pdf_path), 1200)

# save image files one by one
image_dir = Path(".")
for i, page in enumerate(images):
    # Name each page after the PDF's stem plus a 1-based, zero-padded index.
    file_name = pdf_path.stem + "_{:02d}".format(i + 1) + ".jpeg"
    image_path = image_dir / file_name
    # save JPEG
    page.save(str(image_path), "JPEG")
and then I get empty files...
I cannot understand what is happening.
Any thoughts from anyone??
Hiro
The pdf2image library can be used to convert a PDF to images like this:
from pdf2image import convert_from_path

pages = convert_from_path('pdf_file', 500)  # where 500 is dpi

# Saving pages in jpeg format
for page_number, page in enumerate(pages, start=1):
    # Give every page its own file; a fixed name would be overwritten on each
    # iteration and only the last page would remain.
    page.save('out_{}.jpg'.format(page_number), 'JPEG')
To convert only the first page of the PDF and nothing else, see this example:
from pdf2image import convert_from_path

# single_file=True restricts the conversion to the first page, so a separate
# full-document conversion beforehand is unnecessary work.
pages = convert_from_path('file.pdf', 500, single_file=True)
pages[0].save('file.jpg', 'JPEG')

Unable to read image data when converting from PDF to Image

I am trying to convert the PDF to Image to proceed further with the Tesseract. It works when I convert using cmd:
magick convert a.pdf b.png
But doesn't work when I try to do the same using Python:
from wand.image import Image

# Open the PDF with Wand and write it back out as a PNG.
with Image(filename='a.pdf') as img:
    img.save(filename='sample.png')
The error I get is:
unable to read image data D:/Users/UserName/AppData/Local/Temp/magick-4908Cq41DDA5FxlX1 # error/pnm.c/ReadPNMImage/1346
I have also installed Ghostscript, but the error is still there.
EDIT:
I took the code provided in the reply below and modified it to read all the pages. The original issue is still there and the code below uses pdf2image:
from pdf2image import convert_from_path
import os

pdf_dir = "D:/Users/UserName/Desktop/scraping"
for pdf_file in os.listdir(pdf_dir):
    if not pdf_file.endswith(".pdf"):
        continue
    # os.listdir returns bare file names, so join them with the directory;
    # otherwise the PDF is only found when the script runs inside pdf_dir.
    pages = convert_from_path(os.path.join(pdf_dir, pdf_file), 300)
    pdf_name = pdf_file[:-4]  # strip the ".pdf" extension
    # enumerate yields each page's position directly (0-based, as before)
    # instead of searching the list for the page on every iteration.
    for page_index, page in enumerate(pages):
        page.save("%s-page%d.jpg" % (pdf_name, page_index), "JPEG")
Instead of using wand.image, you can use pdf2image. Install it like this:
pip install pdf2image
Here is a code that loops through every page in the PDF, finally converting them to JPEG:
import os
import tempfile
from pdf2image import convert_from_path

filename = 'target.pdf'
save_dir = 'dir'
base_name = os.path.splitext(os.path.basename(filename))[0]

# Make sure the destination exists before any page is written.
os.makedirs(save_dir, exist_ok=True)

with tempfile.TemporaryDirectory() as path:
    # No first_page/last_page limits: convert every page of the PDF.
    images_from_path = convert_from_path(filename, output_folder=path)
    # Save inside the ``with`` block: the pages are backed by files in the
    # temporary folder (presumably loaded lazily — confirm in pdf2image docs),
    # and that folder is deleted when the block exits.
    for page_number, page in enumerate(images_from_path, start=1):
        # A per-page suffix keeps later pages from overwriting earlier ones.
        page_name = '{}_{}.jpg'.format(base_name, page_number)
        page.save(os.path.join(save_dir, page_name), 'JPEG')

PIL isn't able to show images

I've been trying to show an image with PIL, but I don't know why the image viewer keeps giving me this: "Windows Photo Viewer can't open this picture because either the picture is deleted, or it's in a location that isn't available."
I've checked the pic is on my desktop and doesn't have any problem.
Here is my code:
from PIL import Image

# Open the picture by its relative file name, then hand it to the
# default image viewer.
picture = Image.open('photo_2016-08-04_22-38-11.jpg')
picture.show()
Could any one help me with this?
Try this little snippet:
from PIL import Image
import os

# Build an absolute path to the picture on the current user's Desktop so the
# result does not depend on the working directory.
home_path = os.path.expanduser('~')
filename = os.path.join(home_path, 'Desktop', 'photo_2016-08-04_22-38-11.jpg')

if os.path.exists(filename):
    print("Opening filename {0}".format(filename))
    img = Image.open(filename)
    img.show()
else:
    print("Filename {0} doesn't exist".format(filename))

Converting a remote PDF's pages to temporary images for OCR

I have a remote PDF file that I need to read page by page and keep passing each to an OCR which will give me its OCR text.
import pytesseract
from pyPdf import PdfFileWriter, PdfFileReader
import cStringIO
from wand.image import Image
import urllib2
import tempfile
# Alias PIL's Image so it does not shadow wand's Image imported above.
from PIL import Image as PILImage

# Read the whole PDF into memory and wrap it in a file-like buffer.
remoteFile = urllib2.urlopen(urllib2.Request("file:///home/user/Documents/TestDocs/test.pdf")).read()
memoryFile = cStringIO.StringIO(remoteFile)
try:
    pdfFile = PdfFileReader(memoryFile)
    for pageNum in xrange(pdfFile.getNumPages()):
        currentPage = pdfFile.getPage(pageNum)
        ## somehow convert currentPage to wand type
        ## image and then pass to tesseract-api
        ##
        ## TEMP_IMAGE = some conversion to temp file
        ## pytesseract.image_to_string(PILImage.open(TEMP_IMAGE))
finally:
    # Release the in-memory buffer even if reading a page fails.
    memoryFile.close()
I thought of using cStringIO or tempfile but I cannot figure out how to use them for this purpose.
How can I solve this issue?
There are a couple of options for doing this. The more compatible way, given the code you supplied, is to store the images temporarily in that directory and then delete them after reading the text using pytesseract. I create a wand type image to extract each image from the PDF individually, then convert it to a PIL type image for pytesseract. Here's the code I used for this, with the detected text being written to an array 'text' where each element is an image in the original PDF. I also updated some of your imports to make it compatible with Python 3 (cStringIO->io and urllib2->urllib.request).
import PyPDF2
import os
import tempfile
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io

pdf_url = 'file:///home/user/Documents/TestDocs/test.pdf'

# Read the PDF once to find out how many pages it has.
with urllib.request.urlopen(pdf_url) as response:
    pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

text = []  # one OCR result per PDF page, in page order
with tempfile.TemporaryDirectory() as temp_dir:
    # Single scratch file that is rewritten for every page.
    tempFile_Location = os.path.join(temp_dir, 'page.png')
    for p in range(pdf_im.getNumPages()):
        # The "[p]" suffix selects page p of the PDF.
        with Image(filename=pdf_url + '[' + str(p) + ']') as img:
            with Image(image=img) as converted:  # Need second with to convert SingleImage object from wand to Image
                converted.save(filename=tempFile_Location)
        # Close the PIL handle before deleting the file; an open handle can
        # block removal on some platforms.
        with PILImage.open(tempFile_Location) as page_image:
            text.append(pytesseract.image_to_string(page_image))
        os.remove(tempFile_Location)
Alternatively, if you want to avoid creating and deleting a temporary file for each image you can use numpy and OpenCV to extract the image as a blob, convert it to a numpy array and then turn it into a PIL image for pytesseract to perform OCR on (reference)
import PyPDF2
import os
import pytesseract
from wand.image import Image
from PIL import Image as PILImage
import urllib.request
import io
import numpy as np
import cv2

pdf_url = 'file:///home/user/Documents/TestDocs/test.pdf'

# Read the PDF once to find out how many pages it has.
with urllib.request.urlopen(pdf_url) as response:
    pdf_read = response.read()
pdf_im = PyPDF2.PdfFileReader(io.BytesIO(pdf_read))

text = []  # one OCR result per PDF page, in page order
for p in range(pdf_im.getNumPages()):
    # The "[p]" suffix selects page p of the PDF.
    with Image(filename=pdf_url + '[' + str(p) + ']') as img:
        # Request an explicit raster format: without one the blob presumably
        # keeps the source (PDF) format, which cv2.imdecode cannot decode.
        # NOTE(review): confirm against Wand's make_blob documentation.
        img_buffer = np.asarray(bytearray(img.make_blob('png')), dtype=np.uint8)
    # Decode the PNG bytes into a grayscale array, then wrap it as a PIL image.
    retval = cv2.imdecode(img_buffer, cv2.IMREAD_GRAYSCALE)
    text.append(pytesseract.image_to_string(PILImage.fromarray(retval)))

Categories

Resources