OCR all .tif images in directory

OCR all .tif images in directory - python

I have python code which does OCR for one tiff file and prints result in python window. I have more number of tiff files in a directory, it will take more hours to OCR all images one by one using my code.
Since I'm a beginner, I'm getting error while adding 'for' loop in code
import cv2
import numpy as np
import pytesseract
from PIL import Image
# Path of working folder on Disk
src_path = "D:/OpenCV and Tesseract/Image Split/Test 1/"
def get_string(img_path):
# Read image with opencv
img = cv2.imread(img_path)
# Convert to gray
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply dilation and erosion to remove some noise
kernel = np.ones((1, 1), np.uint8)
img = cv2.dilate(img, kernel, iterations=1)
img = cv2.erode(img, kernel, iterations=1)
# Write image after removed noise
cv2.imwrite(src_path + "removed_noise.png", img)
# Apply threshold to get image with only black and white
#img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
# Write the image after apply opencv to do some ...
cv2.imwrite(src_path + "thres.png", img)
# Recognize text with tesseract for python
result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
# Remove template file
#os.remove(temp)
return result
print '--- Start recognize text from image ---'
print get_string(src_path + "21.tif")
print "------ Done -------"
Someone help me to modify the python code to do OCR for all images in a directory and store all text in a single .txt file line by line

Related

Image enhancement for pytesseract 7seg detection

I'm in the middle of developing a system that predict numbers from 7Seg LCD and I'm using for the matter tesseract OCR engine and it's wrapper for python pytesseract.
I'm taking pictures with a camera then cropping the Region of Interest and I found out that I have to enhance my Image quality to increase the accuracy of the OCR engine.
I used some Image processing techniques (gray scale --> Gaussian Blur --> threshold) and I got a quiet good image but tesseract still can't detect the numbers in the image.
I use the code:
image = cv2.imread('test.jpg')
image = image[50:200, 300:540]
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
image = cv2.GaussianBlur(image, (3,3), 0)
_, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
cv2.imshow('result', image)
cv2.waitKey()
cv2.destroyAllWindows()
cv2.imwrite('enhanced.jpg', image)
tess_dir_config = r'--tessdata-dir "C:\Program Files\Tesseract-OCR\tessdata"'
text = image_to_string(image, lang='letsgodigital', config=tess_dir_config)
print(text)
The Output Image:
The Input Image:
The engine usually have an empty output and if not it will not detect the number correctly.
Is there some sort of other image processing that I can use to get the potential of the Engine.
Note: I'am using letsgodigital weights

This works for me, if I improve the crop a little, and use page segmentation mode 7. (This mode does no page segmentation and assumes a single line of text.)
import cv2
import matplotlib.pyplot as plt
import pytesseract
image = cv2.imread('seven_seg_disp.jpg')
# Strip off top of meter and little percent symbol.
image = image[90:200, 300:520]
# plt.imshow(image)
image = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
image = cv2.GaussianBlur(image, (3,3), 0)
_, image = cv2.threshold(image, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)
# plt.imshow(image)
tess_dir_config = r'--tessdata-dir "../.tesseract" --psm 7'
text = pytesseract.image_to_string(image, lang='letsgodigital', config=tess_dir_config)
text = text.strip()
print(text) # prints 75
Note: I changed the value of tessdata-dir because it's in a different place on my computer.

How to improve Tesseract accuracy

I am trying to run OCR on set of images that are similar but can vary in size. For some reason I cannot get a predictable result. Is there anything I can do do get better results.
Tesseract with or without cv2 preprocessing works beautifully on some images and fails on some and there is no pattern. Images are more or less similar.
Upper image represents processed image
def filter_img(img):
# Read pil image as cv2
img = np.array(img)
img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
# Converting image to grayscale (important for applying threshold)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
#Apply dilation and erosion to remove some noise
kernel = np.ones((1, 1), np.uint8)
# img = cv2.dilate(img, kernel, iterations=1)
img = cv2.erode(img, kernel, iterations=1)
# Apply blur to smooth out the edges
img = cv2.GaussianBlur(img, (5, 5), 0)
# img = cv.medianBlur(img,5)
# Apply threshold to get image with only b&w (binarization)
img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
img = Image.fromarray(img)
img = ImageOps.expand(img,border=2,fill='black')
visualize.show_labeled_image(img,boxes)
return img
# Applying Tesseract OCR
def run_tesseract(img):
# Tesseract cmd setup
# pytesseract.pytesseract.tesseract_cmd = "tesseract"
whitelist = string.ascii_uppercase + string.digits + ".-"
parameters = '-c load_freq_dawg=0 -c tessedit_char_whitelist="{}"'.format(whitelist)
psm = 8
custom_oem_psm_config = "--dpi 300 --oem 3 --psm {psm} {parameters}".format(parameters=parameters, psm=psm)
try:
text = pytesseract.image_to_string(img, config=custom_oem_psm_config, timeout=2)
return text.strip()
except RuntimeError:
print ("TIMEOUT")
return ""

If your image format is highly consistent, you might consider using split images. And after ocr the image, use conditional judgments on the first letter or number for error-prone areas, such as 0 and O are confusing. Of course, all of the above is only valid if the image is highly consistent.
enter code here
import cv2
import numpy as np
import pytesseract
import matplotlib.pyplot as plt
pytesseract.pytesseract.tesseract_cmd = 'D://Program Files/Tesseract-
OCR/tesseract.exe'
img = cv2.imread('vATKQ.png')
img2 = img[100:250, 180:650] #split to region you want
plt.imshow(img2)
text=pytesseract.image_to_string(img2)
print(text)

Extracting particular text associated value from an image

I have an image, and from the image I want to extract key and value pair details.
As an example, I want to extract the value of "MASTER-AIRWAYBILL NO:"
I have written to extract the entire text from the image using python opencv and OCR, but I don't have any clue how to extract only the value for "MASTER-AIRWAYBILL NO:" from the entire result text of the image.
Please find the code:
import cv2
import numpy as np
import pytesseract
from PIL import Image
print ("Hello")
src_path = "C:\\Users\Venkatraman.R\Desktop\\alpha_bill.jpg"
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files (x86)\Tesseract-OCR\tesseract.exe"
print (src_path)
# Read image with opencv
img = cv2.imread(src_path)
# Convert to gray
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Apply dilation and erosion to remove some noise
kernel = np.ones((1, 1), np.uint8)
img = cv2.dilate(img, kernel, iterations=1)
img = cv2.erode(img, kernel, iterations=1)
# Write image after removed noise
cv2.imwrite(src_path + "removed_noise.png", img)
# Apply threshold to get image with only black and white
#img = cv2.adaptiveThreshold(img, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
# Write the image after apply opencv to do some ...
cv2.imwrite(src_path + "thres.png", img)
# Recognize text with tesseract for python
result = pytesseract.image_to_string(Image.open(src_path + "thres.png"))
# Remove template file
#os.remove(temp)
print ('--- Start recognize text from image ---')
print (result)
So output should be like:
MASTER-AIRWAYBILL NO: 157-46637194

You can use pytesseract image_to_string() and a regex to extract the desired text, i.e.:
from PIL import Image
import pytesseract, re
f = "ocr.jpg"
t = pytesseract.image_to_string(Image.open(f))
m = re.findall(r"MASTER-AIRWAYBILL NO: [\d—-]+", t)
if m:
print(m[0])
Output:
MASTER-AIRWAYBILL NO: 157—46637194

I m using python 2.7 and i m also want to finde vendor name from the image
how should i find?
m = re.findall(r"MASTER-AIRWAYBILL NO: [\d—-]+", t)
for the above line its showing error
and if i use m=re.findall(r'Vendor Name:[\d--]+', t) then also its showing error

You can try this after installing tesseract.
from PIL import Image
import pytesseract, re
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
t = pytesseract.image_to_string(Image.open("path"))
m = re.findall(r"Invoice No. [\d—-]+", t)
if m:
print(m[0])

Reading low resolution image with pytesseract

I'm trying to read off some stats off the cropped (manually) sections of tables in pdf files.
Here is the image I'm trying to process
The current result I get has most of the numbers but not all of the text, as seen below:
Hmuwinu'fg. cm’: -009,d1-I (F -o.761.l= .om,
Tamar wuall ma: 2 1.41(F-o.167
Tao! hr aubgrwp dimes: Nol wvwe
I've tried using interpolations other than inter-cubic during the resizing step, and played around changing the kernel size but 1x1 seems to work the best.
Here is the current code:
# import the packages
from PIL import Image
import pytesseract
import numpy as np
import argparse
import cv2
import os
# construct the argument parse and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,help="path to input image to OCR'd")
ap.add_argument("-p","--preprocess",type=str,default="thresh",help="type of preprocessing to be done")
args = vars(ap.parse_args())
#load the example image
image = cv2.imread(args["image"])
# Rescale image
image = cv2.resize(image,None,fx=1.5,fy=1.5,interpolation=cv2.INTER_CUBIC)
#Apply dilation and erosion to remove some noise
kernel = np.ones((1,1),np.uint8)
image = cv2.dilate(image,kernel,iterations=1)
image = cv2.erode(image,kernel,iterations=1)
#Convert it to grayscale
gray = cv2.cvtColor(image,cv2.COLOR_BGR2GRAY)
# check to see if we should apply thresholding to process image
if args["preprocess"] == "thresh":
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be applied
elif args["preprocess"] == "blur":
gray = cv2.medianBlur(gray,3)
#write the gray scale image to a disk as a temp file so we can OCR it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename,gray)
#load the image as a PIL/pillow image, apploy OCR, then delete temp file
text = pytesseract.image_to_string(Image.open(filename))
os.remove(filename)
print(text)
# show the output images
cv2.imshow("Image",image)
cv2.imshow("Output",gray)
cv2.waitKey(0)
Any suggestions or methods are really appreciated.

I applied adaptive-threshold + bitwise-not operations and result is:
Now, when I read:
txt = pytesseract.image_to_string(bnt, config="--psm 6")
print(txt)
Result:
Hewrogenedty: Chit «0.09, die 1 (P = 0,78); If 0.0%
Teal for overall ettect: Z = 1.41 (P = 0.16)
Test tor subgroup ditlrenote: Not appliaalle
Not prefect but at least numbers are correct (If I'm not mistaken)
Code:
import cv2
import pytesseract
img = cv2.imread("Q8iIo.png")
img = cv2.resize(img, None, fx=2.5, fy=2.5,
interpolation=cv2.INTER_CUBIC)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
thr = cv2.adaptiveThreshold(gry, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY_INV, 25, 28)
bnt = cv2.bitwise_not(thr)
txt = pytesseract.image_to_string(bnt, config="--psm 6")
print(txt)

PIL - TypeError: src is not a numpy array, neither a scalar

from PIL import Image
import pytesseract
import argparse
import cv2
import os
image = Image.open("C:/Users/NB/Desktop/Scan/Arti818.jpg")
#image = "C:/Users/NB/Desktop/Scan/Arti818.jpg"
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# check to see if we should apply thresholding to preprocess the
# image
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be done to remove
# noise
# write the grayscale image to disk as a temporary file so we can
# apply OCR to it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
# load the image as a PIL/Pillow image, apply OCR, and then delete
# the temporary file
text = pytesseract.image_to_string(Image.open(filename))
os.remove(filename)
print(text)
# show the output images
cv2.imshow("Image", image)
cv2.imshow("Output", gray)
cv2.waitKey(0)
This is my code and I am getting following error:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
TypeError: src is not a numpy array, neither a scalar

Read the docs. It clearly says:
PIL.Image.open(fp, mode='r')
Opens and identifies the given image file.
Returns: An Image object.
The object returned is of Image type, not a numpy.ndarray. If you want an array, convert image to one:
gray = cv2.cvtColor(np.asarray(image), cv2.COLOR_BGR2GRAY)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

OCR all .tif images in directory - python

Related

Image enhancement for pytesseract 7seg detection

How to improve Tesseract accuracy

Extracting particular text associated value from an image

Reading low resolution image with pytesseract

PIL - TypeError: src is not a numpy array, neither a scalar

Categories

Resources