How to improve Tesseract accuracy - python

I am trying to run OCR on a set of images that are similar but can vary in size. For some reason I cannot get a predictable result. Is there anything I can do to get better results?
Tesseract with or without cv2 preprocessing works beautifully on some images and fails on some and there is no pattern. Images are more or less similar.
Upper image represents processed image
def filter_img(img):
    """Preprocess a PIL image for Tesseract and return it as a PIL image.

    Pipeline: upscale 2x (bicubic) -> grayscale -> erode -> 5x5 Gaussian
    blur -> Otsu binarisation -> 2 px black border.

    Args:
        img: PIL image. After ``np.array`` it may be a 2-D (already gray)
            array or a 3-D RGB / RGBA array.

    Returns:
        The binarised image as a PIL image with a 2 px black border added
        on every side.
    """
    # PIL -> NumPy. PIL keeps colour channels in RGB(A) order, not BGR.
    img = np.array(img)
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    # Converting image to grayscale (important for applying threshold).
    # The RGB conversion codes are used so the luminance weights are applied
    # to the right channels; a single-channel array is left untouched.
    if img.ndim == 3:
        to_gray = cv2.COLOR_RGBA2GRAY if img.shape[2] == 4 else cv2.COLOR_RGB2GRAY
        img = cv2.cvtColor(img, to_gray)
    # Erode to remove some noise.
    # NOTE: a 1x1 kernel leaves the image unchanged; use a larger kernel
    # (e.g. 2x2 or 3x3) if real noise removal is wanted.
    kernel = np.ones((1, 1), np.uint8)
    img = cv2.erode(img, kernel, iterations=1)
    # Apply blur to smooth out the edges
    img = cv2.GaussianBlur(img, (5, 5), 0)
    # Apply threshold to get image with only b&w (binarization)
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    img = Image.fromarray(img)
    img = ImageOps.expand(img, border=2, fill='black')
    # Debug display of the processed image.
    # NOTE(review): `visualize` and `boxes` are not defined in this function;
    # presumably they come from the enclosing module - confirm.
    visualize.show_labeled_image(img, boxes)
    return img
# Applying Tesseract OCR
def run_tesseract(img):
    """OCR ``img`` as a single word and return the recognised text, stripped.

    Recognition is limited to upper-case ASCII letters, digits, "." and "-",
    and the frequent-word dictionary is switched off (``load_freq_dawg=0``).

    Args:
        img: image accepted by ``pytesseract.image_to_string``.

    Returns:
        The recognised text without surrounding whitespace, or ``""`` when
        pytesseract raises ``RuntimeError`` (the 2 second timeout expired),
        in which case "TIMEOUT" is printed.
    """
    # Tesseract cmd setup
    # pytesseract.pytesseract.tesseract_cmd = "tesseract"
    whitelist = string.ascii_uppercase + string.digits + ".-"
    parameters = '-c load_freq_dawg=0 -c tessedit_char_whitelist="{}"'.format(whitelist)
    psm = 8  # page segmentation mode 8: treat the image as a single word
    custom_oem_psm_config = "--dpi 300 --oem 3 --psm {psm} {parameters}".format(
        parameters=parameters, psm=psm
    )
    # Only the OCR call can time out, so only it sits inside the try.
    try:
        text = pytesseract.image_to_string(img, config=custom_oem_psm_config, timeout=2)
    except RuntimeError:
        print("TIMEOUT")
        return ""
    return text.strip()

If your image format is highly consistent, you might consider cropping the image to the region you want. After running OCR on the image, apply conditional checks to the first letter or number in error-prone areas; for example, 0 and O are easily confused. Of course, all of the above is only valid if the images are highly consistent.
enter code here
import cv2
import numpy as np
import pytesseract
import matplotlib.pyplot as plt
# Path to the Tesseract executable; the string literal must stay on one line.
pytesseract.pytesseract.tesseract_cmd = 'D://Program Files/Tesseract-OCR/tesseract.exe'
img = cv2.imread('vATKQ.png')
# Crop to the region you want: rows 100-249, columns 180-649.
img2 = img[100:250, 180:650]
plt.imshow(img2)
# OCR only the cropped region.
text = pytesseract.image_to_string(img2)
print(text)

Related

how to improve pytesseract arguments to work properly

I would like to read this captcha using pytesseract:
I follow the advice here: Use pytesseract OCR to recognize text from an image
My code is:
import pytesseract
import cv2
def captcha_to_string(picture):
    """Read the captcha image at path ``picture`` and return Tesseract's text.

    Pipeline: grayscale -> 3x3 Gaussian blur -> inverted Otsu threshold ->
    3x3 morphological opening -> invert back. The intermediate images are
    written to ``thresh.jpg``, ``opening.jpg`` and ``invert.jpg`` in the
    current directory for inspection.

    Args:
        picture: path of the image file to read.

    Returns:
        The raw string returned by ``pytesseract.image_to_string``.
    """
    image = cv2.imread(picture)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening
    cv2.imwrite('thresh.jpg', thresh)
    cv2.imwrite('opening.jpg', opening)
    cv2.imwrite('invert.jpg', invert)
    # Perform text extraction, digits only.
    # NOTE(review): --psm 10 treats the image as a single character, which
    # may be why a multi-character captcha yields one digit - confirm.
    text = pytesseract.image_to_string(
        invert,
        lang='eng',
        config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789',
    )
    return text
But my code returns 8\n\x0c which is nonsense.
This is how thresh looks like:
This is how opening looks like:
This is how invert looks like:
Can you help me, how can I improve captcha_to_string function to read the captcha properly? Thanks a lot.
You are on the right track. Removing the noise (small black spots in the inverted image) looks like the way to extract the text successfully.
FYI, the pytesseract configuration only makes the outcome worse, so I removed it.
My approach is as follows:
import pytesseract
import cv2
import matplotlib.pyplot as plt
import numpy as np
def remove_noise(img, threshold):
    """
    remove salt-and-pepper noise in a binary image

    Only 8-connected foreground components whose pixel area is strictly
    greater than ``threshold`` are kept.

    Args:
        img: binary image; non-zero pixels are foreground.
        threshold: minimum component area (in pixels, exclusive) to keep.

    Returns:
        Array with the shape and dtype of ``img`` holding 1 on pixels of the
        kept components and 0 everywhere else.
    """
    filtered_img = np.zeros_like(img)
    labels, stats = cv2.connectedComponentsWithStats(
        img.astype(np.uint8), connectivity=8
    )[1:3]
    # Row 0 of `stats` is the background, so component labels start at 1.
    label_areas = stats[1:, cv2.CC_STAT_AREA]
    kept_labels = np.flatnonzero(label_areas > threshold) + 1
    # One vectorised mask instead of scanning the label image once per label.
    filtered_img[np.isin(labels, kept_labels)] = 1
    return filtered_img
def preprocess(img_path):
    """
    convert the grayscale captcha image to a clean binary image

    Pipeline: read as grayscale -> 3x3 Gaussian blur -> inverted fixed
    threshold at 150 -> drop connected components of 20 pixels or fewer ->
    invert back to dark text on white -> 3x3 erosion.

    Args:
        img_path: path of the captcha image file.

    Returns:
        The eroded 0/255 image.
    """
    img = cv2.imread(img_path, 0)  # flag 0: load as single-channel grayscale
    blur = cv2.GaussianBlur(img, (3, 3), 0)
    # Inverted threshold makes the dark glyphs the foreground for the
    # connected-component filter in remove_noise.
    thresh = cv2.threshold(blur, 150, 255, cv2.THRESH_BINARY_INV)[1]
    # remove_noise returns a 0/1 mask; scale to 0/255 and invert it.
    filtered_img = 255 - remove_noise(thresh, 20) * 255
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    # Eroding the white background thickens the dark strokes.
    erosion = cv2.erode(filtered_img, kernel, iterations=1)
    return erosion
def extract_letters(img):
    """Return the text Tesseract recognises in ``img`` with default settings.

    No custom config is passed on purpose; the previously used
    '--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789' is left out.
    """
    text = pytesseract.image_to_string(img)
    return text
# Run the cleaning pipeline on the sample captcha and print the OCR result.
img = preprocess('captcha.jpg')
text=extract_letters(img)
print(text)
# Show the preprocessed image with the gray colormap for visual inspection.
plt.imshow(img,'gray')
plt.show()
This is the processed image.
And, the script returns 18L9R.

How to process this captcha image for Pytesseract?

I want to solve automatically captchas like this one (all of them with red background and white letters) with Pytesseract
I have been trying processing image to make Pytesseract be able to read it, but no success. Would be great to receive your ideas to process this image. Here my code:
import cv2
import pytesseract
# Tell Tesseract where its language data lives and where the executable is.
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
# Load the captcha and double its size.
img = cv2.imread("captcha.png")
img = cv2.resize(img, None, fx=2, fy=2)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Gaussian adaptive threshold with an 85x85 neighbourhood and offset 20.
adaptive = cv2.adaptiveThreshold(
gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 85, 20)
# OCR each variant (colour, gray, thresholded) to compare the results.
print((pytesseract.image_to_string(img, config=tessdata_dir_config)).strip())
print((pytesseract.image_to_string(gray, config=tessdata_dir_config)).strip())
print((pytesseract.image_to_string(adaptive, config=tessdata_dir_config)).strip())
# Display the three variants; trailing comments record the OCR text obtained.
cv2.imshow("Captcha", img) # Output: IMQW
cv2.imshow("Gray", gray) # Output: IMOW
cv2.imshow("Adaptive", adaptive) # Output: IMOW,
# Keep the windows open for up to 7 seconds (or until a key is pressed).
cv2.waitKey(7000)
I have a three-step solution
Resize
Closing
Threshold
Step-1: Resize
Resizing the image enables the OCR-algorithm to detect the character or digit strokes in the input image.
Step-2: Closing
Closing is a morphological operation that aims to remove small holes in the input image.
If we look carefully Q and W characters consists of lots of small holes.
Step-3: Threshold
We will apply simple thresholding to binarize the image. Our aim is to remove any leftover artifacts from the image.
Resize
Closing
Threshold
Result:
IMQW
Code:
import cv2
from pytesseract import image_to_string
# Load the captcha and convert it to grayscale.
img = cv2.imread("QUfxY.png")
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Step 1 - resize: double width and height.
(h, w) = gry.shape[:2]
gry = cv2.resize(gry, (w*2, h*2))
# Step 2 - closing: fill the small holes inside the strokes.
# Passing None as the kernel lets OpenCV use its default structuring element.
cls = cv2.morphologyEx(gry, cv2.MORPH_CLOSE, None)
# Step 3 - threshold: Otsu picks the binarisation level automatically.
thr = cv2.threshold(cls, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
txt = image_to_string(thr)
print(txt)

Text Extraction from Image with Single letter in it

I have a poor-quality image with a single letter in it. I need to extract the letter from this image.
I tried doing this with OpenCV. The code works on good-quality images, but I need help extracting the letter from this image.
from PIL import Image
import pytesseract
import argparse
import os
import cv2
import numpy as np
# Load the image and upscale it 1.5x with bicubic interpolation.
img = cv2.imread(r"/home/ubuntu/xyz/xyz.jpg")
img = cv2.resize(img, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC)
img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Dilate then erode to remove noise.
# NOTE: with a 1x1 kernel both operations leave the image unchanged.
kernel = np.ones((1, 1), np.uint8)
img = cv2.dilate(img, kernel, iterations=1)
img = cv2.erode(img, kernel, iterations=1)
# Smooth, then binarise with Otsu's automatically chosen threshold.
img = cv2.GaussianBlur(img, (5, 5), 0)
img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
# Save the filtered image
cv2.imwrite(r"/home/ubuntu/xyz/rr.jpg", img)
# Read text with tesseract for python
result = pytesseract.image_to_string(img, lang="eng")
# Bare expression: displays the value when run in a notebook / REPL.
result
why u need Gaussian Blur in this situation
img = cv2.GaussianBlur(img, (5, 5), 0)
with a big window (5,5)
I think you can add a white border around the image instead of resizing it,
and you may use the erosion technique to remove the noise from the image.

Reading low resolution image with pytesseract

I'm trying to read off some stats off the cropped (manually) sections of tables in pdf files.
Here is the image I'm trying to process
The current result I get has most of the numbers but not all of the text, as seen below:
Hmuwinu'fg. cm’: -009,d1-I (F -o.761.l= .om,
Tamar wuall ma: 2 1.41(F-o.167
Tao! hr aubgrwp dimes: Nol wvwe
I've tried using interpolations other than inter-cubic during the resizing step, and played around changing the kernel size but 1x1 seems to work the best.
Here is the current code:
# import the packages
from PIL import Image
import pytesseract
import numpy as np
import argparse
import cv2
import os
# construct the argument parser and parse the arguments
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True, help="path to input image to OCR'd")
ap.add_argument("-p", "--preprocess", type=str, default="thresh", help="type of preprocessing to be done")
args = vars(ap.parse_args())
# load the example image
image = cv2.imread(args["image"])
# Rescale image
image = cv2.resize(image, None, fx=1.5, fy=1.5, interpolation=cv2.INTER_CUBIC)
# Apply dilation and erosion to remove some noise
# NOTE: with a 1x1 kernel both operations leave the image unchanged.
kernel = np.ones((1, 1), np.uint8)
image = cv2.dilate(image, kernel, iterations=1)
image = cv2.erode(image, kernel, iterations=1)
# Convert it to grayscale
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
# check to see if we should apply thresholding to process image
if args["preprocess"] == "thresh":
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# make a check to see if median blurring should be applied
elif args["preprocess"] == "blur":
    gray = cv2.medianBlur(gray, 3)
# write the gray scale image to disk as a temp file so we can OCR it
filename = "{}.png".format(os.getpid())
cv2.imwrite(filename, gray)
# load the image as a PIL/pillow image, apply OCR, then delete the temp file;
# the finally clause removes the file even if OCR fails, and the with block
# closes the PIL file handle before the file is deleted.
try:
    with Image.open(filename) as temp_image:
        text = pytesseract.image_to_string(temp_image)
finally:
    os.remove(filename)
print(text)
# show the output images
cv2.imshow("Image", image)
cv2.imshow("Output", gray)
cv2.waitKey(0)
Any suggestions or methods are really appreciated.
I applied adaptive-threshold + bitwise-not operations and result is:
Now, when I read:
txt = pytesseract.image_to_string(bnt, config="--psm 6")
print(txt)
Result:
Hewrogenedty: Chit «0.09, die 1 (P = 0,78); If 0.0%
Teal for overall ettect: Z = 1.41 (P = 0.16)
Test tor subgroup ditlrenote: Not appliaalle
Not perfect, but at least the numbers are correct (if I'm not mistaken).
Code:
import cv2
import pytesseract
# Load the cropped table image and upscale it 2.5x with bicubic interpolation.
img = cv2.imread("Q8iIo.png")
img = cv2.resize(img, None, fx=2.5, fy=2.5,
interpolation=cv2.INTER_CUBIC)
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# Mean adaptive threshold (25x25 neighbourhood, offset 28), inverted output.
thr = cv2.adaptiveThreshold(gry, 255, cv2.ADAPTIVE_THRESH_MEAN_C,
cv2.THRESH_BINARY_INV, 25, 28)
# Flip the inverted result back with bitwise-not.
bnt = cv2.bitwise_not(thr)
# --psm 6: assume a single uniform block of text.
txt = pytesseract.image_to_string(bnt, config="--psm 6")
print(txt)

Use pytesseract OCR to recognize text from an image

I need to use Pytesseract to extract text from this picture:
and the code:
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
path = 'pic.gif'
img = Image.open(path)
img = img.convert('RGBA')
pix = img.load()
# Binarise in place: a pixel with any colour channel below 102 becomes opaque
# black, everything else becomes opaque white.
for y in range(img.size[1]):
    for x in range(img.size[0]):
        if pix[x, y][0] < 102 or pix[x, y][1] < 102 or pix[x, y][2] < 102:
            pix[x, y] = (0, 0, 0, 255)
        else:
            pix[x, y] = (255, 255, 255, 255)
# JPEG cannot store an alpha channel, so drop it before saving.
img.convert('RGB').save('temp.jpg')
text = pytesseract.image_to_string(Image.open('temp.jpg'))
# os.remove('temp.jpg')
print(text)
and the "temp.jpg" is
Not bad, but the result of print is ,2 WW
That is not the right text (2HHH), so how can I remove those black dots?
Here's a simple approach using OpenCV and Pytesseract OCR. To perform OCR on an image, it's important to preprocess the image. The idea is to obtain a processed image where the text to extract is in black with the background in white. To do this, we can convert to grayscale, apply a slight Gaussian blur, then Otsu's threshold to obtain a binary image. From here, we can apply morphological operations to remove noise. Finally we invert the image. We perform text extraction using the --psm 6 configuration option to assume a single uniform block of text. Take a look here for more options.
Here's a visualization of the image processing pipeline:
Input image
Convert to grayscale -> Gaussian blur -> Otsu's threshold
Notice how there are tiny specks of noise; to remove them we can perform morphological operations
Finally we invert the image
Result from Pytesseract OCR
2HHH
Code
import cv2
import pytesseract
# Location of the Tesseract executable on this machine.
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Grayscale, Gaussian blur, Otsu's threshold
image = cv2.imread('1.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (3,3), 0)
# Inverted threshold: the text becomes white on black for the opening step.
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Morph open to remove noise and invert image
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
# Back to black text on a white background for OCR.
invert = 255 - opening
# Perform text extraction
# --psm 6: assume a single uniform block of text.
data = pytesseract.image_to_string(invert, lang='eng', config='--psm 6')
print(data)
# Show the intermediate images and wait for a key press.
cv2.imshow('thresh', thresh)
cv2.imshow('opening', opening)
cv2.imshow('invert', invert)
cv2.waitKey()
Here is my solution:
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
# Start from the already thresholded image.
im = Image.open("temp.jpg") # the second one
# Median filter to suppress the isolated black dots.
im = im.filter(ImageFilter.MedianFilter())
# Double the contrast, then convert to 1-bit black and white.
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
# Save the cleaned image and OCR it from disk.
im.save('temp2.jpg')
text = pytesseract.image_to_string(Image.open('temp2.jpg'))
print(text)
I have something different pytesseract approach for our community.
Here is my approach
import pytesseract
from PIL import Image
# OCR temp.jpg restricted to digits.
# --psm 10 treats the image as a single character; --oem 3 selects the
# engine mode; the whitelist limits the output to 0-9.
text = pytesseract.image_to_string(Image.open("temp.jpg"), lang='eng',
config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')
print(text)
To extract the text directly from the web, you can try the following implementation (making use of the first image):
import io
import requests
import pytesseract
from PIL import Image, ImageFilter, ImageEnhance
# Download the image; the timeout stops the script hanging on a stalled
# connection, and raise_for_status surfaces HTTP errors instead of handing an
# error page to PIL.
response = requests.get('https://i.stack.imgur.com/HWLay.gif', timeout=10)
response.raise_for_status()
img = Image.open(io.BytesIO(response.content))
# Grayscale -> median filter -> double the contrast -> 1-bit black and white.
img = img.convert('L')
img = img.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(img)
img = enhancer.enhance(2)
img = img.convert('1')
# Keep a copy of the cleaned image on disk, then OCR the in-memory image.
img.save('image.jpg')
imagetext = pytesseract.image_to_string(img)
print(imagetext)
Here is my small advancement with removing noise and arbitrary line within certain colour frequency range.
import pytesseract
from PIL import Image, ImageEnhance, ImageFilter
im = Image.open(img)  # img is the path of the image
im = im.convert("RGBA")
# Keep a pixel when any colour channel is below 112; replace every other
# pixel with opaque white (4-tuple, to match the RGBA mode).
newimdata = [
    item
    if item[0] < 112 or item[1] < 112 or item[2] < 112
    else (255, 255, 255, 255)
    for item in im.getdata()
]
im.putdata(newimdata)
# Median filter -> double the contrast -> 1-bit black and white.
im = im.filter(ImageFilter.MedianFilter())
enhancer = ImageEnhance.Contrast(im)
im = enhancer.enhance(2)
im = im.convert('1')
im.save('temp2.jpg')
# Restrict output to digits and lower-case letters. Page segmentation mode 6
# (single uniform block of text) is passed as --psm, the form current
# Tesseract versions accept.
text = pytesseract.image_to_string(
    Image.open('temp2.jpg'),
    config='-c tessedit_char_whitelist=0123456789abcdefghijklmnopqrstuvwxyz --psm 6',
    lang='eng',
)
print(text)
You only need to enlarge the picture with cv2.resize:
image = cv2.resize(image,(0,0),fx=7,fy=7)
my picture 200x40 -> HZUBS
resized same picture 1400x300 -> A 1234 (so, this is right)
and then,
retval, image = cv2.threshold(image,200,255, cv2.THRESH_BINARY)
image = cv2.GaussianBlur(image,(11,11),0)
image = cv2.medianBlur(image,9)
and adjust the parameters to improve the results.
Page segmentation modes:
0 Orientation and script detection (OSD) only.
1 Automatic page segmentation with OSD.
2 Automatic page segmentation, but no OSD, or OCR.
3 Fully automatic page segmentation, but no OSD. (Default)
4 Assume a single column of text of variable sizes.
5 Assume a single uniform block of vertically aligned text.
6 Assume a single uniform block of text.
7 Treat the image as a single text line.
8 Treat the image as a single word.
9 Treat the image as a single word in a circle.
10 Treat the image as a single character.
11 Sparse text. Find as much text as possible in no particular order.
12 Sparse text with OSD.
13 Raw line. Treat the image as a single text line,
bypassing hacks that are Tesseract-specific.
from PIL import Image, ImageEnhance, ImageFilter
import pytesseract
path = 'hhh.gif'
img = Image.open(path)
img = img.convert('RGBA')
pix = img.load()
# Binarise in place: a pixel with any colour channel below 102 becomes opaque
# black, everything else becomes opaque white.
for y in range(img.size[1]):
    for x in range(img.size[0]):
        if pix[x, y][0] < 102 or pix[x, y][1] < 102 or pix[x, y][2] < 102:
            pix[x, y] = (0, 0, 0, 255)
        else:
            pix[x, y] = (255, 255, 255, 255)
# OCR the binarised image (not the untouched file on disk); the alpha channel
# is dropped because every pixel is fully opaque.
text = pytesseract.image_to_string(img.convert('RGB'))
print(text)

Categories

Resources