I would like to read this captcha using pytesseract:
I followed the advice here: Use pytesseract OCR to recognize text from an image
My code is:
import pytesseract
import cv2

def captcha_to_string(picture):
    image = cv2.imread(picture)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    # Morph open to remove noise and invert image
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
    invert = 255 - opening
    cv2.imwrite('thresh.jpg', thresh)
    cv2.imwrite('opening.jpg', opening)
    cv2.imwrite('invert.jpg', invert)
    # Perform text extraction
    text = pytesseract.image_to_string(invert, lang='eng', config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789')
    return text
But my code returns '8\n\x0c', which is nonsense.
This is what thresh looks like:
This is what opening looks like:
This is what invert looks like:
Can you help me improve the captcha_to_string function so that it reads the captcha properly? Thanks a lot.
You are on the right track. Removing the noise (the small black spots in the inverted image) looks like the way to extract the text successfully.
FYI, the pytesseract configuration only made the outcome worse, so I removed it.
My approach is as follows:
import pytesseract
import cv2
import matplotlib.pyplot as plt
import numpy as np

def remove_noise(img, threshold):
    """
    Remove salt-and-pepper noise from a binary image.
    """
    filtered_img = np.zeros_like(img)
    # Label connected components and keep only those larger than the area threshold
    labels, stats = cv2.connectedComponentsWithStats(img.astype(np.uint8), connectivity=8)[1:3]
    label_areas = stats[1:, cv2.CC_STAT_AREA]
    for i, label_area in enumerate(label_areas):
        if label_area > threshold:
            filtered_img[labels == i + 1] = 1
    return filtered_img

def preprocess(img_path):
    """
    Convert the grayscale captcha image to a clean binary image.
    """
    img = cv2.imread(img_path, 0)
    blur = cv2.GaussianBlur(img, (3, 3), 0)
    thresh = cv2.threshold(blur, 150, 255, cv2.THRESH_BINARY_INV)[1]
    filtered_img = 255 - remove_noise(thresh, 20) * 255
    kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
    erosion = cv2.erode(filtered_img, kernel, iterations=1)
    return erosion

def extract_letters(img):
    text = pytesseract.image_to_string(img)  # config='--psm 10 --oem 3 -c tessedit_char_whitelist=0123456789' removed; it made results worse
    return text

img = preprocess('captcha.jpg')
text = extract_letters(img)
print(text)
plt.imshow(img, 'gray')
plt.show()
This is the processed image.
And the script returns 18L9R.
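As an aside, the \n\x0c in the question's output is not image noise: Tesseract appends a trailing newline and a form feed to every result. A minimal cleanup sketch, reusing extract_letters from above:

raw = extract_letters(img)
clean = raw.strip()  # drops the trailing '\n' and '\x0c' that Tesseract appends
print(repr(raw), '->', clean)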
I'm trying to use pytesseract to convert some images into text. The images are very basic and I tried using some preprocessing:
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
gray = cv2.bitwise_not(gray)
gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
The original image looks like this:
The resulting image looks like this:
I do this for a bunch of numbers with the same font in the same location; here are the results:
It still gives no text in the output. For a few of the images it does, but not for all, even though the images look nearly identical.
Here is a snippet of the code I'm using:
def checkCurrentState():
    """image = pyautogui.screenshot()
    image = cv2.cvtColor(np.array(image), cv2.COLOR_RGB2BGR)
    cv2.imwrite("screenshot.png", image)"""
    image = cv2.imread("screenshot.png")
    checkNumbers(image)

def checkNumbers(image):
    numbers = []
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    gray = cv2.bitwise_not(gray)
    gray = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    for i in storeLocations:
        cropped = gray[i[1]:i[1] + storeHeight, i[0]:i[0] + storeWidth]
        number = pytesseract.image_to_string(cropped)
        numbers.append(number)
        print(number)
        cv2.imshow("Screenshot", cropped)
        cv2.waitKey(0)
To perform OCR on an image, it's important to preprocess the image. The idea is to obtain a processed image where the text to extract is in black and the background is in white. Here's a simple approach using OpenCV and Pytesseract OCR.
To do this, we convert to grayscale, apply a slight Gaussian blur, then Otsu's threshold to obtain a binary image. From here, we can apply morphological operations to remove noise. We perform text extraction using the --psm 6 configuration option to assume a single uniform block of text. Take a look here for more options.
Here's a visualization of each step:
Input image
Convert to grayscale -> Gaussian blur
Otsu's threshold -> Morph open to remove noise
Result from Pytesseract OCR
1100
Code
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Grayscale, Gaussian blur, Otsu's threshold
image = cv2.imread('1.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (3,3), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Morph open to remove noise
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3,3))
opening = cv2.morphologyEx(thresh, cv2.MORPH_OPEN, kernel, iterations=1)
# Perform text extraction
data = pytesseract.image_to_string(opening, lang='eng', config='--psm 6')
print(data)
cv2.imshow('blur', blur)
cv2.imshow('thresh', thresh)
cv2.imshow('opening', opening)
cv2.waitKey()
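If results vary between nearly identical crops, it can also help to inspect Tesseract's per-word confidence scores rather than only the final string. A minimal sketch using pytesseract's image_to_data helper, applied to the opening image from the code above:

from pytesseract import Output

# Per-word OCR results with confidence scores (conf is -1 for non-text rows)
data = pytesseract.image_to_data(opening, config='--psm 6', output_type=Output.DICT)
for word, conf in zip(data['text'], data['conf']):
    if word.strip():
        print(word, conf)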
I'm trying to read some alphanumeric strings in Python with pytesseract.
I pre-process the images to reduce noise and make them black and white, but I consistently have issues reading the digits inside the string.
original:
after cleanup:
Extracted text: WISOMW
Code used:
def convert(path):
    image = cv2.imread(path)
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    blur = cv2.GaussianBlur(gray, (3, 3), 0)
    thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
    invert = 255 - thresh
    cv2.imwrite("processed.jpg", invert)
    # Perform text extraction
    return pytesseract.image_to_string(invert, config="--psm 7")
I've tried different configuration options for tesseract:
oem: tried 1 and 3
psm: tried different modes
tessedit_char_whitelist: limited to alphanumeric characters
I feel I'm missing something obvious, given that it reliably reads the alpha characters. Any ideas what it could be?
You were so close. A dilation helps grow the white regions and shrink the black ones. The resolution is low, so a small kernel is used for the dilation. If you remove the _INV from your threshold step, you don't need to do another inversion.
import cv2
import numpy as np
import pytesseract
img = cv2.imread('wis9mw.jpg', cv2.IMREAD_GRAYSCALE)
img = cv2.GaussianBlur(img, (3, 3), 0)
img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
kernel = np.ones((1,1), np.uint8)
img = cv2.dilate(img, kernel, iterations=1)
cv2.imwrite('processed.jpg', img)
text = pytesseract.image_to_string(img, config="--psm 6")
print(text)
gives
WIS9MW
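Since the resolution is low, upscaling before thresholding may also help; this is only a sketch layered on the answer above, not something the answer tested:

import cv2
import pytesseract

img = cv2.imread('wis9mw.jpg', cv2.IMREAD_GRAYSCALE)
img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)  # 2x upscale
img = cv2.GaussianBlur(img, (3, 3), 0)
img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
text = pytesseract.image_to_string(img, config='--psm 6')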
The problem here is that I need to remove the lines and write code to recognize the characters. So far I have only seen solutions where the characters were solid, but here the characters have a double border.
For this specific captcha, there's quite a simple solution. But there's no guarantee this approach will work on other, even very similar captchas – due to the "nature" of captchas, as already mentioned in the comments, and in general when dealing with image-processing tasks with limited input data.
Read the image as grayscale.
Threshold the image at a nearly-white cutoff.
Flood fill the "background" with black, starting from the top-left corner, so that everything connected to the border turns black and only the characters stay white.
Run pytesseract with the --psm 6 option.
That'd be the whole code:
import cv2
import pytesseract
# Read image as grayscale
img = cv2.imread('FuZEJ.png', cv2.IMREAD_GRAYSCALE)
# Threshold at nearly white cutoff
thr = cv2.threshold(img, 224, 255, cv2.THRESH_BINARY)[1]
# Floodfill "background" with black
ff = cv2.floodFill(thr, None, (0, 0), 0)[1]
# OCR using pytesseract
text = pytesseract.image_to_string(ff, config='--psm 6').replace('\n', '').replace('\f', '')
print(text)
# xwphs
Caveat: I use a special version of Tesseract from the Mannheim University Library.
----------------------------------------
System information
----------------------------------------
Platform: Windows-10-10.0.16299-SP0
Python: 3.9.1
PyCharm: 2021.1.1
OpenCV: 4.5.1
pytesseract: 5.0.0-alpha.20201127
----------------------------------------
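For reference, the version strings above can be printed from Python; a small sketch (the exact values will of course differ per machine):

import platform
import cv2
import pytesseract

print('Platform:', platform.platform())
print('Python:', platform.python_version())
print('OpenCV:', cv2.__version__)
print('Tesseract:', pytesseract.get_tesseract_version())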
I would try a mask:
import cv2
import numpy as np

def process(img):  # To process the image
    img_gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    _, img_gray = cv2.threshold(img_gray, 224, 255, cv2.THRESH_TOZERO_INV)
    img_blur = cv2.GaussianBlur(img_gray, (7, 7), 6)
    img_canny = cv2.Canny(img_blur, 0, 100)
    return cv2.dilate(img_canny, np.ones((1, 5)), iterations=1)

def get_mask(img):  # To generate the mask
    mask = np.zeros(img.shape[:2], 'uint8')
    contours, _ = cv2.findContours(process(img), cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
    for cnt in contours:
        cv2.drawContours(mask, [cnt], -1, 255, -1)
    return mask

def crop(img, mask):  # To mask an image and use a white background
    bg = np.full(img.shape, 255, 'uint8')
    fg = cv2.bitwise_or(img, img, mask=mask)
    fg_back_inv = cv2.bitwise_or(bg, bg, mask=cv2.bitwise_not(mask))
    return cv2.bitwise_or(fg, fg_back_inv)

img = cv2.imread("image.png")
img = cv2.pyrUp(cv2.pyrUp(img))  # To enlarge image by 4x
cv2.imshow("Masked Image", crop(img, get_mask(img)))
cv2.waitKey(0)
Before:
After:
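The masked image can then be passed straight to pytesseract; a minimal usage sketch (the --psm value here is a guess, not something tested against this captcha):

import pytesseract

# OCR on the white-background masked image
masked = crop(img, get_mask(img))
print(pytesseract.image_to_string(masked, config='--psm 6'))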
I want to automatically solve captchas like this one (all of them with a red background and white letters) with Pytesseract.
I have been trying to process the image to make Pytesseract able to read it, but with no success. I would be grateful for your ideas on how to process this image. Here is my code:
import cv2
import pytesseract
tessdata_dir_config = '--tessdata-dir "C:\\Program Files\\Tesseract-OCR\\tessdata"'
pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'
img = cv2.imread("captcha.png")
img = cv2.resize(img, None, fx=2, fy=2)
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
adaptive = cv2.adaptiveThreshold(
    gray, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 85, 20)
print((pytesseract.image_to_string(img, config=tessdata_dir_config)).strip())
print((pytesseract.image_to_string(gray, config=tessdata_dir_config)).strip())
print((pytesseract.image_to_string(adaptive, config=tessdata_dir_config)).strip())
cv2.imshow("Captcha", img) # Output: IMQW
cv2.imshow("Gray", gray) # Output: IMOW
cv2.imshow("Adaptive", adaptive) # Output: IMOW,
cv2.waitKey(7000)
I have a three-step solution:
Resize
Closing
Threshold
Step-1: Resize
Resizing the image helps the OCR algorithm detect the character and digit strokes in the input image.
Step-2: Closing
Closing is a morphological operation that aims to remove small holes in the input image.
If we look carefully, the Q and W characters contain lots of small holes.
Step-3: Threshold
We will apply simple thresholding to binarize the image. Our aim is to remove any leftover artifacts from the image.
Resize
Closing
Threshold
Result:
IMQW
Code:
import cv2
from pytesseract import image_to_string
img = cv2.imread("QUfxY.png")
gry = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
(h, w) = gry.shape[:2]
gry = cv2.resize(gry, (w*2, h*2))
cls = cv2.morphologyEx(gry, cv2.MORPH_CLOSE, None)
thr = cv2.threshold(cls, 0, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
txt = image_to_string(thr)
print(txt)
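Note that passing None as the structuring element makes OpenCV fall back to its default 3x3 rectangular kernel, so the closing step above is equivalent to this explicit form:

kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 3))
cls = cv2.morphologyEx(gry, cv2.MORPH_CLOSE, kernel)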
I am trying to run OCR on a set of images that are similar but can vary in size. For some reason I cannot get a predictable result. Is there anything I can do to get better results?
Tesseract, with or without cv2 preprocessing, works beautifully on some images and fails on others, and there is no pattern. The images are more or less similar.
The upper image is the processed version.
import string

import cv2
import numpy as np
import pytesseract
from PIL import Image, ImageOps

def filter_img(img):
    # Read PIL image as cv2
    img = np.array(img)
    img = cv2.resize(img, None, fx=2, fy=2, interpolation=cv2.INTER_CUBIC)
    # Convert image to grayscale (important for applying threshold)
    img = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
    # Apply dilation and erosion to remove some noise
    kernel = np.ones((1, 1), np.uint8)
    # img = cv2.dilate(img, kernel, iterations=1)
    img = cv2.erode(img, kernel, iterations=1)
    # Apply blur to smooth out the edges
    img = cv2.GaussianBlur(img, (5, 5), 0)
    # img = cv2.medianBlur(img, 5)
    # Apply threshold to get an image with only b&w (binarization)
    img = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
    img = Image.fromarray(img)
    img = ImageOps.expand(img, border=2, fill='black')
    # visualize.show_labeled_image(img, boxes)  # visualize and boxes come from elsewhere in my project
    return img
# Applying Tesseract OCR
def run_tesseract(img):
    # Tesseract cmd setup
    # pytesseract.pytesseract.tesseract_cmd = "tesseract"
    whitelist = string.ascii_uppercase + string.digits + ".-"
    parameters = '-c load_freq_dawg=0 -c tessedit_char_whitelist="{}"'.format(whitelist)
    psm = 8
    custom_oem_psm_config = "--dpi 300 --oem 3 --psm {psm} {parameters}".format(parameters=parameters, psm=psm)
    try:
        text = pytesseract.image_to_string(img, config=custom_oem_psm_config, timeout=2)
        return text.strip()
    except RuntimeError:
        print("TIMEOUT")
        return ""
If your image format is highly consistent, you might consider splitting the image into sub-regions. And after OCR-ing the image, apply conditional checks on error-prone characters, such as 0 and O, which are easily confused. Of course, all of the above is only valid if the images are highly consistent.
import cv2
import numpy as np
import pytesseract
import matplotlib.pyplot as plt

pytesseract.pytesseract.tesseract_cmd = 'D://Program Files/Tesseract-OCR/tesseract.exe'

img = cv2.imread('vATKQ.png')
img2 = img[100:250, 180:650]  # split to the region you want
plt.imshow(img2)
text = pytesseract.image_to_string(img2)
print(text)
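For the conditional judgments on error-prone characters mentioned above, a simple post-OCR substitution table is one option; a sketch, assuming the region is known to contain digits only (the mapping is illustrative, not exhaustive):

# Map characters Tesseract commonly confuses when only digits are expected
CONFUSIONS = {'O': '0', 'o': '0', 'I': '1', 'l': '1', 'S': '5', 'B': '8'}

def fix_digits(text):
    return ''.join(CONFUSIONS.get(ch, ch) for ch in text.strip())

print(fix_digits(text))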