Segmenting image files with text (and pictures) into blocks

Segmenting image files with text (and pictures) into blocks - python

I'm trying to create bounding boxes for the text in an image I have. An example is the one below.
I would like to add a bounding box around each This is a test line. Unfortunately I'm not sure why this method is not automatically identifying the bounding boxes
import re
import cv2
import numpy as np
import pytesseract
from pytesseract import Output
from matplotlib import pyplot as plt
# Plot character boxes on image using pytesseract.image_to_boxes() function
image = cv2.imread('Image.jpg')
b, g, r = cv2.split(image)
image = cv2.merge([r,g,b])
d = pytesseract.image_to_data(image, output_type=Output.DICT)
print('DATA KEYS: \n', d.keys())
n_boxes = len(d['text'])
for i in range(n_boxes):
# condition to only pick boxes with a confidence > 60%
if int(d['conf'][i]) > 60:
(x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
image = cv2.rectangle(image, (x, y), (x + w, y + h), (0, 255, 0), 2)
b, g, r = cv2.split(image)
rgb_img = cv2.merge([r, g, b])
plt.figure(figsize=(16, 12))
plt.imshow(rgb_img)
plt.title('SAMPLE IMAGE WITH WORD LEVEL BOXES')
plt.show()

Here is a different way to do that with Python/OpenCV.
Read the input
Convert to gray
(OTSU) Threshold (white text on black background)
Apply morphology dilate with horizontal kernel longer than letter spacing and then smaller vertical kernel to remove thin horizontal lines remaining from line in page.
Find contours
Draw bounding boxes of contours on input
Save result
Input:
import cv2
import numpy as np
# load image
img = cv2.imread("test_text.jpg")
# convert to gray
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
# threshold the grayscale image
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# use morphology erode to blur horizontally
#kernel = np.ones((500,3), np.uint8)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (250, 3))
morph = cv2.morphologyEx(thresh, cv2.MORPH_DILATE, kernel)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (3, 17))
morph = cv2.morphologyEx(morph, cv2.MORPH_OPEN, kernel)
# find contours
cntrs = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cntrs = cntrs[0] if len(cntrs) == 2 else cntrs[1]
# Draw contours
result = img.copy()
for c in cntrs:
x,y,w,h = cv2.boundingRect(c)
cv2.rectangle(result, (x, y), (x+w, y+h), (0, 0, 255), 2)
# write result to disk
cv2.imwrite("test_text_threshold.png", thresh)
cv2.imwrite("test_text_morph.png", morph)
cv2.imwrite("test_text_lines.jpg", result)
cv2.imshow("GRAY", gray)
cv2.imshow("THRESH", thresh)
cv2.imshow("MORPH", morph)
cv2.imshow("RESULT", result)
cv2.waitKey(0)
cv2.destroyAllWindows()
Thresholded image:
Dilated image:
Result:

Related

What kind of parameters should I use to find and crop objects in an image?

I am new to deep learning and try to implement a ML algorithm for image clustering. The problem is that I can't crop the objects in an image in Python using OpenCV.
Here is the code I have implemented and it works for some objects if the color of the object is very different(in RGB values) from the background but it doesn't work for the image I need for ML algorithm. What kind of parameters should I have/change? Any suggestions?
import cv2
import numpy as np
from PIL import Image
import tkinter as tk
from tkinter import filedialog as fd
from tkinter import*
import random
#!/usr/bin/python
from PIL import Image
import sys
myFile = 'Path' + '/crop.png'
nr_of_im = 1
q = 0
r = 0
x_list = []
y_list = []
img = cv2.imread(myFile, cv2.IMREAD_UNCHANGED)
ret, thresh = cv2.threshold(cv2.cvtColor(img.copy(), cv2.COLOR_BGR2GRAY) , 30, 255, cv2.THRESH_BINARY)
contours, hier = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
for contour in contours:
print("len",len(contours))
if cv2.contourArea(contour) > 80:
x, y, w, h = cv2.boundingRect(contour)
q = w
r = h
x_list.append(x)
y_list.append(y)
font = cv2.FONT_HERSHEY_SIMPLEX
ROI = img[y-10:y+10+h, x-10:x+10+w]
ROI = cv2.resize(ROI,(300,300))
file_all = "/images/%d.jpg"%nr_of_im
nr_of_im += 1
cv2.imwrite(file_all,ROI)
There are 21 objects in the image but the length of contours returns 1. The image looks like so
crop.png:

Your threshold is too low and produces a totally white image for me. You need to increase your threshold. Always view your thresholding to be sure it is working the way you expect. You can always remove the viewing later.
The following works for me using Otsu thresholding with a threshold value of 97. I get 21 contours.
Input:
import cv2
import numpy as np
# read image
img = cv2.imread('blocks.jpg')
# convert to grayscale
gray = cv2.cvtColor(img,cv2.COLOR_BGR2GRAY)
# threshold
ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY+cv2.THRESH_OTSU)
print(ret)
# apply morphology fill and separate large regions and remove small ones
kernel = cv2.getStructuringElement(cv2.MORPH_RECT , (9,9))
morph = cv2.morphologyEx(thresh, cv2.MORPH_CLOSE, kernel)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT , (15,15))
morph = cv2.morphologyEx(morph, cv2.MORPH_OPEN, kernel)
# get contours
result = img.copy()
contours = cv2.findContours(morph, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
contours = contours[0] if len(contours) == 2 else contours[1]
# get count of contours
print(len(contours))
# draw bounding boxes on contours
for cntr in contours:
x,y,w,h = cv2.boundingRect(cntr)
cv2.rectangle(result, (x, y), (x+w, y+h), (0, 0, 255), 2)
#print("x,y,w,h:",x,y,w,h)
# save results
cv2.imwrite("blocks_thresh.jpg", thresh)
cv2.imwrite("blocks_morphology.jpg", morph)
cv2.imwrite("blocks_bboxes.jpg", result)
# show thresh and result
cv2.imshow("thresh", thresh)
cv2.imshow("morph", morph)
cv2.imshow("result", result)
cv2.waitKey(0)
cv2.destroyAllWindows()
Threshold image:
Morphology cleaned image:
Resulting bounding boxes from contours:

opencv, python, how to read grouped text in boxes

I would like to get from the image in the groups that are on the image
I have managed to remove first contour (as described below), but issue is that when I try to read the text, I have some missing text, I expect that this is because of other contours that have stayed on the image, but while I try to remove them, I loose the grouping or part of text...
for i in range(len(contours)):
if 800 < cv2.contourArea(contours[i]) < 2000:
x, y, width, height = cv2.boundingRect(contours[i])
roi = img[y:y + height, x:x + width]
roi_h = roi.shape[0]
roi_w = roi.shape[1]
resize_roi = cv2.resize(roi,(int(roi_w*6),int(roi_h*6)), interpolation=cv2.INTER_LINEAR)
afterd = cv2.cvtColor(resize_roi, cv2.COLOR_BGR2GRAY)
retim, threshm = cv2.threshold(afterd, 210, 225, cv2.THRESH_BINARY + cv2.THRESH_OTSU)
contoursm, hierarchym = cv2.findContours(threshm, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
mask = np.ones(resize_roi.shape[:2], dtype="uint8") * 255
for m in range(len(contoursm)):
if 10000 < cv2.contourArea(contoursm[m]) < 33000:
cv2.drawContours(mask, contoursm, m, 0, 7)
afterd = cv2.bitwise_not(afterd)
afterd = cv2.bitwise_and(afterd, afterd, mask=mask)
afterd = cv2.bitwise_not(afterd)
print(pytesseract.image_to_string(afterd, lang='eng', config='--psm 3'))

Instead of dealing with all the boxes, I suggest deleting them by finding connected components, and filling the large clusters with background color.
You may use the following stages:
Convert image to Grayscale, apply threshold, and invert polarity.
Delete all clusters having more than 100 pixels (assume letters are smaller).
Dilate thresh for uniting text areas to single "blocks".
Find contours on the dilated thresh image.
Find bounding rectangles, and apply OCR to the rectangle.
Here is the complete code sample:
import numpy as np
import cv2
import pytesseract
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files\Tesseract-OCR\tesseract.exe' # I am using Windows
img = cv2.imread('img.png') # Read input image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Convert to Grayscale.
ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Convert to binary and invert polarity
nlabel,labels,stats,centroids = cv2.connectedComponentsWithStats(thresh, connectivity=8)
thresh_size = 100
# Delete all lines by filling large clusters with zeros.
for i in range(1, nlabel):
if stats[i, cv2.CC_STAT_AREA] > thresh_size:
thresh[labels == i] = 0
# Dilate thresh for uniting text areas to single blocks.
dilated_thresh = cv2.dilate(thresh, np.ones((5,5)))
# Find contours on dilated thresh
contours, hierarchy = cv2.findContours(dilated_thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
# Iterate contours, find bounding rectangles
for c in contours:
# Get bounding rectangle
x, y, w, h = cv2.boundingRect(c)
# Draw green rectangle for testing
cv2.rectangle(img, (x, y), (x+w, y+h), (0, 255, 0), thickness = 1)
# Get the slice with the text (slice with margins).
afterd = thresh[y-3:y+h+3, x-3:x+w+3]
# Show afterd as image for testing
# cv2.imshow('afterd', afterd)
# cv2.waitKey(100)
# The OCR works only when image is enlarged and black text?
resized_afterd = cv2.resize(afterd, (afterd.shape[1]*5, afterd.shape[0]*5), interpolation=cv2.INTER_LANCZOS4)
print(pytesseract.image_to_string(255 - resized_afterd, lang='eng', config='--psm 3'))
cv2.imshow('thresh', thresh)
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
Result strings after OCR:
DF6DF645
RFFTW
2345
2277
AABBA
DF1267
ABCET5456
Input image with green boxes around the text:
Update:
Grouping contours:
For contours contours you may use the hierarchy result of cv2.findContours with cv2.RETR_TREE.
See Contours Hierarchy documentation.
You may use the parent-child relationship for grouping contours.
Here is an incomplete sample code for using the hierarchy:
img = cv2.imread('img.png') # Read input image
gray = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY) # Convert to Grayscale.
ret, thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU) # Convert to binary and invert polarity
nlabel,labels,stats,centroids = cv2.connectedComponentsWithStats(thresh, connectivity=8)
thresh_boxes = np.zeros_like(thresh)
thresh_size = 100
# Delete all lines by filling large clusters with zeros.
# Make new image that contains only boxes - without text
for i in range(1, nlabel):
if stats[i, cv2.CC_STAT_AREA] > thresh_size:
thresh[labels == i] = 0
thresh_boxes[labels == i] = 255
# Find contours on thresh_boxes, use cv2.RETR_TREE to build tree with hierarchy
contours, hierarchy = cv2.findContours(thresh_boxes, cv2.RETR_TREE, cv2.CHAIN_APPROX_NONE)
# Iterate contours, and hierarchy
for c, i in zip(contours, range(len(contours))):
h = hierarchy[0, i, :]
h_child = h[2]
# if contours has no child (last level)
if h_child == -1:
h_parent = h[3]
x, y, w, h = cv2.boundingRect(c)
cv2.putText(img, str(h_parent), (x+w//2-4, y+h//2+8), fontFace=cv2.FONT_HERSHEY_SIMPLEX, fontScale=1, color=(0, 0, 255), thickness=2)
cv2.imshow('thresh', thresh)
cv2.imshow('img', img)
cv2.waitKey(0)
cv2.destroyAllWindows()
Result:

Drawing bounding boxes with Pytesseract / OpenCV

I'm using pytesseract (0.3.2) with openCV (4.1.2) to identify digits in images. While image_to_string is working, image_to_data and image_to_boxes are not. I need to be able to draw the bounding boxes on the images and this has stumped me. I've tried different images, older versions of pytesseract, etc. I'm using Windows and Jupyter Notebooks.
import cv2
import pytesseract
#erosion
def erode(image):
kernel = np.ones((5,5),np.uint8)
return cv2.erode(image, kernel, iterations = 1)
#grayscale
def get_grayscale(image):
return cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
#thresholding
def thresholding(image):
#return cv2.adaptiveThreshold(image, 255, cv2.ADAPTIVE_THRESH_GAUSSIAN_C, cv2.THRESH_BINARY, 31, 2)
return cv2.threshold(image, 200, 255, cv2.THRESH_BINARY + cv2.THRESH_OTSU)[1]
img = cv2.imread('my_image.jpg')
pytesseract.pytesseract.tesseract_cmd = r'C:\mypath\tesseract.exe'
gray = get_grayscale(img)
thresh = thresholding(gray)
erode = remove_noise(thresh)
custom_config = r'-c tessedit_char_whitelist=0123456789 --psm 6'
print(pytesseract.image_to_string(erode, config=custom_config))
cv2.imwrite("test.jpg", erode)
#these return nothing
print(pytesseract.image_to_boxes(Image.open('test.jpg')))
print(pytesseract.image_to_data(Image.open('test.jpg')))

Instead of using image_to_boxes, an alternative approach is to simply find contours with cv2.findContours, obtain the bounding rectangle coordinates with cv2.boundingRect, and draw the bounding box with cv2.rectangle
Using this sample input image
Drawn boxes
Result from OCR
1234567890
Code
import cv2
import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Load image, grayscale, Otsu's threshold
image = cv2.imread('1.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Draw bounding boxes
cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
x,y,w,h = cv2.boundingRect(c)
cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)
# OCR
data = pytesseract.image_to_string(255 - thresh, lang='eng',config='--psm 6')
print(data)
cv2.imshow('thresh', thresh)
cv2.imshow('image', image)
cv2.waitKey()

Please try the following code:
from pytesseract import Output
import pytesseract
import cv2
image = cv2.imread("my_image.jpg")
#swap color channel ordering from BGR (OpenCV’s default) to RGB (compatible with Tesseract and pytesseract).
# By default OpenCV stores images in BGR format and since pytesseract assumes RGB format,
# we need to convert from BGR to RGB format/mode:
rgb = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
pytesseract.pytesseract.tesseract_cmd = r'C:\mypath\tesseract.exe'
custom_config = r'-c tessedit_char_whitelist=0123456789 --psm 6'
results = pytesseract.image_to_data(rgb, output_type=Output.DICT,lang='eng',config=custom_config)
boxresults = pytesseract.image_to_boxes(rgb,output_type=Output.DICT,lang='eng',config=custom_config)
print(results)
print(boxresults)
for i in range(0, len(results["text"])):
# extract the bounding box coordinates of the text region from the current result
tmp_tl_x = results["left"][i]
tmp_tl_y = results["top"][i]
tmp_br_x = tmp_tl_x + results["width"][i]
tmp_br_y = tmp_tl_y + results["height"][i]
tmp_level = results["level"][i]
conf = results["conf"][i]
text = results["text"][i]
if(tmp_level == 5):
cv2.putText(image, text, (tmp_tl_x, tmp_tl_y - 10), cv2.FONT_HERSHEY_SIMPLEX,0.5, (0, 0, 255), 1)
cv2.rectangle(image, (tmp_tl_x, tmp_tl_y), (tmp_br_x, tmp_br_y), (0, 0, 255), 1)
for j in range(0,len(boxresults["left"])):
left = boxresults["left"][j]
bottom = boxresults["bottom"][j]
right = boxresults["right"][j]
top = boxresults["top"][j]
cv2.rectangle(image, (left, top), (right, bottom), (255, 0, 0), 1)
cv2.imshow("image",image)
cv2.waitKey(0)

Recognize single characters on a page with Tesseract

this image returns empty string;
basically I am trying to make a bot for WOW game, but I am really new to this OCR thing. I cannot make tesseract to read this image; I want an unordered list of characters and if possible coordinates of each square containing them. Is there anyway to do this?
Thank you for your time!
here is my code:
from PIL import Image
import cv2
from pytesseract import image_to_string
column = Image.open('photo.png')
gray = column.convert('L')
blackwhite = gray.point(lambda x: 255 if x < 200 else 0, '1')
blackwhite.save("code_bw.jpg")
print(image_to_string(cv2.imread("code_bw.jpg")))

You need to do some preprocessing to isolate the text characters. A simple approach is to Otsu's threshold to obtain a binary image then we can find contours and filter using aspect ratio + contour area. This will give us the bounding box coordinates of the text where we can draw this onto a mask. We bitwise-and the mask with the input image to get our cleaned image then throw it into OCR. Here's the result:
Detected text characters
Result
Result from OCR
A
A R
P
Code
import cv2
import pytesseract
import numpy as np
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
# Load image, grayscale, Otsu's threshold
image = cv2.imread('1.jpg')
original = image.copy()
mask = np.zeros(image.shape, dtype=np.uint8)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Find contours and filter using aspect ratio and area
cnts = cv2.findContours(thresh, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
area = cv2.contourArea(c)
x,y,w,h = cv2.boundingRect(c)
ar = w / float(h)
if area > 1000 and ar > .85 and ar < 1.2:
cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)
cv2.rectangle(mask, (x, y), (x + w, y + h), (255,255,255), -1)
ROI = original[y:y+h, x:x+w]
# Bitwise-and to isolate characters
result = cv2.bitwise_and(original, mask)
result[mask==0] = 255
# OCR
data = pytesseract.image_to_string(result, lang='eng',config='--psm 6')
print(data)
cv2.imshow('image', image)
cv2.imshow('thresh', thresh)
cv2.imshow('result', result)
cv2.waitKey()

How to detect paragraphs in a text document image for a non-consistent text structure in Python OpenCV

I am trying to identify paragraphs of text in a .pdf document by first converting it into an image then using OpenCV. But I am getting bounding boxes on lines of text instead of paragraphs. How can I set some threshold or some other limit to get paragraphs instead of lines?
Here is the sample input image:
Here is the output I am getting for the above sample:
I am trying to get a single bounding box on the paragraph in the middle. I am using this code.
import cv2
import numpy as np
large = cv2.imread('sample image.png')
rgb = cv2.pyrDown(large)
small = cv2.cvtColor(rgb, cv2.COLOR_BGR2GRAY)
# kernel = cv2.getStructuringElement(cv2.MORPH_ELLIPSE, (3, 3))
kernel = np.ones((5, 5), np.uint8)
grad = cv2.morphologyEx(small, cv2.MORPH_GRADIENT, kernel)
_, bw = cv2.threshold(grad, 0.0, 255.0, cv2.THRESH_BINARY | cv2.THRESH_OTSU)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (9, 1))
connected = cv2.morphologyEx(bw, cv2.MORPH_CLOSE, kernel)
# using RETR_EXTERNAL instead of RETR_CCOMP
contours, hierarchy = cv2.findContours(connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
#For opencv 3+ comment the previous line and uncomment the following line
#_, contours, hierarchy = cv2.findContours(connected.copy(), cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_NONE)
mask = np.zeros(bw.shape, dtype=np.uint8)
for idx in range(len(contours)):
x, y, w, h = cv2.boundingRect(contours[idx])
mask[y:y+h, x:x+w] = 0
cv2.drawContours(mask, contours, idx, (255, 255, 255), -1)
r = float(cv2.countNonZero(mask[y:y+h, x:x+w])) / (w * h)
if r > 0.45 and w > 8 and h > 8:
cv2.rectangle(rgb, (x, y), (x+w-1, y+h-1), (0, 255, 0), 2)
cv2.imshow('rects', rgb)
cv2.waitKey(0)

This is a classic situation for dilate. Whenever you want to connect multiple items together, you can dilate them to join adjacent contours into a single contour. Here's a simple approach:
Obtain binary image. Load the image, convert to grayscale, Gaussian blur, then Otsu's threshold to obtain a binary image.
Connect adjacent words together. We create a rectangular kernel and dilate to merge individual contours together.
Detect paragraphs. From here we find contours, obtain the rectangular bounding rectangle coordinates and highlight the rectangular contours.
Otsu's threshold to obtain a binary image
Here's where the magic happens. We can assume that a paragraph is a section of words that are close together, to achieve this we dilate to connect adjacent words
Result
import cv2
import numpy as np
# Load image, grayscale, Gaussian blur, Otsu's threshold
image = cv2.imread('1.png')
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
blur = cv2.GaussianBlur(gray, (7,7), 0)
thresh = cv2.threshold(blur, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
# Create rectangular structuring element and dilate
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
dilate = cv2.dilate(thresh, kernel, iterations=4)
# Find contours and draw rectangle
cnts = cv2.findContours(dilate, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
cnts = cnts[0] if len(cnts) == 2 else cnts[1]
for c in cnts:
x,y,w,h = cv2.boundingRect(c)
cv2.rectangle(image, (x, y), (x + w, y + h), (36,255,12), 2)
cv2.imshow('thresh', thresh)
cv2.imshow('dilate', dilate)
cv2.imshow('image', image)
cv2.waitKey()

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Segmenting image files with text (and pictures) into blocks - python

Related

What kind of parameters should I use to find and crop objects in an image?

opencv, python, how to read grouped text in boxes

Drawing bounding boxes with Pytesseract / OpenCV

Recognize single characters on a page with Tesseract

How to detect paragraphs in a text document image for a non-consistent text structure in Python OpenCV

Categories

Resources