I am working on a text recognition project.
I have built a classifier using TensorFlow to predict digits, but I would like to implement a more complex text recognition pipeline using text localization and text segmentation (separating each character). However, I couldn't find implementations for those parts of the algorithm.
So, do you know any algorithms, implementations, or tips, using TensorFlow, for localizing and segmenting text in natural scene pictures (specifically, localizing and segmenting the text on scoreboards in sports pictures)?
Thank you very much for any help.
To group elements on a page, like paragraphs of text and images, you can use a clustering algorithm and/or blob detection with some thresholds.
You can use the Radon transform to recognize lines and detect the skew of a scanned page.
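For the blob-detection idea, a rough sketch with OpenCV connected components might look like this (the file name, kernel size, and thresholds are just guesses you would tune for your images):

import cv2

# assume a grayscale page image; binarize, merge neighbouring characters into
# blobs, then treat each connected component as a candidate text block
gray = cv2.imread('page.png', cv2.IMREAD_GRAYSCALE)
binary = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# dilate with a wide kernel so characters in the same word/line join together
merged = cv2.dilate(binary, cv2.getStructuringElement(cv2.MORPH_RECT, (15, 5)))

n, labels, stats, centroids = cv2.connectedComponentsWithStats(merged)
for x, y, w, h, area in stats[1:]:      # stats[0] is the background component
    if area > 100:                      # ignore tiny specks
        cv2.rectangle(gray, (x, y), (x + w, y + h), 0, 2)
cv2.imwrite('blocks.png', gray)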
I think that for character separation you will have to mess with fonts. Some polynomial matching/fitting or something. (This is a very wild guess for now, so don't take it seriously.)
But a similar approach would allow you to get the characters out of the line and recognize them in the same step.
As for recognition, once you have a character, there is a nice trigonometric trick of comparing the angles of the character to the angles stored in a database.
It works great on handwriting too.
I am not an expert on how page segmentation exactly works, but it seems that I am on my way to becoming one, since I am currently working on a project that includes it.
So give me a month and I'll be able to tell you more. :D
Anyway, you should go and read the Tesseract code to see how HP and Google did it there. It should give you some pretty good ideas.
Good luck!
After you are done with object detection, you can perform text detection and pass the results on to Tesseract. There are multiple ways to enhance the image before passing it to the detector function.
Reference Papers
https://arxiv.org/abs/1704.03155v2
https://arxiv.org/pdf/2002.07662.pdf
import cv2
import numpy as np
from imutils.object_detection import non_max_suppression

# The EAST model is assumed to be loaded once, outside the function, e.g.:
# net = cv2.dnn.readNet("frozen_east_text_detection.pb")

def text_detector(image):
    #hasFrame, image = cap.read()
    # keep a reference to the original image for drawing the final boxes
    orig = image
    (H, W) = image.shape[:2]

    # EAST requires input dimensions that are multiples of 32
    (newW, newH) = (640, 320)
    rW = W / float(newW)
    rH = H / float(newH)

    image = cv2.resize(image, (newW, newH))
    (H, W) = image.shape[:2]

    # output layers: text/no-text scores and bounding-box geometry
    layerNames = [
        "feature_fusion/Conv_7/Sigmoid",
        "feature_fusion/concat_3"]

    blob = cv2.dnn.blobFromImage(image, 1.0, (W, H),
        (123.68, 116.78, 103.94), swapRB=True, crop=False)
    net.setInput(blob)
    (scores, geometry) = net.forward(layerNames)

    (numRows, numCols) = scores.shape[2:4]
    rects = []
    confidences = []

    for y in range(0, numRows):
        scoresData = scores[0, 0, y]
        xData0 = geometry[0, 0, y]
        xData1 = geometry[0, 1, y]
        xData2 = geometry[0, 2, y]
        xData3 = geometry[0, 3, y]
        anglesData = geometry[0, 4, y]

        # loop over the number of columns
        for x in range(0, numCols):
            # if our score does not have sufficient probability, ignore it
            if scoresData[x] < 0.5:
                continue

            # compute the offset factor as our resulting feature maps will
            # be 4x smaller than the input image
            (offsetX, offsetY) = (x * 4.0, y * 4.0)

            # extract the rotation angle for the prediction and then
            # compute the sine and cosine
            angle = anglesData[x]
            cos = np.cos(angle)
            sin = np.sin(angle)

            # use the geometry volume to derive the width and height of
            # the bounding box
            h = xData0[x] + xData2[x]
            w = xData1[x] + xData3[x]

            # compute both the starting and ending (x, y)-coordinates for
            # the text prediction bounding box
            endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
            endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
            startX = int(endX - w)
            startY = int(endY - h)

            # add the bounding box coordinates and probability score to
            # our respective lists
            rects.append((startX, startY, endX, endY))
            confidences.append(scoresData[x])

    boxes = non_max_suppression(np.array(rects), probs=confidences)

    for (startX, startY, endX, endY) in boxes:
        # scale the bounding box coordinates back to the original image size
        startX = int(startX * rW)
        startY = int(startY * rH)
        endX = int(endX * rW)
        endY = int(endY * rH)

        # draw the bounding box on the image
        cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 3)

    return orig
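As a rough sketch of the "pass it on to Tesseract" step, assuming pytesseract is installed and text_detector is adapted to also return the scaled boxes (the ocr_boxes helper below is hypothetical):

import pytesseract

def ocr_boxes(orig, boxes):
    # `boxes` are (startX, startY, endX, endY) tuples scaled back to the
    # original image; run Tesseract on each cropped region
    texts = []
    for (startX, startY, endX, endY) in boxes:
        roi = orig[max(0, startY):endY, max(0, startX):endX]
        # --psm 7 treats each crop as a single line of text
        texts.append(pytesseract.image_to_string(roi, config="--psm 7").strip())
    return texts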
I'm trying to capture the position of a license plate from a webcam feed using YOLOv4-tiny, then pass the result to easyOCR to extract the characters. The detection works well in real time; however, when I apply the OCR, the webcam stream becomes really laggy. Is there any way I can improve this code to make it less laggy?
my YOLOv4 detection
#detection
while 1:
    #_, pre_img = cap.read()
    #pre_img= cv2.resize(pre_img, (640, 480))
    _, img = cap.read()
    #img = cv2.flip(pre_img,1)
    hight, width, _ = img.shape
    blob = cv2.dnn.blobFromImage(img, 1 / 255, (416, 416), (0, 0, 0), swapRB=True, crop=False)
    net.setInput(blob)
    output_layers_name = net.getUnconnectedOutLayersNames()
    layerOutputs = net.forward(output_layers_name)

    boxes = []
    confidences = []
    class_ids = []
    for output in layerOutputs:
        for detection in output:
            score = detection[5:]
            class_id = np.argmax(score)
            confidence = score[class_id]
            if confidence > 0.7:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * hight)
                w = int(detection[2] * width)
                h = int(detection[3] * hight)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append((float(confidence)))
                class_ids.append(class_id)
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, .5, .4)

    boxes = []
    confidences = []
    class_ids = []
    for output in layerOutputs:
        for detection in output:
            score = detection[5:]
            class_id = np.argmax(score)
            confidence = score[class_id]
            if confidence > 0.5:
                center_x = int(detection[0] * width)
                center_y = int(detection[1] * hight)
                w = int(detection[2] * width)
                h = int(detection[3] * hight)
                x = int(center_x - w / 2)
                y = int(center_y - h / 2)
                boxes.append([x, y, w, h])
                confidences.append((float(confidence)))
                class_ids.append(class_id)
    indexes = cv2.dnn.NMSBoxes(boxes, confidences, .8, .4)

    font = cv2.FONT_HERSHEY_PLAIN
    colors = np.random.uniform(0, 255, size=(len(boxes), 3))
    if len(indexes) > 0:
        for i in indexes.flatten():
            x, y, w, h = boxes[i]
            label = str(classes[class_ids[i]])
            confidence = str(round(confidences[i], 2))
            color = colors[i]
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            # detection= cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            detected_image = img[y:y+h, x:x+w]
            cv2.putText(img, label + " " + confidence, (x, y + 400), font, 2, color, 2)
            #print(detected_image)
            cv2.imshow('detection', detected_image)
            cv2.imwrite('lp5.jpg', detected_image)
            cropped_image = cv2.imread('lp5.jpg')
            cv2.waitKey(5000)
            print("system is waiting")
            result = OCR(cropped_image)
            print(result)
my easyOCR function
def OCR(cropped_image):
    reader = easyocr.Reader(['en'], gpu=False)  # what the reader expects from the image
    result = reader.readtext(cropped_image)
    text = ''
    for result in result:
        text += result[1] + ' '
    spliced = (remove(text))
    return spliced
There are several points.
cv2.waitKey(5000) in your loop blocks for up to 5 seconds per iteration (and still adds some delay even if you keep pressing a key). Remove it if you are not debugging.
You are saving the detected region to a JPEG file and reading it back each time. Instead, pass the region (a NumPy array) to the OCR module directly.
EasyOCR is a DNN model based on ResNet, but you are not using a GPU (gpu=False). Use the GPU if one is available. (See this benchmark by Liao.)
You are creating an easyocr.Reader instance on every loop iteration. Creating it requires loading and initializing a DNN model, which is a huge workload and the main bottleneck. Create a single instance before the loop and reuse it inside the loop.
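A minimal sketch of points 2-4, assuming your own remove() helper from the question handles the post-processing:

import easyocr

# create the reader once at startup (and use the GPU if you have one);
# reuse the same instance for every frame
reader = easyocr.Reader(['en'], gpu=True)

def OCR(cropped_image):
    # cropped_image is the NumPy array sliced from the frame; no temp JPEG,
    # no cv2.imwrite/cv2.imread round trip
    results = reader.readtext(cropped_image)
    text = ' '.join(r[1] for r in results)
    return remove(text)   # your own cleanup helper from the question

Inside the detection loop you would then call OCR(detected_image) directly and drop the cv2.imwrite, cv2.imread, and cv2.waitKey(5000) lines.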
You are essentially saying "the while loop must be fast," and of course the OCR() call is a bit slow. Ok, good. Don't call OCR() from within the loop. Rather, enqueue a request, and let another thread / process / host worry about the OCR computation, while the loop quickly continues upon its merry way. You could use a threaded Queue, or a subprocess, or blast it over to RabbitMQ or Kafka.
The simplest approach would be to simply overwrite /tmp/cropped_image.png within the loop, and have another process notice such updates and (slowly) call OCR(), appending the results to a log file. There might be a couple of updates to the image file while a single OCR call is in progress, and that's fine. The two are decoupled from one another, each progressing at its own pace. The downside of a queue would be OCR sometimes falling behind -- you actually want to shed load by skipping some (redundant) cropped images. The two are racing, and that's fine.
But take care to do things in an atomic fashion -- you wouldn't want to OCR an image that starts with one frame and ends with part of a subsequent frame. Write to a temp file and, after close(), use os.rename() to atomically make those pixels available under the name that the OCR daemon will read from. Once it has a file descriptor open for read, it will have no problem reading to EOF without interference; the kernel takes care of that for us.
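For the threaded-queue variant, a minimal sketch might look like this (OCR() is the function from the question; the maxsize=1 queue is one way to shed load by skipping redundant crops):

import queue
import threading

ocr_queue = queue.Queue(maxsize=1)   # hold at most one pending crop

def ocr_worker():
    while True:
        crop = ocr_queue.get()       # block until the capture loop hands us a crop
        print(OCR(crop))             # the slow OCR runs off the capture thread
        ocr_queue.task_done()

threading.Thread(target=ocr_worker, daemon=True).start()

# in the capture loop, replace the direct OCR call with a non-blocking enqueue:
# try:
#     ocr_queue.put_nowait(detected_image)
# except queue.Full:
#     pass                           # worker is busy; skip this (redundant) crop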
I am trying to create a GAN model that will remove watermarks. After doing some homework, I got to this Google AI Blog, which makes things worse. Thus I need to create a dataset from websites like Shutterstock, Adobe Stock, Fotolia, Canstock and many more.
So, when I tried to find the same images using reverse image search, I found out that the resolutions and the images themselves are changed, which makes it even worse.
Thus, I'm only left with creating a custom dataset carrying the same kind of watermark as those websites, which is why I need to apply a similar watermark to images from Unsplash and so on.
Can anyone please help me create the same kind of watermark we get from Shutterstock and Adobe Stock? It'd be a great help.
Note: I have gone through this link for watermarking using ImageMagick, but I need it in Python. If someone can show me a way of doing the same in Python, that'd be a great help.
EDIT1: If you look at this example from Shutterstock and zoom in, you will find that not only the lines but also the text and rounded symbols are curved, and the name and rounded symbol appear with different opacity. That's what I want to replicate.
Here is one way to do that in Python/OpenCV.
Read the input
Create an image of the text
Rotate the text image
Tile out the rotated text image to the size of the input
Blend the tiled, rotated text image with the input image
Save the output
Input:
import cv2
import numpy as np
import math
text = "WATERMARK"
thickness = 2
scale = 0.75
pad = 5
angle = -45
blend = 0.25
def rotate_bound(image, angle):
    # function to rotate an image
    # from https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py
    # grab the dimensions of the image and then determine the center
    (h, w) = image.shape[:2]
    (cX, cY) = (w / 2, h / 2)

    # grab the rotation matrix (applying the negative of the
    # angle to rotate clockwise), then grab the sine and cosine
    # (i.e., the rotation components of the matrix)
    M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0)
    cos = np.abs(M[0, 0])
    sin = np.abs(M[0, 1])

    # compute the new bounding dimensions of the image
    nW = int((h * sin) + (w * cos))
    nH = int((h * cos) + (w * sin))

    # adjust the rotation matrix to take into account translation
    M[0, 2] += (nW / 2) - cX
    M[1, 2] += (nH / 2) - cY

    # perform the actual rotation and return the image
    return cv2.warpAffine(image, M, (nW, nH))
# read image
photo = cv2.imread('lena.jpg')
ph, pw = photo.shape[:2]
# determine size for text image
(wd, ht), baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, scale, thickness)
print (wd, ht, baseLine)
# add text to black background image padded all around
pad2 = 2 * pad
text_img = np.zeros((ht+pad2,wd+pad2,3), dtype=np.uint8)
text_img = cv2.putText(text_img, text, (pad,ht+pad), cv2.FONT_HERSHEY_SIMPLEX, scale, (255,255,255), thickness)
# rotate text image
text_rot = rotate_bound(text_img, angle)
th, tw = text_rot.shape[:2]
# tile the rotated text image to the size of the input
xrepeats = math.ceil(pw/tw)
yrepeats = math.ceil(ph/th)
print(yrepeats,xrepeats)
tiled_text = np.tile(text_rot, (yrepeats,xrepeats,1))[0:ph, 0:pw]
# combine the text with the image
result = cv2.addWeighted(photo, 1, tiled_text, blend, 0)
# save results
cv2.imwrite("text_img.png", text_img)
cv2.imwrite("text_img_rot.png", text_rot)
cv2.imwrite("lena_tiled_rotated_text_img.jpg", result)
# show the results
cv2.imshow("text_img", text_img)
cv2.imshow("text_rot", text_rot)
cv2.imshow("tiled_text", tiled_text)
cv2.imshow("result", result)
cv2.waitKey(0)
Text Image:
Rotated Text Image:
Result:
Here is another variation in Python/OpenCV that does outline font for the watermark. I have made the font size larger so that the outline is more visible.
import cv2
import numpy as np
import math
text = "WATERMARK"
thickness = 2
scale = 1.5
pad = 5
angle = -45
blend = 0.4
# function to rotate an image
def rotate_bound(image, angle):
    # from https://github.com/PyImageSearch/imutils/blob/master/imutils/convenience.py
    # grab the dimensions of the image and then determine the center
    (h, w) = image.shape[:2]
    (cX, cY) = (w / 2, h / 2)

    # grab the rotation matrix (applying the negative of the
    # angle to rotate clockwise), then grab the sine and cosine
    # (i.e., the rotation components of the matrix)
    M = cv2.getRotationMatrix2D((cX, cY), -angle, 1.0)
    cos = np.abs(M[0, 0])
    sin = np.abs(M[0, 1])

    # compute the new bounding dimensions of the image
    nW = int((h * sin) + (w * cos))
    nH = int((h * cos) + (w * sin))

    # adjust the rotation matrix to take into account translation
    M[0, 2] += (nW / 2) - cX
    M[1, 2] += (nH / 2) - cY

    # perform the actual rotation and return the image
    return cv2.warpAffine(image, M, (nW, nH))
# read image
photo = cv2.imread('lena.jpg')
ph, pw = photo.shape[:2]
# determine size for text image
(wd, ht), baseLine = cv2.getTextSize(text, cv2.FONT_HERSHEY_SIMPLEX, scale, thickness)
print (wd, ht, baseLine)
# add text to black background image padded all around
# write thicker white text and then write over that with thinner gray text to make outline text
pad2 = 2 * pad
text_img = np.zeros((ht+pad2,wd+pad2,3), dtype=np.uint8)
text_img = cv2.putText(text_img, text, (pad,ht+pad), cv2.FONT_HERSHEY_SIMPLEX, scale, (256,256,256), thickness+3)
text_img = cv2.putText(text_img, text, (pad,ht+pad), cv2.FONT_HERSHEY_SIMPLEX, scale, (128,128,128), thickness)
# rotate text image
text_rot = rotate_bound(text_img, angle)
th, tw = text_rot.shape[:2]
# tile the rotated text image to the size of the input
xrepeats = math.ceil(pw/tw)
yrepeats = math.ceil(ph/th)
print(yrepeats,xrepeats)
tiled_text = np.tile(text_rot, (yrepeats,xrepeats,1))[0:ph, 0:pw]
# combine the text with the image
#result = cv2.addWeighted(photo, 1, tiled_text, blend, 0)
mask = blend * cv2.threshold(tiled_text, 0, 255, cv2.THRESH_BINARY)[1]
result = (mask * tiled_text.astype(np.float64) + (255-mask)*photo.astype(np.float64))/255
result = result.clip(0,255).astype(np.uint8)
# save results
cv2.imwrite("text_img.png", text_img)
cv2.imwrite("text_img_rot.png", text_rot)
cv2.imwrite("lena_tiled_rotated_text_img2.jpg", result)
# show the results
cv2.imshow("text_img", text_img)
cv2.imshow("text_rot", text_rot)
cv2.imshow("tiled_text", tiled_text)
cv2.imshow("result", result)
cv2.waitKey(0)
Result:
I am currently working on creating Python software that tracks players on a soccer field. I got the player detection working with YOLOv3 and was able to output quite a nice result with player centroids and boxes drawn. What I want to do now is translate the players' positions and project their centroids onto a PNG/JPG of a soccer field. For this I intended to use two arrays of reference points, one for the soccer-field image and one for the source video. But my question now is: how do I translate the coordinates of the centroids to the soccer-field image?
Similar example:
Example
How the boxes and markers are drawn:
def draw_labels_and_boxes(img, boxes, confidences, classids, idxs, colors, labels):
    # If there are any detections
    if len(idxs) > 0:
        for i in idxs.flatten():
            # Get the bounding box coordinates
            x, y = boxes[i][0], boxes[i][1]
            w, h = boxes[i][2], boxes[i][3]

            # Draw the bounding box rectangle and label on the image
            cv.rectangle(img, (x, y), (x + w, y + h), (255, 255, 255), 2)
            cv.drawMarker(img, (int(x + w / 2), int(y + h / 2)), (x, y), 0, 20, 3)
    return img
Boxes are generated like this:
def generate_boxes_confidences_classids(outs, height, width, tconf):
    boxes = []
    confidences = []
    classids = []

    for out in outs:
        for detection in out:
            # print (detection)
            # a = input('GO!')

            # Get the scores, classid, and the confidence of the prediction
            scores = detection[5:]
            classid = np.argmax(scores)
            confidence = scores[classid]

            # Consider only the predictions that are above a certain confidence level
            if confidence > tconf:
                # TODO Check detection
                box = detection[0:4] * np.array([width, height, width, height])
                centerX, centerY, bwidth, bheight = box.astype('int')

                # Using the center x, y coordinates to derive the top
                # and the left corner of the bounding box
                x = int(centerX - (bwidth / 2))
                y = int(centerY - (bheight / 2))

                # Append to list
                boxes.append([x, y, int(bwidth), int(bheight)])
                confidences.append(float(confidence))
                classids.append(classid)

    return boxes, confidences, classids
Assuming a stationary camera,
Find the coordinates of the four corners of the field.
Find the corresponding four corners in the top-view image that you want to create.
Find a homography matrix using these two sets of points. You can use OpenCV's findHomography for this.
Transform all the centroids using this homography matrix; that should give you your coordinates in the new image space. For point coordinates you can use perspectiveTransform (warpPerspective is the equivalent for warping whole images), as sketched below.
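A minimal sketch of steps 3 and 4, with made-up corner coordinates standing in for your two sets of reference points:

import cv2
import numpy as np

# hypothetical reference points: four field corners in the video frame and the
# corresponding corners in the top-view soccer-field image
src_pts = np.float32([[112, 48], [1180, 62], [1250, 700], [40, 690]])
dst_pts = np.float32([[0, 0], [1050, 0], [1050, 680], [0, 680]])

H, _ = cv2.findHomography(src_pts, dst_pts)

# centroids from the detector, shaped (N, 1, 2) as perspectiveTransform expects
centroids = np.float32([[[640, 360]], [[300, 500]]])
projected = cv2.perspectiveTransform(centroids, H)
print(projected)  # player positions in top-view image coordinates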
Recently, during the COVID-19 pandemic, many developers built social-distancing monitoring systems, and a few of them also implemented a "Bird's Eye View" of the scene. Your problem is quite similar. As external links are not accepted here, I am not able to post the exact link(s); please check their code on GitHub.
I have a number of blueprints from which I would like to detect the numbers, so that I can turn them into proper models.
For example, I have the following image and would like to extract all the numbers from it, so I ran the following code:
import pytesseract
from pytesseract import Output
import cv2
import numpy as np

img = cv2.imread('vdb7C.jpg')
custom_config = r'--oem 2 --psm 10'
d = pytesseract.image_to_data(img, config=custom_config, lang='eng', output_type=Output.DICT)
n_boxes = len(d['level'])
for i in range(n_boxes):
    text = d["text"][i]
    print(text + str(str.isdigit(text)))
    if str.isdigit(text):
        (x, y, w, h) = (d['left'][i], d['top'][i], d['width'][i], d['height'][i])
        cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
cv2.imwrite("output.jpg", img)
This gave me the following result: As you can see, it correctly identifies a number of the numbers on the blueprint; however, it misses quite a few others and falsely detects a few that aren't really there. I care more about getting all the numbers than about a few false positives, but I would still like to keep those to a minimum, so any suggestions there?
I have already tried thinning operations, rescaling, rotating, and smoothing the images, but none of those appear to make much difference. Extreme rescaling (×0.1 or ×10) does change a few things, but any gains made in one part of the image are undone by faults appearing in other parts.
Especially difficult are situations such as on the left building, where we have lines and numbers close to, or even overlapping, parts of the design.
Here we see two examples of such situations.
Also note that font usage is not consistent between images.
It's worth noting that the lines are almost always noticeably thinner than the font used for the numbers, so perhaps something could be done with that?
I have also tried using the EAST text detector with the following code:
from imutils.object_detection import non_max_suppression

img = cv2.imread('vdb7C.jpg')
W = 5664
H = 4000
dim = (W, H)
img = cv2.resize(img, dim, interpolation=cv2.INTER_AREA)
confidence = 0.5  # minimum detection score (value assumed; not shown in the original snippet)
net = cv2.dnn.readNet("frozen_east_text_detection.pb")
blob = cv2.dnn.blobFromImage(img, 1.0, (W, H),
    (123.68, 116.78, 103.94), swapRB=True, crop=False)
net.setInput(blob)
(scores, geometry) = net.forward(["feature_fusion/Conv_7/Sigmoid",
    "feature_fusion/concat_3"])
(numRows, numCols) = scores.shape[2:4]
rects = []
confidences = []
# loop over the number of rows
for y in range(0, numRows):
    # extract the scores (probabilities), followed by the geometrical
    # data used to derive potential bounding box coordinates that
    # surround text
    scoresData = scores[0, 0, y]
    xData0 = geometry[0, 0, y]
    xData1 = geometry[0, 1, y]
    xData2 = geometry[0, 2, y]
    xData3 = geometry[0, 3, y]
    anglesData = geometry[0, 4, y]
    for x in range(0, numCols):
        if scoresData[x] < confidence:
            continue
        (offsetX, offsetY) = (x * 4.0, y * 4.0)
        angle = anglesData[x]
        cos = np.cos(angle)
        sin = np.sin(angle)
        h = xData0[x] + xData2[x]
        w = xData1[x] + xData3[x]
        endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
        endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
        startX = int(endX - w)
        startY = int(endY - h)
        rects.append((startX, startY, endX, endY))
        confidences.append(scoresData[x])
boxes = non_max_suppression(np.array(rects), probs=confidences)
for box in boxes:
    (y, h, x, w) = box
    print(box)
    print(np.shape(img))
    cv2.rectangle(img, (x, y), (x + w, y + h), (0, 255, 0), 2)
cv2.imwrite("output.jpg", img)
However, this causes quite a number of bounding boxes to fall outside of the image, and in general the bounding boxes seem unrelated to the content, so does anyone know what's going on there?
Any suggestions? I have 8000 images right now and need to eventually process a total of about 400k images.
I suggest using a neural-network-based solution like keras-ocr, which applies CRAFT for detection and a CRNN for recognition. It does a better job of detecting text that overlaps the design. This is what I got using it out of the box:
import matplotlib.pyplot as plt
import keras_ocr
detector = keras_ocr.detection.Detector()
image = keras_ocr.tools.read('vdb7C.jpg')
boxes = detector.detect(images=[image])[0]
canvas = keras_ocr.tools.drawBoxes(image, boxes)
plt.imshow(canvas)
Result:
Run your Tesseract piece of code, but only use results with 3 or more digits. This should provide you with enough good examples of digits. Extract each digit to a separate file and save their positions. Now you can go two ways.
You can go the simple way if you see that the fonts of the digits are quite similar. Then you can create a set of templates for the digits (say 15-30). Remember that you can get the size of the digits for a specific image? Resize your digit templates to the right size and run the most trivial template matching, as sketched below. This will definitely create some false detections (especially for "1"s), and you will have to find a way to reduce their number to an acceptable level.
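A minimal template-matching sketch along those lines; 'digit_3.png' and the 0.8 score threshold are placeholders you would tune:

import cv2
import numpy as np

# 'digit_3.png' is a hypothetical template cropped from one of the good
# Tesseract detections, already resized to the expected digit size
img = cv2.imread('vdb7C.jpg', cv2.IMREAD_GRAYSCALE)
template = cv2.imread('digit_3.png', cv2.IMREAD_GRAYSCALE)
th, tw = template.shape[:2]

res = cv2.matchTemplate(img, template, cv2.TM_CCOEFF_NORMED)
ys, xs = np.where(res >= 0.8)            # keep only strong matches
for (x, y) in zip(xs, ys):
    cv2.rectangle(img, (x, y), (x + tw, y + th), 0, 2)
cv2.imwrite('template_matches.jpg', img)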
The more complex way is to build a custom CNN detector and train it on your data. From the first stage you will get several hundred examples of digits (and their positions) that you want to detect. You can look at this project or this one as references. This article can also provide you with some guidance.
One more thing that can be useful: your images have lots of long perpendicular lines. If you align them with the axes, you can remove the lines very easily by binarizing the original, shifting the result (right or down) by several pixels, and ANDing them. This will leave only the long lines. Find their length, and you will be able to remove lines above a certain length from the original image.
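A rough sketch of that shift-and-AND idea (the 15-pixel shift and the white fill assume dark lines on a light background; it also skips the length-filtering step and simply erases whatever survives the AND):

import cv2
import numpy as np

img = cv2.imread('vdb7C.jpg', cv2.IMREAD_GRAYSCALE)
binary = cv2.threshold(img, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]

# shift the binary image down by 15 pixels and AND with the original:
# only structures that extend at least ~15 px vertically (the long lines) survive
shifted = np.zeros_like(binary)
shifted[15:, :] = binary[:-15, :]
long_lines = cv2.bitwise_and(binary, shifted)

# dilate the surviving line pixels a little and paint them out of the original
line_mask = cv2.dilate(long_lines, np.ones((3, 3), np.uint8), iterations=2)
cleaned = img.copy()
cleaned[line_mask > 0] = 255
cv2.imwrite('lines_removed.jpg', cleaned)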
The full error says:
cv2.error: OpenCV(4.1.0) C:\projects\opencv-python\opencv\modules\dnn\src\caffe\caffe_io.cpp:1132: error: (-2:Unspecified error) FAILED: fs.is_open(). Can't open "frozen_east_text_detection.pb" in function 'c
Here is my code:
import time
from imutils.object_detection import non_max_suppression
import numpy as np
import cv2
import argparse
imagePath = "C:/xampp/htdocs/Tensorflow/TextDetection-master/images/tabla1.jpg"
east = "frozen_east_text_detection.pb"
newW = 640
newH = 480
min_confidence = 0.5
image = cv2.imread(imagePath) # it is working variable
orig = image.copy()
(H,W) = image.shape[:2]
# set the new width and height and then determine the ratio in change
rW = W / float(newW)
rH = H / float(newH)
# resize the image and grab the new image dimensions
image = cv2.resize(image, (newW, newH))
(H,W) = image.shape[:2]
"""
In order to perform text detection using OpenCV and the EAST deep learning model,
we need to extract the output feature maps of TWO LAYERS
"""
# Define the TWO output layer names for the EAST detector model
# FIRST LAYER : output probabilities of a region containing text or not.
# SECOND LAYER: bounding box coordinates of text.
layerNames = [
"feature_fusion/Conv_7/Sigmoid",
"feature_fusion/concat_3"
]
print("[INFO] loading EAST detector")
net = cv2.dnn.readNet( east ) # Load the neural network into memory
# construct a blob from the image and then perform a forward pass of
# the model to obtain the two output layer sets
blob = cv2.dnn.blobFromImage( #https://www.pyimagesearch.com/2017/11/06/deep-learning-opencvs-blobfromimage-works/
image , 1.0 , (W,H) , (123.68, 116.78, 103.94) , swapRB=True, crop=False
)
start = time.time()
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)
end = time.time()
print("[INFO] text detection took {:.6f} seconds".format(end-start))
# grab the number of rows and columns from the scores volume, then
# initialize our set of bounding box rectangles and corresponding
# confidence scores
(numRows, numCols) = scores.shape[2:4]
rects = []
confidences = []
# loop over the number of rows
for y in range(0, numRows):
    # extract the scores (probabilities), followed by the geometrical
    # data used to derive potential bounding box coordinates that
    # surround text
    scoresData = scores[0, 0, y]
    xData0 = geometry[0, 0, y]
    xData1 = geometry[0, 1, y]
    xData2 = geometry[0, 2, y]
    xData3 = geometry[0, 3, y]
    anglesData = geometry[0, 4, y]

    # loop over the number of columns
    for x in range(0, numCols):
        # if our score does not have sufficient probability, ignore it
        if (scoresData[x] < min_confidence):
            continue

        # compute the offset factor as our resulting feature maps will
        # be 4x smaller than the input image
        (offsetX, offsetY) = (x * 4.0, y * 4.0)

        # extract the rotation angle for the prediction and then
        # compute the sine and cosine
        angle = anglesData[x]
        cos = np.cos(angle)
        sin = np.sin(angle)

        # use the geometry volume to derive the width and height of
        # the bounding box
        h = xData0[x] + xData2[x]
        w = xData1[x] + xData3[x]

        # compute both the starting and ending (x, y)-coordinates for
        # the text prediction bounding box
        endX = int(offsetX + (cos * xData1[x]) + (sin * xData2[x]))
        endY = int(offsetY - (sin * xData1[x]) + (cos * xData2[x]))
        startX = int(endX - w)
        startY = int(endY - h)

        # add the bounding box coordinates and probability score to
        # our respective lists
        rects.append((startX, startY, endX, endY))
        confidences.append(scoresData[x])
#apply non-maxima suppression to suppress weak, overlapping bounding boxes
boxes = non_max_suppression(np.array(rects) , probs=confidences) #imutils-> https://github.com/jrosebr1/imutils/blob/master/imutils/object_detection.py#L4
# loop over the bounding boxes
for (startX, startY, endX, endY) in boxes:
    # scale the bounding box coordinates based on the respective ratios
    startX = int(startX * rW)
    startY = int(startY * rH)
    endX = int(endX * rW)
    endY = int(endY * rH)

    # draw the bounding box on the image
    cv2.rectangle(orig, (startX, startY), (endX, endY), (0, 255, 0), 2)

# show the output image
cv2.imshow("Text detection", orig)
cv2.waitKey(0)
cv2.destroyAllWindows()
You can fix this by providing the absolute file path of frozen_east_text_detection.pb when loading the model in your code.
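For example, one minimal way to build that absolute path in the script itself (assuming the .pb file sits next to the script):

import os
import cv2

# build an absolute path relative to this script so the model is found no
# matter which directory you run the script from
model_path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                          "frozen_east_text_detection.pb")
net = cv2.dnn.readNet(model_path)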
Use this link to download the model:
https://www.dropbox.com/s/r2ingd0l3zt8hxs/frozen_east_text_detection.tar.gz?dl=1
For further reference, see this LearnOpenCV post on deep-learning-based text detection:
https://learnopencv.com/deep-learning-based-text-detection-using-opencv-c-python/