Related
I am trying to detect objects in a certain area using yolov7 and deepSORT algorithm, but in the results I get, I see that the IDs are always changing. I leave 3 photos for you to understand.
As you can see the IDs are different in all frames.
`
#class base virtual zone tracking
import random
import torch
import numpy as np
from models.experimental import attempt_load
from utils.torch_utils import TracedModel
from utils.datasets import letterbox
from utils.plots import plot_one_box, plot_one_box_center
from utils.general import check_img_size, non_max_suppression, scale_coords
import cv2
import time
from google.colab.patches import cv2_imshow
from shapely.geometry import Point
from shapely.geometry.polygon import Polygon
#deep sort
import os
import tensorflow as tf
physical_devices = tf.config.experimental.list_physical_devices('GPU')
if len(physical_devices) > 0:
tf.config.experimental.set_memory_growth(physical_devices[0], True)
from tensorflow.compat.v1 import ConfigProto
from deep_sort.tracker import Tracker
from deep_sort.detection import Detection
import matplotlib.pyplot as plt
from deep_sort import preprocessing, nn_matching
from tracking_helpers import read_class_names, create_box_encoder
from detection_helpers import *
class YOLOv7:
def __init__(self, weights: str, image_size:int,device:str):
self.device = device
self.weights = weights
self.model = attempt_load(self.weights, map_location=self.device) # Model Load FP32
self.stride = int(self.model.stride.max())
self.image_size = check_img_size(image_size, self.stride)
if self.device != 'cpu':
self.half = True
else:
self.half = False
if self.half:
self.model.half() # FP16
self.names = self.model.module.names if hasattr(self.model , 'module') else self.model.names
color_values = [[random.randint(0, 255) for _ in range(3)] for _ in range(len(self.names))]
self.colors = {i:color_values[i] for i in range(len(self.names))}
def detect(self, raw_image: np.ndarray, conf_thresh =0.45, iou_thresh =0.45, classes = [0]): #default class people
# Run inference
if self.device != 'cpu':
self.model(torch.zeros(1, 3, self.image_size, self.image_size).to(self.device).type_as(next(self.model.parameters())))
with torch.no_grad():
image = letterbox(raw_image, self.image_size, stride=self.stride)[0]
image = image[:, :, ::-1].transpose(2, 0, 1)
image = np.ascontiguousarray(image)
image = torch.from_numpy(image).to(self.device)
image = image.half() if self.half else image.float()
image /= 255.0
if image.ndimension() == 3:
image = image.unsqueeze(0)
# Inference
detections = self.model(image, augment=False)[0]
# Apply NMS
detections = non_max_suppression(detections, conf_thresh, iou_thresh, classes=classes, agnostic=False)[0]
# Rescale boxes from img_size to raw image size
detections[:, :4] = scale_coords(image.shape[2:], detections[:, :4], raw_image.shape).round()
return detections
def tracking(self, video_frame, yolo_dets, inside_poly = True, count_objects:bool=False,verbose=False, reID_model_path = "./deep_sort/model_weights/mars-small128.pb", nms_max_overlap:float=1.0, max_cosine_distance:float=0.4, nn_budget:float=None):
class_names = read_class_names()
encoder = create_box_encoder(reID_model_path, batch_size=1)
nms_max_overlap = nms_max_overlap
metric = nn_matching.NearestNeighborDistanceMetric("cosine", max_cosine_distance, nn_budget)
tracker = Tracker(metric)
*xyxy, conf, cls = yolo_dets
frame = cv2.cvtColor(video_frame, cv2.COLOR_BGR2RGB)
if yolo_dets is None:
bboxes = []
scores = []
classes = []
num_objects = 0
else:
bboxes = yolo_dets[:,:4]
bboxes[:,2] = bboxes[:,2] - bboxes[:,0] # convert from xyxy to xywh
bboxes[:,3] = bboxes[:,3] - bboxes[:,1]
scores = yolo_dets[:,4]
classes = yolo_dets[:,-1]
num_objects = bboxes.shape[0]
#how many object you track
names = []
for i in range(num_objects): # loop through objects and use class index to get class name
class_indx = int(classes[i])
class_name = class_names[class_indx]
names.append(class_name)
names = np.array(names)
count = len(names)
if count_objects:
cv2.putText(frame, "both inside and outside the polygon detection: {}".format(count), (5, 35), cv2.FONT_HERSHEY_COMPLEX_SMALL, 1.5, (0, 0, 0), 2)
# DeepSORT tacker work starts here
features = encoder(frame, bboxes) # encode detections and feed to tracker. [No of BB / detections per frame, embed_size]
detections = [Detection(bbox, score, class_name, feature) for bbox, score, class_name, feature in zip(bboxes, scores, names, features)] # [No of BB per frame] deep_sort.detection.Detection object
cmap = plt.get_cmap('tab20b') #initialize color map
colors = [cmap(i)[:3] for i in np.linspace(0, 1, 20)]
boxs = np.array([d.tlwh for d in detections]) # run non-maxima supression below
scores = np.array([d.confidence for d in detections])
classes = np.array([d.class_name for d in detections])
indices = preprocessing.non_max_suppression(boxs, classes, nms_max_overlap, scores)
detections = [detections[i] for i in indices]
tracker.predict() # Call the tracker
tracker.update(detections) # updtate using Kalman Gain
for track in tracker.tracks: # update new findings AKA tracks
#if not track.is_confirmed() or track.time_since_update > 1:
#continue
bbox = track.to_tlbr()
class_name = track.get_class()
color = colors[int(track.track_id) % len(colors)] # draw bbox on screen
color = [i * 255 for i in color]
#drawing poly
#pts = np.array([[6,449], [1052, 2], [1914, 6], [1766, 1074], [2, 1076]])
#frame = cv2.polylines(frame, [pts], True, (0,0,255), 5)
#creating poly
#poli = Polygon([(6,449), (1052, 2), (1914, 6), (1766, 1074), (2, 1076)])
#center = (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2)) #center point ( (x1 + x2) / 2, (y1 + y2) / 2 )
#point = Point(center)
if inside_poly:
#drawing poly
pts = np.array([[6,449], [1052, 2], [1914, 6], [1766, 1074], [2, 1076]])
frame = cv2.polylines(frame, [pts], True, (0,0,255), 5)
#creating poly
poli = Polygon([(6,449), (1052, 2), (1914, 6), (1766, 1074), (2, 1076)])
center = (int((bbox[0] + bbox[2]) / 2), int((bbox[1] + bbox[3]) / 2)) #center point ( (x1 + x2) / 2, (y1 + y2) / 2 )
point = Point(center)
if poli.contains(point):
cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 2)
cv2.rectangle(frame, (int(bbox[0]), int(bbox[1]-30)), (int(bbox[0])+(len(class_name)+len(str(track.track_id)))*17, int(bbox[1])), color, -1)
cv2.putText(frame, class_name + " : " + str(track.track_id),(int(bbox[0]), int(bbox[1]-11)),0, 0.6, (255,255,255),1, lineType=cv2.LINE_AA)
cv2.putText(frame, "0", center,0, 0.6, (255,255,255),1, lineType=cv2.LINE_AA)
else:
cv2.rectangle(frame, (int(bbox[0]), int(bbox[1])), (int(bbox[2]), int(bbox[3])), color, 2)
cv2.rectangle(frame, (int(bbox[0]), int(bbox[1]-30)), (int(bbox[0])+(len(class_name)+len(str(track.track_id)))*17, int(bbox[1])), color, -1)
cv2.putText(frame, class_name + " : " + str(track.track_id),(int(bbox[0]), int(bbox[1]-11)),0, 0.6, (255,255,255),1, lineType=cv2.LINE_AA)
if verbose == 2:
print("Tracker ID: {}, Class: {}, BBox Coords (xmin, ymin, xmax, ymax): {}".format(str(track.track_id), class_name, (int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3]))))
result = np.asarray(frame)
result = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
return result
if __name__=='__main__':
yolov7=YOLOv7(weights='yolov7x.pt', device='cpu', image_size=800)
cap = cv2.VideoCapture('street5sn.mp4')
torch.cuda.empty_cache()
#writer
width = int(cap.get(cv2.CAP_PROP_FRAME_WIDTH)) # by default VideoCapture returns float instead of int
height = int(cap.get(cv2.CAP_PROP_FRAME_HEIGHT))
fps = int(cap.get(cv2.CAP_PROP_FPS))
codec = cv2.VideoWriter_fourcc(*"DIVX")
out = cv2.VideoWriter("./output/video_out_track5sn-d2.mp4", codec, fps, (width, height))
while True:
t1 = time.time()
ret, frame = cap.read()
if not ret:
break
detections=yolov7.detect(frame)
vir = yolov7.tracking(frame, detections, count_objects = True, inside_poly = False)
out.write(vir)
cv2_imshow(vir) #colab imshow kodu
print("add frame ...")
if cv2.waitKey(1) & 0xFF == ord('q'):
break
out.release()
cap.release()
cv2.destroyAllWindows()
`
I use this repo repo
I did not make any changes to other files.
I have multiple different folders with the images have same naming like a.png etc. I want to modify the above code to read this same named files in different directories and give their opencv output using yolo at the same time. To be more specific I have 10 files which contains images transported with different categories like one folder contains rgb files and the other contains gray files etc. To compare their output, I want to show the images with same naming but in different folders. I know it should not be that hard but I am pretty confused. Thanks in advance!
import cv2
import numpy as np
import os
import matplotlib.pyplot as plt
import tkinter
from tkinter import filedialog
def cal_alpB(minMax):
minD = minMax[0]
maxD = minMax[1]
alpha = 255/(maxD-minD)
beta = -alpha*minD
return [alpha, beta]
def getMinMax(path):
with open(path+'/config') as f:
minMax = f.read().splitlines()
minMax = minMax[0].split(',')
minMax = [eval(x) for x in minMax]
return minMax
def normalizeData(minMax, img):
alpB = cal_alpB(minMax)
img[img>minMax[1]] = minMax[1]
img[img<0] = 0
return alpB
def boxDrawing(layerOutput, frameWidth, frameHeight, class_ids, confidences, boxes, img):
for output in layerOutput:
for detection in output:
score = detection[5:]
class_id = np.argmax(score)
confidence = score[class_id]
if confidence > 0.5:
center_x = int(detection[0] * frameWidth)
center_y = int(detection[1] * frameHeight)
width = int(detection[2] * frameWidth)
height = int(detection[3] * frameHeight)
left = int(center_x - width / 2)
top = int(center_y - height / 2)
class_ids.append(class_id)
confidences.append(float(confidence))
boxes.append([left, top, width, height])
indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.8, 0.7)
font = cv2.FONT_HERSHEY_PLAIN
colors = np.random.uniform(0, 255, size = (len(boxes),3))
for i in range(len(boxes)):
if i in indexes:
x,y,w,h = boxes[i]
label = str(classes[class_ids[i]])
confi = str(round(confidences[i],2))
color = colors[i]
cv2.rectangle(img, (x,y), (x+w,y+h), color,1)
cv2.putText(img, label+" "+ confi, (x,y+20), font, 1, (255,255,255),1)
def algorythmYolo():
tkinter.Tk().withdraw()
folder = filedialog.askdirectory()
minMax = getMinMax(folder)
for filename in sorted(os.listdir(folder)):
img = cv2.imread(os.path.join(folder,filename),-1)
if img is not None:
alpB = normalizeData(minMax,img)
img = cv2.convertScaleAbs(img, alpha=alpB[0], beta= alpB[1])
img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
frameHeight, frameWidth, channels = img.shape
blob = cv2.dnn.blobFromImage(img, 1/255, (frameWidth,frameHeight), (0,0,0), swapRB = True, crop = False)
yolo.setInput(blob)
layerOutput = yolo.forward(outputLayers)
boxes = []
confidences = []
class_ids = []
boxDrawing(layerOutput,frameWidth, frameHeight,class_ids,confidences,boxes,img)
cv2.imshow("window", img)
cv2.setWindowTitle('window', folder)
cv2.waitKey(1)
else:
break
cv2.destroyAllWindows()
yolo = cv2.dnn.readNet("./yolov3.weights","./yolov3.cfg")
with open("./coco.names","r") as f:
classes = f.read().splitlines()
layers_names = yolo.getLayerNames()
outputLayers = [layers_names[i-1] for i in yolo.getUnconnectedOutLayers()]
cv2.namedWindow("window", cv2.WINDOW_NORMAL)
algorythmYolo()
I'm trying to detect the region here (circled in red) more effectively. As it currently stands, I have a few steps to get the area:
Brighten the input image
to increase contrast and likelihood to pick up edges of the image file to get this image
Crop and threshold the region of interest, and add Gaussian blur to get this image:
Use OpenCV to detect Hough Circles on the thresholded image
Select the top 10 largest circles found, then choose the one closest to the grid intersection (in the code as fv_cx and fv_cy) vertically and closest to the edge of the image horizontally.
While this works well in general
often times it misses the right circle
or it encircles an area too small
.
Using these input images, is there a better way to work on this problem?
This is my code so far:
from __future__ import print_function
import pandas as pd
from pandas.api.types import is_numeric_dtype
import os
from PIL import Image, ImageDraw, ImageFont
import math
import cv2
import matplotlib.pyplot as plt
import time
import re
import csv
from skimage import data, color, io, img_as_ubyte
from skimage.transform import hough_circle, hough_circle_peaks, hough_ellipse
from skimage.feature import canny
from skimage.draw import circle_perimeter, ellipse_perimeter
from skimage.util import img_as_ubyte
from builtins import input
import numpy as np
def append_list_as_row(file_name, list_of_elem):
with open(file_name, 'a+', newline='', encoding='utf-8') as write_obj:
csv_writer = csv.writer(write_obj, dialect='excel')
csv_writer.writerow(list_of_elem)
def round_up_to_odd(f):
return int(np.ceil(f) // 2 * 2 + 1)
# Folder path here
folder = r""
csv_file = folder + os.sep + "Measurements.csv"
csv_file2 = folder + os.sep + "Measurements2.csv"
df2 = pd.DataFrame(columns = ["filepath","od_cx","od_cy", "fv_x", "fv_y"])
for subdir, dirs, files in os.walk(folder):
for file in files:
#print os.path.join(subdir, file)
filepath = subdir + os.sep + file
if filepath.endswith(".jpeg") or filepath.endswith(".tiff") and not filepath.endswith("_OD.tiff") and not filepath.endswith("_bright.tiff") and not filepath.endswith("_FV.tiff") and not filepath.endswith("_mask.tiff"):
og_cv = cv2.imread(filepath, cv2.IMREAD_COLOR)
if "left" in str(filepath):
od = "left"
elif "right" in str(filepath):
od = "right"
OD_path = subdir + os.sep + "OD"
if not os.path.exists(str(OD_path)):
os.mkdir(str(OD_path))
OD = OD_path + os.sep + str(os.path.splitext(file)[0]) + "_OD.tiff"
fovea_path = OD_path + os.sep + str(os.path.splitext(file)[0]) + "_FV.tiff"
temp_path = subdir + os.sep + "Temp"
if not os.path.exists(str(temp_path)):
os.mkdir(str(temp_path))
bright = temp_path + os.sep + str(os.path.splitext(file)[0]) + "_bright.tiff"
thresholded_od = temp_path + os.sep + str(os.path.splitext(file)[0]) + "_thresholded_OD.tiff"
thresholded_fv = temp_path + os.sep + str(os.path.splitext(file)[0]) + "_thresholded_FV.tiff"
mask_file = temp_path + os.sep + str(os.path.splitext(file)[0]) + "_mask.tiff"
## Fovea
image = cv2.imread(filepath)
image = cv2.cvtColor(image, cv2.COLOR_BGR2RGB)
h = image.shape[0]
w = image.shape[1]
# loop over the image
fv_cx = []
fv_cy = []
for y in range(0, h):
for x in range(0, w):
# threshold the pixel
if np.all(image[y, x] == (255, 0, 255)) and np.all(image[y, x+3] == (0, 255, 255)) and np.all(image[y, x-3] == (0, 255, 255)):
print("Found fovea")
fv_cx.append(x)
fv_cy.append(y)
# Draw them
# fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 4))
# image = color.gray2rgb(image)
image_draw = image
if image.shape[2] == 3:
image[fv_cy, fv_cx] = (220, 20, 20)
if image.shape[2] == 4:
image[fv_cy, fv_cx] = (220, 20, 20, 0)
plt.imsave(OD, image)
print(fv_cx, fv_cy)
# else:
# fv_cx = "No fv_cx"
# fv_cy = "No fv_cy"
## Find image dimensions
source_img = Image.open(filepath)
width, height = source_img.size
x_max = int(width)
y_max = int(height)
print(x_max)
print(y_max)
#Load image
im = cv2.imread(filepath, cv2.IMREAD_COLOR)
background = Image.open(filepath).convert('RGB')
width, height = background.size
x_max = int(width)
y_max = int(height)
# Brightness adjustment - https://docs.opencv.org/3.4/d3/dc1/tutorial_basic_linear_transform.html
new_image = np.zeros(im.shape, im.dtype)
alpha = 1.0 # contrast control
beta = 0 # brightness control
new_image = cv2.convertScaleAbs(im, alpha=alpha, beta=100)
# cv2.imshow('New Image', new_image)
# cv2.waitKey(0)
cv2.imwrite(bright, new_image)
new_image = cv2.imread(bright, cv2.IMREAD_COLOR)
## OD
#Convert to HLS, so we can remove the saturated fovea
HLS = cv2.cvtColor(new_image,cv2.COLOR_BGR2HLS)
Schannel = HLS[:,:,2]
mask = cv2.inRange(Schannel, 0, 0)
# res = cv2.bitwise_and(new_image,new_image, mask= mask)
new_image = cv2.cvtColor(new_image,cv2.COLOR_BGR2GRAY)
thresh_x = round_up_to_odd((21/1033) * width)
thresh_x = 21
#### Thresholding Example Options
# img = cv2.bitwise_and(new_image,new_image, mask= mask)
img = cv2.medianBlur(new_image,5)
pil_im = Image.fromarray(mask)
# mask_width, mask_height = pil_im.size
mask_width, mask_height = (165 * (width/290)), (165 * (width/290))
print(width, height)
print(mask_width, mask_height)
margin = 10
if "_L" in filepath or "OS" in filepath:
x_center = width/2
crop_x_start = 0
crop_x_stop = int(x_center-(mask_width/2)) + margin
crop_img = img[0:height, crop_x_start:crop_x_stop]
# cv2.imshow("cropped", crop_img)
cv2.waitKey()
if "_R" in filepath or "OD" in filepath:
x_center = width/2
crop_x_start = int((x_center+(mask_width/2))) - margin
crop_x_stop = width
crop_img = img[0:height, crop_x_start:crop_x_stop]
# cv2.imshow("cropped", crop_img)
cv2.waitKey()
th2 = cv2.adaptiveThreshold(crop_img,255,cv2.ADAPTIVE_THRESH_MEAN_C,\
cv2.THRESH_BINARY,thresh_x,2)
th2 = cv2.GaussianBlur(th2,(21,21),0)
# cv2.imshow("cropped", th2)
cv2.waitKey()
cv2.imwrite(thresholded_od, th2)
## Hough Circle
# Load picture and detect edges
image = img_as_ubyte(th2)
edges = canny(image, sigma=3, low_threshold=10, high_threshold=50)
# Detect two radii
x=50
y=500
z=2
start = math.ceil((x/1033) * width)
stop = math.ceil((y/1033) * width)
step = math.ceil((z/1033) * width)
hough_radii = np.arange(start, stop, step)
hough_res = hough_circle(edges, hough_radii)
if fv_cy != []:
# Select the most prominent 3 circles
accums, cx, cy, radii = hough_circle_peaks(hough_res, hough_radii,
total_num_peaks=10)
df = pd.DataFrame(columns = ["index", "distance", "area", "cX", "cY"])
idx = (0)
# Draw them
fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 4))
# image = color.gray2rgb(image)
image = io.imread(filepath)
cx = (cx + crop_x_start)
for center_y, center_x, radius in zip(cy, cx, radii):
# d = math.sqrt(((center_x-fv_cx)**2) + ((center_y-fv_cy)**2))
p = abs(center_y - fv_cy)
q = -1 * abs(center_x - fv_cx)
d = p + q
print(d)
area = math.pi * (radius**2)
df.loc[idx, 'index'] = idx
df.loc[idx, 'area'] = int(area)
df.loc[idx, 'distance'] = int(d)
df.loc[idx, 'cX'] = int(center_x)
df.loc[idx, 'cY'] = int(center_y)
df.loc[idx, 'radius'] = int(radius)
idx += 1
df['distance'] = pd.to_numeric(df['distance'])
df['radius'] = pd.to_numeric(df['radius'])
print("DF?")
print(df)
if len(df["distance"]) > 0:
print("pass")
df_radius = df.nsmallest(3, 'distance')
print(df_radius)
if (df_radius['radius'].max()-df_radius['radius'].min()) < 3:
idx = df_radius['radius'].idxmax()
else:
idx = df['distance'].idxmin()
center_y = int(df.loc[idx, 'cY'])
center_x = int(df.loc[idx, 'cX'])
radius = int(df.loc[idx, 'radius'])
print(center_y, center_x, radius)
# Draw them
# fig, ax = plt.subplots(ncols=1, nrows=1, figsize=(10, 4))
# image = color.gray2rgb(image)
image = io.imread(filepath)
image = image_draw
circy, circx = circle_perimeter(center_y, center_x, radius,
shape=image.shape)
print(image.shape)
print(image.shape[2])
if image.shape[2] == 3:
image_draw[circy, circx] = (220, 20, 20)
circy, circx = circle_perimeter(center_y, center_x, 0,
shape=image.shape)
image_draw[circy, circx] = (220, 20, 20)
if image.shape[2] == 4:
image_draw[circy, circx] = (220, 20, 20, 0)
circy, circx = circle_perimeter(center_y, center_x, 0,
shape=image.shape)
image_draw[circy, circx] = (220, 20, 20, 0)
# final = ax.imshow(image, cmap=plt.cm.gray)
# fig = plt.show()
## Need to fix saving
plt.imsave(OD, image_draw)
else:
hough_radii = np.arange(start, stop, step)
hough_res = hough_circle(edges, hough_radii)
# Select the most prominent 3 circles
accums, cx, cy, radii = hough_circle_peaks(hough_res, hough_radii,
total_num_peaks=1)
if cx != None:
print("Found OD")
od_cx = (re.search(r"\[([A-Za-z0-9_]+)\]",str(cx))).group(1)
od_cy = (re.search(r"\[([A-Za-z0-9_]+)\]",str(cy))).group(1)
else:
od_cx = "Not found"
od_cy = "Not found"
# Draw them
#fig, ax = plt.subplots(ncols=1, nrows=1, #figsize=(10, 4))
# image = color.gray2rgb(image)
image = io.imread(filepath)
image = image_draw
for center_y, center_x, radius in zip(cy, cx, radii):
circy, circx = circle_perimeter(center_y, center_x, radius,
shape=image.shape)
print(image.shape)
print(image.shape[2])
if image.shape[2] == 3:
image_draw[circy, circx] = (220, 20, 20)
circy, circx = circle_perimeter(center_y, center_x, 0,
shape=image.shape)
image_draw[circy, circx] = (220, 20, 20)
if image.shape[2] == 4:
image_draw[circy, circx] = (220, 20, 20, 0)
circy, circx = circle_perimeter(center_y, center_x, 0,
shape=image.shape)
image_draw[circy, circx] = (220, 20, 20, 0)
# final = ax.imshow(image, cmap=plt.cm.gray)
# fig = plt.show()
## Need to fix saving
plt.imsave(OD, image_draw)
append_list_as_row(csv_file,[filepath,center_x,center_y, fv_cx, fv_cy])
plt.close('all')
df2 = df2.append({"filepath":filepath,"od_cx":center_x, "od_cy":center_y, "fv_x":fv_cx, "fv_y":fv_cy}, ignore_index=True)
print(df2)
df2.to_csv(csv_file2)
Hello I am trying to identify the odometer reading from the image attached using open CV and EAST model along with Pyteserract.
Following is my code :
import cv2
import numpy as np
import matplotlib.pyplot as plt
# assuming you have the result image store in median
median = cv2.imread("odo_4.jpg", 0)
image_gray = median
binary = cv2.bitwise_not(image_gray)
blur = cv2.GaussianBlur(image_gray,(5,5),0)
ret2,th2 = cv2.threshold(blur,0,255,cv2.THRESH_BINARY+cv2.THRESH_OTSU)
edged = cv2.Canny(th2, 50, 80, 255)
#threshold = cv2.adaptiveThreshold(edged,255,cv2.ADAPTIVE_THRESH_GAUSSIAN_C,cv2.THRESH_BINARY,11,2)
kernel = cv2.getStructuringElement(cv2.MORPH_RECT, (5,5))
close = cv2.morphologyEx(edged, cv2.MORPH_CLOSE, kernel, iterations=1)
contours = cv2.findContours(close, cv2.RETR_TREE, cv2.CHAIN_APPROX_SIMPLE)
contours = contours[0] if len(contours) == 2 else contours[1]
rect_cnts = []
for cnt in contours:
peri = cv2.arcLength(cnt, True)
approx = cv2.approxPolyDP(cnt, 0.04 * peri, True)
(x, y, w, h) = cv2.boundingRect(cnt)
ar = w / float(h)
if (len(approx) == 4) & (ar >= 0.95 and ar <= 1.05) : # shape filtering condition
pass
else :
rect_cnts.append(cnt)
max_area = 0
football_square = None
for cnt in rect_cnts:
(x, y, w, h) = cv2.boundingRect(cnt)
if max_area < w*h:
max_area = w*h
football_square = cnt
image = cv2.cvtColor(image_gray, cv2.COLOR_GRAY2RGB)
(x, y, w, h) = cv2.boundingRect(football_square)
new_image = image[y:y+h, x:x+w]
new = new_image
import cv2 as cv
orig = new.copy()
(origH, origW) = new.shape[:2]
rW = origW / 320.0
rH = origH / 320.0
# resize the original image to new dimensions
new = cv.resize(new, (320, 320))
(H, W) = new.shape[:2]
# construct a blob from the image to forward pass it to EAST model
blob = cv.dnn.blobFromImage(new, 1.0, (W, H),
(123.68, 116.78, 103.94), swapRB=True, crop=False)
net = cv.dnn.readNet('frozen_east_text_detection.pb')
layerNames = [
"feature_fusion/Conv_7/Sigmoid",
"feature_fusion/concat_3"]
net.setInput(blob)
(scores, geometry) = net.forward(layerNames)
def predictions(prob_score, geo):
(numR, numC) = prob_score.shape[2:4]
boxes = []
confidence_val = []
# loop over rows
for y in range(0, numR):
scoresData = prob_score[0, 0, y]
x0 = geo[0, 0, y]
x1 = geo[0, 1, y]
x2 = geo[0, 2, y]
x3 = geo[0, 3, y]
anglesData = geo[0, 4, y]
# loop over the number of columns
for i in range(0, numC):
if scoresData[i] < 0.5:
continue
(offX, offY) = (i * 4.0, y * 4.0)
# extracting the rotation angle for the prediction and computing the sine and cosine
angle = anglesData[i]
cos = np.cos(angle)
sin = np.sin(angle)
# using the geo volume to get the dimensions of the bounding box
h = x0[i] + x2[i]
w = x1[i] + x3[i]
# compute start and end for the text pred bbox
endX = int(offX + (cos * x1[i]) + (sin * x2[i]))
endY = int(offY - (sin * x1[i]) + (cos * x2[i]))
startX = int(endX - w)
startY = int(endY - h)
boxes.append((startX, startY, endX, endY))
confidence_val.append(scoresData[i])
# return bounding boxes and associated confidence_val
return (boxes, confidence_val)
(boxes, confidence_val) = predictions(scores, geometry)
boxes = non_max_suppression(np.array(boxes), probs=confidence_val)
# initialize the list of results
results = []
# loop over the bounding boxes to find the coordinate of bounding boxes
for (startX, startY, endX, endY) in boxes:
# scale the coordinates based on the respective ratios in order to reflect bounding box on the original image
startX = int(startX * rW)
startY = int(startY * rH)
endX = int(endX * rW)
endY = int(endY * rH)
#extract the region of interest
r = orig[startY:endY, startX:endX]
plt.imshow(r)
#configuration setting to convert image to string.
configuration = ("-l eng --oem 1 --psm 7")
##This will recognize the text from the image of bounding box
text = pytesseract.image_to_string(r, config=configuration)
# append bbox coordinate and associated text to the list of results
results.append(((startX, startY, endX, endY), text))
The results are bad - but my EAST model is identify the contour ( area) where the digits are present. Can you please help me ? I have tried different psm values in config for image_to_string.
Use InRange() for selection. See example:
import cv2 as cv
low_H = 80
low_S = 160
low_V = 200
high_H = 100
high_S = 255
high_V = 255
frame = cv.imread('OAPgE.jpg')
frame_HSV = cv.cvtColor(frame, cv.COLOR_BGR2HSV)
frame_threshold = cv.inRange(frame_HSV, (low_H, low_S, low_V), (high_H, high_S, high_V))
frame_threshold=cv.bitwise_not(frame_threshold)
cv.imwrite('out_36.png', frame_threshold)
I use a code to locate text boxes and create a rectangle around them. This allows me to rebuild the grid around the table structure in the image.
However, even if the text box detection works very well, if I try to define the characters present in each rectangle, pytesseract does not identify them well and does not allow to find the original text.
Here is my Python code :
import os
import cv2
import imutils
import argparse
import numpy as np
import pytesseract
# This only works if there's only one table on a page
# Important parameters:
# - morph_size
# - min_text_height_limit
# - max_text_height_limit
# - cell_threshold
# - min_columns
def pre_process_image(img, save_in_file, morph_size=(8, 8)):
# get rid of the color
pre = cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)
def img_estim(img, threshold=127):
is_dark = np.mean(img) < threshold
return True if is_dark else False
# Negative
if img_estim(pre):
print("non")
pre = cv2.bitwise_not(pre)
# Contrast & Brightness control
contrast = 2.0 #0 to 3
brightness = 0 #0 to 100
for y in range(pre.shape[0]):
for x in range(pre.shape[1]):
pre[y,x] = np.clip(contrast*pre[y,x] + brightness, 0, 255)
# Otsu threshold
pre = cv2.threshold(pre, 250, 255, cv2.THRESH_BINARY | cv2.THRESH_OTSU)[1]
# dilate the text to make it solid spot
cpy = pre.copy()
struct = cv2.getStructuringElement(cv2.MORPH_RECT, morph_size)
cpy = cv2.dilate(~cpy, struct, anchor=(-1, -1), iterations=1)
pre = ~cpy
if save_in_file is not None:
cv2.imwrite(save_in_file, pre)
return pre
def find_text_boxes(pre, min_text_height_limit=15, max_text_height_limit=40):
# Looking for the text spots contours
# OpenCV 3
# img, contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# OpenCV 4
contours, hierarchy = cv2.findContours(pre, cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
# Getting the texts bounding boxes based on the text size assumptions
boxes = []
for contour in contours:
box = cv2.boundingRect(contour)
h = box[3]
if min_text_height_limit < h < max_text_height_limit:
boxes.append(box)
return boxes
def find_table_in_boxes(boxes, cell_threshold=10, min_columns=2):
rows = {}
cols = {}
# Clustering the bounding boxes by their positions
for box in boxes:
(x, y, w, h) = box
col_key = x // cell_threshold
row_key = y // cell_threshold
cols[row_key] = [box] if col_key not in cols else cols[col_key] + [box]
rows[row_key] = [box] if row_key not in rows else rows[row_key] + [box]
# Filtering out the clusters having less than 2 cols
table_cells = list(filter(lambda r: len(r) >= min_columns, rows.values()))
# Sorting the row cells by x coord
table_cells = [list(sorted(tb)) for tb in table_cells]
# Sorting rows by the y coord
table_cells = list(sorted(table_cells, key=lambda r: r[0][1]))
return table_cells
def build_lines(table_cells):
if table_cells is None or len(table_cells) <= 0:
return [], []
max_last_col_width_row = max(table_cells, key=lambda b: b[-1][2])
max_x = max_last_col_width_row[-1][0] + max_last_col_width_row[-1][2]
max_last_row_height_box = max(table_cells[-1], key=lambda b: b[3])
max_y = max_last_row_height_box[1] + max_last_row_height_box[3]
hor_lines = []
ver_lines = []
for box in table_cells:
x = box[0][0]
y = box[0][1]
hor_lines.append((x, y, max_x, y))
for box in table_cells[0]:
x = box[0]
y = box[1]
ver_lines.append((x, y, x, max_y))
(x, y, w, h) = table_cells[0][-1]
ver_lines.append((max_x, y, max_x, max_y))
(x, y, w, h) = table_cells[0][0]
hor_lines.append((x, max_y, max_x, max_y))
return hor_lines, ver_lines
if __name__ == "__main__":
ap = argparse.ArgumentParser()
ap.add_argument("-i", "--image", required=True,
help="path to input image to be OCR'd")
# ap.add_argument("-east", "--east", type=str,
# help="path to input EAST text detector")
args = vars(ap.parse_args())
in_file = os.path.join("images", args["image"])
pre_file = os.path.join("images", "pre.png")
out_file = os.path.join("images", "out.png")
img = cv2.imread(os.path.join(in_file))
top, bottom, left, right = [25]*4
img = cv2.copyMakeBorder(img, top, bottom, left, right, cv2.BORDER_REPLICATE)
orig = img.copy()
pre_processed = pre_process_image(img, pre_file)
text_boxes = find_text_boxes(pre_processed)
cells = find_table_in_boxes(text_boxes)
hor_lines, ver_lines = build_lines(cells)
# (H, W) = img.shape[:2]
# net = cv2.dnn.readNet(args["east"])
# blob = cv2.dnn.blobFromImage(img, 1.0, (W, H),(123.68, 116.78, 103.94), swapRB=True, crop=False)
# net.setInput(blob)
# Visualize the result
vis = img.copy()
results = []
for box in text_boxes:
(x, y, w, h) = box
startX = x -2
startY = y -2
endX = x + w
endY = y + h
cv2.rectangle(vis, (startX, startY), (endX, endY), (0, 255, 0), 1)
roi=orig[startX:endX,startY:endY]
config = ("-l eng --psm 6")
pytesseract.pytesseract.tesseract_cmd = r'C:\Program Files (x86)\Tesseract-OCR\tesseract.exe'
text = pytesseract.image_to_string(roi,config=config )
results.append(((startX, startY, (endX), (endY)), text))
results = sorted(results, key=lambda r:r[0][1])
output = orig.copy()
for ((startX, startY, endX, endY), text) in results:
print("{}\n".format(text))
text = "".join([c if ord(c) < 128 else "" for c in text]).strip()
cv2.rectangle(output, (startX, startY), (endX, endY),(0, 0, 255), 1)
cv2.putText(output, text, (startX, startY - 20),cv2.FONT_HERSHEY_SIMPLEX, 1, (0, 0, 255), 3)
# for line in hor_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
# for line in ver_lines:
# [x1, y1, x2, y2] = line
# cv2.line(vis, (x1, y1), (x2, y2), (0, 0, 255), 1)
cv2.imwrite(out_file, vis)
cv2.imshow("Text Detection", output)
cv2.waitKey(0)
Initial image :
Initial image
Preprocessed image with detection of text outlines to define the dimensions of rectangles :
Preprocessed image with detection of text outlines to define the dimensions of rectangles
Final image :
Final image
Résultat obtenu par OCR :
"
a
ra
at
12
1
"
Thank you in advance for your help, hope my description is clear enough.
When performing OCR, it is extrememly important to preprocess the image to get the foreground text in black with the background in white. In addition, enlarging the image can help improve the detection results. I've also found that adding a slight Gaussian blur improves accuracy before throwing it into Pytesseract. Here's the results with --psm 6 to treat the image as a single block of text. Look here for more configuration options.
Preprocessed enlarged, thresholded, and slightly blurred image
Results from Pytesseract OCR
Series Type Scan Range CTDIvol DLP Phantom
(mm) (mGy) — (mGy-cm) cm
1 Scout - - - -
1 Scout - - - -
2 Axial = 113.554-1272.929 11.22 269.35 Body 32
Total Exam DLP: = 269.35
1/1
Code
import cv2
import pytesseract
import imutils
pytesseract.pytesseract.tesseract_cmd = r"C:\Program Files\Tesseract-OCR\tesseract.exe"
image = cv2.imread('1.jpg')
image = imutils.resize(image, width=700)
gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
thresh = cv2.threshold(gray, 0, 255, cv2.THRESH_BINARY_INV + cv2.THRESH_OTSU)[1]
thresh = cv2.GaussianBlur(thresh, (3,3), 0)
data = pytesseract.image_to_string(thresh, lang='eng', config='--psm 6')
print(data)
cv2.imshow('thresh', thresh)
cv2.imwrite('thresh.png', thresh)
cv2.waitKey()