Get the coordinates of detected words using pytesseract - python

I am making a digital assistant as my final-year project, and I want to automate the browser's tab bar by OCRing each tab's title along with the location of the detected words, then using those coordinates to click the tab. I tried the answers I found here, but they are not what I need.
import pytesseract
from pytesseract import Output
from PIL import ImageGrab

pytesseract.pytesseract.tesseract_cmd = 'C:\\Program Files\\Tesseract-OCR\\tesseract.exe'

while True:
    cap = ImageGrab.grab(bbox=(0, 0, 1182, 34))  # bounding box of the browser tab bar
    text = pytesseract.image_to_data(cap, output_type=Output.DICT)
    print(text)
    data = {}
    for i in range(len(text['line_num'])):
        txt = text['text'][i]
        block_num = text['block_num'][i]
        line_num = text['line_num'][i]
        top, left = text['top'][i], text['left'][i]
        width, height = text['width'][i], text['height'][i]
        if txt and not txt.isspace():  # keep only non-empty words
            tup = (txt, left, top, width, height)
            if block_num in data:
                if line_num in data[block_num]:
                    data[block_num][line_num].append(tup)
                else:
                    data[block_num][line_num] = [tup]
            else:
                data[block_num] = {line_num: [tup]}
    # flatten the block/line hierarchy into a simple line index
    linedata = {}
    idx = 0
    for _, b in data.items():
        for _, i in b.items():
            linedata[idx] = i
            idx += 1
    line_idx = 1
    for _, line in linedata.items():
        xmin, ymin = line[0][1], line[0][2]
        xmax, ymax = (line[-1][1] + line[-1][3]), (line[-1][2] + line[-1][4])
        line_text = ' '.join(word[0] for word in line)  # the original printed a stale `txt` here
        print(f'line{line_idx}, {xmin}, {ymin}, {xmax}, {ymax}, {line_text}')
        line_idx += 1
I need the title of each tab and its coordinates. I am pretty confident that pytesseract.image_to_data() is what I need, but I don't know how to extract the information I need from its output.
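A minimal sketch of the extraction, assuming the same tab-bar grab as above: group the words that image_to_data returns by their block/paragraph/line numbers, take each line's bounding box, and click its centre. pyautogui and the 'YouTube' tab title are assumptions, used purely to illustrate the click.

import pytesseract
from pytesseract import Output
from PIL import ImageGrab
import pyautogui  # assumption: only used to demonstrate the click

cap = ImageGrab.grab(bbox=(0, 0, 1182, 34))
d = pytesseract.image_to_data(cap, output_type=Output.DICT)

lines = {}  # (block_num, par_num, line_num) -> list of word tuples
for i, word in enumerate(d['text']):
    if word.strip() and int(float(d['conf'][i])) > 0:  # keep real words only
        key = (d['block_num'][i], d['par_num'][i], d['line_num'][i])
        lines.setdefault(key, []).append(
            (word, d['left'][i], d['top'][i], d['width'][i], d['height'][i]))

for key, words in lines.items():
    line_text = ' '.join(w[0] for w in words)
    xmin = min(w[1] for w in words)
    ymin = min(w[2] for w in words)
    xmax = max(w[1] + w[3] for w in words)
    ymax = max(w[2] + w[4] for w in words)
    print(line_text, (xmin, ymin, xmax, ymax))
    if 'YouTube' in line_text:  # hypothetical tab title to click
        # the grab bbox starts at (0, 0), so image and screen coordinates coincide
        pyautogui.click((xmin + xmax) // 2, (ymin + ymax) // 2)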

Related

Python-pptx - How to insert a slide number for all slides

I am trying to automatically insert slide numbers into all the slides of PowerPoint files that are messy and strictly formatted. Doing it manually is not feasible for all the PPTs because of the work involved. Is there any way to do this for the 35-plus slides in each of 20-plus PPT files? When we try PowerPoint's built-in option menus it does not work, and we are unable to figure out why; it could be because of our messy format. So I want to use python-pptx to insert a text box on each slide and number them.
I referred to the SO post and tried the below:
i = 0
for slide in slides:
    i = i + 1
    txBox = slide.shapes.add_textbox(0, 0, 10, 12)
    tf = txBox.text_frame
    tf.text = str(i)  # the text must be a string, not an int
    tf.paragraphs[0].font.size = Pt(11)  # a text frame has no .font; set it on a paragraph, in Pt
But I am not sure whether I am doing it right. How can I make the slide numbers appear at the bottom of each slide?
Update based on Luke's answer:
i = 0
for slide in presentation.slides:
    i = i + 1
    txBox = slide.shapes.add_textbox(0, 0, 5, 6)
    txBox.height = Inches(0.5)
    txBox.width = Inches(0.5)
    txBox.top = SH - txBox.height
    #if (i % 2) != 0:
    #    txBox.left = SW - Inches(OutsideMargin) - txBox.width
    #else:
    #
    txBox.right = Inches(float(OutsideMargin))  # updated here; note: shapes have no .right property, so this line has no effect (see the UPDATE below)
    tf = txBox.text_frame
    tf.vertical_anchor = MSO_ANCHOR.MIDDLE
    p = tf.paragraphs[0]
    p.alignment = PP_ALIGN.RIGHT  # updated here
    run = p.add_run()
    run.text = "Page " + str(i)
    run.font.size = Pt(11)
UPDATE
If you want the text box to always be on the right side, just do this:
txBox.left = presentation.slide_width - txBox.width
You just need to reposition the text box. Maybe something like this...
from pptx import Presentation
from pptx.util import Inches, Pt
from pptx.enum.text import MSO_ANCHOR, PP_ALIGN

presentation = Presentation("input.pptx")
SH = presentation.slide_height
SW = presentation.slide_width
OutsideMargin = 0.5

i = 0
for slide in presentation.slides:
    i = i + 1
    txBox = slide.shapes.add_textbox(0, 0, 10, 12)
    txBox.height = Inches(1)
    txBox.width = Inches(2)
    txBox.top = SH - txBox.height
    if (i % 2) != 0:
        txBox.left = SW - Inches(OutsideMargin) - txBox.width
    else:
        txBox.left = Inches(float(OutsideMargin))
    tf = txBox.text_frame
    tf.vertical_anchor = MSO_ANCHOR.MIDDLE
    p = tf.paragraphs[0]
    p.alignment = PP_ALIGN.CENTER
    run = p.add_run()
    run.text = "Page " + str(i)
    run.font.size = Pt(11)
presentation.save("output.pptx")

Trying to print to a Zebra printer using ZPL

So I'm trying to print to a Zebra printer using the ZPL language, but when I run the code below all I get is blank labels. Any idea why it's not reading the ZPL variable "label"? I have put the code into Labelary and it looks correct, but I can't get it to print the ZPL; it just prints blank labels.
import os
from PIL import Image
import zpl
from zebra import Zebra

lines = []
with open(
        r'C:\Users\matthew.vandruff\Downloads\Hard Drives (version '
        r'1)-b0dd6970-7dfd-4bb7-b04d-fc9c3ff4bc8a-a54c9c76-f7e2-40b1-8ff4-6d7eec1c99bb.csv') as f:
    for line in f.readlines():
        l, name = line.strip().split(',')
        lines.append((l, name))

list = lines
x = 1
while x < len(list):
    HD_Serial, HD_Number = list[x]
    # print(HD_Serial, HD_Number)
    x += 1

    l = zpl.Label(100, 60)
    height = 0
    height += 5
    l.origin(30, height)
    l.write_barcode(height=70, barcode_type='C', check_digit='Y')
    l.write_text(f'{HD_Number}')
    l.endorigin()

    height += 0
    image_width = 12
    l.origin((l.width - image_width) / 3, height)
    image_height = l.write_graphic(
        Image.open(os.path.join(os.path.dirname(zpl.__file__), 'MTC_Logo.png')),
        image_width)
    l.endorigin()

    height += image_height + 5
    l.origin(15, height)
    l.write_barcode(height=70, barcode_type='C', check_digit='Y')
    l.write_text(f'{HD_Serial}')
    l.endorigin()

    print(l.dumpZPL())

    z = Zebra()
    Q = z.getqueues()
    z.setqueue(Q[0])
    z.setup()
    z.autosense()
    z.output(l)
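Comparing this with the working version below, a likely culprit: z.output() sends raw data to the print queue, but this version passes the zpl.Label object itself rather than the rendered ZPL string. The minimal change would be:

# send the rendered ZPL commands, not the Label object
z.output(l.dumpZPL())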
So I'm guessing my issue was in the printer's settings, but here is my final working code for anyone who comes looking and is only getting blanks when trying to print.
import os
from PIL import Image
import zpl
from zebra import Zebra

lines = []
with open(
        r'C:\Users\matthew.vandruff\Downloads\Hard Drives (version '
        r'1)-b0dd6970-7dfd-4bb7-b04d-fc9c3ff4bc8a-a54c9c76-f7e2-40b1-8ff4-6d7eec1c99bb.csv') as f:
    for line in f.readlines():
        l, name = line.strip().split(',')
        lines.append((l, name))

list = lines
x = 1
while x < len(list):
    HD_Serial, HD_Number = list[x]
    # print(HD_Serial, HD_Number)
    x += 1

    l = zpl.Label(100, 100)
    height = 0
    height += 15
    l.origin(23, height)
    l.write_barcode(height=80, barcode_type='C', check_digit='Y')
    l.write_text(f'{HD_Number}')
    l.endorigin()

    height += 0
    image_width = 12
    l.origin((l.width - image_width) / 10, height)
    image_height = l.write_graphic(
        Image.open(os.path.join(os.path.dirname(zpl.__file__), 'MTC_Logo.bmp')),
        image_width)
    l.endorigin()

    height += image_height + 5
    l.origin(5, height)
    l.write_barcode(height=80, barcode_type='C', check_digit='Y')
    l.write_text(f'{HD_Serial}')
    l.endorigin()

    label = l.dumpZPL()
    l.preview()

    z = Zebra()
    Q = z.getqueues()
    z.setqueue(Q[0])
    z.setup()
    z.output(label)

How to run a script but print the output only at the 1-minute mark the first time, and every 3 minutes thereafter, for as long as the script runs? (Python)

So I am working with YOLOv4 to process video frames for object detection of one class, Human, and every time a human is detected in a frame it prints a line in the terminal, "Number of human detected:", with the number of humans detected in that frame. Now I want the code to run as is, but instead of printing the above output for every frame, it should print the output for the video frame being processed at the first 1-minute mark and thereafter at every 3-minute mark, until the video is fully processed. So for a 5-minute video I would want the statement printed at the following video timestamps: 1:00, 4:00. For an 8-minute video it would be 1:00, 4:00, 7:00, and so on. I tried using the schedule module, but it seems to just schedule the entire code to run after 1 minute.
from imutils.video import FPS
import numpy as np
import time
import cv2
import matplotlib.pyplot as plt
import schedule

file = "test2"
input = "C:/Users/asmita.nandi/Downloads/" + file + ".mp4"
output = "C:/Users/asmita.nandi/Downloads/" + file + ".avi"
net = cv2.dnn.readNet(
    "C:/Users/asmita.nandi/Downloads/custom-yolov4-tiny_human-608 (1).cfg",
    "C:/Users/asmita.nandi/Downloads/custom-yolov4-tiny-detector_human.weights")
labelsPath = "C:/Users/asmita.nandi/Downloads/human_label.txt"

def event(input, output, net, labelsPath):
    LABELS = open(labelsPath).read().strip().split("\n")
    np.random.seed(1)
    cmap = plt.get_cmap('tab20b')
    colors = [cmap(i)[:3] for i in np.linspace(0, 1, 6)]
    CONF_THRESH, NMS_THRESH = 0.25, 0.25
    vs = cv2.VideoCapture(input)
    fp = vs.get(cv2.CAP_PROP_FPS)
    writer = None
    totalFrames = 0
    fps = FPS().start()  # was referenced below but never created
    while True:
        frame = vs.read()
        frame = frame[1] if input else frame
        if input is not None and frame is None:
            break
        (H, W) = frame.shape[:2]
        if output is not None and writer is None:
            fourcc = cv2.VideoWriter_fourcc(*"MJPG")
            writer = cv2.VideoWriter(output, fourcc, fp, (W, H), True)
        ln = net.getLayerNames()
        ln = [ln[i[0] - 1] for i in net.getUnconnectedOutLayers()]
        blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (608, 608), swapRB=True, crop=False)
        net.setInput(blob)
        layerOutputs = net.forward(ln)
        boxes = []
        confidences = []
        classIDs = []
        # loop over each of the layer outputs
        for layerOutput in layerOutputs:  # renamed from `output`, which shadowed the output path
            # loop over each of the detections
            for detection in layerOutput:
                scores = detection[5:]
                classID = np.argmax(scores)
                confidence = scores[classID]
                if confidence > CONF_THRESH:
                    box = detection[0:4] * np.array([W, H, W, H])
                    (centerX, centerY, width, height) = box.astype("int")
                    x = int(centerX - (width / 2))
                    y = int(centerY - (height / 2))
                    boxes.append([x, y, int(width), int(height)])
                    confidences.append(float(confidence))
                    classIDs.append(classID)
        idxs = cv2.dnn.NMSBoxes(boxes, confidences, CONF_THRESH, NMS_THRESH)
        ObjectCount = {}
        if len(idxs) > 0:
            for i in idxs.flatten():
                (x, y) = (boxes[i][0], boxes[i][1])
                (w, h) = (boxes[i][2], boxes[i][3])
                color = [c * 255 for c in colors[classIDs[i]]]
                class_name = LABELS[classIDs[i]]
                cv2.rectangle(frame, (x, y), (x + w, y + h), color, 2)
                cv2.putText(frame, class_name, (x, y - 10), 0, 0.5, color, 2)
                obj = LABELS[classIDs[i]]
                if obj not in ObjectCount.keys():
                    ObjectCount[obj] = 1
                else:
                    ObjectCount[obj] += 1
            allvalues = []
            allvalues.append(ObjectCount[obj])
            print("Number of Humans detected ", max(allvalues))
        if writer is not None:
            writer.write(frame)
        # show the output frame
        #cv2_imshow(frame)
        key = cv2.waitKey(1) & 0xFF
        # if the `q` key was pressed, break from the loop
        if key == ord("q"):
            break
        # increment the total number of frames processed thus far and
        # then update the FPS counter
        totalFrames += 1
        fps.update()
    # stop the timer and display FPS information
    fps.stop()
    print("[INFO] elapsed time: {:.2f}".format(fps.elapsed()))
    print(totalFrames)
    # check to see if we need to release the video writer pointer
    if writer is not None:
        writer.release()
    # if we are not using a video file, stop the camera video stream
    if not input:
        vs.stop()
    # otherwise, release the video file pointer
    else:
        vs.release()
    # close any open windows
    cv2.destroyAllWindows()

# schedule's do() takes the callable plus its arguments;
# the original called event(...) immediately instead of scheduling it
schedule.every(1).minutes.do(event, input, output, net, labelsPath)
while True:
    schedule.run_pending()
    time.sleep(1)
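One way to get prints at 1:00, 4:00, 7:00, ... of video time is to derive the timestamp from the frame counter instead of scheduling wall-clock jobs. A minimal sketch against the variables above (fp is the video FPS, totalFrames counts processed frames; the comments mark where each piece would sit):

# before the while loop:
next_mark = 60  # first report at 1:00 of video time

# inside the per-frame loop, where the detections are counted:
timestamp = totalFrames / fp  # seconds of video processed so far
if timestamp >= next_mark:
    print("Number of Humans detected ", max(allvalues))
    next_mark += 180  # then every 3 minutes: 4:00, 7:00, ...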

Is it possible to run two YOLO (yolov4) object detection models in a single application?

I understand that it would be much simpler to just train YOLOv4 accordingly but:
I have limited computational resources and was hoping I could combine pre-trained models I found online, saving my time, or train a model and combine it with other models available online.
If I have one '.weights' file of a custom object detector that detects traffic signs and another '.weights' file of a detector that detects pedestrians, is there a way to combine these models, so that when run on a video/image (or in real-time capture) they detect pedestrians and traffic signs simultaneously?
By combining I mean either editing the '.weights' files somehow to achieve this, or editing the Python code (while running the detector) that gets this done (or any other way).
If that is not possible, is there any way to make them run in sequence, efficiently?
Yes, it is possible to do that. Create two different Python files and load one model in each. Then create a new file and initialise both of them there. Then take the continuous video/image feed as input; the first model will give the detected traffic signs as output. Use this output as the input to your pedestrian problem.
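For the simultaneous variant the question also asks about, a rough sketch (the file names are placeholders, not from the answer) is to load both networks in one script and run both forward passes on every frame, post-processing each model's outputs with its own class list:

import cv2

nets = [cv2.dnn.readNet('traffic_signs.weights', 'traffic_signs.cfg'),
        cv2.dnn.readNet('pedestrians.weights', 'pedestrians.cfg')]

def detect_all(frame):
    # one blob can be fed to both networks in turn
    blob = cv2.dnn.blobFromImage(frame, 1 / 255.0, (416, 416), swapRB=True, crop=False)
    all_outputs = []
    for net in nets:
        ln = net.getLayerNames()
        # OpenCV >= 4.5.4 returns a flat array from getUnconnectedOutLayers()
        out_layers = [ln[i - 1] for i in net.getUnconnectedOutLayers()]
        net.setInput(blob)
        all_outputs.append(net.forward(out_layers))
    # post-process each model's outputs separately (its own classes and NMS)
    return all_outputs

The chained (cascade) version is shown below.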
import numpy as np
import cv2
import time

import detection_cls

margin = 20
run = detection_cls.Ppe_Detection_1()

class Ppe_Detection():
    def __init__(self):
        self.weightfile = 'yolov4_pretrained.weights'
        self.cfgfile = 'cfg/yolov4_pretrained.cfg'
        self.PpeNet = cv2.dnn.readNet(self.weightfile, self.cfgfile)
        self.classes = self.get_classes()
        layer_names = self.PpeNet.getLayerNames()
        self.output_layers = [layer_names[i - 1] for i in self.PpeNet.getUnconnectedOutLayers()]

    def get_classes(self):
        with open("coco.names", "r") as f:
            self.classes_val = [line.strip() for line in f.readlines()]
        return self.classes_val

    def detection(self, img):
        start = time.perf_counter()
        height, width, channels = img.shape
        # Detecting objects
        blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        self.PpeNet.setInput(blob)
        outs = self.PpeNet.forward(self.output_layers)
        time_took = time.perf_counter() - start
        fps = str(int(1 / time_took))
        # getting the list
        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.4:
                    # object detected
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    # rectangle coordinates
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)
        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.4)
        info = []
        if len(indexes) > 0:
            for i in indexes.flatten():
                x, y = boxes[i][0], boxes[i][1]
                w, h = boxes[i][2], boxes[i][3]
                conf = confidences[i]
                x = max(x, 0)
                y = max(y, 0)
                type = '{}'.format(self.classes[class_ids[i]])
                info.append([x, y, w, h, type, conf])
        font = cv2.FONT_HERSHEY_PLAIN
        for i in np.array(indexes).flatten():  # the original nested a redundant second loop here
            x, y, w, h = boxes[i]
            label = str(self.classes[class_ids[i]])
            color = (0, 0, 145)
            # cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            cv2.putText(img, fps, (10, 30), font, 3, color, 3)
            cv2.putText(img, label, (x, y + 30), font, 3, color, 3)
        return info

    def cropping_detection(self):
        cap = cv2.VideoCapture(0)
        while cap.isOpened():
            r, f = cap.read()
            detection = ppe.detection(f)
            try:
                x, y, w, h, cls, conf = detection[0]
            except Exception:
                continue  # nothing detected in this frame
            try:
                if cls == 'person':
                    print("Person detected")
                    cropped_img = f[y - margin:y + h + margin, x - margin:x + w + margin]
                    # run the second detector on the person crop
                    final = run.detection(cropped_img)
                    print('final', final)
                    cv2.imshow("Hi", cropped_img)
                    if cv2.waitKey(1) & 0xff == ord('q'):
                        break
            except Exception:
                print("In except part")
        cap.release()
        cv2.destroyAllWindows()
        # the original ended with `return value, cropped_img`, but `value` was never assigned

ppe = Ppe_Detection()
ppe.cropping_detection()
The detector class that I've imported here (detection_cls.Ppe_Detection_1) can be found below:
import numpy as np
import cv2
import time

class Ppe_Detection_1():  # named to match the detection_cls.Ppe_Detection_1 import above
    def __init__(self):
        self.weightfile = 'backup/yolov3_best.weights'
        self.cfgfile = 'yolov3.cfg'
        self.PpeNet = cv2.dnn.readNet(self.weightfile, self.cfgfile)
        self.classes = self.get_classes()
        layer_names = self.PpeNet.getLayerNames()
        self.output_layers = [layer_names[i - 1] for i in self.PpeNet.getUnconnectedOutLayers()]

    def get_classes(self):
        with open("obj.names", "r") as f:
            self.classes_val = [line.strip() for line in f.readlines()]
        return self.classes_val

    def print_all(self):
        print(self.classes)

    def detection(self, img):
        start = time.perf_counter()
        height, width, channels = img.shape
        # Detecting objects
        blob = cv2.dnn.blobFromImage(img, 0.00392, (416, 416), (0, 0, 0), True, crop=False)
        self.PpeNet.setInput(blob)
        outs = self.PpeNet.forward(self.output_layers)
        time_took = time.perf_counter() - start
        fps = str(int(1 / time_took))
        # getting the list
        class_ids = []
        confidences = []
        boxes = []
        for out in outs:
            for detection in out:
                scores = detection[5:]
                class_id = np.argmax(scores)
                confidence = scores[class_id]
                if confidence > 0.4:
                    # object detected
                    center_x = int(detection[0] * width)
                    center_y = int(detection[1] * height)  # was detection[1] * width
                    w = int(detection[2] * width)
                    h = int(detection[3] * height)
                    # rectangle coordinates
                    x = int(center_x - w / 2)
                    y = int(center_y - h / 2)
                    boxes.append([x, y, w, h])
                    confidences.append(float(confidence))
                    class_ids.append(class_id)
        indexes = cv2.dnn.NMSBoxes(boxes, confidences, 0.4, 0.4)
        info = []
        if len(indexes) > 0:
            for i in indexes.flatten():
                x, y = boxes[i][0], boxes[i][1]
                w, h = boxes[i][2], boxes[i][3]
                conf = confidences[i]
                x = max(x, 0)
                y = max(y, 0)
                type = '{}'.format(self.classes[class_ids[i]])
                info.append([x, y, w, h, type, conf])
        font = cv2.FONT_HERSHEY_PLAIN
        for i in np.array(indexes).flatten():  # the original nested a redundant second loop here
            x, y, w, h = boxes[i]
            label = str(self.classes[class_ids[i]])
            color = (0, 255, 145)
            cv2.rectangle(img, (x, y), (x + w, y + h), color, 2)
            cv2.putText(img, fps, (10, 30), font, 3, color, 3)
            cv2.putText(img, label, (x, y + 30), font, 3, color, 3)
        print(info)
        return info

ppe = Ppe_Detection_1()

def inference_image():
    img = cv2.imread("sumit_off.jpg")
    ppe.detection(img)
    cv2.imshow("Image", img)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

def run_video():
    cap = cv2.VideoCapture(0)
    while cap.isOpened():
        r, f = cap.read()
        try:
            info = ppe.detection(f)
        except Exception as e:
            print("______", e)
        cv2.imshow("image", f)
        if cv2.waitKey(1) & 0xFF == ord("q"):
            break
    cap.release()
    cv2.destroyAllWindows()

if __name__ == '__main__':  # avoid running the webcam loop when this file is imported
    ppe.get_classes()
    ppe.print_all()
    run_video()

Need guidance on my vignette code in Jython/Python

I'm having trouble combining my CGI picture and a vignette profile image to make a vignetted picture, where the picture is slightly darker around the edges without compromising the image anywhere else. I have everything so far that I think is right, but my picture is showing up dark in the middle instead of showing the image normally with slightly darker-coloured edges.
This is what I have currently:
def main():
    inputPic = makePicture(pickAFile())
    vignette = makePicture(pickAFile())
    addVignette(inputPic, vignette)

def addVignette(inputPic, vignette):
    if getWidth(inputPic) == getWidth(vignette) and getHeight(inputPic) == getHeight(vignette):
        explore(inputPic)
        explore(vignette)
        px1 = getPixels(inputPic)
        px2 = getPixels(vignette)
        for px in getPixels(inputPic):
            x = getX(px)
            y = getY(px)
            px2 = getPixelAt(vignette, x, y)
            x2 = getX(px2)
            y2 = getY(px2)
            r1 = getRed(px)
            r2 = getRed(px2)
            g1 = getGreen(px)
            g2 = getGreen(px2)
            b1 = getBlue(px)
            b2 = getBlue(px2)
            newR = (r1 - r2 + 104)
            newG = (g1 - g2 + 88)
            newB = (b1 - b2 + 48)
            newC = makeColor(newR, newG, newB)
            setColor(px, newC)
        explore(inputPic)
        folder = pickAFolder()
        filename = requestString("enter file name: ")
        path = folder + filename + ".jpg"
        writePictureTo(inputPic, path)
Picture 1 is what the image needs to be: http://i.stack.imgur.com/PqW7K.jpg
Picture 2 is the image I get at the end of my code: http://i.stack.imgur.com/PtS4U.jpg
Any help to get me in the right direction would be very much appreciated.
After absolutely getting this wrong the first three times, I worked it out with the help of my little friend the modulo operator.
def addVignette(inputPic, vignette):
    # Create empty canvas
    canvas = makeEmptyPicture(getWidth(inputPic), getHeight(inputPic))
    for x in range(0, getWidth(inputPic)):
        for y in range(0, getHeight(inputPic)):
            px = getPixel(canvas, x, y)
            inputPixel = getPixel(inputPic, x, y)
            vignettePixel = getPixel(vignette, x, y)
            # Make a new color from those values
            newColor = getNewColorValues(inputPixel, vignettePixel)
            # Assign this new color to the current pixel of the canvas
            setColor(px, newColor)
    explore(canvas)

def getNewColorValues(inputPixel, vignettePixel):
    inputRed = getRed(inputPixel)
    vignetteRed = getRed(vignettePixel)
    inputGreen = getGreen(inputPixel)
    vignetteGreen = getGreen(vignettePixel)
    inputBlue = getBlue(inputPixel)
    vignetteBlue = getBlue(vignettePixel)
    newR = inputRed - (255 % vignetteRed) / 3
    newG = inputGreen - (255 % vignetteGreen) / 3
    newB = inputBlue - (255 % vignetteBlue) / 3
    newC = makeColor(newR, newG, newB)
    return newC
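As an aside, the more conventional vignette is multiplicative: scale each channel by the profile's brightness, so a white centre leaves pixels untouched and darker edges darken them proportionally. A sketch in the same JES style, assuming a greyscale profile image (not from the answer above):

def addVignetteMultiply(inputPic, vignette):
    for x in range(0, getWidth(inputPic)):
        for y in range(0, getHeight(inputPic)):
            px = getPixel(inputPic, x, y)
            vpx = getPixel(vignette, x, y)
            factor = getRed(vpx) / 255.0  # 1.0 at a white centre, below 1.0 at the edges
            setColor(px, makeColor(getRed(px) * factor,
                                   getGreen(px) * factor,
                                   getBlue(px) * factor))
    explore(inputPic)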
