Separating OCR text into lines with Python

Separating OCR text into lines with Python - python

What I'm trying to do is create a list of lines from a paragraph. The width of each line cannot exceed an established limit.
Here's a class that is supposed to solve this:
from font import Font
class Text:
    """Break a paragraph into lines meant to fit within a maximum width.

    The constructor stores its arguments and immediately calls setText(),
    which leaves the resulting lines in ``self.newList``.
    """

    def __init__(self, text, limit, size):
        # text:  the paragraph to split into lines.
        # limit: threshold the running line width is compared against.
        # size:  font size handed to Font when measuring words.
        self.text = text
        self.limit = limit
        self.size = size
        self.setText()

    def setText(self):
        """Split ``self.text`` on spaces and group the words into lines."""
        textList = self.text.split(' ')
        # NOTE: chained assignment binds newList and tempo to the SAME list
        # object. Until tempo is rebound inside the loop, every
        # tempo.append(x) therefore also appends the word to self.newList.
        self.newList = tempo = []
        spaceWidth = Font(self.size, ' ').width
        count = 0
        for x in textList:
            word = Font(self.size, x)
            # Running width of the current line: each word plus one space.
            count = count + word.width + spaceWidth
            if count >= self.limit:
                # Threshold reached: emit the collected words as one line and
                # start the next line with the current word.
                self.newList.append(' '.join(tempo))
                tempo = []; tempo = [x]
                count = word.width
            else:
                tempo.append(x)
        # Emit whatever is left as the final line.
        self.newList.append(' '.join(tempo))
as you can see I'm using another class called Font, here it is:
from PIL import Image,ImageFont
class Font:
    """Measure a piece of text rendered with the 'tomnr.ttf' font.

    After construction, ``width`` and ``height`` hold the two values
    returned by the font's ``getsize`` call for ``text``.
    """

    def __init__(self, fontSize, text):
        self.font = ImageFont.truetype('tomnr.ttf', fontSize)
        measured_width, measured_height = self.font.getsize(text)
        self.width = measured_width
        self.height = measured_height
There are no execution errors in the code but the result is not correct: for example,
from text import Text
text = Text("Art inspired apparel for Creative Individuals. Do you SurVibe?", 452, 25)
print text.newList
What this code is supposed to do is to create lines that are max. width 452 pixels. It should print
['Art inspired apparel for Creative', 'Individuals. Do you SurVibe?']
but instead it prints:
['Art', 'inspired', 'apparel', 'for', 'Creative', 'Art inspired apparel for Creative', 'Individuals. Do you SurVibe?']
And I can't figure out what's going on. I think my loop is fine and everything runs smoothly! I'm pretty sure it's a silly mistake, but I couldn't figure it out on my own. Thanks in advance.

Error is here:
self.newList = tempo = []
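Both variables point to the same list, so every tempo.append(x) also appends the word to self.newList until tempo is rebound to a new list. Create two separate lists instead: self.newList = [] and tempo = [].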

Related

How to add multiline text to an image using Python, while having each word or character drawn using a different color?

I am using the following code to generate images of multiline words. How can I make it draw each word or character in a different color?
from PIL import Image, ImageDraw, ImageFont
import random
import textwrap
from better_profanity import profanity
import time
width = 2000
height = 2000
number_of_words = 2400
number_of_characters = 300
font = ImageFont.truetype(r'cour.ttf', 48)

how_many = int(input("How many? "))

# Read the word pool once. Re-opening the file for every single word (as a
# per-word open() would) re-reads it thousands of times per image and never
# closes the handles.
with open("pwd2.txt") as word_file:
    word_pool = word_file.read().split()

# The wrapper configuration never changes, so build it once.
wrapper = textwrap.TextWrapper(width=number_of_characters)


def list_of_words():
    """Return number_of_words randomly chosen pool words joined by spaces."""
    return " ".join(random.choice(word_pool) for _ in range(number_of_words))


# how_many counts down so that it can double as the index in the file name.
while how_many > 0:
    wrapped = wrapper.fill(text=list_of_words())
    censored = profanity.censor(wrapped, "")

    # One random colour per image, each channel in 40..200.
    r = random.randint(40, 200)
    g = random.randint(40, 200)
    b = random.randint(40, 200)
    rgb = (r, g, b)

    date_string = time.strftime("%Y-%m-%d-%H_%M_%S")
    img = Image.new('RGB', (width, height), color='black')
    imgDraw = ImageDraw.Draw(img)
    imgDraw.multiline_text((-12, -5), censored, font=font, fill=rgb)
    img.save('output/background-' + str(how_many) + '_' + date_string + '.png')
    how_many -= 1
The text is coming from a file named pwd2.txt, which is a list of the world's most commonly used passwords. Note that I use "better_profanity" to remove cursewords, because people apparently like to use them in their passwords. Here's the text file: https://raw.githubusercontent.com/nonfunjible/word-backgrounds/main/pwd2.txt
The code above asks you how many images to generate, then pulls random words and draws them to an image in a random color. Example output of 2 images:
https://i.stack.imgur.com/frPuU.png
https://i.stack.imgur.com/xVsRi.png
Each image is generated with a text color randomly selected. But how would I make each word (preferred) or character (acceptable) a different color?
So far I tried what appeared to be a related solution here: https://stackoverflow.com/a/19213360/
In it, in my understanding the text is pasted as a series of images of each letter. I adapted it to my project as follows.
from PIL import Image, ImageDraw, ImageFont
import random
import textwrap
from better_profanity import profanity
import time
width = 2000
height = 2000
number_of_words = 2400
number_of_characters = 300
font = ImageFont.truetype(r'cour.ttf', 48)
make_color = lambda : (random.randint(50, 255), random.randint(50, 255), random.randint(50,255))
how_many = int(input("How many? "))
while how_many > 0:
def list_of_words():
words = []
for i in range(number_of_words):
words.append(random.choice(open("pwd.txt").read().split()))
return(" ".join(words))
wrapper = textwrap.TextWrapper(width=number_of_characters)
value = str(list_of_words())
wrapped = wrapper.fill(text=value)
censored = profanity.censor(wrapped, "")
img = Image.new("RGB", (width, height), (0,0,0)) # scrap image
draw = ImageDraw.Draw(img)
img2 = Image.new("RGB", (width, height), (0,0,0)) # final image
fill = " o "
x = 0
w_fill, y = draw.textsize(fill)
x_draw, x_paste = 0, 0
for c in censored:
w_full = draw.textsize(fill+c)[0]
w = w_full - w_fill # the width of the character on its own
draw.text((x_draw,0), fill+c, make_color())
iletter = img.crop((x_draw+w_fill, 0, x_draw+w_full, y))
img2.paste(iletter, (x_paste, 0))
x_draw += w_full
x_paste += w
date_string = time.strftime("%Y-%m-%d-%H_%M_%S")
img2.save('output/background-' + str(how_many) + '_' + date_string + '.png')
how_many -=1
However, probably due to my lack of understanding how it works, I'm unable to have it wrap text as in my first code snippet above, or even change the font. Here is example output of 2 images:
https://i.stack.imgur.com/xyBTo.png
https://i.stack.imgur.com/3neNY.png
I'm wondering if that related solution may not be the best way to accomplish what I'm trying to do. Is there another approach entirely to draw multiline text on an image using Python in a way that enables you to print each word or character in a different color?
(As another possible avenue to explore, I've noticed this wordcloud generator seems to print words in different colors, but I don't understand how it works: https://www.geeksforgeeks.org/generating-word-cloud-python/)

How to use MagickImage/Wand to create an image comprised of equally spaced bordered boxes of text in Python

I need to make an image that looks like the following:
To do so, I've implemented the use of MagickImage/Wand. Here is my current implementation
import re
from unicodedata import normalize
from docx import Document
from wand.image import Image
from wand.drawing import Drawing
from wand.font import Font
doc = Document("P.docx")
docText = []
for para in doc.paragraphs:
docText.append(para.text)
fullText = "\n".join(docText)
ct = 242
def get(source, begin, end):
    """Return the part of ``source`` between ``begin`` and ``end``.

    The result starts right after the first occurrence of ``begin`` and
    stops right before the first occurrence of ``end`` that follows it.
    Returns "" when either delimiter cannot be found.
    """
    try:
        # Search for the delimiters themselves (not their lengths) and look
        # for ``end`` only after the position where ``begin`` finished.
        start = source.index(begin) + len(begin)
        finish = source.index(end, start)
    except ValueError:
        return ""
    return source[start:finish]
def capitalize(string):
    """Lazily upper-case the first character of every item in ``string``.

    Despite its name, ``string`` is iterated as a collection of strings.
    The result is a generator, one output string per input item. Empty
    items are passed through unchanged instead of raising IndexError.
    """
    # Slicing with [:1] yields "" for an empty item, where [0] would raise.
    return (item[:1].upper() + item[1:] for item in string)
def find_matches(text):
    """Return the capitalized regex captures found in ``text``.

    ``text`` is NFKD-normalized and then scanned line by line (MULTILINE)
    for a non-digit character, whitespace, and a run of text terminated by
    '.' or ';'. The text captured by the pattern's group for each match is
    passed through capitalize().
    """
    normalized = normalize("NFKD", text)
    pattern = r"^[^0-9]\s+([^.;]+\s*)+[.;]+"
    captures = re.findall(pattern, normalized, re.MULTILINE)
    return capitalize(captures)
with Image(width=300, height=300, psuedo='xc:black') as canvas:
left, top, width, height = 50, 10, 100, 150
for match in find_matches(text=fullText):
ct += 1
match_words = match.split(" ")
match = " ".join(match_words[:-1])
with Drawing() as context:
context.fill_color = 'white'
context.rectangle(left=left, top=top, width=width, height=height)
canvas.font = Font('/System/Library/Fonts/arial.ttf')
context(canvas)
canvas.caption(match + '\r' + 'ct', left=left, top=top, width=width, height=height, gravity='center')
canvas.save(filename='patdrawTest.png')
I'm not quite certain on how to create borders or how to properly space things with this tool, and as such, this is my current output:
I understand I need to have a base image that is iterated over. I also understand that I will need flags in order to keep track of the height/width/etc. of the previous blocks of text (unless there is an easier way of doing so with this tool). However, the way my code currently works is that it takes in words from a word document, parses it to get specific matches, and then is supposed to put it into an image like the first image I showed above. Yet, I am at a loss. Any help would be greatly appreciated.

Here's the code I've come up with in order to make equally-spaced boxes of text.
import re
from unicodedata import normalize
from docx import Document
from wand.image import Image
from wand.drawing import Drawing
from wand.font import Font
doc = Document("P.docx")
docText = []
for para in doc.paragraphs:
docText.append(para.text)
fullText = "\n".join(docText)
ct = 242
def get(source, begin, end):
    """Return the part of ``source`` between ``begin`` and ``end``.

    The result starts right after the first occurrence of ``begin`` and
    stops right before the first occurrence of ``end`` that follows it.
    Returns "" when either delimiter cannot be found.
    """
    try:
        # Search for the delimiters themselves (not their lengths) and look
        # for ``end`` only after the position where ``begin`` finished.
        start = source.index(begin) + len(begin)
        finish = source.index(end, start)
    except ValueError:
        return ""
    return source[start:finish]
def capitalize(string):
    """Lazily upper-case the first character of every item in ``string``.

    Despite its name, ``string`` is iterated as a collection of strings.
    The result is a generator, one output string per input item. Empty
    items are passed through unchanged instead of raising IndexError.
    """
    # Slicing with [:1] yields "" for an empty item, where [0] would raise.
    return (item[:1].upper() + item[1:] for item in string)
def find_matches(text):
    """Return the capitalized regex captures found in ``text``.

    ``text`` is NFKD-normalized and then scanned line by line (MULTILINE)
    for a non-digit character, whitespace, and a run of text terminated by
    '.' or ';'. The text captured by the pattern's group for each match is
    passed through capitalize().
    """
    normalized = normalize("NFKD", text)
    pattern = r"^[^0-9]\s+([^.;]+\s*)+[.;]+"
    captures = re.findall(pattern, normalized, re.MULTILINE)
    return capitalize(captures)
with Image(width=400, height=1000, pseudo='xc:white') as canvas:
left, top, width, height = 2, 2, 395, 131
for match in find_matches(text=fullText):
ct += 1
match_words = match.split(" ")
match = " ".join(match_words[:-1])
with Drawing() as context:
context.fill_color = 'black'
context.rectangle(left=left, top=top, width=width, height=height)
context.fill_color = 'white'
context.rectangle(left=(left+2), top=(top+2), width=(width-4), height=(height-4))
canvas.font = Font('/System/Library/Fonts/timesnewroman.ttf')
context(canvas)
canvas.caption(match + '\n' + str(ct), left=(left+5), top=top, width=(width-10), height=height,
gravity='center')
top += 135
canvas.crop(bottom=top)
canvas.save(filename='patdrawTest.png')
Here is the output with this code:
I do, however, still have something I'd like to address. While the boxes of text are all equally-spaced and look rather nice, I'd still prefer that all of the text looks the same; that is the same font-size, and the only way to do that is to have the borders and such be automatically re-sized such that it can work that way. I have no clue on how to do this, but for now here is this, should anyone else run into something like this.

How can I use text with html tags in a tkinter text box, or change it so that it works in a tkinter label?

I've been given a lot of text and asked to display it in a tkinter app. The text has a lot of html tags like <em>...<\em>, and <sup>...<\sup> where the text needs to be italicized or superscript.
Is there any way built into tkinter to do this? If not, is it even possible to write a function to, for example, italicize all text between <em> tags, then delete the tags?
I know I would be able to remove the tags by doing something like:
for tag in ["<em>", "<\em>", "<sup>", "<\sup>"]:
text = "".join(text.split(tag))
But I really need to, at least, italicize the text between <em> tags before removing them.
I'm new to tkinter, and I've been watching a lot of tutorials and googling for solutions, but it seems like tkinter can't naturally use html tags, and I can't find any solution.
EDIT:
I need to display this in a regular tkinter text widget.
I know I can use tkinter's font method with slant=italic to set text in a text box to italic. I just need to know a way to set the parameters to everything between <em> tags.

So, I worked this out myself over the last few days. First you have to find the places in the text that you want to italicize, removing the html tags from the text as you go along. Next you have to put the tag-free text into a text widget. Then you have to identify the points in the widget's text to italicize.
It's a bit finicky because identifying points in the text-widget's text requires an index of the form "line.character", where the number before the point is the line number and the number after it is the index of the character in that line. This means you need to identify line numbers for each index, so you need a way of knowing exactly where one line ends and another begins. Also, line 2, character 4 is 2.4, and line 2, character 40 is 2.40, so float(f"{line_number}.{character_number}") won't work as it will remove any trailing zeros; you have to use Decimal(f"{line_number}.{character_number}") (or simply pass the string f"{line_number}.{character_number}" itself, since text-widget indices are strings).
For example, in the text alphabet = 'abcd efg hijk\nlmnop qrs tuv wx yz', if you want to italicize all of the letters from "h" to "p" you first have to get an index for "h" to start italicizing at, start = alphabet.find("h"), then an index just after "p" to stop italicizing at, end = alphabet.find("p") + 1. Next you have to find which line the start point and end point are on and translate the indices (9 and 19 respectively) to decimal format (1.9 and 2.5):
start_line = alphabet[:start].count("\n") + 1
end_line = alphabet[:end].count("\n") + 1
line_start_point = len(alphabet[alphabet[:start].rfind("\n") + 1: start])
line_end_point = len(alphabet[alphabet[:end].rfind("\n") + 1: end])
start_point = Decimal(f"{start_line}.{line_start_point}")
end_point = Decimal(f"{end_line}.{line_end_point}")
Anyway, here's all of the code I ended up using to remove the unnecessary <sup>...</sup> tags and anything between them, and to italicize everything between <em>...</em> tags:
import re
from decimal import Decimal
from tkinter import *
from tkinter import font
def em_points(text):
    """Strip ``<sup>`` and ``<em>`` markup from ``text``.

    ``<sup>...</sup>`` elements whose content consists of word characters
    are removed together with their content. ``<em>`` / ``</em>`` tags are
    removed but the text between each pair is remembered.

    Returns a two-item list ``[clean_text, finds]`` where ``finds`` holds,
    in order of appearance, the strings that were wrapped in ``<em>`` tags.

    An ``<em>`` without a following ``</em>`` is dropped and contributes
    nothing to ``finds``. A ``</em>`` with no preceding ``<em>`` is left in
    the text untouched.
    """
    # Drop superscript elements along with whatever they contain.
    text = re.sub(r'<sup>\w*</sup>', '', text)

    find_points = []
    while True:
        # Earlier tags have already been removed, so the first remaining
        # "<em>" is always the next one to process.
        find_open = text.find("<em>")
        if find_open == -1:
            break
        text = text[:find_open] + text[find_open + 4:]

        # Look for the closing tag only AFTER the opening one, so a stray
        # earlier "</em>" cannot be paired with this "<em>".
        find_close = text.find("</em>", find_open)
        if find_close == -1:
            # Unmatched opening tag: it is gone now, nothing to italicize.
            continue
        text = text[:find_close] + text[find_close + 5:]
        find_points.append([find_open, find_close])

    # Slice the spans out of the fully cleaned text.
    finds = [text[begin:stop] for begin, stop in find_points]
    return [text, finds]
def italicize_text(text_box, finds):
    """Apply an italic tag to each string of ``finds`` inside ``text_box``.

    ``text_box`` is a text widget that already contains the text and
    ``finds`` is a list of substrings to italicize. Each entry is matched
    to the first occurrence that has not already been claimed by an earlier
    identical entry.

    Raises RuntimeError when an entry has no (remaining) occurrence in the
    widget's text.
    """
    italics_font = font.Font(text_box, text_box.cget("font"))
    italics_font.configure(slant="italic")
    text_box.tag_configure("italics", font=italics_font)

    text_in_box = text_box.get("1.0", END)
    used_points = []
    for find in finds:
        if not find:
            # An empty span has nothing to italicize, and searching for ""
            # repeatedly would never advance.
            continue

        start_point = text_in_box.find(find)
        # Skip occurrences already used by an earlier, identical entry.
        while start_point != -1 and [start_point, start_point + len(find)] in used_points:
            start_point = text_in_box.find(find, start_point + len(find))
        if start_point == -1:
            raise RuntimeError(f"Could not find text to italicise in textbox:\n {find}\n {text_in_box}")

        end_point = start_point + len(find)
        used_points.append([start_point, end_point])

        # "1.0 + N chars" lets the widget translate a flat character offset
        # (newlines included) into line.column itself, so no manual line
        # counting is needed.
        text_box.tag_add(
            "italics",
            f"1.0 + {start_point} chars",
            f"1.0 + {end_point} chars",
        )
em_text = em_points(text)
clean_text = em_text[0]
em_list = em_text[1]
text_box = Text(root, width=80, height=5, font=("Courier", 12))
text_box.insert(1.0, clean_text)
italicize_text(text_box, em_list)

Python: Create strikethrough / strikeout / overstrike string type

I would appreciate some help in creating a function that iterates through a string and combines each character with a strikethrough character (\u0336). With the output being a striked out version of the original string. Like this..
Something like.
def strike(text):
    """Return ``text`` with U+0336 appended after every character.

    U+0336 is the combining character used here to draw a stroke over the
    character it follows. An empty ``text`` yields an empty string.
    """
    # A single join replaces the index-driven loop and avoids rebuilding the
    # result string on every iteration.
    return ''.join(char + u'\u0336' for char in text)
So far I've only been able to concatenate rather than combine.

def strike(text):
    """Return ``text`` with U+0336 placed after each of its characters."""
    return ''.join(f'{char}\u0336' for char in text)
Cool effect.

How about:
from itertools import repeat, chain
''.join(chain.from_iterable(zip(text, repeat('\u0336'))))
or even more simply,
'\u0336'.join(text) + '\u0336'

Edited
As pointed out by roippi other answers so far are actually correct, and this one below is wrong. Leaving it here in case others get the same wrong idea that I did.
Other answers so far are wrong - they do not strike out the first character of the string. Try this instead:
def strike(text):
    """Return ``text`` with U+0336 inserted *before* each character."""
    # The mark is emitted ahead of every character, so the result starts
    # with U+0336 and the final character has no mark after it.
    return ''.join(u'\u0336' + char for char in text)
>>> print(strike('this should do the trick'))
'̶t̶h̶i̶s̶ ̶s̶h̶o̶u̶l̶d̶ ̶d̶o̶ ̶t̶h̶e̶ ̶t̶r̶i̶c̶k'
This will work in Python 2 and Python 3.

If you want to include spaces in the strikethrough, you'll have to replace normal spaces with non-break spaces:
def strikethrough(mytext):
    """Return ``mytext`` struck through, spaces included.

    Ordinary spaces are first replaced with non-break spaces (U+00A0), then
    U+0336 is appended after every character. An empty ``mytext`` yields an
    empty string rather than a lone combining character.
    """
    nbsp_text = mytext.replace(" ", "\u00a0")
    return "".join(char + "\u0336" for char in nbsp_text)

Although '\u0336' can solve some problems, it may not work in different language situations.
Like: 我是誰 → ̶我̶是̶誰.
As you can see, the otherwise good text has turned into strange symbols that we can't read.
So I write the code below:
import tkinter as tk
root = tk.Tk()
root.state('zoomed')
class strikethrough(tk.Frame):
def __init__(self, frame, text, **options):
super().__init__(frame)
c = tk.Canvas(self, **options)
textId = c.create_text(0, 0, text = text, fill = "#FFFFFF", font = ("", 30, "bold"))
x1, y1, x2, y2 = c.bbox(textId)
linewidth = 3
lineXOffset = 3
lineId = c.create_line(x1, 0, x2, 0, width=linewidth)
c.pack(fill="both", expand=1)
c.bind("<Configure>", lambda event: TextPositionChange(c, textId, lineId, linewidth, lineXOffset))
self.canvas, self.textId = c, textId
def TextPositionChange(canvas, TextId, LineId, LineWidth, LineXOffset):
x1, y1, x2, y2 = canvas.bbox(TextId)
xOffSet, yOffSet = (x2-x1)/2, (y2-y1)/2
x, y = canvas.winfo_width()/2-xOffSet, canvas.winfo_height()/2-yOffSet #left_top_position
canvas.moveto(TextId, x, y)
canvas.moveto(LineId, x-LineXOffset, y+(y2-y1)/2-LineWidth/2)
frame = strikethrough(root, "我是誰", bg="#777777")
frame.place(relx=0.5, rely=0.5, relwidth=0.5, anchor="center")
root.mainloop()

wxPython show image certain amount of time

I am using code in wxPython to show images.
I created a screen with 2 panels, one left and right.
In one of the panels (randomly chosen), I want to display an image for exactly 150ms.
How can I program this? I am relatively new to Python, and I don't find any clear way on the internet.
My code for now (without the 150ms):
import wxversion
wxversion.select("3.0")
import wx
import random
import time


class Screen_1(wx.Dialog):
    """Dialog that shows one of two images in a randomly chosen side panel.

    The layout is a left panel, a narrow centre panel holding a '+', and a
    right panel. Constructing the dialog picks the image and the side,
    schedules the dialog's destruction, and shows it modally.
    """

    # Number of the most recently chosen image, kept at class level.
    ri = 0

    def __init__(self, parent, id, title):
        wx.Dialog.__init__(self, parent, id, title, size=(400, 300))

        # randrange(1, 3) yields 1 or 2.
        self.randomImage = random.randrange(1, 3)
        self.randomSlot = random.randrange(1, 3)
        Screen_1.ri = self.randomImage
        if self.randomSlot == 1:
            self.side = 'Left'
        else:
            self.side = 'Right'

        # The file is opened in append mode (creating it if necessary) but
        # nothing is written to it here; close it right away instead of
        # leaving the handle open.
        with open('User.txt', 'a'):
            pass

        panel_left = wx.Panel(self, 11, (-1, -1), (200, 200))
        self.picture_left = wx.StaticBitmap(panel_left)

        font = wx.Font(13, wx.DEFAULT, wx.NORMAL, wx.BOLD)
        panel_centre = wx.Panel(self, 12, (200, 70), (10, 100))
        msg = wx.StaticText(panel_centre, -1, '+', size=(10, 100))
        msg.SetFont(font)

        panel_right = wx.Panel(self, 13, (210, 0), (200, 200))
        self.picture_right = wx.StaticBitmap(panel_right)

        self.imageName = 'im_' + str(self.randomImage) + '.png'
        if self.randomSlot == 1:
            self.picture_left.SetBitmap(wx.Bitmap(self.imageName))
        else:
            self.picture_right.SetBitmap(wx.Bitmap(self.imageName))

        # NOTE(review): presumably the first argument is a delay in
        # milliseconds before self.Destroy is called - confirm against the
        # wx documentation.
        wx.FutureCall(1000, self.Destroy)
        self.Centre()
        self.ShowModal()

    def OnClick(self, event):
        """Close the dialog."""
        self.Close()
Thanks a lot!

def OnTimeUp(self,e):
#change images
self.timer.Start(15,oneShot=True) # if you want to call it again in 15 ms
def StartTimer(self):
self.timer = wx.Timer()
self.timer.Bind(wx.EVT_TIMER,self.OnTimeUp)
self.timer.Start(15,oneShot=True)
something like that ... although 15ms is very fast ...

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Separating OCR text into lines with Python - python

Error is here: self.newList = tempo = [] Both variables point to the same list.

Related

How to add multiline text to an image using Python, while having each word or character drawn using a different color?

How to use MagickImage/Wand to create an image comprised of equally spaced bordered boxes of text in Python

How can I use text with html tags in a tkinter text box, or change it so that it works in a tkinter label?

Python: Create strikethrough / strikeout / overstrike string type

wxPython show image certain amount of time

Categories

Resources