Python 2 re.sub issue - python

I have a function that replaces sub-string matches with the match surrounded by HTML tags. The function will mostly consume strings in English and Greek.
The function:
def highlight_text(st, kwlist, start_tag=None, end_tag=None):
    """Surround whole-word occurrences of each keyword in ``st`` with tags.

    Each keyword is spliced into the pattern as-is between two word-boundary
    anchors, and ``re.sub`` is called without flags, once per keyword.

    :param st: text to search.
    :param kwlist: iterable of keywords to tag.
    :param start_tag: opening tag; ``<span class="nom">`` when None.
    :param end_tag: closing tag; ``</span>`` when None.
    :return: the text after all substitutions.
    """
    opening = '<span class="nom">' if start_tag is None else start_tag
    closing = '</span>' if end_tag is None else end_tag
    for keyword in kwlist:
        word_pattern = r'\b' + keyword + r'\b'
        tagged = '{}{}{}'.format(opening, keyword, closing)
        st = re.sub(word_pattern, tagged, st)
    return st
The testing string is in Greek except the first sub-string [Korais]: st="Korais Ο Αδαμάντιος Κοραής (Σμύρνη, 27 Απριλίου 1748 – Παρίσι, 6 Απριλίου 1833), ήταν Έλληνας φιλόλογος με βαθιά γνώση του ελληνικού πολιτισμού. Ο Κοραής είναι ένας από τους σημαντικότερους εκπροσώπους του νεοελληνικού διαφωτισμού και μνημονεύεται, ανάμεσα σε άλλα, ως πρωτοπόρος στην έκδοση έργων αρχαίας ελληνικής γραμματείας, αλλά και για τις γλωσσικές του απόψεις στην υποστήριξη της καθαρεύουσας, σε μια μετριοπαθή όμως μορφή της με σκοπό την εκκαθάριση των πλείστων ξένων λέξεων που υπήρχαν στη γλώσσα του λαού."
The test code:
# Three Greek keywords and one Latin-script keyword.
kwlist = ['ελληνικού', 'Σμύρνη', 'Αδαμάντιος', 'Korais']
# Both tag arguments are left at their None defaults.
d = highlight_text(st, kwlist)
print(d)
When I run the code (st is the string above), only the English sub-strings get tagged; the Greek sub-strings are ignored. Note that I am running the block on Python 2.7; when I use Python 3.4, all sub-strings get replaced.
Another issue is that when I run the function within a Flask application, it throws an error: unexpected end of regular expression.
How should I tackle the above issue without using external library if possible?
I'm pulling my hairs off my head two days now.

In Python 2.7, you need to explicitly convert text to Unicode. See the fixed snippet below:
# -*- coding: utf-8 -*-
import re
def highlight_text(st, kwlist, start_tag=None, end_tag=None):
    """Wrap whole-word occurrences of the keywords in ``st`` with tags.

    Matching is done on Unicode text with ``re.U`` so that word boundaries
    treat non-ASCII letters (e.g. Greek) as word characters.  UTF-8 byte
    strings are decoded on the way in; the result has the same kind as
    ``st`` (UTF-8 bytes in, UTF-8 bytes out; text in, text out).

    All keywords are matched in a single pass, so text inserted for one
    keyword (the tags themselves) is never re-scanned for another keyword.

    :param st: text to highlight (UTF-8 byte string or unicode text).
    :param kwlist: iterable of keywords; empty keywords are ignored.
    :param start_tag: opening tag; ``<span class="nom">`` when None.
    :param end_tag: closing tag; ``</span>`` when None.
    :return: ``st`` with every whole-word keyword occurrence wrapped.
    """
    def to_text(value):
        # UTF-8 byte strings are decoded; text passes through untouched.
        return value.decode('utf8') if isinstance(value, bytes) else value

    if start_tag is None:
        start_tag = u'<span class="nom">'
    if end_tag is None:
        end_tag = u'</span>'
    opening = to_text(start_tag)
    closing = to_text(end_tag)

    # Longest keywords first so the alternation prefers the longest match.
    keywords = sorted({to_text(kw) for kw in kwlist if kw},
                      key=len, reverse=True)
    if not keywords:
        return st

    # re.escape keeps regex metacharacters in keywords from being
    # interpreted (or from producing an invalid pattern).
    pattern = re.compile(
        u'\\b(?:' + u'|'.join(re.escape(kw) for kw in keywords) + u')\\b',
        re.U)
    # A callable replacement avoids backslash processing in the tags.
    highlighted = pattern.sub(
        lambda match: opening + match.group(0) + closing, to_text(st))
    return highlighted.encode('utf8') if isinstance(st, bytes) else highlighted
st="Korais Ο Αδαμάντιος Κοραής (Σμύρνη, 27 Απριλίου 1748 – Παρίσι, 6 Απριλίου 1833), ήταν Έλληνας φιλόλογος με βαθιά γνώση του ελληνικού πολιτισμού. Ο Κοραής είναι ένας από τους σημαντικότερους εκπροσώπους του νεοελληνικού διαφωτισμού και μνημονεύεται, ανάμεσα σε άλλα, ως πρωτοπόρος στην έκδοση έργων αρχαίας ελληνικής γραμματείας, αλλά και για τις γλωσσικές του απόψεις στην υποστήριξη της καθαρεύουσας, σε μια μετριοπαθή όμως μορφή της με σκοπό την εκκαθάριση των πλείστων ξένων λέξεων που υπήρχαν στη γλώσσα του λαού."
# Three Greek keywords and one Latin-script keyword.
kwlist = ['ελληνικού', 'Σμύρνη', 'Αδαμάντιος', 'Korais']
# Both tag arguments are left at their None defaults.
d = highlight_text(st, kwlist)
print(d)
See demo
Note that all literals are declared with the u prefix, all variables are decoded, and the re.sub result is encoded back to UTF-8.

English get tagged. Greek substr are ignored.
Where does your st come from? Please notice that in Python 2.x 'μορφή' != u'μορφή' Maybe you are comparing str with unicode.
Suggestions: Use unicode everywhere when you can, e.g.:
kwlist = [u'ελληνικού', u'Σμύρνη', u'Αδαμάντιος', u'Korais']

Related

How to determine if a Glyph can be displayed?

I have a large list of Unicode icons that I want to display. However, I would like to hide/skip any icon that I cannot display (because I don't have the correct font installed). Is there a programmatic way to determine this?
There's nothing built into Python for this. However, you can apply the fonttools module e.g. as follows (used in Windows 10):
# ToDo: find fallback font
# ToDo: reverse algorithm (font => characters) instead of (character => fonts)
# ToDo: check/print merely basic font (omit variants like Bold, Light, Condensed, …)
import unicodedata
import sys
import os
from fontTools.ttLib import TTFont, TTCollection
def collect_font_paths(directories):
    """Walk ``directories`` and return ``(font_files, collection_files)``.

    ``font_files`` holds paths ending in .ttf/.otf and ``collection_files``
    paths ending in .ttc.  The extension is compared case-insensitively so
    that names such as ``ARIAL.TTF`` are picked up too.  Directories that do
    not exist simply contribute nothing (``os.walk`` yields no entries).
    """
    font_files = []
    collection_files = []
    for directory in directories:
        for root, _subdirs, names in os.walk(directory):
            for name in names:
                lowered = name.lower()
                if lowered.endswith(".ttc"):
                    collection_files.append(os.path.join(root, name))
                elif lowered.endswith((".ttf", ".otf")):
                    font_files.append(os.path.join(root, name))
    return font_files, collection_files


fontsdirs = [
    # Fall back to the usual location when SystemRoot is not set, instead of
    # passing None to os.path.join.
    os.path.join(os.getenv('SystemRoot', r'C:\Windows'), 'Fonts'),  # r"c:\Windows\Fonts"
    r"D:\Downloads\MathJax-TeX-fonts-otf",
    # , os.path.join( os.getenv('LOCALAPPDATA'), r'Microsoft\Windows\Fonts')
]
print(fontsdirs, file=sys.stderr)
fontsPaths, fontcPaths = collect_font_paths(fontsdirs)
def char_in_font(unicode_char, font):
    """Look up ``unicode_char`` in the Unicode cmap tables of ``font``.

    Returns the mapped glyph name from the first matching table, the marker
    ``'<nil>'`` when that name is an empty string, or ``''`` when no
    Unicode / 'utf_16_be' table maps the character.
    """
    for table in font['cmap'].tables:
        usable = table.isUnicode() or table.getEncoding() == 'utf_16_be'
        if not usable:
            continue
        if ord(unicode_char) not in table.cmap:
            continue
        glyph_name = table.cmap[ord(unicode_char)]
        return '<nil>' if glyph_name == '' else glyph_name
    return ''
def checkfont(char,font,fontdict,fontpath):
    """Record in ``fontdict`` that ``font`` has a glyph for ``char``.

    The entry is keyed by the font's name record with nameID 1; nothing is
    written when that name is already present or the character is not
    mapped by the font.
    """
    name_records = font['name'].names
    # An Introduction to TrueType Fonts: A look inside the TTF format
    # https://scripts.sil.org/cms/scripts/page.php?site_id=nrsi&id=IWS-Chapter08
    # 1 = Font Family name, 2 = Font SubFamily name, 4 = Full font name
    # Index 1 is the fallback when no record has nameID 1
    # (works generally, not always).
    family_index = next(
        (idx for idx, record in enumerate(name_records) if record.nameID == 1),
        1)
    fontname = name_records[family_index].toStr()
    if fontname in fontdict:
        return
    glyph = char_in_font(char, font)
    if glyph == '':
        return
    codepoint = '0x{:04x}'.format(ord(char))
    filename = fontpath.split('\\')[-1]
    fontdict[fontname] = "{} ({}) [{}] '{}' \t {} {}".format(
        char, codepoint, glyph, fontname, 'in', filename)
def testfont(char):
    """Check ``char`` against every discovered font and font collection.

    Iterates the module-level ``fontsPaths`` (single fonts) and
    ``fontcPaths`` (collections), delegating to ``checkfont``.  Every font
    object is closed once it has been inspected so file handles are not
    kept open for the whole scan.

    :return: the values view of the per-font description dict.
    """
    fontdict = {}
    for fontpath in fontsPaths:
        font = TTFont(fontpath)  # specify the path to the font
        try:
            checkfont(char, font, fontdict, fontpath)
        finally:
            font.close()
    for fontpath in fontcPaths:  # specify the path to the font collection
        collection = TTCollection(fontpath)
        try:
            # The collection already holds one TTFont per member, so there
            # is no need to re-open the file once per font number.
            for font in collection.fonts:
                checkfont(char, font, fontdict, fontpath)
        finally:
            collection.close()
    return fontdict.values()
def testprint(char):
    """Print a header line for ``char`` followed by one line per font found."""
    print('')  # empty line for better readability
    codepoint = ' 0x{:04x}'.format(ord(char))
    print(char, codepoint, unicodedata.name(char, '???'))
    for description in testfont(char):
        print(description)
# Command-line driver: with no arguments a sample character is reported;
# otherwise every argument is turned into one or more characters.
if len(sys.argv) == 1:
    # sample output
    testprint(u"अ")  # 0x0905 Devanagari Letter A
else:
    for arg in sys.argv[1:]:
        if len(arg) >= 2:
            # Try, in order: decimal code point, hexadecimal code point,
            # then literal text that may contain backslash escapes.
            # Only the errors int()/chr() raise for bad input are caught,
            # so KeyboardInterrupt and SystemExit still propagate.
            try:
                chars = chr(int(arg))  # 0x042F or 1071
            except (ValueError, OverflowError):
                try:
                    chars = chr(int(arg, 16))  # 042F
                except (ValueError, OverflowError):
                    chars = (arg.
                             encode('raw_unicode_escape').
                             decode('unicode_escape'))  # ➕🐈\U00010A30\u042F\xFE
        else:
            chars = arg  # Я (Cyrillic Capital Letter Ya)
        for char in chars:
            testprint(char)
Sample output (if called without arguments): .\FontGlyphs.py
['C:\\WINDOWS\\Fonts', 'D:\\Downloads\\MathJax-TeX-fonts-otf']
अ 0x0905 DEVANAGARI LETTER A
अ (0x0905) [uni0905] 'Nirmala UI' in Nirmala.ttf
अ (0x0905) [uni0905] 'Nirmala UI Semilight' in NirmalaS.ttf
अ (0x0905) [uni0905] 'Unifont' in unifont-8.0.01.ttf
अ (0x0905) [uni0905] 'Unifont CSUR' in unifont_csur-8.0.01.ttf
Another example: .\FontGlyphs.py 🐈
['C:\\WINDOWS\\Fonts', 'D:\\Downloads\\MathJax-TeX-fonts-otf']
🐈 0x1f408 CAT
🐈 (0x1f408) [u1F408] 'EmojiOne Color' in EmojiOneColor-SVGinOT.ttf
🐈 (0x1f408) [u1F408] 'Segoe UI Emoji' in seguiemj.ttf
🐈 (0x1f408) [u1F408] 'Segoe UI Symbol' in seguisym.ttf
FYI, I have written similar script that shows output (glyphs) rendered using appropriate fonts (using default browser…
Limitation the script does not recognize Emoji Sequence, for instance
.\FontGlyphs.py 👍🏽
['C:\\WINDOWS\\Fonts', 'D:\\Downloads\\MathJax-TeX-fonts-otf']
👍 0x1f44d THUMBS UP SIGN
👍 (0x1f44d) [u1F44D] 'EmojiOne Color' in EmojiOneColor-SVGinOT.ttf
👍 (0x1f44d) [u1F44D] 'Segoe UI Emoji' in seguiemj.ttf
👍 (0x1f44d) [u1F44D] 'Segoe UI Symbol' in seguisym.ttf
🏽 0x1f3fd EMOJI MODIFIER FITZPATRICK TYPE-4
🏽 (0x1f3fd) [u1F3FD] 'EmojiOne Color' in EmojiOneColor-SVGinOT.ttf
🏽 (0x1f3fd) [u1F3FD] 'Segoe UI Emoji' in seguiemj.ttf
🏽 (0x1f3fd) [u1F3FD] 'Segoe UI Symbol' in seguisym.ttf
You can use pywin32 to check for the required fonts.
import win32gui
def fontFamProc(font, tm, fonttype, names):
    """EnumFontFamilies callback: collect the face name of each font.

    ``names`` is the list passed as the last argument of
    ``EnumFontFamilies``; the face name of ``font`` is appended to it.
    """
    names.append(font.lfFaceName)
    return True  # a true result lets the enumeration continue


fonts = []
deviceContext = win32gui.GetDC(None)
try:
    win32gui.EnumFontFamilies(deviceContext, None, fontFamProc, fonts)
finally:
    # ReleaseDC takes the window handle first and the device context
    # second; the DC came from GetDC(None), so the handle is None.
    win32gui.ReleaseDC(None, deviceContext)
print(fonts)
Well, you could simply print all of Unicode and find out that way. E.g. (I can print most, if not all, of the characters):
import io
# Write the printable characters among code points 0..149999 to
# "all_utf-8.txt" (UTF-8 encoded), inserting a line break whenever the code
# point number is a multiple of 200.
# NOTE(review): the original indentation was lost; the nesting below (line
# break check at the same level as the isprintable() check) is the presumed
# layout - confirm against the original answer.
with io.open("all_utf-8.txt", "w", encoding="utf8") as f:
    for n in range(150000):
        try:
            i = chr(n)
            # str.isprintable() filters out control characters and other
            # non-printable code points.
            if i.isprintable():
                print(f"{i}", end="", file=f)
            if n % 200 == 0:
                print(file=f)
        except UnicodeError:
            # Skip characters that raise a Unicode error.
            pass
(note the use of the built-in Python str method isprintable())
& here's a bit of a zoom in so you can actually see the individual chars/glyphs... 🙂

How to fix attribute 'list'

I have a question about a problem with cleaning text for my NLP model. I don't know why I get this error: AttributeError: 'list' object has no attribute 'split'.
On below is my df['Text'].sample(5) :
26278 [RT, #davidsirota:, subset, people, website, t...
63243 [RT, #jmartNYT:, The, presses, Team, Biden, As...
61059 [RT, #caitoz:, BREAKING:, Biden, nominate, "Li...
43160 [RT, #K_JeanPierre:, I, profoundly, honored, P...
Name: Text, dtype: object
On below is my code
def tokenizer(text):
    """Split ``text`` on whitespace and drop the tokens found in ``stopset``."""
    kept = []
    for token in text.split():
        if token in stopset:
            continue
        kept.append(token)
    return kept


# Every cell of df['Text'] is replaced by the list tokenizer returns for it.
df['Text'] = df['Text'].apply(tokenizer)
def remove_emoji(string):
    """Return ``string`` with emoji and pictographic symbols removed.

    Only symbol ranges are listed.  A single range running from U+24C2 to
    U+1F251 would also cover CJK ideographs, Hangul and most other scripts
    in between, deleting ordinary text, so the individual symbol ranges are
    spelled out instead.
    """
    emoji_pattern = re.compile(
        "["
        u"\U0001F600-\U0001F64F"  # emoticons
        u"\U0001F300-\U0001F5FF"  # symbols & pictographs
        u"\U0001F680-\U0001F6FF"  # transport & map symbols
        u"\U0001F1E0-\U0001F1FF"  # flags (iOS)
        u"\U00002600-\U000026FF"  # miscellaneous symbols
        u"\U00002702-\U000027B0"  # dingbats
        u"\U000024C2"             # circled latin capital letter M
        u"\U0001F170-\U0001F251"  # enclosed alphanumerics / ideographs
        "]+", flags=re.UNICODE)
    return emoji_pattern.sub(r'', string)
def remove_nonwords(Text):
    """Return ``Text`` unchanged, or ``''`` when it contains any digit."""
    # A raw string keeps the regex escape from being read as an (invalid)
    # string escape; search() stops at the first digit found.
    return '' if re.search(r'\d', Text) else Text
def clean_text(Text):
    """Drop stop words from ``Text``, stem the rest and return one string.

    ``Text`` may be a raw string or an already tokenized list of words (the
    form ``tokenizer`` leaves in the column), so calling ``.split()`` on a
    list no longer raises AttributeError.  The stop-word filter and the
    stemming are chained, and the cleaned text is returned rather than the
    untouched input.
    """
    words = Text.split() if isinstance(Text, str) else list(Text)
    kept = [word for word in words if word not in stopset]
    return ' '.join(stem.stem(word) for word in kept)


df['text2'] = df['Text'].apply(clean_text)
Could you someone help me ?

pattern to dictionary of lists Python

I have a file like this
module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2),.... ,wire100 (connectionwire100)) ; module 2 instance 2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire2),.... ,wire99 (newconnectionwire99))
The wires are repeated across modules. There can be many modules.
I want to build a dictionary like this (not every wire in 2nd module is a duplicate).
[wire1:[(module1, instance1, connection1), (module2, instance2,newconnection1), wire2:[(module1 instance1 connection2),(module2, instance2,newconnection1)]... wire99:module2, instance2, connection99), ]
I am splitting the string on ; then splitting on , and then ( to get wire and connectionwire strings . I am not sure how to fill the data structure though so the wire is the key and module, instancename and connection are elements.
Goal- get this datastructure- [ wire: (module, instance, connectionwire) ]
# Read the whole file, then cut it into one chunk per ';' and split each
# chunk's port list on commas.
filedata=file.read()
# presumably find_pos yields the offsets of every ';' in filedata - confirm
realindex=list(find_pos(filedata,';'))
tempindex=0
for l in realindex:
    # Text between the previous cut position and this ';'.
    module=filedata[tempindex:l]
    # First whitespace-separated word of the chunk.
    modulename=module.split()[0]
    openbracketindex=module.find("(")
    # NOTE(review): closebracketindex is not used in the lines shown here.
    closebracketindex=module.strip("\n").find(");")
    # Second word of the text before the first "(".
    instancename=module[:openbracketindex].split()[1]
    # NOTE(review): the next chunk starts at the ';' itself, not after it.
    tempindex=l
    # NOTE(review): l is an offset into filedata, not into module, so the
    # slice end can exceed the chunk length (Python clamps it).
    tempwires=module[openbracketindex:l+1]
    #got to split wires on commas
    for tempw in tempwires.split(","):
        wires=tempw
        listofwires.append(wires)
Using the re module.
import re
from collections import defaultdict
# Sample input.  The last port of module2 ("wire99") has no leading dot, so
# wire_pattern does not match it and it is absent from the result.
s = "module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2), .wire100 (connectionwire100)) ; module2 instance2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire2), wire99 (newconnectionwire99))"
d = defaultdict(list)
# module name, instance name, then everything up to the next ';'
module_pattern = r'(\w+)\s(\w+)\(([^;]+)'
mod_rex = re.compile(module_pattern)
# ".wire (connection": wire name, then the text inside the parentheses
wire_pattern = r'\.(\w+)\s\(([^\)]+)'
wire_rex = re.compile(wire_pattern)
for module_match in mod_rex.finditer(s):
    module, instance, wires = module_match.groups()
    # A separate name for the inner match keeps the outer one intact.
    for wire_match in wire_rex.finditer(wires):
        wire, connection = wire_match.groups()
        d[wire].append((module, instance, connection))
for k, v in d.items():
    # str.format gives the same "key : value" line on Python 2 and 3.
    print('{} : {}'.format(k, v))
Produces
wire1 : [('module1', 'instance1', 'connectionwire1'), ('module2', 'instance2', 'newconnectionwire1')]
wire2 : [('module1', 'instance1', 'connectionwire2'), ('module2', 'instance2', 'newconnectionwire2')]
wire100 : [('module1', 'instance1', 'connectionwire100')]
Answer provided by wwii using re is correct. I'm sharing an example of how you can solve your problem using pyparsing module which makes parsing human readable and easy to do.
from pyparsing import Word, alphanums, Optional, ZeroOrMore, Literal, Group, OneOrMore
from collections import defaultdict
# Sample input: two module instantiations separated by ';'.  Connection
# names are single alphanumeric words (no embedded spaces), which is what
# the ``connection`` token below accepts.
s = 'module1 instance1(.wire1 (connectionwire1), .wire2 (connectionwire2), .wire100 (connectionwire100)) ; module2 instance2(.wire1 (newconnectionwire1), .wire2 (newconnectionwire2), .wire99 (newconnectionwire99))'
# Tokens
connection = Word(alphanums)
wire = Word(alphanums)
module = Word(alphanums)
instance = Word(alphanums)
# Punctuation is matched but left out of the parse results.
dot = Literal(".").suppress()
comma = Literal(",").suppress()
lparen = Literal("(").suppress()
rparen = Literal(")").suppress()
semicolon = Literal(";").suppress()
# ".wire (connection)" with an optional trailing comma
wire_connection = Group(dot + wire("wire") + lparen + connection("connection") + rparen + Optional(comma))
wire_connections = Group(OneOrMore(wire_connection))
# "module instance( <wire connections> )" with an optional trailing ';'
module_instance = Group(module("module") + instance("instance") + lparen + ZeroOrMore(wire_connections("wire_connections")) + rparen + Optional(semicolon))
module_instances = OneOrMore(module_instance)
results = module_instances.parseString(s)
# create a dict
d = defaultdict(list)
for r in results:
    m = r['module']
    i = r['instance']
    for wc in r['wire_connections']:
        w = wc['wire']
        c = wc['connection']
        d[w].append((m, i, c))
print(d)
Output:
defaultdict(<type 'list'>, {'wire1': [('module1', 'instance1', 'connectionwire1'), ('module2', 'instance2', 'newconnectionwire1')], 'wire2': [('module1', 'instance1', 'connectionwire2'), ('module2', 'instance2', 'newconnectionwire2')], 'wire100': [('module1', 'instance1', 'connectionwire100')], 'wire99': [('module2', 'instance2', 'newconnectionwire99')]})

How to replace text in curly brackets with another text based on comparisons using Python Regex

I am quite new to regular expressions. I have a string that looks like this:
str = "abc/def/([default], [testing])"
and a dictionary
dict = {'abc/def/[default]' : '2.7', 'abc/def/[testing]' : '2.1'}
and using Python RE, I want str in this form, after comparisons of each element in dict to str:
str = "abc/def/(2.7, 2.1)"
Any help how to do it using Python RE?
P.S. its not the part of any assignment, instead it is the part of my project at work and I have spent many hours to figure out solution but in vain.
import re
st = "abc/def/([default], [testing], [something])"
dic = {'abc/def/[default]' : '2.7',
       'abc/def/[testing]' : '2.1',
       'bcd/xed/[something]' : '3.1'}
# Leading run of word characters, '*' and '/', i.e. the path-like prefix.
prefix_regex = r"^[\w*/]*"
# One bracketed tag such as "[default]".
tag_regex = r"\[\w*\]"
# The prefix pattern can match the empty string, so match() never fails.
prefix = re.match(prefix_regex, st).group(0)
# Each tag is looked up as prefix + tag in a single pass; tags with no
# dictionary entry for this prefix are left exactly as they are.
st = re.sub(tag_regex,
            lambda found: dic.get(prefix + found.group(0), found.group(0)),
            st)
print(st)
OUTPUT:
abc/def/(2.7, 2.1, [something])
Here is a solution using re module.
Hypotheses :
there is a dictionary whose keys are composed of a prefix and a variable part, the variable part is enclosed in brackets ([])
the values are strings by which the variable parts are to be replaced in the string
the string is composed by a prefix, a (, a list of variable parts and a )
the variable parts in the string are enclosed in []
the variable parts in the string are separated by a comma followed by optional spaces
Python code :
import re
class splitter:
    """Replace bracketed parts of ``prefix([a], [b], ...)`` via a dict.

    The dictionary keys are ``prefix + "[part]"`` and the values are the
    replacement strings.
    """

    # Everything before the first "(" is the prefix.
    pref = re.compile(r"[^(]+")
    # One bracketed variable part, e.g. "[default]".
    iden = re.compile(r"\[[^]]*\]")

    def __init__(self, d):
        self.d = d

    def split(self, s):
        """Return ``(prefix, parts)`` for ``s``, or None without a prefix."""
        m = self.pref.match(s)
        if m is None:
            return None
        # Only look for bracketed parts after the end of the prefix.
        return m.group(0), self.iden.findall(s, m.end())

    def convert(self, s):
        """Return ``s`` with every bracketed part replaced by its value.

        Raises ValueError when ``s`` has no prefix (empty, or starting with
        "(") and KeyError when a part has no entry in the dictionary.
        """
        parts = self.split(s)
        if parts is None:
            raise ValueError("no prefix found before '(' in {!r}".format(s))
        p, elts = parts
        return p + "(" + ", ".join(self.d[p + elt] for elt in elts) + ")"
Usage :
# Input string and the lookup table keyed by prefix + bracketed part.
s = "abc/def/([default], [testing])"
d = {
    'abc/def/[default]': '2.7',
    'abc/def/[testing]': '2.1',
}
sp = splitter(d)
converted = sp.convert(s)
print(converted)
output :
abc/def/(2.7, 2.1)
Regex is probably not required here. Hope this helps
# ``str`` and ``dict`` are the variables from the question (the input
# string and the lookup table), not the builtins.
lhs, rhs = str.split("/(")
lhs += "/"
# Works for any number of bracketed items, not just two.
values = [dict[lhs + item] for item in rhs.strip(")").split(", ")]
print("{0}({1})".format(lhs, ",".join(values)))
output
abc/def/(2.7,2.1)

Can't get the UNICODE chars

I have run into a problem while trying to get the Unicode characters and put them in a list. The problem is that I'm getting the hex codes of the symbols and not the symbols themselves.
Can anyone help me with that?
My code:
KeysLst = []
for i in range(1000, 1100):
char = unichr(i)
KeysLst.append(char)
print KeysLst
Output:
[u'\u03e8', u'\u03e9', u'\u03ea', u'\u03eb', u'\u03ec', u'\u03ed', u'\u03ee', u'\u03ef', u'\u03f0', u'\u03f1', u'\u03f2', u'\u03f3', u'\u03f4', u'\u03f5', u'\u03f6', u'\u03f7', u'\u03f8', u'\u03f9', u'\u03fa', u'\u03fb', u'\u03fc', u'\u03fd', u'\u03fe', u'\u03ff', u'\u0400', u'\u0401', u'\u0402', u'\u0403', u'\u0404', u'\u0405', u'\u0406', u'\u0407', u'\u0408', u'\u0409', u'\u040a', u'\u040b', u'\u040c', u'\u040d', u'\u040e', u'\u040f', u'\u0410', u'\u0411', u'\u0412', u'\u0413', u'\u0414', u'\u0415', u'\u0416', u'\u0417', u'\u0418', u'\u0419', u'\u041a', u'\u041b', u'\u041c', u'\u041d', u'\u041e', u'\u041f', u'\u0420', u'\u0421', u'\u0422', u'\u0423', u'\u0424', u'\u0425', u'\u0426', u'\u0427', u'\u0428', u'\u0429', u'\u042a', u'\u042b', u'\u042c', u'\u042d', u'\u042e', u'\u042f', u'\u0430', u'\u0431', u'\u0432', u'\u0433', u'\u0434', u'\u0435', u'\u0436', u'\u0437', u'\u0438', u'\u0439', u'\u043a', u'\u043b', u'\u043c', u'\u043d', u'\u043e', u'\u043f', u'\u0440', u'\u0441', u'\u0442', u'\u0443', u'\u0444', u'\u0445', u'\u0446', u'\u0447', u'\u0448', u'\u0449', u'\u044a', u'\u044b']
You did get unicode characters.
However, Python is showing you unicode literal escapes, to make debugging easier. Those u'\u03e8' values are still one-character unicode strings though.
Try printing the individual values in your list:
>>> print KeysLst[0]
Ϩ
>>> print KeysLst[1]
ϩ
>>> KeysLst[0]
u'\u03e8'
>>> KeysLst[1]
u'\u03e9'
The unicode escape representation is used for any codepoint outside of the printable ASCII range:
>>> u'A'
u'A'
>>> u'\n'
u'\n'
>>> u'\x86'
u'\x86'
>>> u'\u0025'
u'%'
When you print a list, you get the repr of the elements inside the list (surrounded by brackets and delimited by a comma).
If you are trying to print the unicode glyphs, try
KeysLst = []
for i in range(1000, 1100):
char = unichr(i)
KeysLst.append(char)
for char in KeysLst:
print char,
which yields
Ϩ ϩ Ϫ ϫ Ϭ ϭ Ϯ ϯ ϰ ϱ ϲ ϳ ϴ ϵ ϶ Ϸ ϸ Ϲ Ϻ ϻ ϼ Ͻ Ͼ Ͽ Ѐ Ё Ђ Ѓ Є Ѕ І Ї Ј Љ Њ Ћ Ќ Ѝ Ў Џ А Б В Г Д Е Ж З И Й К Л М Н О П Р С Т У Ф Х Ц Ч Ш Щ Ъ Ы Ь Э Ю Я а б в г д е ж з и й к л м н о п р с т у ф х ц ч ш щ ъ ы

Categories

Resources