Syntax highlighting Python with regex

I want to make a simple Python syntax highlighter for a homework assignment; I'm supposed to use regular expressions (regex) for it. The problem is that things like comments or strings are not being colored.
import re
# Regular expressions
palabrasRe= re.compile(r'\b(False|class|return|None|continue|lambda|try|True|def|nonlocal|while|for|and|del|not|with|as|elif|or|yield|assert|if|else|pass|break|except|raise|list|print|from|import|except|finally|raise|global|in|is|del|with|as|async|await)')
string1 = re.compile(r'[\'][\W\w]*[\']')
string2 = re.compile(r'[\"][\W\w]*[\"]')
comentario1 = re.compile(r'#.+\n')
comentario2 = re.compile(r'(\'{3}(.+\n)*\'{3})')
num = re.compile(r'(\d+(?:\.\d+)?)')
identificador = re.compile(r'\b[a-zA-z]\w*')
operadores= re.compile(r'(\+|\-|\/|\*|\%|\>|\<|\=)')
caractEspeciales = re.compile(r'\W')
# Open the file for reading
archivo = open('ej.py', "r")
lineas = archivo.readlines()
archivo.close()
# HTML output file
html = open('holamundo.html','w')
mensaje = " "
# Lists for the coordinates
pos_ren=0
copalRev=[]
costr1=[]
costr2=[]
cocomen1=[]
cocomen2=[]
conum=[]
coiden=[]
coop=[]
corcarE=[]
# Read the file's lines and scan them
for i in lineas:
    ren=lineas[pos_ren]
    menren=ren
    mensaje+="<p>"
    search = operadores.finditer(menren)
    for match in search:
        cor=match.span()
        coop.append(cor)
        #print(match)
    search = identificador.finditer(menren)
    for match in search:
        cor=match.span()
        coiden.append(cor)
        #print(match)
    search = num.finditer(menren)
    for match in search:
        cor=match.span()
        conum.append(cor)
        #print(match)
    search = palabrasRe.finditer(menren)
    for match in search:
        cor=match.span()
        copalRev.append(cor)
        #print(match)
    search = string2.finditer(menren)
    for match in search:
        cor=match.span()
        costr2.append(cor)
    search = string1.finditer(menren)
    for match in search:
        cor=match.span()
        costr1.append(cor)
    search = comentario1.finditer(menren)
    for match in search:
        cor=match.span()
        cocomen1.append(cor)
    search = comentario2.finditer(menren)
    for match in search:
        cor=match.span()
        cocomen2.append(cor)
    search = caractEspeciales.finditer(menren)
    for match in search:
        cor=match.span()
        corcarE.append(cor)
    for i in coop:
        x,y=i
        print(i)
        print(ren)
        pal=ren[x:y]
        mensaje2="<FONT COLOR='#FFA500'>"+pal+"</FONT>" # OPERATOR COLOR
        menren= menren.replace(pal, mensaje2)
        #ren[x:y]= mensaje2
    coop=[]
    for i in coiden:
        x,y=i
        print(i)
        pal=ren[x:y]
        print(pal)
        mensaje4="<FONT COLOR='#5982D3'>"+pal+"</FONT>" # IDENTIFIER COLOR
        menren= menren.replace(pal, mensaje4)
        #ren[x:y]= mensaje3
    coiden=[]
    for i in conum:
        x,y=i
        print(i)
        pal=ren[x:y]
        print(pal)
        mensaje5="<FONT COLOR='#8032E4'>"+pal+"</FONT>" # NUMBER COLOR
        menren= menren.replace(pal, mensaje5)
        #ren[x:y]= mensaje3
    conum=[]
    for i in copalRev:
        x,y=i
        print(i)
        pal=ren[x:y]
        print(pal)
        mensaje6="<FONT COLOR='#FFCE3O'>"+pal+"</FONT>" # RESERVED WORD COLOR
        menren= menren.replace(pal, mensaje6)
        #ren[x:y]= mensaje3
    copalRev=[]
    for i in cocomen1:
        x,y=i
        print(i)
        pal=ren[x:y]
        print(pal)
        mensaje7="<FONT COLOR='#FFFFFF'>"+pal+"</FONT>" # COMMENT COLOR
        menren= menren.replace(pal, mensaje7)
        #ren[x:y]= mensaje3
    cocomen1=[]
    for i in costr1:
        x,y=i
        print(i)
        pal=ren[x:y]
        print(pal)
        mensaje8="<FONT COLOR='#FFFFFF'>"+pal+"</FONT>" # STRING COLOR
        menren= menren.replace(pal, mensaje8)
        #ren[x:y]= mensaje3
    costr1=[]
    mensaje+=menren+"</p>"
    pos_ren+=1
html.write("<html><head></head><body bgcolor='#232231' >"+mensaje+"</body></html>")
html.close()
It should be coloring reserved words, both types of strings, both types of comments, numbers, identifiers, operators and special characters. So far I'm getting this:

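A likely reason the comments and strings never get coloured (my reading of the code above, not a verified fix): the spans are taken from the untouched line ren, but the replacements are applied one after another to menren, and operators and identifiers are replaced first; by the time a comment or string span is processed, its original text no longer exists inside menren, so replace() silently does nothing. The greedy [\W\w]* in the string patterns also swallows everything between the first and last quote on a line. A single pass with one combined pattern, where comments and strings are tried before the other token classes, sidesteps both problems. Here is a minimal sketch (the group names and colours are my own choices, and multi-line ''' comments would still need separate handling):

import re

# One combined pattern; earlier alternatives win, so comments and strings are
# consumed whole before keywords, numbers, identifiers and operators.
TOKEN = re.compile(r"""
      (?P<comment>\#.*)
    | (?P<string>'[^'\n]*'|"[^"\n]*")
    | (?P<keyword>\b(?:False|None|True|and|as|assert|async|await|break|class
        |continue|def|del|elif|else|except|finally|for|from|global|if|import
        |in|is|lambda|nonlocal|not|or|pass|raise|return|try|while|with|yield)\b)
    | (?P<number>\d+(?:\.\d+)?)
    | (?P<name>[A-Za-z_]\w*)
    | (?P<operator>[+\-*/%<>=])
""", re.VERBOSE)

COLORS = {'comment': '#6A9955', 'string': '#CE9178', 'keyword': '#FFCE30',
          'number': '#8032E4', 'name': '#5982D3', 'operator': '#FFA500'}

def colorize(line):
    def wrap(match):
        # lastgroup tells us which named alternative matched
        return f"<font color='{COLORS[match.lastgroup]}'>{match.group()}</font>"
    return TOKEN.sub(wrap, line)

One remaining caveat: < and > in the source will collide with the generated HTML unless you escape them before wrapping.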
I tried to refactor this to replace all of your manual copy/pasting with loops... but I'm too tired to debug why it isn't running now. Here's how far I got.
import re
from collections import defaultdict
# Regular expressions
KEYWORDS = [
    'False', 'class', 'return', 'None', 'continue', 'lambda', 'try', 'True', 'def', 'nonlocal', 'while', 'for', 'and',
    'del', 'not', 'with', 'as', 'elif', 'or', 'yield', 'assert', 'if', 'else', 'pass', 'break', 'except', 'raise',
    'list', 'print', 'from', 'import', 'except', 'finally', 'raise', 'global', 'in', 'is', 'del', 'with', 'as',
    'async', 'await',
]
REGEXES = {
    'palabrasRe': re.compile(fr'\b({"|".join(KEYWORDS)})'),
    'string1': re.compile(r'[\'][\W\w]*[\']'),
    'string2': re.compile(r'[\"][\W\w]*[\"]'),
    'comentario1': re.compile(r'#.+\n'),
    'comentario2': re.compile(r'(\'{3}(.+\n)*\'{3})'),
    'num': re.compile(r'(\d+(?:\.\d+)?)'),
    'identificador': re.compile(r'\b[a-zA-z]\w*'),
    'operadores': re.compile(r'(\+|\-|\/|\*|\%|\>|\<|\=)'),
    'caractEspeciales': re.compile(r'\W'),
}
COORDS_TRANSLATION = {
    'palabrasRe': 'copalRev',
    'string1': 'costr1',
    'string2': 'costr2',
    'comentario1': 'cocomen1',
    'comentario2': 'cocomen2',
    'num': 'conum',
    'identificador': 'coiden',
    'operadores': 'coop',
    'caractEspeciales': 'corcarE',
}
COLORS = {
    'copalRev': '#FFCE3O',
    'costr1': '#FFFFFF',
    'costr2': '#',
    'cocomen1': '#FFFFFF',
    'cocomen2': '#',
    'conum': '#8032E4',
    'coiden': '#5982D3',
    'coop': '#FFA500',
    'corcarE': '#',
}
# Open the file for reading
with open('scratch_2.py', "r") as archivo:
    lineas = archivo.readlines()
# HTML output file
html = open('holamundo.html', 'w')
mensaje = " "
# Lists for the coordinates
pos_ren = 0
coordinates = defaultdict(list)
# Read the file's lines and scan them
for i in lineas:
    ren = lineas[pos_ren]
    menren = ren
    mensaje += "<p>"
    for name, regex in REGEXES.items():
        search = REGEXES[name].finditer(menren)
        for match in search:
            cor = match.span()
            coordinates[COORDS_TRANSLATION[name]].append(cor)
    for name2, color in COLORS.items():
        for j in coordinates[name2]:
            x, y = j
            print(j)
            print(ren)
            pal = ren[x:y]
            mensaje = f"<FONT COLOR='#{color}'>{pal}</FONT>"  # colour for this token class
            menren = menren.replace(pal, mensaje)
        coordinates[name2].clear()
    mensaje += menren + "</p>"
    pos_ren += 1
html.write(f"<html><head></head><body bgcolor='#232231' >{mensaje}</body></html>")
html.close()
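Two things jump out in the refactor (my reading, not verified against your files): inside the inner loop the page accumulator mensaje is overwritten instead of using a throwaway variable like the original mensaje2, and because the COLORS values already start with '#', the f-string '#{color}' emits a double hash. Also note that '#FFCE3O' ends in the letter O rather than a zero. A minimal corrected inner loop could look like this:

for name2, color in COLORS.items():
    for j in coordinates[name2]:
        x, y = j
        pal = ren[x:y]
        wrapped = f"<FONT COLOR='{color}'>{pal}</FONT>"  # color already contains '#'
        menren = menren.replace(pal, wrapped)
    coordinates[name2].clear()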


Update dictionary within dictionary dynamically returns the same character count for different parameters

I'm trying to retrieve Wikipedia pages' character counts for articles in different languages. I'm using a dictionary with the page name as key and, as value, a dictionary with the language as key and the count as value.
The code is:
pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicty = {}
dicto = {}
numz = 0
for x in langs:
    wikipedia.set_lang(x)
    for y in pages:
        pagelang = wikipedia.page(y)
        splittedpage = pagelang.content
        dicto[y] = dicty
        for char in splittedpage:
            numz += 1
        dicty[x] = numz
If I print dicto, I get
{"L'arte della gioia": {'it': 72226, 'en': 111647}, 'Il nome della rosa': {'it': 72226, 'en': 111647}}
The count should be different for the two pages.
Please try this code. I didn't run it because I don't have the wikipedia module.
Notes:
Your counts come out identical because dicto[y] = dicty stores the same inner dictionary for every page, and numz is never reset, so both pages end up pointing at one shared, accumulating counter.
Since your expected result is dict[page, dict[lang, cnt]], I think iterating pages first and then languages is more natural. If you want to iterate languages first for performance reasons, see the update below.
The character count of a text is simply len(text); there is no need to iterate and sum.
Use descriptive variable names; you will quickly get lost with names like x and y.
import wikipedia

pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicto = {}
for page in pages:
    lang_cnt_dict = {}
    for lang in langs:
        wikipedia.set_lang(lang)
        page_lang = wikipedia.page(page)
        chars_cnt = len(page_lang.content)
        lang_cnt_dict[lang] = chars_cnt
    dicto[page] = lang_cnt_dict
print(dicto)
Update
If you want to iterate langs first:
pages = ["L'arte della gioia", "Il nome della rosa"]
langs = ["it", "en"]
dicto = {}
for lang in langs:
    wikipedia.set_lang(lang)
    for page in pages:
        page_lang = wikipedia.page(page)
        chars_cnt = len(page_lang.content)
        if page in dicto:
            dicto[page][lang] = chars_cnt
        else:
            dicto[page] = {lang: chars_cnt}
print(dicto)

String.strip is skipping a character in the second for loop

After the second iteration over my array, the code skips a character.
I think the problem is here:
for word in range(int(len(ShortArray))):
    localString = LongArray[word]
    #print(word)
    if localString[:2] == ShortArray[word]:
        print(LongArray[word])
        print(word)
Here is the full code:
kleuren = ["Rood","Geel","Groen","Blauw","Wit","Paars","Oranje","Zwart"]
KleurenShort = []
def splitArray(string):
    for lenght in range(int(len(string) / 2)):
        KleurenShort.append(string[:2])
        print(KleurenShort)
        string = string.strip(string[:2])
    return KleurenShort
def tekst_naar_kleur(string):
    return 0
def matchFirst2Letters(ShortArray,LongArray):
    for word in range(int(len(ShortArray))):
        localString = LongArray[word]
        #print(word)
        if localString[:2] == ShortArray[word]:
            print(LongArray[word])
            print(word)
matchFirst2Letters(splitArray("RoGeGrBl"),kleuren)
The outcome is:
['Ro']
['Ro', 'Ge']
['Ro', 'Ge', 'rB']
['Ro', 'Ge', 'rB', 'l']
when it should be:
['Ro']
['Ro', 'Ge']
['Ro', 'Ge', 'Gr']
['Ro', 'Ge', 'Gr', 'Bl']
The problem is the use of the str.strip() method. strip() treats its argument as a set of characters and removes every leading and trailing character that appears in that set, so
'aaaaaabcdb'.strip('ab')
gives 'cd'. You can simply get rid of the first two letters of the input string by slicing:
'abcde'[2:] will give 'cde'.
Implemented in your code, the corrected version is:
kleuren = ["Rood","Geel","Groen","Blauw","Wit","Paars","Oranje","Zwart"]
KleurenShort = []
def splitArray(string):
    for lenght in range(int(len(string) / 2)):
        KleurenShort.append(string[:2])
        print(KleurenShort)
        string = string[2:]
    return KleurenShort
def tekst_naar_kleur(string):
    return 0
def matchFirst2Letters(ShortArray,LongArray):
    for word in range(int(len(ShortArray))):
        localString = LongArray[word]
        #print(word)
        if localString[:2] == ShortArray[word]:
            print(LongArray[word])
            print(word)
matchFirst2Letters(splitArray("RoGeGrBl"),kleuren)
which outputs
['Ro']
['Ro', 'Ge']
['Ro', 'Ge', 'Gr']
['Ro', 'Ge', 'Gr', 'Bl']
Rood
0
Geel
1
Groen
2
Blauw
3
With the answer from the comment linked below, your splitArray function simply becomes:
def splitArray(string):
    return [string[i:i+2] for i in range(0, len(string), 2)]
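For example, the comprehension version returns the pairs directly:

print(splitArray("RoGeGrBl"))  # ['Ro', 'Ge', 'Gr', 'Bl']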

How do you build a training dataset from scratch for a custom multi-class StanfordNLP/Stanza NER tagging model in BIOES/BILOU format?

I am utilizing NLP for a custom application and I want to train my own NER tagger model in StanfordNLP, now known as Stanza.
The default model is limited to very general tags such as LOC, PER, MISC, COUNTRY, TIME, etc.
My custom tags are more specific, e.g. Food, Sport, Software, Brand. How would I go about formatting the data I scraped off the web or from PDF files in BIOES/BILOU format?
https://en.wikipedia.org/wiki/Inside%E2%80%93outside%E2%80%93beginning_(tagging)
Do I necessarily have to tag the data manually, or can I write a script to generate it in the format shown below?
Alex S-PER
is O
playing O
basketball I-SPORT
with O
Marty B-PER
. O
Rick E-PER
likes O
to O
eat O
Pizza I-FOOD
in O
Los B-LOC
Angeles E-LOC
If so then what tools and libraries can I use in Python?
Thank you in advance.
You can write a piece of code to achieve this task. Here is a solution I made.
from flashtext import KeywordProcessor
import stanza

nlp = stanza.Pipeline(lang='en', processors='tokenize,ner,pos')

# Tag tokens with standard NLP BIO tags
def bio_tagger(entity, start_char, end_char, tag):
    bio_tagged = {"tag": None, "end_char": None, "start_char": None}
    ne_tagged = entity.split()
    if len(ne_tagged) == 1:
        bio_tagged["start_char"] = start_char
        bio_tagged["end_char"] = start_char + len(ne_tagged[0])
        bio_tagged["tag"] = "B-"+tag
    elif len(ne_tagged) == 2:
        bio_tagged["start_char"] = start_char
        bio_tagged["end_char"] = start_char + len(ne_tagged[0])
        bio_tagged["tag"] = "B-"+tag
        bio_tagged["start_char"] = end_char - len(ne_tagged[-1])
        bio_tagged["end_char"] = end_char
        bio_tagged["tag"] = "E-"+tag
    elif len(ne_tagged) >= 3:
        bio_tagged["start_char"] = start_char
        bio_tagged["end_char"] = start_char + len(ne_tagged[0])
        bio_tagged["tag"] = "B-"+tag
        bio_tagged["start_char"] = end_char - len(ne_tagged[-1])
        bio_tagged["end_char"] = end_char
        bio_tagged["tag"] = "E-"+tag
        cursor = start_char + len(ne_tagged[0]) + 2
        for tok in ne_tagged[1:-1]:
            bio_tagged["start_char"] = cursor
            bio_tagged["end_char"] = cursor + len(tok)
            bio_tagged["tag"] = "I-"+tag
            cursor = cursor + len(tok) + 2
    return bio_tagged

def bio_corpus_builder(text, tags2dict):
    # tags2dict {'SPORT':['football','basektball'],
    #            'FOOD':['banana','orange']}
    # NOTE: get_sentences() is assumed to be a helper that splits `text` into
    # sentences; it is not shown in the original answer.
    corpus = []
    document = []
    for tag, tag_values in tags2dict.items():
        keyword_processor = KeywordProcessor()
        keyword_processor.add_keywords_from_list(tag_values)
        for sent in get_sentences(text):
            word_tag_tagger = []
            each_token = {"text": None, "upos": None, "xpos": None, "tag": None, "end_char": None, "start_char": None}
            entities_found = keyword_processor.extract_keywords(sent, span_info=True)
            if entities_found:
                # construct custom tag
                for word_tag in entities_found:
                    word_tag_tagger.append(bio_tagger(word_tag[0], word_tag[1], word_tag[2], tag))
                # read original tag
                doc = nlp(sent)
                sentence = doc.sentences[0]
                print(word_tag_tagger)
                each_sent = []
                for token in sentence.tokens:
                    each_token["text"] = token.text
                    each_token["tag"] = token.ner
                    each_token["end_char"] = token.end_char
                    each_token["start_char"] = token.start_char
                    each_token["upos"] = token.to_dict()[0]["upos"]
                    each_token["xpos"] = token.to_dict()[0]["xpos"]
                    each_sent.append(dict(each_token))
                # update tagging
                for tok in each_sent:
                    for word2tags in word_tag_tagger:
                        if (int(tok["start_char"]) == int(word2tags["start_char"])) and (int(tok["end_char"]) == int(word2tags["end_char"])):
                            tok["tag"] = word2tags['tag']
                document.append(each_sent)
        del keyword_processor
    return document

# if "__name__"=="__main__":
tags2dict = {'SPORT':['football','basektball'],'FOOD':['banana','orange']}
text = "Barack Obama was born in Hawaii. He love basektball."
bio_corpus_builder(text, tags2dict)
#output
# [[{'text': 'He',
# 'upos': 'PRON',
# 'xpos': 'PRP',
# 'tag': 'O',
# 'end_char': 2,
# 'start_char': 0},
# {'text': 'love',
# 'upos': 'VERB',
# 'xpos': 'VBP',
# 'tag': 'O',
# 'end_char': 7,
# 'start_char': 3},
# {'text': 'basektball',
# 'upos': 'NOUN',
# 'xpos': 'NN',
# 'tag': 'B-SPORT',
# 'end_char': 18,
# 'start_char': 8},
# {'text': '.',
# 'upos': 'PUNCT',
# 'xpos': '.',
# 'tag': 'O',
# 'end_char': 19,
# 'start_char': 18}]]
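To get from the token dictionaries above to the two-column token/tag text shown in the question, only a small formatting step is needed. This is a sketch I added (the space separator and the blank line between sentences are my choices, not part of the original answer):

def corpus_to_bioes_text(document):
    # document is the list of sentences returned by bio_corpus_builder(),
    # each sentence being a list of token dicts with 'text' and 'tag' keys.
    lines = []
    for sent in document:
        for tok in sent:
            lines.append(f"{tok['text']} {tok['tag']}")
        lines.append("")  # blank line between sentences
    return "\n".join(lines)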

Python Identifier Identification

I'm reading a Python file in a Python program and I want to get a list of all the identifiers, literals, separators and terminators in the file being read. Using identifiers as an example:
one_var = "something"
two_var = "something else"
other_var = "something different"
Assuming the variables above are in the file being read, the result should be:
list_of_identifiers = [one_var, two_var, other_var]
Same thing goes for literals, terminators and separators. Thanks
I already wrote code for all operators and keywords:
import keyword, operator
list_of_operators = []
list_of_keywords = []
more_operators = ['+', '-', '/', '*', '%', '**', '//', '==', '!=', '>', '<', '>=', '<=', '=', '+=', '-=', '*=', '/=', '%=', '**=', '//=', '&', '|', '^', '~', '<<', '>>', 'in', 'not in', 'is', 'is not', 'not', 'or', 'and']
with open('file.py') as data_source:
    for each_line in data_source:
        new_string = str(each_line).split(' ')
        for each_word in new_string:
            if each_word in keyword.kwlist:
                list_of_keywords.append(each_word)
            elif each_word in operator.__all__ or each_word in more_operators:
                list_of_operators.append(each_word)
print("Operators found:\n", list_of_operators)
print("Keywords found:\n", list_of_keywords)
import ast
with open('file.py') as data_source:
    ast_root = ast.parse(data_source.read())
identifiers = set()
for node in ast.walk(ast_root):
    # every ast.Name node is an identifier that is read or assigned somewhere
    if isinstance(node, ast.Name):
        identifiers.add(node.id)
print(identifiers)
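The ast approach only collects identifiers. If you also need literals, operators, separators and the like, here is a sketch using the standard tokenize module (my addition, not part of the original answer) that labels every token in the file:

import tokenize

# Print the token class and text of every token in file.py.
with open('file.py', 'rb') as source:
    for tok in tokenize.tokenize(source.readline):
        print(tokenize.tok_name[tok.type], repr(tok.string))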

More Pythonic to write Functions with Regex

I've got 20'000+ court documents I want to pull specific data points out of: date, document number, verdict. I am using Python and Regex to perform this.
The verdicts are in three languages (German, French and Italian) and some of them have slightly different formatting. I am trying to develop functions for the various data points that take this, and the different languages, into account.
I'm finding my functions very clumsy. Has anybody got a more pythonic way to develop these functions?
def gericht(doc):
    Gericht = re.findall(
        r"Beschwerde gegen [a-z]+ [A-Z][a-züöä]+ ([^\n\n]*)", doc)
    Gericht1 = re.findall(
        r"Beschwerde nach [A-Za-z]. [0-9]+ [a-z]+. [A-Z]+ [a-z]+ [a-z]+[A-Za-z]+ [a-z]+ [0-9]+. [A-Za-z]+ [0-9]+ ([^\n\n]*)", doc)
    Gericht2 = re.findall(
        r"Revisionsgesuch gegen das Urteil ([^\n\n]*)", doc)
    Gericht3 = re.findall(
        r"Urteil des ([^\n\n]*)", doc)
    Gericht_it = re.findall(
        r"ricorso contro la sentenza emanata il [0-9]+ [a-z]+ [0-9]+ [a-z]+ ([^\n\n]*)", doc)
    Gericht_fr = re.findall(
        r"recours contre l'arrêt ([^\n\n]*)", doc)
    Gericht_fr_1 = re.findall(
        r"recours contre le jugement ([^\n\n]*)", doc)
    Gericht_fr_2 = re.findall(
        r"demande de révision de l'arrêt ([^\n\n]*)", doc)
    try:
        if Gericht != None:
            return Gericht[0]
    except:
        None
    try:
        if Gericht1 != None:
            return Gericht1[0]
    except:
        None
    try:
        if Gericht2 != None:
            return Gericht2[0]
    except:
        None
    try:
        if Gericht3 != None:
            return Gericht3[0]
    except:
        None
    try:
        if Gericht_it != None:
            return Gericht_it[0]
    except:
        None
    try:
        if Gericht_fr != None:
            Gericht_fr = Gericht_fr[0].replace('de la ', '').replace('du ', '')
            return Gericht_fr
    except:
        None
    try:
        if Gericht_fr_1 != None:
            Gericht_fr_1 = Gericht_fr_1[0].replace('de la ', '').replace('du ', '')
            return Gericht_fr_1
    except:
        None
    try:
        if Gericht_fr_2 != None:
            Gericht_fr_2 = Gericht_fr_2[0].replace('de la ', '').replace('du ', '')
            return Gericht_fr_2
    except:
        None
The result of re.findall() is never None, so all those if statements testing for it are superfluous. Using findall() when you only want the first result also doesn't make sense; re.search() fits better. (And [^\n\n] is just [^\n]: repeating \n inside a character class adds nothing.)
The replacing in the French results may remove too much. For instance, the 'du ' replacement does not just remove the word du, it also affects words ending in du: 'rendu du tribunal'.replace('du ', '') yields 'rentribunal'.
def gericht(doc):
    for pattern, is_french in [
        (r'Beschwerde gegen [a-z]+ [A-Z][a-züöä]+ ([^\n]*)', False),
        (
            r'Beschwerde nach [A-Za-z]. [0-9]+ [a-z]+. [A-Z]+ [a-z]+'
            r' [a-z]+[A-Za-z]+ [a-z]+ [0-9]+. [A-Za-z]+ [0-9]+ ([^\n]*)',
            False
        ),
        (r'Revisionsgesuch gegen das Urteil ([^\n]*)', False),
        (r'Urteil des ([^\n]*)', False),
        (
            r'ricorso contro la sentenza emanata il [0-9]+ [a-z]+ [0-9]+'
            r' [a-z]+ ([^\n]*)',
            False
        ),
        (r"recours contre l'arrêt ([^\n]*)", True),
        (r'recours contre le jugement ([^\n]*)', True),
        (r"demande de révision de l'arrêt ([^\n]*)", True),
    ]:
        match = re.search(pattern, doc)
        if match:
            result = match.group(1)
            if is_french:
                for removable in [' de la ', ' du ']:
                    result = result.replace(removable, ' ')
            return result
    return None
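A quick sanity check with an invented one-line sample (my example, not text from a real judgment; it assumes import re and the function above):

sample = "Beschwerde gegen den Entscheid des Obergerichts des Kantons Bern\nwegen Ausstand"
print(gericht(sample))  # -> 'des Obergerichts des Kantons Bern'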
