Cleaning text files with regex of python - python

I have a huge file in which there are lines like this one:
"En g茅n茅ral un tr猫s bon hotel La terrasse du bar pr猫s du lobby"
How to remove these Sinographic characters from the lines of the file so I get a new file where these lines are with Roman alphabet characters only?
I was thinking of using regular expressions.
Is there a character class for all Roman alphabet characters, e.g. Arabic numerals, a-nA-N and other(punctuation)?

I find this regex cheat sheet comes in very handy for situations like these.
# -*- coding: utf-8
import re
import string
u = u"En.!?+ 123 g茅n茅ral un tr猫s bon hotel La terrasse du bar pr猫s du lobby"
p = re.compile(r"[^\w\s\d{}]".format(re.escape(string.punctuation)))
for m in p.finditer(u):
print m.group()
>>> 茅
>>> 茅
>>> 猫
>>> 猫
I'm also a huge fan of the unidecode module.
from unidecode import unidecode
u = u"En.!?+ 123 g茅n茅ral un tr猫s bon hotel La terrasse du bar pr猫s du lobby"
print unidecode(u)
>>> En.!?+ 123 gMao nMao ral un trMao s bon hotel La terrasse du bar prMao s du lobby

You can use the string module.
>>> string.ascii_letters
'abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ'
>>> string.digits
'0123456789'
>>> string.punctuation
'!"#$%&\'()*+,-./:;<=>?@[\\]^_`{|}~'
>>>
And it seems the characters you want to remove are Chinese. If all your strings are unicode, you can use the simple range [\u4e00-\u9fa5] to match and replace them. This is not the whole range of Chinese characters, but it is enough here.
>>> s = u"En g茅n茅ral un tr猫s bon hotel La terrasse du bar pr猫s du lobby"
>>> s
u'En g\u8305n\u8305ral un tr\u732bs bon hotel La terrasse du bar pr\u732bs du lobby'
>>> import re
>>> re.sub(ur'[\u4e00-\u9fa5]', '', s)
u'En gnral un trs bon hotel La terrasse du bar prs du lobby'
>>>

You can do it without regexes.
To keep only ascii characters:
# -*- coding: utf-8 -*-
import unicodedata
unistr = u"En g茅n茅ral un tr猫s bon hotel La terrasse du bar pr猫s du lobby"
unistr = unicodedata.normalize('NFD', unistr) # to preserve `e` in `é`
ascii_bytes = unistr.encode('ascii', 'ignore')
To remove everything except ascii letters, numbers, punctuation:
from string import ascii_letters, digits, punctuation, whitespace
to_keep = set(map(ord, ascii_letters + digits + punctuation + whitespace))
all_bytes = range(0x100)
to_remove = bytearray(b for b in all_bytes if b not in to_keep)
text = ascii_bytes.translate(None, to_remove).decode()
# -> En gnral un trs bon hotel La terrasse du bar prs du lobby

Related

How to extract a substring using this regex pattern? It gives a ValueError: too many values to unpack (expected 1)

import re, random, os, datetime, time
from os import remove
from unicodedata import normalize
from glob import glob
def learn_in_real_time(input_text, text):
# Normalises input_text, matches it against the question pattern below and
# tries to pull out the phrase being asked about. `text` is never modified
# here; both return statements hand it back as received.
# Remove accents and other diacritics, except the ñ
input_text = re.sub(
r"([^n\u0300-\u036f]|n(?!\u0303(?![\u0300-\u036f])))[\u0300-\u036f]+", r"\1",
normalize("NFD", input_text), 0, re.I
)
input_text = normalize( 'NFC', input_text) # -> NFC
input_text_to_check = input_text.lower() # Convert everything to lowercase
words = []
words_associations = []
regex_what_who = r"(.*)\¿?(que sabes|que sabias|que sabrias|que te referis|que te refieres|que te referias|que te habias referido|que habias referido|a que|que|quienes|quien)\s*(con que|con lo que|con la que|con|acerca de que|acerca de quienes|acerca de quien|sobre de que|sobre que|sobre de quienes|sobre quienes|sobre de quien|sobre quien|)\s*(son|sean|es|serian|seria)\s*(iguales|igual|similares|similar|parecidos|parecido|comparables|comparable|asociables|asociable|distinguibles|distinguible|distintos|distinto|diferentes|diferente|diferenciables|diferenciable|)\s*(a |del |de |)\s*((?:\w+\s*)+)?"
l = re.search(regex_what_who, input_text_to_check, re.IGNORECASE) # Used to test the regex and decide whether the code block below is entered
if l:
#print("C")
# NOTE(review): the pattern above has seven capturing groups, so this
# single-name unpacking of l.groups() raises
# "ValueError: too many values to unpack (expected 1)".
# l.group(7) selects the trailing phrase instead.
association, = l.groups()
association = association.strip()
association_check = association + "\n" # Used for the comparisons; otherwise the words would wrongly be treated as missing from the list just because they lack the \n
return text
return text
I need it to extract the words matched by ((?:\w+\s*)+) and save them to a variable as a string, but the problem is that it gives me this error:
Traceback (most recent call last):
File "answer_about_learned_in_txt.py", line 106, in <module>
print(learn_in_real_time(input_t, text))
File "answer_about_learned_in_txt.py", line 72, in learn_in_real_time
association, = l.groups()
ValueError: too many values to unpack (expected 1)
How do I extract everything that is matched by ((?:\w+\s*)+) and save it in a variable?
While I'm asking, I would also like to know how to:
a) extract everything matched by ((?:\w+\s*)+) without cutting at blank spaces, so that the whole phrase is saved, for example: "Hello, how are you?"
b) extract what is matched by ((?:\w+\s*)+) but save only the part up to the first white space, for example: "Hello"
The problem is that if I write the following, position 6 of the tuple does not give me what I want:
if l:
#print("C")
#association, = l.groups()
print(l.groups())
association, _temp = l.group(6)
And it gives me this error
File "answer_about_learned_in_txt.py", line 74, in learn_in_real_time
association, _temp = l.group(6)
ValueError: not enough values to unpack (expected 2, got 0)
In the end I was able to solve it with the following
If you enter
Que son los cometas
print (l.groups ())
('', 'que', '', 'son', '', '', 'los cometas')
I'm interested in the seventh position of the tuple, counting from 1
association = l.group (7)
And this give me :
'los cometas'
Let's first rewrite the pattern string into a more logical form that keeps its main feature:
regex_what_who = r"(que sabes|que sabias|que sabrias|que te referis|que te refieres|que te referias|que te habias referido|que habias referido|a que|que|quienes|quien|con que|con lo que|con la que|con|acerca de que|acerca de quienes|acerca de quien|sobre de que|sobre que|sobre de quienes|sobre quienes|sobre de quien|sobre quien|son|sean|es|serian|seria|iguales|igual|similares|similar|parecidos|parecido|comparables|comparable|asociables|asociable|distinguibles|distinguible|distintos|distinto|diferentes|diferente|diferenciables|diferenciable).*(a|del|de)\s*((?:\w+\s*)+)?"
Then fix the first error, so that it works whether we get one result or many:
association, _temp = l.groups()
It Work's! -)

Python Regex to Find Special Characters and characters in between

I have a csv file that looks like the following
Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2842.020;2843.270;Unknown;; tecnici delle societ…
Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2903.310;2906.360;Unknown;; pu• avere un profilo specifico
Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2745.860;2749.060;Unknown;; Š quadruplicato rispetto al 1967.
Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;1023.580;1026.250;Unknown;; monitoraggio fosse completo e cosŤ via.
Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;708.870;711.290;Unknown;; Non solo un ponte, ma qualcosa di pi—.
Porta-a-Porta-d605218c-b8c5-4b3b-9086-b83e4c958bf5;4199.210;4200.540;Unknown;; piů straziante.
Porta-a-Porta-c28a23f4-d7b0-4624-8b49-72ba25be653e;4702.720;4703.900;Unknown;; tant'č che questo ragazzo
Presa-Diretta-Burocrazia-al-potere-ce58265f-da04-4b19-a1ad-2746830cac0a;4229.110;4232.130;Unknown;; a un testo di 13 pagine con 7/8.000 parole.<
Presa-Diretta-Burocrazia-al-potere-ce58265f-da04-4b19-a1ad-2746830cac0a;4541.560;4543.100;Unknown;; sei/otto ore al giorno.<
PresaDiretta-Il-capitale-naturale-8f39ea4f-a5fb-4c93-a504-a04d6482c086;1938.730;1941.830;Unknown;; abbattere i cervi.> Senza di loro, questa terra sarebbe
Quante-storie-15aef095-7ba8-4237-af6e-aded20d1d40a;19.920;22.630;Unknown;; questa puntata {an2}che ha come ospite una
Quante-storie-15aef095-7ba8-4237-af6e-aded20d1d40a;64.080;68.090;Unknown;; {an2}Sì, perché c'è come un ritegno a venire in una
Quante-storie-200b0694-7d54-4b5c-af5a-b54cae157ffd;446.730;447.790;Unknown;; della nostra Patria. {an2}[LA
Quante-storie-2583a3a2-2e8c-4589-bede-933736b65043;1781.910;1783.030;Unknown;; UDIBILI]
Porta-a-Porta-3b4b81d5-2f0f-4e51-9c29-00f9a2aa4444;4159.470;4160.890;Unknown;; bianca torneremo.#
Porta-a-Porta-3b4b81d5-2f0f-4e51-9c29-00f9a2aa4444;4196.930;4198.230;Unknown;; del sole#
and I am trying to spot unnecessary characters that should not belong in this file such as < or { or {an2} or [ and so on.
This is the regex I have right now and does the job well except it does not catch some cases like {an2} or # as described above. I would like to find everything including an2 and leave every Italian characters as is.
[^a-zA-Z0-9;'"\.\- ,\?:£\]\[\/()%!èàéùòìíŕěúůňčÂŤŠÈÉôü&+<>##$%^…—‚–]
Let me know if there is any easier way to solve this problem.
My guess is that, maybe we would find those undesired parts, then replace with an empty string, with some expressions similar to:
{.+?}|[\[\]<>]
Test
import re
regex = r"{.+?}|[\[\]<>]"
test_str = ("Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2842.020;2843.270;Unknown;; tecnici delle societ…\n"
"Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2903.310;2906.360;Unknown;; pu• avere un profilo specifico\n"
"Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;2745.860;2749.060;Unknown;; Š quadruplicato rispetto al 1967.\n"
"Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;1023.580;1026.250;Unknown;; monitoraggio fosse completo e cosŤ via.\n"
"Porta-a-Porta-d87134d1-e2bd-426b-b1f6-90d8dca68855;708.870;711.290;Unknown;; Non solo un ponte, ma qualcosa di pi—.\n"
"Porta-a-Porta-d605218c-b8c5-4b3b-9086-b83e4c958bf5;4199.210;4200.540;Unknown;; piů straziante.\n"
"Porta-a-Porta-c28a23f4-d7b0-4624-8b49-72ba25be653e;4702.720;4703.900;Unknown;; tant'č che questo ragazzo\n"
"Presa-Diretta-Burocrazia-al-potere-ce58265f-da04-4b19-a1ad-2746830cac0a;4229.110;4232.130;Unknown;; a un testo di 13 pagine con 7/8.000 parole.<\n"
"Presa-Diretta-Burocrazia-al-potere-ce58265f-da04-4b19-a1ad-2746830cac0a;4541.560;4543.100;Unknown;; sei/otto ore al giorno.<\n"
"PresaDiretta-Il-capitale-naturale-8f39ea4f-a5fb-4c93-a504-a04d6482c086;1938.730;1941.830;Unknown;; abbattere i cervi.> Senza di loro, questa terra sarebbe\n"
"Quante-storie-15aef095-7ba8-4237-af6e-aded20d1d40a;19.920;22.630;Unknown;; questa puntata {an2}che ha come ospite una\n"
"Quante-storie-15aef095-7ba8-4237-af6e-aded20d1d40a;64.080;68.090;Unknown;; {an2}Sì, perché c'è come un ritegno a venire in una\n"
"Quante-storie-200b0694-7d54-4b5c-af5a-b54cae157ffd;446.730;447.790;Unknown;; della nostra Patria. {an2}[LA\n"
"Quante-storie-2583a3a2-2e8c-4589-bede-933736b65043;1781.910;1783.030;Unknown;; UDIBILI]\n"
"Porta-a-Porta-3b4b81d5-2f0f-4e51-9c29-00f9a2aa4444;4159.470;4160.890;Unknown;; bianca torneremo.#\n"
"Porta-a-Porta-3b4b81d5-2f0f-4e51-9c29-00f9a2aa4444;4196.930;4198.230;Unknown;; del sole#")
subst = ""
result = re.sub(regex, subst, test_str, 0, re.MULTILINE)
if result:
print (result)
Demo

Change unicode hardwrited in csv to corresponding character

I have a CSV with one column containing hard-written Unicode escape sequences:
["Investir dans un parc d'activit\u00e9s"]
["S\u00e9curiser, restaurer et g\u00e9rer 1 372 ha de milieux naturels impact\u00e9s par la construction de l'autoroute"]
["Am\u00e9liorer la consommation \u00e9nerg\u00e9tique de b\u00e2timents publics"]
["Favoriser la recherche, am\u00e9liorer la qualit\u00e9 des traitements et assurer un \u00e9gal acc\u00e8s des soins \u00e0 tous les patients de Franche-Comt\u00e9."]
I'm trying to replace them with the corresponding characters, but I can't seem to make it work. I tried with
df['Objectif(s)'] = df['Objectif(s)'].replace('\u00e9', 'é')
but the column doesn't change.
Seeing that the code below works, I tried to loop over the rows to fix it, with no success:
s = "d'activit\u00e9s"
print(s) # d'activités
print(s.replace('\u00e9', 'é' )) # d'activités
for case in df['Objectif(s)']:
s = str(case)
df['Objectif(s)'][case] = s # ["Investir dans un parc d'activit\u00e9s"]
If this '\u00e9' is actually written into the file by the source of the data as the six literal characters \ u 0 0 e 9, you need to do a string replace.
The trick here is that you need to escape the \ character in the first parameter of the replace function:
s.replace('\\u00e9', 'é' )
or use a "raw string literal" by prefixing r
s.replace(r'\u00e9', 'é' )
Try replacing
df['Objectif(s)'] = df['Objectif(s)'].replace('\u00e9', 'é')
to
df['Objectif(s)'] = df['Objectif(s)'].str.replace('\u00e9', 'é')

How to delete invalid characters between multiple strings in python?

I'm working in a project with OCR in Spanish. The camera captures different frames in a line of text. The line of text contains this:
Este texto, es una prueba del dispositivo lector para no videntes.
After some operations I get strings like that:
s1 = "Este texto, es una p!"
s2 = "fste texto, es una |prueba u.-"
s3 = "jo, es una prueba del dispo‘"
s4 = "prueba del dispositivo \ec"
s5 = "del dispositivo lector par:"
s6 = "positivo lector para no xndev"
s7 = "lector para no videntes"
s8 = "¡r para no videntes."
I would like to join the string so that I can get the text of the scanned line in a final string like that:
sf = "Este texto, es una prueba del dispositivo lector para no videntes."
To begin I tried to use SequenceMatcher between two strings but it was not effective:
# -*- coding: utf-8 -*-
from difflib import SequenceMatcher as sq
s1 = "Este texto, es una p!"
s2 = "fste texto, es una prueba u.-"
match = sq(None, s1, s2).find_longest_match(0, len(s1), 0, len(s2))
print unicode(s1 + s2[match.b + match.size:])
The result has invalid characters like | or !:
>>>Este texto, es una p!|prueba u.-
Between s2 and s3:
>>>fste texto, es una |prueba u.-prueba del dispo‘
Etc. I'm using python 2.7 on Windows 7.
You should use regex
Do something like
import re
line = re.sub(r'\W', r'', line)
\W means any non-word character. You can read more about regexes at: https://docs.python.org/2/library/re.html

How to understand regular expression with python?

I'm new with python. Could anybody help me on how I can create a regular expression given a list of strings like this:
test_string = "pero pero CC
tan tan RG
antigua antiguo AQ0FS0
que que CS
según según SPS00
mi mi DP1CSS
madre madre NCFS000"
How to return a tuple like this:
> ([madre, NCFS00],[antigua, AQ0FS0])
I would like to return each word with its associated tag given test_string. This is what I've done:
# -- coding: utf-8 --
import re
#str = "pero pero CC " \
"tan tan RG " \
"antigua antiguo AQ0FS0" \
"que que CS " \
"según según SPS00 " \
"mi mi DP1CSS " \
"madre madre NCFS000"
tupla1 = re.findall(r'(\w+)\s\w+\s(AQ0FS0)', str)
print tupla1
tupla2 = re.findall(r'(\w+)\s\w+\s(NCFS00)',str)
print tupla2
The output is the following:
[('antigua', 'AQ0FS0')] [('madre', 'NCFS00')]
The problem with this output is that if I pass it along test_string I need to preserve the "order" or "occurrence" of the tags (i.e. I only can print a tuple if and only if they have the following order: AQ0FS0 and NCFS000 in other words: female adjective, female noun).
^([a-zA-Z]+)\s+[a-zA-Z]+\s+([\w]+(?=\d$)\d)
I don't really know the basis for this selection, but you can still get it like this. Just grab the captures. Don't forget to set the g and m flags. See the demo.
http://regex101.com/r/nA6hN9/38

Categories

Resources