I have bulk data in the 'bulk_data' variable and need to extract parts of it into sub-variables as shown below. How can I do this with Python?
bulk_data = """F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
"""
typeA1 = <start with RM/>"JRSE1420"<until space> in 1st line
typeA2 = <start with RM/>"JRSE935"<until space> in 2nd line
typeA3 = <start with RM/>"JRSE10"<until space> in 3rd line
typeB1 = <start with space after typeA1>"Mod/4758"<until end of the line> in 1st line
typeB2 = <start with space after typeA2>"Mod/23"<until end of the line> in 2nd line
typeB3 = <start with space after typeA3>"Mod/445"<until end of the line> in 3rd line
Overall result would be:
typeA1 = 'JRSE1420'
typeA2 = 'JRSE935'
typeA3 = 'JRSE10'
typeB1 = 'Mod/4758'
typeB2 = 'Mod/23'
typeB3 = 'Mod/445'
And also is there any study manual to deal with such type of data manipulation ?
You can use the re module
import re

# Each line looks like: <prefix>RM/<typeA> <typeB-to-end-of-line>
bulk_data = '''F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
'''

# Captures the word that follows "RM/" on each line.
ptrn1 = re.compile(r'''
^       # start of a line (re.MULTILINE)
.*      # anything up to the last "RM/" on the line
RM/     # literal "RM/" ("/" needs no escaping)
(\w+)   # capture: 1+ word chars (letters, digits, underscore)
\b      # word boundary ends the capture before the space
.*      # rest of the line
$       # end of the line
''', re.MULTILINE | re.VERBOSE)

# Captures everything from "Mod" to the end of each line.
ptrn2 = re.compile(r'''
^        # start of a line (re.MULTILINE)
.*       # anything up to the last whitespace char
\s       # the whitespace separating the two fields
(Mod.*)  # capture: "Mod" and the rest of the line
$        # end of the line
''', re.MULTILINE | re.VERBOSE)

typeA1, typeA2, typeA3 = ptrn1.findall(bulk_data)
typeB1, typeB2, typeB3 = ptrn2.findall(bulk_data)
Why re? Looks like everything is already properly separated by different characters.
# Alternative without re: the fields are delimited by fixed characters.
lines = bulk_data.splitlines()
# "F0142514RM/JRSE1420 Mod/4758" -> ("F0142514RM/JRSE1420", "Mod/4758")
typeA1_, typeB1 = lines[0].split(' ')
# Keep only the part after the "/": "JRSE1420"
typeA1 = typeA1_.split('/')[1]
...
# Read "<left>/<A> <B>" rows from a file named 'data' and print the
# extracted values sorted by label (all TypeA* lines, then all TypeB*).
count = 1
li = []
with open('data') as f:
    for line in f:
        line = line.split()
        if line:  # skip blank lines
            a, b = line
            # Keep only the part after the first '/' (e.g. "JRSE1420").
            a = a[a.index('/')+1:]
            li.append("TypeA{} = {} ".format(count, a))
            li.append("TypeB{} = {} ".format(count, b))
            count += 1
for el in sorted(li):
    print(el)
TypeA1 = JRSE1420
TypeA2 = JRSE935
TypeA3 = JRSE10
TypeB1 = Mod/4758
TypeB2 = Mod/23
TypeB3 = Mod/445
Related
this is my code so far:
import re

a = ["abc", " this is in blue color","(Refer: '(color:rgb(61, 142, 185); )Set the TEST VIN value'(color:rgb(0, 0, 0); ) in document: (color:rgb(61, 142, 185); )[UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf?task_id=12639618&artifact_id=48569866] )"]

# Remove "(color:rgb(...); )" markup and the trailing "?query...] " part
# (from the "?" up to just before the final closing parenthesis).
# Bug fixes vs. the original attempt:
#   - r'(color[\w]+\;)' never matches (the markup contains ':', spaces
#     and parentheses), so sub() was a no-op;
#   - `a[i]` used an undefined index variable `i` (NameError).
_color_ptrn = re.compile(r"\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)")
p = [_color_ptrn.sub('', s) for s in a]
print(p)
Output required:
["abc", " this is in blue color","(Refer: 'Set the TEST VIN value' in document: [UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf)"]
There are 3 color parts to remove in the string, plus the part at the end from the question mark up to just before the closing )
You could match all the parts using an alternation |
\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)
Regex demo | Python demo
\(color: Match (color:
\w+\([^()]*\); \) Match 1+ word chars followed by matching from ( to ) a space and another )
| Or
\?[^?]+ Match ? and 1+ times all chars except ?
(?=\)$) Assert what is on the right is ) at the end of the string
Example code
import re

# Strip the color markup and any trailing "?query..." segment in one pass.
cleanup = re.compile(r"\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)")
test_str = " this is in blue color\",\"(Refer: 'Set the TEST VIN value' in document: [UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf)"
result = cleanup.sub("", test_str)
print(result)
Output
this is in blue color","(Refer: 'Set the TEST VIN value' in document: [UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf)
I have the following example text / tweet:
RT #trader $AAPL 2012 is o´o´o´o´o´pen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO
I want to follow the procedure of Table 1 in Li, T, van Dalen, J, & van Rees, P.J. (Pieter Jan). (2017). More than just noise? Examining the information content of stock microblogs on financial markets. Journal of Information Technology. doi:10.1057/s41265-016-0034-2 in order to clean up the tweet.
They clean the tweet up in such a way that the final result is:
{RT|123456} {USER|56789} {TICKER|AAPL} {NUMBER|2012} notooopen nottalk patent {COMPANY|GOOG} notdefinetli treatment {HASH|samsung} {EMOTICON|POS} haha {URL}
I use the following script to tokenize the tweet based on the regex:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
import re

# Emoticon pattern: eyes/nose/mouth, in either orientation (":-)" or "(-:").
emoticon_string = r"""
(?:
[<>]?
[:;=8] # eyes
[\-o\*\']? # optional nose
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
|
[\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
[\-o\*\']? # optional nose
[:;=8] # eyes
[<>]?
)"""

# Sub-patterns joined into one big alternation; the first alternative
# that matches at a position wins.
regex_strings = (
# URL:
r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+"""
,
# Twitter username:
r"""(?:#[\w_]+)"""
,
# Hashtags:
r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Cashtags:
r"""(?:\$+[\w_]+[\w\'_\-]*[\w_]+)"""
,
# Remaining word types:
r"""
(?:[+\-]?\d+[,/.:-]\d+[+\-]?) # Numbers, including fractions, decimals.
|
(?:[\w_]+) # Words without apostrophes or dashes.
|
(?:\.(?:\s*\.){1,}) # Ellipsis dots.
|
(?:\S) # Everything else that isn't whitespace.
"""
)

word_re = re.compile(r"""(%s)""" % "|".join(regex_strings), re.VERBOSE | re.I | re.UNICODE)
# NOTE(review): emoticon_string is never added to regex_strings, so word_re
# cannot match emoticons as one token, and regex_strings[1] is the *username*
# pattern — emoticon_re therefore tests usernames, not emoticons. TODO confirm.
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    """Regex-based tweet tokenizer.

    Splits a string with the module-level ``word_re`` pattern and, unless
    ``preserve_case`` is set, lower-cases every token that is not an
    emoticon (checked with the module-level ``emoticon_re``).
    """

    def __init__(self, preserve_case=False):
        # When True, tokens keep their original capitalization.
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """Return the list of tokens found in ``s``."""
        # Bug fix: the original fell back to Python-2-only ``unicode()``
        # and the removed 'string_escape' codec; ``str(s)`` cannot raise
        # UnicodeDecodeError on Python 3, so that dead branch is gone.
        s = str(s)
        words = word_re.findall(s)
        if not self.preserve_case:
            # Lower-case everything except emoticon tokens such as ":-)".
            # (List instead of lazy ``map`` so the result is reusable.)
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
        return words
if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    # Sample tweet; the URL was replaced by a plain token before posting.
    test = ' RT #trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
This yields the following output:
rt
#trader
$aapl
2012
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
#samsung
got
:-)
heh
url_that_cannot_be_posted_on_SO
How can I adjust this script to get:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|url_that_cannot_be_posted_on_SO}
Thanks in advance for helping me out big time!
You really need to use named capturing groups (mentioned by thebjorn), and use groupdict() to get name-value pairs upon each match. It requires some post-processing though:
All pairs where the value is None must be discarded
If the self.preserve_case is false the value can be turned to lower case at once
If the group name is WORD, ELLIPSIS or ELSE the values are added to words as is
If the group name is HASHTAG, CASHTAG, USER or URL the values are added first stripped of $, # and # chars at the start and then added to words as {<GROUP_NAME>|<VALUE>} item
All other matches are added to words as {<GROUP_NAME>|<VALUE>} item.
Note that \w matches underscores by default, so [\w_] = \w. I optimized the patterns a little bit.
Here is a fixed code snippet:
import re

# Emoticon pattern as a named group, e.g. ":-)" or "(-:".
emoticon_string = r"""
(?P<EMOTICON>
[<>]?
[:;=8] # eyes
[-o*']? # optional nose
[][()dDpP/:{}#|\\] # mouth
|
[][()dDpP/:}{#|\\] # mouth
[-o*']? # optional nose
[:;=8] # eyes
[<>]?
)"""

# Every sub-pattern is a named group, so match.groupdict() reveals which
# kind of token matched.
regex_strings = (
# URL:
r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$#.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)"""
,
# Twitter username:
r"""(?P<USER>#\w+)"""
,
# Hashtags:
r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)"""
,
# Cashtags:
r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)"""
,
# Remaining word types:
r"""
(?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?) # Numbers, including fractions, decimals.
|
(?P<WORD>\w+) # Words without apostrophes or dashes.
|
(?P<ELLIPSIS>\.(?:\s*\.)+) # Ellipsis dots.
|
(?P<ELSE>\S) # Everything else that isn't whitespace.
"""
)

# The emoticon alternative is placed first so e.g. ":-)" is one token.
word_re = re.compile(r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)), re.VERBOSE | re.I | re.UNICODE)
#print(word_re.pattern)
# NOTE(review): regex_strings[1] is the USER pattern, so emoticon_re does not
# match emoticons; it is also unused by the tokenizer below — presumably a
# leftover from the first version. TODO confirm before relying on it.
emoticon_re = re.compile(regex_strings[1], re.VERBOSE | re.I | re.UNICODE)
######################################################################
class Tokenizer:
    """Tokenizer that emits tweet entities as ``{GROUP|value}`` items."""

    # Group names whose text is emitted verbatim (no tag wrapper).
    _PLAIN = ('WORD', 'ELLIPSIS', 'ELSE')
    # Group names whose leading #/$ marker chars are stripped before tagging.
    _MARKED = ('HASHTAG', 'CASHTAG', 'USER', 'URL')  # Add more here if needed

    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """Return the list of (possibly tagged) tokens found in ``s``."""
        try:
            s = str(s)
        except UnicodeDecodeError:
            s = str(s).encode('string_escape')
            s = unicode(s)
        # Tokenize: walk every match, keeping only the named group that fired.
        tokens = []
        for match in word_re.finditer(s):
            for name, text in match.groupdict().items():
                if not text:
                    continue
                if not self.preserve_case:
                    text = text.lower()
                if name in self._PLAIN:
                    tokens.append(text)
                    continue
                if name in self._MARKED:
                    text = re.sub(r'^[##$]+', '', text)
                tokens.append("{{{}|{}}}".format(name, text))
        return tokens
if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    # Sample tweet with a real URL so the URL group can fire.
    test = ' RT #trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
With test = ' RT #trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com', it outputs
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://some.site.here.com}
See the regex demo online.
def sauvegarder_canaux(self, nom_fichier:str) is the method giving me a problem when the file saves it only writes in this format:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
I need it to be like this:
5 : TQS : Télévision Quatres-saisons : 0.0 $ extra
This is the code that I have for now:
from canal import Canal
from forfait_tv import ForfaitTV
from abonne import Abonne
#============= Classe ===========================
class Distributeur:
    """Manage the lists of channels (canaux) and TV packages (forfaits).

    Private members:
        __canaux   -- list of Canal: known channels
        __forfaits -- list of ForfaitTV: available packages
    """

    def __init__(self):
        # The original assigned None and immediately overwrote it with [];
        # one initialization is enough.
        self.__canaux = []
        self.__forfaits = []

    def ajouter_canal(self, un_canal: "Canal"):
        """Append a channel to the list."""
        self.__canaux.append(un_canal)

    def chercher_canal(self, p_poste: int):
        """Return the first channel whose poste equals p_poste, or None.

        Bug fix: the original ended with ``return print(poste_trouve)``,
        which printed the channel but always returned None to the caller.
        """
        for canal in self.__canaux:
            if canal.get_poste() == p_poste:
                return canal
        return None

    def telecharger_canaux(self, nom_fichier: str):
        """Load channels from a ``v1 : v2 : v3 : v4`` text file.

        Bug fix: the original used ``line.strip(" : ")``, which only trims
        characters at both ends (indexing then yielded single characters);
        the line must be *split* on " : " to recover the four fields.
        """
        canal = None
        with open(nom_fichier, "r") as fichier_canaux:
            for line in fichier_canaux:
                champs = line.strip().split(" : ")
                canal = Canal(champs[0], champs[1], champs[2], champs[3])
                self.__canaux.append(canal)
        # Like the original, return the last channel created (None if empty).
        return canal

    def sauvegarder_canaux(self, nom_fichier: str):
        """Write one channel per line using str(canal).

        NOTE(review): the line format ("5 - TQS (...)" vs "5 : TQS : ...")
        is produced by Canal.__str__, not here — adjust it there if the
        saved format must change.
        """
        with open(nom_fichier, "w") as fich_canaux:
            for canal in self.__canaux:
                fich_canaux.write(str(canal) + "\n")
You need only to edit the string before you write it. The string.replace command is your friend. Perhaps ...
# Replace each of '-', '(' and ',' with ':' before writing the line.
# NOTE(review): this leaves the trailing ')' in place and does not collapse
# the resulting doubled spaces — shown as a minimal illustration only.
for i in self.__canaux:
    out_line = str(i)
    for char in "-(,":
        out_line = out_line.replace(char, ':')
    fichCanaux.write(out_line + "\n")
If removing the accents is okay, you can normalize the text to NFD with unicodedata, then find the segments of interest, modify them with the desired formatting, and replace them with the formatted segments using regex:
import unicodedata
import re
def format_string(test_str):
    """Reformat "5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)" into
    "5 : TQS : Television Quatres-saisons : 0.0 $ extra".

    Accents are dropped (NFD normalization + ASCII fold); the three
    separators ("-", "(", ",") are rewritten as " : " and the trailing
    ")" is removed.
    """
    # Normalize accents. Bug fix: the original called test_str.decode("UTF-8")
    # (bytes-only in Python 3) and then ran str regexes over the *bytes*
    # returned by .encode() — decode back to str instead.
    if isinstance(test_str, bytes):
        test_str = test_str.decode("utf-8")
    test_str = (unicodedata.normalize('NFD', test_str)
                .encode('ascii', 'ignore')
                .decode('ascii'))

    # segment patterns
    segment_1_ptn = re.compile(r"""[0-9]*(\s)*   # natural number
                                   [-](\s)*      # dash
                                   (\w)*(\s)*    # acronym
                               """,
                               re.VERBOSE)
    segment_2_ptn = re.compile(r"""(\w)*(\s)*          # acronym
                                   (\()                # open parenthesis
                                   ((\w*[-]*)*(\s)*)*  # words
                               """,
                               re.VERBOSE)
    segment_3_ptn = re.compile(r"""((\w*[-]*)*(\s)*)*            # words
                                   (,)(\s)*                      # comma
                                   [0-9]*(.)[0-9]*(\s)*(\$)(\s)  # real number
                               """,
                               re.VERBOSE)

    # Format data. Bug fix: the original searched with the undefined names
    # segment_1/segment_2/segment_3 instead of the compiled patterns.
    segment_1_match = segment_1_ptn.search(test_str).group()
    test_str = test_str.replace(segment_1_match, " : ".join(segment_1_match.split("-")))
    segment_2_match = segment_2_ptn.search(test_str).group()
    test_str = test_str.replace(segment_2_match, " : ".join(segment_2_match.split("(")))
    segment_3_match = segment_3_ptn.search(test_str).group()
    # The final [:-1] drops the closing ")" at the end of the line.
    test_str = test_str.replace(segment_3_match, " : ".join(segment_3_match.split(",")))[:-1]
    # Re-join on ":" with uniform " : " spacing.
    test_str = " : ".join([txt.strip() for txt in test_str.split(":")])
    return test_str
Then you can call this function within sauvegarder_canaux
def sauvegarder_canaux(self, nom_fichier: str):
    """Save every channel to nom_fichier, one format_string()-formatted line each.

    Bug fix: the original ``with open(...) as fichCanaux`` line was missing
    its trailing colon, which is a SyntaxError.
    """
    with open(nom_fichier, "w") as fich_canaux:
        for canal in self.__canaux:
            fich_canaux.write(format_string(str(canal)) + "\n")
You can also add format_string as a method within your Distributeur class.
Example input:
5 - TQS (Télévision Quatres-saisons, 0.0 $ extra)
Example output:
5 : TQS : Television Quatres-saisons : 0.0 $ extra
Need some help with RegEx over python.
I have this text:
part101_add(
name = "part101-1",
dev2_serial = "dev_l622_01",
serial_port = "/dev/tty-part101-1",
yok_serial = "YT8388"
)
yok_tar_add("YT8388", None)
part2_add(
name = "part2-1",
serial_number = "SERIALNUMBER",
serial_port = "/dev/tty-part2-1",
yok_serial = "YT03044",
yok_port_board = "N"
)
yok_tar_add("YT03044", None)
I need to select all part*_add and its content.
for example:
part101_add:
name = "part101-1",
dev2_serial = "dev_l622_01",
serial_port = "/dev/tty-part101-1",
yok_serial = "YT8388"
part2_add:
serial_number = "SERIALNUMBER",
serial_port = "/dev/tty-part2-1",
yok_serial = "YT03044",
yok_port_board = "N"
The problem is that I'm unable to separate the results.
when using this pattern:
regex = r"(.*?_add)\([\s\S.]*\)"
Thanks for your help.
I would precise the pattern to only match at the start and end of the line, and use a lazy quantifier with [\s\S]:
r"(?m)^(part\d+_add)\([\s\S]*?\)$"
See this regex demo
Details:
(?m) - an inline re.MULTILINE modifier version to make ^ match the line start and $ to match the line end
^ - start of a line
(part\d+_add) - Group 1 capturing part, 1+ digits, _add
\( - a literal (
[\s\S]*? - any 0+ chars, as few as possible up to
\)$ - a ) at the end of the line.
I have tables which looks like this:
text = """
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""
Is there a way to replace the "blank" entries in each table with 0? I am trying to set delimiters between the entries, but using the following code can't deal with blank spots in the tables:
# Python 2 code: for each non-header line, keep the last three
# whitespace-separated tokens as columns and join everything with '|'.
for line in text.splitlines():
    if 'ID' not in line:
        line1 = line.split()
        # Everything before the last three tokens is treated as the label —
        # this breaks when a row has fewer than three numeric entries.
        line = '|'.join((' '.join(line1[:-3]), '|'.join(line1[-3:])))
        print line
    else:
        print line
The output is:
ID = 1234
|
Hello World|135,343|117,668|81,228
Another line of|text|(30,632)|(48,063)
More text|0|11,205|0
Even more|text|1,447|681
|
ID = 18372
|
Another table|35,323|38,302|909,381
Another line with|text|13|15
More text|here|7|0
Even more text here|7,011|1,447|681
As you can see, the first problem shows up on the second line of the first table. The word 'text' is considered the first column. Any way to fix this in Python to replace blank entries with 0?
Here is a function for finding columns in a bunch of lines. The second argument pat defines what a column is, and can be any regex.
import itertools as it
import re
def find_columns(lines, pat=r' '):
    """Find column boundaries shared by every line in *lines*.

    *pat* is a regex describing what counts as column-separating text.
    Returns a sorted list of character offsets; consecutive pairs
    (widths[i], widths[i+1]) delimit one column.

    Usage:
        widths = find_columns(lines)
        for line in lines:
            if not line: continue
            vals = [line[widths[i]:widths[i+1]].strip()
                    for i in range(len(widths) - 1)]
    """
    longest = max(len(line) for line in lines)
    # For each line, the set of offsets covered by the separator pattern
    # (the line is right-padded so every set spans the same width).
    per_line = []
    for line in lines:
        padded = line.ljust(longest)
        covered = set()
        for m in re.finditer(pat, padded):
            covered.update(range(m.start(), m.end() + 1))
        per_line.append(covered)
    # Offsets that are separators on *every* line.
    common = sorted(set.intersection(*per_line))
    # Collapse each run of consecutive offsets to its first position.
    starts = [pos for prev, pos in zip([None] + common, common)
              if prev is None or pos - prev != 1]
    # Column 0 always starts at offset 0.
    if starts[0] != 0:
        starts.insert(0, 0)
    return starts
def report(text):
    """Print *text*'s tables with '|' between columns, filling blanks with 0.

    Lines starting with 'ID' are headers and are echoed unchanged; each run
    of non-header lines forms one table whose columns are located with
    find_columns().
    """
    is_header = lambda line: line.startswith('ID')
    for header, chunk in it.groupby(text.splitlines(), is_header):
        block = list(chunk)
        if header:
            print('\n'.join(block))
            continue
        # r'\s(?![a-zA-Z])' defines a column separator as whitespace not
        # followed by a letter, so multi-word labels stay in one cell.
        widths = find_columns(block, pat=r'\s(?![a-zA-Z])')
        bounds = list(zip(widths, widths[1:]))
        for line in block:
            if not line:
                continue
            cells = [line[lo:hi] for lo, hi in bounds]
            # Blank cells become "0", shifted one char to keep alignment.
            cells = [c if c.strip() else c[1:] + '0' for c in cells]
            print('|'.join(cells))
# Two sample tables, each introduced by an "ID = ..." header line.
# NOTE(review): the original data was column-aligned with extra padding
# spaces; that alignment appears lost in this copy — verify before running.
text = """\
ID = 1234
Hello World 135,343 117,668 81,228
Another line of text (30,632) (48,063)
More text 0 11,205 0
Even more text 1,447 681
ID = 18372
Another table 35,323 38,302 909,381
Another line with text 13 15
More text here 7 0
Even more text here 7,011 1,447 681
"""
report(text)
yields
ID = 1234
Hello World | 135,343| 117,668| 81,228
Another line of text| (30,632)| 0| (48,063)
More text | 0 | 11,205| 0
Even more text | 0| 1,447 | 681
ID = 18372
Another table | 35,323| 38,302| 909,381
Another line with text| 13 | 15|0
More text here | 0| 7 | 0
Even more text here | 7,011| 1,447| 681