Tokenize tweet based on Regex - python

I have the following example text / tweet:
RT @trader $AAPL 2012 is o´o´o´o´o´pen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO
I want to follow the procedure of Table 1 in Li, T, van Dalen, J, & van Rees, P.J. (Pieter Jan). (2017). More than just noise? Examining the information content of stock microblogs on financial markets. Journal of Information Technology. doi:10.1057/s41265-016-0034-2 in order to clean up the tweet.
They clean the tweet up in such a way that the final result is:
{RT|123456} {USER|56789} {TICKER|AAPL} {NUMBER|2012} notooopen nottalk patent {COMPANY|GOOG} notdefinetli treatment {HASH|samsung} {EMOTICON|POS} haha {URL}
I use the following script to tokenize the tweet based on the regex:
#!/usr/bin/env python
# -*- coding: utf-8 -*-
"""Regex-based tweet tokenizer.

Splits a tweet into URLs, emoticons, @usernames, #hashtags, $cashtags,
numbers, words, ellipses and single leftover characters.
"""
import re

# Western-style emoticons, in both orientations (":-)" and "(-:").
emoticon_string = r"""
    (?:
      [<>]?
      [:;=8]                     # eyes
      [\-o\*\']?                 # optional nose
      [\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
      |
      [\)\]\(\[dDpP/\:\}\{#\|\\] # mouth
      [\-o\*\']?                 # optional nose
      [:;=8]                     # eyes
      [<>]?
    )"""

# Alternatives are tried in order, so the more specific patterns come first.
# NOTE: the patterns are compiled with re.VERBOSE, where an unescaped "#"
# outside a character class starts a comment; literal "#" is written "\#".
regex_strings = (
    # URL:
    r"""http[s]?://(?:[a-zA-Z]|[0-9]|[$-_#.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+""",
    # Twitter username:
    r"""(?:@[\w_]+)""",
    # Hashtags:
    r"""(?:\#+[\w_]+[\w\'_\-]*[\w_]+)""",
    # Cashtags:
    r"""(?:\$+[\w_]+[\w\'_\-]*[\w_]+)""",
    # Remaining word types:
    r"""
    (?:[+\-]?\d+[,/.:-]\d+[+\-]?)  # Numbers, including fractions, decimals.
    |
    (?:[\w_]+)                     # Words without apostrophes or dashes.
    |
    (?:\.(?:\s*\.){1,})            # Ellipsis dots.
    |
    (?:\S)                         # Everything else that isn't whitespace.
    """,
)

_FLAGS = re.VERBOSE | re.I | re.UNICODE

# Emoticons are tried first so that ":-)" stays one token instead of being
# split into ":", "-", ")" by the catch-all \S alternative.
word_re = re.compile(
    r"""(%s)""" % "|".join((emoticon_string,) + regex_strings), _FLAGS)

# Used to decide which tokens keep their case (":-D" must not become ":-d").
emoticon_re = re.compile(emoticon_string, _FLAGS)


######################################################################
class Tokenizer:
    """Tokenize tweets with ``word_re``.

    Args:
        preserve_case: when False (the default) every token except
            emoticons is lower-cased.
    """

    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """Return the list of tokens found in ``s``.

        ``s`` may be text, UTF-8 encoded bytes, or any object with a
        string representation.
        """
        if isinstance(s, bytes):
            s = s.decode('utf-8', 'replace')
        else:
            s = str(s)
        # Tokenize:
        words = word_re.findall(s)
        if not self.preserve_case:
            # A real list (not a lazy map object) so callers can index it
            # or iterate it more than once.
            words = [w if emoticon_re.search(w) else w.lower() for w in words]
        return words


if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh url_that_cannot_be_posted_on_SO'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
This yields the following output:
rt
@trader
$aapl
2012
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
#samsung
got
:-)
heh
url_that_cannot_be_posted_on_SO
How can I adjust this script to get:
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|url_that_cannot_be_posted_on_SO}
Thanks in advance for helping me out big time!

You really need to use named capturing groups (mentioned by thebjorn), and use groupdict() to get name-value pairs upon each match. It requires some post-processing though:
All pairs where the value is None must be discarded
If the self.preserve_case is false the value can be turned to lower case at once
If the group name is WORD, ELLIPSIS or ELSE the values are added to words as is
If the group name is HASHTAG, CASHTAG, USER or URL, the values are first stripped of any leading $, # and @ chars and then added to words as a {<GROUP_NAME>|<VALUE>} item
All other matches are added to words as {<GROUP_NAME>|<VALUE>} item.
Note that \w matches underscores by default, so [\w_] = \w. I optimized the patterns a little bit.
Here is a fixed code snippet:
import re

# Western-style emoticons, in both orientations (":-)" and "(-:").
emoticon_string = r"""
    (?P<EMOTICON>
      [<>]?
      [:;=8]               # eyes
      [-o*']?              # optional nose
      [][()dDpP/:{}#|\\]   # mouth
      |
      [][()dDpP/:}{#|\\]   # mouth
      [-o*']?              # optional nose
      [:;=8]               # eyes
      [<>]?
    )"""

# Every alternative is a named group; the group name becomes the token tag.
# NOTE: the patterns are compiled with re.VERBOSE, where an unescaped "#"
# outside a character class starts a comment; literal "#" is written "\#".
regex_strings = (
    # URL:
    r"""(?P<URL>https?://(?:[-a-zA-Z0-9_$#.&+!*(),]|%[0-9a-fA-F][0-9a-fA-F])+)""",
    # Twitter username:
    r"""(?P<USER>@\w+)""",
    # Hashtags:
    r"""(?P<HASHTAG>\#+\w+[\w'-]*\w+)""",
    # Cashtags:
    r"""(?P<CASHTAG>\$+\w+[\w'-]*\w+)""",
    # Remaining word types:
    r"""
    (?P<NUMBER>[+-]?\d+(?:[,/.:-]\d+[+-]?)?)  # Numbers, including fractions, decimals.
    |
    (?P<WORD>\w+)                             # Words without apostrophes or dashes.
    |
    (?P<ELLIPSIS>\.(?:\s*\.)+)                # Ellipsis dots.
    |
    (?P<ELSE>\S)                              # Everything else that isn't whitespace.
    """,
)

_FLAGS = re.VERBOSE | re.I | re.UNICODE

word_re = re.compile(
    r"""({}|{})""".format(emoticon_string, "|".join(regex_strings)), _FLAGS)
#print(word_re.pattern)
emoticon_re = re.compile(emoticon_string, _FLAGS)

# Groups whose text is emitted untouched (no "{TAG|value}" wrapper).
_PLAIN_GROUPS = ('WORD', 'ELLIPSIS', 'ELSE')
# Groups whose leading sigil (@, #, $) is dropped before wrapping.
_SIGIL_GROUPS = ('HASHTAG', 'CASHTAG', 'USER', 'URL')  # Add more here if needed


######################################################################
class Tokenizer:
    """Tokenize tweets and tag the special tokens.

    Args:
        preserve_case: when False (the default) every token value is
            lower-cased before it is emitted.
    """

    def __init__(self, preserve_case=False):
        self.preserve_case = preserve_case

    def tokenize(self, s):
        """Return the tokens of ``s`` as a list of strings.

        Words, ellipses and leftover characters are returned as-is; every
        other token is returned as ``{GROUP|value}``, with leading ``@``,
        ``#`` and ``$`` characters removed for users, hashtags, cashtags
        and URLs.
        """
        if isinstance(s, bytes):
            s = s.decode('utf-8', 'replace')
        else:
            s = str(s)
        # Tokenize:
        words = []
        for x in word_re.finditer(s):
            # Exactly one named group is set per match; the rest are None.
            for key, val in x.groupdict().items():
                if not val:
                    continue
                if not self.preserve_case:
                    val = val.lower()
                if key in _PLAIN_GROUPS:
                    words.append(val)
                elif key in _SIGIL_GROUPS:
                    words.append("{{{}|{}}}".format(key, re.sub(r'^[@#$]+', '', val)))
                else:
                    words.append("{{{}|{}}}".format(key, val))
        return words


if __name__ == '__main__':
    tok = Tokenizer(preserve_case=False)
    test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com'
    tokenized = tok.tokenize(test)
    print("\n".join(tokenized))
With test = ' RT @trader $AAPL 2012 is oooopen to ‘Talk’ about patents with GOOG definitely not the treatment #samsung got:-) heh http://some.site.here.com', it outputs
rt
{USER|trader}
{CASHTAG|aapl}
{NUMBER|2012}
is
oooopen
to
‘
talk
’
about
patents
with
goog
definitely
not
the
treatment
{HASHTAG|samsung}
got
{EMOTICON|:-)}
heh
{URL|http://some.site.here.com}
See the regex demo online.

Related

Parse string with tag that position might reverse sometimes

I'm trying to parse a string from log communicating with network which will be like
2019 Jun 30 15:40:17.561 NETWORK_MESSAGE
Direction = UE_TO_NETWORK
From: <1106994972>
To: <3626301680>
and here is my code:
import re

log = '2019 Jun 30 15:40:17.561 NETWORK_MESSAGE\r\nDirection = UE_TO_NETWORK\r\nFrom: <1106994972>\r\nTo: <3626301680>\r\n'
# The adjacent raw-string pieces are concatenated into a single pattern.
# re.DOTALL lets each ".*" run across the \r\n line breaks; the dot before
# the milliseconds is escaped so that it only matches a literal ".".
PATTERN = re.compile(
    r'(?P<time>\d{2}:\d{2}:\d{2}\.\d{3}).*'  # Time
    r'Direction = (?P<Direction>\S+).*'       # Direction
    r'From: <(?P<From>\S+)>.*'                # from
    r'To: <(?P<To>\S+)>',                     # to
    re.DOTALL)
results = PATTERN.search(log)
print(results.group('From'))
However, I just found sometimes there will be reversed position between "From" and "To", just like the following.
2019 Jun 30 15:40:16.548 NETWORK_MESSAGE
Direction = NETWORK_TO_UE
To: <3626301680>
From: <1106994972>
Is it possible I can solve this with only one pattern?
Here is a solution that uses (From|To) to match either From or To and then explicitly checks which of the two places matched From:
import re

log1 = '2019 Jun 30 15:40:17.561 NETWORK_MESSAGE\r\nDirection = UE_TO_NETWORK\r\nFrom: <1106994972>\r\nTo: <3626301680>\r\n'
log2 = '2019 Jun 30 15:40:17.561 NETWORK_MESSAGE\r\nDirection = UE_TO_NETWORK\r\nTo: <3626301680>\r\nFrom: <1106994972>\r\n'
# Both tag positions accept either "From" or "To"; which one actually
# matched is recorded in tag1/tag2 and checked after the search.
PATTERN = re.compile(
    r'(?P<time>\d{2}:\d{2}:\d{2}\.\d{3}).*'   # Time
    r'Direction = (?P<Direction>\S+).*'        # Direction
    r'(?P<tag1>From|To): <(?P<val1>\S+)>.*'    # from or to
    r'(?P<tag2>From|To): <(?P<val2>\S+)>',     # from or to
    re.DOTALL)
for log in [log1, log2]:
    results = PATTERN.search(log)
    if results is None:
        # Not a NETWORK_MESSAGE record in the expected shape; skip it
        # instead of failing with AttributeError on None.
        continue
    if results.group('tag1') == 'From':
        print(results.group('val1'))
    elif results.group('tag2') == 'From':
        print(results.group('val2'))
This matches your line but does not make sure there is exactly one From and one To.
I also considered this pattern
# Variant with a repeated group. Because the group is repeated with {2},
# tag1/val1 only keep the values from the LAST repetition.
PATTERN = re.compile(
    r'(?P<time>\d{2}:\d{2}:\d{2}\.\d{3}).*'                     # Time
    r'Direction = (?P<Direction>\S+).*'                          # Direction
    r'(?P<FromTo>(?P<tag1>From|To): <(?P<val1>\S+)>.*){2}',      # from or to
    re.DOTALL)
but this will only capture the last match in From and To (according to the docs "If a group is contained in a part of the pattern that matched multiple times, the last match is returned."). So if the two fields appear in the wrong order then you will not be able to get the value for From.
If things get more complicated you may have more readable code by using more than one pattern.
log1 = "2019 Jun 30 15:40:17.561 NETWORK_MESSAGE\r\nDirection = UE_TO_NETWORK\r\nFrom: <1106994972>\r\nTo: <3626301680>\r\n"
log2 = "2019 Jun 30 15:40:16.548 NETWORK_MESSAGE\r\nDirection = NETWORK_TO_UE\r\nTo: <3626301680>\r\nFrom: <1106994972>\r\n"
# findall returns one tuple per match holding every group in order:
# (time, Direction, first tag, X, second tag, Y).
PATTERN = re.compile(
    r'(?P<time>\d{2}:\d{2}:\d{2}\.\d{3}).*'   # Time
    r'Direction = (?P<Direction>\S+).*'        # Direction
    r'(From|To): <(?P<X>\S+)>.*'
    r'(To|From): <(?P<Y>\S+)>',
    re.DOTALL)
print(PATTERN.findall(log1))
print(PATTERN.findall(log2))

Replace the all content starting from (color to ;) and starting ? to )

this is my code so far:
import re

a = ["abc", " this is in blue color","(Refer: '(color:rgb(61, 142, 185); )Set the TEST VIN value'(color:rgb(0, 0, 0); ) in document: (color:rgb(61, 142, 185); )[UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf?task_id=12639618&artifact_id=48569866] )"]
# Two things are removed from every item:
#   * each "(color:<name>(<args>); )" marker, and
#   * the query part from "?" up to (not including) the final ")".
# Items without either part are returned unchanged.
_cleanup = re.compile(r"\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)")
p = [_cleanup.sub('', item) for item in a]
print(p)
Output required:
["abc", " this is in blue color","(Refer: 'Set the TEST VIN value' in document: [UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf)"]
There are 3 color parts to remove in the string, plus the part at the end from the question mark until right before the )
You could match all the parts using an alternation |
\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)
Regex demo | Python demo
\(color: Match (color:
\w+\([^()]*\); \) Match 1+ word chars followed by matching from ( to ) a space and another )
| Or
\?[^?]+ Match ? and 1+ times all chars except ?
(?=\)$) Assert what is on the right is ) at the end of the string
Example code
import re

regex = r"\(color:\w+\([^()]*\); \)|\?[^?]+(?=\)$)"
# The input still contains the "(color:...; )" markers and the "?..." query
# part, so the substitution below has something to remove.
test_str = " this is in blue color\",\"(Refer: '(color:rgb(61, 142, 185); )Set the TEST VIN value'(color:rgb(0, 0, 0); ) in document: (color:rgb(61, 142, 185); )[UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf?task_id=12639618&artifact_id=48569866] )"
result = re.sub(regex, "", test_str)
print(result)
Output
this is in blue color","(Refer: 'Set the TEST VIN value' in document: [UserGuide_Upgrade_2020_W10_final.pdf|CB:/displayDocument/UserGuide_Upgrade_2020_W10_final.pdf)

Copy value in var from structured data

I have a bulk data in 'bulk_data' var, now need to find and copy it in sub var as per below, How to do it with python
bulk_data = """F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
"""
typeA1 = <start with RM/>"JRSE1420"<until space> in 1st line
typeA2 = <start with RM/>"JRSE935"<until space> in 2nd line
typeA3 = <start with RM/>"JRSE10"<until space> in 3rd line
typeB1 = <start with space after typeA1>"Mod/4758"<until end of the line> in 1st line
typeB2 = <start with space after typeA2>"Mod/23"<until end of the line> in 2nd line
typeB3 = <start with space after typeA3>"Mod/445"<until end of the line> in 3rd line
Overall result would be:
typeA1 = 'JRSE1420'
typeA2 = 'JRSE935'
typeA3 = 'JRSE10'
typeB1 = 'Mod/4758'
typeB2 = 'Mod/23'
typeB3 = 'Mod/445'
And also is there any study manual to deal with such type of data manipulation ?
You can use the re module
import re

bulk_data = '''F0142514RM/JRSE1420 Mod/4758
F0144758RM/JRSE935 Mod/23
F014GS4RM/JRSE10 Mod/445
'''

# Per-line pattern (MULTILINE anchors ^ and $ at every line): captures the
# word characters that follow "RM/".
ptrn1 = re.compile(r'''
^ #matches the start of the string
.* #matches 0 or more of anything
RM\/ #matches "RM" followed by "/"
(\w+) #matches one or more alphanumeric character and the undescore
\b #matches empty string
.* #matches anything
$ #matches the end of string
''', flags=re.VERBOSE | re.MULTILINE)

# Per-line pattern: captures everything from "Mod" to the end of the line,
# provided it is preceded by a whitespace character.
ptrn2 = re.compile(r'''
^ #matches the start of the string
.* #matches 0 or more of anything
\s #matches a space character
(Mod.*) #matches "Mod" follow by 0 or more of anything
$ #matches the end of string
''', flags=re.VERBOSE | re.MULTILINE)

# Each pattern has a single capture group, so group(1) of every match is
# the value wanted for that line; the three lines unpack into three names.
typeA1, typeA2, typeA3 = [hit.group(1) for hit in ptrn1.finditer(bulk_data)]
typeB1, typeB2, typeB3 = [hit.group(1) for hit in ptrn2.finditer(bulk_data)]
Why re? Looks like everything is already properly separated by different characters.
# Regex-free alternative: relies on `bulk_data` (defined elsewhere) holding
# one record per line.
lines = bulk_data.splitlines()
# Splitting a line on the single space yields the "...RM/<A>" part and the
# "Mod/..." part.
typeA1_, typeB1 = lines[0].split(' ')
# The typeA value is whatever follows the first "/" of the first part.
typeA1 = typeA1_.split('/')[1]
# Placeholder: repeat the two steps above for the remaining lines.
...
def format_types(lines):
    """Return the "TypeA<n> = ..." lines followed by the "TypeB<n> = ..." lines.

    Args:
        lines: iterable of text lines, each either blank or made of two
            whitespace-separated fields ("<prefix>/<A> <B>").

    Returns:
        A list of strings: all TypeA entries in input order, then all
        TypeB entries in input order. Blank lines are skipped and do not
        consume a number.

    Raises:
        ValueError: if a non-blank line does not have exactly two fields
            or its first field contains no "/".
    """
    type_a = []
    type_b = []
    count = 0
    for line in lines:
        fields = line.split()
        if not fields:
            continue
        count += 1
        a, b = fields
        a = a[a.index('/') + 1:]
        type_a.append("TypeA{} = {} ".format(count, a))
        type_b.append("TypeB{} = {} ".format(count, b))
    # Keeping two lists (instead of sorting the formatted strings) keeps
    # the numeric order: a string sort would put TypeA10 before TypeA2.
    return type_a + type_b


if __name__ == '__main__':
    with open('data') as f:
        for el in format_types(f):
            print(el)
TypeA1 = JRSE1420
TypeA2 = JRSE935
TypeA3 = JRSE10
TypeB1 = Mod/4758
TypeB2 = Mod/23
TypeB3 = Mod/445

Python - how to parse this with regex correctly? its parsing all the E.164 but except the local format

It works for 0032, 32 and +32, but not for 0487365060 (which is a valid form)
# Starts with "00": the prefix is consumed and group(1) is "32487365060".
to_user = "0032487365060"
# Pattern: a mandatory "+" or "00" prefix, then digits only up to the end.
match = re.search(r'^(?:\+|00)(\d+)$', to_user)
# Starts with neither "+" nor "00": the mandatory prefix is missing, so
# this search returns None.
to_user = "32487365060"
match = re.search(r'^(?:\+|00)(\d+)$', to_user)
# Starts with "+": the prefix is consumed and group(1) is "32487365060".
to_user = "+32487365060"
match = re.search(r'^(?:\+|00)(\d+)$', to_user)
Not working:
# Local format: begins with a single "0", which is neither "+" nor "00",
# so the mandatory prefix group fails and the search returns None.
to_user = "0487365060"
match = re.search(r'^(?:\+|00)(\d+)$', to_user)
Your last example doesn't work because it does not start with either + or 00. If that is optional you need to mark it as such:
r'^(?:\+|00)?(\d+)$'
Note that neither does your second example match; it doesn't start with + or 00 either.
Demo:
>>> import re
>>> samples = ('0032487365060', '32487365060', '+32487365060', '0487365060')
>>> pattern = re.compile(r'^(?:\+|00)?(\d+)$')
>>> for sample in samples:
... match = pattern.search(sample)
... if match is not None:
... print 'matched:', match.group(1)
... else:
... print 'Sample {} did not match'.format(sample)
...
matched: 32487365060
matched: 32487365060
matched: 32487365060
matched: 0487365060
Taking account of the question AND the comment, and in absence of more info (particularly on the length of the sequence of digits that must follow the 32 part, and if it is always 32 or may be another sequence), what I finally understand you want can be obtained with:
import re

for to_user in ("0032487365060",
                "32487365060",
                "+32487365060",
                "0487365060"):
    # Any of the prefixes +32, 0032, 32 or 0 followed by exactly nine
    # digits is rewritten to "32" + those nine digits; anything else is
    # left unchanged.
    m = re.sub(r'^(?:\+32|0032|32|0)(\d{9})$', r'32\1', to_user)
    print(m)
Something like this @eyquem method, converting all the international codes from the + and 00 forms to the form without + or 00; only for Belgium should the default be 32 + the number:
import re


def to_international(to_user):
    """Return ``to_user`` as bare digits with its international prefix.

    A number made of "+", "00", "32" or "0" followed by exactly nine digits
    is treated as Belgian and rewritten to "32" + those digits. After that
    every "+" is dropped and a leading "00" is removed.
    """
    m = re.sub(r'^(?:\+|00|32|0)(\d{9})$', r'32\1', to_user)
    m = m.replace("+", "")
    m = re.sub(r'^(?:\+|00)(\d+)$', r'\1', m)
    return m


for to_user in (# Belgium
                "0032487365060",
                "32487365060",
                "+32487365060",
                "0487365060",
                # USA
                "0012127773456",
                "12127773456",
                "+12127773456",
                # UK
                "004412345678",
                "4412345678",
                "+4412345678"):
    m = to_international(to_user)
    print(m)
Output:
32487365060
32487365060
32487365060
32487365060
12127773456
12127773456
12127773456
4412345678
4412345678
4412345678
Execution Successful!
Why not to use phonenumbers lib
>>> phonenumbers.parse("0487365060", "BE")
PhoneNumber(country_code=32, national_number=487365060, extension=None, italian_leading_zero=None, number_of_leading_zeros=None, country_code_source=0, preferred_domestic_carrier_code=None)
and the other three are OK too
>>> phonenumbers.parse("0032487365060", "BE")
PhoneNumber(country_code=32, national_number=487365060, extension=None, italian_leading_zero=None, number_of_leading_zeros=None, country_code_source=0, preferred_domestic_carrier_code=None)
>>> phonenumbers.parse("+320487365060", "BE")
PhoneNumber(country_code=32, national_number=487365060, extension=None, italian_leading_zero=None, number_of_leading_zeros=None, country_code_source=0, preferred_domestic_carrier_code=None)
>>> phonenumbers.parse("320487365060", "BE")
PhoneNumber(country_code=32, national_number=487365060, extension=None, italian_leading_zero=None, number_of_leading_zeros=None, country_code_source=0, preferred_domestic_carrier_code=None)

Python RE ( In a word to check first letter is case sensitive and rest all case insensitive)

In the below case i want to match string "Singapore" where "S" should always be capital and rest of the words may be in lower or in uppercase. but in the below string "s" is in lower case and it gets matched in search condition. can any body let me know how to implement this?
import re

st = "Information in sinGapore "
# "S" is matched case-sensitively; the scoped inline flag (?i:...) makes
# only "ingapore" case-insensitive. A bare "(?i)" in the middle of a
# pattern is a global flag: it applied to the whole expression and is
# rejected outright by Python 3.11+.
leading_cap = re.compile(r"S(?i:ingapore)")
if leading_cap.search(st):
    print("matched")
Singapore => matched
sIngapore => notmatched
SinGapore => matched
SINGAPORE => matched
As commented, the Ugly way would be:
>>> re.search("S[iI][Nn][Gg][Aa][Pp][Oo][Rr][Ee]" , "SingaPore")
<_sre.SRE_Match object at 0x10cea84a8>
>>> re.search("S[iI][Nn][Gg][Aa][Pp][Oo][Rr][Ee]" , "Information in sinGapore")
The more elegant way would be matching Singapore case-insensitive, and then checking that the first letter is S:
reg=re.compile("singapore", re.I)
>>> s="Information in sinGapore"
>>> reg.search(s) and reg.search(s).group()[0]=='S'
False
>>> s="Information in SinGapore"
>>> reg.search(s) and reg.search(s).group()[0]=='S'
True
Update
Following your comment - you could use:
reg.search(s).group().startswith("S")
Instead of:
reg.search(s).group()[0]==("S")
If it seems more readable.
Since you want to set a GV code according to the caught phrase (a unique name or several blank-separated names, I know that), there must be a step in which the code is chosen in a dictionary according to the caught phrase.
So it's easy to take advantage of this step to perform the test on the first letter (which must be uppercase) or on the first name in the phrase, which no regex is capable of.
I chose certain conditions to constitute the test. For example, a dot in a first name is not mandatory, but uppercase letters are. These conditions can easily be changed as needed.
EDIT 1
import re
# Build one case-insensitive regex that matches any label of `cntry`,
# allowing an optional literal dot after every character.
def regexize(cntry):
# doot('ab') returns 'a\.?b\.?': each character followed by an optional dot.
def doot(x):
return '\.?'.join(ch for ch in x) + '\.?'
to_join = []
for c in cntry:
cspl = c.split(' ',1)
if len(cspl)==1: # 'Singapore','Austria',...
# Single word: only the first character is placed in its own group.
to_join.append('(%s)%s'
% (doot(c[0]), doot(c[1:])))
else: # 'Den LMM','LMM Den',....
# Label with a space: the first word is grouped, followed by one or more
# spaces and the lower-cased remainder of the label.
to_join.append('(%s) +%s'
% (doot(cspl[0]),
doot(cspl[1].strip(' ').lower())))
# str.join over the two characters of '()' wraps the alternation in one
# outer pair of parentheses, so group(1) of a match is the whole match.
pattern = '|'.join(to_join).join('()')
return re.compile(pattern,re.I)
# Generator: for every match of the labels in X, yield a formatted line
# holding the matched text and its code, or '- bad match -'.
# NOTE(review): the default of `r` is overwritten on the first line, and the
# lookups read the module-level `countries` dict rather than CNTR.
def code(X,CNTR,r = regexize):
r = regexize(CNTR)
for ma in r.finditer(X):
# First space-separated word of the matched text.
beg = ma.group(1).split(' ')[0]
if beg==ma.group(1):
# No space in the match: the key is the first character plus the rest with
# dots removed and lower-cased; the lookup only happens when the first
# character equals its own upper-cased form.
GV = countries[beg[0]+beg[1:].replace('.','').lower()] \
if beg[0].upper()==beg[0] else '- bad match -'
else:
try:
# Python 2 idioms (dict.iterkeys, generator .next()): take the first key
# whose first word equals `beg` with dots removed, compared case-sensitively.
k = (ki for ki in countries.iterkeys()
if beg.replace('.','')==ki.split(' ')[0]).next()
GV = countries[k]
except StopIteration:
GV = '- bad match -'
yield ' {!s:15} {!s:^13}'.format(ma.group(1), GV)
# Label -> code table used by code().
countries = {'Singapore':'SG','Austria':'AU',
'Swiss':'CH','Chile':'CL',
'Den LMM':'DN','LMM Den':'LM'}
# Sample text mixing differently cased and dotted spellings of the labels.
s = (' Singapore SIngapore SiNgapore SinGapore'
' SI.Ngapore SIngaPore SinGaporE SinGAPore'
' SINGaporE SiNg.aPoR singapore sIngapore'
' siNgapore sinGapore sINgap.ore sIngaPore'
' sinGaporE sinGAPore sINGaporE siNgaPoRe'
' Austria Aus.trIA aUSTria AUSTRiA'
' Den L.M.M Den Lm.M DEn Lm.M.'
' DEN L.MM De.n L.M.M. Den LmM'
' L.MM DEn LMM DeN LM.m Den')
# Python 2 print statements.
print '\n'
print '\n'.join(res for res in code(s,countries))
EDIT 2
I improved the code. It's shorter and more readable.
The instruction assert(...) verifies that the keys of the dictionary are well formed for the purpose.
import re
# doot('ab') returns 'a\.?b\.?': each character followed by an optional dot.
def doot(x):
return '\.?'.join(ch for ch in x) + '\.?'
# Build (compiled case-insensitive regex, lookup dict) from `labels`.
# `ri` splits a key into three groups: g1 is a single character when no
# space occurs later in the key, otherwise the leading run of non-space
# characters; g2 is an optional single space; g3 is the rest of the key.
# wg2 / wnog2 are the regex templates for keys with / without that space.
def regexize(labels,doot=doot,
wg2 = '(%s) *( %s)',wnog2 = '(%s)(%s)',
ri = re.compile('(.(?!.*? )|[^ ]+)( ?) *(.+\Z)')):
to_join = []
modlabs = {}
# Python 2 idiom: dict.iterkeys().
for K in labels.iterkeys():
g1,g2,g3 = ri.match(K).groups()
to_join.append((wg2 if g2 else wnog2)
% (doot(g1), doot(g3.lower())))
# Lookup key: g1 unchanged, then g2 and the lower-cased g3.
modlabs[g1+g2+g3.lower()] = labels[K]
return (re.compile('|'.join(to_join), re.I), modlabs)
# Generator: for every match in X, yield a formatted line holding the
# matched text and its code from `labels`, or '- bad match -'.
def code(X,labels,regexize = regexize):
reglab,modlabs = regexize(labels)
for ma in reglab.finditer(X):
# Keep only the non-empty groups of the match and unpack them as (a, b).
a,b = tuple(x for x in ma.groups() if x)
# a keeps its case, b is lower-cased and dots are removed, so only the
# case of the a part can make the lookup fail.
k = (a + b.lower()).replace('.','')
GV = modlabs[k] if k in modlabs else '- bad match -'
yield ' {!s:15} {!s:^13}'.format(a+b, GV)
# Label -> code table passed to code().
countries = {'Singapore':'SG','Austria':'AU',
'Swiss':'CH','Chile':'CL',
'Den LMM':'DN','LMM Den':'LM'}
# Sanity check on the keys: none contains a dot, and each key either has
# exactly one space or starts with a character equal to its upper-cased form.
assert(all('.' not in k and
(k.count(' ')==1 or k[0].upper()==k[0])
for k in countries))
# Sample text mixing differently cased and dotted spellings of the labels.
s = (' Singapore SIngapore SiNgapore SinGapore'
' SI.Ngapore SIngaPore SinGaporE SinGAPore'
' SINGaporE SiNg.aPoR singapore sIngapore'
' siNgapore sinGapore sINgap.ore sIngaPore'
' sinGaporE sinGAPore sINGaporE siNgaPoRe'
' Austria Aus.trIA aUSTria AUSTRiA'
' Den L.M.M Den Lm.M DEn Lm.M.'
' DEN L.MM De.n L.M.M. Den LmM'
' L.MM DEn LMM DeN LM.m Den')
# Python 2 print statement.
print '\n'.join(res for res in code(s,countries))
You could write a simple lambda to generate the ugly-but-all-re-solution:
>>> leading_cap_re = lambda s: s[0].upper() + ''.join('[%s%s]' %
(c.upper(),c.lower())
for c in s[1:])
>>> leading_cap_re("Singapore")
'S[Ii][Nn][Gg][Aa][Pp][Oo][Rr][Ee]'
For multi-word cities, define a string-splitting version:
>>> leading_caps_re = lambda s : r'\s+'.join(map(leading_cap_re,s.split()))
>>> print leading_caps_re('Kuala Lumpur')
K[Uu][Aa][Ll][Aa]\s+L[Uu][Mm][Pp][Uu][Rr]
Then your code could just be:
if re.search(leading_caps_re("Singapore") , st):
...etc...
and the ugliness of the RE would be purely internal.
interestingly
/((S)((?i)ingapore))/
Does the right thing in perl but doesn't seem to work as needed in python. To be fair the python docs spell it out clearly, (?i) alters the whole regexp
This is the BEST answer:
(?-i:S)(?i)ingapore
ClickHere for proof:

Categories

Resources