I am trying to add spaces between characters only for acronyms (all consecutive all-caps words) in Python.
INPUT:
"The PNUD, UN, UCALP and USA and U N."
DESIRED OUTPUT:
"The P N U D, U N, U C A L P and U S A and U N."
I have this solution so far, but I am looking for something more efficient/elegant:
import re
data = "The PNUD, UN, UCALP and USA and U N."
result = re.sub(r'(?=(?!^)[^[a-z]|\s+|\W]*)', ' ', data)
result = re.sub(r'\s+(\W)', '\g<1>', result)
print(result)
I think the following regex is a lot more trivial solution for this problem
re.sub('([A-Z])(?=[A-Z])', '\\1 ', s)
I'm just using a positive lookahead and a backreference.
Another solution re.sub with lambda function:
import re
data = "The PNUD, UN, UCALP and USA and U N."
result = re.sub(r"\b[A-Z]{2,}\b", lambda g: " ".join(g.group(0)), data)
print(result)
Prints:
The P N U D, U N, U C A L P and U S A and U N.
EDIT: Small benchmark
import re
import regex
from timeit import timeit
pat1 = re.compile(r"\b[A-Z]{2,}\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = re.compile(r"[A-Z](?=[A-Z])") # the same without capturing group
# using regex module instead of re
pat4 = regex.compile(r"\b[[:upper:]]{2,}\b")
data = "The PNUD, UN, UCALP and USA and U N."
def fn1():
return pat1.sub(lambda g: " ".join(g.group(0)), data)
def fn2():
return pat2.sub(r"\g<1> ", data)
def fn3():
return pat3.sub(r"\g<0> ", data)
def fn4():
return pat4.sub(lambda g: " ".join(g.group(0)), data)
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
t4 = timeit(fn4, number=10_000)
print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.03805400081910193
0.10581987909972668
0.10386284696869552
0.044628452975302935
You can use a single call to re.sub and match a single uppercase char and assert another one to the right.
In the replacement use the match followed by a space using \g<0>
[A-Z](?=[A-Z])
Regex demo
Example
result = re.sub('[A-Z](?=[A-Z])', r'\g<0> ', data)
Using Lambda with regex instead of re is slightly slower but allows you to match also Unicode chars from non-English languages, which is caeteris paribus a more generalizable answer.
import re
import regex
from timeit import timeit
pat1 = re.compile(r"\b[A-Z]+\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = regex.compile(r"\b[[:upper:]]+\b")
data = "The PNUD, UN, UCALP and USA and U N."
def fn1():
return pat1.sub(lambda g: " ".join(g.group(0)), data)
def fn2():
return pat2.sub(r"\g<1> ", data)
def fn3():
return pat3.sub(lambda g: " ".join(g.group(0)), data)
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
print(fn1())
print(fn2())
print(fn3())
print(t1)
print(t2)
print(t3)
DEMO
Related
I want to write my each string's letter frequencies. My inputs and expected outputs are like this.
"aaaa" -> "a4"
"abb" -> "a1b2"
"abbb cc a" -> "a1b3 c2 a1"
"bbbaaacddddee" -> "b3a3c1d4e2"
"a b" -> "a1 b1"
I found this solution but it gives the frequencies in random order. How can I do this?
Does this satisfy your needs?
from itertools import groupby
s = "bbbaaac ddddee aa"
groups = groupby(s)
result = [(label, sum(1 for _ in group)) for label, group in groups]
res1 = "".join("{}{}".format(label, count) for label, count in result)
# 'b3a3c1 1d4e2 1a2'
# spaces just as spaces, do not include their count
import re
re.sub(' [0-9]+', ' ', res1)
'b3a3c1 d4e2 a2'
For me, it is a little bit trickier that it looks at first. For example, it does look that "bbbaaacddddee" -> "b3a3c1d4e2" needs the count results to be outputted in the order of appearance in the passed string:
import re
def unique_elements(t):
l = []
for w in t:
if w not in l:
l.append(w)
return l
def splitter(s):
res = []
tokens = re.split("[ ]+", s)
for token in tokens:
s1 = unique_elements(token) # or s1 = sorted(set(token))
this_count = "".join([k + str(v) for k, v in list(zip(s1, [token.count(x) for x in s1]))])
res.append(this_count)
return " ".join(res)
print(splitter("aaaa"))
print(splitter("abb"))
print(splitter("abbb cc a"))
print(splitter("bbbaaacddddee"))
print(splitter("a b"))
OUTPUT
a4
a1b2
a1b3 c2 a1
b3a3c1d4e2
a1 b1
If the order of appearance is not a real deal, you can disregard the unique_elements function and simply substitute something like s1 = sorted(set(token)) within splitter, as indicated in the comment.
here is you answer
test_str = "here is your answer"
res = {}
list=[]
list=test_str.split()
# print(list)
for a in list:
res={}
for keys in a:
res[keys] = res.get(keys, 0) + 1
for key,value in res.items():
print(f"{key}{value}",end="")
print(end=" ")
There is no need to iterate every character in every word.
This is an alternate solution. (If you don't want to use itertools, that looked pretty tidy.)
def word_stats(data: str=""):
all = []
for word in data.split(" "):
res = []
while len(word)>0:
res.append(word[:1] + str(word.count(word[:1])))
word = word.replace(word[:1],"")
res.sort()
all.append("".join(res))
return " ".join(all)
print(word_stats("asjssjbjbbhsiaiic ifiaficjxzjooro qoprlllkskrmsnm mmvvllvlxjxj jfnnfcncnnccnncsllsdfi"))
print(word_stats("abbb cc a"))
print(word_stats("bbbaaacddddee"))
This would output:
c5d1f3i1j1l2n7s2
a1b3 c2 a1
a3b3c1d4e2
I am trying to remove consecutively same characters from a string. For example:
abb --> ab
aaab --> ab
ababa --> ababa (since no two consecutive characters are same)
My code:
T=int(input())
l=[0]
S1=""
for i in range(T):
S=input()
for j in range(len(S)-1):
if S[j]!=S[j+1]:
if S[j] != l[len(l)-1]:
l=[]
l.append(S[j])
l.append(S[j+1])
print(l)
for k in l:
S1+=k
print(S1)
S1=""
l=[0]
The code doesn't work for the third case (ababa). How do I fix this?
One concise approach would use itertools.groupby:
from itertools import groupby
def clean(s):
return ''.join(k for k, _ in groupby(s))
>>> clean("abb")
'ab'
>>> clean("aaab")
'ab'
>>> clean("ababa")
'ababa'
A rather simplified quadratic loop-based approach (linear in comments):
def clean(s):
res = "" # res = []
for c in s:
if not res or res[-1] != c:
res += c # res.append(c)
return res # return ''.join(res)
A verbose way of doing it, may not be most efficient if the strings are large:
value = 'aaaaaabbbbaaaaaacdeeeeefff'
def no_dups(value):
r = ''
for i in value:
if not r or r[-1] != i:
r += i
return r
print(no_dups(value))
# abacdef
Using regex, we could do re.sub(r'([a-z])\1+', r'\1', string_data)
import re
test_data = 'abb aaab ababa'.split()
for data in test_data:
print(f"{data} -->", re.sub(r'([a-z])\1+', r'\1', data))
Came out with this code, works properly:
T=int(input()) #No of testcases; for testing multiple strings
S1=""
for i in range(T):
S=input()
for j in range(0,len(S),2):
if j!=len(S)-1:
if S[j]!=S[j+1]:
S1+=S[j]
S1+=S[j+1]
else:
if S1[len(S1)-1]!=S[j]:
S1+=S[j]
print(S1)
S1=""
You can use regex as:
for char in set(string):
string = re.sub(f'{char}+', char, string)
string
results in
abb --> ab
aaab --> ab
ababa --> ababa
I have a list with company names, some of them has abbreviations. ex:
compNames = ['Costa Limited', 'D B M LTD']
I need to convert compNames of text to a matrix of token counts using the following. But this does not output columns for B D M in D B M LTD
count_vect = CountVectorizer(analyzer='word')
count_vect.fit_transform(compNames).toarray()
What is the best way to concatenate the single characters in a text?
ex: 'D B M LTD' to 'DBM LTD'
import re
string = 'D B M LTD'
print re.sub("([^ ]) ", r"\1", re.sub(" ([^ ]{2,})", r" \1", string))
Awkward, but it should work. It introduces an additional space in front of LTD and then replaces "D " with "D", "B " with "B" and so on.
Here is a short function that breaks a string on white space characters to a list, iterates the list, builds a temporary string if the element is of length 1, appends the temp string to a new list when an element with length greater than one is encounters.
import re
a = 'D B M LTD'
def single_concat(s):
out = []
tmp = ''
for x in re.split(r'\s+', s):
if len(x) == 1:
tmp += x
else:
if tmp:
out.append(tmp)
out.append(x)
tmp = ''
return ' '.join(out)
single_concat(a)
# returns:
'DBM LTD'
import re
s = "D B M LTD"
first_part = ''
for chunk in re.compile("([A-Z]{1})\s").split(s):
if len(chunk) == 1:
first_part += chunk
elif len(chunk) > 1:
last_part = chunk
print(first_part + " " + last_part)
Prints DBM LTD.
import re
string = 'D B M LTD'
print re.sub(r"\+", r"", re.sub(r"\+(\w\B)", r" \1", re.sub(r"(\b\w) ", r"\1+", string)))
I'm using the + character as temporary, assuming there are no + characters in the string. If there are, use some other that doesn't occur.
Look, no re:
def mingle(s):
""" SO: 49692941 """
l = s.split()
r = []
t = []
for e in l:
if len(e) == 1:
t.append(e)
else:
j = "".join(t)
r.append( j )
r.append( e )
t = []
return " ".join(r)
print( mingle('D B M LTD') )
prints
DBM LTD
I'm trying to create a wordlist generator that creates a file with all possible combinations of uppercase letters and numbers but in this very specific format:
AAA00AA (uppercase, uppercase, uppercase, digit, digit, uppercase, uppercase)
So the first string would be AAA00AA and the last one ZZZ99ZZ.
There are over 1 billion possible combinations and I'm using the itertools.product function.
However, I'm stuck on how to loop through the results of each iteration in order to get each group (AAA 00 AA) to combine among themselves. Here's what I got so far, but each loop runs just once. For example when the first group AAA 00 AA reaches ZZZ 00 AA I then need to get the second group through 1 iteration to AAA 01 AA and so on until the third group.
I'm sure that my loop nesting logic is wrong or perhaps I need to use some other approach, but I have no idea what to do. Can anyone help, please? Here's my code so far.
import string
import itertools
import datetime
letters = string.ascii_uppercase
digits = string.digits
first_group = itertools.product(letters, repeat=3)
second_group = itertools.product(digits, repeat=2)
third_group = itertools.product(letters, repeat=2)
FILE = open("mylist.txt","w")
start = datetime.datetime.now()
for i in first_group:
first = ''.join(i)
FILE.write(first + '\n')
for a in second_group:
second = first +''.join(a)
FILE.write(second + '\n')
for x in third_group:
string = second +''.join(x)
FILE.write(string + '\n')
string = ''
FILE.close()
print 'DONE! - Finished in %s' % (datetime.datetime.now() - start)
You can use itertools.product to join the sub products again.
f, s, t = [
itertools.product(d, repeat=r)
for d, r in zip([letters, digits, letters], [3, 2, 2])
]
with open("mylist.txt", "w") as f:
for prod in itertools.product(f, s, t):
string = ''.join([''.join(k) for k in prod])
f.write(string + '\n')
# AAA00AA
# AAA00AB
# AAA00AC
# AAA00BA
# AAA00BB
# .......
import string
import itertools
import datetime
letters = string.ascii_uppercase
digits = string.digits
first_group = itertools.product(letters, repeat=3)
second_group = itertools.product(digits, repeat=2)
third_group = itertools.product(letters, repeat=2)
start = datetime.datetime.now()
with open("mylist.txt","w") as FILE:
for i in first_group:
first = ''.join(i)
for j in second_group:
second = ''.join(j)
for k in third_group:
FILE.write(first + second + ''.join(k) + '\n')
print 'DONE! - Finished in %s' % (datetime.datetime.now() - start)
Generates:
AAA00AA
AAA00AB
AAA00AC
AAA00AD
AAA00AE
AAA00AF
...
Everything else you can leave as they are. the itertools.product solution by #Coldspeed is however more elegant and probably faster too. I just wanted to correct your code.
Use a list comprehension:
res = ["".join(itertools.chain(a,b,c)) for c in third_group for b in second_group for a in first_group]
res
['AAA00AA', 'AAB00AA', 'AAC00AA', 'AAD00AA', 'AAE00AA', 'AAF00AA', 'AAG00AA', 'AAH00AA', 'AAI00AA', 'AAJ00AA', 'AAK00AA', 'AAL00AA', 'AAM00AA', 'AAN00AA', 'AAO00AA', 'AAP00AA', 'AAQ00AA', 'AAR00AA', 'AAS00AA', 'AAT00AA', 'AAU00AA', 'AAV00AA', 'AAW00AA', 'AAX00AA', 'AAY00AA',...]
You can even make it a generator object:
for e in ("".join(itertools.chain(a,b,c)) for c in third_group for b in second_group for a in first_group):
print e
I have the follwing string and I split it:
>>> st = '%2g%k%3p'
>>> l = filter(None, st.split('%'))
>>> print l
['2g', 'k', '3p']
Now I want to print the g letter two times, the k letter one time and the p letter three times:
ggkppp
How is it possible?
You could use generator with isdigit() to check wheter your first symbol is digit or not and then return following string with appropriate count. Then you could use join to get your output:
''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Demonstration:
In [70]: [i[1:]*int(i[0]) if i[0].isdigit() else i for i in l ]
Out[70]: ['gg', 'k', 'ppp']
In [71]: ''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Out[71]: 'ggkppp'
EDIT
Using re module when first number is with several digits:
''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Example:
In [144]: l = ['12g', '2kd', 'h', '3p']
In [145]: ''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Out[145]: 'ggggggggggggkdkdhppp'
EDIT2
For your input like:
st = '%2g_%3k%3p'
You could replace _ with empty string and then add _ to the end if the work from list endswith the _ symbol:
st = '%2g_%3k%3p'
l = list(filter(None, st.split('%')))
''.join((re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1))).replace("_", "") + '_' * i.endswith('_') if re.search('(\d+)(\w+)', i) else i for i in l)
Output:
'gg_kkkppp'
EDIT3
Solution without re module but with usual loops working for 2 digits. You could define functions:
def add_str(ind, st):
if not st.endswith('_'):
return st[ind:] * int(st[:ind])
else:
return st[ind:-1] * int(st[:ind]) + '_'
def collect(l):
final_str = ''
for i in l:
if i[0].isdigit():
if i[1].isdigit():
final_str += add_str(2, i)
else:
final_str += add_str(1, i)
else:
final_str += i
return final_str
And then use them as:
l = ['12g_', '3k', '3p']
print(collect(l))
gggggggggggg_kkkppp
One-liner Regex way:
>>> import re
>>> st = '%2g%k%3p'
>>> re.sub(r'%|(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
%|(\d*)(\w+) regex matches all % and captures zero or moredigit present before any word character into one group and the following word characters into another group. On replacement all the matched chars should be replaced with the value given in the replacement part. So this should loose % character.
or
>>> re.sub(r'%(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
Assumes you are always printing single letter, but preceding number may be longer than single digit in base 10.
seq = ['2g', 'k', '3p']
result = ''.join(int(s[:-1] or 1) * s[-1] for s in seq)
assert result == "ggkppp"
LATE FOR THE SHOW BUT READY TO GO
Another way, is to define your function which converts nC into CCCC...C (ntimes), then pass it to a map to apply it on every element of the list l coming from the split over %, the finally join them all, as follows:
>>> def f(s):
x = 0
if s:
if len(s) == 1:
out = s
else:
for i in s:
if i.isdigit():
x = x*10 + int(i)
out = x*s[-1]
else:
out = ''
return out
>>> st
'%4g%10k%p'
>>> ''.join(map(f, st.split('%')))
'ggggkkkkkkkkkkp'
>>> st = '%2g%k%3p'
>>> ''.join(map(f, st.split('%')))
'ggkppp'
Or if you want to put all of these into one single function definition:
>>> def f(s):
out = ''
if s:
l = filter(None, s.split('%'))
for item in l:
x = 0
if len(item) == 1:
repl = item
else:
for c in item:
if c.isdigit():
x = x*10 + int(c)
repl = x*item[-1]
out += repl
return out
>>> st
'%2g%k%3p'
>>> f(st)
'ggkppp'
>>>
>>> st = '%4g%10k%p'
>>>
>>> f(st)
'ggggkkkkkkkkkkp'
>>> st = '%4g%101k%2p'
>>> f(st)
'ggggkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkpp'
>>> len(f(st))
107
EDIT :
In case of the presence of _ where the OP does not want this character to be repeated, then the best way in my opinion is to go with re.sub, it will make things easier, this way:
>>> def f(s):
pat = re.compile(r'%(\d*)([a-zA-Z]+)')
out = pat.sub(lambda m:int(m.group(1))*m.group(2) if m.group(1) else m.group(2), s)
return out
>>> st = '%4g_%12k%p__%m'
>>> f(st)
'gggg_kkkkkkkkkkkkp__m'
Loop the list, check first entry for number, and then append the second digit onwards:
string=''
l = ['2g', 'k', '3p']
for entry in l:
if len(entry) ==1:
string += (entry)
else:
number = int(entry[0])
for i in range(number):
string += (entry[1:])