I want to write out the letter frequencies of each word in a string. My inputs and expected outputs look like this.
"aaaa" -> "a4"
"abb" -> "a1b2"
"abbb cc a" -> "a1b3 c2 a1"
"bbbaaacddddee" -> "b3a3c1d4e2"
"a b" -> "a1 b1"
I found this solution but it gives the frequencies in random order. How can I do this?
Does this satisfy your needs?
import re
from itertools import groupby


def _encode_runs(word):
    """Return *word* with every run of equal characters written as char + run length."""
    return "".join(f"{label}{sum(1 for _ in run)}" for label, run in groupby(word))


s = "bbbaaac ddddee aa"
# Only the whitespace-free chunks are encoded; the spaces between them are
# copied through untouched. Counting the spaces as well and stripping
# " <digits>" afterwards would also delete the counts of a word that starts
# with a digit and would collapse runs of several spaces into one.
res1 = re.sub(r"\S+", lambda m: _encode_runs(m.group()), s)
# 'b3a3c1 d4e2 a2'
For me, it is a little trickier than it looks at first. For example, "bbbaaacddddee" -> "b3a3c1d4e2" suggests that the counts need to be output in the order in which the characters first appear in the passed string:
import re
def unique_elements(t):
    """Return the distinct items of *t* as a list, in order of first appearance."""
    ordered = []
    for item in t:
        if item in ordered:
            continue  # already recorded at its first occurrence
        ordered.append(item)
    return ordered
def splitter(s):
    """Encode every space-separated token of *s* as character + occurrence count.

    Tokens are obtained by splitting *s* on runs of spaces. Inside a token the
    characters are listed in the order returned by ``unique_elements``, each
    followed by how often it occurs in that token. The encoded tokens are
    joined with single spaces.
    """
    encoded = []
    for token in re.split("[ ]+", s):
        letters = unique_elements(token)  # or letters = sorted(set(token))
        encoded.append("".join(ch + str(token.count(ch)) for ch in letters))
    return " ".join(encoded)


# Run the sample inputs from the question.
for sample in ("aaaa", "abb", "abbb cc a", "bbbaaacddddee", "a b"):
    print(splitter(sample))
OUTPUT
a4
a1b2
a1b3 c2 a1
b3a3c1d4e2
a1 b1
If the order of appearance does not matter, you can drop the unique_elements function and simply use something like s1 = sorted(set(token)) within splitter, as indicated in the comment.
Here is your answer:
from collections import Counter

test_str = "here is your answer"
# Counter keeps its keys in first-insertion order, so the letters of each word
# are reported in the order in which they first appear in that word.
encoded_words = [
    "".join(f"{letter}{count}" for letter, count in Counter(word).items())
    for word in test_str.split()
]
# One encoded chunk per word, separated by single spaces.
print(" ".join(encoded_words))
There is no need to iterate every character in every word.
This is an alternate solution. (If you don't want to use itertools, that looked pretty tidy.)
def word_stats(data: str = ""):
    """Return the per-word character counts of *data*.

    *data* is split on single spaces. Each word becomes the concatenation of
    ``<char><count>`` pairs sorted by character, and the encoded words are
    joined with single spaces again (so an empty word stays an empty chunk).
    """
    encoded_words = []
    for word in data.split(" "):
        # Each character appears once in set(word), so sorting the characters
        # gives the same order as sorting the "<char><count>" strings.
        encoded_words.append(
            "".join(f"{ch}{word.count(ch)}" for ch in sorted(set(word)))
        )
    return " ".join(encoded_words)


print(word_stats("asjssjbjbbhsiaiic ifiaficjxzjooro qoprlllkskrmsnm mmvvllvlxjxj jfnnfcncnnccnncsllsdfi"))
print(word_stats("abbb cc a"))
print(word_stats("bbbaaacddddee"))
This would output:
a2b3c1h1i3j3s4 a1c1f2i3j2o3r1x1z1 k2l3m2n1o1p1q1r2s2 j2l3m2v3x2 c5d1f3i1j1l2n7s2
a1b3 c2 a1
a3b3c1d4e2
Related
I am trying to add spaces between characters only for acronyms (all consecutive all-caps words) in Python.
INPUT:
"The PNUD, UN, UCALP and USA and U N."
DESIRED OUTPUT:
"The P N U D, U N, U C A L P and U S A and U N."
I have this solution so far, but I am looking for something more efficient/elegant:
import re
data = "The PNUD, UN, UCALP and USA and U N."
# Pass 1: put a space in front of every character that is not a lowercase
# ASCII letter; at the very start of the string only whitespace / non-word
# characters qualify. "[" and "]" are escaped and the pattern is a raw string
# so the regex compiles without nested-set or invalid-escape warnings.
result = re.sub(r'(?=(?!^)[^\[a-z]|\s+|\W\]*)', ' ', data)
# Pass 2: drop the whitespace that ended up in front of a non-word character
# (this also folds the tripled spaces between words back into one).
result = re.sub(r'\s+(\W)', r'\g<1>', result)
print(result)
I think the following regex is a much simpler solution to this problem:
re.sub('([A-Z])(?=[A-Z])', '\\1 ', s)
I'm just using a positive lookahead and a backreference.
Another solution re.sub with lambda function:
import re
data = "The PNUD, UN, UCALP and USA and U N."


def _space_out(match):
    """Return the matched all-caps word with a space between its letters."""
    return " ".join(match.group(0))


# Only whole words made of two or more capital letters are rewritten.
result = re.sub(r"\b[A-Z]{2,}\b", _space_out, data)
print(result)
Prints:
The P N U D, U N, U C A L P and U S A and U N.
EDIT: Small benchmark
import re
import regex
from timeit import timeit
# Patterns under test, compiled once so that only the substitution is timed.
# pat1 / pat4 match whole words of two or more capitals; pat2 / pat3 match a
# single capital that is directly followed by another capital.
pat1 = re.compile(r"\b[A-Z]{2,}\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = re.compile(r"[A-Z](?=[A-Z])") # the same without capturing group
# using regex module instead of re
pat4 = regex.compile(r"\b[[:upper:]]{2,}\b")
data = "The PNUD, UN, UCALP and USA and U N."
def fn1():
    """Space out each pat1 match with a callable replacement."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)
def fn2():
    """Append a space to each pat2 match via the captured group."""
    return pat2.sub(r"\g<1> ", data)
def fn3():
    """Append a space to each pat3 match via the whole match (group 0)."""
    return pat3.sub(r"\g<0> ", data)
def fn4():
    """Same as fn1, but with the pattern compiled by the ``regex`` module."""
    return pat4.sub(lambda g: " ".join(g.group(0)), data)
# Total seconds for 10,000 calls of each variant.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
t4 = timeit(fn4, number=10_000)
print(t1)
print(t2)
print(t3)
print(t4)
Prints:
0.03805400081910193
0.10581987909972668
0.10386284696869552
0.044628452975302935
You can use a single call to re.sub and match a single uppercase char and assert another one to the right.
In the replacement use the match followed by a space using \g<0>
[A-Z](?=[A-Z])
Regex demo
Example
result = re.sub('[A-Z](?=[A-Z])', r'\g<0> ', data)
Using a lambda with regex instead of re is slightly slower, but it also lets you match Unicode uppercase characters from non-English languages, which is, ceteris paribus, a more generalizable answer.
import re
import regex
from timeit import timeit
# pat1 / pat3 match whole words of one or more capitals (pat3 through the
# ``regex`` module's [[:upper:]] class); pat2 matches a single capital that is
# directly followed by another capital.
pat1 = re.compile(r"\b[A-Z]+\b")
pat2 = re.compile(r"([A-Z])(?=[A-Z])")
pat3 = regex.compile(r"\b[[:upper:]]+\b")
data = "The PNUD, UN, UCALP and USA and U N."
def fn1():
    """Space out each pat1 match with a callable replacement."""
    return pat1.sub(lambda g: " ".join(g.group(0)), data)
def fn2():
    """Append a space to each pat2 match via the captured group."""
    return pat2.sub(r"\g<1> ", data)
def fn3():
    """Same as fn1, but with the pattern compiled by the ``regex`` module."""
    return pat3.sub(lambda g: " ".join(g.group(0)), data)
# Total seconds for 10,000 calls of each variant.
t1 = timeit(fn1, number=10_000)
t2 = timeit(fn2, number=10_000)
t3 = timeit(fn3, number=10_000)
# Print each variant's output first, then the three timings.
print(fn1())
print(fn2())
print(fn3())
print(t1)
print(t2)
print(t3)
DEMO
I have a string of following types:
a1 = 'images1subimages1/folder100/hello1.png'
a1 = 'images1subimages1 folder100 hello1.png'
a1 = 'images1subimages1folder100hello1.png'
a1 = 'images1b100d1.png'
The first Integer of the string is num0 and we only care about it. We want to increase all occurrence of num0 by one and keep other numbers the same.
Required:
a2 = 'images2subimages2/folder100/hello2.png'
a2 = 'images2subimages2 folder100 hello2.png'
a2 = 'images2subimages2folder100hello2.png'
a2 = 'images2b100d2.png'
My attempt:
import re
a1 = 'images1subimages1/folder100/hello1.png'
# Every run of digits in the string, as integers.
nums = list(map(int, re.findall(r'\d+', a1)))
# The first number is the one whose occurrences are to be incremented.
nums0 = nums[0]
# NOTE(review): nums_changed is computed but never used below.
nums_changed = [j+1 if j==nums[0] else j for i,j in enumerate(nums)]
# Chunks of word characters that end in digits. Separators, and any text that
# follows the last digit of a chunk, are not part of these matches.
parts = re.findall(r'(\w*\d+)',a1)
for i in range(len(parts)):
    num_parts = list(map(int, re.findall(r'\d+', parts[i])))
    for num_part in num_parts:
        if num_part == nums0:
            # str.replace swaps the digit string everywhere in the chunk,
            # including where it is part of a longer number.
            parts[i] = parts[i].replace(str(nums0), str(nums0+1))
# The chunks are always re-joined with '/', whatever separated them originally.
ans = '/'.join(parts)
ans
This has the following result:
a1 = 'images1subimages1/folder100/hello1.png' # good
a1 = 'images1subimages1 folder100 hello1.png' # bad
Is there a general way to solve the problem using regex in python?
I suggest first extracting the first number and then using re.sub to increment every occurrence of that number that is not surrounded by other digits:
import re
a1 = 'images1subimages1/folder100/hello1.png'
num0_m = re.search(r'\d+', a1)  # first run of one or more digits, or None
if num0_m is not None:
    # Match that number only where it is not part of a longer digit run.
    rx = r'(?<!\d)' + num0_m.group() + r'(?!\d)'

    def _plus_one(match):
        """Return the matched number incremented by one, as text."""
        return str(int(match.group()) + 1)

    print(re.sub(rx, _plus_one, a1))
# => images2subimages2/folder100/hello2.png
See the Python demo
You can split the string on numbers, increment the ones equal to the first one, and rebuild the string:
import re
def increment_first(s):
    """Increment every number in *s* that equals the first number in *s*.

    Numbers are maximal runs of digits and are compared by integer value.
    All other numbers are left exactly as written (leading zeros included),
    and a string without any digits is returned unchanged.
    """
    # The capturing group makes re.split keep the digit runs: they sit at the
    # odd indexes of ``parts``, the text around them at the even indexes.
    parts = re.split(r'(\d+)', s)
    if len(parts) == 1:
        return s  # no digits at all, nothing to increment
    num0 = int(parts[1])
    for i in range(1, len(parts), 2):
        num = int(parts[i])
        if num == num0:
            parts[i] = str(num + 1)
    return ''.join(parts)


tests = ['images1subimages1/folder100/hello1.png',
         'images1subimages1 folder100 hello1.png',
         'images1subimages1folder100hello1.png',
         'images1b100d1.png']
for test in tests:
    print(test, increment_first(test))
Output:
images1subimages1/folder100/hello1.png images2subimages2/folder100/hello2.png
images1subimages1 folder100 hello1.png images2subimages2 folder100 hello2.png
images1subimages1folder100hello1.png images2subimages2folder100hello2.png
images1b100d1.png images2b100d2.png
Alas, I'm not as fast as some of these regex gurus. Here is my solution anyway.
Find the first occurrence of a number re.search(r'\d+', st).group(0)
Substitute the first occurrence where the found number is not preceded or followed by another number (?<!\d)+' + re.escape(first) + r'(?!\d)+.
import re
def increment_all_of_first_occurring_number(st):
    """Increment every standalone occurrence of the first number found in *st*.

    The first number is the first run of digits in *st*. An occurrence counts
    only when it is not directly preceded or followed by another digit. A
    string without any digits is returned unchanged.
    """
    first_match = re.search(r'\d+', st)
    if first_match is None:
        return st
    first = first_match.group(0)
    # The lookarounds reject occurrences embedded in a longer digit run, e.g.
    # the "10" inside "10101". They are zero-width, so they take no quantifier.
    pattern = r'(?<!\d)' + re.escape(first) + r'(?!\d)'
    return re.sub(pattern, str(int(first) + 1), st)


if __name__ == '__main__':
    samples = (
        'images1subimages1/folder100/hello1.png',
        'images1subimages1 folder100 hello1.png',
        'images1subimages1folder100hello1.png',
        'images1b100d1.png',
        'images10subimages10/folder10101/hello10.png',
        'images10subimages10 folder10101 hello10.png',
        'images10subimages10folder10101hello10.png',
        'images10b10101d10.png',
    )
    for sample in samples:
        print(increment_all_of_first_occurring_number(sample))
Results
images2subimages2/folder100/hello2.png
images2subimages2 folder100 hello2.png
images2subimages2folder100hello2.png
images2b100d2.png
images11subimages11/folder10101/hello11.png
images11subimages11 folder10101 hello11.png
images11subimages11folder10101hello11.png
images11b10101d11.png
I have a list of company names, some of which contain abbreviations, e.g.:
compNames = ['Costa Limited', 'D B M LTD']
I need to convert compNames of text to a matrix of token counts using the following. But this does not output columns for B D M in D B M LTD
count_vect = CountVectorizer(analyzer='word')
count_vect.fit_transform(compNames).toarray()
What is the best way to concatenate the single characters in a text?
ex: 'D B M LTD' to 'DBM LTD'
import re
string = 'D B M LTD'
# Step 1 (inner sub) doubles the space in front of every chunk of two or more
# non-space characters. Step 2 (outer sub) removes the single space after each
# non-space character, which glues the one-letter chunks together and leaves
# exactly one space in front of the longer chunk.
result = re.sub(r"([^ ]) ", r"\1", re.sub(r" ([^ ]{2,})", r"  \1", string))
print(result)
Awkward, but it should work. It introduces an additional space in front of LTD and then replaces "D " with "D", "B " with "B" and so on.
Here is a short function that splits a string on whitespace into a list, iterates over the list, builds a temporary string from the elements of length 1, and appends that temporary string to a new list when an element with a length greater than one is encountered.
import re
a = 'D B M LTD'


def single_concat(s):
    """Join each run of consecutive one-character words of *s* into one word.

    *s* is split on whitespace; words longer than one character are kept as
    they are. The resulting words are joined with single spaces. A run of
    one-character words at the end of *s* is kept as well.
    """
    out = []
    run = []  # one-character words collected since the last longer word
    for word in s.split():
        if len(word) == 1:
            run.append(word)
            continue
        if run:
            out.append(''.join(run))
            run = []
        out.append(word)
    if run:
        # Flush the run that reaches the end of the string.
        out.append(''.join(run))
    return ' '.join(out)


single_concat(a)
# returns:
'DBM LTD'
import re
s = "D B M LTD"
# Delete the whitespace between two neighbouring one-letter capitals: a
# stand-alone capital (word boundary on both sides) followed by whitespace is
# kept without that whitespace when the next word is again a single capital.
# Longer words are never touched, so any number of them may appear anywhere.
result = re.sub(r"\b([A-Z])\s+(?=[A-Z]\b)", r"\1", s)
print(result)
Prints DBM LTD.
import re
string = 'D B M LTD'
# "+" serves as a temporary marker, applied from the innermost call outwards:
#   1. a one-character word followed by a space becomes that character + "+";
#   2. a "+" directly in front of a word of two or more characters is turned
#      back into a space;
#   3. every remaining "+" is removed, gluing the single characters together.
result = re.sub(r"\+", r"", re.sub(r"\+(\w\B)", r" \1", re.sub(r"(\b\w) ", r"\1+", string)))
print(result)
I'm using the + character as temporary, assuming there are no + characters in the string. If there are, use some other that doesn't occur.
Look, no re:
def mingle(s):
    """ SO: 49692941

    Join each run of consecutive one-character words of *s* into one word.

    *s* is split on whitespace and the resulting words are joined with single
    spaces. Words longer than one character are kept unchanged; a run of
    one-character words at the end of *s* is kept too.
    """
    words = []
    letters = []  # one-character words collected since the last longer word
    for word in s.split():
        if len(word) == 1:
            letters.append(word)
            continue
        # Only emit the collected letters when there are any; an empty entry
        # would otherwise add a stray space to the result.
        if letters:
            words.append("".join(letters))
            letters = []
        words.append(word)
    if letters:
        words.append("".join(letters))
    return " ".join(words)


print( mingle('D B M LTD') )
prints
DBM LTD
I have the following string and I split it:
>>> st = '%2g%k%3p'
>>> l = filter(None, st.split('%'))
>>> print l
['2g', 'k', '3p']
Now I want to print the g letter two times, the k letter one time and the p letter three times:
ggkppp
How is it possible?
You could use a generator expression with isdigit() to check whether the first symbol is a digit and then return the following string repeated the appropriate number of times. Then you could use join to get your output:
''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Demonstration:
In [70]: [i[1:]*int(i[0]) if i[0].isdigit() else i for i in l ]
Out[70]: ['gg', 'k', 'ppp']
In [71]: ''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Out[71]: 'ggkppp'
EDIT
Using re module when first number is with several digits:
''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Example:
In [144]: l = ['12g', '2kd', 'h', '3p']
In [145]: ''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Out[145]: 'ggggggggggggkdkdhppp'
EDIT2
For your input like:
st = '%2g_%3k%3p'
You could replace _ with an empty string and then add _ to the end if the word from the list ends with the _ symbol:
import re

st = '%2g_%3k%3p'
l = list(filter(None, st.split('%')))


def _expand(item):
    """Repeat the word part of *item* by its leading count, keeping one trailing "_".

    Items without a "<digits><word characters>" part are returned unchanged.
    """
    m = re.search(r'(\d+)(\w+)', item)
    if m is None:
        return item
    # "_" is a word character, so it is repeated along with the letters; strip
    # it from the repetition and put a single one back if the item ended in it.
    repeated = m.group(2) * int(m.group(1))
    return repeated.replace("_", "") + '_' * item.endswith('_')


result = ''.join(_expand(i) for i in l)
Output:
'gg_kkkppp'
EDIT3
Solution without re module but with usual loops working for 2 digits. You could define functions:
def add_str(ind, st):
    """Repeat the text of *st* that follows its first *ind* characters.

    The first *ind* characters are parsed as the repeat count. A trailing "_"
    is not repeated; it is appended once after the repeated text.
    """
    count = int(st[:ind])
    if st.endswith('_'):
        return st[ind:-1] * count + '_'
    return st[ind:] * count


def collect(l):
    """Expand every "<count><text>" item of *l* and concatenate the results.

    The count is the run of leading ASCII digits, of any length. Items that do
    not start with a digit (including empty strings) are copied unchanged.
    """
    pieces = []
    for item in l:
        # Number of leading digits = how much lstrip removes from the front.
        ind = len(item) - len(item.lstrip('0123456789'))
        pieces.append(add_str(ind, item) if ind else item)
    return ''.join(pieces)
And then use them as:
l = ['12g_', '3k', '3p']
print(collect(l))
gggggggggggg_kkkppp
One-liner Regex way:
>>> import re
>>> st = '%2g%k%3p'
>>> re.sub(r'%|(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
The regex %|(\d*)(\w+) matches every % and captures zero or more digits that precede a word character into one group and the following word characters into another group. On replacement, each match is replaced with the value produced by the replacement function, so the % characters are lost.
or
>>> re.sub(r'%(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
Assumes you are always printing single letter, but preceding number may be longer than single digit in base 10.
seq = ['2g', 'k', '3p']
pieces = []
for item in seq:
    # Everything before the last character is the count; no count means once.
    count_text, letter = item[:-1], item[-1]
    pieces.append((int(count_text) if count_text else 1) * letter)
result = ''.join(pieces)
assert result == "ggkppp"
LATE FOR THE SHOW BUT READY TO GO
Another way is to define a function that converts nC into CCCC...C (n times), pass it to map to apply it to every element of the list l that results from splitting on %, and then finally join them all, as follows:
>>> def f(s):
x = 0
if s:
if len(s) == 1:
out = s
else:
for i in s:
if i.isdigit():
x = x*10 + int(i)
out = x*s[-1]
else:
out = ''
return out
>>> st
'%4g%10k%p'
>>> ''.join(map(f, st.split('%')))
'ggggkkkkkkkkkkp'
>>> st = '%2g%k%3p'
>>> ''.join(map(f, st.split('%')))
'ggkppp'
Or if you want to put all of these into one single function definition:
>>> def f(s):
out = ''
if s:
l = filter(None, s.split('%'))
for item in l:
x = 0
if len(item) == 1:
repl = item
else:
for c in item:
if c.isdigit():
x = x*10 + int(c)
repl = x*item[-1]
out += repl
return out
>>> st
'%2g%k%3p'
>>> f(st)
'ggkppp'
>>>
>>> st = '%4g%10k%p'
>>>
>>> f(st)
'ggggkkkkkkkkkkp'
>>> st = '%4g%101k%2p'
>>> f(st)
'ggggkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkpp'
>>> len(f(st))
107
EDIT :
In case of the presence of _ where the OP does not want this character to be repeated, then the best way in my opinion is to go with re.sub, it will make things easier, this way:
>>> def f(s):
pat = re.compile(r'%(\d*)([a-zA-Z]+)')
out = pat.sub(lambda m:int(m.group(1))*m.group(2) if m.group(1) else m.group(2), s)
return out
>>> st = '%4g_%12k%p__%m'
>>> f(st)
'gggg_kkkkkkkkkkkkp__m'
Loop the list, check first entry for number, and then append the second digit onwards:
string = ''
l = ['2g', 'k', '3p']
for entry in l:
    # Split the entry into its leading digits (the repeat count, any number of
    # digits) and the remaining text; an entry without a count is used once.
    text = entry.lstrip('0123456789')
    count = entry[:len(entry) - len(text)]
    string += text * int(count or 1)
This Python function interlocks the characters of two words (e.g., "sho" + "col" -> "school"). word1-char1 + word2-char1 + word1-char2 + ...
def interlock(a, b):
    """Interleave *a* and *b* character by character.

    The result is a[0] + b[0] + a[1] + b[1] + ... and stops as soon as the
    shorter of the two strings is used up; the rest of the longer string is
    not included.
    """
    # zip stops at the shorter input, so neither string is indexed past its end.
    return "".join(x + y for x, y in zip(a, b))


interlock("sho", "col")
Now, I would like to apply this function to a list of words. The goal is to find out any interlock corresponds to an item of a list.
word_list = ["test", "col", "tele", "school", "tel", "sho", "aye"]
To do that, I would first have to create a new list that has all the interlocks in it. This is exactly where I am stuck - I don't know how to iterate over word_list using interlock.
Thanks for your help!
If you want all possible permutations of the list to pass to interlock without pairing a word with itself i.e we won't get interlock("col", "col"):
def interlock(s1, s2):
    """Interleave *s1* and *s2* character by character.

    Once the shorter string is used up, the remainder of the longer string is
    appended unchanged.
    """
    paired = "".join(a + b for a, b in zip(s1, s2))
    shorter = min(len(s1), len(s2))
    # At most one of the two slices below is non-empty.
    return paired + s1[shorter:] + s2[shorter:]


# A comma between every pair of words: adjacent string literals without one
# are concatenated into a single word.
word_list = ["test", "col", "tele", "school", "tel", "sho", "aye"]
from itertools import permutations
# get all permutations of len 2 from our word list
perms = permutations(word_list, 2)
st = set(word_list)  # set for fast membership tests
for a, b in perms:
    res = interlock(a, b)
    if res in st:
        print(res)
school
You can also achieve the same result using itertools.zip_longest using a fillvalue of "" to catch the end of the longer words:
from itertools import permutations, zip_longest
st = set(word_list)
# Every ordered pair of two different list positions.
for a, b in permutations(word_list, 2):
    # zip_longest pads the shorter word with "", so the tail of the longer
    # word is kept in the interleaved result.
    res = "".join(x + y for x, y in zip_longest(a, b, fillvalue=""))
    if res in st:
        print(res)
You can do it using product function from itertools module:
from itertools import product
# Build the list of interlocks for every ordered pair of words (pairs of a
# word with itself included) instead of discarding each result.
interlocks = [interlock(a, b) for a, b in product(word_list, repeat=2)]
https://docs.python.org/2/library/itertools.html#itertools.product
Try this.
def interlockList(A):
    """Fold the words of *A* into one string by repeatedly interlocking the first two.

    *A* is modified in place and ends up holding only the final string, which
    is also returned. An empty list yields "" and a one-word list yields that
    word unchanged.
    """
    while len(A) >= 2:
        # Replace the two leading words by their interlock in one slice
        # assignment; removing them by value could delete the wrong entries
        # when the list contains duplicates.
        A[0:2] = [interlock(A[0], A[1])]
    return A[0] if A else ""