Concatenate the single characters in texts - python

I have a list with company names, some of them has abbreviations. ex:
compNames = ['Costa Limited', 'D B M LTD']
I need to convert compNames of text to a matrix of token counts using the following. But this does not output columns for B D M in D B M LTD
count_vect = CountVectorizer(analyzer='word')
count_vect.fit_transform(compNames).toarray()
What is the best way to concatenate the single characters in a text?
ex: 'D B M LTD' to 'DBM LTD'

import re
string = 'D B M LTD'
print re.sub("([^ ]) ", r"\1", re.sub(" ([^ ]{2,})", r" \1", string))
Awkward, but it should work. It introduces an additional space in front of LTD and then replaces "D " with "D", "B " with "B" and so on.

Here is a short function that breaks a string on white space characters to a list, iterates the list, builds a temporary string if the element is of length 1, appends the temp string to a new list when an element with length greater than one is encounters.
import re
a = 'D B M LTD'
def single_concat(s):
out = []
tmp = ''
for x in re.split(r'\s+', s):
if len(x) == 1:
tmp += x
else:
if tmp:
out.append(tmp)
out.append(x)
tmp = ''
return ' '.join(out)
single_concat(a)
# returns:
'DBM LTD'

import re
s = "D B M LTD"
first_part = ''
for chunk in re.compile("([A-Z]{1})\s").split(s):
if len(chunk) == 1:
first_part += chunk
elif len(chunk) > 1:
last_part = chunk
print(first_part + " " + last_part)
Prints DBM LTD.

import re
string = 'D B M LTD'
print re.sub(r"\+", r"", re.sub(r"\+(\w\B)", r" \1", re.sub(r"(\b\w) ", r"\1+", string)))
I'm using the + character as temporary, assuming there are no + characters in the string. If there are, use some other that doesn't occur.

Look, no re:
def mingle(s):
""" SO: 49692941 """
l = s.split()
r = []
t = []
for e in l:
if len(e) == 1:
t.append(e)
else:
j = "".join(t)
r.append( j )
r.append( e )
t = []
return " ".join(r)
print( mingle('D B M LTD') )
prints
DBM LTD

Related

trying to reverse the string with "_" fixed at point

def r(s):
str = []
for i in len(s):
if (s[i]=='_'):
str = s[i] + str
continue
str = s[i] + str
return str
I tried using the above code to convert the following string
Input: ab_cde
Expected Output: ed_cba
s = 'ab_cde'
out = ''
for a, b in zip(s, s[::-1]):
if b != '_' and a != '_':
out += b
else:
out += a
print(out)
Prints:
ed_cba
EDIT: For more fixed points:
s = 'ab_cde_f_ghijk_l'
i, out = iter(ch for ch in s[::-1] if ch != '_'), ''
out = ''.join(ch if ch == '_' else next(i) for ch in s)
print(out)
Prints:
lk_jih_g_fedcb_a
The main idea is to check all the positions of the underscore _, save them and reverse the string without them, to insert them again after reversing.
import re
def r(s):
# check where all the underscore are
underscore_positions = [m.start() for m in re.finditer('_', s)]
# get list of reversed chars without underscores
reversed_chars = [c for c in reversed(s) if c != '_']
# put underscore back where they where
for p in underscore_positions:
reversed_chars.insert(p, '_')
# profit
return "".join(reversed_chars)
The function can be modified to have a different fixed character.
I also uses the package re for the regex function to identify the _, you can do with a simple loop as underscore_positions = [i for i, c in enumerate(s) if c =='_'] if you prefer.
def fixed_reverse(s, ch):
idxs = [-1] + [i for i, x in enumerate(s) if x == ch] + [len(s)]
idxs = [x - i + 1 for i, x in enumerate(idxs)]
chars = "".join(x for x in s if x != ch)[::-1]
return ch.join(chars[a:b] for a, b in zip(idxs[:-1], idxs[1:]))
>>> fixed_reverse("ab_cde_f_ghijk_l", "_")
'lk_jih_g_fedcb_a'
This works by:
Storing the locations of the fixed-point character "_".
Reversing the string with the "_" characters removed.
Inserting the "_" back into the correct locations.

Print letters in specific pattern in Python

I have the follwing string and I split it:
>>> st = '%2g%k%3p'
>>> l = filter(None, st.split('%'))
>>> print l
['2g', 'k', '3p']
Now I want to print the g letter two times, the k letter one time and the p letter three times:
ggkppp
How is it possible?
You could use generator with isdigit() to check wheter your first symbol is digit or not and then return following string with appropriate count. Then you could use join to get your output:
''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Demonstration:
In [70]: [i[1:]*int(i[0]) if i[0].isdigit() else i for i in l ]
Out[70]: ['gg', 'k', 'ppp']
In [71]: ''.join(i[1:]*int(i[0]) if i[0].isdigit() else i for i in l)
Out[71]: 'ggkppp'
EDIT
Using re module when first number is with several digits:
''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Example:
In [144]: l = ['12g', '2kd', 'h', '3p']
In [145]: ''.join(re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1)) if re.search('(\d+)(\w+)', i) else i for i in l)
Out[145]: 'ggggggggggggkdkdhppp'
EDIT2
For your input like:
st = '%2g_%3k%3p'
You could replace _ with empty string and then add _ to the end if the work from list endswith the _ symbol:
st = '%2g_%3k%3p'
l = list(filter(None, st.split('%')))
''.join((re.search('(\d+)(\w+)', i).group(2)*int(re.search('(\d+)(\w+)', i).group(1))).replace("_", "") + '_' * i.endswith('_') if re.search('(\d+)(\w+)', i) else i for i in l)
Output:
'gg_kkkppp'
EDIT3
Solution without re module but with usual loops working for 2 digits. You could define functions:
def add_str(ind, st):
if not st.endswith('_'):
return st[ind:] * int(st[:ind])
else:
return st[ind:-1] * int(st[:ind]) + '_'
def collect(l):
final_str = ''
for i in l:
if i[0].isdigit():
if i[1].isdigit():
final_str += add_str(2, i)
else:
final_str += add_str(1, i)
else:
final_str += i
return final_str
And then use them as:
l = ['12g_', '3k', '3p']
print(collect(l))
gggggggggggg_kkkppp
One-liner Regex way:
>>> import re
>>> st = '%2g%k%3p'
>>> re.sub(r'%|(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
%|(\d*)(\w+) regex matches all % and captures zero or moredigit present before any word character into one group and the following word characters into another group. On replacement all the matched chars should be replaced with the value given in the replacement part. So this should loose % character.
or
>>> re.sub(r'%(\d*)(\w+)', lambda m: int(m.group(1))*m.group(2) if m.group(1) else m.group(2), st)
'ggkppp'
Assumes you are always printing single letter, but preceding number may be longer than single digit in base 10.
seq = ['2g', 'k', '3p']
result = ''.join(int(s[:-1] or 1) * s[-1] for s in seq)
assert result == "ggkppp"
LATE FOR THE SHOW BUT READY TO GO
Another way, is to define your function which converts nC into CCCC...C (ntimes), then pass it to a map to apply it on every element of the list l coming from the split over %, the finally join them all, as follows:
>>> def f(s):
x = 0
if s:
if len(s) == 1:
out = s
else:
for i in s:
if i.isdigit():
x = x*10 + int(i)
out = x*s[-1]
else:
out = ''
return out
>>> st
'%4g%10k%p'
>>> ''.join(map(f, st.split('%')))
'ggggkkkkkkkkkkp'
>>> st = '%2g%k%3p'
>>> ''.join(map(f, st.split('%')))
'ggkppp'
Or if you want to put all of these into one single function definition:
>>> def f(s):
out = ''
if s:
l = filter(None, s.split('%'))
for item in l:
x = 0
if len(item) == 1:
repl = item
else:
for c in item:
if c.isdigit():
x = x*10 + int(c)
repl = x*item[-1]
out += repl
return out
>>> st
'%2g%k%3p'
>>> f(st)
'ggkppp'
>>>
>>> st = '%4g%10k%p'
>>>
>>> f(st)
'ggggkkkkkkkkkkp'
>>> st = '%4g%101k%2p'
>>> f(st)
'ggggkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkkpp'
>>> len(f(st))
107
EDIT :
In case of the presence of _ where the OP does not want this character to be repeated, then the best way in my opinion is to go with re.sub, it will make things easier, this way:
>>> def f(s):
pat = re.compile(r'%(\d*)([a-zA-Z]+)')
out = pat.sub(lambda m:int(m.group(1))*m.group(2) if m.group(1) else m.group(2), s)
return out
>>> st = '%4g_%12k%p__%m'
>>> f(st)
'gggg_kkkkkkkkkkkkp__m'
Loop the list, check first entry for number, and then append the second digit onwards:
string=''
l = ['2g', 'k', '3p']
for entry in l:
if len(entry) ==1:
string += (entry)
else:
number = int(entry[0])
for i in range(number):
string += (entry[1:])

Regex to replace single delimiter

I'm looking to find a limiter in a string " - ". The problem is that there may be other occurrences of "-" which makes the regex a bit more complex
a-b
a - b
a-b - c-d
-a-b- - -c-d-
should end up as
a - b
a - b
a b - c d
a b - c d
One " - " delimiter only.
Other hyphen characters to be replaced by spaces.
import re
def clean_delimter(s):
regx = re.sub(r"(\s?-\s?)", " - ", s)
print regx
return regx
myList = [
"a-b",
"a - b",
"a-b - c-d",
"-a-b- - -c-d-"
]
for i in range(0, len(myList)):
clean_delimter(myList[i])
My regex changes all hyphens which is not what I'm after. But I don't know how to tell Reg how to find
\s*-|\s*-\s*|-\s*
and then look for other occurrences of "-", changing them to " "
since you data is symmetrical, it could be done without regex:
result = []
with open('data', 'r') as f:
for line in f:
line = line.strip()
if line.count('-') > 1:
n = (line.count('-') -1) // 2
line = line.replace('-',' ', n)
#reverse
line = line[::-1]
line = line.replace('-',' ', n)
# reverse to the original order
line = line[::-1]
line = re.sub(r'\s+',r' ', line)
result.append(line)
else:
result.append(line)
for line in result:
print(line)
a-b
a - b
a b - c d
a b - c d

pattern finding in a string python

I try to create a modified LZW which will find patterns of words inside a string. My problem is that 1st element is '' and last is not checked if it is in the list. I saw the pseudo-code from here : https://www.cs.duke.edu/csed/curious/compression/lzw.html . Here is my script for compression:
string = 'this is a test a test this is pokemon'
diction = []
x = ""
count = 0
for c in string.split():
print (c)
print (x)
#x = x + " " + c
if x in diction:
x += " " + c
#print("debug")
else:
#print(x)
diction.append(x)
x = c
count +=1
#print(count)
print (diction)
I tried to fix the 2nd problem by 'appending' a random word to the end of the string but I don't think that's the best solution.
For the 1st problem I tried just to define the variable "x" as str or None but I get this < class 'str' > inside the list.
The link deals with character and splitting a string will give an array of words.
In order to get not an empty string in the dictionary and parsing the last element.
string = 'this is a test a test this is pokemon'
diction = []
x = ""
count = 0
for c in string.split():
print (c)
if x+" "+c in diction:
x += " " + c
else:
diction.append(x+" "+c)
x = c
count +=1
print (diction)
But perhaps you would like something like :
string = 'this is a test a test this is pokemon'
diction = []
x = ""
count = 0
for c in string:
print (c)
if x+c in diction:
x += c
else:
diction.append(x+c)
x = c
count +=1
print (diction)
I'm not sure what the code pretends, but to fix the issues that you mentioned I think you could do this:
string = 'this is a test a test this is pokemon'
diction = []
x = None
count = 0
for c in string.split():
if x in diction:
x += " " + c
else:
if x: diction.append(x)
x = c
count += 1
if not x in diction: diction.append(x)
print (diction)
The output for that code would be:
['this', 'is', 'a', 'test', 'a test', 'this is', 'pokemon']

Insert "X" between identical consecutive letters in a string

Given a string, how can I break it up such that there are no consecutive identical letters, at n, n+1, where n is even.
Meaning, how can i get "abba" to remain "abba", but take "abbb" into "abbXb".
Thanks
Because everyone loves one-liners:
strings = ['ab', 'abba', 'abbb', 'abbba', 'abbababababbbaaaa', 'abcacbbbddbabbdd']
for s in strings:
r = ''.join('X' + v if (k and k % 2 and v == s[k - 1]) else v for (k,v) in enumerate(s))
print s, '->', r
The code reads like this: look at every character in the string. If it's not the first and if it's index is even and it is the same as the character before, prepend an 'X' to the character.
Output:
ab -> ab
abba -> abba
abbb -> abbXb
abbba -> abbXba
abbababababbbaaaa -> abbababababXbbaaXaa
abcacbbbddbabbdd -> abcacbbXbdXdbabXbdXd
Do your own homework, Kevin.
def foo(text, separator):
if len(text) < 2:
return text
result = ""
for i in range(1, len(text), 2):
if text[i] == text[i - 1]:
result += text[i - 1] + separator + text[i]
else:
result += text[i-1:i+1]
if len(text) % 2 != 0:
result += text[-1]
return result
print(foo("ab", "X"))
print(foo("abba", "X"))
print(foo("abbba", "X"))
print(foo("abbababababbbaaaa", "Z"))
Output:
>> ab
>> abba
>> abbXba
>> abbababababZbbaaZaa
You can use itertools.groupby:
from itertools import islice, groupby
import math
def solve(strs, n):
for k, g in groupby(strs):
lis = list(g)
it = iter(lis)
yield 'X'.join(''.join(islice(it, n)) for _ in xrange(int(math.ceil(len(lis)/float(n)))))
Demo:
>>> ''.join(solve("abba", 2))
'abba'
>>> ''.join(solve("abbb", 2))
'abbXb'
>>> ''.join(list(solve('abbbbyyyyy', 2)))
'abbXbbyyXyyXy'
>>> ''.join(solve('abbbbyyyyy', 4))
'abbbbyyyyXy'
May it be good?
from itertools import izip_longest
def X(s):
s_odd = s[::2]
s_even = s[1::2]
output = ''
for o, e in izip_longest(s_odd, s_even):
output += o or ''
if o == e:
output += 'X'
output += e or ''
return output
strings = ['ab', 'abba', 'abbb', 'abbba', 'abbababababbbaaaa', 'abcacbbbddbabbdd']
for s in strings:
print X(s)
result:
ab
abba
abbXb
abbXba
abbababababXbbaaXaa
abcacbbXbdXdbabXbdXd
EDIT
A simpler version:
def X(s):
output = ''
for i in range(0, len(s), 2):
o = s[i]
e = s[i+1] if i < len(s) - 1 else ''
output += o
if o == e:
output += 'X'
output += e
return output
strings = ['ab', 'abba', 'abbb', 'abbba', 'abbababababbbaaaa', 'abcacbbbddbabbdd']
for s in strings:
print X(s)

Categories

Resources