Merging overlapping string sequences in a list - python

I am trying to figure out how to merge overlapping strings in a list together, for example for
['aacc','accb','ccbe']
I would get
['aaccbe']
The following code works for the example above; however, it does not produce the desired result in the following case:
s = ['TGT','GTT','TTC','TCC','CCC','CCT','CCT','CTG','TGA','GAA','AAG','AGC','GCG','CGT','TGC','GCT','CTC','TCT','CTT','TTT','TTT','TTC','TCA','CAT','ATG','TGG','GGA','GAT','ATC','TCT','CTA','TAT','ATG','TGA','GAT','ATT','TTC']
a = s[0]
b = s[-1]
final_s = a[:a.index(b[0])]+b
print(final_s)
>>>TTC
My output is clearly not right, and I don't know why it doesn't work in this case. Note that I have already organized the list with the overlapping strings next to each other.
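For reference, here is a minimal pairwise-merge sketch (not taken from the answers below). It assumes the list is already ordered so that each entry overlaps the next, and joins neighbours on their longest suffix/prefix match:
def merge_pair(a, b):
    # longest suffix of a that is a prefix of b
    for i in range(min(len(a), len(b)), 0, -1):
        if a.endswith(b[:i]):
            return a + b[i:]
    return a + b

def merge_all(strings):
    result = strings[0]
    for s in strings[1:]:
        result = merge_pair(result, s)
    return result

print(merge_all(['aacc', 'accb', 'ccbe']))   # aaccbe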

You can use a trie to store the running substrings and determine overlap more efficiently. When an overlap is possible (i.e. for an input string, there exists a string in the trie containing a letter that starts or ends the input string), a breadth-first search finds the largest possible overlap, and the remaining bits of the string are then added to the trie:
from collections import deque

# trie node (which stores a single letter) class definition
class Node:
    def __init__(self, e, p=None):
        self.e, self.p, self.c = e, p, []

    def add_s(self, s):
        if s:
            self.c.append(self.__class__(s[0], self).add_s(s[1:]))
        return self

class Trie:
    def __init__(self):
        self.c = []

    def last_node(self, n):
        return n if not n.c else self.last_node(n.c[0])

    def get_s(self, c, ls):
        # for an input string, find a letter in the trie that the string starts or ends with
        for i in c:
            if i.e in ls:
                yield i
            yield from self.get_s(i.c, ls)

    def add_string(self, s):
        q, d = deque([j for i in self.get_s(self.c, (s[0], s[-1])) for j in [(s, i, 0), (s, i, -1)]]), []
        while q:
            if (w := q.popleft())[1] is None:
                d.append((w[0] if not w[0] else w[0][1:], w[2], w[-1]))
            elif w[0] and w[1].e == w[0][w[-1]]:
                if not w[-1]:
                    if not w[1].c:
                        d.append((w[0][1:], w[1], w[-1]))
                    else:
                        q.extend([(w[0][1:], i, 0) for i in w[1].c])
                else:
                    q.append((w[0][:-1], w[1].p, w[1], -1))
        if not (d := {a: b for a, *b in d}):
            self.c.append(Node(s[0]).add_s(s[1:]))
        elif (m := min(d, key=len)):
            if not d[m][-1]:
                d[m][0].add_s(m)
            else:
                t = Node(m[0]).add_s(m)
                d[m][0].p = self.last_node(t)
Putting it all together:
t = Trie()
for i in ['aacc', 'accb', 'ccbe']:
    t.add_string(i)

def overlaps(trie, c=''):
    if not trie.c:
        yield c + trie.e
    else:
        yield from [j for k in trie.c for j in overlaps(k, c + trie.e)]

r = [j for k in t.c for j in overlaps(k)]
Output:
['aaccbe']

Use difflib.find_longest_match to find the overlap and concatenate appropriately, then use reduce to apply it across the entire list.
import difflib
from functools import reduce

def overlap(s1, s2):
    # https://stackoverflow.com/a/14128905/4001592
    s = difflib.SequenceMatcher(None, s1, s2)
    pos_a, pos_b, size = s.find_longest_match(0, len(s1), 0, len(s2))
    return s1[:pos_a] + s2[pos_b:]

s = ['aacc', 'accb', 'ccbe']
result = reduce(overlap, s, "")
print(result)
Output
aaccbe

Related

How to find longest intersection between two strings in python?

I'm trying to write a program that would find the longest intersection between two strings. The conditions are:
If there is no common character, the program returns an empty string.
If there are multiple common substrings of the same length, it should return the largest one; for example, for "bbaacc" and "aabb" the repeated substrings are "aa" and "bb", and since "bb" > "aa" the program must return only "bb".
Finally, the program should return the longest common substring; for instance, for "programme" and "grammaire" the result should be "gramm", not "gramme".
My code has a problem with this last condition, how could I change it so it works as expected?
def intersection(v, w):
    if not v or not w:
        return ""
    x, xs, y, ys = v[0], v[1:], w[0], w[1:]
    if x == y:
        return x + intersection(xs, ys)
    else:
        return max(intersection(v, ys), intersection(xs, w), key=len)
Driver:
print(intersection('programme', 'grammaire'))
I can't find the issue with your code, but I solved it like this:
def longest_str_intersection(a: str, b: str):
    # identify all possible character sequences from str a
    seqs = []
    for pos1 in range(len(a)):
        for pos2 in range(len(a)):
            seqs.append(a[pos1:pos2+1])

    # remove empty sequences
    seqs = [seq for seq in seqs if seq != '']

    # find segments in str b
    max_len_match = 0
    max_match_sequence = ''
    for seq in seqs:
        if seq in b:
            if len(seq) > max_len_match:
                max_len_match = len(seq)
                max_match_sequence = seq

    return max_match_sequence
longest_str_intersection('programme', 'grammaire')
-> 'gramm'
Also interested to see if you found a more elegant solution!
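For reference (not part of either post above), difflib can find the longest contiguous common block directly; the recursive version in the question does not require the matched characters to be contiguous, which is why it can return "gramme" rather than "gramm":
import difflib

def longest_common_substring(v, w):
    # find_longest_match returns the longest contiguous matching block
    m = difflib.SequenceMatcher(None, v, w).find_longest_match(0, len(v), 0, len(w))
    return v[m.a:m.a + m.size]

print(longest_common_substring('programme', 'grammaire'))  # gramm
Note that on ties find_longest_match prefers the match that starts earliest in the first string, which is not the same as the "largest substring" tie-break asked for in the question.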

Huffman Coding Tree traversal

I am writing the Huffman coding algorithm in Python. I have successfully managed to create a tree based on a string input, but am stuck on the best way to traverse it while generating the codes for each letter.
from collections import Counter

class HuffNode:
    def __init__(self, count, letter=None):
        self.letter = letter
        self.count = count
        self.right = None
        self.left = None

word = input()
d = dict(Counter(word))
Nodes = [HuffNode(d[w], w) for w in sorted(d, key=d.get, reverse=True)]

while len(Nodes) > 1:
    a = Nodes.pop()
    b = Nodes.pop()
    c = HuffNode(a.count + b.count)
    c.left, c.right = a, b
    Nodes.append(c)
    Nodes.sort(key=lambda x: x.count, reverse=True)
For a word like "hello", d = dict(Counter(word)) gets the frequency of each letter and converts it to a dict, giving {'e': 1, 'l': 2, 'h': 1, 'o': 1}. Each letter is then turned into a HuffNode and stored in Nodes.
The while loop then builds a tree until only one node is left.
When the loop exits I'll have a single root node representing the whole tree.
What's the best way to traverse this tree while generating the codes for each letter?
Thanks
Generally speaking, you would want a recursive function that, given a HuffNode h and a prefix p:
if h.letter is not empty (i.e. h is a leaf), yields (p, h.letter) -> this is the code for the letter
otherwise, calls itself on h.left with prefix p + '0' and on h.right with p + '1'
A possible implementation (not tested, may have typos):
def make_code(node, prefix):
    if node is None:
        return []
    if node.letter is not None:
        return [(prefix, node.letter)]
    else:
        result = []
        result.extend(make_code(node.left, prefix + '0'))
        result.extend(make_code(node.right, prefix + '1'))
        return result

codes = make_code(root, '')
where root is the Huffman tree you built in the first step. The first test (if node is None) is there to handle lopsided nodes, where one of the children may be empty.
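A hypothetical way to wire this up to the tree built in the question, assuming the while loop has finished so that Nodes holds only the root:
root = Nodes[0]              # the single remaining node is the root of the tree
codes = make_code(root, '')
print({letter: code for code, letter in codes})
# the exact codes depend on how equal counts are ordered, but each letter
# gets a prefix-free code whose length reflects its frequency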

Python - partitioning a list of strings using an equivalence relation

I have a list of alphabetic strings [str1, str2, ...] which I need to partition into equivalence classes using an equivalence relation R, where str1 R str2 (in relational notation) if str2 can be obtained from str1 by a sequence of valid one-letter changes. Here 'valid' means the change produces a valid alphabetic word; e.g. cat --> car is valid but cat --> cax is not. If the input list was ['cat','ace','car','zip','ape','pip'] then the code should return [['cat','car'],['ace','ape'],['zip','pip']].
I've got an initial working version which, however, produces some "classes" which contain duplicates.
I don't suppose there is any Python package which allows me to define such equivalence relations, but even otherwise what would be the best way of doing this?
This should work for strings of different lengths. Obviously, ordering matters.
def is_one_letter_different(s1, s2):
    if len(s1) != len(s2):
        return False
    diff_count = 0
    for char1, char2 in zip(s1, s2):
        if char1 != char2:
            diff_count += 1
    return diff_count == 1

def group(candidates):
    groups = []
    for candidate in candidates:
        for group in groups:
            for word in group:
                if is_one_letter_different(word, candidate):
                    group.append(candidate)
                    break
            if candidate in group:
                break
        else:
            groups.append([candidate])
    return groups

print group(['bread','breed', 'bream', 'tread', 'treat', 'short', 'shorn', 'shirt', 'shore', 'store','eagle','mired', 'sired', 'hired'])
Output:
[['bread', 'breed', 'bream', 'tread', 'treat'], ['short', 'shorn', 'shirt', 'shore', 'store'], ['eagle'], ['mired', 'sired', 'hired']]
EDIT: Updated to handle the additional test cases. I'm not sure the output is correct - please validate. And please provide good test cases next time.
I would do it something like this: construct an undirected graph where each word is a node, and each edge indicates that the relation holds between them. Then you can identify each disconnected "island" in the graph, each of which represents an equivalence class.
from collections import defaultdict

def exactly_one(iter):
    count = 0
    for x in iter:
        if x:
            count += 1
            if count > 1:
                break
    return count == 1

def are_one_letter_apart(a, b):
    if len(a) != len(b): return False
    return exactly_one(a_char != b_char for a_char, b_char in zip(a, b))

def pairs(seq):
    for i in range(len(seq)):
        for j in range(i+1, len(seq)):
            yield (seq[i], seq[j])

def search(graph, node):
    seen = set()
    to_visit = set()
    to_visit.add(node)
    while to_visit:
        cur = to_visit.pop()
        if cur in seen: continue
        for neighbor in graph[cur]:
            if neighbor not in seen:
                to_visit.add(neighbor)
        seen.add(cur)
    return seen

def get_islands(graph):
    seen = set()
    islands = []
    for item in graph.iterkeys():
        if item in seen: continue
        group = search(graph, item)
        seen = seen | group
        islands.append(group)
    return islands

def create_classes(seq, f):
    graph = defaultdict(list)
    for a, b in pairs(seq):
        if f(a, b):
            graph[a].append(b)
            graph[b].append(a)
    # one last pass to pick up items with no relations to anything else
    for item in seq:
        if item not in graph:
            graph[item].append(item)
    return [list(group) for group in get_islands(graph)]

seq = ['cat','ace','car','zip','ape','pip']
print create_classes(seq, are_one_letter_apart)
Result:
[['ace', 'ape'], ['pip', 'zip'], ['car', 'cat']]
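An alternative sketch, not from the answer above: a small union-find (disjoint set) produces the same partition without materializing the graph, reusing the pairs and are_one_letter_apart helpers defined above.
def create_classes_unionfind(seq, f):
    parent = {x: x for x in seq}

    def find(x):
        while parent[x] != x:
            parent[x] = parent[parent[x]]   # path halving
            x = parent[x]
        return x

    for a, b in pairs(seq):
        if f(a, b):
            ra, rb = find(a), find(b)
            if ra != rb:
                parent[ra] = rb             # merge the two classes

    classes = {}
    for x in seq:
        classes.setdefault(find(x), []).append(x)
    return list(classes.values())

print(create_classes_unionfind(seq, are_one_letter_apart))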

Permutation growing algorithm in Python and print strings

I've done this algorithm before, in school, but I've forgotten how to do it. Basically I want to produce strings like 'a[0]', 'a[0].a[0]', ...
length = range(0,2) #length = 2
depth = range(0,3)  #depth = 3
for i in length:
    for k in depth:
        … print each permutation
RESULT
a[0]
a[0].a[0]
a[0].a[1]
a[0].a[0].a[0]
a[0].a[0].a[1]
a[0].a[1].a[0]
a[0].a[1].a[1]
a[1]
a[1].a[0]
a[1].a[1]
a[1].a[0].a[0]
a[1].a[0].a[1]
a[1].a[1].a[0]
a[1].a[1].a[1]
Changing the ordering of the output a bit, so that it is the same on all levels:
def thing(length, depth, prefix=None):
    if not depth: return
    if not prefix: prefix = []
    for l in range(length):
        r = prefix + ['a[{}]'.format(l)]
        yield '.'.join(r)
        for r in thing(length, depth - 1, r):
            yield r

for x in thing(2, 3): print(x)
Output is:
a[0]
a[0].a[0]
a[0].a[0].a[0]
a[0].a[0].a[1]
a[0].a[1]
a[0].a[1].a[0]
a[0].a[1].a[1]
a[1]
a[1].a[0]
a[1].a[0].a[0]
a[1].a[0].a[1]
a[1].a[1]
a[1].a[1].a[0]
a[1].a[1].a[1]
An alternative approach: as Hyperboreus points out, the key is realizing that the ordering of the first elements does not match that of the following elements, so I handle them separately.
from itertools import product

def item_fmt(i):
    return "a[{}]".format(repr(i))

def make_result(*args):
    return ".".join(item_fmt(arg) for arg in args)

def main():
    items = [0, 1]
    maxdepth = 3
    for first in items:                                 # in order by first-item
        print(make_result(first))                       # show first-item-only
        for depth in range(1, maxdepth):                # in order by depth
            for combo in product(items, repeat=depth):  # generate all combinations of given depth
                print(make_result(first, *combo))

if __name__=="__main__":
    main()
results in
a[0]
a[0].a[0]
a[0].a[1]
a[0].a[0].a[0]
a[0].a[0].a[1]
a[0].a[1].a[0]
a[0].a[1].a[1]
a[1]
a[1].a[0]
a[1].a[1]
a[1].a[0].a[0]
a[1].a[0].a[1]
a[1].a[1].a[0]
a[1].a[1].a[1]

Find all Occurences of Every Substring in String

I am trying to find all occurrences of sub-strings in a main string (of all lengths). My function takes one string and then returns a dictionary of every sub-string (which occurs more than once, of course) and how many times it occurs (format of the dictionary: {substring: # of occurrences, ...}). I am using collections.Counter(s) to help me with it.
Here is my function:
from collections import Counter

def patternFind(s):
    patterns = {}
    for index in range(1, len(s)+1)[::-1]:
        d = nChunks(s, step=index)
        parts = dict(Counter(d))
        patterns.update({elem: parts[elem] for elem in parts.keys() if parts[elem] > 1})
    return patterns

def nChunks(iterable, start=0, step=1):
    return [iterable[i:i+step] for i in range(start, len(iterable), step)]
I have a string data with about 2500 random letters (in a random order), with two copies of a string inserted at random points. Say this string is 'TEST'. data.count('TEST') returns 2, but patternFind(data)['TEST'] gives me a KeyError, so my program does not detect the two inserted strings.
What have I done wrong? Thanks!
Edit: Here is how I create test instances:
from random import randint, choice
from string import ascii_uppercase as uppercase  # imports added for completeness; string.uppercase on Python 2

def createNewTest():
    n = randint(500, 2500)
    x, y = randint(500, n), randint(500, n)
    s = ''
    for i in range(n):
        s += choice(uppercase)
        if i == x or i == y: s += "TEST"
    return s
Using Regular Expressions
Apart from the count() method you described, regex is an obvious alternative
import re
needle = r'TEST'
haystack = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklagh'
pattern = re.compile(needle)
print len(re.findall(pattern, haystack))
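As a side note (not in the original answer), if overlapping occurrences ever matter, a zero-width lookahead counts them as well:
# each position where the lookahead matches yields one (empty) result
print(len(re.findall(r'(?=TEST)', haystack)))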
Short Cut
If you need to build a dictionary of substrings, you may be able to get away with only a subset of those strings. If you know the needle you are looking for in the data, then you only need the dictionary of substrings of data that have the same length as the needle. This is very fast.
from collections import Counter

needle = "TEST"

def gen_sub(s, len_chunk):
    for start in range(0, len(s)-len_chunk+1):
        yield s[start:start+len_chunk]

data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
parts = Counter([sub for sub in gen_sub(data, len(needle))])
print parts[needle]
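Equivalently (not part of the original answer), a known needle can be counted directly with a generator expression, which also catches overlapping occurrences:
count = sum(data[i:i+len(needle)] == needle for i in range(len(data) - len(needle) + 1))
print(count)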
Brute Force: building dictionary of all substrings
If you need to have a count of all possible substrings, this works but it is very slow:
from collections import Counter

def gen_sub(s):
    for start in range(0, len(s)):
        for end in range(start+1, len(s)+1):
            yield s[start:end]

data = 'khjkzahklahjTESTkahklaghTESTjklajhkhz'
parts = Counter([sub for sub in gen_sub(data)])
print parts['TEST']
Substring generator adapted from this: https://stackoverflow.com/a/8305463/1290420
While jurgenreza has explained why your program didn't work, that solution is still quite slow. If you only examine substrings s for which you know that s[:-1] repeats, you get a much faster solution (typically a hundred times faster, or more):
from collections import defaultdict

def pfind(prefix, sequences):
    collector = defaultdict(list)
    for sequence in sequences:
        collector[sequence[0]].append(sequence)
    for item, matching_sequences in collector.items():
        if len(matching_sequences) >= 2:
            new_prefix = prefix + item
            yield (new_prefix, len(matching_sequences))
            for r in pfind(new_prefix, [sequence[1:] for sequence in matching_sequences]):
                yield r

def find_repeated_substrings(s):
    s0 = s + " "
    return pfind("", [s0[i:] for i in range(len(s))])
If you want a dict, you call it like this:
result = dict(find_repeated_substrings(s))
On my machine, for a run with 2247 elements, it took 0.02 sec, while the original (corrected) solution took 12.72 sec.
(Note that this is a rather naive implementation; using indexes instead of substrings should be even faster.)
Edit: The following variant works with other sequence types (not only strings). Also, it doesn't need a sentinel element.
from collections import defaultdict

def pfind(s, length, ends):
    collector = defaultdict(list)
    if ends[-1] >= len(s):
        del ends[-1]
    for end in ends:
        if end < len(s):
            collector[s[end]].append(end)
    for key, matching_ends in collector.items():
        if len(matching_ends) >= 2:
            end = matching_ends[0]
            yield (s[end - length: end + 1], len(matching_ends))
            for r in pfind(s, length + 1, [end + 1 for end in matching_ends if end < len(s)]):
                yield r

def find_repeated_substrings(s):
    return pfind(s, 0, list(range(len(s))))
This still has the problem that very long substrings will exceed recursion depth. You might want to catch the exception.
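One hypothetical way to guard the call, assuming Python 3 (before 3.5 this surfaces as a RuntimeError); raising the limit is a workaround rather than a fix:
import sys

# the recursion depth is roughly the length of the longest repeated substring
sys.setrecursionlimit(max(sys.getrecursionlimit(), 2 * len(s)))
try:
    result = dict(find_repeated_substrings(s))
except RecursionError:
    result = {}   # extremely repetitive inputs would need an iterative rewrite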
The problem is in your nChunks function. It does not give you all the chunks that are necessary.
Let's consider a test string:
s='1test2345test'
For the chunks of size 4 your nChunks function gives this output:
>>> nChunks(s, step=4)
['1tes', 't234', '5tes', 't']
But what you really want is:
>>> def nChunks(iterable, start=0, step=1):
...     return [iterable[i:i+step] for i in range(len(iterable)-step+1)]
>>> nChunks(s, step=4)
['1tes', 'test', 'est2', 'st23', 't234', '2345', '345t', '45te', '5tes', 'test']
You can see that this way there are two 'test' chunks and your patternFind(s) will work like a charm:
>>> patternFind(s)
{'tes': 2, 'st': 2, 'te': 2, 'e': 2, 't': 4, 'es': 2, 'est': 2, 'test': 2, 's': 2}
Here is a solution that uses a recursive wrapper around string.find() to search for all the occurrences of a substring in a main string.
The collectallchuncks() function returns a defaultdict with all the substrings as keys and, for each substring, a list of all the indexes where it is found in the main string.
import collections

# Minimum substring size, may be 1
MINSIZE = 3

# Recursive wrapper
def recfind(p, data, pos, acc):
    res = data.find(p, pos)
    if res == -1:
        return acc
    else:
        acc.append(res)
        return recfind(p, data, res+1, acc)

def collectallchuncks(data):
    res = collections.defaultdict(str)
    size = len(data)
    for base in xrange(size):
        for seg in xrange(MINSIZE, size-base+1):
            chunk = data[base:base+seg]
            if data.count(chunk) > 1:
                res[chunk] = recfind(chunk, data, 0, [])
    return res

if __name__ == "__main__":
    data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
    allchuncks = collectallchuncks(data)
    print 'TEST', allchuncks['TEST']
    print 'hklag', allchuncks['hklag']
EDIT: If you just need the number of occurrences of each substring in the main string, you can easily obtain it by getting rid of the recursive function:
import collections

MINSIZE = 3

def collectallchuncks2(data):
    res = collections.defaultdict(str)
    size = len(data)
    for base in xrange(size):
        for seg in xrange(MINSIZE, size-base+1):
            chunk = data[base:base+seg]
            cnt = data.count(chunk)
            if cnt > 1:
                res[chunk] = cnt
    return res

if __name__ == "__main__":
    data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
    allchuncks = collectallchuncks2(data)
    print 'TEST', allchuncks['TEST']
    print 'hklag', allchuncks['hklag']
