I am trying to find all occurrences of sub-strings (of all lengths) in a main string. My function takes a string and returns a dictionary of every sub-string that occurs more than once, mapped to the number of times it occurs (format of the dictionary: {substring: # of occurrences, ...}). I am using collections.Counter to help me with it.
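For example (my own illustration of the format described above), for the input 'abab' the intended result would be {'a': 2, 'b': 2, 'ab': 2}.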
Here is my function:
from collections import Counter

def patternFind(s):
    patterns = {}
    for index in range(1, len(s)+1)[::-1]:
        d = nChunks(s, step=index)
        parts = dict(Counter(d))
        patterns.update({elem: parts[elem] for elem in parts.keys() if parts[elem] > 1})
    return patterns

def nChunks(iterable, start=0, step=1):
    return [iterable[i:i+step] for i in range(start, len(iterable), step)]
I have a string, data, with about 2500 random letters (in a random order). However, two copies of a known string are inserted into it at random points. Say this string is 'TEST'. data.count('TEST') returns 2, yet patternFind(data)['TEST'] gives me a KeyError. Therefore, my program does not detect the two occurrences.
What have I done wrong? Thanks!
Edit: my method of creating test instances:
from random import randint, choice
from string import ascii_uppercase as uppercase  # imports added here; the original snippet omitted them

def createNewTest():
    n = randint(500, 2500)
    x, y = randint(500, n), randint(500, n)
    s = ''
    for i in range(n):
        s += choice(uppercase)
        if i == x or i == y: s += "TEST"
    return s
Using Regular Expressions
Apart from the count() method you described, a regex is an obvious alternative:
import re

needle = r'TEST'
haystack = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklagh'
pattern = re.compile(needle)
print(len(re.findall(pattern, haystack)))
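Note that re.findall counts non-overlapping matches, which is fine for 'TEST' but undercounts needles that can overlap themselves. A lookahead variant of the same idea (my addition, not in the original answer) counts overlapping occurrences too:

# zero-width lookahead: one empty match per starting position of the needle
print(len(re.findall(r'(?=' + needle + ')', haystack)))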
Short Cut
If you need to build a dictionary of substrings, possibly you can do this with only a subset of those strings. Assuming you know the needle you are looking for in the data, you only need the dictionary of substrings of data that are the same length as the needle. This is very fast.
from collections import Counter

needle = "TEST"

def gen_sub(s, len_chunk):
    for start in range(0, len(s) - len_chunk + 1):
        yield s[start:start+len_chunk]

data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
parts = Counter(gen_sub(data, len(needle)))
print(parts[needle])
Brute Force: building dictionary of all substrings
If you need to have a count of all possible substrings, this works but it is very slow:
from collections import Counter

def gen_sub(s):
    for start in range(0, len(s)):
        for end in range(start+1, len(s)+1):
            yield s[start:end]

data = 'khjkzahklahjTESTkahklaghTESTjklajhkhz'
parts = Counter(gen_sub(data))
print(parts['TEST'])
Substring generator adapted from this: https://stackoverflow.com/a/8305463/1290420
While jurgenreza has explained why your program didn't work, the solution is still quite slow. If you only examine substrings s for which you know that s[:-1] repeats, you get a much faster solution (typically a hundred times faster or more):
from collections import defaultdict

def pfind(prefix, sequences):
    collector = defaultdict(list)
    for sequence in sequences:
        collector[sequence[0]].append(sequence)
    for item, matching_sequences in collector.items():
        if len(matching_sequences) >= 2:
            new_prefix = prefix + item
            yield (new_prefix, len(matching_sequences))
            for r in pfind(new_prefix, [sequence[1:] for sequence in matching_sequences]):
                yield r

def find_repeated_substrings(s):
    s0 = s + " "
    return pfind("", [s0[i:] for i in range(len(s))])
If you want a dict, you call it like this:
result = dict(find_repeated_substrings(s))
On my machine, for a run with 2247 elements, it took 0.02 sec, while the original (corrected) solution took 12.72 sec.
(Note that this is a rather naive implementation; using indexes instead of substrings should be even faster.)
Edit: The following variant works with other sequence types (not only strings). Also, it doesn't need a sentinel element.
from collections import defaultdict

def pfind(s, length, ends):
    collector = defaultdict(list)
    if ends[-1] >= len(s):
        del ends[-1]
    for end in ends:
        if end < len(s):
            collector[s[end]].append(end)
    for key, matching_ends in collector.items():
        if len(matching_ends) >= 2:
            end = matching_ends[0]
            yield (s[end - length: end + 1], len(matching_ends))
            for r in pfind(s, length + 1, [end + 1 for end in matching_ends if end < len(s)]):
                yield r

def find_repeated_substrings(s):
    return pfind(s, 0, list(range(len(s))))
This still has the problem that very long substrings will exceed recursion depth. You might want to catch the exception.
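For example (a sketch of my own, not part of the original answer), you could catch the error and retry with a higher limit:

import sys

try:
    result = dict(find_repeated_substrings(s))
except RecursionError:
    # very long repeats blew the default limit; raise it and retry (use with care)
    sys.setrecursionlimit(10000)
    result = dict(find_repeated_substrings(s))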
The problem is in your nChunks function. It does not give you all the chunks that are necessary.
Let's consider a test string:
s='1test2345test'
For the chunks of size 4 your nChunks function gives this output:
>>>nChunks(s, step=4)
['1tes', 't234', '5tes', 't']
But what you really want is:
>>> def nChunks(iterable, start=0, step=1):
...     return [iterable[i:i+step] for i in range(start, len(iterable)-step+1)]
>>> nChunks(s, step=4)
['1tes', 'test', 'est2', 'st23', 't234', '2345', '345t', '45te', '5tes', 'test']
You can see that this way there are two 'test' chunks and your patternFind(s) will work like a charm:
>>> patternFind(s)
{'tes': 2, 'st': 2, 'te': 2, 'e': 2, 't': 4, 'es': 2, 'est': 2, 'test': 2, 's': 2}
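As a quick sanity check (my own addition, assuming data comes from the createNewTest function in the question):

data = createNewTest()
print(data.count('TEST'))         # usually 2 (the two inserted copies)
print(patternFind(data)['TEST'])  # now succeeds instead of raising KeyError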
Here you can find a solution that uses a recursive wrapper around string.find() to search all the occurrences of a substring in a main string.
The collectallchuncks() function returns a defaultdict with all the substrings as keys and, for each substring, a list of all the indexes where the substring is found in the main string.
import collections

# Minimum substring size, may be 1
MINSIZE = 3

# Recursive wrapper
def recfind(p, data, pos, acc):
    res = data.find(p, pos)
    if res == -1:
        return acc
    else:
        acc.append(res)
        return recfind(p, data, res+1, acc)

def collectallchuncks(data):
    res = collections.defaultdict(str)
    size = len(data)
    for base in range(size):
        for seg in range(MINSIZE, size-base+1):
            chunk = data[base:base+seg]
            if data.count(chunk) > 1:
                res[chunk] = recfind(chunk, data, 0, [])
    return res

if __name__ == "__main__":
    data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
    allchuncks = collectallchuncks(data)
    print('TEST', allchuncks['TEST'])
    print('hklag', allchuncks['hklag'])
EDIT: If you just need the number of occurrences of each substring in the main string, you can easily obtain it by getting rid of the recursive function:
import collections

MINSIZE = 3

def collectallchuncks2(data):
    res = collections.defaultdict(str)
    size = len(data)
    for base in range(size):
        for seg in range(MINSIZE, size-base+1):
            chunk = data[base:base+seg]
            cnt = data.count(chunk)
            if cnt > 1:
                res[chunk] = cnt
    return res

if __name__ == "__main__":
    data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
    allchuncks = collectallchuncks2(data)
    print('TEST', allchuncks['TEST'])
    print('hklag', allchuncks['hklag'])
Related
I'm currently working on a bioinformatics and modelling project where I need to do some pattern matching. Let's say I have a DNA fragment as follows, 'atggcgtatagagc', and I split that fragment into micro-sequences of 8 nucleotides, so that I have:
'atggcgta' 'tggcgtat' 'ggcgtata' 'gcgtatag' 'cgtataga' 'gtatagag' 'tatagagc'
And for each of these fragments I want to search, in a whole genome and per chromosome, the number of times they appear and the positions (starting positions) of the matches.
Here is what my code looks like:
You can download the genome FASTA file here: drive to the fasta file
import re
from Bio.SeqIO.FastaIO import FastaIterator
from Bio.Seq import Seq

def reverse_complement(sequence: str) -> str:
    my_sequence = Seq(sequence)
    return str(my_sequence.reverse_complement())

# you will need to unzip the file and change the path below according to your working directory
path = '../data/Genome_S288c.fa'
genome = open(path, "r")
chr_sequences = {}
for record in FastaIterator(genome):
    chr_id = record.id
    seq = str(record.seq).lower()
    rc_seq = reverse_complement(seq)
    chr_sequences[chr_id] = {'5to3': seq, '3to5': rc_seq}
genome.close()

sequences = 'ATGACTAACGAAAAGGTCTGGATAGAGAAGTTGGATAATCCAACTCTTTCAGTGTTACCACATGACTTTTTACGCCCACAATCTTTAT'.lower()
micro_size = 8
micro_sequences = []
start = micro_size - 1
for i in range(start, len(sequences), 1):
    current_micro_seq = sequences[i - start:i + 1]
    micro_sequences.append(current_micro_seq)

genome_count = 0
chr_count = {}
chr_locations = {}
micro_fragment_stats = {}
for ii_micro, micro_seq in enumerate(micro_sequences):
    for chr_idx in list(chr_sequences.keys()):
        chr_counter = 0
        seq = chr_sequences[chr_idx]['5to3']
        pos = [m.start() for m in re.finditer(pattern=r'(?=(' + micro_seq + '))', string=seq)]
        rc_seq = chr_sequences[chr_idx]['3to5']
        rc_pos = [m.start() for m in re.finditer(pattern=r'(?=(' + micro_seq + '))', string=rc_seq)]
        chr_locations[chr_idx] = {'5to3': pos, '3to5': rc_pos}
        chr_counter += len(pos) + len(rc_pos)
        chr_count[chr_idx] = chr_counter
        genome_count += chr_counter
    micro_fragment_stats[ii_micro] = {'occurrences genome': genome_count,
                                      'occurrences chromosomes': chr_count,
                                      'locations chromosomes': chr_locations}
Actually my fragment is something like 2000 bp long, so it took about 1 hour to compute all the micro-sequences.
By the way, I use r'(?=(' + self.sequence + '))' so that occurrences where the pattern overlaps itself in the sequence are not missed, for instance:
pattern = 'aaggaaaaa'
string = 'aaggaaaaaggaaaaa'
expected output: (0, 7)
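A minimal demonstration of that lookahead on the example above (my own snippet):

import re

pattern = 'aaggaaaaa'
string = 'aaggaaaaaggaaaaa'
# each zero-width match marks a starting position, including the overlapping one
print([m.start() for m in re.finditer(r'(?=(' + pattern + '))', string)])  # [0, 7]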
I am looking for a more efficient regex method that I can use for my case (in python if possible).
Thanks in advance
I would not recommend using regex for repetitive simple pattern matching. Outright comparison is expected to perform better. I did some basic testing and came up with the demo below.
import time
import re
import random

def compare(r1, r2, microseq_len, test_condition=1):
    # condition 1: make microseqs/indexes from the longer sequence and search against the shorter
    # condition 2: use regex to find positions of each microseq in the reference sequence
    # condition 3: use regex, but only after verifying that the microseq occurs in the reference
    start_time = time.time()
    if test_condition == 1:
        r1, r2 = r2, r1
    # assemble dictionary containing microsequences and index positions
    microseq_di = {}
    for i in range(len(r1)-microseq_len):
        microseq = r1[i:i+microseq_len]
        if microseq not in microseq_di:
            microseq_di[microseq] = []
        microseq_di[microseq].append([i, i+microseq_len])
    # mark for deletion
    for microseq in microseq_di:
        # condition 2
        if test_condition == 2:
            microseq_di[microseq] = [m.start() for m in re.finditer(pattern=r'(?=('+microseq+'))', string=r2)]
        elif microseq not in r2:
            microseq_di[microseq] = []
        # condition 3
        elif test_condition == 3:
            microseq_di[microseq] = [m.start() for m in re.finditer(pattern=r'(?=('+microseq+'))', string=r2)]
    print(time.time() - start_time)  # run time
    # delete and return
    return {x: y for x, y in microseq_di.items() if y != []}
Input and Output:
r_short = "".join([random.choices(["A", "T", "G", "C"])[0] for x in range(500)])
r_long = "".join([random.choices(["A", "T", "G", "C"])[0] for x in range(100000)])
len(compare(r_short, r_long, 8, test_condition=1).keys())
0.19868111610412598
Out[1]: 400
len(compare(r_short, r_long, 8, test_condition=2).keys())
0.8831210136413574
Out[2]: 399
len(compare(r_short, r_long, 8, test_condition=3).keys())
0.7925639152526855
Out[3]: 399
Test condition 1 (microseqs from longer sequence) performed a lot better than the other two conditions using regex. Relative performance should improve with longer strings.
r_short = "".join([random.choices(["A", "T", "G", "C"])[0] for x in range(2000)])
r_long = "".join([random.choices(["A", "T", "G", "C"])[0] for x in range(1000000)])
len(compare(r_short, r_long, 8, test_condition=1).keys())
2.2517480850219727
Out[4]: 1970
len(compare(r_short, r_long, 8, test_condition=2).keys())
35.65084385871887
Out[5]: 1969
len(compare(r_short, r_long, 8, test_condition=3).keys())
34.994577169418335
Out[6]: 1969
Note that condition 1 is not fully accommodating to your use-case since it doesn't exclude overlapping microseqs.
I've been playing with this question for a while and I ended up with some ideas.
The algorithm is mainly divided into two parts: k-mer generation and k-mer searching in the reference.
For the k-mer generation part, I can see that your algorithm is quick, but it generates duplicates (that you have to filter afterwards when generating the dictionary). My approach is to generate a deduplicated list directly. In my sample code I also modified your method to perform the deduplication at the same time, so you can avoid doing it later and, more importantly, so that it allows a fair time comparison with my approach.
You will see that using a set to keep the k-mers gives us deduplication for free, and it is faster than using a list, which has to be traversed for every membership check.
For the k-mer search in the reference, given that you are doing exact searches, using a regex is overkill. It is far cheaper to do a standard search. In this code, I used the methods provided by the Seq class: find and index. The idea is to find the first occurrence starting from the beginning, and then repeat the search starting at the next position after the last index found (if you want to avoid overlaps, start after the last position found plus the k-mer size instead).
The resulting code follows:
import re
from pathlib import Path
from timeit import timeit

from Bio.Seq import Seq
from Bio.SeqIO.FastaIO import FastaIterator

def reverse_complement(sequence: Seq) -> Seq:
    return sequence.reverse_complement()

def generate_kmers(sequence: Seq, kmer_size: int) -> set[Seq]:
    return {
        Seq(sequence[i : i + kmer_size]) for i in range(len(sequence) - kmer_size + 1)
    }

def generate_kmers_original(sequence: Seq, kmer_size: int) -> list[Seq]:
    kmers: list[Seq] = []
    start = kmer_size - 1
    for i in range(start, len(sequence), 1):
        current_micro_seq = Seq(sequence[i - start : i + 1])
        # We had to add this check to avoid the duplication of k-mers
        if current_micro_seq not in kmers:
            kmers.append(current_micro_seq)
    return kmers

def load_fasta(fasta_file: str) -> dict[str, dict[str, Seq]]:
    fasta_dict: dict[str, dict[str, Seq]] = {}
    with Path(fasta_file).open("r", encoding="UTF-8") as genome:
        for record in FastaIterator(genome):
            seq = record.seq.lower()
            fasta_dict[record.id] = {"5to3": seq, "3to5": reverse_complement(seq)}
    return fasta_dict

if __name__ == "__main__":
    # Load the big fasta file
    chr_sequences = load_fasta(
        ".../Saccharomyces_cerevisiae/S288c_R64/fasta/scerevisiae.S288c_R64.fasta"
    )

    # Generate the micro-sequences
    micro_size = 8
    sequences = Seq(
        "ATGACTAACGAAAAGGTCTGGATAGAGAAGTTGGATAATCCAACTCTTTCAGTGTTACCACATGACTTTTTACGCCCACAATCTTTAT"
    ).lower()
    micro_sequences = generate_kmers(sequences, micro_size)

    # k-mer generation benchmark
    test_size = 1000
    kmer_generation_time = timeit(
        "generate_kmers(sequences, micro_size)", number=test_size, globals=globals()
    )
    kmer_generation_original_time = timeit(
        "generate_kmers_original(sequences, micro_size)",
        number=test_size,
        globals=globals(),
    )
    print(f"New k-mer generation time : {kmer_generation_time}")
    print(f"Original k-mer generation time: {kmer_generation_original_time}")
    print(f"There are {len(micro_sequences)} k-mers")

    # Search for the kmers in the reference
    def find_kmers_original(sequence: Seq, kmer: Seq) -> list[int]:
        positions = [
            m.start()
            for m in re.finditer(
                pattern=r"(?=(" + str(kmer) + "))", string=str(sequence)
            )
        ]
        return positions

    def find_kmers_find(sequence: Seq, kmer: Seq) -> list[int]:
        current = 0
        positions: list[int] = []
        while current < len(sequence):
            index = sequence.find(kmer, current)
            if index == -1:
                break
            positions.append(index)
            current = index + 1
        return positions

    def find_kmers_index(sequence: Seq, kmer: Seq) -> list[int]:
        positions: list[int] = []
        current = 0
        try:
            while True:
                index = sequence.index(kmer, current)
                positions.append(index)
                current = index + 1
        except ValueError:
            # Exception thrown when the kmer is not found
            # This is our exit condition
            pass
        return positions

    # k-mer search benchmark
    test_size = 1000
    haystack = next(iter(chr_sequences.values()))["5to3"]
    needle = next(iter(micro_sequences))
    search_original_time = timeit(
        "find_kmers_original(haystack, needle)",
        number=test_size,
        globals=globals(),
    )
    search_find_time = timeit(
        "find_kmers_find(haystack, needle)",
        number=test_size,
        globals=globals(),
    )
    search_index_time = timeit(
        "find_kmers_index(haystack, needle)",
        number=test_size,
        globals=globals(),
    )
    print(f"Search with original time: {search_original_time}")
    print(f"Search with find time    : {search_find_time}")
    print(f"Search with index time   : {search_index_time}")

    # Actual calculus
    genome_count = 0
    chr_count: dict[str, int] = {}
    chr_locations: dict[str, dict[str, list[int]]] = {}
    micro_fragment_stats: dict[
        int, dict[str, int | dict[str, int] | dict[str, dict[str, list[int]]]]
    ] = {}
    for ii_micro, micro_seq in enumerate(micro_sequences):
        for chr_counter, (chromosome, contents) in enumerate(chr_sequences.items()):
            pos = find_kmers_find(contents["5to3"], micro_seq)
            rc_pos = find_kmers_find(contents["3to5"], micro_seq)
            chr_locations[chromosome] = {"5to3": pos, "3to5": rc_pos}
            chr_counter += len(pos) + len(rc_pos)
            chr_count[chromosome] = chr_counter
            genome_count += chr_counter
        micro_fragment_stats[ii_micro] = {
            "occurrences genome": genome_count,
            "occurrences chromosomes": chr_count,
            "locations chromosomes": chr_locations,
        }
The output of this toy example is:
New k-mer generation time : 0.6696164240129292
Original k-mer generation time: 5.967410315992311
There are 81 k-mers
Search with original time: 3.1360475399997085
Search with find time : 0.5738343889825046
Search with index time : 0.5662875371053815
You can see that the k-mer generation is 9x faster and the search without the regex is around 5.5x faster.
In general, you will do better by taking advantage of comprehensions and built-in data types (like the sets used here). Using the simpler approach also helps with performance: regexes are powerful, but they need their time; if they are not required, it is better to avoid them, especially in loops, where every small performance difference is amplified.
Besides all of this benchmarking, you can also try to add the approach introduced by @Ghothi, where the long and short sequences are exchanged. Maybe it could lead to some further improvement.
As a side note, Seq.find and Seq.index seem to offer the same performance, but I find the Seq.index version cleaner and more elegant: you don't need a sentinel value to test against, and the intent of the code is clearer. Also, performance is slightly better, as it avoids a comparison in the loop, but this is a very minor improvement.
Stackoverflow, I am once again asking for your help.
I'm aware there are other threads about this but I'll explain what makes my assignment different.
Basically, my function gets a string of 0s and 1s and returns all the possible orderings of its characters. For example, for "0111" we get "0111", "1011", "1101", "1110".
Here's my code:
def permutations(string):
    if len(string) == 1:
        return [string]
    lst = []
    for j in range(len(string)):
        remaining_elements = ''.join([string[i] for i in range(len(string)) if i != j])
        mini_perm = permutations(remaining_elements)
        for perm in mini_perm:
            new_str = string[j] + perm
            if new_str not in lst:
                lst.append(new_str)
    return lst
The problem is that when I run a string like "000000000011" it takes a very long time to process. There should be a more efficient way to do it, because there are only two distinct digits, so I shouldn't be permuting individual indexes?
Please help me if you can figure out a more efficient way to do this.
(I am allowed to use loops just have to use recursion as well!)
Here is an example of creating permutations with recursion that is more efficient:
def permute(string):
    string = list(string)
    n = len(string)
    # Base conditions
    # If length is 0 or 1, there is only 1 permutation
    if n in [0, 1]:
        return [string]
    # If length is 2, then there are only two permutations
    # Example: [1,2] and [2,1]
    if n == 2:
        return [string, string[::-1]]
    res = []
    # For every number in array, choose 1 number and permute the remaining
    # by calling permute recursively
    for i in range(n):
        permutations = permute(string[:i] + string[i+1:])
        for p in permutations:
            res.append([''.join(str(n) for n in [string[i]] + p)])
    return res
This should also work for permute('000000000011'), though note that it does not deduplicate repeated arrangements, so strings with many repeated characters still produce a very large result. Hope it helps!
You can also use collections.Counter with a recursive generator function:
from collections import Counter

def permute(d):
    counts = Counter(d)
    def get_permuations(c, s = []):
        if len(s) == sum(counts.values()):
            yield ''.join(s)
        else:
            for a, b in c.items():
                for i in range(1, b+1):
                    yield from get_permuations({**c, a: b - i}, s + [a]*i)
    return list(set(get_permuations(counts)))
print(permute("0111"))
print(permute("000000000011"))
Output:
['0111', '1110', '1101', '1011']
['010000100000', '100000000001', '010000001000', '000000100001', '011000000000', '100000000010', '001001000000', '000000011000', '100000001000', '100000100000', '100001000000', '001000100000', '100010000000', '000000001100', '000100000100', '010010000000', '000000000011', '000000100010', '101000000000', '110000000000', '100000010000', '000100001000', '000001001000', '000000000101', '000000100100', '010000000001', '001000000100', '001000000010', '000110000000', '000011000000', '000001100000', '000000110000', '001000000001', '000010001000', '000100100000', '000001000001', '000010000001', '001100000000', '000100000001', '001000001000', '010000000100', '010000010000', '000000010001', '001000010000', '010001000000', '100000000100', '100100000000', '000000001001', '010100000000', '000010100000', '010000000010', '000000001010', '000010000100', '001010000000', '000000010010', '000001000010', '000100000010', '000101000000', '000000010100', '000100010000', '000000000110', '000001000100', '000010010000', '000000101000', '000001010000', '000010000010']
Posting an answer someone gave me. Thanks for your responses!
def permutations(zeroes, ones, lst, perm):
    if zeroes == 0 and ones == 0:
        lst.append(perm)
        return
    elif zeroes < 0 or ones < 0:
        return
    permutations(zeroes - 1, ones, lst, perm + '0')
    permutations(zeroes, ones - 1, lst, perm + '1')
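It has to be seeded with the character counts, an empty list and an empty prefix; a usage sketch (my own, not from the original poster):

result = []
permutations(2, 2, result, '')  # all orderings of two 0s and two 1s
print(result)  # ['0011', '0101', '0110', '1001', '1010', '1100']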
I just started using functions and I'm trying to build one that finds a repeated substring of length at least k and returns the results in a tuple containing a dict.
The keys need to be the substrings and the values how many times each one is repeated; the length of the substring is then added to the tuple.
I just started, but I didn't really know how to continue. This is what I tried to do:
def longest_repeat(string, K):
    longest = {},
    if isinstance(K, int) and isinstance(string, str):
        for sub_str in string:
            if sub_str >= K:
                longest[0][sub_seq] = DNA_seq_slic = []

a = 0
b = k
for nuc in range(len(DNA_seq)-k+1):
    DNA_seq_slic.append(DNA_seq[a:b])
    a += 1
    b += 1

import collections
for sub_seq in DNA_seq_slic:
    repeated = [item for item, count in collections.Counter(DNA_seq_slic).items() if count > 1]
    repeated_subseq_dict = dict(zip(repeated, [0 for x in range(0, len(repeated))]))
    for key in repeated_subseq_dict:
        repeated_subseq_dict[key] = DNA_seq_slic.count(key)
return(repeated_subseq_dict)
I'm sorry if it's a little bit messed up; I didn't really have a direction, and I tried to use another function I built to solve this, but it didn't really work. I can clarify more if needed.
The output should be something like this:
longest_repeated("ATAATACATAATA", 5)
output: longest = {ATAATA: 2} , 6
Really appreciate any kind of help! Thanks!
You can try the re module. In the pattern below, (.{k,}) greedily captures a run of at least k characters, and the lookahead (?=.*\1) requires that same run to occur again later in the string; max(..., key=len) then picks the longest such repeat:
import re

def longest_repeated(s, k):
    m = re.findall(f"(.{{{k},}})(?=.*\\1)", s)
    if m:
        mx = max(m, key=len)
        return {mx: s.count(mx)}, len(mx)
Some tests:
print(longest_repeated("ATAATACATAATA", 5))
({'ATAATA': 2}, 6)
print(longest_repeated("XXXXXATAATACATAATAXXXXX", 5))
({'ATAATA': 2}, 6)
Let's say I have the string abcixigea and I want to replace the first 'i', the 'e' and the second 'a' with '1', '3' and '4' respectively, getting all the combinations from those "progressive" replacements.
So, I need to get:
abc1xigea
abcixig3a
abcixig34
abc1xige4
...and so on.
I tried itertools.product following this question (python string replacement, all possible combinations #2), but the result I get is not exactly what I need, and I can see why.
However, I'm stuck trying to use combinations while keeping parts of the string fixed (changing just some chars as explained above).
from itertools import product

s = "abc{}xig{}{}"
for combo in product(("i", 1), ("e", 3), ("a", 4)):
    print(s.format(*combo))
produces
abcixigea
abcixige4
abcixig3a
abcixig34
abc1xigea
abc1xige4
abc1xig3a
abc1xig34
Edit: in a more general way, you want something like:
from itertools import product

def find_nth(s, char, n):
    """
    Return the offset of the nth occurrence of char in s,
    or -1 on failure
    """
    assert len(char) == 1
    offs = -1
    for _ in range(n):
        offs = s.find(char, offs + 1)
        if offs == -1:
            break
    return offs

def gen_replacements(base_string, *replacement_values):
    """
    Generate all string combinations from base_string
    by replacing some characters according to replacement_values
    Each replacement_value is a tuple of
    (original_char, occurrence, replacement_char)
    """
    assert len(replacement_values) > 0
    # find location of each character to be replaced
    replacement_offsets = [
        (find_nth(base_string, orig, occ), orig, occ, (orig, repl))
        for orig, occ, repl in replacement_values
    ]
    # put them in ascending order
    replacement_offsets.sort()
    # make sure all replacements are actually possible
    if replacement_offsets[0][0] == -1:
        raise ValueError("'{}' occurs less than {} times".format(replacement_offsets[0][1], replacement_offsets[0][2]))
    # create format string and argument list
    args = []
    for i, (offs, _, _, arg) in enumerate(replacement_offsets):
        # we are replacing one char with two, so we have to
        # increase the offset of each replacement by
        # the number of replacements already made
        base_string = base_string[:offs + i] + "{}" + base_string[offs + i + 1:]
        args.append(arg)
    # ... and we feed that into the original code from above:
    for combo in product(*args):
        yield base_string.format(*combo)

def main():
    s = "abcixigea"
    for result in gen_replacements(s, ("i", 1, "1"), ("e", 1, "3"), ("a", 2, "4")):
        print(result)

if __name__ == "__main__":
    main()
which produces exactly the same output as above.
Fancy title :)
I have a file that contains the following:
>sequence_40
ABCDABDCABCDBACDBACDBACDBACDABDCDC
ACDCCDCABDCADCADBCACBDCABD
>sequence_41
DCBACDBACDADCDCDCABCDCACBDCBDACBDC
BCDBABABBABACDCDBCACDBACDBACDBACDC
BCDB
...
Then, I have a function that returns a dictionary (called dict) with the sequence names as keys and the strings (combined on one line) as values. The sequence numbers range from 40 to 59.
I want to take a dictionary of sequences and return the longest common sub-sequence found in ALL the sequences. I managed to find some help here on Stack Overflow and wrote code that only compares the LAST TWO strings in that dictionary, not all of them :).
This is the code:
def longest_common_sequence(s1, s2):
    m = [[0] * (1 + len(s2)) for i in range(1 + len(s1))]
    longest, x_longest = 0, 0
    for x in range(1, 1 + len(s1)):
        for y in range(1, 1 + len(s2)):
            if s1[x - 1] == s2[y - 1]:
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    return s1[x_longest - longest: x_longest]

for i in range(40, 59):
    s1 = str(dictionar['sequence_' + str(i)])
    s2 = str(dictionar['sequence_' + str(i + 1)])
    longest_common_sequence(s1, s2)
How can I modify it to get the common subsequence among ALL sequences in dictionary? Thanks!
EDIT: As @lmcarreiro pointed out, there is a relevant difference between substrings (or subarrays or sublists) and subsequences. To my understanding we are all talking about substrings here, so I will use this term in my answer.
Guillaume's answer can be improved:
def eachPossibleSubstring(string):
    for size in range(len(string) + 1, 0, -1):
        for start in range(len(string) - size + 1):
            yield string[start:start+size]

def findLongestCommonSubstring(strings):
    shortestString = min(strings, key=len)
    for substring in eachPossibleSubstring(shortestString):
        if all(substring in string
               for string in strings if string != shortestString):
            return substring

print(findLongestCommonSubstring([
    'ABCDABDCABCDBACDBACDBACDBACDABDCDCACDCCDCABDCADCADBCACBDCABD',
    'DCBACDBACDADCDCDCABCDCACBDCBDACBDCBCDBABABBABACDCDBCACDBACDBACDBACDCBCDB',
]))
This prints:
ACDBACDBACDBACD
This is faster because it returns the first substring found, searching from longest to shortest.
The basic idea is this: Take each possible substring of the shortest of your strings (in the order from the longest to the shortest) and see if this substring can be found in all other strings. If so, return it, otherwise try the next substring.
You need to understand generators. Try e.g. this:
for substring in eachPossibleSubstring('abcd'):
    print(substring)
or
print(list(eachPossibleSubstring('abcd')))
I'd start by defining a function to return all possible subsequences of a given sequence:
from itertools import combinations_with_replacement

def subsequences(sequence):
    "returns all possible subsequences of a given sequence"
    # the range extends one past the end so that slices
    # ending at the last character are generated too
    for start, stop in combinations_with_replacement(range(len(sequence) + 1), 2):
        if start < stop:
            yield sequence[start:stop]
Then I'd make another method to check if a given subsequence is present in all given sequences:
def is_common_subsequence(sub, sequences):
    "returns True if <sub> is a common subsequence in all <sequences>"
    return all(sub in sequence for sequence in sequences)
Then, using the two methods above, it is pretty easy to get all common subsequences in a given set of sequences:
def common_sequences(sequences):
    "return all subsequences common in sequences"
    shortest_seq = min(sequences, key=len)
    return set(subsequence for subsequence in subsequences(shortest_seq)
               if is_common_subsequence(subsequence, sequences))
... and extracting the longest one:
def longuest_common_subsequence(sequences):
    "returns the longest subsequence in sequences"
    return max(common_sequences(sequences), key=len)
Result:
sequences = {
    41: 'ABCDEFGHIJKLMNOPQRSTUVWXYZ',
    42: '123ABCDEFGHIJKLMNOPQRSTUVW',
    43: '123456ABCDEFGHIJKLMNOPQRST'
}
sequences2 = {
    0: 'ABCDEFGHIJ',
    1: 'DHSABCDFKDDSA',
    2: 'SGABCEIDEFJRNF'
}

print(longuest_common_subsequence(sequences.values()))
>>> ABCDEFGHIJKLMNOPQRST

print(longuest_common_subsequence(sequences2.values()))
>>> ABC
Here is a possible approach. First, let's define a function that returns the longest common substring of two strings:
def longest_substring(s1, s2):
    t = [[0]*(1+len(s2)) for i in range(1+len(s1))]
    l, xl = 0, 0
    for x in range(1, 1+len(s1)):
        for y in range(1, 1+len(s2)):
            if s1[x-1] == s2[y-1]:
                t[x][y] = t[x-1][y-1] + 1
                if t[x][y] > l:
                    l = t[x][y]
                    xl = x
            else:
                t[x][y] = 0
    return s1[xl-l: xl]
Now I'll create a random dict of sequences for the example:
import random
import string

d = {i: ''.join(random.choice(string.ascii_uppercase) for _ in range(50)) for i in range(10)}
print(d)
{0: 'ASCUCEVJNIGWVMWMBBQQBZYBBNGQAJRYXACGFEIFWHMBCNYRGL', 1: 'HKUKZOJJUCRTSBLNZXCIBARLPNAPAABRBZEVGVILJAFCGWGQVV', 2: 'MMHCYPKECRJFEWTGYITMHZSNHAFEZVFYDAVILRYRKIDDBEFRVX', 3: 'DGBULRFJINFZEELDASRFBIRSADWMRAYMGCDAOJDKQIMXIRLTEI', 4: 'VDUFWZSXLRGOIMAHOAMZAIWDPTHDVDXUACRBASJMCUHREDORRH', 5: 'RFGAVHOWNKRZMYMSFSSNUGCKEWUNVETCDWJXSPBJHKSTPFNSJO', 6: 'HFMLMHCFSOEXBXWFAROIRGJNPRTKRWCEPLFOKGMXNUPCPWREWX', 7: 'CNPGSHGVIRLDXAADXUVWCTJCXUHQLALBUOJMXQBKXWHKGSJHEH', 8: 'UWDXXTRCFNCBUBEYGYTDWTPLNTRHYQWKTHPRVCBAWIMNGHULDC', 9: 'OOCJRXBZKJIGHZEJOOIKWKMQKIEQVPEDTFPJQAUQKJQVLOMGJB'}
Finally, we need to find the longest substring shared across the sequences. Note that taking the maximum over pairwise comparisons, as below, returns the longest substring common to at least one pair, which may not be common to all of them:
import itertools

max([longest_substring(i, j) for i, j in itertools.combinations(d.values(), 2)], key=len)
Output:
'VIL'