Python - partitioning a list of strings using an equivalence relation - python

I have a list of alphabetic strings [str1,str2,...] which I need to partition into equivalence classes using an equivalence relation R, where str1 R str2 (in relational notation) if str2 can be obtained from str1 by a sequence of valid one-letter changes, where 'valid' means it produces a valid alphabetic word, e.g. cat --> car is valid but cat --> 'cax is not. If the input list was ['cat','ace','car','zip','ape','pip'] then the code should return [['cat','car'],['ace','ape'],['zip','pip']].
I've got an initial working version which, however, produces some "classes" which contain duplicates.
I don't suppose there is any Python package which allows me to define such equivalence relations, but even otherwise what would be the best way of doing this?

Should work for different length strings. Obviously, ordering matters.
def is_one_letter_different(s1, s2):
if len(s1) != len(s2):
return False
diff_count = 0;
for char1, char2 in zip(s1, s2):
if char1 != char2:
diff_count += 1
return diff_count == 1
def group(candidates):
groups = []
for candidate in candidates:
for group in groups:
for word in group:
if is_one_letter_different(word, candidate):
group.append(candidate)
break
if candidate in group:
break
else:
groups.append([candidate])
return groups
print group(['bread','breed', 'bream', 'tread', 'treat', 'short', 'shorn', 'shirt', 'shore', 'store','eagle','mired', 'sired', 'hired'])
Output:
[['bread', 'breed', 'bream', 'tread', 'treat'], ['short', 'shorn', 'shirt', 'shore', 'store'], ['eagle'], ['mired', 'sired', 'hired']]
EDIT: Updated to follow additional testcases. I'm not sure of output correctness - please validate. And provide us good testcases next time.

I would do it something like this: construct an undirected graph where each word is a node, and each edge indicates that the relation holds between them. Then you can identify each disconnected "island" in the graph, each of which represents an equivalence class.
from collections import defaultdict
def exactly_one(iter):
count = 0
for x in iter:
if x:
count += 1
if count > 1:
break
return count == 1
def are_one_letter_apart(a,b):
if len(a) != len(b): return False
return exactly_one(a_char != b_char for a_char, b_char in zip(a,b))
def pairs(seq):
for i in range(len(seq)):
for j in range(i+1, len(seq)):
yield (seq[i], seq[j])
def search(graph, node):
seen = set()
to_visit = set()
to_visit.add(node)
while to_visit:
cur = to_visit.pop()
if cur in seen: continue
for neighbor in graph[cur]:
if neighbor not in seen:
to_visit.add(neighbor)
seen.add(cur)
return seen
def get_islands(graph):
seen = set()
islands = []
for item in graph.iterkeys():
if item in seen: continue
group = search(graph, item)
seen = seen | group
islands.append(group)
return islands
def create_classes(seq, f):
graph = defaultdict(list)
for a,b in pairs(seq):
if f(a,b):
graph[a].append(b)
graph[b].append(a)
#one last pass to pick up items with no relations to anything else
for item in seq:
if item not in graph:
graph[item].append(item)
return [list(group) for group in get_islands(graph)]
seq = ['cat','ace','car','zip','ape','pip']
print create_classes(seq, are_one_letter_apart)
Result:
[['ace', 'ape'], ['pip', 'zip'], ['car', 'cat']]

Related

Merging overlapping string sequences in a list

I am trying to figure out how to merge overlapping strings in a list together, for example for
['aacc','accb','ccbe']
I would get
['aaccbe']
This following code works for the example above, however it does not provide me with the desired result in the following case:
s = ['TGT','GTT','TTC','TCC','CCC','CCT','CCT','CTG','TGA','GAA','AAG','AGC','GCG','CGT','TGC','GCT','CTC','TCT','CTT','TTT','TTT','TTC','TCA','CAT','ATG','TGG','GGA','GAT','ATC','TCT','CTA','TAT','ATG','TGA','GAT','ATT','TTC']
a = s[0]
b = s[-1]
final_s = a[:a.index(b[0])]+b
print(final_s)
>>>TTC
My output is clearly not right, and I don't know why it doesn't work in this case. Note that I have already organized the list with the overlapping strings next to each other.
You can use a trie to storing the running substrings and more efficiently determine overlap. When the possibility of an overlap occurs (i.e for an input string, there exists a string in the trie with a letter that starts or ends the input string), a breadth-first search to find the largest possible overlap takes place, and then the remaining bits of string are added to the trie:
from collections import deque
#trie node (which stores a single letter) class definition
class Node:
def __init__(self, e, p = None):
self.e, self.p, self.c = e, p, []
def add_s(self, s):
if s:
self.c.append(self.__class__(s[0], self).add_s(s[1:]))
return self
class Trie:
def __init__(self):
self.c = []
def last_node(self, n):
return n if not n.c else self.last_node(n.c[0])
def get_s(self, c, ls):
#for an input string, find a letter in the trie that the string starts or ends with.
for i in c:
if i.e in ls:
yield i
yield from self.get_s(i.c, ls)
def add_string(self, s):
q, d = deque([j for i in self.get_s(self.c, (s[0], s[-1])) for j in [(s, i, 0), (s, i, -1)]]), []
while q:
if (w:=q.popleft())[1] is None:
d.append((w[0] if not w[0] else w[0][1:], w[2], w[-1]))
elif w[0] and w[1].e == w[0][w[-1]]:
if not w[-1]:
if not w[1].c:
d.append((w[0][1:], w[1], w[-1]))
else:
q.extend([(w[0][1:], i, 0) for i in w[1].c])
else:
q.append((w[0][:-1], w[1].p, w[1], -1))
if not (d:={a:b for a, *b in d}):
self.c.append(Node(s[0]).add_s(s[1:]))
elif (m:=min(d, key=len)):
if not d[m][-1]:
d[m][0].add_s(m)
else:
t = Node(m[0]).add_s(m)
d[m][0].p = self.last_node(t)
Putting it all together
t = Trie()
for i in ['aacc','accb','ccbe']:
t.add_string(i)
def overlaps(trie, c = ''):
if not trie.c:
yield c+trie.e
else:
yield from [j for k in trie.c for j in overlaps(k, c+trie.e)]
r = [j for k in t.c for j in overlaps(k)]
Output:
['aaccbe']
Use difflib.find_longest_match to find the overlap and concatenate appropriately, then use reduce to apply the entire list.
import difflib
from functools import reduce
def overlap(s1, s2):
# https://stackoverflow.com/a/14128905/4001592
s = difflib.SequenceMatcher(None, s1, s2)
pos_a, pos_b, size = s.find_longest_match(0, len(s1), 0, len(s2))
return s1[:pos_a] + s2[pos_b:]
s = ['aacc','accb','ccbe']
result = reduce(overlap, s, "")
print(result)
Output
aaccbe

find the minimum number of words(distance) between repeated occurrences of a search string in the input string

Here are test cases for the code:
string - 'Tim had been saying that he had been there'
search - 'had'
expected output - 4
string - 'he got what he got and what he wanted'
search - 'he'
expected out - 2
def return_distance(input, search):
words = input.split()
distance = None
indx = []
if not input or not search:
return None
else:
if words.count(search) >1:
indx = [ index for index, word in enumerate(words) if word == search]
distance = indx[1] - indx[0]
for i in range(len(indx)-1):
distance = min(distance, indx[i+1] - indx[i])-1
return distance
I am thinking how to optimize the code. I admit it is poorly written.
How about
def min_distance_between_words(sentence, word):
idxes = [i for i, e in enumerate(sentence.split()) if e == word]
return min([y - x - 1 for x, y in zip(idxes, idxes[1:])])
This splits the input sentence, makes a list of every index that matches the target word, then iterates over this list to compute the differences between each index and returns the minimum difference.
Since behavior is unspecified when the sentence doesn't have a word, it raises an error but you can add a check for this and return the value of your choice if desired using min's default parameter:
def min_distance_between_words(sentence, word):
idxes = [i for i, e in enumerate(sentence.split()) if e == word]
return min([y - x - 1 for x, y in zip(idxes, idxes[1:])], default=None)
As an aside, naming a variable input overwrites a builtin and return_distance is a rather ambiguous name for a function.
Adding a precondition for parameters for None as done with if not input or not search: is not typically done in Python (we assume caller will always pass in a string and adhere to the function's contract).
If you want to generalize this further, move the split() duty to the domain of the caller which enables the function to operate on arbitrary iterables:
def min_distance_between_occurrences(it, target):
idxes = [i for i, e in enumerate(it) if e == target]
return min([y - x - 1 for x, y in zip(idxes, idxes[1:])], default=None)
Call with:
min_distance_between_occurrences("a b c a".split(), "a")
min_distance_between_occurrences([(1, 2), (1, 3), (1, 2)], (1, 2))
Refactoring aside, as pointed out in the comments, the original code isn't correct. Issues include:
search_str does not exist. You probably meant search.
distance and min_dist don't really work together. Pick one or the other and use it for all minimum calculations.
min(min_dist, indx[i+1] - indx[i])-1 subtracts 1 in the wrong place, throwing off the count.
Here's a potential fix for these issues:
def return_distance(input, search):
words = input.split()
distance = None
if words.count(search) > 1:
indx = [index for index, word in enumerate(words) if word == search]
distance = indx[1] - indx[0] - 1
# ^^^^
for i in range(len(indx) - 1):
distance = min(distance, indx[i+1] - indx[i] - 1)
# ^^^^
return distance
One way is to use min with list comprehension on indx
min_dist = min([(indx[i+1] - indx[i]-1) for i in range(len(indx)-1) ])

How to find longest intersection between two strings in python?

I'm trying to write a program that would find the longest intersection between two strings. The conditions are:
If there is no common character the program returns an empty chain.
If there are multiple substrings of common characters with the same length it should return whichever is the largest, for example, for "bbaacc" and "aabb" the repeating substrings are "aa" and "bb" but as "bb" > "aa", so the programs must return only "bb".
Finally the program should return the longest common substring, for instance, for "programme" and "grammaire" the return should be "gramm" not "gramme".
My code has a problem with this last condition, how could I change it so it works as expected?
def intersection(v, w):
if not v or not w:
return ""
x, xs, y, ys = v[0], v[1:], w[0], w[1:]
if x == y:
return x + intersection(xs, ys)
else:
return max(intersection(v, ys), intersection(xs, w), key=len)
Driver:
print(intersection('programme', 'grammaire'))
cant find the issue with your code, but i solved it like this
def longest_str_intersection(a: str, b: str):
# identify all possible character sequences from str a
seqs = []
for pos1 in range(len(a)):
for pos2 in range(len(a)):
seqs.append(a[pos1:pos2+1])
# remove empty sequences
seqs = [seq for seq in seqs if seq != '']
# find segments in str b
max_len_match = 0
max_match_sequence = ''
for seq in seqs:
if seq in b:
if len(seq) > max_len_match:
max_len_match = len(seq)
max_match_sequence = seq
return max_match_sequence
longest_str_intersection('programme', 'grammaire')
-> 'gramm'
also interested to see if you found a more elegant solution!

isomorphic python algorithms

Question:
Given two strings s and t, determine if they are isomorphic.
Two strings are isomorphic if the characters in s can be replaced to get t.
All occurrences of a character must be replaced with another character while preserving the order of characters. No two characters may map to the same character but a character may map to itself.
My code:
def isIsomorphic(self, s, t):
# write your code here
remap = dict()
if s == t:
return True
if len(s) != len(t):
return False
for i in range(len(s)):
if s[i] not in remap.keys() and t[i] in remap.values():
return False
elif s[i] not in remap.keys():
remap[s[i]] = t[i]
else:
if remap[s[i]] != t[i]:
return False
return True
error hint:
Your code ran too much time than we expected. Check your time complexity. Time limit exceeded usually caused by infinite loop if your time complexity is the best.
Pls ask how i improve my code
The strings will be isomorphic if the number of unique characters in each string is the same as the number of unique pairs of corresponding characters between them (they also have to be the same length).
So this function will do it concisely and much faster:
def isIsomorphic(w1,w2) :
if len(w1) != len(w2): return False
return len(set(w1)) == len(set(w2)) == len(set(zip(w1,w2)))
[EDIT] 3.3 seconds on my computer for 1 million iterations of a pair of 25 character strings (vs 12 seconds for Aran-Fey's updated code).
A good way to do this is to normalize your strings
import re,string
def normalize(s):
key={}
def replace_ltr(match):
ltr = match.group(1)
if ltr not in key:
key[ltr] = string.printable[len(key)]
return key[ltr]
return re.sub("([a-zA-Z])",replace_ltr,s)
print normalize("Hello")
print normalize("ratty")
print normalize("SAS") == normalize("QBQ")
once you do that you can simply compare the normalized versions
def can_transform(s1,s2):
return normalize(s1) == normalize(s2)
Pulled from Understanding isomorphic strings algorithm
from itertools import groupby
from collections import defaultdict
def isomorphic(a, b):
a_idxs, b_idxs = defaultdict(set), defaultdict(set)
for idx, ((a_grp, a_vals), (b_grp, b_vals)) in enumerate(zip(groupby(a), groupby(b))):
if sum(1 for _ in a_vals) != sum(1 for _ in b_vals):
return False
# ensure sequence is of same length
if a_grp in a_idxs and b_idxs[b_grp] != a_idxs[a_grp] or\
b_grp in b_idxs and a_idxs[a_grp] != b_idxs[b_grp]:
return False
# ensure previous occurrences are matching groups
a_idxs[a_grp].add(idx)
b_idxs[b_grp].add(idx)
# save indexes for future checks
return True
One problem in your code is this part:
if ... and t[i] in remap.values():
Since remap.values() is not a set or a dict, membership testing with in is a O(n) operation. This can slow down your code significantly if many characters have to be remapped.
You can speed this up by storing the remapped characters in a set:
def isIsomorphic():
remap = dict()
if s == t:
return True
if len(s) != len(t):
return False
remapped = set() # <- add this
for i in range(len(s)):
if s[i] not in remap.keys() and t[i] in remapped: # <- change this
return False
elif s[i] not in remap.keys():
remap[s[i]] = t[i]
remapped.add(t[i])
else:
if remap[s[i]] != t[i]:
return False
return True
Timed on two strings with 25 remapped characters and 1 million iterations, we notice a significant speedup:
original code 26.817705629997363 seconds
updated code 19.41265572499833 seconds
I need to check that characters from string1 are not in string2 and vice versa so use two dicts: a mapping mapper, and a reverse mapping revmap for this.
dict.setdefault is doing a lot of the heavy lifting here - a useful method to know.
I got stuck in writing it with one long main expression, hence the style.
I only got this far by creating a few tests too.
def is_iso(s1, s2):
mapper, revmap = {}, {}
return (len(s1) == len(s2)
and all(((ch1 not in mapper and ch2 not in revmap) or
(ch1 in mapper and ch2 in revmap))
and ch2 == mapper.setdefault(ch1, ch2)
and ch1 == revmap.setdefault(ch2, ch1)
for ch1, ch2 in zip(s1, s2))
), ' '.join(f'{fr}<->{to}' for fr, to in mapper.items())
The tests:
for s1, s2 in [("11", "aa"), ("ab", "aa"), ('abc', 'aaa'), ("foo", "bar"),
("egg", "add"), ("paper", "title"), ('aabccd', '112334'),
('aabccc', '112334')]:
print( f'is_iso({s1!r}, {s2!r}) = %s \t# mappings: %s' % is_iso(s1, s2))
Output:
is_iso('11', 'aa') = True # mappings: 1<->a
is_iso('ab', 'aa') = False # mappings: a<->a
is_iso('abc', 'aaa') = False # mappings: a<->a
is_iso('foo', 'bar') = False # mappings: f<->b o<->a
is_iso('egg', 'add') = True # mappings: e<->a g<->d
is_iso('paper', 'title') = True # mappings: p<->t a<->i e<->l r<->e
is_iso('aabccd', '112334') = True # mappings: a<->1 b<->2 c<->3 d<->4
is_iso('aabccc', '112334') = False # mappings: a<->1 b<->2 c<->3

matching results in a list of lists

I have a list with the following structure;
[('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')
the 1st number is an id, the second a position of a word and the third number is the position of a second word. What I need to do and am struggling with is finding sets of words next to each other.
These results are given for searches of 3 words, so there is the positions of word 1 with word 2 and positions of word 2 with word 3. For example ;
I run the phrase query "women in science" I then get the values given in the list above, so ('2','776','777') is the results for 'women in' and ('2','777','778') is the results for 'in science'.
I need to find a way to match these results up, so for every document it groups the words together depending on amounts of word in the query. (so if there is 4 words in the query there will be 3 results that need to be matched together).
Is this possible?
You need to quickly find word info by its position. Create a dictionary keyed by word position:
# from your example; I wonder why you use strings and not numbers.
positions = [('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')]
# create the dictionary
dict_by_position = {w_pos:(w_id, w_next) for (w_id, w_pos, w_next) in positions}
Now it's a piece of cake to follow chains:
>>> dict_by_position['776']
('2', '777')
>>> dict_by_position['777']
('2', '778')
Or programmatically:
def followChain(start, position_dict):
result = []
scanner = start
while scanner in position_dict:
next_item = position_dict[scanner]
result.append(next_item)
unused_id, scanner = next_item # unpack the (id, next_position)
return result
>>> followChain('776', dict_by_position)
[('2', '777'), ('2', '778')]
Finding all chains that are not subchains of each other:
seen_items = set()
for start in dict_by_position:
if start not in seen_items:
chain = followChain(start, dict_by_position)
seen_items.update(set(chain)) # mark all pieces of chain as seen
print chain # or do something reasonable instead
The following will do what you're asking, as I understand it - it's not the prettiest output in the world, and I think that if possible you should be using numbers if numbers are what you're trying to work with.
There are probably more elegant solutions, and simplifications that could be made to this:
positions = [('0','927','928'),('2','693','694'),('2','742','743'),('2','776','777'),('2','804','805'),
('2','987','988'),('2','997','998'),('2','1019','1020'),
('2','1038','1039'),('2','1047','1048'),('2','1083','1084'),('2','659','660'),
('2','677','678'),('2','743','744'),('2','777','778'),('2','805','806'),('2','830','831')]
sorted_dict = {}
sorted_list = []
grouped_list = []
doc_ids = []
def sort_func(positions):
for item in positions:
if item[0] not in doc_ids:
doc_ids.append(item[0])
for doc_id in doc_ids:
sorted_set = []
for item in positions:
if item[0] != doc_id:
continue
else:
if item[1] not in sorted_set:
sorted_set.append(item[1])
if item[2] not in sorted_set:
sorted_set.append(item[2])
sorted_list = sorted(sorted_set)
sorted_dict[doc_id] = sorted_list
for k in sorted_dict:
group = []
grouped_list = []
for i in sorted_dict[k]:
try:
if int(i)-1 == int(sorted_dict[k][sorted_dict[k].index(i)-1]):
group.append(i)
else:
if group != []:
grouped_list.append(group)
group = [i]
except IndexError:
group.append(i)
continue
if grouped_list != []:
sorted_dict[k] = grouped_list
else:
sorted_dict[k] = group
return sorted_dict
My output for the above was:
{'0': ['927', '928'], '2': [['1019', '1020'], ['1038', '1039'], ['1047', '1048'], ['1083', '1084'], ['659', '660'], ['677', '678'], ['693', '694'], ['742', '743', '744'], ['776', '777', '778'], ['804', '805', '806'], ['830', '831'], ['987', '988']]}

Categories

Resources