breadth first in python - computational max? - python

I'm trying to implement the following "game" in python:
Given a start word, find the goal word with modifications
allowed modification per step: remove or add any letter and permute
The task is to find the way from start to goal with the least steps.
My approach was to add/remove a letter, permute the resulting letters and look up each permutant in a dictionary.
This quickly results in long run times (for a 9-letter word and the first step approx 60sec).
Here is my code.
import time
from itertools import permutations
startword = 'croissant'
nodes = list()
nodes.append(startword)
dicts = set([line.rstrip('\n') for line in open('wordList.txt')])
alpha = set([chr(i) for i in range(ord('a'),ord('z')+1)])
def step(nodes):
nnodes = list()
for word in nodes:
for s in word:
new_word = word.replace(s, '', 1)
perms = [''.join(p) for p in permutations(new_word)]
for per in perms:
if per in dicts:
nnodes.append(per)
for s in alpha:
new_word = word + s
perms = [''.join(p) for p in permutations(new_word)]
for per in perms:
if per in dicts:
nnodes.append(per)
return set(nnodes)
btime = time.time()
step(nodes)
print time.time() - btime
How can I improve the performance/logic? We are specifically asked to use a breadth first search.

Interesting question! There is a way that you can dramatically improve the performance. Instead of checking each new word by traversing through the wordlist and checking if it is present, you can alphabetize the characters for each word in the wordlist and compare those directly to the alphabetized shortened/lengthened word of interest.
For example using croissant, I can alphabetize the characters to generate acinorsst instead. Then, I check to see if acinorsst with a character removed or added is in a defaultdict of lists where the keys are alphabetized strings and retrieve the corresponding value (which is the list of actual words corresponding to that alphabetical ordering).
It is much less confusing to explain in code, so I have posted it below:
from collections import defaultdict
import itertools as its
import time
def get_alpha_dicts():
wa_dict = {}#word: alphabetized dict
aw_dict = defaultdict(list)#alphabetized: corresponding WORDS dict
for line in open('wordList.txt'):
word = line.rstrip('\n')
alpha_word = ''.join(sorted(word))
wa_dict[word] = alpha_word
aw_dict[alpha_word].append(word)
return wa_dict, aw_dict
def step(nodes, aw_dict):
alpha = set([chr(i) for i in range(ord('a'),ord('z')+1)])
nnodes = list()
for word in nodes:
alpha_word = ''.join(sorted(word))
#remove a char from word
short_words = set([''.join(w) for w in its.combinations(alpha_word, len(start_word)-1)])
for short_word in short_words:
nnodes.extend(aw_dict[short_word])
#add a char to word
long_words = [''.join(sorted(alpha_word + s)) for s in alpha]
for long_word in long_words:
nnodes.extend(aw_dict[long_word])
return set(nnodes)
if __name__ == "__main__":
start_word = 'croissant'
nodes = list()
nodes.append(start_word)
wa_dict, aw_dict = get_alpha_dicts()
btime = time.time()
print step(nodes, aw_dict)
print time.time() - btime
Hope it helps!

Related

Word Ladder without replacement in python

I have question, where I need to implement ladder problem with different logic.
In each step, the player must either add one letter to the word
from the previous step, or take away one letter, and then rearrange the letters to make a new word.
croissant(-C) -> arsonist(-S) -> aroints(+E)->notaries(+B)->baritones(-S)->baritone
The new word should make sense from a wordList.txt which is dictionary of word.
Dictionary
My code look like this,
where I have calculated first the number of character removed "remove_list" and added "add_list". Then I have stored that value in the list.
Then I read the file, and stored into the dictionary which the sorted pair.
Then I started removing and add into the start word and matched with dictionary.
But now challenge is, some word after deletion and addition doesn't match with the dictionary and it misses the goal.
In that case, it should backtrack to previous step and should add instead of subtracting.
I am looking for some sort of recursive function, which could help in this or complete new logic which I could help to achieve the output.
Sample of my code.
start = 'croissant'
goal = 'baritone'
list_start = map(list,start)
list_goal = map(list, goal)
remove_list = [x for x in list_start if x not in list_goal]
add_list = [x for x in list_goal if x not in list_start]
file = open('wordList.txt','r')
dict_words = {}
for word in file:
strip_word = word.rstrip()
dict_words[''.join(sorted(strip_word))]=strip_word
file.close()
final_list = []
flag_remove = 0
for i in remove_list:
sorted_removed_list = sorted(start.replace(''.join(map(str, i)),"",1))
sorted_removed_string = ''.join(map(str, sorted_removed_list))
if sorted_removed_string in dict_words.keys():
print dict_words[sorted_removed_string]
final_list.append(sorted_removed_string)
flag_remove = 1
start = sorted_removed_string
print final_list
flag_add = 0
for i in add_list:
first_character = ''.join(map(str,i))
sorted_joined_list = sorted(''.join([first_character, final_list[-1]]))
sorted_joined_string = ''.join(map(str, sorted_joined_list))
if sorted_joined_string in dict_words.keys():
print dict_words[sorted_joined_string]
final_list.append(sorted_joined_string)
flag_add = 1
sorted_removed_string = sorted_joined_string
Recursion-based backtracking isn't a good idea for search problem of this sort. It blindly goes downward in search tree, without exploiting the fact that words are almost never 10-12 distance away from each other, causing StackOverflow (or recursion limit exceeded in Python).
The solution here uses breadth-first search. It uses mate(s) as helper, which given a word s, finds all possible words we can travel to next. mate in turn uses a global dictionary wdict, pre-processed at the beginning of the program, which for a given word, finds all it's anagrams (i.e re-arrangement of letters).
from queue import Queue
words = set(''.join(s[:-1]) for s in open("wordsEn.txt"))
wdict = {}
for w in words:
s = ''.join(sorted(w))
if s in wdict: wdict[s].append(w)
else: wdict[s] = [w]
def mate(s):
global wdict
ans = [''.join(s[:c]+s[c+1:]) for c in range(len(s))]
for c in range(97,123): ans.append(s + chr(c))
for m in ans: yield from wdict.get(''.join(sorted(m)),[])
def bfs(start,goal,depth=0):
already = set([start])
prev = {}
q = Queue()
q.put(start)
while not q.empty():
cur = q.get()
if cur==goal:
ans = []
while cur: ans.append(cur);cur = prev.get(cur)
return ans[::-1] #reverse the array
for m in mate(cur):
if m not in already:
already.add(m)
q.put(m)
prev[m] = cur
print(bfs('croissant','baritone'))
which outputs: ['croissant', 'arsonist', 'rations', 'senorita', 'baritones', 'baritone']

how to make this function more (time) efficient?

i ve got a dataframe series which contain sentences. (some are kind of long)
i ve also got 2 dictionaries which contain words as keys and ints as count.
Not all words from strings are present in both dictionaries. Some are in only one, some are in neither.
Dataframe is 124011 units long. function is taking me about 0.4 per string. which is waaaay to long.
W is just a reference value for the dictionary (weights = {}, weights[W] = {})
here is the function:
def match_share(string, W, weights, rel_weight):
words = string.split()
words_counts = Counter(words)
ratios = []
for word in words:
if ((word in weights[W].keys())&(word in rel_weight[W].keys())):
if (weights[W][word]!=0):
ratios.append(words_counts[word]*rel_weight[W][word]/weights[W][word])
else:
ratios.append(0)
if len(words)>0:
ratios = np.divide(ratios, float(len(words)))
ratio = np.sum(ratios)
return ratio
thx
Let's clean it up a bit:
def match_share(string, W, weights, rel_weight):
words = string.split()
words_counts = Counter(words)
words = string.split()
words_counts = Counter(words)
That's redundant! Replace 4 statements with 2:
def match_share(string, W, weights, rel_weight):
words = string.split()
words_counts = Counter(words)
Next:
ratios = []
for word in words:
if ((word in weights[W].keys())&(word in rel_weight[W].keys())):
if (weights[W][word]!=0):
ratios.append(words_counts[word]*rel_weight[W][word]/weights[W][word])
else:
ratios.append(0)
I don't know what you think that code does. I hope you're not being tricky. But .keys returns an iterable, and X in <iterable> is WAY slower than X in <dict>. Also, note: you don't append anything if the innermost (weights[W][word] != 0) condition fails. That might be a bug, since you try to append 0 in another else condition. (I don't know what you're doing, so I'm just pointing it out.) And this is Python, not Perl or C or Java. So no parens required around if <test>:
Let's go with that:
ratios = []
for word in words:
if word in weights[W] and word in rel_weight[W]:
if weights[W][word] != 0:
ratios.append(words_counts[word] * rel_weight[W][word] / weights[W][word])
else:
ratios.append(0)
Next:
if len(words)>0:
ratios = np.divide(ratios, float(len(words)))
You're trying to prevent dividing by zero. But you can use the truthiness of a list to check this, and avoid the comparison:
if words:
ratios = np.divide(ratios, float(len(words)))
The rest is fine, but you don't need the variable.
ratio = np.sum(ratios)
return ratio
With those mods applied, your function looks like this:
def match_share(string, W, weights, rel_weight):
words = string.split()
words_counts = Counter(words)
ratios = []
for word in words:
if word in weights[W] and word in rel_weight[W]:
if weights[W][word] != 0:
ratios.append(words_counts[word] * rel_weight[W][word] / weights[W][word])
else:
ratios.append(0)
if words:
ratios = np.divide(ratios, float(len(words)))
ratio = np.sum(ratios)
return ratio
Looking at it at little harder, I see you're doing this:
word_counts = Counter(words)
for word in words:
append( word_counts[word] * ...)
According to me, that means if "apple" appears 6 times, you are going to append 6*... to the list, once for each word. So you'll have 6 different occurrences of 6*... in your list. Are you sure that's what you want? Or should it be for word in word_counts to just iterate over the distinct words?
Another optimization is to remove lookups from inside your loop. You keep looking up weights[W] and rel_weight[W], even though the value of W never changes. Let's cache those values outside the loop. Also, let's cache a pointer to the ratios.append method.
def match_share(string, W, weights, rel_weight):
words = string.split()
words_counts = Counter(words)
ratios = []
# Cache these values for speed in loop
ratios_append = ratios.append
weights_W = weights[W]
rel_W = rel_weight[W]
for word in words:
if word in weights_W and word in rel_W:
if weights_W[word] != 0:
ratios_append(words_counts[word] * rel_W[word] / weights_W[word])
else:
ratios_append(0)
if words:
ratios = np.divide(ratios, float(len(words)))
ratio = np.sum(ratios)
return ratio
Try that, see how it works. Please look at the bold note above, and the questions. There might be bugs, there might be more ways to speed up.
I think that your time inefficiency may be coming from the fact you are using Counter instead of the dict. Some discussion here suggests that the dict class has parts written in pure c, while counter is written in python.
I suggest altering your code to using a dict and test to see if that provides a faster time
Also why is this code duplicated?:
words = string.split()
words_counts = Counter(words)
words = string.split()
words_counts = Counter(words)
ratios = []
It would be good if you had a profile of that function execution, but here are some generic ideas:
You needlessly get some elements on every iteration. You can extract these before the loop
For example
weights_W = weights[W]
rel_weights_W = rel_weights[W]
You don't need to call .keys() on dicts.
These are equivalent:
word in weights_W.keys()
word in weights_W
Attempt to get values without looking them up first. This will save you one lookup.
For example instead of:
if ((word in weights[W].keys())&(word in rel_weight[W].keys())):
if (weights[W][word]!=0):
you can do:
word_weight = weights_W.get(word)
if word_weight is not None:
word_rel_weight = rel_weights_W.get(word)
if word_rel_weight is not None:
if word_weight != 0: # lookup saved here

Python word game. Last letter of first word == first letter of second word. Find longest possible sequence of words

I'm trying to write a program that mimics a word game where, from a given set of words, it will find the longest possible sequence of words. No word can be used twice.
I can do the matching letters and words up, and storing them into lists, but I'm having trouble getting my head around how to handle the potentially exponential number of possibilities of words in lists. If word 1 matches word 2 and then I go down that route, how do I then back up to see if words 3 or 4 match up with word one and then start their own routes, all stemming from the first word?
I was thinking some way of calling the function inside itself maybe?
I know it's nowhere near doing what I need it to do, but it's a start. Thanks in advance for any help!
g = "audino bagon baltoy banette bidoof braviary bronzor carracosta charmeleon cresselia croagunk darmanitan deino emboar emolga exeggcute gabite girafarig gulpin haxorus"
def pokemon():
count = 1
names = g.split()
first = names[count]
master = []
for i in names:
print (i, first, i[0], first[-1])
if i[0] == first[-1] and i not in master:
master.append(i)
count += 1
first = i
print ("success", master)
if len(master) == 0:
return "Pokemon", first, "does not work"
count += 1
first = names[count]
pokemon()
Your idea of calling a function inside of itself is a good one. We can solve this with recursion:
def get_neighbors(word, choices):
return set(x for x in choices if x[0] == word[-1])
def longest_path_from(word, choices):
choices = choices - set([word])
neighbors = get_neighbors(word, choices)
if neighbors:
paths = (longest_path_from(w, choices) for w in neighbors)
max_path = max(paths, key=len)
else:
max_path = []
return [word] + max_path
def longest_path(choices):
return max((longest_path_from(w, choices) for w in choices), key=len)
Now we just define our word list:
words = ("audino bagon baltoy banette bidoof braviary bronzor carracosta "
"charmeleon cresselia croagunk darmanitan deino emboar emolga "
"exeggcute gabite girafarig gulpin haxorus")
words = frozenset(words.split())
Call longest_path with a set of words:
>>> longest_path(words)
['girafarig', 'gabite', 'exeggcute', 'emolga', 'audino']
A couple of things to know: as you point out, this has exponential complexity, so beware! Also, know that python has a recursion limit!
Using some black magic and graph theory I found a partial solution that might be good (not thoroughly tested).
The idea is to map your problem into a graph problem rather than a simple iterative problem (although it might work too!). So I defined the nodes of the graph to be the first letters and last letters of your words. I can only create edges between nodes of type first and last. I cannot map node first number X to node last number X (a word cannot be followed by it self). And from that your problem is just the same as the Longest path problem which tends to be NP-hard for general case :)
By taking some information here: stackoverflow-17985202 I managed to write this:
g = "audino bagon baltoy banette bidoof braviary bronzor carracosta charmeleon cresselia croagunk darmanitan deino emboar emolga exeggcute gabite girafarig gulpin haxorus"
words = g.split()
begin = [w[0] for w in words] # Nodes first
end = [w[-1] for w in words] # Nodes last
links = []
for i, l in enumerate(end): # Construct edges
ok = True
offset = 0
while ok:
try:
bl = begin.index(l, offset)
if i != bl: # Cannot map to self
links.append((i, bl))
offset = bl + 1 # next possible edge
except ValueError: # no more possible edge for this last node, Next!
ok = False
# Great function shamelessly taken from stackoverflow (link provided above)
import networkx as nx
def longest_path(G):
dist = {} # stores [node, distance] pair
for node in nx.topological_sort(G):
# pairs of dist,node for all incoming edges
pairs = [(dist[v][0]+1,v) for v in G.pred[node]]
if pairs:
dist[node] = max(pairs)
else:
dist[node] = (0, node)
node,(length,_) = max(dist.items(), key=lambda x:x[1])
path = []
while length > 0:
path.append(node)
length,node = dist[node]
return list(reversed(path))
# Construct graph
G = nx.DiGraph()
G.add_edges_from(links)
# TADAAAA!
print(longest_path(G))
Although it looks nice, there is a big drawback. You example works because there is no cycle in the resulting graph of input words, however, this solution fails on cyclic graphs.
A way around that is to detect cycles and break them. Detection can be done this way:
if nx.recursive_simple_cycles(G):
print("CYCLES!!! /o\")
Breaking the cycle can be done by just dropping a random edge in the cycle and then you will randomly find the optimal solution for your problem (imagine a cycle with a tail, you should cut the cycle on the node having 3 edges), thus I suggest brute-forcing this part by trying all possible cycle breaks, computing longest path and taking the longest of the longest path. If you have multiple cycles it becomes a bit more explosive in number of possibilities... but hey it's NP-hard, at least the way I see it and I didn't plan to solve that now :)
Hope it helps
Here's a solution that doesn't require recursion. It uses the itertools permutation function to look at all possible orderings of the words, and find the one with the longest length. To save time, as soon as an ordering hits a word that doesn't work, it stops checking that ordering and moves on.
>>> g = 'girafarig eudino exeggcute omolga gabite'
... p = itertools.permutations(g.split())
... longestword = ""
... for words in p:
... thistry = words[0]
... # Concatenates words until the next word doesn't link with this one.
... for i in range(len(words) - 1):
... if words[i][-1] != words[i+1][0]:
... break
... thistry += words[i+1]
... i += 1
... if len(thistry) > len(longestword):
... longestword = thistry
... print(longestword)
... print("Final answer is {}".format(longestword))
girafarig
girafariggabiteeudino
girafariggabiteeudinoomolga
girafariggabiteexeggcuteeudinoomolga
Final answer is girafariggabiteexeggcuteeudinoomolga
First, let's see what the problem looks like:
from collections import defaultdict
import pydot
words = (
"audino bagon baltoy banette bidoof braviary bronzor carracosta "
"charmeleon cresselia croagunk darmanitan deino emboar emolga "
"exeggcute gabite girafarig gulpin haxorus"
).split()
def main():
# get first -> last letter transitions
nodes = set()
arcs = defaultdict(lambda: defaultdict(list))
for word in words:
first = word[0]
last = word[-1]
nodes.add(first)
nodes.add(last)
arcs[first][last].append(word)
# create a graph
graph = pydot.Dot("Word_combinations", graph_type="digraph")
# use letters as nodes
for node in sorted(nodes):
n = pydot.Node(node, shape="circle")
graph.add_node(n)
# use first-last as directed edges
for first, sub in arcs.items():
for last, wordlist in sub.items():
count = len(wordlist)
label = str(count) if count > 1 else ""
e = pydot.Edge(first, last, label=label)
graph.add_edge(e)
# save result
graph.write_jpg("g:/temp/wordgraph.png", prog="dot")
if __name__=="__main__":
main()
results in
which makes the solution fairly obvious (path shown in red), but only because the graph is acyclic (with the exception of two trivial self-loops).

Concatenate strings if they have an overlapping region

I am trying to write a script that will find strings that share an overlapping region of 5 letters at the beginning or end of each string (shown in example below).
facgakfjeakfjekfzpgghi
pgghiaewkfjaekfjkjakjfkj
kjfkjaejfaefkajewf
I am trying to create a new string which concatenates all three, so the output would be:
facgakfjeakfjekfzpgghiaewkfjaekfjkjakjfkjaejfaefkajewf
Edit:
This is the input:
x = ('facgakfjeakfjekfzpgghi', 'kjfkjaejfaefkajewf', 'pgghiaewkfjaekfjkjakjfkj')
**the list is not ordered
What I've written so far *but is not correct:
def findOverlap(seq)
i = 0
while i < len(seq):
for x[i]:
#check if x[0:5] == [:5] elsewhere
x = ('facgakfjeakfjekfzpgghi', 'kjfkjaejfaefkajewf', 'pgghiaewkfjaekfjkjakjfkj')
findOverlap(x)
Create a dictionary mapping the first 5 characters of each string to its tail
strings = {s[:5]: s[5:] for s in x}
and a set of all the suffixes:
suffixes = set(s[-5:] for s in x)
Now find the string whose prefix does not match any suffix:
prefix = next(p for p in strings if p not in suffixes)
Now we can follow the chain of strings:
result = [prefix]
while prefix in strings:
result.append(strings[prefix])
prefix = strings[prefix][-5:]
print "".join(result)
A brute-force approach - do all combinations and return the first that matches linking terms:
def solution(x):
from itertools import permutations
for perm in permutations(x):
linked = [perm[i][:-5] for i in range(len(perm)-1)
if perm[i][-5:]==perm[i+1][:5]]
if len(perm)-1==len(linked):
return "".join(linked)+perm[-1]
return None
x = ('facgakfjeakfjekfzpgghi', 'kjfkjaejfaefkajewf', 'pgghiaewkfjaekfjkjakjfkj')
print solution(x)
Loop over each pair of candidates, reverse the second string and use the answer from here

Find all Occurences of Every Substring in String

I am trying to find all occurrences of sub-strings in a main string (of all lengths). My function takes one string and then returns a dictionary of every sub-string (which occurs more than once, of course) and how many times it occurs (format of the dictionary: {substring: # of occurrences, ...}). I am using collections.Counter(s) to help me with it.
Here is my function:
from collections import Counter
def patternFind(s):
patterns = {}
for index in range(1, len(s)+1)[::-1]:
d = nChunks(s, step=index)
parts = dict(Counter(d))
patterns.update({elem: parts[elem] for elem in parts.keys() if parts[elem] > 1})
return patterns
def nChunks(iterable, start=0, step=1):
return [iterable[i:i+step] for i in range(start, len(iterable), step)]
I have a string, data with about 2500 random letters (in a random order). However, there are 2 strings inserted into it (random points). Say this string is 'TEST'. data.count('TEST') returns 2. However, patternFind(data)['TEST'] gives me a KeyError. Therefore, my program does not detect the two strings in it.
What have I done wrong? Thanks!
Edit: My method of creating testing-instances:
def createNewTest():
n = randint(500, 2500)
x, y = randint(500, n), randint(500, n)
s = ''
for i in range(n):
s += choice(uppercase)
if i == x or i == y: s += "TEST"
return s
Using Regular Expressions
Apart from the count() method you described, regex is an obvious alternative
import re
needle = r'TEST'
haystack = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklagh'
pattern = re.compile(needle)
print len(re.findall(pattern, haystack))
Short Cut
If you need to build a dictionary of substrings, possibly you can do this with only subset of those strings. Assuming you know the needle you are looking for in the data then you only need the dictionary of substrings of data that are the same length of needle. This is very fast.
from collections import Counter
needle = "TEST"
def gen_sub(s, len_chunk):
for start in range(0, len(s)-len_chunk+1):
yield s[start:start+len_chunk]
data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
parts = Counter([sub for sub in gen_sub(data, len(needle))])
print parts[needle]
Brute Force: building dictionary of all substrings
If you need to have a count of all possible substrings, this works but it is very slow:
from collections import Counter
def gen_sub(s):
for start in range(0, len(s)):
for end in range(start+1, len(s)+1):
yield s[start:end]
data = 'khjkzahklahjTESTkahklaghTESTjklajhkhz'
parts = Counter([sub for sub in gen_sub(data)])
print parts['TEST']
Substring generator adapted from this: https://stackoverflow.com/a/8305463/1290420
While jurgenreza has explained why your program didn't work, the solution is still quite slow. If you only examine substrings s for which you know that s[:-1] repeats, you get a much faster solution (typically a hundred times faster and more):
from collections import defaultdict
def pfind(prefix, sequences):
collector = defaultdict(list)
for sequence in sequences:
collector[sequence[0]].append(sequence)
for item, matching_sequences in collector.items():
if len(matching_sequences) >= 2:
new_prefix = prefix + item
yield (new_prefix, len(matching_sequences))
for r in pfind(new_prefix, [sequence[1:] for sequence in matching_sequences]):
yield r
def find_repeated_substrings(s):
s0 = s + " "
return pfind("", [s0[i:] for i in range(len(s))])
If you want a dict, you call it like this:
result = dict(find_repeated_substrings(s))
On my machine, for a run with 2247 elements, it took 0.02 sec, while the original (corrected) solution took 12.72 sec.
(Note that this is a rather naive implementation; using indexes of instead of substrings should be even faster.)
Edit: The following variant works with other sequence types (not only strings). Also, it doesn't need a sentinel element.
from collections import defaultdict
def pfind(s, length, ends):
collector = defaultdict(list)
if ends[-1] >= len(s):
del ends[-1]
for end in ends:
if end < len(s):
collector[s[end]].append(end)
for key, matching_ends in collector.items():
if len(matching_ends) >= 2:
end = matching_ends[0]
yield (s[end - length: end + 1], len(matching_ends))
for r in pfind(s, length + 1, [end + 1 for end in matching_ends if end < len(s)]):
yield r
def find_repeated_substrings(s):
return pfind(s, 0, list(range(len(s))))
This still has the problem that very long substrings will exceed recursion depth. You might want to catch the exception.
The problem is in your nChunks function. It does not give you all the chunks that are necessary.
Let's consider a test string:
s='1test2345test'
For the chunks of size 4 your nChunks function gives this output:
>>>nChunks(s, step=4)
['1tes', 't234', '5tes', 't']
But what you really want is:
>>>def nChunks(iterable, start=0, step=1):
return [iterable[i:i+step] for i in range(len(iterable)-step+1)]
>>>nChunks(s, step=4)
['1tes', 'test', 'est2', 'st23', 't234', '2345', '345t', '45te', '5tes', 'test']
You can see that this way there are two 'test' chunks and your patternFind(s) will work like a charm:
>>> patternFind(s)
{'tes': 2, 'st': 2, 'te': 2, 'e': 2, 't': 4, 'es': 2, 'est': 2, 'test': 2, 's': 2}
here you can find a solution that uses a recursive wrapper around string.find() that searches all the occurences of a substring in a main string.
The collectallchuncks() function returns a defaultdict whith all the substrings as keys and for each substring a list of all the indexes where the substring is found in the main string.
import collections
# Minimum substring size, may be 1
MINSIZE = 3
# Recursive wrapper
def recfind(p, data, pos, acc):
res = data.find(p, pos)
if res == -1:
return acc
else:
acc.append(res)
return recfind(p, data, res+1, acc)
def collectallchuncks(data):
res = collections.defaultdict(str)
size = len(data)
for base in xrange(size):
for seg in xrange(MINSIZE, size-base+1):
chunk = data[base:base+seg]
if data.count(chunk) > 1:
res[chunk] = recfind(chunk, data, 0, [])
return res
if __name__ == "__main__":
data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
allchuncks = collectallchuncks(data)
print 'TEST', allchuncks['TEST']
print 'hklag', allchuncks['hklag']
EDIT: If you just need the number of occurrences of each substring in the main string you can easily obtain it getting rid of the recursive function:
import collections
MINSIZE = 3
def collectallchuncks2(data):
res = collections.defaultdict(str)
size = len(data)
for base in xrange(size):
for seg in xrange(MINSIZE, size-base+1):
chunk = data[base:base+seg]
cnt = data.count(chunk)
if cnt > 1:
res[chunk] = cnt
return res
if __name__ == "__main__":
data = 'khjkzahklahjTESTkahklaghTESTjklajhkhzkhjkzahklahjTESTkahklaghTESz'
allchuncks = collectallchuncks2(data)
print 'TEST', allchuncks['TEST']
print 'hklag', allchuncks['hklag']

Categories

Resources