Reducing compute time for anagram word search - Python

The code below is a brute-force method of searching a list of words and creating sub-lists of any that are anagrams.
Searching the entire English dictionary is prohibitively time-consuming, so I'm curious if anyone has tips for reducing the compute complexity of the code?
def anogramtastic(anagrms):
    d = []
    e = []
    for j in range(len(anagrms)):
        if anagrms[j] in e:
            pass
        else:
            templist = []
            tester = anagrms[j]
            tester = list(tester)
            tester.sort()
            tester = ''.join(tester)
            for k in range(len(anagrms)):
                if k == j:
                    pass
                else:
                    testers = anagrms[k]
                    testers = list(testers)
                    testers.sort()
                    testers = ''.join(testers)
                    if testers == tester:
                        templist.append(anagrms[k])
                        e.append(anagrms[k])
            if len(templist) > 0:
                templist.append(anagrms[j])
                d.append(templist)
    d.sort(key=len, reverse=True)
    return d

print(anogramtastic(wordlist))

How about using a dictionary keyed by frozensets? Frozensets are immutable, meaning you can hash them for constant-time lookup. And when it comes to anagrams, what makes two words anagrams of each other is that they have the same letters with the same counts. So you can construct a frozenset of {(letter, count), ...} pairs and hash these for efficient lookup.
Here's a quick little function to convert a word to a multiset using collections.Counter:
from collections import Counter, defaultdict

def word2multiset(word):
    return frozenset(Counter(word).items())
Now, given a list of words, populate your anagram dictionary like this:
list_of_words = [ ... ]
anagram_dict = defaultdict(set)

for word in list_of_words:
    anagram_dict[word2multiset(word)].add(word)
For example, when list_of_words = ['hello', 'olleh', 'test', 'apple'], this is the output of anagram_dict after a run of the loop above:
print(anagram_dict)

defaultdict(set,
            {frozenset({('e', 1), ('h', 1), ('l', 2), ('o', 1)}): {'hello', 'olleh'},
             frozenset({('e', 1), ('s', 1), ('t', 2)}): {'test'},
             frozenset({('a', 1), ('e', 1), ('l', 1), ('p', 2)}): {'apple'}})
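If you then want the anagram groups back out in the same shape as the original function returns (only groups with more than one word), a small follow-up sketch (anagram_groups is my own name, not from the answer above):

anagram_groups = [sorted(group) for group in anagram_dict.values() if len(group) > 1]
anagram_groups.sort(key=len, reverse=True)
print(anagram_groups)  # [['hello', 'olleh']] for the sample list above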

Unless I'm misunderstanding the problem, simply grouping the words by sorting their characters should be an efficient solution -- as you've already realized. The trick is to avoid comparing every word to all the other ones. A dict with the char-sorted string as key makes finding the right group for each word fast; a lookup/insertion is O(1) on average.
#!/usr/bin/env python3
# coding=utf8
from sys import stdin

groups = {}
for line in stdin:
    w = line.strip()
    g = ''.join(sorted(w))
    if g not in groups:
        groups[g] = []
    groups[g].append(w)

for g, words in groups.items():
    if len(words) > 1:
        print('%2d %-20s' % (len(words), g), ' '.join(words))
Testing on my words file (99171 words), it seems to work well:
anagram$ wc /usr/share/dict/words
99171 99171 938848 /usr/share/dict/words
anagram$ time ./anagram.py < /usr/share/dict/words | tail
2 eeeprsw sweeper weepers
2 brsu burs rubs
2 aeegnrv avenger engrave
2 ddenoru redound rounded
3 aesy ayes easy yeas
2 gimnpu impugn umping
2 deeiinsst densities destinies
2 abinost bastion obtains
2 degilr girdle glider
2 orsttu trouts tutors
real 0m0.366s
user 0m0.357s
sys 0m0.012s

You can speed things up considerably by using a dictionary for checking membership instead of doing linear searches. The only "trick" is to devise a way to create keys such that the key is the same for words that are anagrams of each other (and different for all others).
In the code below this is done by creating a sorted tuple from the letters in each word.
def anagramtastic(words):
    dct = {}
    for word in words:
        key = tuple(sorted(word))  # Identifier based on letters.
        dct.setdefault(key, []).append(word)
    # Return a list of all that had an anagram.
    return [words for words in dct.values() if len(words) > 1]

wordlist = ['act', 'cat', 'binary', 'brainy', 'case', 'aces',
            'aide', 'idea', 'earth', 'heart', 'tea', 'tee']

print('result:', anagramtastic(wordlist))
Output produced:
result: [['act', 'cat'], ['binary', 'brainy'], ['case', 'aces'], ['aide', 'idea'], ['earth', 'heart']]

Related

Trying to sort a dict by dict.values()

The task is to read a file, create a dict and print out each word and its counter value. Below is code that works fine, but I can't seem to understand why, in the print_words() function, I can't change the sort to:
words = sorted(word_count.values())
and then print the word and its counter, sorted by the counter (the number of times that word appears in word_count).
def word_count_dict(filename):
    word_count = {}
    input_file = open(filename, 'r')
    for line in input_file:
        words = line.split()
        for word in words:
            word = word.lower()
            if not word in word_count:
                word_count[word] = 1
            else:
                word_count[word] = word_count[word] + 1
    input_file.close()
    return word_count

def print_words(filename):
    word_count = word_count_dict(filename)
    words = sorted(word_count.keys())
    for word in words:
        print word, word_count[word]
If you want the output sorted by value (while keeping the keys), the simplest approach is to sort the items (key-value pairs), using a key argument to sorted that sorts on the value, then iterate the result. So for your example, you'd replace:
words = sorted(word_count.keys())
for word in words:
    print word, word_count[word]
with (adding from operator import itemgetter to the top of the module):
# key=itemgetter(1) means the sort key is the second value in each key-value
# tuple, meaning the value
sorted_word_counts = sorted(word_count.items(), key=itemgetter(1))
for word, count in sorted_word_counts:
    print word, count
The first thing to note is that dictionaries are not ordered by value (and historically were not ordered at all; since Python 3.7 they preserve insertion order, which still does not help here). Therefore, it is good practice to convert your dict to a list of tuples ordered in some way.
The below function will help you convert a dictionary to a list of tuples ordered by values.
d = {'a': 5, 'b': 1, 'c': 7, 'd': 3}

def order_by_values(dct):
    rev = sorted((v, k) for k, v in dct.items())
    return [t[::-1] for t in rev]

order_by_values(d)  # [('b', 1), ('d', 3), ('a', 5), ('c', 7)]
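Applied to the question's scenario, a rough sketch (reusing word_count_dict and keeping the question's Python 2 print statement) would be:

word_count = word_count_dict(filename)
for word, count in order_by_values(word_count):
    print word, count  # words printed from least to most frequent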

Sub-dictionary erroneously repeated throughout dictionary?

I'm trying to store in a dictionary the number of times a given letter occurs after another given letter. For example, dictionary['a']['d'] would give me the number of times 'd' follows 'a' in short_list.
alphabet = ['a', 'b', 'c', 'd', 'e', 'f', 'g', 'h', 'i', 'j', 'k', 'l', 'm', 'n', 'o', 'p', 'q', 'r', 's', 't', 'u', 'v', 'w', 'x', 'y', 'z']
short_list = ['ford', 'hello', 'orange', 'apple']

# dictionary to keep track of how often a given letter occurs
tally = {}
for a in alphabet:
    tally[a] = 0

# dictionary to keep track of how often a given letter occurs after a given letter
# e.g. how many times does 'd' follow 'a' -- master_dict['a']['d']
master_dict = {}
for a in alphabet:
    master_dict[a] = tally

def precedingLetter(letter, word):
    if word.index(letter) == 0:
        return
    else:
        return word[word.index(letter)-1]

for a in alphabet:
    for word in short_list:
        for b in alphabet:
            if precedingLetter(b, word) == a:
                master_dict[a][b] += 1
However, the entries for all of the letters (the keys) in master_dict are all the same. I can't think of another way to properly tally each letter's occurrence after another letter. Can anyone offer some insight here?
If the sub-dicts are all supposed to be updated independently after creation, you need to shallow copy them. Easiest/fastest way is with .copy():
for a in alphabet:
    master_dict[a] = tally.copy()
The other approach is to initialize the dict lazily. The easiest way to do that is with defaultdict:
from collections import defaultdict
masterdict = defaultdict(lambda: defaultdict(int))
# or
from collections import Counter, defaultdict
masterdict = defaultdict(Counter)
No need to pre-create empty tallies or populate masterdict at all, and this avoids creating dicts when the letter never occurs. If you access masterdict[a] for an a that doesn't yet exist, it creates a defaultdict(int) value for it automatically. When masterdict[a][b] is accessed and doesn't exist, the count is initialized to 0 automatically.
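As a quick illustration of that auto-initialization (my own toy example, not part of the original answer):

from collections import defaultdict

masterdict = defaultdict(lambda: defaultdict(int))
masterdict['a']['d'] += 1
print(masterdict['a']['d'])  # 1
print(masterdict['q']['x'])  # 0 -- inner dict and count created on first access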
In addition to the first answer, it can be handy to perform your search the other way around: instead of looking for each possible pair of letters, iterate just over the words.
In combination with defaultdict this simplifies the process. As an example:
from collections import defaultdict

short_list = ['ford', 'hello', 'orange', 'apple']
master_dict = defaultdict(lambda: defaultdict(int))

for word in short_list:
    for i in range(0, len(word)-1):
        master_dict[word[i]][word[i+1]] += 1
Now master_dict contains counts for all letter combinations that occurred, while returning zero for any other pair. A few examples:
print(master_dict["f"]["o"]) # ==> 1
print(master_dict["o"]["r"]) # ==> 2
print(master_dict["a"]["a"]) # ==> 0
The problem you ask about is that master_dict[a] = tally only gives the same object another name, so updating it through any of the references updates them all. You can fix that by making a copy of it each time with master_dict[a] = tally.copy(), as already pointed out in #ShadowRanger's answer.
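A quick toy example of that aliasing (not from the original posts):

tally = {'a': 0}
alias = tally               # same dict object, second name
alias['a'] += 1
print(tally['a'])           # 1 -- both names see the change
independent = tally.copy()  # shallow copy, separate object
independent['a'] += 1
print(tally['a'])           # still 1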
As #ShadowRanger goes on to point out, it would also be considerably less wasteful to make your master_dict a defaultdict(lambda: defaultdict(int)), because doing so only allocates and initializes counts for the combinations that are actually encountered, rather than for all possible two-letter permutations (if it is used properly).
To give you a concrete idea of the savings, consider that there are only 15 unique letter pairs in your sample short_list of words, yet the exhaustive approach would still create and initialize 26 placeholders in 26 dictionaries for all 676 possible counts.
It also occurs to me that you don't really need a two-level dictionary at all to accomplish what you want, since the same thing can be done with a single dictionary whose keys are tuples of pairs of characters.
Beyond that, another important improvement, as pointed out in #AdmPicard's answer, is that your approach of iterating through all possible permutations and checking whether any pair of them occurs in each word via the precedingLetter() function is significantly more time-consuming than simply iterating over the successive pairs of letters that actually occur in each word.
So, putting all this advice together would result in something like the following:
from collections import defaultdict
from string import ascii_lowercase

alphabet = set(ascii_lowercase)
short_list = ['ford', 'hello', 'orange', 'apple']

# dictionary to keep track of how often a letter pair occurred after one other.
# e.g. how many times 'd' followed an 'a' -> master_dict[('a','d')]
master_dict = defaultdict(int)

try:
    from itertools import izip
except ImportError:  # Python 3
    izip = zip

def pairwise(iterable):
    "s -> (s0,s1), (s1,s2), (s2, s3), ..."
    a, b = iter(iterable), iter(iterable)  # 2 independent iterators
    next(b, None)  # advance the 2nd one
    return izip(a, b)

for word in short_list:
    for (ch1, ch2) in pairwise(word.lower()):
        if ch1 in alphabet and ch2 in alphabet:
            master_dict[(ch1, ch2)] += 1

# display results
unique_pairs = 0
for (ch1, ch2) in sorted(master_dict):
    print('({},{}): {}'.format(ch1, ch2, master_dict[(ch1, ch2)]))
    unique_pairs += 1

print('A total of {} different letter pairs occurred in'.format(unique_pairs))
print('the words: {}'.format(', '.join(repr(word) for word in short_list)))
Which produces this output from the short_list:
(a,n): 1
(a,p): 1
(e,l): 1
(f,o): 1
(g,e): 1
(h,e): 1
(l,e): 1
(l,l): 1
(l,o): 1
(n,g): 1
(o,r): 2
(p,l): 1
(p,p): 1
(r,a): 1
(r,d): 1
A total of 15 different letter pairs occurred in
the words: 'ford', 'hello', 'orange', 'apple'

Using collections.Counter to count elements in sublists

I have a list of tokenized text sentences (YouTube comments):
sample_tok = [['How', 'does', 'it', 'call', 'them', '?', '\xef\xbb\xbf'],
              ['Thats', 'smart\xef\xbb\xbf'],
              ...  # and sooo on.....
              ['1:45', ':', 'O', '\xef\xbb\xbf']]
Now I want to make a dictionary with the words and the number of times they are mentioned.
from collections import Counter

d = Counter()
for sent in [sample_tok]:
    for words in sent:
        d = Counter(words)
Unfortunately, this just counts the last sublist...
[(':', 1), ('1:45', 1), ('\xef\xbb\xbf', 1), ('O', 1)]
Is there a way to make it count all the tokenized sentences?
You are replacing your counter, not updating it. Each time in the loop you produce a new Counter() instance, discarding the previous copy.
Pass each word in a nested generator expression to your Counter():
d = Counter(word for sublist in sample_tok for word in sublist)
or, if you need to somehow process each sublist first, use Counter.update():
d = Counter()
for sent in [sample_tok]:
    for words in sent:
        d.update(words)
You can use the update method of Counter instances. This counts the passed values and adds them to the counter.
d = Counter()
for sent in [sample_tok]:
    for words in sent:
        d.update(words)
Or you can add the new counter to the old one:
d = Counter()
for sent in [sample_tok]:
    for words in sent:
        d += Counter(words)

Append to a dict of lists with a dict comprehension

Suppose I have a large list of words. For an example:
>>> with open('/usr/share/dict/words') as f:
...     words = [word for word in f.read().split('\n') if word]
If I wanted to build an index by first letter of this word list, this is easy:
d = {}
for word in words:
    if word[0].lower() in 'aeiou':
        d.setdefault(word[0].lower(), []).append(word)
        # You could use defaultdict here too...
Results in something like this:
{'a':[list of 'a' words], 'e':[list of 'e' words], 'i': etc...}
Is there a way to do this with a Python 2.7/3+ dict comprehension? In other words, is it possible with the dict comprehension syntax to append to the list associated with a key as the dict is being built?
i.e.:
index={k[0].lower():XXX for k in words if k[0].lower() in 'aeiou'}
Where XXX performs an append operation or list creation for the key as index is being created.
Edit
Taking the suggestions and benchmarking:
import functools
from collections import defaultdict

def f1():
    d = {}
    for word in words:
        c = word[0].lower()
        if c in 'aeiou':
            d.setdefault(c, []).append(word)

def f2():
    d = {}
    {d.setdefault(word[0].lower(), []).append(word) for word in words
     if word[0].lower() in 'aeiou'}

def f3():
    d = defaultdict(list)
    {d[word[0].lower()].append(word) for word in words
     if word[0].lower() in 'aeiou'}

def f4():
    d = functools.reduce(lambda d, w: d.setdefault(w[0], []).append(w[1]) or d,
                         ((w[0].lower(), w) for w in words
                          if w[0].lower() in 'aeiou'), {})

def f5():
    d = defaultdict(list)
    for word in words:
        c = word[0].lower()
        if c in 'aeiou':
            d[c].append(word)
Produces this benchmark:
     rate/sec     f4      f2      f1      f3      f5
f4         11     --   -21.8%  -31.1%  -31.2%  -41.2%
f2         14   27.8%     --   -11.9%  -12.1%  -24.8%
f1         16   45.1%   13.5%     --    -0.2%  -14.7%
f3         16   45.4%   13.8%    0.2%     --   -14.5%
f5         18   70.0%   33.0%   17.2%   16.9%     --
The straight loop with a defaultdict is fastest, followed by the set comprehension and the loop with setdefault.
Thanks for the ideas!
No - dict comprehensions are designed to generate non-overlapping keys with each iteration; they don't support aggregation. For this particular use case, a loop is the proper way to accomplish the task efficiently (in linear time).
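For reference, a minimal sketch of that linear-time loop (essentially f5 from the benchmark above):

from collections import defaultdict

index = defaultdict(list)
for word in words:
    first = word[0].lower()
    if first in 'aeiou':
        index[first].append(word)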
It is not possible (at least easily or directly) with a dict comprehension.
It is possible, but potentially abusive of the syntax, with a set or list comprehension:
# your code:
d = {}
for word in words:
    if word[0].lower() in 'aeiou':
        d.setdefault(word[0].lower(), []).append(word)

# a side effect set comprehension:
index = {}
r = {index.setdefault(word[0].lower(), []).append(word) for word in words
     if word[0].lower() in 'aeiou'}

print r
print [(k, len(d[k])) for k in sorted(d.keys())]
print [(k, len(index[k])) for k in sorted(index.keys())]
Prints:
set([None])
[('a', 17094), ('e', 8734), ('i', 8797), ('o', 7847), ('u', 16385)]
[('a', 17094), ('e', 8734), ('i', 8797), ('o', 7847), ('u', 16385)]
The set comprehension produces a set holding the results of the setdefault(...).append(...) calls made while iterating over the words list -- which amounts to set([None]) in this case, since append() always returns None. It also produces your desired side effect of building your dict of lists.
It is not as readable (IMHO) as the straight looping construct and should be avoided. It is no shorter and probably not materially faster. This is more interesting trivia about Python than genuinely useful -- maybe to win a bet?
I'd use filter:
>>> words = ['abcd', 'abdef', 'eft', 'egg', 'uck', 'ice']
>>> index = {k.lower(): list(filter(lambda x: x[0].lower() == k.lower(), words)) for k in 'aeiou'}
>>> index
{'a': ['abcd', 'abdef'], 'i': ['ice'], 'e': ['eft', 'egg'], 'u': ['uck'], 'o': []}
This is not exactly a dict comprehension, but:
reduce(lambda d, w: d.setdefault(w[0], []).append(w[1]) or d,
       ((w[0].lower(), w) for w in words
        if w[0].lower() in 'aeiou'), {})
This doesn't answer the dict-comprehension question directly, but it might help someone searching for this problem. In a reduced example, when filling growing lists into a new dictionary on the fly, you can call a function from a list comprehension, which is, admittedly, no better than a loop.
def fill_lists_per_dict_keys(k, v):
    d[k] = (
        v
        if k not in d
        else d[k] + v
    )

# global d
d = {}
# d2 is assumed to be some existing dict whose items get folded into d
out = [fill_lists_per_dict_keys(i[0], [i[1]]) for i in d2.items()]
The out variable is only there to swallow the None returned by each call.
If you ever want to use the new dictionary inside the list comprehension itself at runtime, or if you run into another reason why your dictionary gets overwritten on each loop, consider making it global with global d at the beginning of the script (commented out above because it is not necessary here).

Separating nltk.FreqDist words into two lists?

I have a series of texts that are instances of a custom WebText class. Each text is an object that has a rating (-10 to +10) and a word count (nltk.FreqDist) associated with it:
>>> trainingTexts = [WebText('train1.txt'), WebText('train2.txt'), WebText('train3.txt'), WebText('train4.txt')]
>>> trainingTexts[1].rating
10
>>> trainingTexts[1].freq_dist
<FreqDist: 'the': 60, ',': 49, 'to': 38, 'is': 34, ...>
How can you now get two lists (or dictionaries): one containing every word used exclusively in the positively rated texts (trainingText[].rating>0), and another containing every word used exclusively in the negative texts (trainingText[].rating<0)? And have each list contain the total word counts across all the positive or negative texts, so that you get something like this:
>>> only_positive_words
[('sky', 10), ('good', 9), ('great', 2)...]
>>> only_negative_words
[('earth', 10), ('ski', 9), ('food', 2)...]
I considered using sets, as sets contain unique instances, but I can't see how this can be done with nltk.FreqDist, and on top of that, a set wouldn't be ordered by word frequency. Any ideas?
Ok, let's say you start with this for the purposes of testing:
import nltk

class Rated(object):
    def __init__(self, rating, freq_dist):
        self.rating = rating
        self.freq_dist = freq_dist

a = Rated(5, nltk.FreqDist('the boy sees the dog'.split()))
b = Rated(8, nltk.FreqDist('the cat sees the mouse'.split()))
c = Rated(-3, nltk.FreqDist('some boy likes nothing'.split()))

trainingTexts = [a, b, c]
Then your code would look like:
from collections import defaultdict
from operator import itemgetter

# dictionaries for keeping track of the counts
pos_dict = defaultdict(int)
neg_dict = defaultdict(int)

for r in trainingTexts:
    rating = r.rating
    freq = r.freq_dist
    # choose the appropriate counts dict
    if rating > 0:
        partition = pos_dict
    elif rating < 0:
        partition = neg_dict
    else:
        continue
    # add the information to the correct counts dict
    for word, count in freq.iteritems():
        partition[word] += count

# Turn the counts dictionaries into lists of descending-frequency words
def only_list(counts, filtered):
    return sorted(filter(lambda (w, c): w not in filtered, counts.items()),
                  key=itemgetter(1),
                  reverse=True)

only_positive_words = only_list(pos_dict, neg_dict)
only_negative_words = only_list(neg_dict, pos_dict)
And the result is:
>>> only_positive_words
[('the', 4), ('sees', 2), ('dog', 1), ('cat', 1), ('mouse', 1)]
>>> only_negative_words
[('nothing', 1), ('some', 1), ('likes', 1)]
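Note that the answer above uses Python 2 idioms (freq.iteritems() and a tuple-unpacking lambda). If you are on Python 3, a rough equivalent of only_list would be the following sketch (with freq.items() replacing freq.iteritems() in the loop):

def only_list(counts, filtered):
    # keep only words absent from the other partition, sorted by descending count
    return sorted(((w, c) for w, c in counts.items() if w not in filtered),
                  key=itemgetter(1), reverse=True)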
