Missing keys from dictionary counting words in list - python

My output is incomplete. There are 3 element which don't count.
# A programm to count words in a string and put them in a dictionary as key = word and value = count
def word_in_str (S):
dict_s = {} # make a empty dict
s = S.lower() # make string lowercase
l = s.split() # split string into a list and separate theme by spase
print (l) # original list contain all words
for word in l:
counter = l.count (str(word))
print (str(word)) # for testing the code, it's value = count
print (counter) # for testing the code, it's key = word
dict_s[str(word)] = counter
l[:] = (value for value in l if value != str(word)) #delete the word after count it
print (l) # for testing the code, it's the list after deleting the word
print (dict_s) # main print code, but there is no ('when', 'young', 'and') in result
if __name__ == '__main__':
word_in_str ('I am tall when I am young and I am short when I am old')
the output for this code is:
['i', 'am', 'tall', 'when', 'i', 'am', 'young', 'and', 'i', 'am', 'short', 'when', 'i', 'am', 'old']
i
4
['am', 'tall', 'when', 'am', 'young', 'and', 'am', 'short', 'when', 'am', 'old']
tall
1
['am', 'when', 'am', 'young', 'and', 'am', 'short', 'when', 'am', 'old']
am
4
['when', 'young', 'and', 'short', 'when', 'old']
short
1
['when', 'young', 'and', 'when', 'old']
old
1
['when', 'young', 'and', 'when'] <==what happened to this words?
{'i': 4, 'tall': 1, 'am': 4, 'short': 1, 'old': 1} <==result without the words above

I think you're over thinking the problem. A Counter already counts elements of an iterable, and it is a type of dict
from collections import Counter
def word_in_str(S):
return dict(Counter(S.split()))
The problem with your code is that your for loop is over l, but then you're attempting to "delete" and reassign l[:], where you don't really need to. Just count and store the dict entry.

def word_in_str (S):
dict_s = {}
s = S.lower()
l = s.split()
for word in l:
counter = l.count (word)
dict_s[word] = counter
print (dict_s)
word_in_str ('I am tall when I am young and I am short when I am old')

Related

python how to record the number of elements in listA that appear in the same order as in listB

I am trying to trace to what extent is listA, listB, listC... similar to the original list. How do I print the number of elements that occur in the same sequence in listA as they occur in the original list?
original_list = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the' 'in', 'space', 'with', 'my', 'football', 'my','dog']
Output:
listA: Count = 2 #'my', 'dog'
listB: Count = 5 #'I', 'live', 'in', 'space', 'with'
listC: Count = 2,4,2 #'I', 'live'
#'in', 'space', 'with', 'my'
#'my', 'dog'
I wrote a function that does the job I think. It might be a bit too complex, but I can't see an easier way at the moment:
original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
def get_sequence_lengths(original_list, comparative_list):
original_options = []
for i in range(len(original_list)):
for j in range(i + 1, len(original_list)):
original_options.append(original_list[i:j + 1])
comparative_options = []
for i in range(len(comparative_list)):
for j in range(i+1, len(comparative_list)):
comparative_options.append(comparative_list[i:j+1])
comparative_options.sort(key=len, reverse=True)
matches = []
while comparative_options:
for option in comparative_options:
if option in original_options:
matches.append(option)
new_comparative_options = comparative_options.copy()
for l in comparative_options:
counter = 0
for v in option:
counter = counter + 1 if v in l else 0
if counter == len(l):
new_comparative_options.remove(l)
break
comparative_options = new_comparative_options
break
if option == comparative_options[-1]:
break
matches = [option for option in original_options if option in matches]
lengths = [len(option) for option in matches]
print(lengths)
print(matches)
return lengths
If you call it with the original list and example lists, it prints the following.
get_sequence_lengths(original, listA) prints [2] [['my', 'dog']].
get_sequence_lengths(original, listB) prints [5] [['I', 'live', 'in', 'space', 'with']].
get_sequence_lengths(original, listC) prints [2, 4, 2] [['I', 'live'], ['in', 'space', 'with', 'my'], ['my', 'dog']].
EDITED
I found this problem fun to do and wanted to explore some other options from the accepted one.
def _get_sequences(inter_dict : dict, list_range : int) -> tuple[set, int]:
occuring = [0] * list_range
for key, indices in inter_dict.items(): # lays out intersecting strings as they occur
for idx in indices:
occuring[idx] = key
_temp_list = []
lengths = []
matches = []
for idx in range(len(occuring)):
item = occuring.pop(0)
if item != 0: # if on python 3.8+ you could use (( item := occuring.pop(0) ) != 0) instead
_temp_list.append(item)
elif (bool(_temp_list) and len(_temp_list) > 1):
matches.append( _temp_list.copy() )
lengths.append( len(_temp_list) )
_temp_list.clear()
elif (bool(_temp_list) and item == 0) and len(_temp_list) == 1: # if its a single occurrence ignore
_temp_list.clear()
if bool(_temp_list) and len(_temp_list) > 1: # ensures no matching strings are missed
matches.append( _temp_list )
lengths.append( len(_temp_list) )
return lengths, matches
def get_intersecting(list_a, list_b) -> tuple[set, int]:
intersecting = set(list_a) & set(list_b) # returns intersecting strings
indices_dict = {}
for item in intersecting:
indices = [ index for index, value in enumerate(list_b) if value == item ] # gets occuring indices of each string
indices_dict[item] = indices
return _get_sequences( indices_dict, len(list_b) )
if __name__ == "__main__":
original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
lengths, matches = get_intersecting(original, listA)
print(lengths, matches) # [2] [['my', 'dog']]
lengths, matches = get_intersecting(original, listB)
print(lengths, matches) # [5] [['I', 'live', 'in', 'space', 'with']]
lengths, matches = get_intersecting(original, listC)
print(lengths, matches) # [2, 4, 2] [['I', 'live'] ['in', 'space', 'with', 'my'] ['my', 'dog']]
EDITED x2
This would probably be my final solution.
def ordered_intersecting(list_a, list_b) -> tuple[int, list]:
matches = []
for item in list_b:
if item in list_a: # while iterating we can just add them to a return list as they appear
matches.append(item)
elif len(matches) > 1: # once we come across an item that does not intersect we know we can yield a return value ( as long as matches are greater than 1 )
yield len(matches), matches.copy() ; matches.clear() # a shallow copy should be good enough, but if needed it can be changed to a deep one
if len(matches) > 1: # catch any remaining matches
yield len(matches), matches
if __name__ == "__main__":
original = ['I', 'live', 'in', 'space', 'with', 'my', 'dog']
listA = ['my', 'name', 'my', 'dog', 'is', 'two', 'years', 'old']
listB = ['how', 'where', 'I', 'live', 'in', 'space', 'with']
listC = ['I', 'live', 'to', 'the', 'in', 'space', 'with', 'my', 'football', 'my', 'dog']
print( list(ordered_intersecting(original, listA)) )
print( list(ordered_intersecting(original, listB)) )
print( list(ordered_intersecting(original, listC)) )

How do I sort dictionary values within a list?

I am continuing with a coding exercise which has me return a dictionary where the key is the length of a word and the value is the word itself. This is done by splitting a text, which is the parameter passed to the get_word_len_dict(text) function and counting the number of characters. The length is then sorted and outputted in print_dict_in_key_order(a_dict).
I get an output like this:
2 : ['to', 'is']
3 : ['why', 'you', 'say', 'are', 'but', 'the', 'wet']
4 : ['does', 'when', 'four', 'they', 'have']
5 : ['there', 'stars', 'check', 'paint']
7 : ['someone', 'believe', 'billion']
Which looks right, but what if I wanted to order the values within the list by alphabetical order? That means that words starting in caps should also be prioritised. For example. ['May', 'and'].
Ideally, I would want an output like this with the values in alphabetical order:
2 : ['is', 'to']
3 : ['are', 'but', 'say', 'the', 'wet', 'why', 'you']
4 : ['does', 'four', 'have', 'they', 'when']
5 : ['check', 'paint', 'stars', 'there']
7 : ['believe', 'billion', 'someone']
I have managed to sort the keys so far within the print_dict_in_key_order(a_dict), but not sure how to go about it if I want to also sort the values?
def get_word_len_dict(text):
dictionary = {}
word_list = text.split()
for word in word_list:
letter = len(word)
dictionary.setdefault(letter,[])
if word not in dictionary[letter]:
dictionary[letter].append(word)
return dictionary
def test_get_word_len_dict():
text = 'why does someone believe you when you say there are four billion stars but they have to check when you say the paint is wet'
the_dict = get_word_len_dict(text)
print_dict_in_key_order(the_dict)
def print_dict_in_key_order(a_dict):
all_keys = list(a_dict.keys())
all_keys.sort()
for key in all_keys:
print(key, ":", a_dict[key])
What you want to do is to group by length and then sort by value (since uppercase letters are "smaller" than lowercase letters when compared lexicographically), then remove duplicates from each group and put everything in a dict comprehension.
Note that itertools.groupby, unlike the analogous function in, say, pandas, will treat noncontiguous groups as distinct, so we need to sort by length first.
Example:
from itertools import groupby
from pprint import pprint
def solution(sentence):
sorted_words = sorted(sentence.split(' '), key=len)
return {length: sorted(set(words)) for length, words in groupby(sorted_words, len)}
sentence = 'Why does someone believe you when you say there are four billion stars but they have to check when you say the paint is wet'
pprint(solution(sentence))
Output:
{2: ['is', 'to'],
3: ['Why', 'are', 'but', 'say', 'the', 'wet', 'you'],
4: ['does', 'four', 'have', 'they', 'when'],
5: ['check', 'paint', 'stars', 'there'],
7: ['believe', 'billion', 'someone']}
Notice that 'Why' comes before the others because it starts with a capital letter, and the rest are sorted alphabetically.
If you want to retain your function structure, you can just sort each list in your dictionary inplace:
def get_word_len_dict(text):
dictionary = {}
word_list = text.split()
for word in word_list:
letter = len(word)
dictionary.setdefault(letter,[])
if word not in dictionary[letter]:
dictionary[letter].append(word)
for words in dictionary.values():
words.sort()
return dictionary
Given this dict
d = {
2: ['to', 'is'],
3: ['why', 'you', 'say', 'are', 'but', 'the', 'wet'],
4: ['does', 'when', 'four', 'they', 'have'],
5: ['there', 'stars', 'check', 'paint'],
7: ['someone', 'believe', 'billion'],
}
You can sort the values like this:
{k: sorted(v) for k, v in d.items()}
Output (via pprint):
{2: ['is', 'to'],
3: ['are', 'but', 'say', 'the', 'wet', 'why', 'you'],
4: ['does', 'four', 'have', 'they', 'when'],
5: ['check', 'paint', 'stars', 'there'],
7: ['believe', 'billion', 'someone']}
Though if you only care about sorting it when printing, just change this line in your code:
print(key, ":", a_dict[key])
to this:
print(key, ":", sorted(a_dict[key]))
d = {
2: ['to', 'is'],
3: ['why', 'you', 'say', 'are', 'but', 'the', 'wet'],
4: ['does', 'when', 'four', 'they', 'have'],
5: ['there', 'stars', 'check', 'paint'],
7: ['someone', 'believe', 'billion'],
}
for i in d:
d[i].sort()
print(d)
output
{
2: ['is', 'to'],
3: ['are', 'but', 'say', 'the', 'wet', 'why', 'you'],
4: ['does', 'four', 'have', 'they', 'when'],
5: ['check', 'paint', 'stars', 'there'],
7: ['believe', 'billion', 'someone']
}

split existed list based on the repeated word

I tried to split a list into new list. Here's the initial list:
initList =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
'title', 'PTE427', 'how', 'are', 'you']
If I want to split the list based on the PTExyz to new list which looks:
newList = ['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']
How should I develop proper algorithm for general case with repeated item PTExyz?
Thank You!
The algorithm will be something like this.
Iterate over the list. Find a the string s that starts with PTE. Assign it to a temp string which is initialized as an empty string. Add every next string s with temp unless that string starts with PTE. In that case, if the temp string is not empty then append it with your result list else add the string with temp.
ls = ['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word', 'title', 'PTE427', 'how', 'are', 'you']
result = []
temp = ''
for s in ls:
if s.startswith('PTE'):
if temp != '':
result.append(temp)
temp = s
else:
if temp == '':
continue
temp += ' ' + s
result.append(temp)
print(result)
Edit
For handling the pattern PTExyz you can use regular expression. In that case the code will be like this where the line is s.startswith('PTE'):
re.match(r'PTE\w{3}$', s)
I think it will work
l =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word','title', 'PTE427', 'how', 'are', 'you']
resultlist = []
s = ' '.join(l)
str = s.split('PTE')
for i in str:
resultlist.append('PTE'+i)
resultlist.remove('PTE')
print resultlist
It works on a regular expression PTExyz
import re
l =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word',
'title', 'PTE427', 'how', 'are', 'you']
pattern = re.compile(r'[P][T][E]\d\d\d')
k = []
for i in l:
if pattern.match(i) is not None:
k.append(i)
s = ' '.join(l)
str = re.split(pattern, s)
str.remove('')
for i in range(len(k)):
str[i] = k[i] + str[i]
print str
>>> list =['PTE123', '', 'I', 'am', 'programmer', 'PTE345', 'based', 'word','title', 'PTE427', 'how', 'are', 'you']
>>> index_list =[ list.index(item) for item in list if "PTE" in item]
>>> index_list.append(len(list))
>>> index_list
[0, 5, 9, 13]
>>> [' '.join(list[index_list[i-1]:index_list[i]]) for i,item in enumerate(index_list) if item > 0 ]
Output
['PTE123 I am programmer', 'PTE345 based word title', 'PTE427 how are you']

Finding duplicates in a list of a list, and adding their values

I'm trying to find the top 50 words that occur within three texts of Shakespeare and the ratio of each words occurrance in, macbeth.txt, allswell.txt, and othello.txt. Here is my code so far:
def byFreq(pair):
return pair[1]
def shakespeare():
counts = {}
A = []
for words in ['macbeth.txt','allswell.txt','othello.txt']:
text = open(words, 'r').read()
test = text.lower()
for ch in '!"$%&()*+,-./:;<=>?#[\\]^_`{|}~':
text = text.replace(ch, ' ')
words = text.split()
for w in words:
counts[w] = counts.get(w, 0) + 1
items = list(counts.items())
items.sort()
items.sort(key=byFreq, reverse = True)
for i in range(50):
word, count = items[i]
count = count / float(len(counts))
A += [[word, count]]
print A
And its output:
>>> shakespeare()
[['the', 0.12929982922664066], ['and', 0.09148572822639668], ['I', 0.08075140278116613], ['of', 0.07684801171017322], ['to', 0.07562820200048792], ['a', 0.05220785557453037], ['you', 0.04415711149060746], ['in', 0.041717492071236886], ['And', 0.04147353012929983], ['my', 0.04147353012929983], ['is', 0.03927787265186631], ['not', 0.03781410100024396], ['that', 0.0358624054647475], ['it', 0.03366674798731398], ['Macb', 0.03342278604537692], ['with', 0.03269090021956575], ['his', 0.03147109050988046], ['be', 0.03025128080019517], ['The', 0.028787509148572824], ['haue', 0.028543547206635766], ['me', 0.027079775555013418], ['your', 0.02683581361307636], ['our', 0.025128080019516955], ['him', 0.021956574774335203], ['Enter', 0.019516955354964626], ['That', 0.019516955354964626], ['for', 0.01927299341302757], ['this', 0.01927299341302757], ['he', 0.018541107587216395], ['To', 0.01780922176140522], ['so', 0.017077335935594046], ['all', 0.0156135642839717], ['What', 0.015369602342034643], ['are', 0.015369602342034643], ['thou', 0.015369602342034643], ['will', 0.015125640400097584], ['Macbeth', 0.014881678458160527], ['thee', 0.014881678458160527], ['But', 0.014637716516223469], ['but', 0.014637716516223469], ['Macd', 0.014149792632349353], ['they', 0.014149792632349353], ['their', 0.013905830690412296], ['we', 0.013905830690412296], ['as', 0.01341790680653818], ['vs', 0.01341790680653818], ['King', 0.013173944864601122], ['on', 0.013173944864601122], ['yet', 0.012198097096852892], ['Rosse', 0.011954135154915833], ['the', 0.15813168261114238], ['I', 0.14279684862127182], ['and', 0.1231007315700619], ['to', 0.10875070343275182], ['of', 0.10481148002250985], ['a', 0.08581879572312887], ['you', 0.08581879572312887], ['my', 0.06992121553179516], ['in', 0.061902082160945414], ['is', 0.05852560495216657], ['not', 0.05486775464265616], ['it', 0.05472706809229038], ['that', 0.05472706809229038], ['his', 0.04727068092290377], ['your', 0.04389420371412493], ['me', 0.043753517163759144], ['be', 0.04305008441193022], ['And', 0.04037703995498031], ['with', 0.038266741699493526], ['him', 0.037703995498030385], ['for', 0.03601575689364097], ['he', 0.03404614518851998], ['The', 0.03137310073157006], ['this', 0.030810354530106922], ['her', 0.029262802476083285], ['will', 0.0291221159257175], ['so', 0.027011817670230726], ['have', 0.02687113111986494], ['our', 0.02687113111986494], ['but', 0.024760832864378166], ['That', 0.02293190770962296], ['PAROLLES', 0.022791221159257174], ['To', 0.021384355655599326], ['all', 0.021384355655599326], ['shall', 0.021102982554867755], ['are', 0.02096229600450197], ['as', 0.02096229600450197], ['thou', 0.02039954980303883], ['Macb', 0.019274057400112548], ['thee', 0.019274057400112548], ['no', 0.01871131119864941], ['But', 0.01842993809791784], ['Enter', 0.01814856499718627], ['BERTRAM', 0.01758581879572313], ['HELENA', 0.01730444569499156], ['we', 0.01730444569499156], ['do', 0.017163759144625774], ['thy', 0.017163759144625774], ['was', 0.01674169949352842], ['haue', 0.016460326392796848], ['I', 0.19463784682531435], ['the', 0.17894627455055595], ['and', 0.1472513769094877], ['to', 0.12989712147978802], ['of', 0.12002494024732412], ['you', 0.1079704873739998], ['a', 0.10339810869791126], ['my', 0.0909279850358516], ['in', 0.07627558973293151], ['not', 0.07159929335965914], ['is', 0.0697287748103502], ['it', 0.0676504208666736], ['that', 0.06733866777512211], ['me', 0.06099968824690845], ['your', 0.0543489556271433], ['And', 0.053205860958121166], ['be', 0.05310194326093734], ['his', 0.05154317780317988], ['with', 0.04769822300737816], ['him', 0.04665904603553985], ['her', 0.04364543281720877], ['for', 0.04322976202847345], ['he', 0.042190585056635144], ['this', 0.04187883196508366], ['will', 0.035332017042502335], ['Iago', 0.03522809934531851], ['so', 0.03356541619037722], ['The', 0.03325366309882573], ['haue', 0.031902733035435935], ['do', 0.03138314454951678], ['but', 0.030240049880494647], ['That', 0.02857736672555336], ['thou', 0.027642107450898887], ['as', 0.027434272056531227], ['To', 0.026810765873428243], ['our', 0.02504416502130313], ['are', 0.024628494232567806], ['But', 0.024420658838200146], ['all', 0.024316741141016316], ['What', 0.024212823443832486], ['shall', 0.024004988049464823], ['on', 0.02265405798607503], ['thee', 0.022134469500155875], ['Enter', 0.021822716408604385], ['thy', 0.021199210225501402], ['no', 0.020783539436766082], ['she', 0.02026395095084693], ['am', 0.02005611555647927], ['by', 0.019848280162111608], ['have', 0.019848280162111608]]
Instead of outputing the top 50 words of all three texts, its outputs the top 50 words of each text, 150 words. Im struggling on trying to delete the duplicates but add their ratios together. For example, in macbeth.txt the word 'the' has a ratio of 0.12929982922664066, allswell.txt has a ratio of 0.15813168261114238, and othello.txt has a ratio of 0.17894627455055595. I want to combine the ratios of all three of them. I;m pretty sure I have to use a for loop but I'm struggling to loop through a list within a list. I am more of a java guy so any help would be appreciated!
You can use a list comprehension and the Counter-class:
from collections import Counter
c = Counter([word for file in ['macbeth.txt','allswell.txt','othello.txt']
for word in open(file).read().split()])
Then you get a dict which maps words to their counts. You can sort them like this:
sorted([(i,v) for v,i in c.items()])
If you want the relative quantities, then you can calculate the total number of words:
numWords = sum([i for (v,i) in c.items()])
and adapt the dict c via a dict-comprehension:
c = { v:(i/numWords) for (v,i) in c.items()}
You're summarizing the count inside your loop over files. Move the summary code outside your for loop.

Build a dictionary from list of lists

I am trying to build an inverted index, i.e. map a text to the document it came from.
It's position within the list/document.
In my case i have parsed list containing lists(i.e list of lists).
My input is like this.
[
['why', 'was', 'cinderella', 'late', 'for', 'the', 'ball', 'she', 'forgot', 'to', 'swing', 'the', 'bat'],
['why', 'is', 'the', 'little', 'duck', 'always', 'so', 'sad', 'because', 'he', 'always', 'sees', 'a', 'bill', 'in', 'front', 'of', 'his', 'face'],
['what', 'has', 'four', 'legs', 'and', 'goes', 'booo', 'a', 'cow', 'with', 'a', 'cold'],
['what', 'is', 'a', 'caterpillar', 'afraid', 'of', 'a', 'dogerpillar'],
['what', 'did', 'the', 'crop', 'say', 'to', 'the', 'farmer', 'why', 'are', 'you', 'always', 'picking', 'on', 'me']
]
This is my code
def create_inverted(mylists):
myDict = {}
for sublist in mylists:
for i in range(len(sublist)):
if sublist[i] in myDict:
myDict[sublist[i]].append(i)
else:
myDict[sublist[i]] = [i]
return myDict
It does build the dictionary, but when i do a search i am not getting the correct
result. I am trying to do something like this.
documents = [['owl', 'lion'], ['lion', 'deer'], ['owl', 'leopard']]
index = {'owl': [0, 2],
'lion': [0, 1], # IDs are sorted.
'deer': [1],
'leopard': [2]}
def indexed_search(documents, index, query):
return [documents[doc_id] for doc_id in index[query]]
print indexed_search(documents, index, 'lion')
Where i can enter search text and it gets the list ids.
Any Ideas.
You're mapping each word to the positions it was found in in each document, not which document it was found in. You should store indexes into the list of documents instead of indexes into the documents themselves, or perhaps just map words to documents directly instead of to indices:
def create_inverted_index(documents):
index = {}
for i, document in enumerate(documents):
for word in set(document):
if word in index:
index[word].append(i)
else:
index[word] = [i]
return index
Most of this is the same as your code. The main differences are in these two lines:
for i, document in enumerate(documents):
for word in set(document):
which correspond to the following part of your code:
for sublist in mylists:
for i in range(len(sublist)):
enumerate iterates over the indices and elements of a sequence. Since enumerate is on the outer loop, i in my code is the index of the document, while i in your code is the index of a word within a document.
set(document) creates a set of the words in the document, where each word appears only once. This ensures that each word is only counted once per document, rather than having 10 occurrences of 2 in the list for 'Cheetos' if 'Cheetos' appears in document 2 10 times.
At first I would extract all possible words and store them in one set.
Then I look up each word in each list and collect all the indexes of lists the word happens to be in...
source = [
['why', 'was', 'cinderella', 'late', 'for', 'the', 'ball', 'she', 'forgot', 'to', 'swing', 'the', 'bat'],
['why', 'is', 'the', 'little', 'duck', 'always', 'so', 'sad', 'because', 'he', 'always', 'sees', 'a', 'bill', 'in', 'front', 'of', 'his', 'face'],
['what', 'has', 'four', 'legs', 'and', 'goes', 'booo', 'a', 'cow', 'with', 'a', 'cold'],
['what', 'is', 'a', 'caterpillar', 'afraid', 'of', 'a', 'dogerpillar'],
['what', 'did', 'the', 'crop', 'say', 'to', 'the', 'farmer', 'why', 'are', 'you', 'always', 'picking', 'on', 'me']
]
allWords = set(word for lst in source for word in lst)
wordDict = { word: [
i for i, lst in enumerate(source) if word in lst
] for word in allWords }
print wordDict
Out[30]:
{'a': [1, 2, 3],
'afraid': [3],
'always': [1, 4],
'and': [2],
...
This is straightforward as long you don't need efficient code:
documents = [['owl', 'lion'], ['lion', 'deer'], ['owl', 'leopard']]
def index(docs):
doc_index = {}
for doc_id, doc in enumerate(docs, 1):
for term_pos, term in enumerate(doc, 1):
doc_index.setdefault(term, {}).setdefault(doc_id, []).append(term_pos)
return doc_index
Now you get a two-level dictionary giving you access to the document ids, and then to the positions of the terms in this document:
>>> index(documents)
{'lion': {1: [2], 2: [1]}, 'leopard': {3: [2]}, 'deer': {2: [2]}, 'owl': {1: [1], 3: [1]}}
This is only a preliminary step for indexing; afterwards, you need to separate the term dictionary from the document postings from the positions postings. Typically, the dictionary is stored in a tree-like structures (there are Python packages for this), and the document postings and positions postings are represented as arrays of unsigned integers.
I'd accumulate the indices into a set to avoid duplicates and then sort
>>> documents = [['owl', 'lion'], ['lion', 'deer'], ['owl', 'leopard']]
>>> from collections import defaultdict
>>> D = defaultdict(set)
>>> for i, doc in enumerate(documents):
... for word in doc:
... D[word].add(i)
...
>>> D ## Take a look at the defaultdict
defaultdict(<class 'set'>, {'owl': {0, 2}, 'leopard': {2}, 'lion': {0, 1}, 'deer': {1}})
>>> {k:sorted(v) for k,v in D.items()}
{'lion': [0, 1], 'owl': [0, 2], 'leopard': [2], 'deer': [1]}

Categories

Resources