Python - Dictionary of sets with the same key

I'm trying to create a function that will read a text file that has one word on each line, like
afd
asmv
adsasd
It will take words of a user-given length and construct a Python dictionary where each key is a string of the word's letters in sorted order. The values will be a set of all words that share the same key. So far I have:
def setdict():
    wordfile = argv[1]
    open(wordfile, "r")
    setdict = {}
    for line in wordfile:
        words = line.split()
        for word in words:
            word = word.rstrip("\n")
            if word == wordlength:
                key = str(sorted(word))
I'm a little lost on how to create the sets with words that have the same key and put them in the dictionary. Any help would be appreciated.

collections.defaultdict is useful here:
from collections import defaultdict
from pprint import pprint

words = defaultdict(set)
with open('input.txt') as input_file:
    for line in input_file:
        for word in line.split():
            sorted_list = sorted(word)
            sorted_str = ''.join(sorted_list)
            words[sorted_str].add(word)
pprint(words)
Of course, anything you can do with defaultdict, you can also do with dict.setdefault():
words = dict()
with open('input.txt') as input_file:
    for line in input_file:
        for word in line.split():
            sorted_list = sorted(word)
            sorted_str = ''.join(sorted_list)
            words.setdefault(sorted_str, set()).add(word)
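Note that neither snippet applies the word-length filter from the question; they group every word in the file. A minimal sketch that adds the filter back (the function name, the path parameter, and the wordlength parameter are assumptions carried over from the question, not part of either answer):
from collections import defaultdict

def build_anagram_sets(path, wordlength):
    groups = defaultdict(set)
    with open(path) as wordfile:
        for line in wordfile:
            for word in line.split():
                # keep only words of the requested length, as the question asks
                if len(word) == wordlength:
                    groups[''.join(sorted(word))].add(word)
    return groups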

Start with something simple:
words = ["hello", "python", "world"]
my_dict = {}
for word in words:
    key = ''.join(sorted(word))  # join the sorted letters; a list isn't hashable, so it can't be a dict key
    try:
        my_dict[key].append(word)
    except KeyError:
        my_dict[key] = [word]
Now, instead of using predefined words, read them from a file:
words = [w for line in open("some_word_file.txt") for w in line.split()]

The key here is to access the dictionary with a for loop that makes the value available for manipulation. You can solve your problem by reading the file line by line (readline) and checking the following:
key = ''.join(sorted(word))
for existing_key, value in my_dict.items():
    if key == existing_key:
        value.append(word)
        break
else:
    # the for/else falls through to here only when no existing key matched
    my_dict[key] = [word]


Count the occurrence of specific words on a text file and print the 50 most occurred ones among them

I want to count the occurrences of specific keywords (stored in a .txt file, one word per line) in a text file, and print the 50 most frequent ones. Here's what I did:
from collections import Counter

with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    word_counts = Counter(logfile.read().split())
with open("./key_words.txt", "r", encoding='utf8') as word:
    lines = word.readlines()
    for line in lines:
        count = [word_counts.get('line')]
lst = sorted(count)
print(lst[:50])
It returns this, which doesn't mean anything:
[20]
Any help ?
One Option
from collections import Counter

# Read keywords
with open("./key_words.txt", "r", encoding='utf8') as keyfile:
    # Use a set of keywords (#MisterMiyagi comment)
    keywords = set(keyfile.read().split('\n'))

# Process words
with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    cnts = Counter()
    for line in logfile:
        if line:
            line = line.rstrip()
            # only count keywords
            cnts.update(word for word in line.split() if word in keywords)

# Use Counter most_common to get the most popular 50
print(cnts.most_common(50))
Alternative Using Counter+Regex
A regex is used to separate words from punctuation, i.e. periods, quotes, commas, etc.
import re
from collections import Counter

with open("./key_words.txt", "r", encoding='utf8') as keyfile:
    keywords = keyfile.read().lower().split('\n')

with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    cnts = Counter()
    for line in logfile:
        # use regex to separate words from punctuation
        # lowercase words
        words = map(lambda x: x.lower(), re.findall('[a-zA-Z]+', line, flags=re.A))
        cnts.update(word for word in words if word in keywords)

print(cnts.most_common(50))
Here is what you can do:
from collections import Counter

with open("./Text_file.txt", "r") as file, open("./key_words.txt", "r") as word:
    words1 = [w.strip() for w in file.read().split()]  # store the words from the text file in a list
    words2 = [w.strip() for w in word.read().split()]  # store the words from the key file in a list
    s = [w1 for w1 in words1 if w1 in words2]  # all words from the text file that are also in the key file
    d = Counter(s)  # dictionary mapping each word in s to the number of times it occurs in s
    lst = [w for k, w in sorted([(v, k) for k, v in d.items()], reverse=True)[:50]]
    print(lst)
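As a side note, the sort-and-slice at the end is essentially what Counter.most_common already does, so the last two lines could arguably be replaced with the following (ordering among equal counts may differ slightly):
lst = [w for w, _ in d.most_common(50)]
print(lst)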
In word_counts.get('line'), you are looking up the literal string 'line' on every iteration, which is why your result list has a single value. The following is a modified version of your code that gets the top 50 keywords.
from collections import Counter

with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    word_counts = Counter(logfile.read().split())
wc = dict(word_counts)
kwc = {}  # keyword counts
with open("./key_words.txt", "r", encoding='utf8') as word:
    lines = word.readlines()
    for line in lines:
        line = line.strip()  # assuming each keyword is on its own line, strip the trailing '\n'
        if line in wc.keys():
            kwc.update({line: wc[line]})  # if the keyword is found, add it to kwc
lst = sorted(kwc, key=kwc.get, reverse=True)  # sort keys in decreasing order of their counts
print(lst[:50])
I modified your code - you were close but needed to fix a few things:
You were only storing a single count, not building a list of words. I solved this by making a new dict of words-to-counts, but only for the found keywords.
As others have said, you were using the string literal 'line' instead of line
You weren't stripping the newline from each line - when you use readlines() the \n newline is at the end of each line, so none of your words were being found in your Counter.
So, here's the code. It prints out the keywords in descending order of counts, and just the first 50:
from collections import Counter

with open("./Text_file.txt", "r", encoding='utf8') as logfile:
    word_counts = Counter(logfile.read().split())

found_keywords = {}
with open("./key_words.txt", "r", encoding='utf8') as word:
    lines = word.readlines()
    for line in lines:
        line = line.rstrip()
        count = word_counts[line]
        if count > 0:
            found_keywords[line] = count
>>> print([(k, v) for k, v in sorted(found_keywords.items(), key=lambda item: item[1], reverse=True)][:50])
[('cat', 3), ('dog', 1)]

I'm trying to append multiple values to a key in a dictionary in Python

I am trying to read a file and convert it into a dictionary. After reading, I have to take each word's first character as a key and the word itself as a value. If another word with the same first character comes along, it should be appended to that existing key's values.
import io
file1 = open("text.txt")
line = file1.read()
words = line.split()
Dict = {}
for w in words:
    if w[0] in Dict.keys():
        key1 = w[0]
        wor = str(w)
        Dict.setdefault(key1, [])
        Dict[key1].append(wor)
    else:
        Dict[w[0]] = w
print Dict
Just simplified your code. There is no point in having an else branch if you use setdefault:
words = 'hello how are you'.split()
dictionary = {}
for word in words:
    key = word[0]
    dictionary.setdefault(key, []).append(word)
print dictionary
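For the sample sentence above this should print, give or take key order on Python versions without ordered dicts:
{'h': ['hello', 'how'], 'a': ['are'], 'y': ['you']}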
To get rid of setdefault, use defaultdict:
from collections import defaultdict

words = 'hello how are you'.split()
dictionary = defaultdict(list)
for word in words:
    key = word[0]
    dictionary[key].append(word)
print dictionary.items()

Change a file to a list to a dictionary

I am trying to write code that takes the text of a novel and converts it into a dictionary where the keys are the unique words and the values are the number of occurrences of each word in the text.
For example it could look like: {'the': 25, 'girl': 59, ...etc}
I have been trying to turn the text into a list first and then use Counter to make a dictionary of all the words:
source = open('novel.html', 'r', encoding="UTF-8")
soup = BeautifulSoup(source, 'html.parser')

# make a list of all the words in file, get rid of words that aren't content
mylist = []
mylist.append(soup.find_all('p'))
newlist = filter(None, mylist)

cnt = collections.Counter()
for line in newlist:
    try:
        if line is not None:
            words = line.split(" ")
            for word in line:
                cnt[word] += 1
    except:
        pass
print(cnt)
This code doesn't work because of an error with "NoneType" or it just prints an empty list. I'm wondering if there is an easier way to do what I'm trying to do or how I can fix this code so it won't have this error.
import collections
from bs4 import BeautifulSoup

with open('novel.html', 'r', encoding='UTF-8') as source:
    soup = BeautifulSoup(source, 'html.parser')
    cnt = collections.Counter()
    for tag in soup.find_all('p'):
        for word in tag.string.split():
            word = ''.join(ch for ch in word.lower() if ch.isalnum())
            if word != '':
                cnt[word] += 1
    print(cnt)
The with statement is simply a safer way to open the file.
soup.find_all returns a list of Tags.
tag.string.split() gets all the words (separated by spaces) from the Tag.
word = ''.join(ch for ch in word.lower() if ch.isalnum()) removes punctuation and converts to lowercase, so that 'Hello' and 'hello!' count as the same word.
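One caveat worth noting: tag.string is None whenever a <p> tag contains nested markup (links, emphasis, and so on), which would make tag.string.split() raise an AttributeError. If the novel's paragraphs contain such tags, tag.get_text() is a safer choice; a minimal variation of the loop above, under that assumption:
for tag in soup.find_all('p'):
    for word in tag.get_text().split():
        word = ''.join(ch for ch in word.lower() if ch.isalnum())
        if word != '':
            cnt[word] += 1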
For the counter, just do:
from collections import Counter
cnt = Counter(mylist)
Are you sure your list is getting items to begin with? After what step are you getting an empty list?
Once you've converted your page to a list, try something like this out:
# create dictionary and fake list
d = {}
x = ["hi", "hi", "hello", "hey", "hi", "hello", "hey", "hi"]

# count the times a unique word occurs and add that pair to your dictionary
for word in set(x):
    count = x.count(word)
    d[word] = count
Output:
{'hello': 2, 'hey': 2, 'hi': 4}
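Since x.count(word) rescans the whole list for every unique word, the Counter approach mentioned above does the same job in a single pass; a minimal equivalent for the fake list:
from collections import Counter

d = dict(Counter(x))  # {'hi': 4, 'hello': 2, 'hey': 2}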

Why am I getting an empty dictionary?

I am learning python from an introductory Python textbook and I am stuck on the following problem:
You will implement function index() that takes as input the name of a text file and a list of words. For every word in the list, your function will find the lines in the text file where the word occurs and print the corresponding line numbers.
Ex:
>>> index('raven.txt', ['raven', 'mortal', 'dying', 'ghost', 'ghastly', 'evil', 'demon'])
ghost 9
dying 9
demon 122
evil 99, 106
ghastly 82
mortal 30
raven 44, 53, 55, 64, 78, 97, 104, 111, 118, 120
Here is my attempt at the problem:
def index(filename, lst):
    infile = open(filename, 'r')
    lines = infile.readlines()
    lst = []
    dic = {}
    for line in lines:
        words = line.split()
        lst.append(words)
    for i in range(len(lst)):
        for j in range(len(lst[i])):
            if lst[i][j] in lst:
                dic[lst[i][j]] = i
    return dic
When I run the function, I get back an empty dictionary. I do not understand why I am getting an empty dictionary. So what is wrong with my function? Thanks.
You are overwriting the value of lst. You use it as both a parameter to the function (in which case it is a list of strings) and as the list of words in the file (in which case it's a list of lists of strings). When you do:
if lst[i][j] in lst
The comparison always returns False because lst[i][j] is a str, but lst contains only lists of strings, not strings themselves. This means that the assignment to the dic is never executed and you get an empty dict as result.
To avoid this you should use a different name for the list in which you store the words, for example:
In [4]: !echo 'a b c\nd e f' > test.txt
In [5]: def index(filename, lst):
...: infile = open(filename, 'r')
...: lines = infile.readlines()
...: words = []
...: dic = {}
...: for line in lines:
...: line_words = line.split()
...: words.append(line_words)
...: for i in range(len(words)):
...: for j in range(len(words[i])):
...: if words[i][j] in lst:
...: dic[words[i][j]] = i
...: return dic
...:
In [6]: index('test.txt', ['a', 'b', 'c'])
Out[6]: {'a': 0, 'c': 0, 'b': 0}
There are also a lot of things you can change.
When you want to iterate a list you don't have to explicitly use indexes. If you need the index you can use enumerate:
for i, line_words in enumerate(words):
    for word in line_words:
        if word in lst:
            dic[word] = i
You can also iterate directly on a file (refer to Reading and Writing Files section of the python tutorial for a bit more information):
# use the with statement to make sure that the file gets closed
with open('test.txt') as infile:
    for i, line in enumerate(infile):
        print('Line {}: {}'.format(i, line))
In fact, I don't see why you would first build that list of lists of words. Just iterate over the file directly while building the dictionary:
def index(filename, lst):
    with open(filename, 'r') as infile:
        dic = {}
        for i, line in enumerate(infile):
            for word in line.split():
                if word in lst:
                    dic[word] = i
        return dic
Your dic values should be lists, since more than one line can contain the same word. As it stands your dic would only store the last line where a word is found:
from collections import defaultdict

def index(filename, words):
    # make the "in" check below faster
    words = frozenset(words)
    with open(filename) as infile:
        dic = defaultdict(list)
        for i, line in enumerate(infile):
            for word in line.split():
                if word in words:
                    dic[word].append(i)
        return dic
If you don't want to use collections.defaultdict you can replace dic = defaultdict(list) with dic = {} and then change:
dic[word].append(i)
to:
if word in dic:
    dic[word].append(i)
else:
    dic[word] = [i]
Or, alternatively, you can use dict.setdefault:
dic.setdefault(word, []).append(i)
although this last way is a bit slower than the original code.
Note that all these solutions have the property that if a word isn't found in the file, it will not appear in the result at all. However, you may want it in the result with an empty list as its value. In that case it's simpler to prefill the dict with empty lists before starting the loop, as in:
dic = {word: [] for word in words}
for i, line in enumerate(infile):
    for word in line.split():
        if word in words:
            dic[word].append(i)
Refer to the documentation about List Comprehensions and Dictionaries to understand the first line.
You can also iterate over words instead of the line, like this:
dic = {word: [] for word in words}
for i, line in enumerate(infile):
    for word in words:
        if word in line.split():
            dic[word].append(i)
Note however that this is going to be slower because:
line.split() returns a list, so word in line.split() has to scan the whole list.
You are recomputing line.split() for every word.
You can try to solve both problems by splitting each line only once and turning it into a set:
dic = {word: [] for word in words}
for i, line in enumerate(infile):
    line_words = frozenset(line.split())
    for word in words:
        if word in line_words:
            dic[word].append(i)
Note that here we are iterating once over line.split() to build the set and also over words. Depending on the sizes of the two sets this may be slower or faster than the original version (iterating over line.split()).
However at this point it's probably faster to intersect the sets:
dic = {word: [] for word in words}
for i, line in enumerate(infile):
    line_words = frozenset(line.split())
    for word in words & line_words:  # & stands for set intersection
        dic[word].append(i)
Try this,
def index(filename, lst):
    dic = {w: [] for w in lst}
    for n, line in enumerate(open(filename, 'r')):
        for word in lst:
            if word in line.split(' '):
                dic[word].append(n + 1)
    return dic
There are some features of the language introduced here that you should be aware of because they will make life a lot easier in the long run.
The first is a dictionary comprehension. It basically initializes a dictionary using the words in lst as keys and an empty list [] as the value for each key.
Next, the enumerate function. This lets us iterate over the items in a sequence while also getting the index of each item. In this case, because we passed a file object to enumerate, it loops over the lines. On each iteration, n is the 0-based index of the line and line is the line itself. Next we iterate over the words in lst.
Notice that we don't need any indices here. Python encourages looping over the objects in a sequence rather than looping over indices and then accessing the objects by index (for example, it discourages for i in range(len(lst)): do something with lst[i]).
Finally, the in operator is a very straightforward way to test membership for many types of objects and the syntax is very intuitive. In this case, we are asking is the current word from lst in the current line.
Note that we use line.split(' ') to get a list of the words in the line. If we don't do this, 'the' in 'there was a ghost' would return True as the is a substring of one of the words.
On the other hand, 'the' in ['there', 'was', 'a', 'ghost'] returns False. If the condition is True, we append the line number to the list associated with that key in our dictionary.
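A quick interactive check of that distinction:
>>> 'the' in 'there was a ghost'
True
>>> 'the' in 'there was a ghost'.split(' ')
False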
That might be a lot to chew on, but these concepts make problems like this more straightforward.
First, the function parameter holding the words to look for is named lst, and the list where you collect the words from the file is also named lst, so you lose the words passed to your function: on line 4 you rebind the name to a new empty list.
Second, you are iterating over each line in the file (the first for) and collecting the words in that line, so afterwards lst holds one list of words per line of the file. In the for i ... loop you then iterate over those per-line lists, and the inner for j iterates over the words inside each of them.
In short, in that if you are asking "is this word in the list of per-line word lists?", which it never is, so the dict never gets filled.
for i in range(len(lst)):
    if words[i] in lst:
        dic[words[i]] = dic[words[i]] + i  # to count repetitions
You need to rethink the problem. Even my snippet will fail, because the word won't exist in the dict yet and that lookup gives an error, but you get the point. Good luck!
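For what it's worth, a minimal sketch of how to avoid that KeyError with dict.get, assuming (as the snippet above seems to) that words is a flat list of the file's words and lst is the list of query words; note it counts repetitions rather than recording line numbers:
for i in range(len(words)):
    if words[i] in lst:
        dic[words[i]] = dic.get(words[i], 0) + 1  # default to 0 the first time a word is seen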

Not working: indexing the words in a file in a dict by first letter

I have to write a function based on an open file that has one lowercase word per line. It has to return a dictionary whose keys are single lowercase letters and whose values are lists of the words from the file that start with that letter. (The keys in the dictionary come only from the first letters of words that actually appear in the file.)
This is my code:
def words(file):
    line = file.readline()
    dict = {}
    list = []
    while (line != ""):
        list = line[:].split()
        if line[0] not in dict.keys():
            dict[line[0]] = list
        line = file.readline()
    return dict
However, when I was testing it myself, my function doesn't seem to return all the values. If there are more than two words that start with a certain letter, only the first one shows up as the value in the output. What am I doing wrong?
For example, the function should return:
{'a': ['apple'], 'p': ['peach', 'pear', 'pineapple'],
 'b': ['banana', 'blueberry'], 'o': ['orange'], ...}
... but it returns ...
{'a': ['apple'], 'p': ['pear'],
 'b': ['banana'], 'o': ['orange'], ...}
Try this solution; it handles the case where words starting with the same character appear on more than one line, and it doesn't use defaultdict. I also simplified the function a bit:
def words(file):
    dict = {}
    for line in file:
        lst = line.split()
        dict.setdefault(line[0], []).extend(lst)
    return dict
You aren't adding to the existing list when a first letter shows up again. Try:
if line[0] not in dict.keys():
    dict[line[0]] = list
else:
    dict[line[0]] += list
The specific problem is that dict[line[0]] = list replaces the value for the new key. There are many ways to fix this... I'm happy to provide one, but you asked what was wrong and that's it. Welcome StackOverflow.
It seems like every dictionary entry should be a list. Use the append method on the dictionary key.
Sacrificing performance (to a certain extent) for elegance:
with open(whatever) as f: words = f.read().split()
result = {
    first: [word for word in words if word.startswith(first)]
    for first in set(word[0] for word in words)
}
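A quick usage sketch of that comprehension with an in-memory list instead of a file (key order may vary):
>>> words = ['apple', 'peach', 'pear']
>>> {first: [word for word in words if word.startswith(first)]
...  for first in set(word[0] for word in words)}
{'a': ['apple'], 'p': ['peach', 'pear']}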
Something like this should work
def words(file):
    dct = {}
    for line in file:
        word = line.strip()
        try:
            dct[word[0]].append(word)
        except KeyError:
            dct[word[0]] = [word]
    return dct
The first time a new letter is found there will be a KeyError; subsequent occurrences of the letter will cause the word to be appended to the existing list.
Another approach would be to prepopulate the dict with the keys you need
import string

def words(file):
    dct = dict.fromkeys(string.lowercase, [])
    for line in file:
        word = line.strip()
        dct[word[0]] = dct[word[0]] + [word]
    return dct
I'll leave it as an exercise to work out why dct[word[0]] += [word] won't work
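(A hint for that exercise, as a quick interactive sketch: dict.fromkeys(keys, []) gives every key a reference to the same list object, so an in-place += shows up under every key, while the + version binds the key to a brand-new list. Repr key order may vary on older Pythons.)
>>> d = dict.fromkeys('ab', [])
>>> d['a'] += ['x']          # list += mutates the shared list in place
>>> d
{'a': ['x'], 'b': ['x']}
>>> d['a'] = d['a'] + ['y']  # + builds a new list bound only to 'a'
>>> d
{'a': ['x', 'y'], 'b': ['x']}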
Try this function
def words(file):
    dict = {}
    line = file.readline()
    while (line != ""):
        my_key = line[0].lower()
        dict.setdefault(my_key, []).extend(line.split())
        line = file.readline()
    return dict
