Here is my code:
def corpus_reading_pos(corpus_name, pos_tag, option="pos"):
pos_tags = []
words = []
tokens_pos = {}
file_count = 0
for root, dirs, files in os.walk(corpus_name):
for file in files:
if file.endswith(".v4_gold_conll"):
with open((os.path.join(root, file))) as f:
pos_tags += [line.split()[4] for line in f if line.strip() and not line.startswith("#")]
with open((os.path.join(root, file))) as g:
words += [line.split()[3] for line in g if line.strip() and not line.startswith("#")]
file_count += 1
for pos in pos_tags:
tokens_pos[pos] = []
words_pos = list(zip(words, pos_tags))
for word in words_pos:
tokens_pos[word[1]] = word[0]
#print(words_pos)
print(tokens_pos)
#print(words)
print("Token count:", len(tokens_pos))
print("File count:", file_count)
I'm trying to create a dictionary that has all of the pos items as keys, and the dictionary values will be all of the words that belong to that specific pos. I'm stuck on the par where for the values in the dictionary, I have to create a list of words, but I can't seem to get there.
In the code, the line tokens_pos[word[1]] = word[0] only adds one word per key, but if I try something like [].append(word[0]), the dictionary returns all values as NONE.
You seem to be doing a lot of double work but to give a solution to your specific question:
for word in words_pos:
tokens_pos[word[1]].append(word[0])
should do what you want to achieve.
with
tokens_pos[word[1]] = word[0]
you are basically overwriting existing values that have the same key, and thus only the last written value with that key will remain in the end.
Related
def inverted_index(doc):
words = word_count(doc)
ln = 0
for word in words:
temp = []
with open(doc) as file:
for line in file:
ln += 1
li = line.split()
if word in li:
temp.append(ln)
words[word] = temp
return words
I am trying to create an inverted index from a text file, where words is a dictionary with all the 19000 unique words in the file. The text file has around 5000+ lines. I want to iterate through the file and dictionary to create the inverted index that has the word followed by line numbers that the word appears but it is taking too long to compile as it is nested for loop. So is there a more efficient way to do this?
Here is my approach to solve this, please read the notes below code for some pragmatic tips.
def inverted_index(doc):
# this will open the file
file = open(doc, encoding='utf8')
f = file.read()
file.seek(0)
# Get number of lines in file
lines = 1
for word in f:
if word == '\n':
lines += 1
print("Number of lines in file is: ", lines) # Just for debuggin, please remove in PROD version
d = {}
for i in range(lines):
line = file.readline()
l = line.lower().split(' ')
for item in l:
if item not in d:
d[item] = [i+1]
if item in d:
d[item].append(i+1)
return d
print(inverted_index('file.txt'))
I would suggest removing stopwords first before creating the inverted index for any meaningful analysis. You can use nltk package for that.
Apologies if this has been addressed before. I can't find any previous answers that address my specific problem, so here it is.
The exercise requires that the user inputs a .txt file name. The code takes that file, and counts the words within it, creating a dictionary of word : count pairs. If the file has already been input, and its words counted, then instead of recounting it, the program refers to the cache, where its previous counts are stored.
My problem is creating a nested dictionary of dictionaries - the cache. The following is what I have so far. At the moment, each new .txt file rewrites the dictionary, and prevents it being used as a cache.
def main():
file = input("Enter the file name: ") #Takes a file input to count the words
d = {} #open dictionary of dictionaries: a cache of word counts]
with open(file) as f:
if f in d: #check if this file is in cache.
for word in sorted(d[f]): #print the result of the word count of an old document.
print("That file has already been assessed:\n%-12s:%5d" % (word, d[f][word]))
else: #count the words in this file and add the count to the cache as a nested list.
d[f] = {} #create a nested dictionary within 'd'.
for line in f: #counts the unique words within the document.
words = line.split()
for word in words:
word = word.rstrip("!'?.,") #clean up punctuation here
word = word.upper() #all words to uppercase here
if word not in d[f]:
d[f][word] = 1
else:
d[f][word] = d[f][word] + 1
for word in sorted(d[f]): #print the result of the word count of a new document.
print("%-12s:%5d" % (word, d[f][word]))
main() #Run code again to try new file.
main()
Easy fix:
d[file] = {}
....
d[file][word] = 1 # and so on
because when you cahnge f d[f] still refers to the same entry in d
Also, you can reuse defaultdict:
from collections import defaultdict
d = defaultdict(lambda x: defaultdict(int))
def count(file):
with (open(file)) as f:
if file not in d:
# this is just list comprehension
[d[file][word.rstrip("!'?.,").upper()] += 1
for word in line.split()
for line in f]
return d[file]
def main():
file = input("Enter the file name: ")
count(file)
if file in d:
print("That file has already been assessed, blah blah")
for word in sorted(d[file]): #print the result of the word count of a new document.
print("%-12s:%5d" % (word, d[f][word]))
if __name__ == "__main__":
main()
Your issue is that you re-initialise the dictionary every time you call main(). You need to declare it outside the loop wherein you ask the user to provide a file name.
The process could also be neatened up a bit using collections.Counter() and string.translate:
from collections import Counter
import string
import os.path
d = {}
while True:
input_file = input("Enter the file name: ")
if not os.path.isfile(input_file):
print('File not found, try again')
continue
if d.get(input_file, None):
print('Already found, top 5 words:')
else:
with open(input_file, 'rb') as f:
d[input_file] = Counter(f.read().upper().translate(None, string.punctuation).split())
for word, freq in sorted(d[input_file].items(), reverse=True, key=lambda x: x[1])[:5]:
print(word.ljust(20) + str(freq).rjust(5))
This will print the top 5 most frequent words and their frequencies for a file. If it has already seen the file, it'll provide a warning as such. Example output:
THE 24
OF 15
AND 12
A 10
MODEL 9
I have a file.txt with thousands of words, and I need to create a new file based on certain parameters, and then sort them a certain way.
Assuming the user imports the proper libraries when they test, what is wrong with my code? (There are 3 separate functions)
For the first, I must create a file with words containing certain letters, and sort them lexicographically, then put them into a new file list.txt.
def getSortedContain(s,ifile,ofile):
toWrite = ""
toWrites = ""
for line in ifile:
word = line[:-1]
if s in word:
toWrite += word + "\n"
newList = []
newList.append(toWrite)
newList.sort()
for h in newList:
toWrites += h
ofile.write(toWrites[:-1])
The second is similar, but must be sorted reverse lexicographically, if the string inputted is NOT in the word.
def getReverseSortedNotContain(s,ifile,ofile):
toWrite = ""
toWrites = ""
for line in ifile:
word = line[:-1]
if s not in word:
toWrite += word + "\n"
newList = []
newList.append(toWrite)
newList.sort()
newList.reverse()
for h in newList:
toWrites += h
ofile.write(toWrites[:-1])
For the third, I must sort words that contain a certain amount of integers, and sort lexicographically by the last character in each word.
def getRhymeSortedCount(n, ifile, ofile):
toWrite = ""
for line in ifile:
word = line[:-1] #gets rid of \n
if len(word) == n:
toWrite += word + "\n"
reversetoWrite = toWrite[::-1]
newList = []
newList.append(toWrite)
newList.sort()
newList.reverse()
for h in newList:
toWrites += h
reversetoWrite = toWrites[::-1]
ofile.write(reversetoWrites[:-1])
Could someone please point me in the right direction for these? Right now they are not sorting as they're supposed to.
There is a lot of stuff that is unclear here so I'll try my best to clean this up.
You're concatenating strings together into one big string then appending that one big string into a list. You then tried to sort your 1-element list. This obviously will do nothing. Instead put all the strings into a list and then sort that list
IE: for your first example do the following:
def getSortedContain(s,ifile,ofile):
words = [word for word in ifile if s in words]
words.sort()
ofile.write("\n".join(words))
I need this to print the corresponding line numbers from the text file.
def index (filename, lst):
infile = open('raven.txt', 'r')
lines = infile.readlines()
words = []
dic = {}
for line in lines:
line_words = line.split(' ')
words.append(line_words)
for i in range(len(words)):
for j in range(len(words[i])):
if words[i][j] in lst:
dic[words[i][j]] = i
return dic
The result:
In: index('raven.txt',['raven', 'mortal', 'dying', 'ghost', 'ghastly', 'evil', 'demon'])
Out: {'dying': 8, 'mortal': 29, 'raven': 77, 'ghost': 8}
(The words above appear in several lines but it's only printing one line and for some it doesn't print anything
Also, it does not count the empty lines in the text file. So 8 should actually be 9 because there's an empty line which it is not counting.)
Please tell me how to fix this.
def index (filename, lst):
infile = open('raven.txt', 'r')
lines = infile.readlines()
words = []
dic = {}
for line in lines:
line_words = line.split(' ')
words.append(line_words)
for i in range(len(words)):
for j in range(len(words[i])):
if words[i][j] in lst:
if words[i][j] not in dic.keys():
dic[words[i][j]] = set()
dic[words[i][j]].add(i + 1) #range starts from 0
return dic
Using a set instead of a list is useful in cases were the word is present several times in the same line.
Use defaultdict to create a list of linenumbers for each line:
from collections import defaultdict
def index(filename, lst):
with open(filename, 'r') as infile:
lines = [line.split() for line in infile]
word2linenumbers = defaultdict(list)
for linenumber, line in enumerate(lines, 1):
for word in line:
if word in lst:
word2linenumbers[word].append(linenumber)
return word2linenumbers
You can also use dict.setdefault to either start a new list for each word or append to an existing list if that word has already been found:
def index(filename, lst):
# For larger lists, checking membership will be asymptotically faster using a set.
lst = set(lst)
dic = {}
with open(filename, 'r') as fobj:
for lineno, line in enumerate(fobj, 1):
words = line.split()
for word in words:
if word in lst:
dic.setdefault(word, []).append(lineno)
return dic
Youre two main problems can be fixed by:
1.) multiple indices: you need to initiate/assign a list as the dict value instead of just a single int. otherwise, each word will be reassigned a new index every time a new line is found with that word.
2.) empty lines SHOULD be read as a line so I think its just an indexing issue. your first line is indexed to 0 since the first number in a range starts at 0.
You can simplify your program as follows:
def index (filename, lst):
wordinds = {key:[] for key in lst} #initiates an empty list for each word
with open(filename,'r') as infile: #why use filename param if you hardcoded the open....
#the with statement is useful. trust.
for linenum,line in enumerate(infile):
for word in line.rstrip().split(): #strip new line and split into words
if word in wordinds:
wordinds[word].append(linenum)
return {x for x in wordinds.iteritems() if x[1]} #filters empty lists
this simplifies everything to nest into one for loop that is enumerated for each line. if you want the first line to be 1 and second line as 2 you would have to change wordinds[word].append(linenum) to ....append(linenum + 1)
EDIT: someone made a good point in another answer to have enumerate(infile,1) to start your enumeration at index 1. thats way cleaner.
I would like to define a function scaryDict() which takes one parameter (a textfile) and returns the words from the textfile in alphabetical order, basically produce a dictionary but does not print any one or two letter words.
Here is what I have so far...it isn't much but I don't know the next step
def scaryDict(fineName):
inFile = open(fileName,'r')
lines = inFile.read()
line = lines.split()
myDict = {}
for word in inFile:
myDict[words] = []
#I am not sure what goes between the line above and below
for x in lines:
print(word, end='\n')
You are doing fine till line = lines.split(). But your for loop must loop through the line array, not the inFile.
for word in line:
if len(word) > 2: # Make sure to check the word length!
myDict[word] = 'something'
I'm not sure what you want with the dictionary (maybe get the word count?), but once you have it, you can get the words you added to it by,
allWords = myDict.keys() # so allWords is now a list of words
And then you can sort allWords to get them in alphabetical order.
allWords.sort()
I would store all of the words into a set (to eliminate dups), then sort that set:
#!/usr/bin/python3
def scaryDict(fileName):
with open(fileName) as inFile:
return sorted(set(word
for line in inFile
for word in line.split()
if len(word) > 2))
scaryWords = scaryDict('frankenstein.txt')
print ('\n'.join(scaryWords))
Also keep in mind as of 2.5 the 'with' file contains an enter and exit methods which can prevent some issues (such as that file never getting closed)
with open(...) as f:
for line in f:
<do something with line>
Unique set
Sort the set
Now you can put it all together.
sorry that i am 3 years late : ) here is my version
def scaryDict():
infile = open('filename', 'r')
content = infile.read()
infile.close()
table = str.maketrans('.`/()|,\';!:"?=-', 15 * ' ')
content = content.translate(table)
words = content.split()
new_words = list()
for word in words:
if len(word) > 2:
new_words.append(word)
new_words = list(set(new_words))
new_words.sort()
for word in new_words:
print(word)