syntax errors on creating wordDictionary of word and occurences - python

Having Attribute error issue on line 32. Requesting some assistance figuring out how to display word and occurrences.
import re
file_object = open('dialog.txt')
# read the file content
fileContents = file_object.read()
# convert fileContents to lowercase
final_dialog = fileContents.lower()
# print(final_dialog)
# replace a-z and spaces with cleanText variable
a_string = final_dialog
cleanText = re.sub("[^0-9a-zA-Z]+", "1", a_string)
# print(cleanText)
# wordlist that contains all words found in cleanText
text_string = cleanText
wordList = re.sub("1"," ", text_string)
# print(wordList)
#wordDictionary to count occurrence of each word to list in wordList
wordDictionary = dict()
#loop through .txt
for line in list(wordList):
# remove spaces and newline characters
line = line.strip()
# split the line into words
words = line.split()
#iterate over each word in line
for word in words.split():
if word not in wordDictionary:
wordDictionary[word] = 1
else:
wordDictionary[word] += 1
# print contents of dictionary
print(word)
# print file content
# print(fileContents)
# close file
# file_object.close()
Having Attribute error issue on line 32. Requesting some assistance figuring out how to display word and occurrences.

I think the error is
for word in words.split():
and should be replaced with
for word in words:
Explanation: words is already a list. A list has no split method, so you'll get an AttributeError when trying to call that method.

Related

How do I make each line in a text file its own dictionary to sort through in Python?

Currently, I have
import re
import string
input_file = open('documents.txt', 'r')
stopwords_file = open('stopwords_en.txt', 'r')
stopwords_list = []
for line in stopwords_file.readlines():
stopwords_list.extend(line.split())
stopwords_set = set(stopwords_list)
word_count = {}
for line in input_file.readlines():
words = line.strip()
words = words.translate(str.maketrans('','', string.punctuation))
words = re.findall('\w+', line)
for word in words:
if word.lower() in stopwords_set:
continue
word = word.lower()
if not word in word_count:
word_count[word] = 1
else:
word_count[word] = word_count[word] + 1
word_index = sorted(word_count.keys())
for word in word_index:
print (word, word_count[word])
What it does is parses through a txt file I have, removes stopwords, and outputs the number of times a word appears in the document it is reading from.
The problem is that the txt file is not one file, but five.
The text in the document looks something like this:
1
The cat in the hat was on the mat
2
The rat on the mat sat
3
The bat was fat and named Pat
Each "document" is a line preceded by the document ID number.
In Python, I want to find a way to go through 1, 2, and 3 and count how many times a word appears in an individual document, as well as the total amount of times a word appears in the whole text file - which my code currently does.
i.e Mat appears 2 times in the text document. It appears in Document 1 and Document 2 Ideally less wordy.
Give this a try:
import re
import string
def count_words(file_name):
word_count = {}
with open(file_name, 'r') as input_file:
for line in input_file:
if line.startswith("document"):
doc_id = line.split()[0]
words = line.strip().split()[1:]
for word in words:
word = word.translate(str.maketrans('','', string.punctuation)).lower()
if word in word_count:
word_count[word][doc_id] = word_count[word].get(doc_id, 0) + 1
else:
word_count[word] = {doc_id: 1}
return word_count
word_count = count_words("documents.txt")
for word, doc_count in word_count.items():
print(f"{word} appears in: {doc_count}")
You have deleted your previous similar question and with it my answer, so I'm not sure if it's a good idea to answer again. I'll give a slightly different answer, without groupby, although I think it was fine.
You could try:
import re
from collections import Counter
from string import punctuation
with open("stopwords_en.txt", "r") as file:
stopwords = set().union(*(line.rstrip().split() for line in file))
translation = str.maketrans("", "", punctuation)
re_new_doc = re.compile(r"(\d+)\s*$")
with open("documents.txt", "r") as file:
word_count, doc_no = {}, 0
for line in file:
match = re_new_doc.match(line)
if match:
doc_no = int(match[1])
continue
line = line.translate(translation)
for word in re.findall(r"\w+", line):
word = word.casefold()
if word in stopwords:
continue
word_count.setdefault(word, []).append(doc_no)
word_count_overall = {word: len(docs) for word, docs in word_count.items()}
word_count_docs = {word: Counter(docs) for word, docs in word_count.items()}
I would make the translation table only once, beforehand, not for each line again.
The regex for the identification of a new document (\d+)\s*$" looks for digits at the beginning of a line and nothing else, except maybe some whitespace, until the line break. You have to adjust it if the identifier follows a different logic.
word_count records each occurrence of a word in a list with the number of the current document.
word_count_overall just takes the length of the resp. lists to get the overall count of a word.
word_count_docs does apply a Counter on the lists to get the counts per document for each word.

count how many times the same word occurs in a txt file

Hello I am struggling with this:
I have a txt file that once open looks like this:
Jim, task1\n
Marc, task3\n
Tom, task4\n
Jim, task2\n
Jim, task6\n
And I want to check how many duplicate names there are. I am interested only in the first field (i.e person name).
I tried to look for an answer on this website, but I could not find anything that helped, as in my case I don't know which name is duplicate, since this file txt will be updated frequently.
As I am new to Python/programming is there a simple way to solve this without using any dictionaries or list comprehensions or without importing modules?
Thank you
same_word_count = 0
with open('tasks.txt','r') as file2:
content = file2.readlines()
for line in content:
split_data = line.split(', ')
user = split_data[0]
word = user
if word == user:
same_word_count -= 1
print(same_word_count)
You can do the following.
word = "Word" # word you want to count
count = 0
with open("temp.txt", 'r') as f:
for line in f:
words = line.split()
for i in words:
if(i==word):
count=count+1
print("Occurrences of the word", word, ":", count)
Or you can get list of all words occurrences
# Open the file in read mode
text = open("sample.txt", "r")
# Create an empty dictionary
d = dict()
# Loop through each line of the file
for line in text:
# Remove the leading spaces and newline character
line = line.strip()
# Convert the characters in line to
# lowercase to avoid case mismatch
line = line.lower()
# Split the line into words
words = line.split(" ")
# Iterate over each word in line
for word in words:
# Check if the word is already in dictionary
if word in d:
# Increment count of word by 1
d[word] = d[word] + 1
else:
# Add the word to dictionary with count 1
d[word] = 1
# Print the contents of dictionary
for key in list(d.keys()):
print(key, ":", d[key])

'str' object has no attribute 'txt'

I'm trying to get this code to work and keep getting
AttributeError: 'str' object has no attribute 'txt'
my code is as written below, I am new to this so any help would be greatly appreciated. I for the life of me cannot figure out what I am doing wrong.
def countFrequency(alice):
# Open file for reading
file = open(alice.txt, "r")
# Create an empty dictionary to store the words and their frequency
wordFreq = {}
# Read file line by line
for line in file:
# Split the line into words
words = line.strip().split()
# Iterate through the list of words
for i in range(len(words)):
# Remove punctuations and special symbols from the word
for ch in '!"#$%&()*+,-./:;<=>?<#[\\]^_`{|}~' :
words[i] = words[i].replace(ch, "")
# Convert the word to lowercase
words[i] = words[i].lower()
# Add the word to the dictionary with a frequency of 1 if it is not already in the dictionary
if words[i] not in wordFreq:
wordFreq[words[i]] = 1
# Increase the frequency of the word by 1 in the dictionary if it is already in the dictionary
else:
wordFreq[words[i]] += 1
# Close the file
file.close()
# Return the dictionary
return wordFreq
if __name__ == "__main__":
# Call the function to get frequency of the words in the file
wordFreq = countFrequency("alice.txt")
# Open file for writing
outFile = open("most_frequent_alice.txt", "w")
# Write the number of unique words to the file
outFile.write("Total number of unique words in the file: " + str(len(wordFreq)) + "\n")
# Write the top 20 most used words and their frequency to the file
outFile.write("\nTop 20 most used words and their frequency:\n\n")
outFile.write("{:<20} {}\n" .format("Word", "Frequency"))
wordFreq = sorted(wordFreq.items(), key = lambda kv:(kv[1], kv[0]), reverse = True)
for i in range(20):
outFile.write("{:<20} {}\n" .format(wordFreq[i][0], str(wordFreq[i][1])))
# Close the file
outFile.close()
file = open("alice.txt", "r")
You missed the quotation, and you might need to give the correct location of that text file too.

Python - Calculating length of string is inaccurate for certain strings only

I'm new to programming and trying to make a basic hangman game. For some reason when calculating the length of a string from a text file some words have the length of the string calculated incorrectly. Some strings have values too high and some too low. I can't seem to figure out why. I have already ensured that there are no spaces in the text file so that the space is counted as a character.
import random
#chooses word from textfile for hangman
def choose_word():
words = []
with open("words.txt", "r") as file:
words = file.readlines()
#number of words in text file
num_words = sum(1 for line in open("words.txt"))
n = random.randint(1, num_words)
#chooses the selected word from the text file
chosen_word = (words[n-1])
print(chosen_word)
#calculates the length of the word
len_word = len(chosen_word)
print(len_word)
choose_word()
#obama returns 5
#snake, turtle, hoodie, all return 7
#intentions returns 11
#racecar returns 8
words.txt
snake
racecar
turtle
cowboy
intentions
hoodie
obama
Use strip().
string.strip(s[, chars])
Return a copy of the string with leading and trailing characters removed. If chars is omitted or None, whitespace characters are removed. If given and not None, chars must be a string; the characters in the string will be stripped from the both ends of the string this method is called on.
Example:
>>> ' Hello'.strip()
'Hello'
Try this:
import random
#chooses word from textfile for hangman
def choose_word():
words = []
with open("words.txt", "r") as file:
words = file.readlines()
#number of words in text file
num_words = sum(1 for line in open("words.txt"))
n = random.randint(1, num_words)
#chooses the selected word from the text file
chosen_word = (words[n-1].strip())
print(chosen_word)
#calculates the length of the word
len_word = len(chosen_word)
print(len_word)
choose_word()
You are reading a random line from a text file.
Probably you have spaces in some lines after the words in those lines.
For example, the word "snake" is written in the file as "snake ", so it has length of 7.
To solve it you can either:
A) Manually or by a script remove the spaces in the file
B) When you read a random line from the text, before you check the length of the word, write: chosen_word = chosen_word.replace(" ", "").
This will remove the spaces from your word.
You need to strip all spaces from each line. This removes the beginning and trailing spaces. Here is your corrected code.
import random
# chooses word from textfile for hangman
def choose_word():
words = []
with open("./words.txt", "r") as file:
words = file.readlines()
# number of words in text file
num_words = sum(1 for line in open("words.txt"))
n = random.randint(1, num_words)
# chooses the selected word from the text file
# Added strip() to remove spaces surrounding your words
chosen_word = (words[n-1]).strip()
print(chosen_word)
# calculates the length of the word
len_word = len(chosen_word)
print(len_word)
choose_word()
Im supposing that the .txt file contains one word per line and without commas.
Maybe try to change some things here:
First, notice that the readlines() method is returning a list with all the lines but that also includes the newline string "\n".
# This deletes the newline from each line
# strip() also matches new lines as Hampus Larsson suggested
words = [x.strip() for x in file.readlines()]
You can calculate the number of words from the length of the words list itself:
num_words = len(words)
You do not need parenthesis to get the random word
chosen_word = words[n]
It should now work correctly!
in the file everyword has an \n to symbolize a new line.
in order to cut that out you have to replace:
chosen_word = (words[n-1])
by
chosen_word = (words[n-1][:-1])
this will cut of the last two letters of the chosen word!

Extracting the most common words and then append to a csv file with python

So i'm trying to extract the most used words from a .txt file, and then put the 4 most common words into a csv file. (and then append if need be), At the moment it's extracting the most common words, and appending to a csv file. But it's appending each letter to a cell.
python
import collections
import pandas as pd
import matplotlib.pyplot as plt
import csv
fields=['first','second','third']
# Read input file, note the encoding is specified here
# It may be different in your text file
file = open('pyTest.txt', encoding="utf8")
a= file.read()
# Stopwords
stopwords = set(line.strip() for line in open('stopwords.txt'))
stopwords = stopwords.union(set(['mr','mrs','one','two','said']))
# Instantiate a dictionary, and for every word in the file,
# Add to the dictionary if it doesn't exist. If it does, increase the count.
wordcount = {}
# To eliminate duplicates, remember to split by punctuation, and use case demiliters.
for word in a.lower().split():
word = word.replace(".","")
word = word.replace(",","")
word = word.replace(":","")
word = word.replace("\"","")
word = word.replace("!","")
word = word.replace("“","")
word = word.replace("‘","")
word = word.replace("*","")
if word not in stopwords:
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
# Print most common word
n_print = int(input("How many most common words to print: "))
print("\nOK. The {} most common words are as follows\n".format(n_print))
word_counter = collections.Counter(wordcount)
for word in word_counter.most_common(n_print):
print(word[0])
# Close the file
file.close()
with open('old.csv', 'a') as out_file:
writer = csv.writer(out_file)
for word in word_counter.most_common(4):
print(word)
writer.writerow(word[0])
Output csv file
p,i,p,e
d,i,a,m,e,t,e,r
f,i,t,t,i,n,g,s
o,u,t,s,i,d,e
You can use a generator expression to extract the first item of each sub-list in the list returned by the most_common method as a row instead:
with open('old.csv', 'a') as out_file:
writer = csv.writer(out_file)
writer.writerow(word for word, _ in word_counter.most_common(4))

Categories

Resources