Word count from a txt file program - python

I am counting the words in a text file with the following code:
#!/usr/bin/python
# Asker's original script; its indentation was lost when this page was extracted.
# It tallies the whitespace-separated tokens of the file in the `wordcount` dict.
file=open("D:\\zzzz\\names2.txt","r+")
wordcount={}
for word in file.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
# Prints the loop variable together with the whole dict, not one
# "word count" pair per line.
print (word,wordcount)
file.close();
this is giving me the output like this:
>>>
goat {'goat': 2, 'cow': 1, 'Dog': 1, 'lion': 1, 'snake': 1, 'horse': 1, '': 1, 'tiger': 1, 'cat': 2, 'dog': 1}
but I want the output in the following manner:
word wordcount
goat 2
cow 1
dog 1.....
Also, I am getting extra symbols in the output (ï»¿). How can I remove them?

The funny symbols you're encountering are a UTF-8 BOM (Byte Order Mark). To get rid of them, open the file using the correct encoding (I'm assuming you're on Python 3):
file = open(r"D:\zzzz\names2.txt", "r", encoding="utf-8-sig")
Furthermore, for counting, you can use collections.Counter:
from collections import Counter
wordcount = Counter(file.read().split())
Display them with:
>>> for item in wordcount.items(): print("{}\t{}".format(*item))
...
snake 1
lion 2
goat 2
horse 3

#!/usr/bin/python
"""Print every distinct word of a text file with its number of occurrences."""
from collections import Counter

# utf-8-sig drops a leading byte-order mark so it is not glued to the first word.
# The context manager closes the file even if reading fails.
with open("D:\\zzzz\\names2.txt", encoding="utf-8-sig") as source:
    wordcount = Counter(source.read().split())

# One "word count" pair per line.
for k, v in wordcount.items():
    print(k, v)

FILE_NAME = 'file.txt'
# Characters removed before splitting: comma, apostrophe and full stop.
_STRIP_PUNCTUATION = str.maketrans('', '', ",'.")

wordCounter = {}
with open(FILE_NAME, 'r') as fh:
    for line in fh:
        # Drop the punctuation in one pass, lower-case the line, then split
        # it into a list of words on whitespace.
        word_list = line.translate(_STRIP_PUNCTUATION).lower().split()
        for word in word_list:
            # First sighting starts at 1; later sightings add 1.
            wordCounter[word] = wordCounter.get(word, 0) + 1

print('{:15}{:3}'.format('Word', 'Count'))
print('-' * 18)
# Print each word and its number of occurrences.
for word, occurrence in wordCounter.items():
    print('{:15}{:3}'.format(word, occurrence))
#
Word Count
------------------
of 6
examples 2
used 2
development 2
modified 2
open-source 2

import sys
from collections import Counter


def main(argv):
    """Print "word count " for every distinct word of the file named in argv[1].

    Args:
        argv: Command-line arguments; argv[0] is the program name and
            argv[1] the path of the text file to read.

    Returns:
        0 on success, 2 when no file name was supplied.
    """
    if len(argv) < 2:
        print("usage: %s FILE" % argv[0], file=sys.stderr)
        return 2
    # Read-only access is enough; the context manager closes the handle.
    with open(argv[1], "r") as source:
        wordcount = Counter(source.read().split())
    for key, total in wordcount.items():
        print("%s %s " % (key, total))
    return 0


if __name__ == "__main__":
    sys.exit(main(sys.argv))

If you are using GraphLab, you can use this function. It is really powerful:
products['word_count'] = graphlab.text_analytics.count_words(your_text)

#!/usr/bin/python
# Count how often each whitespace-separated word occurs in the file.
wordcount = {}
# Read-only mode is sufficient; `with` closes the file even on error.
with open("D:\\zzzz\\names2.txt", "r") as source:
    for word in source.read().split():
        # get() supplies 0 for a word seen for the first time.
        wordcount[word] = wordcount.get(word, 0) + 1

# One "word count" pair per line.
for k, v in wordcount.items():
    print(k, v)

you can do this:
# Print the number of distinct whitespace-separated words in the file.
# In a raw string a single backslash is already literal, so the path
# separators must not be doubled.
with open(r'D:\zzzz\names2.txt') as source:
    file_split = set(source.read().split())
print(len(file_split))

Below code from Python | How to Count the frequency of a word in the text file? worked for me.
import re

frequency = {}
# Open the sample text file in read mode and lower-case its contents so
# that matching does not depend on capitalization.
with open('sample.txt', 'r') as document_text:
    text = document_text.read().lower()
# A word is a run of 2 to 15 ASCII letters delimited by word boundaries.
pattern = re.findall(r'\b[a-z]{2,15}\b', text)
for word in pattern:
    frequency[word] = frequency.get(word, 0) + 1
# Print every word followed by its frequency.
for words in frequency:
    print(words, frequency[words])
OUTPUT:

from collections import Counter

print("sorted counting values:-")
# NOTE: `filename` is not defined in this snippet; it must be set beforehand.
with open(filename) as source:
    user = Counter(source.read().split())
# Rows are ordered alphabetically by word and printed as (count, word) tuples.
for i, v in sorted(user.items()):
    print((v, i))

Related

Python 3 counter that is ignoring strings with less than x characters

I have a program that counts the words of a text file. Now I want to restrict the counter to strings with more than x characters.
from collections import Counter
input = 'C:/Users/micha/Dropbox/IPCC_Boox/FOD_v1_ch15.txt'
# NOTE: this rebinds the name Counter to a plain dict, hiding the class imported above.
Counter = {}
words = {}
with open(input,'r', encoding='utf-8-sig') as fh:
for line in fh:
# Strip commas, apostrophes and full stops, lower-case the line, split on whitespace.
word_list = line.replace(',','').replace('\'','').replace('.','').lower().split()
for word in word_list:
if word not in Counter:
Counter[word] = 1
else:
Counter[word] = Counter[word] + 1
N = 20
# NOTE: Counter is the dict at this point, so this calls a dict, not collections.Counter.
top_words = Counter(Counter).most_common(N)
for word, frequency in top_words:
print("%s %d" % (word, frequency))
I tried the re code, but it did not work.
re.sub(r'\b\w{1,3}\b')
I dont know how to implement it...
At the end I would like to have an output that ignores all the short words like and, you, be etc.
You could do this more simply with:
for word in word_list:
    if len(word) < 5:  # the word is shorter than 5 characters, for example
        continue  # skip the counting below and move on to the next word
    elif word not in Counter:
        Counter[word] = 1
    else:
        Counter[word] = Counter[word] + 1
Few notes.
1) You import a Counter but don't use it properly (you do a Counter = {} thus overwriting the import).
from collections import Counter
2) Instead of doing several replaces use list comprehension with a set, its faster and only does one (two with the join) iterations instead of several:
sentence = ''.join([char for char in line if char not in {'.', ',', "'"}])
word_list = sentence.split()
3) Use the counter and list comp for length:
c = Counter(word for word in word_list if len(word) > 3)
Thats it.
Counter already does what you want. You can "feed" it with an iterable and this will work.
https://docs.python.org/2/library/collections.html#counter-objects
You can use the filter function too https://docs.python.org/3.7/library/functions.html#filter
It could look like this:
counted = Counter(filter(lambda x: len(x) >= 5, words))

How to count one specific word in Python?

I want to count a specific word in the file.
For example how many times does 'apple' appear in the file.
I tried this:
#!/usr/bin/env python
# Asker's attempt: tallies every whitespace-separated token of log_file.
# `re` is imported but not used in this snippet.
import re
logfile = open("log_file", "r")
wordcount={}
for word in logfile.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
# Prints every counted word with its total (Python 2 print statement).
for k,v in wordcount.items():
print k, v
by replacing 'word' with 'apple', but it still counts all possible words in my file.
Any advice would be greatly appreciated. :)
You could just use str.count() since you only care about occurrences of a single word:
with open("log_file") as f:
    contents = f.read()
# str.count counts non-overlapping substring matches, so "applejack" counts too.
count = contents.count("apple")
However, to avoid some corner cases, such as erroneously counting words like "applejack", I suggest that you use a regex:
import re

with open("log_file") as f:
    contents = f.read()
# \b anchors both ends on a word boundary, so "applejack" is not counted.
count = sum(1 for match in re.finditer(r"\bapple\b", contents))
\b in the regex ensures that the pattern begins and ends on a word boundary (as opposed to a substring within a longer string).
If you only care about one word then you do not need to create a dictionary to keep track of every word count. You can just iterate over the file line-by-line and find the occurrences of the word you are interested in.
#!/usr/bin/env python
# Count the occurrences of one word by scanning the file line by line.
my_word = "apple"
wordcount = 0
with open("log_file", "r") as logfile:
    for line in logfile:
        # list.count adds every occurrence on the line; a plain membership
        # test would add at most 1 per line however often the word repeats.
        wordcount += line.split().count(my_word)
print(my_word, wordcount)
However, if you also want to count all the words, and just print the word count for the word you are interested in then these minor changes to your code should work:
#!/usr/bin/env python
import re

# Tally every whitespace-separated word of the log file.
wordcount = {}
with open("log_file", "r") as logfile:
    for word in logfile.read().split():
        wordcount[word] = wordcount.get(word, 0) + 1
# print only the count for my_word instead of iterating over entire dictionary
my_word = "apple"
# get() reports 0 instead of raising KeyError when the word never occurs.
print(my_word, wordcount.get(my_word, 0))
You can use the Counter dictionary for this
from collections import Counter

with open("log_file", "r") as logfile:
    word_counts = Counter(logfile.read().split())
# Indexing a Counter yields 0 for a missing word, where .get() would give None.
print(word_counts['apple'])
This is an example of counting words in array of words. I am assuming file reader will be pretty much similar.
def count(word, array):
    """Return how many items of *array* are equal to *word*.

    Args:
        word: The value to look for.
        array: Any iterable of items to compare against *word*.

    Returns:
        The number of matching items (0 for an empty iterable).
    """
    return sum(1 for x in array if x == word)


text = 'apple orange kiwi apple orange grape kiwi apple apple'
ar = text.split()
print(count('apple', ar))
def Freq(x, y):
    """Print and return the case-insensitive word frequencies of a text file.

    Args:
        x: Path of the text file to read.
        y: Unused; kept so existing two-argument calls continue to work.

    Returns:
        A dict mapping each lower-cased word to its number of occurrences.
    """
    d = {}
    # The context manager closes the file even if reading fails.
    with open(x, "r") as open_file:
        for line in open_file:
            for i in line.lower().split():
                d[i] = d.get(i, 0) + 1
    print(d)
    return d
# Tally how many lines of text.txt mention each payment method.
cash = 0
visa = 0
amex = 0
with open("text.txt", "r") as fi:
    for line in fi:
        k = line.split()
        print(k)
        # Only the first matching method is counted for a given line.
        if 'Cash' in k:
            cash += 1
        elif 'Visa' in k:
            visa += 1
        elif 'Amex' in k:
            amex += 1
print("# persons paid by cash are:", cash)
print("# persons paid by Visa card are :", visa)
print("#persons paid by Amex card are :", amex)

Python Word Count of Text File

I'm trying to get a count of the frequency of a word in a text file using a Python function. I can get the frequency of all of the words separately, but I'm trying to get a count of specific words by having them in a list. Here's what I have so far, but I am currently stuck.
# Asker's first attempt (indentation lost in extraction).
# NOTE: the function is defined without parameters but is called below with
# 'file.txt'; `fname` is not defined anywhere in this snippet.
def repeatedWords():
with open(fname) as f:
wordcount={}
# NOTE: the inner loop reuses the name `word`, shadowing the outer loop variable,
# so the entries of word_list never take part in the counting.
for word in word_list:
for word in f.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
for k,v in wordcount.items():
print k, v
# NOTE: the list below is written with typographic quotes (‘ ’), not ASCII quotes.
word_list = [‘Emma’, ‘Woodhouse’, ‘father’, ‘Taylor’, ‘Miss’, ‘been’, ‘she’, ‘her’]
repeatedWords('file.txt')
Updated, still showing all words:
# Asker's updated attempt (indentation lost in extraction).
def repeatedWords(fname, word_list):
with open(fname) as f:
wordcount = {}
# NOTE: the inner loop rebinds `word`, so every token read from the file is
# counted and the entries of word_list are not used as a filter.
for word in word_list:
for word in f.read().split():
wordcount[word] = wordcount.get(word, 0) + 1
for k,v in wordcount.items():
print k, v
word_list = ['Emma', 'Woodhouse', 'father', 'Taylor', 'Miss', 'been', 'she', 'her']
repeatedWords('Emma.txt', word_list)
So you want the frequency of only the specific words in that list (Emma, Woodhouse, Father...)? If so, this code might help (try running it):
from collections import Counter

word_list = ['Emma', 'Woodhouse', 'father', 'Taylor', 'Miss', 'been', 'she', 'her']
# I'm using this example text in place of the file you are using.
text = 'This is an example text. It will contain words you are looking for, like Emma, Emma, Emma, Woodhouse, Woodhouse, Father, Father, Taylor,Miss,been,she,her,her,her. I made them repeat to show that the code works.'
text = text.replace(',', ' ')  # these statements remove irrelevant punctuation
text = text.replace('.', '')
text = text.lower()  # lower-case everything so capitalization won't affect the count
# Tally the text once instead of re-scanning it for every word in word_list.
frequencies = Counter(text.split())
for repeatedword in word_list:
    counter = frequencies[repeatedword.lower()]  # 0 when the word never occurs
    print(repeatedword, ':', counter)  # the word from word_list and its frequency
The output shows the frequency of only those words in the list you provided, and that's what you wanted right?
the output produced when run in python3 is:
Emma : 3
Woodhouse : 2
father : 2
Taylor : 1
Miss : 1
been : 1
she : 1
her : 3
The best way to deal with this is to use get method in Python dictionary. It can be like this:
def repeatedWords():
    """Count the words of the file named by the global ``fname``.

    Words listed in ``nonwordlist`` are left out of the tally.

    Returns:
        A dict mapping each remaining word to its number of occurrences.
    """
    # Example list of words not needed
    nonwordlist = {'father', 'Miss', 'been'}
    wordcount = {}
    with open(fname) as f:
        # Read from the handle opened above; one pass over the file suffices.
        for word in f.read().split():
            if word not in nonwordlist:
                wordcount[word] = wordcount.get(word, 0) + 1
    return wordcount


# Put these outside the function repeatedWords
for k, v in repeatedWords().items():
    print(k, v)
The print statement should give you this:
# String literals need ASCII quotes; typographic quotes are a syntax error.
word_list = ['Emma', 'Woodhouse', 'father', 'Taylor', 'Miss', 'been', 'she', 'her']
newDict = {}
for newWord in word_list:
    # get() returns 0 for a word not yet in the dict, so the first sighting becomes 1.
    newDict[newWord] = newDict.get(newWord, 0) + 1
print(newDict)
What this line wordcount[word] = wordcount.get(word, 0) + 1 does is, it first looks for word in the dictionary wordcount, if the word already exists, it gets it's value first and adds 1 to it. If the word does not exist, the value defaults to 0 and at this instance, 1 is added making it the first occurrence of that word having a count of 1.

counting total number of words in a text file

I am new to python and trying to print the total number of words in a text file and the total number of specific words in the file provided by the user.
I tested my code, but it outputs the count of every single word. I need only the overall count of all the words in the file, and also the overall count of the words provided by the user.
Code:
# Asker's script (Python 2: raw_input and the print statement; indentation lost).
name = raw_input("Enter the query x ")
name1 = raw_input("Enter the query y ")
file=open("xmlfil.xml","r+")
wordcount={}
# First pass: tally every whitespace-separated token of the file.
for word in file.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
for k,v in wordcount.items():
print k, v
# NOTE: `name` is reused as the loop variable here, and file.read() is called
# again on the same handle without any seek after the first loop read it.
for name in file.read().split():
if name not in wordcount:
wordcount[name] = 1
else:
wordcount[name] += 1
for k,v in wordcount.items():
print k, v
# NOTE: same pattern as above, this time with `name1`.
for name1 in file.read().split():
if name1 not in wordcount:
wordcount[name1] = 1
else:
wordcount[name1] += 1
for k,v in wordcount.items():
print k, v
# Count all words of the file and, separately, the occurrences of given_words.
given_words = ['The', 'document', '1']
words = {}
count = 0
with open('test.txt', 'r') as MyFile:
    for x in MyFile.read().split():
        count += 1  # every token contributes to the overall total
        if x in given_words:
            words[x] = words.get(x, 0) + 1
print(count, words)
Sample output
17 {'1': 1, 'The': 1, 'document': 1}
Please do not name the variable that holds the result of open() `file`, because you will then overwrite the built-in constructor for the file type.
You can get what you need easily via Counter
from collections import Counter

c = Counter()
# Open in text mode: in binary mode the keys would be bytes on Python 3,
# and the str lookup below would always report 0.
with open('your_file', 'r') as f:
    for ln in f:
        c.update(ln.split())
total = sum(c.values())  # overall number of words in the file
specific = c['your_specific_word']  # 0 when the word never occurs

How to calculate the number of times a word is in a text

I'm new to Python and I don't know how to solve this: write a function that calculates the number of times a word appears in a text. This is my code so far, but I'm stuck. I think I need to find a way to split the text into words, but it's in a list, so I can't do it this way.
# Asker's attempt (Python 2; indentation lost in extraction).
def searcher(file):
f = open(file,"r")
word = raw_input("Write your word: ")
# readlines() returns a list of lines, and a list has no split() method,
# which is the problem described in the question.
text = f.readlines()
text1 = text.split()
counter = 0
for x in text1:
if x == word:
counter = counter +1
print counter
Thanks in advance
Use collections.Counter passing in each line split in individual words.
from collections import Counter

s = "foo foo foobar bar"
# Counter tallies the items of the list produced by split().
print(Counter(s.split()))
Counter({'foo': 2, 'foobar': 1, 'bar': 1})
def searcher(file, word=None):
    """Return how many times a word occurs in a text file, ignoring case.

    Args:
        file: Path of the text file to search.
        word: The word to count. When omitted, the user is prompted for it.

    Returns:
        The number of occurrences; 0 when the word does not occur.
    """
    from collections import Counter

    if word is None:
        word = input("Write your word: ")
    c = Counter()
    with open(file, "r") as f:
        for line in f:
            c.update(line.lower().split())
    # The text was lower-cased above, so the query must be lower-cased as well.
    return c[word.lower()]

Categories

Resources