I want to count a specific word in the file.
For example how many times does 'apple' appear in the file.
I tried this:
#!/usr/bin/env python
import re
logfile = open("log_file", "r")
wordcount={}
for word in logfile.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
for k,v in wordcount.items():
print k, v
by replacing 'word' with 'apple', but it still counts all possible words in my file.
Any advice would be greatly appreciated. :)
You could just use str.count() since you only care about occurrences of a single word:
# Read the whole file and count raw (substring) matches of "apple".
with open("log_file") as f:
    count = f.read().count("apple")
However, to avoid some corner cases, such as erroneously counting words like "applejack", I suggest that you use a regex:
import re

with open("log_file") as f:
    contents = f.read()
# One list entry per whole-word match, so the list length is the count.
count = len(re.findall(r"\bapple\b", contents))
\b in the regex ensures that the pattern begins and ends on a word boundary (as opposed to a substring within a longer string).
If you only care about one word then you do not need to create a dictionary to keep track of every word count. You can just iterate over the file line-by-line and find the occurrences of the word you are interested in.
#!/usr/bin/env python
# Count how many times my_word appears in log_file, reading it line by line.
my_word = "apple"
wordcount = 0
with open("log_file", "r") as logfile:
    for line in logfile:
        # list.count() tallies every occurrence on the line; a plain
        # `my_word in line.split()` test would add at most 1 per line.
        wordcount += line.split().count(my_word)
print("%s %s" % (my_word, wordcount))
However, if you also want to count all the words, and just print the word count for the word you are interested in then these minor changes to your code should work:
#!/usr/bin/env python
import re

# Tally every whitespace-separated word of log_file.
wordcount = {}
with open("log_file", "r") as logfile:
    for word in logfile.read().split():
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
# print only the count for my_word instead of iterating over entire dictionary
my_word = "apple"
# .get() reports 0 when the word never occurs instead of raising KeyError.
print("%s %s" % (my_word, wordcount.get(my_word, 0)))
You can use the Counter dictionary for this
from collections import Counter

# Count every whitespace-separated word of the file in one pass.
with open("log_file", "r") as logfile:
    word_counts = Counter(logfile.read().split())
# Index the Counter rather than calling .get(): a missing key yields 0,
# whereas .get('apple') would print None.
print(word_counts['apple'])
This is an example of counting the occurrences of a word in a list of words. I assume that reading the words from a file would work in much the same way.
def count(word, array):
    """Return how many items of *array* compare equal to *word*."""
    return sum(1 for item in array if item == word)


text = 'apple orange kiwi apple orange grape kiwi apple apple'
ar = text.split()
print(count('apple', ar))
def Freq(x, y=None):
    """Print and return the case-insensitive word frequencies of a file.

    Args:
        x: Path of the text file to read.
        y: Unused; accepted only so existing two-argument calls keep working.

    Returns:
        A dict mapping each lower-cased, whitespace-separated word to the
        number of times it occurs in the file.
    """
    d = {}
    # The with-block closes the file even if reading fails part-way.
    with open(x, "r") as open_file:
        for line in open_file:
            for i in line.lower().split():
                d[i] = d.get(i, 0) + 1
    print(d)
    return d
# Tally how many lines of text.txt mention each payment method.
cash = 0
visa = 0
amex = 0
# The with-block closes the file even if an error occurs mid-loop.
with open("text.txt", "r") as fi:
    for line in fi:
        k = line.split()
        print(k)
        # A line is attributed to at most one method, checked in this order.
        if 'Cash' in k:
            cash = cash + 1
        elif 'Visa' in k:
            visa = visa + 1
        elif 'Amex' in k:
            amex = amex + 1
print("# persons paid by cash are:", cash)
print("# persons paid by Visa card are :", visa)
print("#persons paid by Amex card are :", amex)
Related
I am making a program, that reads a file and makes a dictionary, that shows how many times a word has been used:
filename = 'for_python.txt'
with open(filename) as file:
contents = file.read().split()
dict = {}
for word in contents:
if word not in dict:
dict[word] = 1
else:
dict[word] += 1
dict = sorted(dict.items(), key=lambda x: x[1], reverse=True)
for i in dict:
print(i[0], i[1])
It works, but it treats words that have commas in them as different words, which I do not want it to do. Is there a simple and efficient way to do this?
Remove all commas before splitting them
filename = 'for_python.txt'
with open(filename) as file:
    text = file.read()
    # Drop every comma first so that "word," and "word" become the same token.
    contents = text.replace(",", "").split()
You are splitting the whole data based on " " as the delimiter but not doing the same for commas. You can do so by splitting the words further using commas. Here's how:
...
for word in contents:
    for new_word in word.split(','):
        # "word,".split(',') yields a trailing '' -- skip such empty pieces
        # so the empty string is not counted as a word.
        if not new_word:
            continue
        if new_word not in dict:
            dict[new_word] = 1
        else:
            dict[new_word] += 1
...
I'd suggest you strip() the punctuation characters from each word before counting it. Also, don't use the built-in name dict as a variable name; it is the dictionary constructor.
import string

words = {}
for word in contents:
    # Remove leading/trailing punctuation so "word," and "word" match.
    word = word.strip(string.punctuation)
    # A token made only of punctuation (e.g. "--") strips down to '';
    # do not count it as a word.
    if not word:
        continue
    if word not in words:
        words[word] = 1
    else:
        words[word] += 1
For your information, collections.Counter exists and does this job for you:
import string
from collections import Counter

filename = 'test.txt'
with open(filename) as file:
    contents = file.read().split()
# Strip surrounding punctuation, then drop tokens that were punctuation only
# (they strip down to '' and would otherwise be counted as a word).
stripped = (word.strip(string.punctuation) for word in contents)
words = Counter(word for word in stripped if word)
for k, v in words.most_common():  # every word, most frequent first
    print(k, v)
for k, v in words.most_common(5):  # only the 5 most frequent words
    print(k, v)
I am trying to read a quote from a text file and find any duplicated words that appear next to each other. The following is the quote:
"He that would make his own liberty liberty secure,
must guard even his enemy from oppression;
for for if he violates this duty, he
he establishes a precedent that will reach to himself."
-- Thomas Paine
The output should be the following:
Found word: "Liberty" on line 1
Found word: "for" on line 3
Found word: "he" on line 4
I have written the code to read the text from the file, but I am having trouble with the code to identify the duplicates. I have tried enumerating each word in the file and checking whether the word at one index is equal to the word at the following index. However, I am getting an index error because the loop continues outside of the index range. Here's what I've come up with so far:
import string
file_str = input("Enter file name: ")
input_file = open(file_str, 'r')
word_list = []
duplicates = []
for line in input_file:
line_list = line_str.split()
for word in line_list:
if word != "--":
word_list.append(word)
for idx, word in enumerate(word_list):
print(idx, word)
if word_list[idx] == word_list[idx + 1]:
duplicates.append(word)
Any help with the current method I'm trying would be appreciated, or suggestions for another method.
When you record the word_list you are losing information about which line the word is on.
Perhaps better would be to determine duplicates as you read the lines.
# Record [word, line_number] for every word that repeats the word before it.
# previous_word is kept across lines so that a repeat straddling a line break
# (a line ending in "he" followed by a line starting with "he") is reported
# on the line where the second copy appears.
previous_word = None
for line_number, line in enumerate(input_file, 1):
    line_list = line.split()
    for word in line_list:
        if word != "--":
            word_list.append(word)
            if word == previous_word:
                duplicates.append([word, line_number])
        previous_word = word
This should do the trick OP. In the for loop over the word list it only goes up to the second to last element now. This won't keep track of the line numbers though, I would use Phillip Martin's solution for that.
import string

file_str = input("Enter file name: ")
word_list = []
duplicates = []
# The with-block closes the file once every word has been collected.
with open(file_str, 'r') as input_file:
    for line in input_file:
        line_list = line.split()
        for word in line_list:
            if word != "--":
                word_list.append(word)
#Here is the change I made > <
# Stop one element early so that idx + 1 is always a valid index.
for idx, word in enumerate(word_list[:-1]):
    print(idx, word)
    if word_list[idx] == word_list[idx + 1]:
        duplicates.append(word)
print(duplicates)
Here's another approach.
from itertools import tee
from collections import defaultdict

# Maps a line number to the set of words that appear twice in a row on it.
dups = defaultdict(set)
with open('file.txt') as f:
    for no, line in enumerate(f, 1):
        it1, it2 = tee(line.split())
        # Advance the second iterator by one so the pairs are (word, next word).
        next(it2, None)
        # Built-in zip works on Python 2 and 3; itertools.izip is Python 2 only.
        for word, follower in zip(it1, it2):
            if word != '--' and word == follower:
                dups[no].add(word)
which yields
>>> dups
defaultdict(<type 'set'>, {1: set(['liberty']), 3: set(['for'])})
which is a dictionary which holds a set of pair-duplicates for each line, e.g.
>>> dups[3]
set(['for'])
(I don't know why you expect "he" to be found on line four, it is certainly not doubled in your sample file.)
I'm trying to get a count of the frequency of a word in a Text File using a python function. I can get the frequency of all of the words separately, but I'm trying to get a count of specific words by having them in a list. Here's what I have so far but I am currently stuck. My
def repeatedWords():
with open(fname) as f:
wordcount={}
for word in word_list:
for word in f.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
for k,v in wordcount.items():
print k, v
word_list = [‘Emma’, ‘Woodhouse’, ‘father’, ‘Taylor’, ‘Miss’, ‘been’, ‘she’, ‘her’]
repeatedWords('file.txt')
Updated, still showing all words:
def repeatedWords(fname, word_list):
with open(fname) as f:
wordcount = {}
for word in word_list:
for word in f.read().split():
wordcount[word] = wordcount.get(word, 0) + 1
for k,v in wordcount.items():
print k, v
word_list = ['Emma', 'Woodhouse', 'father', 'Taylor', 'Miss', 'been', 'she', 'her']
repeatedWords('Emma.txt', word_list)
So you want the frequency of only the specific words in that list (Emma, Woodhouse, Father...)? If so, this code might help (try running it):
word_list = ['Emma','Woodhouse','father','Taylor','Miss','been','she','her']
# This sample text stands in for the contents of the file.
text = 'This is an example text. It will contain words you are looking for, like Emma, Emma, Emma, Woodhouse, Woodhouse, Father, Father, Taylor,Miss,been,she,her,her,her. I made them repeat to show that the code works.'
# Commas become spaces and full stops vanish, so only words remain.
text = text.replace(',',' ')
text = text.replace('.','')
# Lower-case everything so capitalisation does not affect the tally.
text = text.lower()
tokens = text.split()
for repeatedword in word_list:
    # Number of tokens equal to the lower-cased target word.
    counter = tokens.count(repeatedword.lower())
    print(repeatedword,':', counter)
The output shows the frequency of only those words in the list you provided, and that's what you wanted right?
the output produced when run in python3 is:
Emma : 3
Woodhouse : 2
father : 2
Taylor : 1
Miss : 1
been : 1
she : 1
her : 3
The best way to deal with this is to use get method in Python dictionary. It can be like this:
def repeatedWords():
    """Count the words of the file named by the global ``fname``.

    Words listed in ``nonwordlist`` are left out of the tally.

    Returns:
        A dict mapping each remaining whitespace-separated word to the
        number of times it occurs.
    """
    #Example list of words not needed
    nonwordlist = ['father', 'Miss', 'been']
    wordcount = {}
    with open(fname) as f:
        # Read from the handle `f` opened above, and only once: a second
        # read() of the same handle would return an empty string.
        for word in f.read().split():
            if word not in nonwordlist:
                wordcount[word] = wordcount.get(word, 0) + 1
    return wordcount


# Put these outside the function repeatedWords
wordcount = repeatedWords()
for k, v in wordcount.items():
    print("%s %s" % (k, v))
The print statement should give you this:
# Plain ASCII quotes: typographic quotes are a syntax error in Python.
word_list = ['Emma', 'Woodhouse', 'father', 'Taylor', 'Miss', 'been', 'she', 'her']
newDict = {}
for newWord in word_list:
    # .get() supplies 0 for a word not seen yet, so the first sighting stores 1.
    newDict[newWord] = newDict.get(newWord, 0) + 1
print(newDict)
What the line wordcount[word] = wordcount.get(word, 0) + 1 does is this: it first looks for word in the dictionary wordcount. If the word already exists, it gets its value and adds 1 to it. If the word does not exist, the value defaults to 0 and 1 is added to that, so the first occurrence of the word gets a count of 1.
I have a sentence "The quick brown fox jumps over the lazy dog", and I have counted the number of times each word occurs in this sentence. The output should be like this:
brown:1,dog:1,fox:1,jumps:1,lazy:1,over:1,quick:1,the:2
There should be no spaces between the characters in this output, and there should be commas between the words/numbers.
The output from my program looks like this:
,brown:1,dog:1,fox:1,jumps:1,lazy:1,over:1,quick:1,the:2
I find that there is a comma placed before 'brown'. Is there an easier way to print this?
filename = os.path.basename(path)
with open(filename, 'r+') as f:
fline = f.read()
fwords = fline.split()
allwords = [word.lower() for word in fwords]
sortwords = list(set(allwords))
r = sorted(sortwords, key=str.lower)
finalwords = ','.join(r)
sys.stdout.write(str(finalwords))
print '\n'
countlist = {}
for word in allwords:
try: countlist[word] += 1
except KeyError: countlist[word] = 1
for c,num in sorted(countlist.items()):
sys.stdout.write(",{:}:{:}".format(c, num))
A couple alternate ways of making the word list. First, a one-liner:
countlist = {word:allwords.count(word) for word in allwords}
As pointed out by DSM, that method can be slow with long lists. An alternate would be to use defaultdict:
# defaultdict lives in collections, not itertools.
from collections import defaultdict

# int() returns 0, so a word seen for the first time starts from zero.
countlist = defaultdict(int)
for word in allwords:
    countlist[word] += 1
For output, join the individual word counts with a comma (","), which avoids having one at the beginning:
sys.stdout.write(",".join(["{:}:{:}".format(key, value) for key, value in countlist .items()]))
I am counting the words in a txt file with the following code:
#!/usr/bin/python
file=open("D:\\zzzz\\names2.txt","r+")
wordcount={}
for word in file.read().split():
if word not in wordcount:
wordcount[word] = 1
else:
wordcount[word] += 1
print (word,wordcount)
file.close();
this is giving me the output like this:
>>>
goat {'goat': 2, 'cow': 1, 'Dog': 1, 'lion': 1, 'snake': 1, 'horse': 1, '': 1, 'tiger': 1, 'cat': 2, 'dog': 1}
but I want the output in the following manner:
word wordcount
goat 2
cow 1
dog 1.....
Also, I am getting an extra symbol in the output (ï»¿). How can I remove this?
The funny symbols you're encountering are a UTF-8 BOM (Byte Order Mark). To get rid of them, open the file using the correct encoding (I'm assuming you're on Python 3):
file = open(r"D:\zzzz\names2.txt", "r", encoding="utf-8-sig")
Furthermore, for counting, you can use collections.Counter:
from collections import Counter
wordcount = Counter(file.read().split())
Display them with:
>>> for item in wordcount.items(): print("{}\t{}".format(*item))
...
snake 1
lion 2
goat 2
horse 3
#!/usr/bin/python
# Print every whitespace-separated word of the file with its count.
wordcount = {}
# Read-only mode is enough for counting; the with-block closes the file.
with open("D:\\zzzz\\names2.txt", "r") as file:
    for word in file.read().split():
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
for k, v in wordcount.items():
    print("%s %s" % (k, v))
FILE_NAME = 'file.txt'
wordCounter = {}
with open(FILE_NAME,'r') as fh:
    for line in fh:
        # Drop commas, apostrophes and full stops, then fold to lower case.
        cleaned = line.replace(',','').replace('\'','').replace('.','').lower()
        # split() breaks the cleaned line into its words.
        for word in cleaned.split():
            # First sighting stores 1; later sightings bump the stored count.
            wordCounter[word] = wordCounter.get(word, 0) + 1
print('{:15}{:3}'.format('Word','Count'))
print('-' * 18)
# One row per distinct word together with its number of occurrences.
for word, occurance in wordCounter.items():
    print('{:15}{:3}'.format(word,occurance))
#
Word Count
------------------
of 6
examples 2
used 2
development 2
modified 2
open-source 2
import sys

# Print every whitespace-separated word of the file named on the command
# line, followed by its count.
wordcount = {}
# Read-only mode is enough for counting; the with-block closes the file
# even if reading fails.
with open(sys.argv[1], "r") as infile:
    for word in infile.read().split():
        wordcount[word] = wordcount.get(word, 0) + 1
for key, value in wordcount.items():
    print("%s %s " % (key, value))
If you are using GraphLab, you can use this function. It is really powerful:
products['word_count'] = graphlab.text_analytics.count_words(your_text)
#!/usr/bin/python
# Print every whitespace-separated word of the file with its count.
wordcount = {}
# Read-only mode is enough for counting; the with-block replaces the manual
# close() and also runs when an error occurs.
with open("D:\\zzzz\\names2.txt", "r") as file:
    for word in file.read().split():
        if word not in wordcount:
            wordcount[word] = 1
        else:
            wordcount[word] += 1
for k, v in wordcount.items():
    print("%s %s" % (k, v))
you can do this:
# A raw string needs single backslashes; doubling them inside r'...' puts two
# backslashes into the path.
with open(r'D:\zzzz\names2.txt') as file:
    file_split = set(file.read().split())
# The set holds each word once, so this is the number of distinct words.
print(len(file_split))
Below code from Python | How to Count the frequency of a word in the text file? worked for me.
import re

frequency = {}
# Open the sample text file in read mode; the with-block closes it afterwards.
with open('sample.txt', 'r') as document_text:
    # Convert the document to lower case and keep it in `text`.
    text = document_text.read().lower()
# Whole words made of 2 to 15 letters a-z.
pattern = re.findall(r'\b[a-z]{2,15}\b', text)
for word in pattern:
    count = frequency.get(word, 0)
    frequency[word] = count + 1
frequency_list = frequency.keys()
for words in frequency_list:
    print(words, frequency[words])
OUTPUT:
print("sorted counting values:-")
from collections import Counter

# Keep the handle separate from the text so it can be closed; the original
# rebound the same name to the file contents and never closed the file.
with open(filename) as handle:
    fname = handle.read()
fsplit = fname.split()
user = Counter(fsplit)
# sorted() orders the (word, count) pairs by word; each is printed as (count, word).
for i, v in sorted(user.items()):
    print((v, i))