How do I get the filename in a wordcount MapReduce job? - python

My task is to get a word count from a txt file using a MapReduce job. However, I'm getting a KeyError when I try to print the filename with the word count. Please help me out.
#!/usr/bin/env python
import sys
import os
import re

stopwords = ['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your']

# input comes from STDIN (standard input)
for line in sys.stdin:
    # remove leading and trailing whitespace
    line = line.strip()
    # the current input file name is exposed via the environment
    fname = os.environ['map_input_file']
    # split the line into words, keeping only alphabetic tokens
    words = re.findall(r"[A-Za-z]+", line)
    words = [word for word in words if word not in stopwords]
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word + ' ' + fname, 1)
I have to pass (word&filename, 1) to the reducer. With the above code I get a KeyError:
File "/home/s/ss/ssa8455/mapper.py", line 12, in ?
fname = os.environ['map_input_file']
File "/usr/lib64/python2.4/UserDict.py", line 17, in __getitem__
def __getitem__(self, key): return self.data[key]
KeyError: 'map_input_file'
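Note that Hadoop Streaming only sets this variable when the mapper actually runs inside a Hadoop job; testing locally (e.g. cat file.txt | ./mapper.py) leaves it undefined, and newer Hadoop versions expose it as mapreduce_map_input_file instead of map_input_file. A defensive lookup might look like this (a sketch, not a confirmed fix for your particular cluster):
import os

# Try the old streaming name first, then the newer one; fall back to a
# placeholder so the mapper still runs when tested outside Hadoop.
fname = os.environ.get('map_input_file') \
    or os.environ.get('mapreduce_map_input_file', 'unknown_file')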

Related

How to find the longest_word by using a searching loop

Write a function longest_word that asks the user for words and returns the longest word entered by the user. It should stop when the user hits return without typing a word. If multiple words have the same maximum length, return the first word entered by the user. If the user quits before entering any words, return “No words were entered”. This function should use a searching loop.
(Hint: remember that the len function returns the length of a string.)
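A minimal sketch of such a searching loop might be:
def longest_word():
    longest = None
    while True:
        word = input("Enter a word: ")
        if word == "":  # user hit return without typing a word
            break
        # strictly greater keeps the first word entered on ties
        if longest is None or len(word) > len(longest):
            longest = word
    return longest if longest is not None else "No words were entered"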
Takes a list of files and a file name, and checks whether the words in that file appear in those files.
Code:
def list_search_dictionary(fname, listfname):
for search in listfname:
with open(fname,'r') as f1:
with open(search,'r') as f2:
#splits word by word
All = f2.read().split()
formater = lambda x:x.strip().lower()
#formats each element
All = list(map(formater,All))
for word in f1:
word = formater(word)
if word in All:
print(f'Found: {word}')
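For example, assuming words.txt plus a couple of book files exist, it would be called like:
# search for every word of words.txt inside two (hypothetical) book files
list_search_dictionary('words.txt', ['book1.txt', 'book2.txt'])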
Using your functions:
Change your function to:
def my_search_dictionary(search_dic, fname):
    # read the file
    infile = open(fname, "r")
    # look at each line in the file
    for line in infile:
        # remove the whitespace at the end of the line
        line_strip = line.strip()
        # lowercase
        line_lower = line_strip.lower()
        # i love dogs
        # more than hotdogs
        # split the line on " " to make a list
        line_split = line_lower.split(" ")
        # ["i", "love", "dogs"]
        # ["more", "than", "hotdogs"]
        # look for each search term in line_split
        for word in line_split:
            # check every word in the line against the search terms
            for key in search_dic:
                if word == key:
                    search_dic[key] += [fname]
    infile.close()
    return search_dic
The third function would be:
def list_search_dictionary(fname, listfname):
    search_dic = key_emptylist(fname)
    for file in listfname:
        search_dic = my_search_dictionary(search_dic, file)
    return search_dic

Why does Python traceback at self.collectionFile=param[1]?

Hi, I'm a beginner in Python and I'm trying to run this program to create an inverted index for a collection file:
import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
import gc

porter = PorterStemmer()

class CreateIndex:

    def __init__(self):
        self.index = defaultdict(list)    # the inverted index

    def getStopwords(self):
        '''get stopwords from the stopwords file'''
        f = open(self.stopwordsFile, 'r')
        stopwords = [line.rstrip() for line in f]
        self.sw = dict.fromkeys(stopwords)
        f.close()

    def getTerms(self, line):
        '''given a stream of text, get the terms from the text'''
        line = line.lower()
        line = re.sub(r'[^a-z0-9 ]', ' ', line)    # put spaces instead of non-alphanumeric characters
        line = line.split()
        line = [x for x in line if x not in self.sw]    # eliminate the stopwords
        line = [porter.stem(word, 0, len(word)-1) for word in line]
        return line

    def parseCollection(self):
        '''returns the id, title and text of the next page in the collection'''
        doc = []
        for line in self.collFile:
            if line == '</page>\n':
                break
            doc.append(line)
        curPage = ''.join(doc)
        pageid = re.search('<id>(.*?)</id>', curPage, re.DOTALL)
        pagetitle = re.search('<title>(.*?)</title>', curPage, re.DOTALL)
        pagetext = re.search('<text>(.*?)</text>', curPage, re.DOTALL)
        if pageid is None or pagetitle is None or pagetext is None:
            return {}
        d = {}
        d['id'] = pageid.group(1)
        d['title'] = pagetitle.group(1)
        d['text'] = pagetext.group(1)
        return d

    def writeIndexToFile(self):
        '''write the inverted index to the file'''
        f = open(self.indexFile, 'w')
        for term in self.index.iterkeys():
            postinglist = []
            for p in self.index[term]:
                docID = p[0]
                positions = p[1]
                postinglist.append(':'.join([str(docID), ','.join(map(str, positions))]))
            print >> f, ''.join((term, '|', ';'.join(postinglist)))
        f.close()

    def getParams(self):
        '''get the parameters stopwords file, collection file, and the output index file'''
        param = sys.argv
        self.stopwordsFile = param[0]
        self.collectionFile = param[1]
        self.indexFile = param[2]

    def createIndex(self):
        '''main of the program, creates the index'''
        self.getParams()
        self.collFile = open(self.collectionFile, 'r')
        self.getStopwords()
        # bug in the Python garbage collector!
        # appending to a list becomes O(N) instead of O(1) as the size grows if gc is enabled
        gc.disable()
        pagedict = {}
        pagedict = self.parseCollection()
        # main loop creating the index
        while pagedict != {}:
            lines = '\n'.join((pagedict['title'], pagedict['text']))
            pageid = int(pagedict['id'])
            terms = self.getTerms(lines)
            # build the index for the current page
            termdictPage = {}
            for position, term in enumerate(terms):
                try:
                    termdictPage[term][1].append(position)
                except:
                    termdictPage[term] = [pageid, array('I', [position])]
            # merge the current page index with the main index
            for termpage, postingpage in termdictPage.iteritems():
                self.index[termpage].append(postingpage)
            pagedict = self.parseCollection()
        gc.enable()
        self.writeIndexToFile()

if __name__ == "__main__":
    c = CreateIndex()
    c.createIndex()
and it raises this traceback:
File "C:\Users\createIndex.py", line 119, in
c.createIndex()
File "C:\Users\createIndex.py", line 81, in createIndex
self.getParams()
File "C:\Users\createIndex.py", line 75, in getParams
self.collectionFile=param[1]
IndexError: list index out of range
Why does Python raise a traceback at these lines, and what should I do to fix this?
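The likely culprit: sys.argv[0] is the script name itself, so the real arguments start at index 1, and when the script is run with no arguments param[1] doesn't exist, hence the IndexError. A guarded version of getParams might look like this (a sketch; the usage message is my own wording):
import sys

def getParams(self):
    '''get the stopwords file, collection file, and output index file from argv'''
    # sys.argv[0] is the script name; real arguments start at index 1
    if len(sys.argv) < 4:
        print 'usage: createIndex.py <stopwords file> <collection file> <index file>'
        sys.exit(1)
    self.stopwordsFile = sys.argv[1]
    self.collectionFile = sys.argv[2]
    self.indexFile = sys.argv[3]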

Trying to write a function that recursively counts the number of times a word appears in a text file in Python

I am trying to write a function that recursively counts the number of times a word appears in a text file in Python.
def word_count(filename, word):
    with open('C:/Users/Ibrahim/Desktop/file.txt', 'r') as f:
        result_list = [x.split(',') for x in f.readlines()]
        for i in result_list:
            if i == word:
                return word_count(filename, word)
is what I currently have.
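For reference, a genuinely recursive version might process one line per call, something like this sketch (it assumes comma-separated words, as in the question, and a placeholder file name):
def word_count(f, word):
    """Recursively count occurrences of word, one line per call.
    f is an already-open file object."""
    line = f.readline()
    if line == '':                     # end of file: base case
        return 0
    count = line.strip().split(',').count(word)
    return count + word_count(f, word)

with open('file.txt', 'r') as fh:      # hypothetical file name
    print(word_count(fh, 'hello'))     # hypothetical search word
Note that Python's default recursion limit (around 1000) makes this impractical for large files; the iterative answers below scale better.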
I think this may be helpful for you:
import sys, re

def word_count(filename, needle, splitter=","):
    regex_pattern = '|'.join(map(re.escape, splitter))
    with open(filename, 'r') as f:
        words = [word for line in f.read().splitlines() for word in re.split(regex_pattern, line)]
        words = filter(None, words)
        print "Total Words :", len(words)
        print "Searching %s in list" % needle
        print "Total Occurrences : %d" % words.count(needle)

def main(argv):
    splitter = ","
    if len(argv) == 3:
        filename, word, splitter = argv
    elif len(argv) == 2:
        filename, word = argv
    else:
        print "Usage : word_count.py <file> <word> <splitter>"
        sys.exit()
    word_count(filename, word, splitter)

if __name__ == "__main__":
    main(sys.argv[1:])
Counter is your friend here, i.e.:
from collections import Counter
f = open('yourfile.txt', 'r')
counts = Counter(f.read().split())
print counts
To check if a specific word exists:
if "myword" in counts:
print "exists"
To get a specific word count value, use:
print counts.get('myword')
# or simply
print counts['myword']
Note:
A Counter is simply a dict subclass and supports all the dict operators and methods.
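It also has most_common, which fits the word-frequency use case directly:
# the ten most frequent words and their counts
print counts.most_common(10)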

Read special characters from a .txt file in Python

The goal of this code is to find the frequency of words used in a book.
I am trying to read in the text of a book, but the following line keeps throwing my code off:
precious protégés. No, gentlemen; he'll always show 'em a clean pair
specifically the é character
I have looked at the following documentation, but I don't quite understand it: https://docs.python.org/3.4/howto/unicode.html
Here's my code:
import string

# Create word dictionary from the comprehensive word list
word_dict = {}

def create_word_dict():
    # open words.txt and populate dictionary
    word_file = open("./words.txt", "r")
    for line in word_file:
        line = line.strip()
        word_dict[line] = 1

# Removes punctuation marks from a string
def parseString(st):
    st = st.encode("ascii", "replace")
    new_line = ""
    st = st.strip()
    for ch in st:
        ch = str(ch)
        if (n for n in (1,2,3,4,5,6,7,8,9,0)) in ch or ' ' in ch or ch.isspace() or ch == u'\xe9':
            print (ch)
            new_line += ch
        else:
            new_line += ""
    # now remove all instances of 's or ' at end of line
    new_line = new_line.strip()
    print (new_line)
    if (new_line[-1] == "'"):
        new_line = new_line[:-1]
    new_line.replace("'s", "")
    # Conversion from ASCII codes back to useable text
    message = new_line
    decodedMessage = ""
    for item in message.split():
        decodedMessage += chr(int(item))
    print (decodedMessage)
    return new_line

# Returns a dictionary of words and their frequencies
def getWordFreq(file):
    # Open file for reading the book.txt
    book = open(file, "r")
    # create an empty set for all Capitalized words
    cap_words = set()
    # create a dictionary for words
    book_dict = {}
    total_words = 0
    # remove all punctuation marks other than '[not s]
    for line in book:
        line = line.strip()
        if (len(line) > 0):
            line = parseString(line)
            word_list = line.split()
            # add words to the book dictionary
            for word in word_list:
                total_words += 1
                if (word in book_dict):
                    book_dict[word] = book_dict[word] + 1
                else:
                    book_dict[word] = 1
    print (book_dict)
    # close the file
    book.close()

def main():
    wordFreq1 = getWordFreq("./Tale.txt")
    print (wordFreq1)

main()
The error that I received is as follows:
Traceback (most recent call last):
File "Books.py", line 80, in <module>
main()
File "Books.py", line 77, in main
wordFreq1 = getWordFreq ("./Tale.txt")
File "Books.py", line 60, in getWordFreq
line = parseString (line)
File "Books.py", line 36, in parseString
decodedMessage += chr(int(item))
OverflowError: Python int too large to convert to C long
When you open a text file in Python on Windows, the default encoding is the ANSI code page, which may not handle your é character. Try
word_file = open ("./words.txt", "r", encoding='utf-8')
The best way I could think of is to read each character as an ASCII value into an array, and then take the character value. For example, 97 is the ASCII code for "a", and chr(97) will output "a". Check out some online ASCII tables that also provide values for special characters.
Try:
def parseString(st):
    st = st.encode("ascii", "replace")
    # rest of code here
The new error you are getting is because you are calling isalpha on an int (i.e. a number)
Try this (note the any() call: a bare generator expression is always truthy, so the digits must actually be tested as strings):
for ch in st:
    ch = str(ch)
    if any(str(n) in ch for n in (1,2,3,4,5,6,7,8,9,0)) or ' ' in ch or ch.isspace() or ch == u'\xe9':
        print (ch)
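Alternatively, instead of forcing the text down to ASCII, you could keep the Unicode text and count word frequencies directly. A minimal Python 3 sketch (reusing the Tale.txt name from the question):
from collections import Counter
import string

# read with an explicit encoding so é and friends survive
with open('Tale.txt', encoding='utf-8') as book:
    text = book.read().lower()

# strip punctuation from the ends of each word, keep letters (accented included)
words = [w.strip(string.punctuation) for w in text.split()]
counts = Counter(w for w in words if w)
print(counts.most_common(10))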

Code for counting the number of sentences, words and characters in an input file

I have written the following code to count the number of sentences, words and characters in the input file sample.txt, which contains a paragraph of text. It works fine for counting sentences and words, but it does not give the precise, correct number of characters (without whitespace and punctuation marks):
lines, blanklines, sentences, words = 0, 0, 0, 0
num_chars = 0
print '-' * 50
try:
    filename = 'sample.txt'
    textf = open(filename, 'r')
except IOError:
    print 'cannot open file %s for reading' % filename
    import sys
    sys.exit(0)
for line in textf:
    print line
    lines += 1
    if line.startswith('\n'):
        blanklines += 1
    else:
        sentences += line.count('.') + line.count('!') + line.count('?')
        tempwords = line.split(None)
        print tempwords
        words += len(tempwords)
textf.close()
print '-' * 50
print "Lines:", lines
print "blank lines:", blanklines
print "sentences:", sentences
print "words:", words
import nltk
import nltk.data
import nltk.tokenize
with open('sample.txt', 'r') as f:
    for line in f:
        num_chars += len(line)
num_chars = num_chars - (words + 1)
pcount = 0
from nltk.tokenize import TreebankWordTokenizer
with open('sample.txt', 'r') as f1:
    tokenizer = TreebankWordTokenizer()
    for line in f1:
        #tokenised_words = nltk.tokenize.word_tokenize(line)
        tokenised_words = tokenizer.tokenize(line)
        for w in tokenised_words:
            if (w == '.') | (w == ';') | (w == '!') | (w == '?'):
                pcount = pcount + 1
print "pcount:", pcount
num_chars = num_chars - pcount
print "chars:", num_chars
pcount is the number of punctuation marks. Can someone suggest the changes I need to make in order to find the exact number of characters without spaces and punctuation marks?
import string

#
# Per-line counting functions
#
def countLines(ln): return 1
def countBlankLines(ln): return 0 if ln.strip() else 1
def countWords(ln): return len(ln.split())

def charCounter(validChars):
    vc = set(validChars)
    def counter(ln):
        return sum(1 for ch in ln if ch in vc)
    return counter

countSentences = charCounter('.!?')
countLetters = charCounter(string.letters)
countPunct = charCounter(string.punctuation)

#
# do counting
#
class FileStats(object):
    def __init__(self, countFns, labels=None):
        super(FileStats, self).__init__()
        self.fns = countFns
        self.labels = labels if labels else [fn.__name__ for fn in countFns]
        self.reset()

    def reset(self):
        self.counts = [0] * len(self.fns)

    def doFile(self, fname):
        try:
            with open(fname) as inf:
                for line in inf:
                    for i, fn in enumerate(self.fns):
                        self.counts[i] += fn(line)
        except IOError:
            print('Could not open file {0} for reading'.format(fname))

    def __str__(self):
        return '\n'.join('{0:20} {1:>6}'.format(label, count) for label, count in zip(self.labels, self.counts))

fs = FileStats(
    (countLines, countBlankLines, countSentences, countWords, countLetters, countPunct),
    ("Lines", "Blank Lines", "Sentences", "Words", "Letters", "Punctuation")
)
fs.doFile('sample.txt')
print(fs)
results in
Lines                   101
Blank Lines              12
Sentences                48
Words                   339
Letters                1604
Punctuation             455
You can also use a regex to replace all non-alphanumeric characters and then count the number of characters in each line.
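For example, a sketch of that regex approach (counting everything that is a letter or digit):
import re

num_chars = 0
with open('sample.txt', 'r') as f:
    for line in f:
        # drop every character that is not a letter or digit, count the rest
        num_chars += len(re.sub(r'[^A-Za-z0-9]', '', line))
print "chars:", num_chars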
One thing you could do is, as you read each line, iterate through it and increment the number of characters:
for character in line:
    if character.isalnum():
        num_chars += 1
P.S. You might want to change the if condition to satisfy your particular needs, e.g. if you also want to count $.
Try this to count the number of words and sentences, and to get the probability of similar words:
from nltk.tokenize import word_tokenize
from nltk.tokenize import sent_tokenize

text_file = open("..//..//static//output.txt", "r")
lines = text_file.readlines()
x = 0
tokenized_words = [word_tokenize(i) for i in lines]
for i in tokenized_words:
    print(i)                  # list containing the tokens
    print(str(len(i)))        # word count
    for j in i:
        if j == 'words':      # simple count of occurrences of 'words'
            x = x + 1
tokenized_sents = [sent_tokenize(k) for k in lines]
for k in tokenized_sents:
    print("Sentences" + str(k))                   # list containing the sentences
    print("number of sentences " + str(len(k)))   # number of sentences
print("number of word" + str(x))
print("Probability of 'word' in text file " + str(x / len(i)))
