Hi, I'm a beginner in Python and I'm trying to run this program to create an inverted index for a collection file:
import sys
import re
from porterStemmer import PorterStemmer
from collections import defaultdict
from array import array
import gc

porter=PorterStemmer()

class CreateIndex:

    def __init__(self):
        self.index=defaultdict(list)    #the inverted index

    def getStopwords(self):
        '''get stopwords from the stopwords file'''
        f=open(self.stopwordsFile, 'r')
        stopwords=[line.rstrip() for line in f]
        self.sw=dict.fromkeys(stopwords)
        f.close()

    def getTerms(self, line):
        '''given a stream of text, get the terms from the text'''
        line=line.lower()
        line=re.sub(r'[^a-z0-9 ]',' ',line)  #put spaces instead of non-alphanumeric characters
        line=line.split()
        line=[x for x in line if x not in self.sw]  #eliminate the stopwords
        line=[ porter.stem(word, 0, len(word)-1) for word in line]
        return line

    def parseCollection(self):
        ''' returns the id, title and text of the next page in the collection '''
        doc=[]
        for line in self.collFile:
            if line=='</page>\n':
                break
            doc.append(line)

        curPage=''.join(doc)
        pageid=re.search('<id>(.*?)</id>', curPage, re.DOTALL)
        pagetitle=re.search('<title>(.*?)</title>', curPage, re.DOTALL)
        pagetext=re.search('<text>(.*?)</text>', curPage, re.DOTALL)

        if pageid==None or pagetitle==None or pagetext==None:
            return {}

        d={}
        d['id']=pageid.group(1)
        d['title']=pagetitle.group(1)
        d['text']=pagetext.group(1)
        return d

    def writeIndexToFile(self):
        '''write the inverted index to the file'''
        f=open(self.indexFile, 'w')
        for term in self.index.iterkeys():
            postinglist=[]
            for p in self.index[term]:
                docID=p[0]
                positions=p[1]
                postinglist.append(':'.join([str(docID) ,','.join(map(str,positions))]))
            print >> f, ''.join((term,'|',';'.join(postinglist)))
        f.close()

    def getParams(self):
        '''get the parameters stopwords file, collection file, and the output index file'''
        param=sys.argv
        self.stopwordsFile=param[0]
        self.collectionFile=param[1]
        self.indexFile=param[2]

    def createIndex(self):
        '''main of the program, creates the index'''
        self.getParams()
        self.collFile=open(self.collectionFile,'r')
        self.getStopwords()

        #bug in python garbage collector!
        #appending to list becomes O(N) instead of O(1) as the size grows if gc is enabled.
        gc.disable()

        pagedict={}
        pagedict=self.parseCollection()

        #main loop creating the index
        while pagedict != {}:
            lines='\n'.join((pagedict['title'],pagedict['text']))
            pageid=int(pagedict['id'])
            terms=self.getTerms(lines)

            #build the index for the current page
            termdictPage={}
            for position, term in enumerate(terms):
                try:
                    termdictPage[term][1].append(position)
                except:
                    termdictPage[term]=[pageid, array('I',[position])]

            #merge the current page index with the main index
            for termpage, postingpage in termdictPage.iteritems():
                self.index[termpage].append(postingpage)

            pagedict=self.parseCollection()

        gc.enable()

        self.writeIndexToFile()


if __name__=="__main__":
    c=CreateIndex()
    c.createIndex()
and it produces this traceback:
File "C:\Users\createIndex.py", line 119, in
c.createIndex()
File "C:\Users\createIndex.py", line 81, in createIndex
self.getParams()
File "C:\Users\createIndex.py", line 75, in getParams
self.collectionFile=param[1]
IndexError: list index out of range
Why does Python raise this error at these lines, and what should I do to fix it?
Related
I want to write a Python script that will be integrated into a shell pipeline, so it has to take some input text from standard input and print the result to standard output. I have multiple text-processing steps to chain: some apply to each line, while others first need to detect some blocks before making changes.
I made a loop for each text-processing step, but my problem is that I don't see how to chain them so that the next loop takes the output of the previous one as its input.
Below is my first draft.
As I'm used to writing shell scripts, I have the feeling that I will have to work with temp files, but I'm not sure that's the way to go in Python.
And I assume that it would be nicer if I put each loop's processing in a function, too.
#!/usr/bin/python3
""" Sample of pre-processing formatting script """
import fileinput
import re
import sys

""" Read StdIn """
lines_in = fileinput.input()
lines_out = ""
preform_txt_regex = re.compile(r"^  ")
code_block = ""

"""
Walk through the input and replace the 'preformatted text' (starting with 2 spaces)
into 'Fixed width text' (<code>…</code>).
"""
for line in lines_in:
    if line.startswith("  "):
        code_block = code_block + preform_txt_regex.sub('', line)
    else:
        if code_block != "":
            lines_out = lines_out + "<syntaxhighlight lang='shell'>\n{}</syntaxhighlight>\n".format(code_block)
            code_block = ""
        sys.stdout.write(line)
        lines_out = lines_out + line

# Reset lines_in and lines_out
lines_in = lines_out.split("\n")
lines_out = ""

"""
Remove all the 'Category' tags
"""
for line in lines_in:
    lines_out = lines_out + re.sub(r'\[\[Cat[ée]gor.*:[^\]]*]]', r'', line)

"""
Few other string substitutions
"""
for line in lines_in:
    [...]

""" Print the processed texts """
sys.stdout.write(lines_out)
The suggestions from #Steve and #ibra were to use a list of lines as a buffer variable, and indeed, I made something that works that way.
So here is my revised code. I moved my processing loops into functions that take lines_buffer as a parameter:
#!/usr/bin/python3
# -*- encoding: utf-8 -*-
""" Sample of pre-processing formatting script """
import fileinput
import re
import sys

"""
Walk through the input and replace the 'preformatted text' (starting with 2 spaces)
into 'Fixed width text' (<code>…</code>).
"""
def render_code_block(lines):
    preform_txt_regex = re.compile(r"^  ")
    code_block = []
    output = []
    for line in lines:
        if line.startswith("  "):
            code_block.append(preform_txt_regex.sub('', line))
        else:
            if code_block != []:
                output.append("<syntaxhighlight lang='shell'>\n{}</syntaxhighlight>\n".format(''.join(code_block)))
                code_block = []
            output.append(line)
    return output

"""
Remove all the 'Category' tags
"""
def remove_category_tags(lines):
    output = []
    for line in lines:
        output.append(re.sub(r'\[\[Cat[ée]gor.*:[^\]]*]]', r'', line))
    return output

""" Main """
lines_buffer = []
lines_buffer = fileinput.input()
lines_buffer = render_code_block(lines_buffer)
lines_buffer = remove_category_tags(lines_buffer)
for line in lines_buffer:
    sys.stdout.write(line)
And of course, I had to replace the string initialization (= "") with list initialization (= []) and use append in place of concatenation (+).
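As a side note (this is just a sketch of another approach, not part of the fix above), each processing step can also be written as a generator that consumes and yields lines; the stages then chain exactly like a shell pipeline, and nothing is buffered in memory unless a stage needs it. The upper_case_titles stage below is purely hypothetical, only there to show how several stages stack up:

#!/usr/bin/python3
"""Sketch: chaining line processors as generators, shell-pipeline style."""
import re
import sys

def remove_category_tags(lines):
    """Strip the [[Catégorie:...]] / [[Category:...]] tags from every line."""
    for line in lines:
        yield re.sub(r'\[\[Cat[ée]gor.*:[^\]]*]]', '', line)

def upper_case_titles(lines):
    """Hypothetical extra stage, just to show how stages chain."""
    for line in lines:
        yield line.upper() if line.startswith('=') else line

# each stage reads the previous one lazily, like cmd1 | cmd2 in a shell
pipeline = upper_case_titles(remove_category_tags(sys.stdin))
sys.stdout.writelines(pipeline)

A stage that needs to see a whole block (like the preformatted-text detection) can still buffer lines internally and yield the rewritten block once it ends.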
I want to open a file and search it for a given word. The function will read the file line by line and return a count of how many lines contain the given word.
def count_word(file_name, word):
    with open(file_name, 'r') as file:
        line = file.readline()
        line.rstrip('\n')
        cnt = 0
        for line in file:
            if word in line:
                cnt += 1
            return cnt
This is what I've tried, but it's not working correctly. Not sure what's going on.
Try this:
def count_word(file_name, word):
    with open(file_name, 'r') as file:
        content = file.read()
        return content.count(word)
You need to count occurrences of the word in isolation. For example, the substring 'as' is in 'classic', but the word 'as' does not appear in the sentence 'this is a classic problem'. Additionally, you need to move your return outside the for-loop:
def wordCount(infilepath, word):
    answer = 0
    with open(infilepath) as infile:
        for line in infile:
            answer += line.split().count(word)
    return answer
Here is an alternative version using collections.Counter and re.split:
from collections import Counter
import re

def count_word(file_name, word):
    return Counter(re.split(r'\s+|\W+', open(file_name).read())).get(word, 0)
This should do it: first it loads the file, then it goes through each line, counts how many times the word appears, and returns the total.
#function to count words
def count_word(file_name, word):
    #hold number of words
    cnt = 0
    #open file
    with open(file_name, 'r') as file:
        #get lines
        lines = file.readlines()
        #loop through the file
        for line in lines:
            #strip the line
            line = line.rstrip('\n')
            #get how many times it appears in the line
            cnt += line.lower().split().count(word)
    return cnt

print(count_word("test.txt", "test"))
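One more remark, since it is easy to miss: the question as stated asks for the number of lines that contain the word, not the total number of occurrences, which is what most of the snippets above compute. A minimal sketch of that variant, assuming case-sensitive, whole-word matching:

def count_lines_with_word(file_name, word):
    #count the lines in which the word appears at least once (as a whole word)
    with open(file_name, 'r') as f:
        return sum(1 for line in f if word in line.split())

#e.g. print(count_lines_with_word("test.txt", "test"))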
Hello, I'm a beginner in Python and I'm trying to execute the same program shown in the first question above to create an inverted index for a collection file,
and it says that there is only one argument in sys.argv. How should the other arguments appear?
In the getParams function, you can see that your code reads three parameters from sys.argv.
When you call your program:
python your_program.py
# sys.argv[0] = 'your_program.py'
It has one argument, so you need two more:
python your_program.py arg_1 arg_2
# sys.argv[0] = 'your_program.py'
# sys.argv[1] = 'arg_1'
# sys.argv[2] = 'arg_2'
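One detail worth adding to the answer above: since sys.argv[0] is the script name itself, getParams as posted would end up using createIndex.py as the stopwords file. A minimal sketch of a guarded version that reads all three file names from the real arguments and exits with a usage message when they are missing (the placeholder names in the message are mine, not from the original post):

import sys

def get_params(argv):
    '''return (stopwords file, collection file, index file) from the command line'''
    # argv[0] is the script name, so the real parameters start at index 1
    if len(argv) != 4:
        sys.exit("usage: python createIndex.py <stopwords file> <collection file> <index file>")
    return argv[1], argv[2], argv[3]

# inside CreateIndex.getParams one could then write:
#     self.stopwordsFile, self.collectionFile, self.indexFile = get_params(sys.argv)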
My task is to get a word count from a txt file using a MapReduce job. However, I'm getting a KeyError when I try to print the filename with the word count. Please help me out.
#!/usr/bin/env python

import sys
import os
import re

# input comes from STDIN (standard input)
for line in sys.stdin:
    stopwords = ['a','able','about','across','after','all','almost','also','am','among','an','and','any','are','as','at','be','because','been','but','by','can','cannot','could','dear','did','do','does','either','else','ever','every','for','from','get','got','had','has','have','he','her','hers','him','his','how','however','i','if','in','into','is','it','its','just','least','let','like','likely','may','me','might','most','must','my','neither','no','nor','not','of','off','often','on','only','or','other','our','own','rather','said','say','says','she','should','since','so','some','than','that','the','their','them','then','there','these','they','this','tis','to','too','twas','us','wants','was','we','were','what','when','where','which','while','who','whom','why','will','with','would','yet','you','your']
    # remove leading and trailing whitespace
    line = line.strip()
    # split the line into words
    fname = os.environ['map_input_file']
    words = re.findall(r"[A-Za-z]+", line)
    words = line.split()
    words = [word for word in words if word not in stopwords]
    # increase counters
    for word in words:
        # write the results to STDOUT (standard output);
        # what we output here will be the input for the
        # Reduce step, i.e. the input for reducer.py
        #
        # tab-delimited; the trivial word count is 1
        print '%s\t%s' % (word + ' ' + fname, 1)
I have to pass (word & filename, 1) to the reducer. With the above code I get a KeyError:
File "/home/s/ss/ssa8455/mapper.py", line 12, in ?
fname = os.environ['map_input_file']
File "/usr/lib64/python2.4/UserDict.py", line 17, in __getitem__
def __getitem__(self, key): return self.data[key]
KeyError: 'map_input_file'
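There is no fix shown here, but one common cause of this KeyError is worth mentioning as a guess: Hadoop Streaming exposes job properties as environment variables with the dots replaced by underscores, and newer Hadoop releases renamed map.input.file to mapreduce.map.input.file, so only mapreduce_map_input_file may be set. A small sketch that tries both names instead of raising:

import os

def get_input_filename(default='unknown'):
    # look for the current map input file under both the new and the old property name
    for key in ('mapreduce_map_input_file', 'map_input_file'):
        if key in os.environ:
            return os.environ[key]
    return default

# in the mapper, instead of os.environ['map_input_file']:
#     fname = get_input_filename()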
How can I find only the words that are unique to a text file? If a word is used in other files as well, then it gets dropped.
Here is a reference: http://sahandsaba.com/visualizing-philosophers-and-scientists-by-the-words-they-used-with-d3js-and-python.html
I need a script that loops through all the text files in a folder and outputs the results in JSON format.
My code so far:
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
from pprint import pprint as pp
from glob import glob
from nltk import word_tokenize
from nltk.corpus import wordnet as wn
from sklearn.feature_extraction.text import CountVectorizer
import codecs
import jinja2
import json
import os

def get_raw_data():
    texts = []
    for x in range(1,95):
        file_name = str(x+1)+".txt"
        with codecs.open(file_name,"rU","utf-8") as myfile:
            data = myfile.read()
        texts.append(data)
        yield file_name, '\n'.join(texts)

class StemTokenizer(object):
    def __init__(self):
        self.ignore_set = {'footnote'}

    def __call__(self, doc):
        words = []
        for word in word_tokenize(doc):
            word = word.lower()
            w = wn.morphy(word)
            if w and len(w) > 1 and w not in self.ignore_set:
                words.append(w)
        return words

def process_text(counts, vectorizer, text, file_name, index):
    result = {w: counts[index][vectorizer.vocabulary_.get(w)]
              for w in vectorizer.get_feature_names()}
    result = {w: c for w, c in result.iteritems() if c > 4}
    normalizing_factor = max(c for c in result.itervalues())
    result = {w: c / normalizing_factor
              for w, c in result.iteritems()}
    return result

def main():
    data = list(get_raw_data())
    print('Data loaded')
    n = len(data)
    vectorizer = CountVectorizer(stop_words='english', min_df=(n-1) / n, tokenizer=StemTokenizer())
    counts = vectorizer.fit_transform(text for p, text in data).toarray()
    print('Vectorization done.')
    print (counts)
    for x in range(95):
        file_name = str(x+1)+".txt"
        # print (text)
        for i, (text) in enumerate(data):
            print (file_name)
            # print (text)
            with codecs.open(file_name,"rU","utf-8") as myfile:
                text = myfile.read()
            result = process_text(counts, vectorizer, text, file_name, i)
            print (result)

if __name__ == '__main__':
    main()
Looks like you've got a bunch of files named 1.txt, 2.txt, ... 95.txt, and you want to find words that occur in one file only. I'd just gather all the words, count how many files each one occurs in, and print out the singletons.
from collections import Counter
import re

fileids = [ str(n+1)+".txt" for n in range(95) ]
filecounts = Counter()

for fname in fileids:
    with open(fname) as fp:  # Add encoding if really needed
        text = fp.read().lower()
        words = re.split(r"\W+", text)  # Keep letters, drop the rest
        filecounts.update(set(words))

singletons = [ word for word in filecounts if filecounts[word] == 1 ]
print(" ".join(singletons))
Done. You don't need scikit, you don't need the nltk, you don't need a pile of IR algorithms. You can use the list of singletons in an IR algorithm, but that's a different story.
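Since the question also asks for JSON output, here is one way to extend that idea (a sketch only; the 1.txt ... 95.txt naming is kept from the answer above, and unique_words.json is a made-up output name): record which file each word came from and dump the per-file singletons with the json module:

from collections import Counter
import json
import re

fileids = [str(n + 1) + ".txt" for n in range(95)]

filecounts = Counter()   # in how many files does each word appear?
words_per_file = {}      # the full word set of each file

for fname in fileids:
    with open(fname) as fp:
        words = set(re.split(r"\W+", fp.read().lower()))
        words.discard("")             # re.split can produce empty strings
        words_per_file[fname] = words
        filecounts.update(words)

# keep, per file, only the words that appear in no other file
unique_per_file = {fname: sorted(w for w in words if filecounts[w] == 1)
                   for fname, words in words_per_file.items()}

with open("unique_words.json", "w") as out:
    json.dump(unique_per_file, out, indent=2)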
def parseText():
    # oFile: text file to test
    # myWord: word we are looking for
    # Get all lines into list
    aLines = oFile.readlines()
    # Loop over the lines to test if the word is found
    for sLine in aLines:
        # Parse the line (remove spaces), returns list
        aLine = sLine.split()
        # Iterate words and test to see if they match our word
        for sWord in aLine:
            # if it matches, append it to our list
            if sWord == myWord: aWords.append( sWord )

# Create empty list to store all instances of the word that we may find
aWords = []
# Prompt user to know what word to search
myWord = str( raw_input( 'what word to search:' ) )
# Call function
parseText()
# Check if list has at least one element
if len( aWords ) < 1: print 'Word not found in file'
else: print str( len( aWords ) ) + ' instances of our word found in file'
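As a closing note on the snippet above, the same logic is easier to reuse and test if the file path and the search word are passed in as parameters instead of living at module level. A short Python 3 sketch (words.txt is just a placeholder file name):

def find_word(path, my_word):
    # return every whole-word occurrence of my_word found in the file
    found = []
    with open(path) as f:
        for line in f:
            for token in line.split():
                if token == my_word:
                    found.append(token)
    return found

if __name__ == '__main__':
    word = input('what word to search: ')
    matches = find_word('words.txt', word)
    if not matches:
        print('Word not found in file')
    else:
        print('{} instances of our word found in file'.format(len(matches)))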