import os, re
import math
from math import log10
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict

python_file_root = './presidential_debates'

def getidf(token):
    document_occurance = 0
    for filename in os.listdir(python_file_root):
        file = open(os.path.join(python_file_root, filename), "r")
        for line in file:
            if re.search(r'\b' + token + r'\b', line):
                document_occurance = document_occurance + 1
                break
    if (document_occurance != 0):
        idf = log10(30 / document_occurance)
        return idf
    return -1

def normalize(filename, token):
    file = open(os.path.join(python_file_root, filename), "r")
    counts = dict()
    square = []
    count1 = 0
    for line in file:
        count1 = count1 + 1
        if line in counts:
            counts[line] += 1
        else:
            counts[line] = 1
    for key, value in counts.items():
        tf = 1 + log10(value)
        idf = getidf(key.rstrip())
        square.append((tf * idf) * (tf * idf))
    summ = sum(square)
    sqroot = math.sqrt(summ)
    return sqroot

def getweight(filename, token):
    hit_count1 = 0
    final = 0
    file = open(os.path.join(python_file_root, filename), "r")
    idft = getidf(token)
    for line in file:
        if re.search(r'\b' + token + r'\b', line):
            hit_count1 = hit_count1 + 1
    if (hit_count1 == 0):
        return 0
    else:
        tf = 1 + log10(hit_count1)
    initial = idft * tf
    if (initial <= 0):
        final = 0
        return final
    else:
        normalize_fact = normalize(filename, token)
        final = initial / normalize_fact
        return final

for filename in os.listdir(python_file_root):
    file = open(os.path.join(python_file_root, filename), "r")
    doc = file.read()
    doc = doc.lower()
    stemmed = []
    tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
    tokens = tokenizer.tokenize(doc)
    stoplist = stopwords.words('english')
    stop_removed = [word for word in tokens if word not in stoplist]
    with open(os.path.join(python_file_root, filename), "w") as f:
        for item in stop_removed:
            stemmer = PorterStemmer()
            stemmed = [stemmer.stem(item)]
            for items in stemmed:
                f.write("%s\n" % items)

print("\nIDF\n")
print("%.12f" % getidf("health"))
print("%.12f" % getidf("agenda"))
print("%.12f" % getidf("vector"))
print("%.12f" % getidf("reason"))
print("%.12f" % getidf("hispan"))
print("%.12f" % getidf("hispanic"))
print("\n")
print("%.12f" % getweight("2012-10-03.txt", "health"))
print("%.12f" % getweight("1960-10-21.txt", "reason"))
print("%.12f" % getweight("1976-10-22.txt", "agenda"))
print("%.12f" % getweight("2012-10-16.txt", "hispan"))
print("%.12f" % getweight("2012-10-16.txt", "hispanic"))
I have 30 txt files and I have developed a program to find the idf and normalized tf-idf vectors. I'm getting the correct values, but the function getweight takes more than 15 minutes to generate the output. Can anyone suggest a few methods for optimization?
I do not want to use any other non-standard Python package.
Why do you create a new PorterStemmer for every word?
Apart from this obvious thing, try profiling your code. NLTK has a reputation for being really slow, so it may well not be your fault. If you profile, then you'll know.
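For what it's worth, a minimal, untested sketch of both suggestions (the preprocess helper is only an illustration, not part of the original script): create the stemmer once and reuse it, and run the slow call under the standard-library cProfile module to see where the time actually goes.
import cProfile
from nltk.stem.porter import PorterStemmer

stemmer = PorterStemmer()  # built once, reused for every token

def preprocess(tokens):
    # stem each token with the shared stemmer instead of constructing a new one per word
    return [stemmer.stem(token) for token in tokens]

# profile the expensive call; the sorted report shows which functions dominate the runtime
cProfile.run('getweight("2012-10-03.txt", "health")', sort='cumulative')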
I want to create a very basic Q&A chatbot. Given a list of questions and answers that I use as my dataset, I want to train it to return relevant answers, depending on a hard-coded question (different every time). First I tokenize and clean up the text, then I use cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math

tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')

def get_clean_data():
    clean_data_set = {
        'questions': {},
        'answers': {}
    }
    reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
    tags = []
    counter = 0
    for r in reader:
        question = str(r[0].encode('utf-8'))
        answer = str(r[1].encode('utf-8'))
        _, tags_question = get_tags(question)
        _, tags_answer = get_tags(answer)
        clean_data_set['answers'][answer] = tags_answer + tags_question
        clean_data_set['questions'][question] = text_to_vector(question)
        counter += 1
        # hardcode the number :)
        print(counter, ' out of 746')
    # pickle.dump(clean_data_set, open('dump.dict', 'wb'))
    with open('dump.dict', 'wb') as my_dump_file:
        pickle.dump(clean_data_set, my_dump_file)

def get_tags(text, use_set=True):
    tokens = tokenizer.tokenize(text)
    # remove stop words from tokens
    # make it lower case
    filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
    # return non duplicate values by default
    if use_set == True:
        filtered_words = list(set(filtered_words))
    return Counter(filtered_words), filtered_words

# simple cosine similarity measure
def get_cosine(vec1, vec2):
    intersection = set(vec1.keys()) & set(vec2.keys())
    numerator = sum([vec1[x] * vec2[x] for x in intersection])
    sum1 = sum([vec1[x]**2 for x in vec1.keys()])
    sum2 = sum([vec2[x]**2 for x in vec2.keys()])
    denominator = math.sqrt(sum1) * math.sqrt(sum2)
    if not denominator:
        return 0.0
    else:
        return float(numerator) / denominator

def text_to_vector(text):
    words = WORD.findall(text)
    return Counter(words)

# question_set is the data we had
def get_cosine_value(question, question_set):
    question_vector = text_to_vector(question)
    cosine = get_cosine(question_vector, question_set)
    return cosine

def answer_question(question, top=5):
    with open('dump.dict', 'rb') as my_dump_file:
        data_set = pickle.load(my_dump_file)
    # data_set = pickle.load(open('dump.dict', 'rb'))
    # pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)  # stray line: 'a' and 'handle' are not defined here
    _, question_tags = get_tags(question)
    ranking_dict = {}
    similar_questions_rank = {}
    for entry in data_set['answers']:
        tags = data_set['answers'][entry]
        # rank is the intersection between the list of tags from the question
        # and the list of tags associated to answers
        rank = len(set(question_tags).intersection(tags))
        ranking_dict[entry] = rank
    for entry in data_set['questions']:
        cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
        similar_questions_rank[entry] = cosine_similarity
    sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
    sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)
    # sort them by rank
    for item in sorted_ranking_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Answer: ', item[0])
        print('\n\n')
    # sort them by rank
    for item in sorted_similarity_dict[0:top-1]:
        print('Rank: ', item[1])
        print('Question: ', item[0])

#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
answer_question(question)
File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help, please? I have no idea what to do. Thanks in advance.
I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
See, here you open the file for writing, but you never close it, so the data may never actually be flushed to disk; when you later try to read the pickle, it runs out of input before finding a complete object. To avoid problems like this, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
    pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
    data_set = pickle.load(my_dump_file)
import re
from sys import argv

def read_file(fname):
    """ open and extract the text from the file """
    txt_file = open(fname, 'r')
    txt = txt_file.read()
    txt_file.close()
    return txt

def clean_space(files):
    """ remove spaces from the file """
    return files.replace('\n', '')

def filter_file(files):
    """ remove punctuation and filter small words from the file """
    split_words = map(lambda x: re.sub('[^A-Za-z0-9]+', '', x),
                      files.split())
    filtered_txt = [x for x in split_words if len(x) > 1]
    return filtered_txt

def dict_count(files):
    """ for loop to return dict with word count and length keys """
    lengths = {}
    for word in filtered_text:  # And this also
        length = len(word)
        if length not in lengths:
            lengths[length] = 0
        lengths[length] += 1
    for length, counter in lengths.items():
        return "Words of length %d: %d" % (length, counter)

def print_result(fname):
    fi = dict_count(filter_file(clean_space(read_file(fname))))
    print fi

if __name__ == '__main__':
    script, fname = argv
    print_result(fname)
In the function dict_count you never create the filtered_text variable, yet you try to use it.
You must create the variable before using it:
filtered_text = filter_file(files)
complete code:
def dict_count(files):
    """ for loop to return dict with word count and length keys """
    lengths = {}
    filtered_text = filter_file(files)
    for word in filtered_text:  # And this also
        length = len(word)
        if length not in lengths:
            lengths[length] = 0
        lengths[length] += 1
    for length, counter in lengths.items():
        return "Words of length %d: %d" % (length, counter)
I have a file where each line is ordered alphabetically. The file is 12 GB, so I can't simply scan it line by line for every lookup. The data looks like this:
brown 0 1 0 1 2
fox 3 5 0 0 1
jumped 2 0 6 1 0
The words at the beginning of each line are unique. The word and the numbers on each line are separated by tabs. I want to be able to query the file for specific keywords. For example, if I query "fox", the program should return "fox 3 5 0 0 1".
It seems that a good candidate for this would be the bisect module: https://docs.python.org/3.0/library/bisect.html
I found a post which uses bisect to find out the line number of a keyword: How do I perform binary search on a text file to search a keyword in python?
This is what the code looks like:
import bisect
import os

class Query(object):
    def __init__(self, query, index=5):
        self.query = query
        self.index = index

    def __lt__(self, comparable):
        return self.query < comparable[self.index:]

class FileSearcher(object):
    def __init__(self, file_pointer, record_size=35):
        self.file_pointer = file_pointer
        self.file_pointer.seek(0, os.SEEK_END)
        self.record_size = record_size + len(os.linesep)
        self.num_bytes = self.file_pointer.tell()
        self.file_size = (self.num_bytes // self.record_size)

    def __len__(self):
        return self.file_size

    def __getitem__(self, item):
        self.file_pointer.seek(item * self.record_size)
        return self.file_pointer.read(self.record_size)

with open('myfile') as file_to_search:
    query = 'fox\t'  # token to query
    wrapped_query = Query(query)
    searchable_file = FileSearcher(file_to_search)
    linepos = bisect.bisect(searchable_file, wrapped_query)
    print "Located # line: ", linepos
    # print content of line?
However, I can't figure out how to actually print the content of the line. I should at least add a read statement somewhere, but I don't know where.
Is it possible to print the content of the line with the bisect module?
If you want to go with a pure Python solution, you can do the following:
Read the file in small chunks of MAX_LINE bytes, each time moving forward by a fixed offset.
That offset determines the block size.
For each such read, determine the key (the first word in the line).
These keys serve as delimiters of blocks.
Construct the list of such keys. The list will already be sorted, since the keys are ordered.
You may persist such a list somewhere via pickle/json.dumps/...
When querying, find via bisect the index of the block where your key is located.
Read that block entirely and find the key with its data.
Here is the example file bigfile:
abc 4
bar 2
baz 3
egg 6
foo 1
god 8
ham 5
sex 7
The code:
import os
from bisect import bisect

MAX_LINE = 7
BLOCK_SIZE = 10

def parse_chunks(filename):
    size = os.path.getsize(filename)
    chunks = []
    with open(filename, 'rb') as file:
        block = str(file.read(MAX_LINE*2))
        first_line = block[:block.find('\n') + 1]
        chunks.append(first_line.split()[0])
        pos = BLOCK_SIZE
        while pos < size:
            file.seek(pos)
            block = str(file.read(MAX_LINE*2))
            first_eol = block.find('\n')
            second_eol = block.find('\n', first_eol + 1)
            if first_eol == -1 or second_eol == -1:
                break
            line = block[first_eol + 1:second_eol]
            key = line.split()[0]
            chunks.append(key)
            pos += BLOCK_SIZE
    return chunks

if __name__ == '__main__':
    BLOCK_SIZE = 10
    filename = 'bigfile'
    chunks = parse_chunks(filename)
    query = 'abc'
    pos_before = bisect(chunks, query) - 1
    with open(filename, 'rb') as file:
        file.seek(pos_before*BLOCK_SIZE)
        block = str(file.read(BLOCK_SIZE + MAX_LINE))
        line_start = block.find(query)
        line_end = block.find('\n', line_start + 1)
        line = block[line_start:line_end]
        print(line)
In this toy example I use a block size of 10 bytes; for your 12 GB file I'd suggest starting with 1 MB.
The following recursive function should be able to narrow the search interval. I'm not sure whether you can modify it so that it returns a match, or None when there is no match.
def bisearch(f, word, i, j):
    if (j-i) < 1E6:
        return i, j
    k = (i+j)/2
    f.seek(k)
    while k < j:
        c = f.read(1)
        k = k+1
        if c == '\n':
            break
    else:
        # ??? no match ??? I'm not sure
        pass
    w = []
    while 1:
        c = f.read(1)
        if c == '\t':
            break
        w.append(c)
    w = "".join(w)
    if w == word:
        return k, k
    if w < word:
        return bisearch(f, word, k, j)
    else:
        return bisearch(f, word, i, k)
and here is an example of usage:
word = ...
f = open(...)
i, j = bisearch(f, word, 0, len_f)
f.seek(i)
if i == j:
    line = f.readline()
else:
    #################### EDIT ################
    # OLD
    # buffer = f.read(1E6)
    # NEW
    buffer = f.read(j-i)
    lenw = len(word)
    for line in buffer.split('\n'):
        if line[:lenw] == word:
            break
    else:
        # no matches, SOS
        pass
result = process(line)
Try seeking to the line in question and using readline.
print "Located # line: ", linepos
file_to_search.seek(linepos)
line = file_to_search.readline()
This is assuming linepos is the position of the line, counted in bytes from the beginning of the file. If it's the position counted in line numbers, you'll need to multiply by the number of bytes per line before seeking.
print "Located # line: ", linepos
file_to_search.seek(linepos * searchable_file.record_size)
line = file_to_search.readline()
I am looking to alter my MapReduce files to output the top bigrams in a chunk of text instead of the word count, i.e. both the word pair and the bigram count.
This is my current code and approach.
Map:
import sys

for line in sys.stdin:
    line = line.strip()
    words = line.split()          # bigrams = line.split()
    for word in words:            # for bigram in words
        print '%s\t%s' % (word, 1)    # print ... word pair???
Reduce:
import sys

mydict = dict()
for line in sys.stdin:
    (word, cnt) = line.strip().split('\t')   # bigram and bigram count
    mydict[word] = mydict.get(word, 0) + 1
for word, cnt in mydict.items():
    print word, cnt                          # print bigram and bigram count
Thank you.
I have seen nltk mentioned as a popular solution for computing bigrams; should I use that approach even in my MapReduce format?
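For reference, here is a minimal, untested sketch of the change the commented hints in the mapper point at: emit one tab-separated record per adjacent word pair instead of per word (the two-word key format is just one possible choice).
# mapper.py: emit one "word1 word2<TAB>1" record per adjacent word pair
import sys

for line in sys.stdin:
    words = line.strip().split()
    # pair each word with its successor to form the bigrams of the line
    for first, second in zip(words, words[1:]):
        print '%s %s\t%s' % (first, second, 1)
The reducer can then stay exactly as above, with the two-word string playing the role of the key.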
I wouldn't do it with stdin and stdout. I'd rather throw multiprocessing at this and read from some saved file:
import multiprocessing as mp

def main(infilepath):
    bgqIn, bgqOut = [mp.Queue() for _ in xrange(2)]
    procs = [mp.Process(target=mapper, args=(bgqIn, bgqOut)) for _ in xrange(mp.cpu_count())]
    for p in procs:
        p.start()

    with open(infilepath) as infile:
        first = ''
        second = ''
        for line in infile:
            line = line.lower()
            for word in line.split():
                first = second
                second = word
                bigram = (first, second)
                if first:  # skip the ('', word) pair produced before the first word is seen
                    bgqIn.put(bigram)

    for p in procs:
        bgqIn.put(None)

    # one (input queue, output queue) pair per reducer process
    rqs = [tuple(mp.Queue() for _ in xrange(2)) for i in xrange(mp.cpu_count())]
    rprocs = [mp.Process(target=reducer, args=rqs[i]) for i in xrange(mp.cpu_count())]
    for p in rprocs:
        p.start()

    # route bigrams to reducers by the first letter of their first word (assumes a-z)
    qmap = {}
    for char in xrange(97, 123):
        qmap[chr(char)] = rqs[(char - 97) * len(rqs) // 26][0]

    dones = 0
    while dones != len(procs):
        t = bgqOut.get()
        if t is None:
            dones += 1
        else:
            qmap[t[0][0][0]].put(t)

    for qIn, qOut in rqs:
        qIn.put(None)

    answer = {}
    for qIn, qOut in rqs:
        for bg, count in iter(qOut.get, None):
            if bg not in answer:
                answer[bg] = 0
            answer[bg] += count

    for bg, count in answer.iteritems():
        print "There are", count, "occurrences of", bg

def mapper(qIn, qOut):
    counts = {}
    for bg in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += 1
    for k, v in counts.iteritems():
        qOut.put((k, v))
    qOut.put(None)

def reducer(qIn, qOut):
    counts = {}
    for bg, count in iter(qIn.get, None):
        if bg not in counts:
            counts[bg] = 0
        counts[bg] += count
    for bg, count in counts.iteritems():
        qOut.put((bg, count))
    qOut.put(None)
I haven't tested this, but it's a basic skeleton that should get you started.
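A hypothetical invocation, assuming the text to count lives in a local file (the path input.txt is a placeholder, not part of the original answer):
if __name__ == '__main__':
    main('input.txt')  # placeholder path to the text file whose bigrams should be counted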
Pretty new to Python and programming in general. I've been working on a script but have run into indentation errors around the line for line in csv.reader( open(filename), delimiter="\t"). I've been trying a few things but could use a little help sorting it out; any ideas?
Could you explain any responses you give? It helps with the learning process. Thank you!
#!/usr/bin/python
import csv
import pprint
pp = pprint.PrettyPrinter(indent=4)

import sys
import getopt
import re

changes = {}

import argparse
parser = argparse.ArgumentParser()
parser.add_argument ("infile", metavar="CSV", nargs="+", type=str, help="data file")
args = parser.parse_args()

sample_names = []
SIMILARITY_CUTOFF = 95

#
# Function that investigates the similarity between two samples.
#
#
def similar_samples( sample_name1, sample_name2):
    combined_changes = dict()
    for change, fraction in changes[ sample_name1 ]:
        if ( change not in combined_changes):
            combined_changes[change] = []
        combined_changes[change].append(float(fraction))
    for change, fraction in changes[ sample_name2 ]:
        if ( change not in combined_changes):
            combined_changes[change] = []
        combined_changes[change].append(float(fraction))
    passed_changes = 0
    failed_changes = 0
    for change in combined_changes.keys():
        if ( len(combined_changes[ change ]) == 1):
            failed_changes += 1
            continue
        sum = 0
        count = 0
        for a in combined_changes[ change ]:
            sum += a
            count += 1
        mean = sum / count
        for a in combined_changes[ change ]:
            if ( mean > a + 2 or mean < a - 2):
                failed_changes += 1
            else:
                passed_changes += 1
    # print "passed changes: %d, failed changes: %d" % ( passed_changes, failed_changes)
    if ( passed_changes * 100 / (passed_changes + failed_changes) > SIMILARITY_CUTOFF):
        print " vs ".join([sample_name1, sample_name2]) + " : Similar samples"
        return 1
    else:
        print " vs ".join([sample_name1, sample_name2]) + " : Different samples"
        return 0
    # print "mean %.2f \n" % ( sum/ count)

for filename in args.infile:
    sample_name = filename
    #sample_name = re.search("^(.*)\_", filename).group(1)
    changes[ sample_name ] = []
sample_names.append( sample_name )
    for line in csv.reader( open(filename), delimiter="\t"):
        for item in line[2:]:
            if not item.strip():
                continue
            item = item.split(":")
            item[1] = item[1].rstrip("%")
            changes[ sample_name].append([line[1]+item[0],item[1]])

for i in range(0, len(sample_names)):
    for j in range(i+1, len(sample_names)):
        similar = similar_samples( sample_names[ i ], sample_names[ j ])

exit()
Indentation error. Try indenting the
sample_names.append( sample_name )
line.
The line before the one you quoted should be one indentation level deeper:
sample_names.append( sample_name )
That's the line I mean :)
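Applied to the loop in question, the fix both answers describe looks like this; only the indentation of the append line changes:
for filename in args.infile:
    sample_name = filename
    #sample_name = re.search("^(.*)\_", filename).group(1)
    changes[ sample_name ] = []
    sample_names.append( sample_name )  # now indented to match the rest of the loop body
    for line in csv.reader( open(filename), delimiter="\t"):
        for item in line[2:]:
            if not item.strip():
                continue
            item = item.split(":")
            item[1] = item[1].rstrip("%")
            changes[ sample_name].append([line[1]+item[0],item[1]])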