Pickle ValueError: binary mode doesn't take an encoding argument - python

I want to create a very basic Q&A chatbot. Given a list of questions & answers that I use as my dataset, I want to train it in order to return relevant answers, depending on a hard-coded question (different every time). First I tokenize, cleaning up, then using cosine similarity, but it gives me an error, which is (I guess) a pickle issue.
UPDATED
import csv
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from collections import Counter
import pickle
import os.path
import re, math
tokenizer = RegexpTokenizer(r'\w+')
stopwords = stopwords.words('english')
extra_stopwords = stopwords + ['I', 'can']
WORD = re.compile(r'\w+')
def get_clean_data():
clean_data_set = {
'questions' : {},
'answers' : {}
}
reader = csv.reader(open('data.csv', 'r', encoding="utf-8"))
tags = []
counter = 0
for r in reader:
question = str(r[0].encode('utf-8'))
answer = str(r[1].encode('utf-8'))
_, tags_question = get_tags(question)
_, tags_answer = get_tags(answer)
clean_data_set['answers'][answer] = tags_answer + tags_question
clean_data_set['questions'][question] = text_to_vector(question)
counter += 1
# hardcode the number :)
print (counter, ' out of 746')
# pickle.dump(clean_data_set, open('dump.dict', 'wb'))
with open('dump.dict', 'wb') as my_dump_file:
pickle.dump(clean_data_set, my_dump_file)
def get_tags(text, use_set = True):
tokens = tokenizer.tokenize(text)
# remove stop words from tokens
# make it lower case
filtered_words = [word.lower() for word in tokens if word not in extra_stopwords]
# return non duplicate values by default
if use_set == True:
filterd_words = list(set(filtered_words))
return Counter(filtered_words), filtered_words
# simple cosine similarity measure
def get_cosine(vec1, vec2):
intersection = set(vec1.keys()) & set(vec2.keys())
numerator = sum([vec1[x] * vec2[x] for x in intersection])
sum1 = sum([vec1[x]**2 for x in vec1.keys()])
sum2 = sum([vec2[x]**2 for x in vec2.keys()])
denominator = math.sqrt(sum1) * math.sqrt(sum2)
if not denominator:
return 0.0
else:
return float(numerator) / denominator
def text_to_vector(text):
words = WORD.findall(text)
return Counter(words)
# question_set is the data we had
def get_cosine_value(question, question_set):
question_vector = text_to_vector(question)
cosine = get_cosine(question_vector, question_set)
return cosine
def answer_question(question, top = 5):
with open('dump.dict', 'rb') as my_dump_file:
data_set = pickle.load(my_dump_file)
# data_set = pickle.load(open('dump.dict', 'rb'))
pickle.dump(a, handle, protocol=pickle.HIGHEST_PROTOCOL)
_, question_tags = get_tags(question)
ranking_dict = {}
similar_questions_rank = {}
for entry in data_set['answers']:
tags = data_set['answers'][entry]
# rank is the intersection between the list of tags from the question
# and the list of tags associated to answers
rank = len(set(question_tags).intersection(tags))
ranking_dict[entry] = rank
for entry in data_set['questions']:
cosine_similarity = get_cosine_value(question, data_set['questions'][entry])
similar_questions_rank[entry] = cosine_similarity
sorted_similarity_dict = sorted(similar_questions_rank.items(), key=lambda x: x[1], reverse=True)
sorted_ranking_dict = sorted(ranking_dict.items(), key=lambda x: x[1], reverse=True)
# sort them by rank
for item in sorted_ranking_dict[0:top-1]:
print ('Rank: ', item[1])
print ('Answer: ', item[0])
print ('\n\n')
# sort them by rank
for item in sorted_similarity_dict[0:top-1]:
print ('Rank: ', item[1])
print ('Question: ', item[0])
#get_clean_data()
question = 'why all these errors?'
answer_question(question)
This is the updated error message:
Traceback (most recent call last):
File "C:\Users\joasa\Desktop\si\main.py", line 133, in <module>
answer_question(question)
File "C:\Users\joasa\Desktop\si\main.py", line 94, in answer_question
data_set = pickle.load(my_dump_file)
EOFError: Ran out of input
[Finished in 1.4s]
Can someone help please? I have no idea what to do. Thanks in advance

I think it comes from this line in your get_clean_data function:
pickle.dump(clean_data_set, open('dump.dict', 'w'))
See here you open the file for writing, but you never close it, so when you try to read it there is nothing to signify that the end of file has been reached. To avoid stuff like this from happening, use a context manager block:
with open('dump.dict', 'wb') as my_dump_file:
pickle.dump(clean_data_set, my_dump_file)
That way, whichever way you exit the with block, you are guaranteed to close your file.
You should also do the same when loading your pickle dump in answer_question:
with open('dump.dict', 'rb') as my_dump_file:
data_set = pickle.load(my_dump_file)

Related

How to make my python code more computationally efficient

I am working on an information retrieval project, where I have to process a ~1.5 GB text data and create a Dictionary (words, document frequency) and posting list (document id, term frequency). According to the professor, it should take around 10-15 minutes. But my code is running for more than 8 hours now! I tried a smaller dataset (~35 MB) and it took 5 hours to process.
I am a newbie in python and I think it is taking so long because i have created many python dictionaries and lists in my code. I tried to use generator, but I am not sure how to work around with it.
file = open(filename, 'rt')
text = file.read()
file.close()
p = r'<P ID=\d+>.*?</P>'
tag = RegexpTokenizer(p)
passage = tag.tokenize(text)
doc_re = re.compile(r"<P ID=(\d+)>")
def process_data(docu):
tokens = RegexpTokenizer(r'\w+')
lower_tokens = [word.lower() for word in tokens.tokenize(docu)]
table = str.maketrans('','', string.punctuation)
stripped = [w.translate(table) for w in lower_tokens]
alpha = [word for word in stripped if word.isalpha()]
stopwordlist = stopwords.words('english')
stopped = [w for w in alpha if not w in stopwordlist]
return stopped
data = {}
for doc in passage:
group_docID = doc_re.match(doc)
docID = group_docID.group(1)
tokens = process_data(doc)
data[docID] = list(set(tokens))
vocab = [item for i in data.values() for item in i]
total_vocab = list(set(vocab))
total_vocab.sort()
print('Document Size = ', len(data))
print('Collection Size = ', len(vocab))
print('Vocabulary Size= ', len(total_vocab))
inv_index = {}
for x in total_vocab:
for y, z in data.items():
if x in z:
wordfreq = z.count(x)
inv_index.setdefault(x, []).append((int(y), wordfreq))
flattend = [item for tag in inv_index.values() for item in tag]
posting = [item for tag in flattend for item in tag ]
doc_freq=[]
for k,v in inv_index.items():
freq1=len([item for item in v if item])
doc_freq.append((freq1))
#offset value of each vocabulary/words
offset = []
offset1=0
for i in range(len(doc_freq)):
if i>0:
offset1 =offset1 + (doc_freq[i-1]*2)
offset.append((offset1))
#create dcitionary of words, document frequency and offset
dictionary = {}
for i in range(len(total_vocab)):
dictionary[total_vocab[i]]=(doc_freq[i],offset[i])
#dictionary of word, inverse document frequency
idf = {}
for i in range(len(dictionary)):
a = np.log2(len(data)/doc_freq[i])
idf[total_vocab[i]] = a
with open('dictionary.json', 'w') as f:
json.dump(dictionary,f)
with open('idf.json', 'w') as f:
json.dump(idf, f)
binary_file = open('binary_file.txt', 'wb')
for i in range(0, len(posting)):
binary_int = (posting[i]).to_bytes(4, byteorder = 'big')
binary_file.write(binary_int)
binary_file.close()
Could someone please help me to rewrite this code so that it becomes more computationally and time efficient?

Python code taking more than 15 minutes to generate output

import os,re
import math
from math import log10
import nltk.corpus
from nltk.tokenize import RegexpTokenizer
from nltk.corpus import stopwords
from nltk.stem.porter import PorterStemmer
from collections import defaultdict
python_file_root = './presidential_debates'
def getidf(token):
document_occurance = 0
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
for line in file:
if re.search(r'\b' +token+ r'\b', line):
document_occurance = document_occurance + 1
break
if (document_occurance != 0):
idf = log10(30 / document_occurance)
return idf
return -1
def normalize(filename,token):
file = open(os.path.join(python_file_root, filename), "r")
counts = dict()
square = []
count1 = 0
for line in file:
count1 = count1 + 1
if line in counts:
counts[line] += 1
else:
counts[line] = 1
for key,value in counts.items():
tf = 1 +log10(value)
idf = getidf(key.rstrip())
square.append((tf * idf)*(tf * idf))
summ = sum(square)
sqroot = math.sqrt(summ)
return sqroot
def getweight(filename,token):
hit_count1 = 0
final = 0
file = open(os.path.join(python_file_root, filename), "r")
idft = getidf(token)
for line in file:
if re.search(r'\b' +token+ r'\b', line):
hit_count1 = hit_count1 + 1
if (hit_count1 == 0):
return 0
else:
tf = 1 + log10(hit_count1)
initial = idft * tf
if(initial <= 0):
final = 0
return final
else:
normalize_fact = normalize(filename,token)
final = initial / normalize_fact
return final
for filename in os.listdir(python_file_root):
file = open(os.path.join(python_file_root, filename), "r")
doc = file.read()
doc = doc.lower()
stemmed = []
tokenizer = RegexpTokenizer(r'[a-zA-Z]+')
tokens = tokenizer.tokenize(doc)
stoplist = stopwords.words('english')
stop_removed = [word for word in tokens if word not in stoplist]
with open(os.path.join(python_file_root, filename), "w") as f:
for item in stop_removed:
stemmer = PorterStemmer()
stemmed = [stemmer.stem(item)]
for items in stemmed:
f.write("%s\n" % items)
print("\nIDF\n")
print("%.12f" % getidf("health"))
print("%.12f" % getidf("agenda"))
print("%.12f" % getidf("vector"))
print("%.12f" % getidf("reason"))
print("%.12f" % getidf("hispan"))
print("%.12f" % getidf("hispanic"))
print("\n")
print("%.12f" % getweight("2012-10-03.txt","health"))
print("%.12f" % getweight("1960-10-21.txt","reason"))
print("%.12f" % getweight("1976-10-22.txt","agenda"))
print("%.12f" % getweight("2012-10-16.txt","hispan"))
print("%.12f" % getweight("2012-10-16.txt","hispanic"))
I have 30 txt files and i have developed a program to find the idf and normalized tf-idf vectors. Im getting the correct values but the function getweight takes more than 15 minutes to generate the output. Can anyone suggest me a few methods for optimization.
I donot want to use any other non-standard Python package.
Why do you create a new PorterStemmer for every word?
Apart from this obvious thing, try profiling your code. NLTI has the reputation of being really slow - so it may well be not your fault. If you profile, then you'll know.

Error in prediction script using CNN model for text classification

I am try to write prediction part of script for the tutorial: https://mxnet.incubator.apache.org/tutorials/nlp/cnn.html
import mxnet as mx
from collections import Counter
import os
import re
import threading
import sys
import itertools
import numpy as np
from collections import namedtuple
SENTENCES_DIR = 'C:/code/mxnet/sentences'
CURRENT_DIR = 'C:/code/mxnet'
def clean_str(string):
string = re.sub(r"[^A-Za-z0-9(),!?\'\`]", " ", string)
string = re.sub(r"\'s", " \'s", string)
string = re.sub(r"\'ve", " \'ve", string)
string = re.sub(r"n\'t", " n\'t", string)
string = re.sub(r"\'re", " \'re", string)
string = re.sub(r"\'d", " \'d", string)
string = re.sub(r"\'ll", " \'ll", string)
string = re.sub(r",", " , ", string)
string = re.sub(r"!", " ! ", string)
string = re.sub(r"\(", " \( ", string)
string = re.sub(r"\)", " \) ", string)
string = re.sub(r"\?", " \? ", string)
string = re.sub(r"\s{2,}", " ", string)
return string.strip().lower()
def load_data_sentences(filename):
sentences_file = open( filename, "r")
# Tokenize
x_text = [line.decode('Latin1').strip() for line in sentences_file.readlines()]
x_text = [clean_str(sent).split(" ") for sent in x_text]
return x_text
def pad_sentences(sentences, padding_word=""):"
sequence_length = max(len(x) for x in sentences)
padded_sentences = []
for i in range(len(sentences)):
sentence = sentences[i]
num_padding = sequence_length - len(sentence)
new_sentence = sentence + [padding_word] * num_padding
padded_sentences.append(new_sentence)
return padded_sentences
def build_vocab(sentences):
word_counts = Counter(itertools.chain(*sentences))
vocabulary_inv = [x[0] for x in word_counts.most_common()]
vocabulary = {x: i for i, x in enumerate(vocabulary_inv)}
return vocabulary, vocabulary_inv
def build_input_data(sentences, vocabulary):
x = np.array([
[vocabulary[word] for word in sentence]
for sentence in sentences])
return x
def predict(mod, sen):
mod.forward(Batch(data=[mx.nd.array(sen)]))
prob = mod.get_outputs()[0].asnumpy()
prob = np.squeeze(prob)
a = np.argsort(prob)[::-1]
for i in a[0:5]:
print('probability=%f' %(prob[i]))
sentences = load_data_sentences( os.path.join( SENTENCES_DIR, 'test-pos-1.txt') )
sentences_padded = pad_sentences(sentences)
vocabulary, vocabulary_inv = build_vocab(sentences_padded)
x = build_input_data(sentences_padded, vocabulary)
Batch = namedtuple('Batch', ['data'])
sym, arg_params, aux_params = mx.model.load_checkpoint( os.path.join( CURRENT_DIR, 'cnn'), 19)
mod = mx.mod.Module(symbol=sym, context=mx.cpu(), label_names = None)
mod.bind(for_training=False, data_shapes=[('data', (50,56))], label_shapes=mod._label_shapes)
mod.set_params(arg_params, aux_params, allow_missing=True)
predict(mod, x)
But I got the error:
infer_shape error. Arguments: data: (50, 26L)
Traceback (most recent call last):
File "C:\code\mxnet\test2.py", line 152, in predict(mod, x)
File "C:\code\mxnet\test2.py", line 123, in predict
mod.forward(Batch(data=[mx.nd.array(sen)]))
...
MXNetError: Error in operator reshape0: [16:20:21] c:\projects\mxnet-distro-win\mxnet-build\src\operator\tensor./matrix_op-inl.h:187:
Check failed: oshape.Size() == dshape.Size() (840000 vs. 390000)
Target shape size is different to source.
Target: [50,1,56,300]
Source: [50,26,300]
Source is text file with 50 strings of sentences
Unfortunately I didn't found any help in Internet. Please take a look.
OS: Windows 10. Python 2.7
Thank you.
I believe the error you're having is because the padding of your input sentences is different than what the model expects. The way pad_sentences works is to pad the sentences to the length of the longest sentence passed in, so if you're using a different data set, you'll almost certainly get a different padding than your model's padding (which is 56). In this case, it looks like you're getting a padding of 26 (From the error message 'Source: [50, 26, 300]').
I was able to get your code to run successfully by modifying pad_sentence as follows and running it with sequence_length=56 to match the model.
def pad_sentences(sentences, sequence_length, padding_word=""):
padded_sentences = []
for i in range(len(sentences)):
sentence = sentences[i]
num_padding = sequence_length - len(sentence)
new_sentence = sentence + [padding_word] * num_padding
padded_sentences.append(new_sentence)
return padded_sentences
N.B when you do get your successful run, you'll encounter an error because prob[i] is not a float.
def predict(mod, sen):
mod.forward(Batch(data=[mx.nd.array(sen)]))
prob = mod.get_outputs()[0].asnumpy()
prob = np.squeeze(prob)
a = np.argsort(prob)[::-1]
for i in a[0:5]:
print('probability=%f' %(prob[i])) << prob is a numpy.ndarray, not a float.
Vishaal

TypeError: list object is not callable

I have the following function:
def sample_handling(sample, lexicon, classification):
featureset = []
with open(sample, 'r') as f:
contents = f.readlines()
for l in contents[:hm_lines]:
current_words = word_tokenize(l.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros(len(lexicon))
for word in current_words():
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
features[index_value] += 1
features = list(features)
featureset.append([features, classification])
return featureset
When I run the code, it gives me the following error:
TypeError: 'list' object is not callable
Is there any overshadowing going on here? I followed many threads on SO dealing with this error but could not solve my problem.
This is my full code:
import nltk
from nltk.tokenize import word_tokenize
from nltk.stem import WordNetLemmatizer
import numpy as np
import random
import pickle
from collections import Counter
lemmatizer = WordNetLemmatizer()
hm_lines = 10000000
def create_lexicon(pos, neg):
lexicon = []
for fi in [pos, neg]:
with open(fi, 'r') as f:
contents = f.readlines()
for l in contents[:hm_lines]:
all_words = word_tokenize(l.lower())
lexicon += list(all_words)
lexicon = [lemmatizer.lemmatize(i) for i in lexicon]
w_counts = Counter(lexicon)
#w_counts = {'the': 52521, 'and': 25242}
l2 = []
for w in w_counts:
if 1000 > w_counts[w] > 50:
l2.append(w)
print(l2)
return l2
def sample_handling(sample, lexicon, classification):
featureset = []
with open(sample, 'r') as f:
contents = f.readlines()
for l in contents[:hm_lines]:
current_words = word_tokenize(l.lower())
current_words = [lemmatizer.lemmatize(i) for i in current_words]
features = np.zeros(len(lexicon))
for word in current_words():
if word.lower() in lexicon:
index_value = lexicon.index(word.lower())
features[index_value] += 1
features = list(features)
featureset.append([features, classification])
return featureset
def create_feature_sets_and_lables(pos, neg, test_size = 0.1):
lexicon = create_lexicon(pos, neg)
features = []
features += sample_handling('pos.txt', lexicon,[1,0])
features += sample_handling('neg.txt', lexicon,[0,1])
random.shuffle(features)
features = np.array(features)
testing_size = int(test_size * len(features))
train_x = list(features[:,0][:-testing_size])
train_y = list(features[:,1][:-testing_size])
test_x = list(features[:,0][-testing_size:])
test_y = list(features[:,1][-testing_size:])
return train_x, train_y, test_x, test_y
if __name__ == '__main__':
train_x, train_y, test_x, test_y = create_feature_sets_and_lables('pos.txt', 'neg.txt')
with open('sentiment_set.pickle', 'wb') as f:
pickle.dump([train_x, train_y, test_x, test_y], f)
It would have been more helpful had you printed the full stack-trace. As this is a relatively simple error, the problem is easily identifiable in this case. It's this line,
for word in current_words():
You needn't call a list while looping it. Simply this will do,
for word in current_words:
Well, to start debugging, I would run the program by using
python -m pdb whatever_your_file_is.py
This will launch a pdb debugging console. Once there, press 'c' to run the program. After awhile, supposing the program crashes, you'll be stopped at the exact location where the error occurred.
From there, you can refer to this or this (just Google python pdb) to figure out exactly what's going on in your code.
Good luck!

error in Naive bayes classifier

i'm beginner in machine learning and i'm trying to implement my first Naive Bayes by myself for better understanding. So, i have dataset from http://archive.ics.uci.edu/ml/datasets/Adult (american census data, classes are '<=50k' and '>50k').
Here is my python code:
#!/usr/bin/python
import sys
import csv
words_stats = {} # {'word': {'class1': cnt, 'class2': cnt'}}
words_cnt = 0
targets_stats = {} # {'class1': 3234, 'class2': 884} how many words in each class
class_stats = {} # {'class1': 7896, 'class2': 3034} how many lines in each class
items_cnt = 0
def train(dataset, targets):
global words_stats, words_cnt, targets_stats, items_cnt, class_stats
num = len(dataset)
for item in xrange(num):
class_stats[targets[item]] = class_stats.get(targets[item], 0) + 1
for i in xrange(len(dataset[item])):
word = dataset[item][i]
if not words_stats.has_key(word):
words_stats[word] = {}
tgt = targets[item]
cnt = words_stats[word].get(tgt, 0)
words_stats[word][tgt] = cnt + 1
targets_stats[tgt] = targets_stats.get(tgt, 0) + 1
words_cnt += 1
items_cnt = num
def classify(doc, tgt_set):
global words_stats, words_cnt, targets_stats, items_cnt
probs = {} #the probability itself P(c|W) = P(W|c) * P(c) / P(W)
pc = {} #probability of the class in document set P(c)
pwc = {} #probability of the word set in particular class. P(W|c)
pw = 1 #probability of the word set in documet set
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
pw = pw * float(sum(words_stats[word].values())) / words_cnt
for tgt in tgt_set:
pc[tgt] = class_stats[tgt] / float(items_cnt)
for word in doc:
if word not in words_stats:
continue #dirty, very dirty
tgt_wrd_cnt = words_stats[word].get(tgt, 0)
pwc[tgt] = pwc.get(tgt, 1) * float(tgt_wrd_cnt) / targets_stats[tgt]
probs[tgt] = (pwc[tgt] * pc[tgt]) / pw
l = sorted(probs.items(), key = lambda i: i[1], reverse=True)
print probs
return l[0][0]
def check_results(dataset, targets):
num = len(dataset)
tgt_set = set(targets)
correct = 0
incorrect = 0
for item in xrange(num):
res = classify(dataset[item], tgt_set)
if res == targets[item]:
correct = correct + 1
else:
incorrect = incorrect + 1
print 'correct:', float(correct) / num, ' incorrect:', float(incorrect) / num
def load_data(fil):
data = []
tgts = []
reader = csv.reader(fil)
for line in reader:
d = [x.strip() for x in line]
if '?' in d:
continue
if not len(d):
continue
data.append(d[:-1])
tgts.append(d[-1:][0])
return data, tgts
if __name__ == '__main__':
if len(sys.argv) < 3:
print './program train_data.txt test_data.txt'
sys.exit(1)
filename = sys.argv[1]
fil = open(filename, 'r')
data, tgt = load_data(fil)
train(data, tgt)
test_file = open(sys.argv[2], 'r')
test_data, test_tgt = load_data(test_file)
check_results(test_data, tgt)
it gives ~61% of correct results. when i print probabilities i get the following:
{'<=50K': 0.07371606889800396, '>50K': 15.325378327213354}
but in case of correct classifier i expect to see sum of both probabilities equal to 1.
At first i thought the problem is in float underflow and tried to make all calculations in logarithms, but results were similiar.
i understand that omitting some words is gonna affect accuracy, but the probabilities are sooo wrong.
What do i do wrong or don't understand?
for your convinience i've uploaded dataset and python script here:
https://dl.dropboxusercontent.com/u/36180992/adult.tar.gz
Thank you for your help.
Naive Bayes doesn't compute a probability directly, rather it computes a "raw score" that is relatively compared to the other scores for each label in order to classify an instance. This score can easily be converted to a "probability" in the range of [0, 1]:
total = sum(probs.itervalues())
for label, score in probs.iteritems():
probs[label] = score / total
However, keep in mind this still doesn't represent a true probability, as mentioned in this answer:
naive Bayes tends to predict probabilities that are almost always either very close to zero or very close to one.

Categories

Resources