I am a beginner to AI and sentimental analysis. I'm doing sentimental analysis between two documents. This code is working perfectly fine when I add only one source document rather than a list of multiple source documents to compare it with multiple target documents.
Can someone please tell me what I need to change to work it with multiple source documents list?
#Loading pre=trained word2vec model
from gensim.models.keyedvectors import KeyedVectors
# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory
model_path = 'E:\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
#Setting Parameters for model
class DocSim(object):
def __init__(self, w2v_model , stopwords=[]):
self.w2v_model = w2v_model
self.stopwords = stopwords
def vectorize(self, doc):
"""Identify the vector values for each word in the given document"""
doc = doc.lower()
words = [w for w in doc.split(" ") if w not in self.stopwords]
word_vecs = []
for word in words:
try:
vec = self.w2v_model[word]
word_vecs.append(vec)
except KeyError:
# Ignore, if the word doesn't exist in the vocabulary
pass
# Assuming that document vector is the mean of all the word vectors
vector = np.mean(word_vecs, axis=0)
return vector
def _cosine_sim(self, vecA, vecB):
"""Find the cosine similarity distance between two vectors."""
csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
if np.isnan(np.sum(csim)):
return 0
return csim
def calculate_similarity(self, source_doc, target_docs=[], threshold=0):
"""Calculates & returns similarity scores between given source document & all
the target documents."""
if isinstance(target_docs, str):
target_docs = [target_docs]
source_vec = self.vectorize(source_doc)
results = []
for doc in target_docs:
target_vec = self.vectorize(doc)
sim_score = self._cosine_sim(source_vec, target_vec)
if sim_score > threshold:
results.append({
'score' : sim_score,
'doc' : doc
})
# Sort results by score in desc order
results.sort(key=lambda k : k['score'] , reverse=True)
return results
ds = DocSim(w2v_model)
#Calculate the similarity score between a source rule & a target rule.
source_rule = [ '2.1.1 Context','2.2.3 Value']
target_rule = [ '2.1.1 Context','2.1.2.4 Assist Failed Train']
# This will return one target rules text with a similarity score
sim_scores = ds.calculate_similarity(source_rule, target_rule)
print(sim_scores)
This is the error I am getting right now.
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-22-041084a3f599> in <module>
6 # This will return one target rules text with similarity score
7
----> 8 sim_scores = ds.calculate_similarity(source_rule, target_rule)
9
10 print(sim_scores)
<ipython-input-20-055f5d25808f> in calculate_similarity(self, source_doc, target_docs, threshold)
41 source_doc=[source_doc]
42
---> 43 source_vec = self.vectorize(source_doc)
44 results = []
45 for doc in target_docs:
<ipython-input-20-055f5d25808f> in vectorize(self, doc)
8 def vectorize(self, doc):
9 """Identify the vector values for each word in the given document"""
---> 10 doc = doc.lower()
11 words = [w for w in doc.split(" ") if w not in self.stopwords]
12 word_vecs = []
AttributeError: 'list' object has no attribute 'lower'
Rather than sending the whole list to the function, make sure the source_rule is a list and then iterate over it and then execute the calculate_similarity() function on it
#Loading pre=trained word2vec model
from gensim.models.keyedvectors import KeyedVectors
# You need to dowload google pre-trained model using below link
# https://drive.google.com/file/d/0B7XkCwpI5KDYNlNUTTlSS21pQmM/edit
#Change the path according to your directory
model_path = 'E:\GoogleNews_vectors_negative300.bin'
w2v_model = KeyedVectors.load_word2vec_format(model_path, binary=True)
#Setting Parameters for model
class DocSim(object):
def __init__(self, w2v_model , stopwords=[]):
self.w2v_model = w2v_model
self.stopwords = stopwords
def vectorize(self, doc):
"""Identify the vector values for each word in the given document"""
doc = doc.lower()
words = [w for w in doc.split(" ") if w not in self.stopwords]
word_vecs = []
for word in words:
try:
vec = self.w2v_model[word]
word_vecs.append(vec)
except KeyError:
# Ignore, if the word doesn't exist in the vocabulary
pass
# Assuming that document vector is the mean of all the word vectors
vector = np.mean(word_vecs, axis=0)
return vector
def _cosine_sim(self, vecA, vecB):
"""Find the cosine similarity distance between two vectors."""
csim = np.dot(vecA, vecB) / (np.linalg.norm(vecA) * np.linalg.norm(vecB))
if np.isnan(np.sum(csim)):
return 0
return csim
def calculate_similarity(self, source_doc, target_docs=[], threshold=0):
"""Calculates & returns similarity scores between given source document & all
the target documents."""
if isinstance(target_docs, str):
target_docs = [target_docs]
source_vec = self.vectorize(source_doc)
results = []
for doc in target_docs:
target_vec = self.vectorize(doc)
sim_score = self._cosine_sim(source_vec, target_vec)
if sim_score > threshold:
results.append({
'score' : sim_score,
'doc' : doc
})
# Sort results by score in desc order
results.sort(key=lambda k : k['score'] , reverse=True)
return results
ds = DocSim(w2v_model)
#Calculate the similarity score between a source rule & a target rule.
source_rule = [ '2.1.1 Context','2.2.3 Value']
target_rule = [ '2.1.1 Context','2.1.2.4 Assist Failed Train']
if isinstance(source_rule, str):
source_rule = [source_rule]
# This will return one target rules text with a similarity score
for rule in source_rule:
sim_scores = ds.calculate_similarity(rule, target_rule)
print("Similarity with {} is {}".format(rule, sim_scores))
Related
I created my own custom pipeline for text processing. Inside the .transform() method, I want to remove the target row if there are no tokens.
class SpacyVectorizer(BaseEstimator, TransformerMixin):
def __init__(
self,
alpha_only: bool = True,
lemmatize: bool = True,
remove_stopwords: bool = True,
case_fold: bool = True,
):
self.alpha_only = alpha_only
self.lemmatize = lemmatize
self.remove_stopwords = remove_stopwords
self.case_fold = case_fold
self.nlp = spacy.load(
name='en_core_web_sm',
disable=["parser", "ner"]
)
def fit(self, X, y=None):
return self
def transform(self, X, y):
# Bag-of-Words matrix
bow_matrix = []
# Iterate over documents in SpaCy pipeline
for i, doc in enumerate(nlp.pipe(X)):
# Words array
words = []
# Tokenize document
for token in doc:
# Remove non-alphanumeric tokens
if self.alpha_only and not token.is_alpha:
continue
# Stopword removal
if self.remove_stopwords and token.is_stop:
continue
# Lemmatization
if self.lemmatize:
token = token.lemma_
# Case folding
if self.case_fold:
token = str(token).casefold()
# Append token to words array
words.append(token)
# Update the Bow representation
if words:
# Preprocessed document
new_doc = ' '.join(words)
# L2-normalized vector of preprocessed document
word_vec = nlp(new_doc).vector
else:
# Remove target label
y.drop(y.index[i], inplace=True)
# Update the BoW matrix
bow_matrix.append(word_vec)
# Return BoW matrix
return bow_matrix
Unfortunately, because I cannot pass the y vector to the .transform() method, it does not work.
How can I force the pipeline to pass both X and y parameters?
Is there any other workaround on how to do it?
I don't want to pass y via .fit_transform(), because test data shouldn't be fitted.
def transform(self, X, y=None):
Here you have written y = None, which means if you aren't passing any y value then it's taking a default value as None.
In order to force a pipeline to pass a y value u should write
def transform(self, X, y):
pass
If you do this then you have to pass a y value, else it will return a error
the space problem I am talking about
class SpacyVectorizer:
def __init__(
self,
alpha_only: bool = True,
lemmatize: bool = True,
remove_stopwords: bool = True,
case_fold: bool = True,
):
self.alpha_only = alpha_only
self.lemmatize = lemmatize
self.remove_stopwords = remove_stopwords
self.case_fold = case_fold
self.nlp = spacy.load(
name='en_core_web_sm',
disable=["parser", "ner"]
)
def transform(self, X, y):
# Bag-of-Words matrix
bow_matrix = []
# Iterate over documents in SpaCy pipeline
for i, doc in enumerate(nlp.pipe(X)):
# Words array
words = []
# Tokenize document
for token in doc:
# Remove non-alphanumeric tokens
if self.alpha_only and not token.is_alpha:
continue
# Stopword removal
if self.remove_stopwords and token.is_stop:
continue
# Lemmatization
if self.lemmatize:
token = token.lemma_
# Case folding
if self.case_fold:
token = str(token).casefold()
# Append token to words array
words.append(token)
# Update the Bow representation
if words:
# Preprocessed document
new_doc = ' '.join(words)
# L2-normalized vector of preprocessed document
word_vec = nlp(new_doc).vector
else:
# Remove target label
y.drop(y.index[i], inplace=True)
# Update the BoW matrix
bow_matrix.append(word_vec)
# Return BoW matrix
return bow_matrix
The error you are getting might be because of the space problem, as self might be taking x value and X parameter might be taking y value
I have a set of about 200 images that I want to cluster into groups of images with similar features. I'm using Resnet50 to extract feature vectors from images and with the help of Faiss Kmeans I'm trying to cluster them into groups.
I have defined a class for Faiss KMeans as given on the link here.
class FaissKMeans:
def __init__(self, n_clusters=8, n_init=10, max_iter=300):
self.n_clusters = n_clusters
self.n_init = n_init
self.max_iter = max_iter
self.kmeans = None
self.cluster_centers_ = None
self.inertia_ = None
def fit(self, X, y):
self.kmeans = faiss.Kmeans(d=X.shape[1],
k=self.n_clusters,
niter=self.max_iter,
nredo=self.n_init)
self.kmeans.train(X.astype(np.float32))
self.cluster_centers_ = self.kmeans.centroids
self.inertia_ = self.kmeans.obj[-1]
def predict(self, X):
return self.kmeans.index.search(X.astype(np.float32), 1)[1]
I'm storing the images and their vectors in a dictionary as key-value pairs.
#function to extract image vector
def extract_features(file, model):
img = load_img(file,target_size=(224,224))
img = np.array(img)
reshaped_img = img.reshape(1,224,224,3)
imgx = preprocess_input(reshaped_img)
features = model.predict(imgx,use_multiprocessing=True)
return features
#append the images in a folder to list "products"
products = []
with os.scandir(mypath) as files:
for file in files:
if file.name.endswith('.jpg'):
products.append(file.name)
#load ResNet50 model
model = ResNet50()
model = Model(inputs = model.inputs, outputs = model.layers[-2].output)
#save image and image vector to dictionary "feature_dict" as key value pair
feature_dict = {}
p = pkl_path
for product in products:
try:
feat = extract_features(product,model)
feature_dict[product] = feat
except:
with open(p,'wb') as file:
pickle.dump(data,file)
#convert dictionary to a numpy array
filenames = np.array(list(feature_dict.keys()))
feat = np.array(list(feature_dict.values()))
feat = feat.reshape(-1,2048)
I'm using the package "kneed" to determine the number of clusters
#determine the number of clusters
length = len(filenames)
lim = 25
sse = []
list_k = list(range(1, lim))
for k in list_k:
km = KMeans(n_clusters=k,random_state=22, n_jobs=-1)
labels= km.fit_predict(feat)
sse.append(km.inertia_)
kneedle=KneeLocator(list_k,sse,curve='convex',direction='decreasing')
elbow = kneedle.elbow #number of clusters
Now I'm trying to cluster the images into different groups using faiss Kmeans but I'm getting the error of AttributeError: 'Kmeans' object has no attribute 'fit' on kmeans.fit(feat)
kmeans = faiss.Kmeans(d=feat.shape[0] ,k=elbow, niter=200)
kmeans.fit(feat)
kmeans.train(feat)
When I try to use kmeans.train(feat) which I found on the link, I get the error AssertionError
I'm trying to build a utility where a dataset will be processed by the NMF model every couple of days. For this in the first run, I'm providing with a starting value for the number of topics. How can I calculate the coherence score for this entire dataset? I'm planning to use this calculated score to rebuild the model so that it'll be more accurate. Below is the code that I've used.
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.decomposition import NMF
import pandas as pd
import clr
#PLOTTING TOOLS
# import matplotlib.pyplot as PLOTTING
import matplotlib.pyplot as plt
import warnings
warnings.filterwarnings("ignore", category = DeprecationWarning)
dataset = pd.read_json('out.json', lines = True)
documents = dataset['attachment']
no_features = 1000
no_topics = 9
# print ('Old number of topics: ', no_topics)
tfidf_vectorizer = TfidfVectorizer(max_df = 0.95, min_df = 2, max_features = no_features, stop_words = 'english', norm='l2')
tfidf = tfidf_vectorizer.fit_transform(documents)
tfidf_feature_names = tfidf_vectorizer.get_feature_names()
no_topics = tfidf.shape
retrain_value = no_topics[0]
# print('New number of topics :', retrain_value)
nmf = NMF(n_components = retrain_value, random_state = 1, alpha = .1, l1_ratio = .5, init = 'nndsvd').fit(tfidf)
def display_topics(model, feature_names, no_top_words):
for topic_idx, topic in enumerate(model.components_):
print ("Topic %d: " % (topic_idx))
print (" ".join([feature_names[i] for i in topic.argsort()[:-no_top_words -1:-1]]))
no_top_words = 20
display_topics(nmf, tfidf_feature_names, no_top_words)
Unfortunately there is no out-of-the-box coherence model for sklearn.decomposition.NMF.
I've had the very same issue and found a custom implementation that is working with python 3.8.
It should be easy to adapt to your code. Please check the link for full imports, etc.
A snipptet from my recent usage of this technique:
kmin, kmax = 2, 30
topic_models = []
# try each value of k
for k in range(kmin,kmax+1):
print("Applying NMF for k=%d ..." % k )
# run NMF
model = decomposition.NMF( init="nndsvd", n_components=k )
W = model.fit_transform( A )
H = model.components_
# store for later
topic_models.append( (k,W,H) )
class TokenGenerator:
def __init__( self, documents, stopwords ):
self.documents = documents
self.stopwords = stopwords
self.tokenizer = re.compile( r"(?u)\b\w\w+\b" )
def __iter__( self ):
print("Building Word2Vec model ...")
for doc in self.documents:
tokens = []
for tok in self.tokenizer.findall( doc ):
if tok.lower() in self.stopwords:
tokens.append( "<stopword>" )
elif len(tok) >= 2:
tokens.append( tok.lower() )
yield tokens
docgen = TokenGenerator(docs_raw, stop_words)
w2v_model = gensim.models.Word2Vec(docgen, size=500, min_count=20, sg=1)
def calculate_coherence( w2v_model, term_rankings ):
overall_coherence = 0.0
for topic_index in range(len(term_rankings)):
# check each pair of terms
pair_scores = []
for pair in combinations( term_rankings[topic_index], 2 ):
#print(str(pair[0]) + " " + str(pair[1]))
pair_scores.append( w2v_model.similarity(pair[0], pair[1]))
# get the mean for all pairs in this topic
topic_score = sum(pair_scores) / len(pair_scores)
overall_coherence += topic_score
# get the mean score across all topics
return overall_coherence / len(term_rankings)
def get_descriptor( all_terms, H, topic_index, top ):
# reverse sort the values to sort the indices
top_indices = np.argsort( H[topic_index,:] )[::-1]
# now get the terms corresponding to the top-ranked indices
top_terms = []
for term_index in top_indices[0:top]:
top_terms.append( all_terms[term_index] )
return top_terms
k_values = []
coherences = []
for (k,W,H) in topic_models:
# Get all of the topic descriptors - the term_rankings, based on top 10 terms
term_rankings = []
for topic_index in range(k):
term_rankings.append( get_descriptor( terms, H, topic_index, 10 ) )
# Now calculate the coherence based on our Word2vec model
k_values.append( k )
coherences.append( calculate_coherence( w2v_model, term_rankings ) )
print("K=%02d: Coherence=%.4f" % ( k, coherences[-1] ) )
%matplotlib inline
plt.style.use("ggplot")
matplotlib.rcParams.update({"font.size": 14})
fig = plt.figure(figsize=(13,7))
# create the line plot
ax = plt.plot( k_values, coherences )
plt.xticks(k_values)
plt.xlabel("Number of Topics")
plt.ylabel("Mean Coherence")
# add the points
plt.scatter( k_values, coherences, s=120)
# find and annotate the maximum point on the plot
ymax = max(coherences)
xpos = coherences.index(ymax)
best_k = k_values[xpos]
plt.annotate( "k=%d" % best_k, xy=(best_k, ymax), xytext=(best_k, ymax), textcoords="offset points", fontsize=16)
# show the plot
plt.show()
Results:
K=02: Coherence=0.4157
K=03: Coherence=0.4399
K=04: Coherence=0.4626
K=05: Coherence=0.4333
K=06: Coherence=0.4075
K=07: Coherence=0.4121
...
EDIT 1: Following the answer received by akshat, I used this tutorial to load the model.
The code has been updated as follows:
import tensorflow as tf
class Bot:
'''This class defines the routine of the bot'''
def __init__(self, presentation):
'''The bot presents itself'''
self.presentation = presentation
print('')
print(presentation)
def ask_question(self):
'''This method defines how the bot asks questions'''
print('')
self.answer = str(input('Please enter your question: ')).split(' ')
print('')
print('Thank you for your question. Let me check..')
def answer_question(self):
'''This method answer the user's questions'''
print('')
print(self.answer)
def load_model(self):
with tf.Session() as sess:
new_saver = tf.train.import_meta_graph('model.tflearn.meta')
new_saver.restore(sess, tf.train.latest_checkpoint('./'))
print(sess.run('w1:0'))
bot = Bot('Good morning, my Name is BotPy')
question = bot.ask_question()
answer = bot.answer_question()
model = bot.load_model()
Launching it the following traceback is received:
2018-07-29 08:28:14.622798: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
WARNING:tensorflow:The saved meta_graph is possibly from an older release:
'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.
Traceback (most recent call last):
File "/home/marco/PycharmProjects/chatBot/main.py", line 41, in <module>
model = bot.load_model()
File "/home/marco/PycharmProjects/chatBot/main.py", line 33, in load_model
new_saver = tf.train.import_meta_graph('model.tflearn.meta')
File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1960, in import_meta_graph
**kwargs)
File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/meta_graph.py", line 790, in import_scoped_meta_graph
ops.prepend_name_scope(value, scope_to_prepend_to_names))
File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3613, in as_graph_element
return self._as_graph_element_locked(obj, allow_tensor, allow_operation)
File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3673, in _as_graph_element_locked
"graph." % repr(name))
KeyError: "The name 'Adam' refers to an Operation not in the graph."
ORIGINAL QUESTION: I am trying to build a chatbot.
Following this tutorial, I already have a jupyter notebook solution that is working with a model already trained.
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
from ._conv import register_converters as _register_converters
stemmer = LancasterStemmer()
with open('intents.json') as json_data:
intents = json.load(json_data)
words = []
classes = []
documents = []
ignore_words = ['?']
# loop through each sentence in our intents patterns
for intent in intents['intents']:
for pattern in intent['patterns']:
# tokenize each word in the sentence
w = nltk.word_tokenize(pattern)
# add to our words list
words.extend(w)
# add to documents in our corpus
documents.append((w, intent['tag']))
# add to our classes list
if intent['tag'] not in classes:
classes.append(intent['tag'])
# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))
# remove duplicates
classes = sorted(list(set(classes)))
print (len(documents), "documents")
print (len(classes), "classes", classes)
print (len(words), "unique stemmed words", words)
20 documents
6 classes ['goodbye', 'greeting', 'hours', 'opentoday', 'payments', 'thanks']
32 unique stemmed words ["'s", 'acceiv', 'anyon', 'ar', 'bye', 'card', 'cash', 'credit', 'day', 'do', 'good', 'goodby', 'hello', 'help', 'hi', 'hour', 'how', 'is', 'lat', 'mastercard', 'on', 'op', 'see', 'tak', 'thank', 'that', 'ther', 'today', 'what', 'when', 'yo', 'you']
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)
# training set, bag of words for each sentence
for doc in documents:
# initialize our bag of words
bag = []
# list of tokenized words for the pattern
pattern_words = doc[0]
# stem each word
pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
# create our bag of words array
for w in words:
bag.append(1) if w in pattern_words else bag.append(0)
# output is a '0' for each tag and '1' for current tag
output_row = list(output_empty)
output_row[classes.index(doc[1])] = 1
training.append([bag, output_row])
# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)
# create train and test lists
train_x = list(training[:,0])
train_y = list(training[:,1])
# reset underlying graph data
tf.reset_default_graph()
# Build neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)
# Define model and setup tensorboard
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training (apply gradient descent algorithm)
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
model.save('model.tflearn')
Training Step: 2999 | total loss: 0.01771 | time: 0.006s
| Adam | epoch: 1000 | loss: 0.01771 - acc: 0.9999 -- iter: 16/20
Training Step: 3000 | total loss: 0.01754 | time: 0.009s
| Adam | epoch: 1000 | loss: 0.01754 - acc: 1.0000 -- iter: 20/20
--
INFO:tensorflow:/home/marco/PycharmProjects/chatBot/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.
# save all of our data structures
import pickle
pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data", "wb" ) )
data = pickle.load( open( "training_data", "rb" ) )
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']
# import our chat-bot intents file
import json
with open('intents.json') as json_data:
intents = json.load(json_data)
# load our saved model
model.load('./model.tflearn')
INFO:tensorflow:Restoring parameters from /home/marco/PycharmProjects/chatBot/model.tflearn
def clean_up_sentence(sentence):
# tokenize the pattern
sentence_words = nltk.word_tokenize(sentence)
# stem each word
sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
return sentence_words
# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
# tokenize the pattern
sentence_words = clean_up_sentence(sentence)
# bag of words
bag = [0]*len(words)
for s in sentence_words:
for i,w in enumerate(words):
if w == s:
bag[i] = 1
if show_details:
print ("found in bag: %s" % w)
return(np.array(bag))
# create a data structure to hold user context
context = {}
ERROR_THRESHOLD = 0.25
def classify(sentence):
# generate probabilities from the model
results = model.predict([bow(sentence, words)])[0]
# filter out predictions below a threshold
results = [[i,r] for i,r in enumerate(results) if r>ERROR_THRESHOLD]
# sort by strength of probability
results.sort(key=lambda x: x[1], reverse=True)
return_list = []
for r in results:
return_list.append((classes[r[0]], r[1]))
# return tuple of intent and probability
return return_list
def response(sentence, userID='123', show_details=False):
results = classify(sentence)
# if we have a classification then find the matching intent tag
if results:
# loop as long as there are matches to process
while results:
for i in intents['intents']:
# find a tag matching the first result
if i['tag'] == results[0][0]:
# set context for this intent if necessary
if 'context_set' in i:
if show_details: print ('context:', i['context_set'])
context[userID] = i['context_set']
# check if this intent is contextual and applies to this user's conversation
if not 'context_filter' in i or \
(userID in context and 'context_filter' in i and i['context_filter'] == context[userID]):
if show_details: print ('tag:', i['tag'])
# a random response from the intent
return print(random.choice(i['responses']))
results.pop(0)
Example of output:
response('Hello')
>>Hi there, how can I help?
response('open')
>>Our hours are 9am-8pm every day
My goal is to structure the chatbot itself using object oriented programming (with a base structure I thought about below):
class Bot:
'''This class defines the routine of the bot'''
def __init__(self, presentation):
'''The bot presents itself'''
self.presentation = presentation
print('')
print(presentation)
def ask_question(self):
'''This method defines how the bot asks questions'''
print('')
self.answer = str(input('Please enter your question: ')).split(' ')
print('')
print('Thank you for your question. Let me check..')
def answer_question(self):
'''This method answer the user's questions'''
print('')
print(self.answer)
bot = Bot('Good morning, my Name is BotPy')
question = bot.ask_question()
answer = bot.answer_question()
My questions:
Do I have to train the model every time I launch the chatbot or I can call the model already trained from the directory?
If one of the two solutions above is correct, how may I implement the correct one?
I am trying to implement the k-fold cross-validation algorithm in python.
I know SKLearn provides an implementation but still...
This is my code as of right now.
from sklearn import metrics
import numpy as np
class Cross_Validation:
#staticmethod
def partition(vector, fold, k):
size = vector.shape[0]
start = (size/k)*fold
end = (size/k)*(fold+1)
validation = vector[start:end]
if str(type(vector)) == "<class 'scipy.sparse.csr.csr_matrix'>":
indices = range(start, end)
mask = np.ones(vector.shape[0], dtype=bool)
mask[indices] = False
training = vector[mask]
elif str(type(vector)) == "<type 'numpy.ndarray'>":
training = np.concatenate((vector[:start], vector[end:]))
return training, validation
#staticmethod
def Cross_Validation(learner, k, examples, labels):
train_folds_score = []
validation_folds_score = []
for fold in range(0, k):
training_set, validation_set = Cross_Validation.partition(examples, fold, k)
training_labels, validation_labels = Cross_Validation.partition(labels, fold, k)
learner.fit(training_set, training_labels)
training_predicted = learner.predict(training_set)
validation_predicted = learner.predict(validation_set)
train_folds_score.append(metrics.accuracy_score(training_labels, training_predicted))
validation_folds_score.append(metrics.accuracy_score(validation_labels, validation_predicted))
return train_folds_score, validation_folds_score
The learner parameter is a classifier from SKlearn library, k is the number of folds, examples is a sparse matrix produced by the CountVectorizer (again SKlearn) that is the representation of the bag of words.
For example:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.naive_bayes import MultinomialNB
from Cross_Validation import Cross_Validation as cv
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform("""textual data""")
clfMNB = MultinomialNB(alpha=.0001)
score = cv.Cross_Validation(clfMNB, 10, data, labels)
print "Train score" + str(score[0])
print "Test score" + str(score[1])
I'm assuming there is some logic error somewhere since the scores are 95% on the training set (as expected) but practically 0 on the test test, but I can't find it.
I hope I was clear.
Thanks in advance.
________________________________EDIT___________________________________
This is the code that loads the text into the vector that can be passed to the vectorizer. It also returns the label vector.
from nltk.tokenize import word_tokenize
from Categories_Data import categories
import numpy as np
import codecs
import glob
import os
import re
class Data_Preprocessor:
def tokenize(self, text):
tokens = word_tokenize(text)
alpha = [t for t in tokens if unicode(t).isalpha()]
return alpha
def header_not_fully_removed(self, text):
if ":" in text.splitlines()[0]:
return len(text.splitlines()[0].split(":")[0].split()) == 1
else:
return False
def strip_newsgroup_header(self, text):
_before, _blankline, after = text.partition('\n\n')
if len(after) > 0 and self.header_not_fully_removed(after):
after = self.strip_newsgroup_header(after)
return after
def strip_newsgroup_quoting(self, text):
_QUOTE_RE = re.compile(r'(writes in|writes:|wrote:|says:|said:'r'|^In article|^Quoted from|^\||^>)')
good_lines = [line for line in text.split('\n')
if not _QUOTE_RE.search(line)]
return '\n'.join(good_lines)
def strip_newsgroup_footer(self, text):
lines = text.strip().split('\n')
for line_num in range(len(lines) - 1, -1, -1):
line = lines[line_num]
if line.strip().strip('-') == '':
break
if line_num > 0:
return '\n'.join(lines[:line_num])
else:
return text
def raw_to_vector(self, path, to_be_stripped=["header", "footer", "quoting"], noise_threshold=-1):
base_dir = os.getcwd()
train_data = []
label_data = []
for category in categories:
os.chdir(base_dir)
os.chdir(path+"/"+category[0])
for filename in glob.glob("*"):
with codecs.open(filename, 'r', encoding='utf-8', errors='replace') as target:
data = target.read()
if "quoting" in to_be_stripped:
data = self.strip_newsgroup_quoting(data)
if "header" in to_be_stripped:
data = self.strip_newsgroup_header(data)
if "footer" in to_be_stripped:
data = self.strip_newsgroup_footer(data)
if len(data) > noise_threshold:
train_data.append(data)
label_data.append(category[1])
os.chdir(base_dir)
return np.array(train_data), np.array(label_data)
This is what "from Categories_Data import categories" imports...
categories = [
('alt.atheism',0),
('comp.graphics',1),
('comp.os.ms-windows.misc',2),
('comp.sys.ibm.pc.hardware',3),
('comp.sys.mac.hardware',4),
('comp.windows.x',5),
('misc.forsale',6),
('rec.autos',7),
('rec.motorcycles',8),
('rec.sport.baseball',9),
('rec.sport.hockey',10),
('sci.crypt',11),
('sci.electronics',12),
('sci.med',13),
('sci.space',14),
('soc.religion.christian',15),
('talk.politics.guns',16),
('talk.politics.mideast',17),
('talk.politics.misc',18),
('talk.religion.misc',19)
]
The reason why your validation score is low is subtle.
The issue is how you have partitioned the dataset. Remember, when doing cross-validation you should randomly split the dataset. It is the randomness that you are missing.
Your data is loaded category by category, which means in your input dataset, class labels and examples follow one after the other. By not doing the random split, you have completely removed a class which your model never sees during the training phase and hence you get a bad result on your test/validation phase.
You can solve this by doing a random shuffle. So, do this:
from sklearn.utils import shuffle
processor = Data_Preprocessor()
td, tl = processor.raw_to_vector(path="C:/Users/Pankaj/Downloads/ng/")
vectorizer = CountVectorizer(stop_words='english', lowercase=True, min_df=2, analyzer="word")
data = vectorizer.fit_transform(td)
# Shuffle the data and labels
data, tl = shuffle(data, tl, random_state=0)
clfMNB = MultinomialNB(alpha=.0001)
score = Cross_Validation.Cross_Validation(clfMNB, 10, data, tl)
print("Train score" + str(score[0]))
print("Test score" + str(score[1]))