I am trying to train a spaCy NER model on a custom dataset. Basically, I want to use this model to extract Name, Organization, Email, phone number, etc. from resumes.
Below is the code I am using.
import json
import random
import spacy
import sys
import logging
from sklearn.metrics import classification_report
from sklearn.metrics import precision_recall_fscore_support
from spacy.gold import GoldParse
from spacy.scorer import Scorer
from sklearn.metrics import accuracy_score
from spacy.gold import biluo_tags_from_offsets
def convert_dataturks_to_spacy(dataturks_JSON_FilePath):
    try:
        training_data = []
        lines = []
        with open(dataturks_JSON_FilePath, encoding='utf-8') as f:
            lines = f.readlines()

        for line in lines:
            data = json.loads(line)
            text = data['content']
            entities = []
            for annotation in data['annotation']:
                # only a single point in text annotation
                point = annotation['points'][0]
                labels = annotation['label']
                if not isinstance(labels, list):
                    labels = [labels]
                for label in labels:
                    entities.append((point['start'], point['end'] + 1, label))

            training_data.append((text, {"entities": entities}))

        return training_data
    except Exception as e:
        logging.exception("Unable to process " + dataturks_JSON_FilePath + "\n" + "error = " + str(e))
        return None
def reformat_train_data(tokenizer, examples):
    output = []
    for i, (text, entity_offsets) in enumerate(examples):
        doc = tokenizer(text.strip())
        ner_tags = biluo_tags_from_offsets(tokenizer(text), entity_offsets['entities'])
        words = [w.text for w in doc]
        tags = ['-'] * len(doc)
        heads = [0] * len(doc)
        deps = [''] * len(doc)
        sentence = (range(len(doc)), words, tags, heads, deps, ner_tags)
        output.append((text, [(sentence, [])]))
    print("output", output)
    return output
################### Train Spacy NER. ###########
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy("C:\\Users\\akjain\\Downloads\\Entity-Recognition-In-Resumes-SpaCy-master\\traindata.json")
    nlp = spacy.blank("en")
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    def get_data(): return reformat_train_data(nlp.tokenizer, TRAIN_DATA)
    optimizer = nlp.begin_training(get_data)
    for itn in range(10):
        print("Starting iteration " + str(itn))
        random.shuffle(TRAIN_DATA)
        losses = {}
        for text, annotations in TRAIN_DATA:
            nlp.update(
                [text],  # batch of texts
                [annotations],  # batch of annotations
                drop=0.2,  # dropout - make it harder to memorise data
                sgd=optimizer,  # callable to update weights
                losses=losses)
        print(losses)

train_spacy()
I am getting the error below. I also came across a link (https://github.com/explosion/spaCy/issues/3558) with a suggestion to fix this code, but even after implementing it I am still getting the error.
I am using Python 3.6.5 and spaCy 2.2.3.
Dataset:
{"content": "Nida Khan\nTech Support Executive - Teleperformance for Microsoft\n\nJaipur, Rajasthan - Email me on Indeed: indeed.com/r/Nida-Khan/6c9160696f57efd8\n\n• To be an integral part of the organization and enhance my knowledge to utilize it in a productive\nmanner for the growth of the company and the global.\n\nINDUSTRIAL TRAINING\n\n• BHEL, (HEEP) HARIDWAR\nOn CNC System& PLC Programming.\n\nWORK EXPERIENCE\n\nTech Support Executive\n\nTeleperformance for Microsoft -\n\nSeptember 2017 to Present\n\nprocess.\n• 21 months of experience in ADFC as Phone Banker.\n\nEDUCATION\n\nBachelor of Technology in Electronics & communication Engg\n\nGNIT institute of Technology - Lucknow, Uttar Pradesh\n\n2008 to 2012\n\nClass XII\n\nU.P. Board - Bareilly, Uttar Pradesh\n\n2007\n\nClass X\n\nU.P. Board - Bareilly, Uttar Pradesh\n\n2005\n\nSKILLS\n\nMicrosoft office, excel, cisco, c language, cbs. (4 years)\n\nhttps://www.indeed.com/r/Nida-Khan/6c9160696f57efd8?isid=rex-download&ikw=download-top&co=IN","annotation":[{"label":["Email Address"],"points":[{"start":872,"end":910,"text":"indeed.com/r/Nida-Khan/6c9160696f57efd8"}]},{"label":["Skills"],"points":[{"start":800,"end":857,"text":"Microsoft office, excel, cisco, c language, cbs. (4 years)"}]},{"label":["Graduation Year"],"points":[{"start":676,"end":679,"text":"2012"}]},{"label":["College Name"],"points":[{"start":612,"end":640,"text":"GNIT institute of Technology "}]},{"label":["Degree"],"points":[{"start":552,"end":609,"text":"Bachelor of Technology in Electronics & communication Engg"}]},{"label":["Companies worked at"],"points":[{"start":420,"end":448,"text":"Teleperformance for Microsoft"}]},{"label":["Designation"],"points":[{"start":395,"end":417,"text":"\nTech Support Executive"}]},{"label":["Email Address"],"points":[{"start":106,"end":144,"text":"indeed.com/r/Nida-Khan/6c9160696f57efd8"}]},{"label":["Location"],"points":[{"start":66,"end":71,"text":"Jaipur"}]},{"label":["Companies worked at"],"points":[{"start":35,"end":63,"text":"Teleperformance for Microsoft"}]},{"label":["Designation"],"points":[{"start":10,"end":32,"text":"Tech Support Executive "}]},{"label":["Designation"],"points":[{"start":9,"end":31,"text":"\nTech Support Executive"}]},{"label":["Name"],"points":[{"start":0,"end":8,"text":"Nida Khan"}]}]}
The problem is with the training data you are feeding to the model: some of your entity spans start or end on whitespace, which spaCy cannot align to tokens.
As mentioned in https://github.com/explosion/spaCy/issues/3558, use the following function to remove leading and trailing white spaces from the entity spans.
import re

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            # if there are preceding spaces, move the start position to the nearest character
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data
Then use the following function for training:
def train_spacy():
    TRAIN_DATA = convert_dataturks_to_spacy("C:\\Users\\akjain\\Downloads\\Entity-Recognition-In-Resumes-SpaCy-master\\traindata.json")
    TRAIN_DATA = trim_entity_spans(TRAIN_DATA)
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(10):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                nlp.update(
                    [text],  # batch of texts
                    [annotations],  # batch of annotations
                    drop=0.2,  # dropout - make it harder to memorise data
                    sgd=optimizer,  # callable to update weights
                    losses=losses)
            print(losses)
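After training you will probably want to sanity-check the model on a resume. A minimal sketch, assuming train_spacy() is changed to end with return nlp (the sample text below is just a placeholder):

nlp = train_spacy()  # assumes the function above now ends with `return nlp`

sample_text = "John Doe\nSoftware Engineer - Example Corp\njohn.doe@example.com"  # placeholder resume text
doc = nlp(sample_text)
for ent in doc.ents:
    print(ent.label_, "->", ent.text)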
Related
I am trying to add new entities to the already existing model en_core_web_lg. If I don't train it, the model recognizes all the original entities; the moment I train it, it seems to forget all the previous entities. Can someone help me please? Here's the code:
# imports used below (not shown in the original snippet)
import random
import spacy
from spacy.training import Example
from spacy.util import compounding
from termcolor import colored

def main(new_model_name=new_model_name, output_dir=output_dir, n_iter=n_iter):
    print("\n")
    # Load pre-existing spacy model
    nlp = spacy.load(output_dir)

    # Getting the pipeline component
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    else:
        ner = nlp.get_pipe("ner")

    # Adding labels to the `ner`
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # Disable pipeline components you don't need to change
    disabled_pipes = []
    for pipe_name in nlp.pipe_names:
        if pipe_name != 'ner':
            nlp.disable_pipes(pipe_name)
            disabled_pipes.append(pipe_name)

    # TRAINING THE MODEL
    optimizer = nlp.create_optimizer()
    for iteration in range(n_iter):
        # Shuffling examples before every iteration
        random.shuffle(TRAIN_DATA)
        losses = {}
        # Batch up the examples using spaCy's minibatch
        for batch in spacy.util.minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001)):
            for text, annotations in batch:
                # create Example
                doc = nlp.make_doc(text)
                example = Example.from_dict(doc, annotations)
                # Update the model
                nlp.update([example], sgd=optimizer, losses=losses, drop=0.8)
        print(f"{iteration+1}/{n_iter} - Losses {losses}")

    for pipe_name in disabled_pipes:
        nlp.enable_pipe(pipe_name)
    print("\n")

    if output_dir is not None:
        # Save the model to directory
        nlp.meta["name"] = new_model_name
        nlp.to_disk(output_dir)
        print(colored(f"Saved model to {output_dir}", 'white'))

        # Load the saved model and predict
        print(colored(f"Loading from {output_dir}", 'white'))
        nlp2 = spacy.load(output_dir)
        for text, _ in TEST_DATA:
            doc = nlp2(text)
            print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
            print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])
I want to build an ABSA model in Python where the sentiment of pre-defined aspects (e.g. delivery, quality, service) is analyzed from online reviews. I want to do it unsupervised because this saves me from manually labeling reviews and lets me analyze far more review data (around 100k reviews). My dataset therefore consists of only reviews and no ratings. I would like a model that first detects the aspect category and then assigns the sentiment polarity. E.g. when a review says "The shipment went smoothly, but the product is broken", I want the model to assign the word "shipment" to the aspect category "delivery" and recognize that "smoothly" conveys a positive sentiment.
I have searched for possible approaches and would like to know if anyone has experience with this and could point me in a helpful direction. It will be highly appreciated!
Aspect Based Sentiment Analysis (ABSA) is the task of first extracting aspects or features of an entity (i.e. Aspect Term Extraction, or ATE) from a given text, and second determining the sentiment polarity (SP), if any, towards each aspect of that entity. The importance of ABSA led to the creation of dedicated ABSA shared tasks (e.g. at SemEval).
A B-LSTM & CRF classifier can be used for feature extraction and aspect term detection for both supervised and unsupervised ATE; see:
https://www.researchgate.net/profile/Andreea_Hossmann/publication/319875533_Unsupervised_Aspect_Term_Extraction_with_B-LSTM_and_CRF_using_Automatically_Labelled_Datasets/links/5a3436a70f7e9b10d842b0eb/Unsupervised-Aspect-Term-Extraction-with-B-LSTM-and-CRF-using-Automatically-Labelled-Datasets.pdf
https://github.com/songyouwei/ABSA-PyTorch/blob/master/infer_example.py
# -*- coding: utf-8 -*-
# file: infer.py
# author: songyouwei <youwei0314#gmail.com>
# Copyright (C) 2019. All Rights Reserved.

import torch
import torch.nn.functional as F
import argparse

from data_utils import build_tokenizer, build_embedding_matrix
from models import IAN, MemNet, ATAE_LSTM, AOA


class Inferer:
    """A simple inference example"""
    def __init__(self, opt):
        self.opt = opt
        self.tokenizer = build_tokenizer(
            fnames=[opt.dataset_file['train'], opt.dataset_file['test']],
            max_seq_len=opt.max_seq_len,
            dat_fname='{0}_tokenizer.dat'.format(opt.dataset))
        embedding_matrix = build_embedding_matrix(
            word2idx=self.tokenizer.word2idx,
            embed_dim=opt.embed_dim,
            dat_fname='{0}_{1}_embedding_matrix.dat'.format(str(opt.embed_dim), opt.dataset))
        self.model = opt.model_class(embedding_matrix, opt)
        print('loading model {0} ...'.format(opt.model_name))
        self.model.load_state_dict(torch.load(opt.state_dict_path))
        self.model = self.model.to(opt.device)
        # switch model to evaluation mode
        self.model.eval()
        torch.autograd.set_grad_enabled(False)

    def evaluate(self, raw_texts):
        context_seqs = [self.tokenizer.text_to_sequence(raw_text.lower().strip()) for raw_text in raw_texts]
        aspect_seqs = [self.tokenizer.text_to_sequence('null')] * len(raw_texts)
        context_indices = torch.tensor(context_seqs, dtype=torch.int64).to(self.opt.device)
        aspect_indices = torch.tensor(aspect_seqs, dtype=torch.int64).to(self.opt.device)
        t_inputs = [context_indices, aspect_indices]
        t_outputs = self.model(t_inputs)
        t_probs = F.softmax(t_outputs, dim=-1).cpu().numpy()
        return t_probs


if __name__ == '__main__':
    model_classes = {
        'atae_lstm': ATAE_LSTM,
        'ian': IAN,
        'memnet': MemNet,
        'aoa': AOA,
    }
    # set your trained models here
    model_state_dict_paths = {
        'atae_lstm': 'state_dict/atae_lstm_restaurant_acc0.7786',
        'ian': 'state_dict/ian_restaurant_acc0.7911',
        'memnet': 'state_dict/memnet_restaurant_acc0.7911',
        'aoa': 'state_dict/aoa_restaurant_acc0.8063',
    }

    class Option(object): pass
    opt = Option()
    opt.model_name = 'ian'
    opt.model_class = model_classes[opt.model_name]
    opt.dataset = 'restaurant'
    opt.dataset_file = {
        'train': './datasets/semeval14/Restaurants_Train.xml.seg',
        'test': './datasets/semeval14/Restaurants_Test_Gold.xml.seg'
    }
    opt.state_dict_path = model_state_dict_paths[opt.model_name]
    opt.embed_dim = 300
    opt.hidden_dim = 300
    opt.max_seq_len = 80
    opt.polarities_dim = 3
    opt.hops = 3
    opt.device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')

    inf = Inferer(opt)
    t_probs = inf.evaluate(['happy memory', 'the service is terrible', 'just normal food'])
    print(t_probs.argmax(axis=-1) - 1)
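The final line shifts the argmax class indices from {0, 1, 2} to {-1, 0, 1}. Assuming the usual negative/neutral/positive label ordering of the ABSA-PyTorch datasets, you can map the predictions back to readable labels, for example:

# follow-up sketch (not part of infer.py): map the shifted class indices to polarity labels,
# assuming -1/0/1 correspond to negative/neutral/positive
texts = ['happy memory', 'the service is terrible', 'just normal food']
labels = {-1: 'negative', 0: 'neutral', 1: 'positive'}
for text, pred in zip(texts, t_probs.argmax(axis=-1) - 1):
    print(text, '->', labels[int(pred)])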
I am getting the following error while training a spaCy NER model with my custom training data.
ValueError: [E024] Could not find an optimal move to supervise the parser. Usually, this means the GoldParse was not correct. For example, are all labels added to the model?
Can anyone help me with this?
Passing the training data through the function below works fine, without any error:
import re

def trim_entity_spans(data: list) -> list:
    """Removes leading and trailing white spaces from entity spans.

    Args:
        data (list): The data to be cleaned in spaCy JSON format.

    Returns:
        list: The cleaned data.
    """
    invalid_span_tokens = re.compile(r'\s')

    cleaned_data = []
    for text, annotations in data:
        entities = annotations['entities']
        valid_entities = []
        for start, end, label in entities:
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append([valid_start, valid_end, label])
        cleaned_data.append([text, {'entities': valid_entities}])
    return cleaned_data
This happens when there is empty content (data) in your annotation. Examples of empty data include tags, labels, or the start and end points of a label. The solution provided above should work for trimming/cleaning the data. However, if you want a brute-force approach, just include an exception handler before updating the model, as follows:
def train_spacy(data, iterations):
    nlp = spacy.blank('en')  # create blank Language class
    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if 'ner' not in nlp.pipe_names:
        ner = nlp.create_pipe('ner')
        nlp.add_pipe(ner, last=True)

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get('entities'):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != 'ner']
    with nlp.disable_pipes(*other_pipes):  # only train NER
        optimizer = nlp.begin_training()
        for itn in range(iterations):
            print("Starting iteration " + str(itn))
            random.shuffle(TRAIN_DATA)
            losses = {}
            for text, annotations in TRAIN_DATA:
                try:
                    nlp.update(
                        [text],
                        [annotations],
                        drop=0.2,
                        sgd=optimizer,
                        losses=losses)
                except Exception as error:
                    print(error)
                    continue
            print(losses)
    return nlp
So, assuming your TRAIN_DATA contains 1000 rows and only row number 200 has empty data, instead of throwing the error the model will simply skip row 200 and train on the remaining data.
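If you prefer to drop the bad rows up front instead of catching the exception on every update, here is a minimal sketch of a pre-filter (the helper name drop_invalid_examples is my own, not from the original answer):

def drop_invalid_examples(data):
    """Keep only examples that have text and at least one non-empty entity span."""
    cleaned = []
    for text, annotations in data:
        entities = [
            (start, end, label)
            for start, end, label in annotations.get('entities', [])
            if label and start is not None and end is not None and start < end
        ]
        if text and entities:
            cleaned.append((text, {'entities': entities}))
    return cleaned

TRAIN_DATA = drop_invalid_examples(TRAIN_DATA)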
For the training data format supported by spaCy v3, first pass the list of training data through the function below:
import re

def clean_entity_spans(data: list) -> list:
    invalid_span_tokens = re.compile(r'\s')
    cleaned_data = []
    for content in data:
        name = content['documentName']
        text = content['document']
        userinput = content['user_input']
        valid_entities = []
        for annotate_content in content['annotation']:
            start = annotate_content['start']
            end = annotate_content['end']
            label = annotate_content['label']
            text1 = annotate_content['text']
            valid_start = start
            valid_end = end
            while valid_start < len(text) and invalid_span_tokens.match(
                    text[valid_start]):
                valid_start += 1
            while valid_end > 1 and invalid_span_tokens.match(
                    text[valid_end - 1]):
                valid_end -= 1
            valid_entities.append({'start': valid_start, 'end': valid_end, 'label': label,
                                   'text': text1, 'propertiesList': [], 'commentsList': []})
        cleaned_data.append({'documentName': name, 'document': text,
                             'annotation': valid_entities, 'user_input': userinput})
    return cleaned_data
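spaCy v3 no longer trains directly from JSON like this; it expects a binary .spacy file built with DocBin. A minimal sketch of that conversion, assuming the field names produced by clean_entity_spans above and that end is already an exclusive character offset (add 1 if your annotation tool stores inclusive offsets):

import spacy
from spacy.tokens import DocBin
from spacy.util import filter_spans

nlp = spacy.blank("en")
doc_bin = DocBin()

for record in cleaned_data:  # output of clean_entity_spans(...)
    doc = nlp.make_doc(record['document'])
    spans = []
    for ann in record['annotation']:
        label = ann['label']
        if isinstance(label, list):  # some exports store the label as a list
            label = label[0]
        span = doc.char_span(ann['start'], ann['end'], label=label,
                             alignment_mode="contract")
        if span is not None:  # skip spans that do not align to token boundaries
            spans.append(span)
    doc.ents = filter_spans(spans)  # drop overlapping spans
    doc_bin.add(doc)

doc_bin.to_disk("./train.spacy")  # reference this file from the training config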
import spacy
import random
from spacy.gold import GoldParse
from spacy.language import EntityRecognizer

train_data = [
    ('Who is Chaka Khan?', [(7, 17, 'PERSON')]),
    ('I like London and Berlin.', [(7, 13, 'LOC'), (18, 24, 'LOC')])
]

nlp = spacy.load('en_depent_web_md', entity=False)
ner = EntityRecognizer(nlp.vocab, entity_types=['PERSON', 'LOC'])

for itn in range(1000):
    random.shuffle(train_data)
    for raw_text, entity_offsets in train_data:
        doc = nlp.make_doc(raw_text)
        gold = GoldParse(doc, entities=entity_offsets)
        nlp.tagger(doc)
        ner.update(doc, gold)
ner.model.end_training()

doc = nlp.make_doc('I like London and Berlin.')
nlp.tagger(doc)
print(ner(doc))
The above code is not working properly for custom tags.
I am working on tagging custom entity names such as NOL - ORG, GDRFA - ORG, DHONI - Cricket.
Additional information - https://support.prodi.gy/t/custom-ner-tag-for-english/704
Additional information - https://spacy.io/usage/training#section-ner
I am looking for sample code or examples/explanation.
# imports used below (not shown in the original snippet)
import random
from pathlib import Path

import spacy
from spacy.util import minibatch, compounding


def main(model=None, output_dir=r'model', n_iter=100):
    """Load the model, set up the pipeline and train the entity recognizer."""
    if model is not None:
        nlp = spacy.load(model)  # load existing spaCy model
        print("Loaded model '%s'" % model)
    else:
        nlp = spacy.blank("en")  # create blank Language class
        print("Created blank 'en' model")

    # create the built-in pipeline components and add them to the pipeline
    # nlp.create_pipe works for built-ins that are registered with spaCy
    if "ner" not in nlp.pipe_names:
        ner = nlp.create_pipe("ner")
        nlp.add_pipe(ner, last=True)
    # otherwise, get it so we can add labels
    else:
        ner = nlp.get_pipe("ner")

    # add labels
    for _, annotations in TRAIN_DATA:
        for ent in annotations.get("entities"):
            ner.add_label(ent[2])

    # get names of other pipes to disable them during training
    other_pipes = [pipe for pipe in nlp.pipe_names if pipe != "ner"]
    with nlp.disable_pipes(*other_pipes):  # only train NER
        # reset and initialize the weights randomly - but only if we're
        # training a new model
        if model is None:
            nlp.begin_training()
        for itn in range(n_iter):
            random.shuffle(TRAIN_DATA)
            losses = {}
            # batch up the examples using spaCy's minibatch
            batches = minibatch(TRAIN_DATA, size=compounding(4.0, 32.0, 1.001))
            for batch in batches:
                texts, annotations = zip(*batch)
                nlp.update(
                    texts,  # batch of texts
                    annotations,  # batch of annotations
                    drop=0.5,  # dropout - make it harder to memorise data
                    losses=losses,
                )
            print("Losses", losses)

    # test the trained model
    for text, _ in TRAIN_DATA:
        doc = nlp(text)
        print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
        print("Tokens", [(t.text, t.ent_type_, t.ent_iob) for t in doc])

    # save model to output directory
    if output_dir is not None:
        output_dir = Path(output_dir)
        if not output_dir.exists():
            output_dir.mkdir()
        nlp.to_disk(output_dir)
        print("Saved model to", output_dir)
Then, load the saved model:
print("Loading from", output_dir)
nlp2 = spacy.load(output_dir)
doc = nlp2("<your any text>")
print("Entities", [(ent.text, ent.label_) for ent in doc.ents])
and,
TRAIN_DATA = [
    ("my site brand is ttt.", {"entities": [(17, 20, "PERSON")]}),
]
EDIT 1: Following the answer received by akshat, I used this tutorial to load the model.
The code has been updated as follows:
import tensorflow as tf


class Bot:
    '''This class defines the routine of the bot'''

    def __init__(self, presentation):
        '''The bot presents itself'''
        self.presentation = presentation
        print('')
        print(presentation)

    def ask_question(self):
        '''This method defines how the bot asks questions'''
        print('')
        self.answer = str(input('Please enter your question: ')).split(' ')
        print('')
        print('Thank you for your question. Let me check..')

    def answer_question(self):
        '''This method answers the user's questions'''
        print('')
        print(self.answer)

    def load_model(self):
        with tf.Session() as sess:
            new_saver = tf.train.import_meta_graph('model.tflearn.meta')
            new_saver.restore(sess, tf.train.latest_checkpoint('./'))
            print(sess.run('w1:0'))


bot = Bot('Good morning, my Name is BotPy')
question = bot.ask_question()
answer = bot.answer_question()
model = bot.load_model()
When launching it, the following traceback is received:
2018-07-29 08:28:14.622798: I tensorflow/core/platform/cpu_feature_guard.cc:141] Your CPU supports instructions that this TensorFlow binary was not compiled to use: AVX2 FMA
WARNING:tensorflow:The saved meta_graph is possibly from an older release:
'model_variables' collection should be of type 'byte_list', but instead is of type 'node_list'.
Traceback (most recent call last):
  File "/home/marco/PycharmProjects/chatBot/main.py", line 41, in <module>
    model = bot.load_model()
  File "/home/marco/PycharmProjects/chatBot/main.py", line 33, in load_model
    new_saver = tf.train.import_meta_graph('model.tflearn.meta')
  File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/training/saver.py", line 1960, in import_meta_graph
    **kwargs)
  File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/meta_graph.py", line 790, in import_scoped_meta_graph
    ops.prepend_name_scope(value, scope_to_prepend_to_names))
  File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3613, in as_graph_element
    return self._as_graph_element_locked(obj, allow_tensor, allow_operation)
  File "/home/marco/PycharmProjects/chatBot/venv/lib/python3.5/site-packages/tensorflow/python/framework/ops.py", line 3673, in _as_graph_element_locked
    "graph." % repr(name))
KeyError: "The name 'Adam' refers to an Operation not in the graph."
ORIGINAL QUESTION: I am trying to build a chatbot.
Following this tutorial, I already have a jupyter notebook solution that is working with a model already trained.
import nltk
from nltk.stem.lancaster import LancasterStemmer
import numpy as np
import tflearn
import tensorflow as tf
import random
import json
from ._conv import register_converters as _register_converters

stemmer = LancasterStemmer()

with open('intents.json') as json_data:
    intents = json.load(json_data)

words = []
classes = []
documents = []
ignore_words = ['?']
# loop through each sentence in our intents patterns
for intent in intents['intents']:
    for pattern in intent['patterns']:
        # tokenize each word in the sentence
        w = nltk.word_tokenize(pattern)
        # add to our words list
        words.extend(w)
        # add to documents in our corpus
        documents.append((w, intent['tag']))
        # add to our classes list
        if intent['tag'] not in classes:
            classes.append(intent['tag'])

# stem and lower each word and remove duplicates
words = [stemmer.stem(w.lower()) for w in words if w not in ignore_words]
words = sorted(list(set(words)))

# remove duplicates
classes = sorted(list(set(classes)))

print(len(documents), "documents")
print(len(classes), "classes", classes)
print(len(words), "unique stemmed words", words)
20 documents
6 classes ['goodbye', 'greeting', 'hours', 'opentoday', 'payments', 'thanks']
32 unique stemmed words ["'s", 'acceiv', 'anyon', 'ar', 'bye', 'card', 'cash', 'credit', 'day', 'do', 'good', 'goodby', 'hello', 'help', 'hi', 'hour', 'how', 'is', 'lat', 'mastercard', 'on', 'op', 'see', 'tak', 'thank', 'that', 'ther', 'today', 'what', 'when', 'yo', 'you']
# create our training data
training = []
output = []
# create an empty array for our output
output_empty = [0] * len(classes)

# training set, bag of words for each sentence
for doc in documents:
    # initialize our bag of words
    bag = []
    # list of tokenized words for the pattern
    pattern_words = doc[0]
    # stem each word
    pattern_words = [stemmer.stem(word.lower()) for word in pattern_words]
    # create our bag of words array
    for w in words:
        bag.append(1) if w in pattern_words else bag.append(0)

    # output is a '0' for each tag and '1' for current tag
    output_row = list(output_empty)
    output_row[classes.index(doc[1])] = 1

    training.append([bag, output_row])

# shuffle our features and turn into np.array
random.shuffle(training)
training = np.array(training)

# create train and test lists
train_x = list(training[:, 0])
train_y = list(training[:, 1])

# reset underlying graph data
tf.reset_default_graph()

# Build neural network
net = tflearn.input_data(shape=[None, len(train_x[0])])
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, 8)
net = tflearn.fully_connected(net, len(train_y[0]), activation='softmax')
net = tflearn.regression(net)

# Define model and setup tensorboard
model = tflearn.DNN(net, tensorboard_dir='tflearn_logs')
# Start training (apply gradient descent algorithm)
model.fit(train_x, train_y, n_epoch=1000, batch_size=8, show_metric=True)
model.save('model.tflearn')
Training Step: 2999 | total loss: 0.01771 | time: 0.006s
| Adam | epoch: 1000 | loss: 0.01771 - acc: 0.9999 -- iter: 16/20
Training Step: 3000 | total loss: 0.01754 | time: 0.009s
| Adam | epoch: 1000 | loss: 0.01754 - acc: 1.0000 -- iter: 20/20
--
INFO:tensorflow:/home/marco/PycharmProjects/chatBot/model.tflearn is not in all_model_checkpoint_paths. Manually adding it.
# save all of our data structures
import pickle
pickle.dump( {'words':words, 'classes':classes, 'train_x':train_x, 'train_y':train_y}, open( "training_data", "wb" ) )
data = pickle.load( open( "training_data", "rb" ) )
words = data['words']
classes = data['classes']
train_x = data['train_x']
train_y = data['train_y']
# import our chat-bot intents file
import json
with open('intents.json') as json_data:
    intents = json.load(json_data)
# load our saved model
model.load('./model.tflearn')
INFO:tensorflow:Restoring parameters from /home/marco/PycharmProjects/chatBot/model.tflearn
def clean_up_sentence(sentence):
    # tokenize the pattern
    sentence_words = nltk.word_tokenize(sentence)
    # stem each word
    sentence_words = [stemmer.stem(word.lower()) for word in sentence_words]
    return sentence_words

# return bag of words array: 0 or 1 for each word in the bag that exists in the sentence
def bow(sentence, words, show_details=False):
    # tokenize the pattern
    sentence_words = clean_up_sentence(sentence)
    # bag of words
    bag = [0] * len(words)
    for s in sentence_words:
        for i, w in enumerate(words):
            if w == s:
                bag[i] = 1
                if show_details:
                    print("found in bag: %s" % w)
    return np.array(bag)

# create a data structure to hold user context
context = {}

ERROR_THRESHOLD = 0.25

def classify(sentence):
    # generate probabilities from the model
    results = model.predict([bow(sentence, words)])[0]
    # filter out predictions below a threshold
    results = [[i, r] for i, r in enumerate(results) if r > ERROR_THRESHOLD]
    # sort by strength of probability
    results.sort(key=lambda x: x[1], reverse=True)
    return_list = []
    for r in results:
        return_list.append((classes[r[0]], r[1]))
    # return tuple of intent and probability
    return return_list

def response(sentence, userID='123', show_details=False):
    results = classify(sentence)
    # if we have a classification then find the matching intent tag
    if results:
        # loop as long as there are matches to process
        while results:
            for i in intents['intents']:
                # find a tag matching the first result
                if i['tag'] == results[0][0]:
                    # set context for this intent if necessary
                    if 'context_set' in i:
                        if show_details:
                            print('context:', i['context_set'])
                        context[userID] = i['context_set']

                    # check if this intent is contextual and applies to this user's conversation
                    if not 'context_filter' in i or \
                            (userID in context and 'context_filter' in i and i['context_filter'] == context[userID]):
                        if show_details:
                            print('tag:', i['tag'])
                        # a random response from the intent
                        return print(random.choice(i['responses']))

            results.pop(0)
Example of output:
response('Hello')
>>Hi there, how can I help?
response('open')
>>Our hours are 9am-8pm every day
My goal is to structure the chatbot itself using object oriented programming (with a base structure I thought about below):
class Bot:
    '''This class defines the routine of the bot'''

    def __init__(self, presentation):
        '''The bot presents itself'''
        self.presentation = presentation
        print('')
        print(presentation)

    def ask_question(self):
        '''This method defines how the bot asks questions'''
        print('')
        self.answer = str(input('Please enter your question: ')).split(' ')
        print('')
        print('Thank you for your question. Let me check..')

    def answer_question(self):
        '''This method answers the user's questions'''
        print('')
        print(self.answer)


bot = Bot('Good morning, my Name is BotPy')
question = bot.ask_question()
answer = bot.answer_question()
My questions:
Do I have to train the model every time I launch the chatbot, or can I load the already-trained model from the directory?
If the latter is possible, how may I implement it correctly?