Related
I'm using pytorch lightining and I have this error but I'm non really understanding what is the problem. I create a Deep Learning pipeline to run with hyperparameters searching and I think that the problem is in.
I omitted some part of the code because I think they are irrelevant for this issue (due to stackoverflow restrictions). Thanks for the help!
class ProtBertBFDClassifier(pl.LightningModule):
def __init__(self,hparams) -> None:
super(ProtBertBFDClassifier, self).__init__()
self.hparams = hparams
self.batch_size = self.hparams.batch_size
self.model_name = pretrained_model_name
self.dataset = Loc_dataset()
self.metric_acc = Accuracy()
# build model
self.__build_model()
# Loss criterion initialization.
self.__build_loss()
if self.hparams.nr_frozen_epochs > 0:
self.freeze_encoder()
else:
self._frozen = False
self.nr_frozen_epochs = self.hparams.nr_frozen_epochs
def __build_model(self) -> None:
""" Init BERT model + tokenizer + classification head."""
self.ProtBertBFD = BertModel.from_pretrained(self.model_name,gradient_checkpointing=self.hparams.gradient_checkpointing)
self.encoder_features = 1024
# Tokenizer
self.tokenizer = BertTokenizer.from_pretrained(self.model_name, do_lower_case=False)
# Label Encoder
self.label_encoder = LabelEncoder(
self.hparams.label_set.split(","), reserved_labels=[]
)
self.label_encoder.unknown_index = None
# Classification head
self.classification_head = nn.Sequential(
nn.Linear(self.encoder_features*4, self.label_encoder.vocab_size),
nn.Tanh(),
)
.....
def predict(self, sample: dict) -> dict:
""" Predict function.
:param sample: dictionary with the text we want to classify.
Returns:
Dictionary with the input text and the predicted label.
"""
......
def pool_strategy(self, features,
pool_cls=True, pool_max=True, pool_mean=True,
pool_mean_sqrt=True):
token_embeddings = features['token_embeddings']
cls_token = features['cls_token_embeddings']
attention_mask = features['attention_mask']
## Pooling strategy
output_vectors = []
if pool_cls:
output_vectors.append(cls_token)
if pool_max:
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
token_embeddings[input_mask_expanded == 0] = -1e9 # Set padding tokens to large negative value
max_over_time = torch.max(token_embeddings, 1)[0]
output_vectors.append(max_over_time)
if pool_mean or pool_mean_sqrt:
input_mask_expanded = attention_mask.unsqueeze(-1).expand(token_embeddings.size()).float()
sum_embeddings = torch.sum(token_embeddings * input_mask_expanded, 1)
#If tokens are weighted (by WordWeights layer), feature 'token_weights_sum' will be present
if 'token_weights_sum' in features:
sum_mask = features['token_weights_sum'].unsqueeze(-1).expand(sum_embeddings.size())
else:
sum_mask = input_mask_expanded.sum(1)
sum_mask = torch.clamp(sum_mask, min=1e-9)
if pool_mean:
output_vectors.append(sum_embeddings / sum_mask)
if pool_mean_sqrt:
output_vectors.append(sum_embeddings / torch.sqrt(sum_mask))
output_vector = torch.cat(output_vectors, 1)
return output_vector
........
inputs = self.tokenizer.batch_encode_plus(sample["seq"],
add_special_tokens=True,
padding=True,
truncation=True,
max_length=self.hparams.max_length)
if not prepare_target:
return inputs, {}
# Prepare target:
try:
targets = {"labels": self.label_encoder.batch_encode(sample["label"])}
return inputs, targets
except RuntimeError:
print(sample["label"])
raise Exception("Label encoder found an unknown label.")
......
def validation_step(self, batch: tuple, batch_nb: int, *args, **kwargs) -> dict:
""" Similar to the training step but with the model in eval mode.
Returns:
- dictionary passed to the validation_end function.
"""
inputs, targets = batch
model_out = self.forward(**inputs)
loss_val = self.loss(model_out, targets)
y = targets["labels"]
y_hat = model_out["logits"]
labels_hat = torch.argmax(y_hat, dim=1)
val_acc = self.metric_acc(labels_hat, y)
output = OrderedDict({"val_loss": loss_val, "val_acc": val_acc,})
return output
def validation_epoch_end(self, outputs: list) -> dict:
""" Function that takes as input a list of dictionaries returned by the validation_step
function and measures the model performance accross the entire validation set.
Returns:
- Dictionary with metrics to be added to the lightning logger.
"""
val_loss_mean = torch.stack([x['val_loss'] for x in outputs]).mean()
val_acc_mean = torch.stack([x['val_acc'] for x in outputs]).mean()
tqdm_dict = {"val_loss": val_loss_mean, "val_acc": val_acc_mean}
result = {
"progress_bar": tqdm_dict,
"log": tqdm_dict,
"val_loss": val_loss_mean,
}
return result
.......
def test_epoch_end(self, outputs: list) -> dict:
""" Function that takes as input a list of dictionaries returned by the validation_step
function and measures the model performance accross the entire validation set.
Returns:
- Dictionary with metrics to be added to the lightning logger.
"""
test_loss_mean = torch.stack([x['test_loss'] for x in outputs]).mean()
test_acc_mean = torch.stack([x['test_acc'] for x in outputs]).mean()
tqdm_dict = {"test_loss": test_loss_mean, "test_acc": test_acc_mean}
result = {
"progress_bar": tqdm_dict,
"log": tqdm_dict,
"test_loss": test_loss_mean,
}
return result
def configure_optimizers(self):
""" Sets different Learning rates for different parameter groups. """
parameters = [
{"params": self.classification_head.parameters()},
{
"params": self.ProtBertBFD.parameters(),
"lr": self.hparams.encoder_learning_rate,
},
]
optimizer = optim.Adam(parameters, lr=self.hparams.learning_rate)
return [optimizer], []
def __retrieve_dataset(self, train=True, val=True, test=True):
""" Retrieves task specific dataset """
if train:
return self.dataset.load_dataset(hparams.train_csv)
elif val:
return self.dataset.load_dataset(hparams.dev_csv)
elif test:
return self.dataset.load_dataset(hparams.test_csv)
else:
print('Incorrect dataset split')
def train_dataloader(self) -> DataLoader:
""" Function that loads the train set. """
self._train_dataset = self.__retrieve_dataset(val=False, test=False)
return DataLoader(
dataset=self._train_dataset,
sampler=RandomSampler(self._train_dataset),
batch_size=self.hparams.batch_size,
collate_fn=self.prepare_sample,
num_workers=self.hparams.loader_workers,
)
....
#classmethod
def add_model_specific_args(
cls, parser: HyperOptArgumentParser
) -> HyperOptArgumentParser:
""" Parser for Estimator specific arguments/hyperparameters.
:param parser: HyperOptArgumentParser obj
Returns:
- updated parser
"""
parser.opt_list(
"--max_length",
default=1536,
type=int,
help="Maximum sequence length.",
)
parser.add_argument(
"--encoder_learning_rate",
default=5e-06,
type=float,
help="Encoder specific learning rate.",
)
return parser
# these are project-wide arguments
parser = HyperOptArgumentParser(
strategy="random_search",
description="Minimalist ProtBERT Classifier",
add_help=True,
)
# Early Stopping
parser.add_argument(
"--monitor", default="val_acc", type=str, help="Quantity to monitor."
)
parser.add_argument(
"--metric_mode",
default="max",
type=str,
help="If we want to min/max the monitored quantity.",
choices=["auto", "min", "max"],
)
parser.add_argument(
"--patience",
default=5,
type=int,
help=(
"Number of epochs with no improvement "
"after which training will be stopped."
),
)
parser.add_argument(
"--accumulate_grad_batches",
default=32,
type=int,
help=(
"Accumulated gradients runs K small batches of size N before "
"doing a backwards pass."
),
)
# gpu/tpu args
parser.add_argument("--gpus", type=int, default=1, help="How many gpus")
parser.add_argument("--tpu_cores", type=int, default=None, help="How many tpus")
parser.add_argument(
"--val_percent_check",
default=1.0,
type=float,
help=(
"If you don't want to use the entire dev set (for debugging or "
"if it's huge), set how much of the dev set you want to use with this flag."
),
)
# each LightningModule defines arguments relevant to it
parser = ProtBertBFDClassifier.add_model_specific_args(parser)
hparams = parser.parse_known_args()[0]
"""
Main training routine specific for this project
:param hparams:
"""
seed_everything(hparams.seed)
# ------------------------
# 1 INIT LIGHTNING MODEL
# ------------------------
model = ProtBertBFDClassifier(hparams)
This is the error:
1 frames
<ipython-input-26-561494d91469> in __init__(self)
10 def __init__(self) -> None:
11 super(ProtBertBFDClassifier, self).__init__()
---> 12 self.hparams = parser.parse_known_args()[0]
13 self.batch_size = self.hparams.batch_size
14
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in __setattr__(self, name, value)
1223 buffers[name] = value
1224 else:
-> 1225 object.__setattr__(self, name, value)
1226
1227 def __delattr__(self, name):
AttributeError: can't set attribute
pip install pytorch-lightning==1.2.10
I'm trying to deploy a TF2.0 model to SageMaker. So far, I managed to train the model and save it into an S3 bucket but when I'm calling the .deploy() method, I get the following error from cloud Watch
ValueError: no SavedModel bundles found!
Here is my training script:
### Code to add in a tensorflow_estimator.py file
import argparse
import os
import pathlib
import tensorflow as tf
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# hyperparameters sent by the client are passed as command-line arguments to the script.
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--learning_rate', type=float, default=0.1)
# Data, model, and output directories
parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
args, _ = parser.parse_known_args()
print("##### ARGS ##### \n{}".format(args))
# Get files
path = pathlib.Path(args.train)
# Print out folder content
for item in path.iterdir():
print("##### DIRECTORIES ##### \n{}".format(item))
# Get all images
all_images = list(path.glob("*/*"))
all_image_paths = [str(path) for path in list(path.glob("*/*"))]
# Transform images into tensors
def preprocess_and_load_images(path):
image = tf.io.read_file(path)
image = tf.image.decode_jpeg(image, channels=3)
image = tf.image.resize(image, [192, 192])
return image
# Apply preprocessing function
ds_paths = tf.data.Dataset.from_tensor_slices(all_image_paths)
ds_images = ds_paths.map(preprocess_and_load_images)
# Map Labels
labels = []
for data in path.iterdir():
if data.is_dir():
labels += [data.name]
labels_index = {}
for i,label in enumerate(labels):
labels_index[label]=i
print("##### Label Index ##### \n{}".format(labels_index))
all_image_labels = [labels_index[path.parent.name] for path in list(path.glob("*/*"))]
# Create a tf Dataset
labels_ds = tf.data.Dataset.from_tensor_slices(all_image_labels)
# Zip train and labeled dataset
full_ds = tf.data.Dataset.zip((ds_images, labels_ds))
# Shuffle Dataset and batch it
full_ds = full_ds.shuffle(len(all_images)).batch(args.batch_size)
# Create a pre-trained model
base_model = tf.keras.applications.InceptionV3(input_shape=(192,192,3),
include_top=False,
weights = "imagenet"
)
base_model.trainable = False
model = tf.keras.Sequential([
base_model,
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Dense(len(labels), activation="softmax")
])
initial_learning_rate = args.learning_rate
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate,
decay_steps=1000,
decay_rate=0.96,
staircase=True)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = lr_schedule),
loss = tf.keras.losses.SparseCategoricalCrossentropy(),
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()])
# Train the model
model.fit(full_ds, epochs=args.epochs)
# Save the model
model.save(os.path.join(args.model_dir, "tf_model"), save_format="tf")
def model_fn(model_dir):
classifier = tf.keras.models.load_model(os.path.join(model_dir, "tf_model"))
return classifier
And here is the code that I wrote into Colab
from sagemaker.tensorflow import TensorFlow
tf_estimator = TensorFlow(entry_point='tensorflow_estimator.py',
role=role,
train_instance_count=1,
train_instance_type='ml.m5.large',
framework_version='2.0.0',
sagemaker_session=sagemaker_session,
output_path=s3_output_location,
hyperparameters={'epochs': 1,
'batch_size': 30,
'learning_rate': 0.001},
py_version='py3')
tf_estimator.fit({"train":train_data})
from sagemaker.tensorflow.serving import Model
model = Model(model_data='s3://path/to/model.tar.gz',
role=role,
framework_version="2.0.0",
sagemaker_session=sagemaker_session)
predictor = model.deploy(initial_instance_count=1, instance_type='ml.m5.large')
I already tried to look at this thread but I actually don't have the problem of versions in my tar.gz file as the structure is the following :
├── assets
├── saved_model.pb
└── variables
├── variables.data-00000-of-00001
└── variables.index
I feel I might be wrong when defining model_fn() in my training script but definitely don't what to replace that with. Would you guys have an idea?
Thanks a lot for your help!
I actually tried to modify my training script to the following :
### Code to add in a tensorflow_estimator.py file
import argparse
import os
import pathlib
import tensorflow as tf
if __name__ == '__main__':
parser = argparse.ArgumentParser()
# hyperparameters sent by the client are passed as command-line arguments to the script.
parser.add_argument('--epochs', type=int, default=10)
parser.add_argument('--batch_size', type=int, default=100)
parser.add_argument('--learning_rate', type=float, default=0.1)
# Data, model, and output directories
parser.add_argument('--output-data-dir', type=str, default=os.environ.get('SM_OUTPUT_DATA_DIR'))
parser.add_argument('--model-dir', type=str, default=os.environ.get('SM_MODEL_DIR'))
parser.add_argument('--train', type=str, default=os.environ.get('SM_CHANNEL_TRAIN'))
parser.add_argument('--test', type=str, default=os.environ.get('SM_CHANNEL_TEST'))
args, _ = parser.parse_known_args()
print("##### ARGS ##### \n{}".format(args))
# Get files
path = pathlib.Path(args.train)
# Print out folder content
for item in path.iterdir():
print("##### DIRECTORIES ##### \n{}".format(item))
# Get all images
all_images = list(path.glob("*/*"))
all_image_paths = [str(path) for path in list(path.glob("*/*"))]
# Transform images into tensors
def preprocess_and_load_images(path):
image = tf.io.read_file(path)
image = tf.image.decode_jpeg(image, channels=3)
image = tf.image.resize(image, [192, 192])
return image
# Apply preprocessing function
ds_paths = tf.data.Dataset.from_tensor_slices(all_image_paths)
ds_images = ds_paths.map(preprocess_and_load_images)
# Map Labels
labels = []
for data in path.iterdir():
if data.is_dir():
labels += [data.name]
labels_index = {}
for i,label in enumerate(labels):
labels_index[label]=i
print("##### Label Index ##### \n{}".format(labels_index))
all_image_labels = [labels_index[path.parent.name] for path in list(path.glob("*/*"))]
# Create a tf Dataset
labels_ds = tf.data.Dataset.from_tensor_slices(all_image_labels)
# Zip train and labeled dataset
full_ds = tf.data.Dataset.zip((ds_images, labels_ds))
# Shuffle Dataset and batch it
full_ds = full_ds.shuffle(len(all_images)).batch(args.batch_size)
# Create a pre-trained model
base_model = tf.keras.applications.InceptionV3(input_shape=(192,192,3),
include_top=False,
weights = "imagenet"
)
base_model.trainable = False
model = tf.keras.Sequential([
base_model,
tf.keras.layers.GlobalAveragePooling2D(),
tf.keras.layers.Dense(len(labels), activation="softmax")
])
initial_learning_rate = args.learning_rate
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate,
decay_steps=1000,
decay_rate=0.96,
staircase=True)
model.compile(optimizer=tf.keras.optimizers.Adam(learning_rate = lr_schedule),
loss = tf.keras.losses.SparseCategoricalCrossentropy(),
metrics = [tf.keras.metrics.SparseCategoricalAccuracy()])
# Train the model
model.fit(full_ds, epochs=args.epochs)
# Save the model
model.save(os.path.join(args.model_dir, "tensorflow_model/1"), save_format="tf")
It seems that it's important to have a numerical name for your folder:
# Save the model
model.save(os.path.join(args.model_dir, "tensorflow_model/1"), save_format="tf")
I have been following this tutorial for creating a TensorFlow Audio Recognition model.
I have completed training of my model (which is based upon a smaller set of simple data from the content in the example).
Now I have built the model, what is the lines of code I need in order to actually classify against the created model?
Please see the following code which has built my model which is a slightly altered version of the article's code:
train.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import input_data
import models
from tensorflow.python.platform import gfile
FLAGS = None
def main(_):
# Set the verbosity based on flags (default is INFO, so we see all messages)
tf.compat.v1.logging.set_verbosity(FLAGS.verbosity)
# Start a new TensorFlow session.
sess = tf.compat.v1.InteractiveSession()
# Begin by making sure we have the training data we need. If you already have
# training data of your own, use `--data_url= ` on the command line to avoid
# downloading.c
model_settings = models.prepare_model_settings(
len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
audio_processor = input_data.AudioProcessor(
FLAGS.data_url, FLAGS.data_dir,
FLAGS.silence_percentage, FLAGS.unknown_percentage,
FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)
fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']
time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
# Figure out the learning rates for each training phase. Since it's often
# effective to have high learning rates at the start of training, followed by
# lower levels towards the end, the number of steps and learning rates can be
# specified as comma-separated lists to define the rate at each stage. For
# example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
# will run 13,000 training loops in total, with a rate of 0.001 for the first
# 10,000, and 0.0001 for the final 3,000.
training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
if len(training_steps_list) != len(learning_rates_list):
raise Exception(
'--how_many_training_steps and --learning_rate must be equal length '
'lists, but are %d and %d long instead' % (len(training_steps_list),
len(learning_rates_list)))
input_placeholder = tf.compat.v1.placeholder(
tf.float32, [None, fingerprint_size], name='fingerprint_input')
if FLAGS.quantize:
fingerprint_min, fingerprint_max = input_data.get_features_range(
model_settings)
fingerprint_input = tf.quantization.fake_quant_with_min_max_args(
input_placeholder, fingerprint_min, fingerprint_max)
else:
fingerprint_input = input_placeholder
logits, dropout_prob = models.create_model(
fingerprint_input,
model_settings,
FLAGS.model_architecture,
is_training=True)
# Define loss and optimizer
ground_truth_input = tf.compat.v1.placeholder(
tf.int64, [None], name='groundtruth_input')
# Optionally we can add runtime checks to spot when NaNs or other symptoms of
# numerical errors start occurring during training.
control_dependencies = []
if FLAGS.check_nans:
checks = tf.compat.v1.add_check_numerics_ops()
control_dependencies = [checks]
# Create the back propagation and training evaluation machinery in the graph.
with tf.compat.v1.name_scope('cross_entropy'):
cross_entropy_mean = tf.compat.v1.losses.sparse_softmax_cross_entropy(
labels=ground_truth_input, logits=logits)
if FLAGS.quantize:
tf.contrib.quantize.create_training_graph(quant_delay=0)
with tf.compat.v1.name_scope('train'), tf.control_dependencies(
control_dependencies):
learning_rate_input = tf.compat.v1.placeholder(
tf.float32, [], name='learning_rate_input')
train_step = tf.compat.v1.train.GradientDescentOptimizer(
learning_rate_input).minimize(cross_entropy_mean)
predicted_indices = tf.argmax(input=logits, axis=1)
correct_prediction = tf.equal(predicted_indices, ground_truth_input)
confusion_matrix = tf.math.confusion_matrix(labels=ground_truth_input,
predictions=predicted_indices,
num_classes=label_count)
evaluation_step = tf.reduce_mean(input_tensor=tf.cast(correct_prediction,
tf.float32))
with tf.compat.v1.get_default_graph().name_scope('eval'):
tf.compat.v1.summary.scalar('cross_entropy', cross_entropy_mean)
tf.compat.v1.summary.scalar('accuracy', evaluation_step)
global_step = tf.compat.v1.train.get_or_create_global_step()
increment_global_step = tf.compat.v1.assign(global_step, global_step + 1)
saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
# Merge all the summaries and write them out to /tmp/retrain_logs (by default)
merged_summaries = tf.compat.v1.summary.merge_all(scope='eval')
train_writer = tf.compat.v1.summary.FileWriter(FLAGS.summaries_dir + '/train',
sess.graph)
validation_writer = tf.compat.v1.summary.FileWriter(
FLAGS.summaries_dir + '/validation')
tf.compat.v1.global_variables_initializer().run()
start_step = 1
if FLAGS.start_checkpoint:
models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
start_step = global_step.eval(session=sess)
tf.compat.v1.logging.info('Training from step: %d ', start_step)
# Save graph.pbtxt.
tf.io.write_graph(sess.graph_def, FLAGS.train_dir,
FLAGS.model_architecture + '.pbtxt')
# Save list of words.
with gfile.GFile(
os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
'w') as f:
f.write('\n'.join(audio_processor.words_list))
# Training loop.
training_steps_max = np.sum(training_steps_list)
for training_step in xrange(start_step, training_steps_max + 1):
# Figure out what the current learning rate is.
training_steps_sum = 0
for i in range(len(training_steps_list)):
training_steps_sum += training_steps_list[i]
if training_step <= training_steps_sum:
learning_rate_value = learning_rates_list[i]
break
# Pull the audio samples we'll use for training.
train_fingerprints, train_ground_truth = audio_processor.get_data(
FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
FLAGS.background_volume, time_shift_samples, 'training', sess)
# Run the graph with this batch of training data.
train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
[
merged_summaries,
evaluation_step,
cross_entropy_mean,
train_step,
increment_global_step,
],
feed_dict={
fingerprint_input: train_fingerprints,
ground_truth_input: train_ground_truth,
learning_rate_input: learning_rate_value,
dropout_prob: 0.5
})
train_writer.add_summary(train_summary, training_step)
tf.compat.v1.logging.info(
'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
(training_step, learning_rate_value, train_accuracy * 100,
cross_entropy_value))
is_last_step = (training_step == training_steps_max)
if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
set_size = audio_processor.set_size('validation')
total_accuracy = 0
total_conf_matrix = None
for i in xrange(0, set_size, FLAGS.batch_size):
validation_fingerprints, validation_ground_truth = (
audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
0.0, 0, 'validation', sess))
# Run a validation step and capture training summaries for TensorBoard
# with the `merged` op.
validation_summary, validation_accuracy, conf_matrix = sess.run(
[merged_summaries, evaluation_step, confusion_matrix],
feed_dict={
fingerprint_input: validation_fingerprints,
ground_truth_input: validation_ground_truth,
dropout_prob: 1.0
})
validation_writer.add_summary(validation_summary, training_step)
batch_size = min(FLAGS.batch_size, set_size - i)
total_accuracy += (validation_accuracy * batch_size) / set_size
if total_conf_matrix is None:
total_conf_matrix = conf_matrix
else:
total_conf_matrix += conf_matrix
tf.compat.v1.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
tf.compat.v1.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
(training_step, total_accuracy * 100, set_size))
# Save the model checkpoint periodically.
if (training_step % FLAGS.save_step_interval == 0 or
training_step == training_steps_max):
checkpoint_path = os.path.join(FLAGS.train_dir,
FLAGS.model_architecture + '.ckpt')
tf.compat.v1.logging.info('Saving to "%s-%d"', checkpoint_path,
training_step)
saver.save(sess, checkpoint_path, global_step=training_step)
set_size = audio_processor.set_size('testing')
tf.compat.v1.logging.info('set_size=%d', set_size)
total_accuracy = 0
total_conf_matrix = None
for i in xrange(0, set_size, FLAGS.batch_size):
test_fingerprints, test_ground_truth = audio_processor.get_data(
FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
test_accuracy, conf_matrix = sess.run(
[evaluation_step, confusion_matrix],
feed_dict={
fingerprint_input: test_fingerprints,
ground_truth_input: test_ground_truth,
dropout_prob: 1.0
})
batch_size = min(FLAGS.batch_size, set_size - i)
total_accuracy += (test_accuracy * batch_size) / set_size
if total_conf_matrix is None:
total_conf_matrix = conf_matrix
else:
total_conf_matrix += conf_matrix
tf.compat.v1.logging.warn('Confusion Matrix:\n %s' % (total_conf_matrix))
tf.compat.v1.logging.warn('Final test accuracy = %.1f%% (N=%d)' %
(total_accuracy * 100, set_size))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data_dir',
type=str,
default='Audio/training_data',
help="""\
Where to download the speech training data to.
""")
parser.add_argument(
'--background_volume',
type=float,
default=0.1,
help="""\
How loud the background noise should be, between 0 and 1.
""")
parser.add_argument(
'--background_frequency',
type=float,
default=0.0,
help="""\
How many of the training samples have background noise mixed in.
""")
parser.add_argument(
'--silence_percentage',
type=float,
default=10.0,
help="""\
How much of the training data should be silence.
""")
parser.add_argument(
'--unknown_percentage',
type=float,
default=10.0,
help="""\
How much of the training data should be unknown words.
""")
parser.add_argument(
'--time_shift_ms',
type=float,
default=100.0,
help="""\
Range to randomly shift the training audio by in time.
""")
parser.add_argument(
'--testing_percentage',
type=int,
default=10,
help='What percentage of wavs to use as a test set.')
parser.add_argument(
'--validation_percentage',
type=int,
default=10,
help='What percentage of wavs to use as a validation set.')
parser.add_argument(
'--sample_rate',
type=int,
default=16000,
help='Expected sample rate of the wavs',)
parser.add_argument(
'--clip_duration_ms',
type=int,
default=1000,
help='Expected duration in milliseconds of the wavs',)
parser.add_argument(
'--window_size_ms',
type=float,
default=30.0,
help='How long each spectrogram timeslice is.',)
parser.add_argument(
'--window_stride_ms',
type=float,
default=10.0,
help='How far to move in time between spectogram timeslices.',)
parser.add_argument(
'--feature_bin_count',
type=int,
default=40,
help='How many bins to use for the MFCC fingerprint',
)
parser.add_argument(
'--how_many_training_steps',
type=str,
default='200,50',
help='How many training loops to run',)
parser.add_argument(
'--eval_step_interval',
type=int,
default=50,
help='How often to evaluate the training results.')
parser.add_argument(
'--learning_rate',
type=str,
default='0.001,0.0001',
help='How large a learning rate to use when training.')
parser.add_argument(
'--batch_size',
type=int,
default=10,
help='How many items to train with at once',)
parser.add_argument(
'--summaries_dir',
type=str,
default='logs/retrain_logs',
help='Where to save summary logs for TensorBoard.')
parser.add_argument(
'--wanted_words',
type=str,
default='splash,footsteps,enemy',
help='Words to use (others will be added to an unknown label)',)
parser.add_argument(
'--train_dir',
type=str,
default='logs/commands_train',
help='Directory to write event logs and checkpoint.')
parser.add_argument(
'--save_step_interval',
type=int,
default=10,
help='Save model checkpoint every save_steps.')
parser.add_argument(
'--start_checkpoint',
type=str,
default='',
help='If specified, restore this pretrained model before any training.')
parser.add_argument(
'--model_architecture',
type=str,
default='conv',
help='What model architecture to use')
parser.add_argument(
'--check_nans',
type=bool,
default=False,
help='Whether to check for invalid numbers during processing')
parser.add_argument(
'--quantize',
type=bool,
default=False,
help='Whether to train the model for eight-bit deployment')
parser.add_argument(
'--preprocess',
type=str,
default='mfcc',
help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
parser.add_argument(
'--data_url',
type=str,
default='Audio/training_data',
help='Directory where training data resides')
# Function used to parse --verbosity argument
def verbosity_arg(value):
"""Parses verbosity argument.
Args:
value: A member of tf.logging.
Raises:
ArgumentTypeError: Not an expected value.
"""
value = value.upper()
if value == 'INFO':
return tf.compat.v1.logging.INFO
elif value == 'DEBUG':
return tf.compat.v1.logging.DEBUG
elif value == 'ERROR':
return tf.compat.v1.logging.ERROR
elif value == 'FATAL':
return tf.compat.v1.logging.FATAL
elif value == 'WARN':
return tf.compat.v1.logging.WARN
else:
raise argparse.ArgumentTypeError('Not an expected value')
parser.add_argument(
'--verbosity',
type=verbosity_arg,
default=tf.compat.v1.logging.INFO,
help='Log verbosity. Can be "INFO", "DEBUG", "ERROR", "FATAL", or "WARN"')
FLAGS, unparsed = parser.parse_known_args()
tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
My question is, how do I actually classify a wav file against this model?
it's a simple argmax classfy you just need to get the softmax predict output by add it to sess run params
pred , train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
[
predicted_indices ,
merged_summaries,
evaluation_step,
cross_entropy_mean,
train_step,
increment_global_step,
],
then compare the pred array with train_ground_truth which should be the dict or array container audio label.
I am experimenting with the gpt-2 model's conditional text generation to tweak it for a good chatbot. I am using nsheppard's code for retraining it on my custom dataset.
I trained my model on a custom dataset of conversations that I pulled from my facebook data. I changed the sample length to 20 as they are dialogues during interactive conditional generation.
The dataset looks something like this:
How are you
Hi Great and you
Am also good
So you re a graphic designer
Yeah
How can you contribute to making the game In d graphics aspect
Can you show me some of your work if u don t mind
Am planning to learn making it a motion type
U can go through my photos
K
Can you make animations for it
Flash animations to be specific
No please only stable ones
Ok
But, after the training when i try to chat with it, it is instead completing my sentences instead of replying to them.
User >>> bye
======================================== SAMPLE 1 ========================================
and
hi
are there any positions in khrzh being appointed right now
I understand that the interactive_conditional_samples.py was built to complete the sentence based on the prompt, but I thought changing the dataset would work and sure it doesn't work.
train.py
#!/usr/bin/env python3
# Usage:
# PYTHONPATH=src ./train --dataset <file|directory|glob>
import argparse
import json
import os
import numpy as np
import tensorflow as tf
import time
import tqdm
from tensorflow.core.protobuf import rewriter_config_pb2
import model, sample, encoder
from load_dataset import load_dataset, Sampler
from accumulate import AccumulatingOptimizer
import memory_saving_gradients
CHECKPOINT_DIR = 'checkpoint'
SAMPLE_DIR = 'samples'
parser = argparse.ArgumentParser(
description='Fine-tune GPT-2 on your custom dataset.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset', metavar='PATH', type=str, required=True, help='Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).')
parser.add_argument('--model_name', metavar='MODEL', type=str, default='117M', help='Pretrained model name')
parser.add_argument('--combine', metavar='CHARS', type=int, default=50000, help='Concatenate input files with <|endoftext|> separator into chunks of this minimum size')
parser.add_argument('--batch_size', metavar='SIZE', type=int, default=1, help='Batch size')
parser.add_argument('--learning_rate', metavar='LR', type=float, default=0.00002, help='Learning rate for Adam')
parser.add_argument('--accumulate_gradients', metavar='N', type=int, default=1, help='Accumulate gradients across N minibatches.')
parser.add_argument('--memory_saving_gradients', default=False, action='store_true', help='Use gradient checkpointing to reduce vram usage.')
parser.add_argument('--only_train_transformer_layers', default=False, action='store_true', help='Restrict training to the transformer blocks.')
parser.add_argument('--optimizer', type=str, default='adam', help='Optimizer. <adam|sgd>.')
parser.add_argument('--noise', type=float, default=0.0, help='Add noise to input training data to regularize against typos.')
parser.add_argument('--top_k', type=int, default=40, help='K for top-k sampling.')
parser.add_argument('--top_p', type=float, default=0.0, help='P for top-p sampling. Overrides top_k if set > 0.')
parser.add_argument('--restore_from', type=str, default='latest', help='Either "latest", "fresh", or a path to a checkpoint file')
parser.add_argument('--run_name', type=str, default='run1', help='Run id. Name of subdirectory in checkpoint/ and samples/')
parser.add_argument('--sample_every', metavar='N', type=int, default=100, help='Generate samples every N steps')
parser.add_argument('--sample_length', metavar='TOKENS', type=int, default=1023, help='Sample this many tokens')
parser.add_argument('--sample_num', metavar='N', type=int, default=1, help='Generate this many samples')
parser.add_argument('--save_every', metavar='N', type=int, default=1000, help='Write a checkpoint every N steps')
parser.add_argument('--val_dataset', metavar='PATH', type=str, default=None, help='Dataset for validation loss, defaults to --dataset.')
parser.add_argument('--val_batch_size', metavar='SIZE', type=int, default=2, help='Batch size for validation.')
parser.add_argument('--val_batch_count', metavar='N', type=int, default=40, help='Number of batches for validation.')
parser.add_argument('--val_every', metavar='STEPS', type=int, default=0, help='Calculate validation loss every STEPS steps.')
def maketree(path):
try:
os.makedirs(path)
except:
pass
def randomize(context, hparams, p):
if p > 0:
mask = tf.random.uniform(shape=tf.shape(context)) < p
noise = tf.random.uniform(shape=tf.shape(context), minval=0, maxval=hparams.n_vocab, dtype=tf.int32)
return tf.where(mask, noise, context)
else:
return context
def main():
args = parser.parse_args()
enc = encoder.get_encoder(args.model_name)
hparams = model.default_hparams()
with open(os.path.join('models', args.model_name, 'hparams.json')) as f:
hparams.override_from_dict(json.load(f))
if args.sample_length > hparams.n_ctx:
raise ValueError(
"Can't get samples longer than window size: %s" % hparams.n_ctx)
if args.model_name == '345M':
args.memory_saving_gradients = True
if args.optimizer == 'adam':
args.only_train_transformer_layers = True
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF
with tf.Session(config=config) as sess:
context = tf.placeholder(tf.int32, [args.batch_size, None])
context_in = randomize(context, hparams, args.noise)
output = model.model(hparams=hparams, X=context_in)
loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=context[:, 1:], logits=output['logits'][:, :-1]))
if args.val_every > 0:
val_context = tf.placeholder(tf.int32, [args.val_batch_size, None])
val_output = model.model(hparams=hparams, X=val_context)
val_loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=val_context[:, 1:], logits=val_output['logits'][:, :-1]))
val_loss_summary = tf.summary.scalar('val_loss', val_loss)
tf_sample = sample.sample_sequence(
hparams=hparams,
length=args.sample_length,
context=context,
batch_size=args.batch_size,
temperature=1.0,
top_k=args.top_k,
top_p=args.top_p)
all_vars = [v for v in tf.trainable_variables() if 'model' in v.name]
train_vars = [v for v in all_vars if '/h' in v.name] if args.only_train_transformer_layers else all_vars
if args.optimizer == 'adam':
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
elif args.optimizer == 'sgd':
opt = tf.train.GradientDescentOptimizer(learning_rate=args.learning_rate)
else:
exit('Bad optimizer:', args.optimizer)
if args.accumulate_gradients > 1:
if args.memory_saving_gradients:
exit("Memory saving gradients are not implemented for gradient accumulation yet.")
opt = AccumulatingOptimizer(
opt=opt,
var_list=train_vars)
opt_reset = opt.reset()
opt_compute = opt.compute_gradients(loss)
opt_apply = opt.apply_gradients()
summary_loss = tf.summary.scalar('loss', opt_apply)
else:
if args.memory_saving_gradients:
opt_grads = memory_saving_gradients.gradients(loss, train_vars)
else:
opt_grads = tf.gradients(loss, train_vars)
opt_grads = list(zip(opt_grads, train_vars))
opt_apply = opt.apply_gradients(opt_grads)
summary_loss = tf.summary.scalar('loss', loss)
summary_lr = tf.summary.scalar('learning_rate', args.learning_rate)
summaries = tf.summary.merge([summary_lr, summary_loss])
summary_log = tf.summary.FileWriter(
os.path.join(CHECKPOINT_DIR, args.run_name))
saver = tf.train.Saver(
var_list=all_vars,
max_to_keep=5,
keep_checkpoint_every_n_hours=2)
sess.run(tf.global_variables_initializer())
if args.restore_from == 'latest':
ckpt = tf.train.latest_checkpoint(
os.path.join(CHECKPOINT_DIR, args.run_name))
if ckpt is None:
# Get fresh GPT weights if new run.
ckpt = tf.train.latest_checkpoint(
os.path.join('models', args.model_name))
elif args.restore_from == 'fresh':
ckpt = tf.train.latest_checkpoint(
os.path.join('models', args.model_name))
else:
ckpt = tf.train.latest_checkpoint(args.restore_from)
print('Loading checkpoint', ckpt)
saver.restore(sess, ckpt)
print('Loading dataset...')
chunks = load_dataset(enc, args.dataset, args.combine)
data_sampler = Sampler(chunks)
if args.val_every > 0:
val_chunks = load_dataset(enc, args.val_dataset, args.combine) if args.val_dataset else chunks
print('dataset has', data_sampler.total_size, 'tokens')
print('Training...')
if args.val_every > 0:
# Sample from validation set once with fixed seed to make
# it deterministic during training as well as across runs.
val_data_sampler = Sampler(val_chunks, seed=1)
val_batches = [[val_data_sampler.sample(1024) for _ in range(args.val_batch_size)]
for _ in range(args.val_batch_count)]
counter = 1
counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter')
if os.path.exists(counter_path):
# Load the step number if we're resuming a run
# Add 1 so we don't immediately try to save again
with open(counter_path, 'r') as fp:
counter = int(fp.read()) + 1
def save():
maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
print(
'Saving',
os.path.join(CHECKPOINT_DIR, args.run_name,
'model-{}').format(counter))
saver.save(
sess,
os.path.join(CHECKPOINT_DIR, args.run_name, 'model'),
global_step=counter)
with open(counter_path, 'w') as fp:
fp.write(str(counter) + '\n')
def generate_samples():
print('Generating samples...')
context_tokens = data_sampler.sample(1)
all_text = []
index = 0
while index < args.sample_num:
out = sess.run(
tf_sample,
feed_dict={context: args.batch_size * [context_tokens]})
for i in range(min(args.sample_num - index, args.batch_size)):
text = enc.decode(out[i])
text = '======== SAMPLE {} ========\n{}\n'.format(
index + 1, text)
all_text.append(text)
index += 1
print(text)
maketree(os.path.join(SAMPLE_DIR, args.run_name))
with open(
os.path.join(SAMPLE_DIR, args.run_name,
'samples-{}').format(counter), 'w') as fp:
fp.write('\n'.join(all_text))
def validation():
print('Calculating validation loss...')
losses = []
for batch in tqdm.tqdm(val_batches):
losses.append(sess.run(val_loss, feed_dict={val_context: batch}))
v_val_loss = np.mean(losses)
v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss})
summary_log.add_summary(v_summary, counter)
summary_log.flush()
print(
'[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'
.format(
counter=counter,
time=time.time() - start_time,
loss=v_val_loss))
def sample_batch():
return [data_sampler.sample(1024) for _ in range(args.batch_size)]
avg_loss = (0.0, 0.0)
start_time = time.time()
try:
while True:
if counter % args.save_every == 0:
save()
if counter % args.sample_every == 0:
generate_samples()
if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1):
validation()
if args.accumulate_gradients > 1:
sess.run(opt_reset)
for _ in range(args.accumulate_gradients):
sess.run(
opt_compute, feed_dict={context: sample_batch()})
(v_loss, v_summary) = sess.run((opt_apply, summaries))
else:
(_, v_loss, v_summary) = sess.run(
(opt_apply, loss, summaries),
feed_dict={context: sample_batch()})
summary_log.add_summary(v_summary, counter)
avg_loss = (avg_loss[0] * 0.99 + v_loss,
avg_loss[1] * 0.99 + 1.0)
print(
'[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
.format(
counter=counter,
time=time.time() - start_time,
loss=v_loss,
avg=avg_loss[0] / avg_loss[1]))
counter += 1
except KeyboardInterrupt:
print('interrupted')
save()
if __name__ == '__main__':
main()
sample.py
import tensorflow as tf
import model
def top_k_logits(logits, k):
if k == 0:
# no truncation
return logits
def _top_k():
values, _ = tf.nn.top_k(logits, k=k)
min_values = values[:, -1, tf.newaxis]
return tf.where(
logits < min_values,
tf.ones_like(logits, dtype=logits.dtype) * -1e10,
logits,
)
return tf.cond(
tf.equal(k, 0),
lambda: logits,
lambda: _top_k(),
)
def top_p_logits(logits, p):
with tf.variable_scope('top_p_logits'):
logits_sort = tf.sort(logits, direction='DESCENDING')
probs_sort = tf.nn.softmax(logits_sort)
probs_sums = tf.cumsum(probs_sort, axis=1, exclusive=True)
logits_masked = tf.where(probs_sums < p, logits_sort, tf.ones_like(logits_sort)*1000) # [batchsize, vocab]
min_logits = tf.reduce_min(logits_masked, axis=1, keepdims=True) # [batchsize, 1]
return tf.where(
logits < min_logits,
tf.ones_like(logits, dtype=logits.dtype) * -1e10,
logits,
)
def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, top_p=0.0):
if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = tf.fill([batch_size, 1], start_token)
def step(hparams, tokens, past=None):
lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE)
logits = lm_output['logits'][:, :, :hparams.n_vocab]
presents = lm_output['present']
presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size))
return {
'logits': logits,
'presents': presents,
}
with tf.name_scope('sample_sequence'):
# Don't feed the last context token -- leave that to the loop below
# TODO: Would be slightly faster if we called step on the entire context,
# rather than leaving the last token transformer calculation to the while loop.
context_output = step(hparams, context[:, :-1])
def body(past, prev, output):
next_outputs = step(hparams, prev[:, tf.newaxis], past=past)
logits = next_outputs['logits'][:, -1, :] / tf.to_float(temperature)
if top_p > 0.0:
logits = top_p_logits(logits, p=top_p)
else:
logits = top_k_logits(logits, k=top_k)
samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32)
return [
tf.concat([past, next_outputs['presents']], axis=-2),
tf.squeeze(samples, axis=[1]),
tf.concat([output, samples], axis=1),
]
def cond(*args):
return True
_, _, tokens = tf.while_loop(
cond=cond, body=body,
maximum_iterations=length,
loop_vars=[
context_output['presents'],
context[:, -1],
context,
],
shape_invariants=[
tf.TensorShape(model.past_shape(hparams=hparams, batch_size=batch_size)),
tf.TensorShape([batch_size]),
tf.TensorShape([batch_size, None]),
],
back_prop=False,
)
return tokens
interactive_conditional_samples.py
#!/usr/bin/env python3
import fire
import json
import os
import numpy as np
import tensorflow as tf
import model, sample, encoder
def interact_model(
model_name='chatbot',
seed=None,
nsamples=1,
batch_size=1,
length=20,
temperature=1,
top_k=0,
top_p=0.0
):
"""
Interactively run the model
:model_name=chatbot : String, which model to use
:seed=None : Integer seed for random number generators, fix seed to reproduce
results
:nsamples=1 : Number of samples to return total
:batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples.
:length=None : Number of tokens in generated text, if None (default), is
determined by model hyperparameters
:temperature=1 : Float value controlling randomness in boltzmann
distribution. Lower temperature results in less random completions. As the
temperature approaches zero, the model will become deterministic and
repetitive. Higher temperature results in more random completions.
:top_k=0 : Integer value controlling diversity. 1 means only 1 word is
considered for each step (token), resulting in deterministic completions,
while 40 means 40 words are considered at each step. 0 (default) is a
special setting meaning no restrictions. 40 generally is a good value.
:top_p=0.0 : Float value controlling diversity. Implements nucleus sampling,
overriding top_k if set to a value > 0. A good setting is 0.9.
"""
if batch_size is None:
batch_size = 1
assert nsamples % batch_size == 0
enc = encoder.get_encoder(model_name)
hparams = model.default_hparams()
with open(os.path.join('models', model_name, 'hparams.json')) as f:
hparams.override_from_dict(json.load(f))
if length is None:
length = hparams.n_ctx // 2
elif length > hparams.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)
with tf.Session(graph=tf.Graph()) as sess:
context = tf.placeholder(tf.int32, [batch_size, None])
np.random.seed(seed)
tf.set_random_seed(seed)
output = sample.sample_sequence(
hparams=hparams, length=length,
context=context,
batch_size=batch_size,
temperature=temperature, top_k=top_k, top_p=top_p
)
saver = tf.train.Saver()
ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
saver.restore(sess, ckpt)
while True:
raw_text = input("User >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("User >>> ")
context_tokens = enc.encode(raw_text)
generated = 0
for _ in range(nsamples // batch_size):
out = sess.run(output, feed_dict={
context: [context_tokens for _ in range(batch_size)]
})[:, len(context_tokens):]
for i in range(batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
if __name__ == '__main__':
fire.Fire(interact_model)
How can I tweak the code to get it working like a chatbot? I am guessing it has something to do with the context part in sample.py, though i am unsure how is this going to work.
I know this is an old question now, but I have successfully tuned many Q&A style datasets on GPT-2 and have a suggestion that will work for future people who find this question.
GPT-2 reads unstructured text data, but it is very good at inferring and obeying structure in that data. Your issue is basically that you are not terminating your input lines with an identifier that GPT-2 understands, so it continues the sentence.
A simple way to fix this would be to annotate your dataset. Really anything with stop/start tokens will work, but you should also annotate the speaker identities. I would just do something like this:
A: How are you <EOL>
B: Hi Great and you <EOL>
A: Am also good <EOL>
B: So you re a graphic designer <EOL>
B: Another line from B <EOL>
The other benefit of this approach is that GPT-2 will learn multi-line input/output, and the different identities of the two conversants.
Problem is, all model sees is looking at the series of text you gave it, and trying to predict next most likely /token to be exact. It's not an encoder-decoder architecture. What you require is fine-tuning this architecture for a chatbot architecture.The only implementation I found regarding that one is here. But's it's done in pytorch so i am afraid it won't be what you are wanting.
https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313
I trained a Wide & Deep model using the pre-made Estimator class (DNNLinearCombinedClassifier), by essentially following the tutorial on tensorflow.org.
I wanted to do inference/serving, but without using tensorflow-serving. This basically comes down to feeding some test data to the correct input tensor and retrieving the output tensor.
However, I am not sure what the input nodes/layer should be. In the tensorflow graph (graph.pbtxt), the following nodes seem relevant. But they are also related to the input queue which is mainly used during training, but not necessarily inference (I can just send one instance at a time).
name: "enqueue_input/random_shuffle_queue"
name: "enqueue_input/Placeholder"
name: "enqueue_input/Placeholder_1"
name: "enqueue_input/Placeholder_2"
...
name: "enqueue_input/Placeholder_84"
name: "enqueue_input/random_shuffle_queue_EnqueueMany_1"
name: "enqueue_input/random_shuffle_queue_EnqueueMany_2"
name: "enqueue_input/random_shuffle_queue_EnqueueMany_3"
name: "enqueue_input/random_shuffle_queue_EnqueueMany_4"
name: "enqueue_input/random_shuffle_queue_EnqueueMany"
name: "enqueue_input/sub/y"
name: "enqueue_input/sub"
name: "enqueue_input/Maximum/x"
name: "enqueue_input/Maximum"
name: "enqueue_input/Cast"
name: "enqueue_input/mul/y"
name: "enqueue_input/mul"
Does anyone know the answer? Thanks in advance!
If you want inference, but without using tensorflow-serving, you can just use the tf.estimator.Estimator predict method.
But if you want to do it manually (so that is runs faster), you need a workaround. I am not sure if what I did was exactly the best approach, but it worked. Here's my solution.
1) Let's do the imports and create variables and fake data:
import os
import numpy as np
from functools import partial
import pickle
import tensorflow as tf
N = 10000
EPOCHS = 1000
BATCH_SIZE = 2
X_data = np.random.random((N, 10))
y_data = (np.random.random((N, 1)) >= 0.5).astype(int)
my_dir = os.getcwd() + "/"
2) Define an input_fn, which you will use tf.data.Dataset. Save the tensor names in a dictionary ("input_tensor_map"), which maps the input key to the tensor name.
def my_input_fn(X, y=None, is_training=False):
def internal_input_fn(X, y=None, is_training=False):
if (not isinstance(X, dict)):
X = {"x": X}
if (y is None):
dataset = tf.data.Dataset.from_tensor_slices(X)
else:
dataset = tf.data.Dataset.from_tensor_slices((X, y))
if (is_training):
dataset = dataset.repeat().shuffle(100)
batch_size = BATCH_SIZE
else:
batch_size = 1
dataset = dataset.batch(batch_size)
dataset_iter = dataset.make_initializable_iterator()
if (y is None):
features = dataset_iter.get_next()
labels = None
else:
features, labels = dataset_iter.get_next()
input_tensor_map = dict()
for input_name, tensor in features.items():
input_tensor_map[input_name] = tensor.name
with open(os.path.join(my_dir, 'input_tensor_map.pickle'), 'wb') as f:
pickle.dump(input_tensor_map, f, protocol=pickle.HIGHEST_PROTOCOL)
tf.add_to_collection(tf.GraphKeys.TABLE_INITIALIZERS, dataset_iter.initializer)
return (features, labels) if (not labels is None) else features
return partial(internal_input_fn, X=X, y=y, is_training=is_training)
3) Define your model, to be used in your tf.estimator.Estimator. For example:
def my_model_fn(features, labels, mode):
output = tf.layers.dense(inputs=features["x"], units=1, activation=None)
logits = tf.identity(output, name="logits")
prediction = tf.nn.sigmoid(logits, name="predictions")
classes = tf.to_int64(tf.greater(logits, 0.0), name="classes")
predictions_dict = {
"class": classes,
"probabilities": prediction
}
if mode == tf.estimator.ModeKeys.PREDICT:
return tf.estimator.EstimatorSpec(mode=mode, predictions=predictions_dict)
one_hot_labels = tf.squeeze(tf.one_hot(tf.cast(labels, dtype=tf.int32), 2))
loss = tf.losses.sigmoid_cross_entropy(multi_class_labels=one_hot_labels, logits=logits)
tf.summary.scalar("loss", loss)
accuracy = tf.reduce_mean(tf.to_float(tf.equal(labels, classes)))
tf.summary.scalar("accuracy", accuracy)
# Configure the Training Op (for TRAIN mode)
if (mode == tf.estimator.ModeKeys.TRAIN):
train_op = tf.train.AdamOptimizer().minimize(loss, global_step=tf.train.get_global_step())
return tf.estimator.EstimatorSpec(mode=mode, loss=loss, train_op=train_op)
return tf.estimator.EstimatorSpec(mode=mode, loss=loss)
4) Train and freeze your model. The freeze method is from TensorFlow: How to freeze a model and serve it with a python API, which I added a tiny modification.
def freeze_graph(output_node_names):
"""Extract the sub graph defined by the output nodes and convert
all its variables into constant
Args:
model_dir: the root folder containing the checkpoint state file
output_node_names: a string, containing all the output node's names,
comma separated
"""
if (output_node_names is None):
output_node_names = 'loss'
if not tf.gfile.Exists(my_dir):
raise AssertionError(
"Export directory doesn't exists. Please specify an export "
"directory: %s" % my_dir)
if not output_node_names:
print("You need to supply the name of a node to --output_node_names.")
return -1
# We retrieve our checkpoint fullpath
checkpoint = tf.train.get_checkpoint_state(my_dir)
input_checkpoint = checkpoint.model_checkpoint_path
# We precise the file fullname of our freezed graph
absolute_model_dir = "/".join(input_checkpoint.split('/')[:-1])
output_graph = absolute_model_dir + "/frozen_model.pb"
# We clear devices to allow TensorFlow to control on which device it will load operations
clear_devices = True
# We start a session using a temporary fresh Graph
with tf.Session(graph=tf.Graph()) as sess:
# We import the meta graph in the current default Graph
saver = tf.train.import_meta_graph(input_checkpoint + '.meta', clear_devices=clear_devices)
# We restore the weights
saver.restore(sess, input_checkpoint)
# We use a built-in TF helper to export variables to constants
output_graph_def = tf.graph_util.convert_variables_to_constants(
sess, # The session is used to retrieve the weights
tf.get_default_graph().as_graph_def(), # The graph_def is used to retrieve the nodes
output_node_names.split(",") # The output node names are used to select the usefull nodes
)
# Finally we serialize and dump the output graph to the filesystem
with tf.gfile.GFile(output_graph, "wb") as f:
f.write(output_graph_def.SerializeToString())
print("%d ops in the final graph." % len(output_graph_def.node))
return output_graph_def
# *****************************************************************************
tf.logging.set_verbosity(tf.logging.INFO)
estimator = tf.estimator.Estimator(model_fn=my_model_fn, model_dir=my_dir)
if (estimator.latest_checkpoint() is None):
estimator.train(input_fn=my_input_fn(X=X_data, y=y_data, is_training=True), steps=EPOCHS)
freeze_graph("predictions,classes")
tf.logging.set_verbosity(tf.logging.INFO)
estimator = tf.estimator.Estimator(model_fn=my_model_fn, model_dir=my_dir)
if (estimator.latest_checkpoint() is None):
estimator.train(input_fn=my_input_fn(X=X_data, y=y_data, is_training=True), steps=EPOCHS)
freeze_graph("predictions,classes")
5) Finally, you can use the frozen graph for inference, input tensors names are in the dictionary that you saved. Again, the method to load the freezed model from TensorFlow: How to freeze a model and serve it with a python API.
def load_frozen_graph(prefix="frozen_graph"):
frozen_graph_filename = os.path.join(my_dir, "frozen_model.pb")
# We load the protobuf file from the disk and parse it to retrieve the
# unserialized graph_def
with tf.gfile.GFile(frozen_graph_filename, "rb") as f:
graph_def = tf.GraphDef()
graph_def.ParseFromString(f.read())
# Then, we import the graph_def into a new Graph and returns it
with tf.Graph().as_default() as graph:
# The name var will prefix every op/nodes in your graph
# Since we load everything in a new graph, this is not needed
tf.import_graph_def(graph_def, name=prefix)
return graph
# *****************************************************************************
X_test = {"x": np.random.random((int(N/2), 10))}
prefix = "frozen_graph"
graph = load_frozen_graph(prefix)
for op in graph.get_operations():
print(op.name)
with open(os.path.join(my_dir, 'input_tensor_map.pickle'), 'rb') as f:
input_tensor_map = pickle.load(f)
with tf.Session(graph=graph) as sess:
input_feed = dict()
for key, tensor_name in input_tensor_map.items():
tensor = graph.get_tensor_by_name(prefix + "/" + tensor_name)
input_feed[tensor] = X_test[key]
logits = graph.get_operation_by_name(prefix + "/logits").outputs[0]
probabilities = graph.get_operation_by_name(prefix + "/predictions").outputs[0]
classes = graph.get_operation_by_name(prefix + "/classes").outputs[0]
logits_values, probabilities_values, classes_values = sess.run([logits, probabilities, classes], feed_dict=input_feed)