Fine tune GPT-2 Text Prediction for Conversational AI

Fine tune GPT-2 Text Prediction for Conversational AI - python

I am experimenting with the gpt-2 model's conditional text generation to tweak it for a good chatbot. I am using nsheppard's code for retraining it on my custom dataset.
I trained my model on a custom dataset of conversations that I pulled from my facebook data. I changed the sample length to 20 as they are dialogues during interactive conditional generation.
The dataset looks something like this:
How are you
Hi Great and you
Am also good
So you re a graphic designer
Yeah
How can you contribute to making the game In d graphics aspect
Can you show me some of your work if u don t mind
Am planning to learn making it a motion type
U can go through my photos
K
Can you make animations for it
Flash animations to be specific
No please only stable ones
Ok
But, after the training when i try to chat with it, it is instead completing my sentences instead of replying to them.
User >>> bye
======================================== SAMPLE 1 ========================================
and
hi
are there any positions in khrzh being appointed right now
I understand that the interactive_conditional_samples.py was built to complete the sentence based on the prompt, but I thought changing the dataset would work and sure it doesn't work.
train.py
#!/usr/bin/env python3
# Usage:
# PYTHONPATH=src ./train --dataset <file|directory|glob>
import argparse
import json
import os
import numpy as np
import tensorflow as tf
import time
import tqdm
from tensorflow.core.protobuf import rewriter_config_pb2
import model, sample, encoder
from load_dataset import load_dataset, Sampler
from accumulate import AccumulatingOptimizer
import memory_saving_gradients
CHECKPOINT_DIR = 'checkpoint'
SAMPLE_DIR = 'samples'
parser = argparse.ArgumentParser(
description='Fine-tune GPT-2 on your custom dataset.',
formatter_class=argparse.ArgumentDefaultsHelpFormatter)
parser.add_argument('--dataset', metavar='PATH', type=str, required=True, help='Input file, directory, or glob pattern (utf-8 text, or preencoded .npz files).')
parser.add_argument('--model_name', metavar='MODEL', type=str, default='117M', help='Pretrained model name')
parser.add_argument('--combine', metavar='CHARS', type=int, default=50000, help='Concatenate input files with <|endoftext|> separator into chunks of this minimum size')
parser.add_argument('--batch_size', metavar='SIZE', type=int, default=1, help='Batch size')
parser.add_argument('--learning_rate', metavar='LR', type=float, default=0.00002, help='Learning rate for Adam')
parser.add_argument('--accumulate_gradients', metavar='N', type=int, default=1, help='Accumulate gradients across N minibatches.')
parser.add_argument('--memory_saving_gradients', default=False, action='store_true', help='Use gradient checkpointing to reduce vram usage.')
parser.add_argument('--only_train_transformer_layers', default=False, action='store_true', help='Restrict training to the transformer blocks.')
parser.add_argument('--optimizer', type=str, default='adam', help='Optimizer. <adam|sgd>.')
parser.add_argument('--noise', type=float, default=0.0, help='Add noise to input training data to regularize against typos.')
parser.add_argument('--top_k', type=int, default=40, help='K for top-k sampling.')
parser.add_argument('--top_p', type=float, default=0.0, help='P for top-p sampling. Overrides top_k if set > 0.')
parser.add_argument('--restore_from', type=str, default='latest', help='Either "latest", "fresh", or a path to a checkpoint file')
parser.add_argument('--run_name', type=str, default='run1', help='Run id. Name of subdirectory in checkpoint/ and samples/')
parser.add_argument('--sample_every', metavar='N', type=int, default=100, help='Generate samples every N steps')
parser.add_argument('--sample_length', metavar='TOKENS', type=int, default=1023, help='Sample this many tokens')
parser.add_argument('--sample_num', metavar='N', type=int, default=1, help='Generate this many samples')
parser.add_argument('--save_every', metavar='N', type=int, default=1000, help='Write a checkpoint every N steps')
parser.add_argument('--val_dataset', metavar='PATH', type=str, default=None, help='Dataset for validation loss, defaults to --dataset.')
parser.add_argument('--val_batch_size', metavar='SIZE', type=int, default=2, help='Batch size for validation.')
parser.add_argument('--val_batch_count', metavar='N', type=int, default=40, help='Number of batches for validation.')
parser.add_argument('--val_every', metavar='STEPS', type=int, default=0, help='Calculate validation loss every STEPS steps.')
def maketree(path):
try:
os.makedirs(path)
except:
pass
def randomize(context, hparams, p):
if p > 0:
mask = tf.random.uniform(shape=tf.shape(context)) < p
noise = tf.random.uniform(shape=tf.shape(context), minval=0, maxval=hparams.n_vocab, dtype=tf.int32)
return tf.where(mask, noise, context)
else:
return context
def main():
args = parser.parse_args()
enc = encoder.get_encoder(args.model_name)
hparams = model.default_hparams()
with open(os.path.join('models', args.model_name, 'hparams.json')) as f:
hparams.override_from_dict(json.load(f))
if args.sample_length > hparams.n_ctx:
raise ValueError(
"Can't get samples longer than window size: %s" % hparams.n_ctx)
if args.model_name == '345M':
args.memory_saving_gradients = True
if args.optimizer == 'adam':
args.only_train_transformer_layers = True
config = tf.ConfigProto()
config.gpu_options.allow_growth = True
config.graph_options.rewrite_options.layout_optimizer = rewriter_config_pb2.RewriterConfig.OFF
with tf.Session(config=config) as sess:
context = tf.placeholder(tf.int32, [args.batch_size, None])
context_in = randomize(context, hparams, args.noise)
output = model.model(hparams=hparams, X=context_in)
loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=context[:, 1:], logits=output['logits'][:, :-1]))
if args.val_every > 0:
val_context = tf.placeholder(tf.int32, [args.val_batch_size, None])
val_output = model.model(hparams=hparams, X=val_context)
val_loss = tf.reduce_mean(
tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=val_context[:, 1:], logits=val_output['logits'][:, :-1]))
val_loss_summary = tf.summary.scalar('val_loss', val_loss)
tf_sample = sample.sample_sequence(
hparams=hparams,
length=args.sample_length,
context=context,
batch_size=args.batch_size,
temperature=1.0,
top_k=args.top_k,
top_p=args.top_p)
all_vars = [v for v in tf.trainable_variables() if 'model' in v.name]
train_vars = [v for v in all_vars if '/h' in v.name] if args.only_train_transformer_layers else all_vars
if args.optimizer == 'adam':
opt = tf.train.AdamOptimizer(learning_rate=args.learning_rate)
elif args.optimizer == 'sgd':
opt = tf.train.GradientDescentOptimizer(learning_rate=args.learning_rate)
else:
exit('Bad optimizer:', args.optimizer)
if args.accumulate_gradients > 1:
if args.memory_saving_gradients:
exit("Memory saving gradients are not implemented for gradient accumulation yet.")
opt = AccumulatingOptimizer(
opt=opt,
var_list=train_vars)
opt_reset = opt.reset()
opt_compute = opt.compute_gradients(loss)
opt_apply = opt.apply_gradients()
summary_loss = tf.summary.scalar('loss', opt_apply)
else:
if args.memory_saving_gradients:
opt_grads = memory_saving_gradients.gradients(loss, train_vars)
else:
opt_grads = tf.gradients(loss, train_vars)
opt_grads = list(zip(opt_grads, train_vars))
opt_apply = opt.apply_gradients(opt_grads)
summary_loss = tf.summary.scalar('loss', loss)
summary_lr = tf.summary.scalar('learning_rate', args.learning_rate)
summaries = tf.summary.merge([summary_lr, summary_loss])
summary_log = tf.summary.FileWriter(
os.path.join(CHECKPOINT_DIR, args.run_name))
saver = tf.train.Saver(
var_list=all_vars,
max_to_keep=5,
keep_checkpoint_every_n_hours=2)
sess.run(tf.global_variables_initializer())
if args.restore_from == 'latest':
ckpt = tf.train.latest_checkpoint(
os.path.join(CHECKPOINT_DIR, args.run_name))
if ckpt is None:
# Get fresh GPT weights if new run.
ckpt = tf.train.latest_checkpoint(
os.path.join('models', args.model_name))
elif args.restore_from == 'fresh':
ckpt = tf.train.latest_checkpoint(
os.path.join('models', args.model_name))
else:
ckpt = tf.train.latest_checkpoint(args.restore_from)
print('Loading checkpoint', ckpt)
saver.restore(sess, ckpt)
print('Loading dataset...')
chunks = load_dataset(enc, args.dataset, args.combine)
data_sampler = Sampler(chunks)
if args.val_every > 0:
val_chunks = load_dataset(enc, args.val_dataset, args.combine) if args.val_dataset else chunks
print('dataset has', data_sampler.total_size, 'tokens')
print('Training...')
if args.val_every > 0:
# Sample from validation set once with fixed seed to make
# it deterministic during training as well as across runs.
val_data_sampler = Sampler(val_chunks, seed=1)
val_batches = [[val_data_sampler.sample(1024) for _ in range(args.val_batch_size)]
for _ in range(args.val_batch_count)]
counter = 1
counter_path = os.path.join(CHECKPOINT_DIR, args.run_name, 'counter')
if os.path.exists(counter_path):
# Load the step number if we're resuming a run
# Add 1 so we don't immediately try to save again
with open(counter_path, 'r') as fp:
counter = int(fp.read()) + 1
def save():
maketree(os.path.join(CHECKPOINT_DIR, args.run_name))
print(
'Saving',
os.path.join(CHECKPOINT_DIR, args.run_name,
'model-{}').format(counter))
saver.save(
sess,
os.path.join(CHECKPOINT_DIR, args.run_name, 'model'),
global_step=counter)
with open(counter_path, 'w') as fp:
fp.write(str(counter) + '\n')
def generate_samples():
print('Generating samples...')
context_tokens = data_sampler.sample(1)
all_text = []
index = 0
while index < args.sample_num:
out = sess.run(
tf_sample,
feed_dict={context: args.batch_size * [context_tokens]})
for i in range(min(args.sample_num - index, args.batch_size)):
text = enc.decode(out[i])
text = '======== SAMPLE {} ========\n{}\n'.format(
index + 1, text)
all_text.append(text)
index += 1
print(text)
maketree(os.path.join(SAMPLE_DIR, args.run_name))
with open(
os.path.join(SAMPLE_DIR, args.run_name,
'samples-{}').format(counter), 'w') as fp:
fp.write('\n'.join(all_text))
def validation():
print('Calculating validation loss...')
losses = []
for batch in tqdm.tqdm(val_batches):
losses.append(sess.run(val_loss, feed_dict={val_context: batch}))
v_val_loss = np.mean(losses)
v_summary = sess.run(val_loss_summary, feed_dict={val_loss: v_val_loss})
summary_log.add_summary(v_summary, counter)
summary_log.flush()
print(
'[{counter} | {time:2.2f}] validation loss = {loss:2.2f}'
.format(
counter=counter,
time=time.time() - start_time,
loss=v_val_loss))
def sample_batch():
return [data_sampler.sample(1024) for _ in range(args.batch_size)]
avg_loss = (0.0, 0.0)
start_time = time.time()
try:
while True:
if counter % args.save_every == 0:
save()
if counter % args.sample_every == 0:
generate_samples()
if args.val_every > 0 and (counter % args.val_every == 0 or counter == 1):
validation()
if args.accumulate_gradients > 1:
sess.run(opt_reset)
for _ in range(args.accumulate_gradients):
sess.run(
opt_compute, feed_dict={context: sample_batch()})
(v_loss, v_summary) = sess.run((opt_apply, summaries))
else:
(_, v_loss, v_summary) = sess.run(
(opt_apply, loss, summaries),
feed_dict={context: sample_batch()})
summary_log.add_summary(v_summary, counter)
avg_loss = (avg_loss[0] * 0.99 + v_loss,
avg_loss[1] * 0.99 + 1.0)
print(
'[{counter} | {time:2.2f}] loss={loss:2.2f} avg={avg:2.2f}'
.format(
counter=counter,
time=time.time() - start_time,
loss=v_loss,
avg=avg_loss[0] / avg_loss[1]))
counter += 1
except KeyboardInterrupt:
print('interrupted')
save()
if __name__ == '__main__':
main()
sample.py
import tensorflow as tf
import model
def top_k_logits(logits, k):
if k == 0:
# no truncation
return logits
def _top_k():
values, _ = tf.nn.top_k(logits, k=k)
min_values = values[:, -1, tf.newaxis]
return tf.where(
logits < min_values,
tf.ones_like(logits, dtype=logits.dtype) * -1e10,
logits,
)
return tf.cond(
tf.equal(k, 0),
lambda: logits,
lambda: _top_k(),
)
def top_p_logits(logits, p):
with tf.variable_scope('top_p_logits'):
logits_sort = tf.sort(logits, direction='DESCENDING')
probs_sort = tf.nn.softmax(logits_sort)
probs_sums = tf.cumsum(probs_sort, axis=1, exclusive=True)
logits_masked = tf.where(probs_sums < p, logits_sort, tf.ones_like(logits_sort)*1000) # [batchsize, vocab]
min_logits = tf.reduce_min(logits_masked, axis=1, keepdims=True) # [batchsize, 1]
return tf.where(
logits < min_logits,
tf.ones_like(logits, dtype=logits.dtype) * -1e10,
logits,
)
def sample_sequence(*, hparams, length, start_token=None, batch_size=None, context=None, temperature=1, top_k=0, top_p=0.0):
if start_token is None:
assert context is not None, 'Specify exactly one of start_token and context!'
else:
assert context is None, 'Specify exactly one of start_token and context!'
context = tf.fill([batch_size, 1], start_token)
def step(hparams, tokens, past=None):
lm_output = model.model(hparams=hparams, X=tokens, past=past, reuse=tf.AUTO_REUSE)
logits = lm_output['logits'][:, :, :hparams.n_vocab]
presents = lm_output['present']
presents.set_shape(model.past_shape(hparams=hparams, batch_size=batch_size))
return {
'logits': logits,
'presents': presents,
}
with tf.name_scope('sample_sequence'):
# Don't feed the last context token -- leave that to the loop below
# TODO: Would be slightly faster if we called step on the entire context,
# rather than leaving the last token transformer calculation to the while loop.
context_output = step(hparams, context[:, :-1])
def body(past, prev, output):
next_outputs = step(hparams, prev[:, tf.newaxis], past=past)
logits = next_outputs['logits'][:, -1, :] / tf.to_float(temperature)
if top_p > 0.0:
logits = top_p_logits(logits, p=top_p)
else:
logits = top_k_logits(logits, k=top_k)
samples = tf.multinomial(logits, num_samples=1, output_dtype=tf.int32)
return [
tf.concat([past, next_outputs['presents']], axis=-2),
tf.squeeze(samples, axis=[1]),
tf.concat([output, samples], axis=1),
]
def cond(*args):
return True
_, _, tokens = tf.while_loop(
cond=cond, body=body,
maximum_iterations=length,
loop_vars=[
context_output['presents'],
context[:, -1],
context,
],
shape_invariants=[
tf.TensorShape(model.past_shape(hparams=hparams, batch_size=batch_size)),
tf.TensorShape([batch_size]),
tf.TensorShape([batch_size, None]),
],
back_prop=False,
)
return tokens
interactive_conditional_samples.py
#!/usr/bin/env python3
import fire
import json
import os
import numpy as np
import tensorflow as tf
import model, sample, encoder
def interact_model(
model_name='chatbot',
seed=None,
nsamples=1,
batch_size=1,
length=20,
temperature=1,
top_k=0,
top_p=0.0
):
"""
Interactively run the model
:model_name=chatbot : String, which model to use
:seed=None : Integer seed for random number generators, fix seed to reproduce
results
:nsamples=1 : Number of samples to return total
:batch_size=1 : Number of batches (only affects speed/memory). Must divide nsamples.
:length=None : Number of tokens in generated text, if None (default), is
determined by model hyperparameters
:temperature=1 : Float value controlling randomness in boltzmann
distribution. Lower temperature results in less random completions. As the
temperature approaches zero, the model will become deterministic and
repetitive. Higher temperature results in more random completions.
:top_k=0 : Integer value controlling diversity. 1 means only 1 word is
considered for each step (token), resulting in deterministic completions,
while 40 means 40 words are considered at each step. 0 (default) is a
special setting meaning no restrictions. 40 generally is a good value.
:top_p=0.0 : Float value controlling diversity. Implements nucleus sampling,
overriding top_k if set to a value > 0. A good setting is 0.9.
"""
if batch_size is None:
batch_size = 1
assert nsamples % batch_size == 0
enc = encoder.get_encoder(model_name)
hparams = model.default_hparams()
with open(os.path.join('models', model_name, 'hparams.json')) as f:
hparams.override_from_dict(json.load(f))
if length is None:
length = hparams.n_ctx // 2
elif length > hparams.n_ctx:
raise ValueError("Can't get samples longer than window size: %s" % hparams.n_ctx)
with tf.Session(graph=tf.Graph()) as sess:
context = tf.placeholder(tf.int32, [batch_size, None])
np.random.seed(seed)
tf.set_random_seed(seed)
output = sample.sample_sequence(
hparams=hparams, length=length,
context=context,
batch_size=batch_size,
temperature=temperature, top_k=top_k, top_p=top_p
)
saver = tf.train.Saver()
ckpt = tf.train.latest_checkpoint(os.path.join('models', model_name))
saver.restore(sess, ckpt)
while True:
raw_text = input("User >>> ")
while not raw_text:
print('Prompt should not be empty!')
raw_text = input("User >>> ")
context_tokens = enc.encode(raw_text)
generated = 0
for _ in range(nsamples // batch_size):
out = sess.run(output, feed_dict={
context: [context_tokens for _ in range(batch_size)]
})[:, len(context_tokens):]
for i in range(batch_size):
generated += 1
text = enc.decode(out[i])
print("=" * 40 + " SAMPLE " + str(generated) + " " + "=" * 40)
print(text)
print("=" * 80)
if __name__ == '__main__':
fire.Fire(interact_model)
How can I tweak the code to get it working like a chatbot? I am guessing it has something to do with the context part in sample.py, though i am unsure how is this going to work.

I know this is an old question now, but I have successfully tuned many Q&A style datasets on GPT-2 and have a suggestion that will work for future people who find this question.
GPT-2 reads unstructured text data, but it is very good at inferring and obeying structure in that data. Your issue is basically that you are not terminating your input lines with an identifier that GPT-2 understands, so it continues the sentence.
A simple way to fix this would be to annotate your dataset. Really anything with stop/start tokens will work, but you should also annotate the speaker identities. I would just do something like this:
A: How are you <EOL>
B: Hi Great and you <EOL>
A: Am also good <EOL>
B: So you re a graphic designer <EOL>
B: Another line from B <EOL>
The other benefit of this approach is that GPT-2 will learn multi-line input/output, and the different identities of the two conversants.

Problem is, all model sees is looking at the series of text you gave it, and trying to predict next most likely /token to be exact. It's not an encoder-decoder architecture. What you require is fine-tuning this architecture for a chatbot architecture.The only implementation I found regarding that one is here. But's it's done in pytorch so i am afraid it won't be what you are wanting.
https://medium.com/huggingface/how-to-build-a-state-of-the-art-conversational-ai-with-transfer-learning-2d818ac26313

Related

Why does an ANN validation accuracy oscillate?

The following training curve is generated using the same Tensorflow + Keras script written in Python:
RED line uses five features.
GREEN line uses seven features.
BLUE line uses nine features.
Can anyone tell me the probable cause of the oscillation of the GREEN line so that I can troubleshoot my script?
Source code:
import os
os.environ["TF_CPP_MIN_LOG_LEVEL"] = "2"
#os.environ["CUDA_DEVICE_ORDER"] = "PCI_BUS_ID"
os.environ["CUDA_VISIBLE_DEVICES"] = "0" # Use both gpus for training.
import sys, random
import time
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.models import Sequential
from tensorflow.keras.callbacks import ModelCheckpoint
import numpy as np
from lxml import etree, objectify
# <editor-fold desc="GPU">
# resolve GPU related issues.
try:
physical_devices = tf.config.list_physical_devices('GPU')
for gpu_instance in physical_devices:
tf.config.experimental.set_memory_growth(gpu_instance, True)
except Exception as e:
pass
# END of try
# </editor-fold>
# <editor-fold desc="Lxml helper">
class LxmlHelper:
#classmethod
def objectify_xml(cls, input_path_dir):
file_dom = etree.parse(input_path_dir) # parse xml and convert it into DOM
file_xml_bin = etree.tostring(file_dom, pretty_print=False, encoding="ascii") # encode DOM into ASCII object
file_xml_text = file_xml_bin.decode() # convert binary ASCII object into ASCII text
objectified_xml = objectify.fromstring(file_xml_text) # convert text into a Doxygen object
return objectified_xml
# </editor-fold>
# <editor-fold desc="def encode(letter)">
def encode(letter: str):
if letter == 'H':
return [1.0, 0.0, 0.0]
elif letter == 'E':
return [0.0, 1.0, 0.0]
elif letter == 'C':
return [0.0, 0.0, 1.0]
elif letter == '-':
return [0.0, 0.0, 0.0]
# END of function
def encode_string_1(pattern_str: str):
# Iterate over the string
one_hot_binary_str = []
for ch in pattern_str:
try:
one_hot_binary_str = one_hot_binary_str + encode(ch)
except Exception as e:
print(pattern_str, one_hot_binary_str, ch)
# END of for loop
return one_hot_binary_str
# END of function
def encode_string_2(pattern_str: str):
# Iterate over the string
one_hot_binary_str = []
for ch in pattern_str:
temp_encoded_vect = [encode(ch)]
one_hot_binary_str = one_hot_binary_str + temp_encoded_vect
# END of for loop
return one_hot_binary_str
# END of function
# </editor-fold>
# <editor-fold desc="def load_data()">
def load_data_k(fname: str, class_index: int, feature_start_index: int, **selection):
"""Loads data for training and validation
:param fname: (``string``) - name of the file with the data
:param selection: (``kwargs``) - see below
:return: four tensorflow tensors: training input, training output, validation input and validation output
:Keyword Arguments:
* *top_n_lines* (``number``) --
take top N lines of the input and disregard the rest
* *random_n_lines* (``number``) --
take random N lines of the input and disregard the rest
* *validation_part* (``float``) --
separate N_lines * given_fraction of the input lines from the training set and use
them for validation. When the given_fraction = 1.0, then the same input set of
N_lines is used both for training and validation (this is the default)
"""
i = 0
file = open(fname)
if "top_n_lines" in selection:
lines = [next(file) for _ in range(int(selection["top_n_lines"]))]
elif "random_n_lines" in selection:
tmp_lines = file.readlines()
lines = random.sample(tmp_lines, int(selection["random_n_lines"]))
else:
lines = file.readlines()
data_x, data_y, data_z = [], [], []
for l in lines:
row = l.strip().split() # return a list of words from the line.
x = [float(ix) for ix in row[feature_start_index:]] # convert 3rd to 20th word into a vector of float numbers.
y = encode(row[class_index]) # convert the 3rd word into binary.
z = encode_string_1(row[class_index+1])
data_x.append(x) # append the vector into 'data_x'
data_y.append(y) # append the vector into 'data_y'
data_z.append(z) # append the vector into 'data_z'
# END for l in lines
num_rows = len(data_x)
given_fraction = selection.get("validation_part", 1.0)
if given_fraction > 0.9999:
valid_x, valid_y, valid_z = data_x, data_y, data_z
else:
n = int(num_rows * given_fraction)
data_x, data_y, data_z = data_x[n:], data_y[n:], data_z[n:]
valid_x, valid_y, valid_z = data_x[:n], data_y[:n], data_z[:n]
# END of if-else block
tx = tf.convert_to_tensor(data_x, np.float32)
ty = tf.convert_to_tensor(data_y, np.float32)
tz = tf.convert_to_tensor(data_z, np.float32)
vx = tf.convert_to_tensor(valid_x, np.float32)
vy = tf.convert_to_tensor(valid_y, np.float32)
vz = tf.convert_to_tensor(valid_z, np.float32)
return tx, ty, tz, vx, vy, vz
# END of the function
# </editor-fold>
# <editor-fold desc="def create_model()">
def create_model(n_hidden_1, n_hidden_2, num_classes, num_features):
# create the model
model = Sequential()
model.add(tf.keras.layers.InputLayer(input_shape=(num_features,)))
model.add(tf.keras.layers.Dense(n_hidden_1, activation='sigmoid'))
model.add(tf.keras.layers.Dense(n_hidden_2, activation='sigmoid'))
###model.add(tf.keras.layers.Dense(n_hidden_3, activation='sigmoid'))
model.add(tf.keras.layers.Dense(num_classes, activation='softmax'))
# instantiate the optimizer
opt = keras.optimizers.SGD(learning_rate=LEARNING_RATE)
# compile the model
model.compile(
optimizer=opt,
loss="categorical_crossentropy",
metrics="categorical_accuracy"
)
# return model
return model
# </editor-fold>
if __name__ == "__main__":
# <editor-fold desc="(input/output parameters)">
my_project_routine = LxmlHelper.objectify_xml("my_project_evaluate.xml")
# input data
INPUT_DATA_FILE = str(my_project_routine.input.input_data_file)
INPUT_PATH = str(my_project_routine.input.input_path)
CLASS_INDEX = int(my_project_routine.input.class_index)
FEATURE_INDEX = int(my_project_routine.input.feature_index)
# output data
OUTPUT_PATH = str(my_project_routine.output.output_path)
MODEL_FILE = str(my_project_routine.output.model_file)
TRAINING_PROGRESS_FILE = str(my_project_routine.output.training_progress_file)
# Learning parameters
LEARNING_RATE = float(my_project_routine.training_params.learning_rate)
EPOCH_SIZE = int(my_project_routine.training_params.epoch_size)
BATCH_SIZE = int(my_project_routine.training_params.batch_size)
INPUT_LINES_COUNT = int(my_project_routine.input.input_lines_count)
VALIDATION_PART = float(my_project_routine.training_params.validation_part)
SAVE_PERIOD = str(my_project_routine.output.save_period)
# NN parameters
HIDDEN_LAYER_1_NEURON_COUNT = int(my_project_routine.hidden_layers.one)
HIDDEN_LAYER_2_NEURON_COUNT = int(my_project_routine.hidden_layers.two)
###HIDDEN_LAYER_3_NEURON_COUNT = int(my_project_routine.hidden_layers.three)
CLASS_COUNT = int(my_project_routine.class_count)
FEATURES_COUNT = int(my_project_routine.features_count)
input_file_path_str = os.path.join(INPUT_PATH, INPUT_DATA_FILE)
training_progress_file_path_str = os.path.join(OUTPUT_PATH, TRAINING_PROGRESS_FILE)
model_file_path = os.path.join(OUTPUT_PATH, MODEL_FILE)
# command-line arg processing
input_file_name_str = None
if len(sys.argv) > 1:
input_file_name_str = sys.argv[1]
else:
input_file_name_str = input_file_path_str
# END of if-else
# </editor-fold>
# <editor-fold desc="(load data from file)">
# load training data from the disk
train_x, train_y, _, validate_x, validate_y, _ = \
load_data_k(
fname=input_file_name_str,
class_index=CLASS_INDEX,
feature_start_index=FEATURE_INDEX,
random_n_lines=INPUT_LINES_COUNT,
validation_part=VALIDATION_PART
)
print("training data size : ", len(train_x))
print("validation data size : ", len(validate_x))
# </editor-fold>
### STEPS_PER_EPOCH = len(train_x) // BATCH_SIZE
### VALIDATION_STEPS = len(validate_x) // BATCH_SIZE
# <editor-fold desc="(model creation)">
# load previously saved NN model
model = None
try:
model = keras.models.load_model(model_file_path)
print("Loading NN model from file.")
model.summary()
except Exception as ex:
print("No NN model found for loading.")
# END of try-except
# </editor-fold>
# <editor-fold desc="(model run)">
# # if there is no model loaded, create a new model
if model is None:
csv_logger = keras.callbacks.CSVLogger(training_progress_file_path_str)
checkpoint = ModelCheckpoint(
model_file_path,
monitor='loss',
verbose=1,
save_best_only=True,
mode='auto',
save_freq='epoch'
)
callbacks_vector = [
csv_logger,
checkpoint
]
# Set mirror strategy
#strategy = tf.distribute.MirroredStrategy(devices=["/device:GPU:0","/device:GPU:1"])
#with strategy.scope():
print("New NN model created.")
# create sequential NN model
model = create_model(
n_hidden_1=HIDDEN_LAYER_1_NEURON_COUNT,
n_hidden_2=HIDDEN_LAYER_2_NEURON_COUNT,
##n_hidden_3=HIDDEN_LAYER_3_NEURON_COUNT,
num_classes=CLASS_COUNT,
num_features=FEATURES_COUNT
)
# Train the model with the new callback
history = model.fit(
train_x, train_y,
validation_data=(validate_x, validate_y),
batch_size=BATCH_SIZE,
epochs=EPOCH_SIZE,
callbacks=[callbacks_vector],
shuffle=True,
verbose=2
)
print(history.history.keys())
# END of ... with
# END of ... if
# </editor-fold>
Plotting Script
import os
from argparse import ArgumentParser
import random
from typing import List
import matplotlib.pyplot as plt
import numpy as np
import math
import sys
import datetime
class Quad:
def __init__(self, x_vector, y_vector, color_char, label_str):
self.__x_vector = x_vector
self.__y_vector = y_vector
self.__color_char = color_char
self.__label_str = label_str
def get_x_vector(self):
return self.__x_vector
def get_y_vector(self):
return self.__y_vector
def get_color_char(self):
return self.__color_char
def get_label_str(self):
return self.__label_str
class HecaPlotClass:
def __init__(self):
self.__x_label_str: str = None
self.__y_label_str: str = None
self.__title_str: str = None
self.__trio_vector: List[Quad] = []
self.__plotter = plt
#property
def x_label_str(self):
return self.__x_label_str
#x_label_str.setter
def x_label_str(self, t):
self.__x_label_str = t
#property
def y_label_str(self):
return self.__y_label_str
#y_label_str.setter
def y_label_str(self, t):
self.__y_label_str = t
#property
def title_str(self):
return self.__title_str
#title_str.setter
def title_str(self, t):
self.__title_str = t
def add_y_axes(self, trio_obj: Quad):
self.__trio_vector.append(trio_obj)
def generate_plot(self):
for obj in self.__trio_vector:
x_vector = obj.get_x_vector()
y_vector = obj.get_y_vector()
label_str = obj.get_label_str()
# print(label_str)
# print(len(x_vector))
# print(len(y_vector))
self.__plotter.plot(
x_vector,
y_vector,
color=obj.get_color_char(),
label=label_str
)
# END of ... for loop
# Naming the x-axis, y_1_vector-axis and the whole graph
self.__plotter.xlabel(self.__x_label_str)
self.__plotter.ylabel(self.__y_label_str)
self.__plotter.title(self.__title_str)
# Adding legend, which helps us recognize the curve according to it's color
self.__plotter.legend()
# To load the display window
#self.__plotter.show()
def save_png(self, output_directory_str):
output_file_str = os.path.join(output_directory_str, self.__title_str + '.png')
self.__plotter.savefig(output_file_str)
def save_pdf(self, output_directory_str):
output_file_str = os.path.join(output_directory_str, self.__title_str + '.pdf')
self.__plotter.savefig(output_file_str)
class MainClass(object):
__colors_vector = ['red', 'green', 'blue', 'cyan', 'magenta', 'yellow', 'orange', 'lightgreen', 'crimson']
__working_dir = r"."
__file_names_vector = ["training_progress-32.txt", "training_progress-64.txt", "training_progress-128.txt"]
__input_files_vector = []
__output_directory = None
__column_no_int = 0
__split_percentage_at_tail_int = 100
__is_pdf_output = False
__is_png_output = False
# <editor-fold desc="def load_data()">
#classmethod
def __load_data(cls, fname: str, percetage_int:int, column_no_int:int):
np_array = np.loadtxt(
fname,
# usecols=range(1,11),
dtype=np.float32,
skiprows=1,
delimiter=","
)
size_vector = np_array.shape
array_len_int = size_vector[0]
rows_count_int = int(percetage_int * array_len_int / 100)
np_array = np_array[-rows_count_int:]
x = np_array[:, 0]
y = np_array[:, column_no_int]
return x, y
# END of the function
# </editor-fold>
# <editor-fold desc="(__parse_args())">
#classmethod
def __parse_args(cls):
# initialize argument parser
my_parser = ArgumentParser()
my_parser.add_argument("-c", help="column no.", type=int)
my_parser.add_argument('-i', nargs='+', help='a list of input files', required=True)
my_parser.add_argument("-o", help="output directory", type=str)
my_parser.add_argument("-n", help="percentage of data to split from tail", type=float)
my_parser.add_argument("--pdf", help="PDF output", action='store_true')
my_parser.add_argument("--png", help="PNG output", action='store_true')
# parse the argument
args = my_parser.parse_args()
cls.__input_files_vector = args.i
cls.__output_directory = args.o
cls.__split_percentage_at_tail_int = args.n
cls.__column_no_int = args.c
cls.__is_pdf_output = args.pdf
cls.__is_png_output = args.png
# </editor-fold>
#classmethod
def main(cls):
cls.__parse_args()
if cls.__input_files_vector is None:
cls.__input_files_vector = cls.__file_names_vector
if cls.__output_directory is None:
cls.__output_directory = cls.__working_dir
if cls.__split_percentage_at_tail_int is None:
cls.__split_percentage_at_tail_int = 100
if cls.__column_no_int is None:
cls.__column_no_int = 1
my_project_plot_obj = HecaPlotClass()
i = 0
for file_path_str in cls.__input_files_vector:
print(file_path_str)
x_vector, y_vector = cls.__load_data(os.path.join(cls.__working_dir, file_path_str), cls.__split_percentage_at_tail_int, cls.__column_no_int)
my_project_plot_obj.x_label_str = "Epoch"
my_project_plot_obj.y_label_str = "Accuracy"
my_project_plot_obj.title_str = "training_plot-{date:%Y-%m-%d_%H:%M:%S}".format(date=datetime.datetime.now())
my_project_plot_obj.x_axis_vector = x_vector
if i == 0:
random_int = 0
else:
random_int = i % (len(cls.__colors_vector)-1)
# END of ... if
print("random_int : ", random_int)
my_project_plot_obj.add_y_axes(Quad(x_vector, y_vector, cls.__colors_vector[random_int], file_path_str))
i = i + 1
# END of ... for loop
my_project_plot_obj.generate_plot()
my_project_plot_obj.save_png(cls.__output_directory)
my_project_plot_obj.save_pdf(cls.__output_directory)
if __name__ == "__main__":
MainClass.main()

The primary reason could be improper (non-random ~ ordered) distribution of data.
If you notice the accuracy beyond epoch 180, there is a orderly switching between the accuracy between ~0.43 (approx.) and ~0.33 (~approx.), and occasionally ~0.23 (approx.). The more important thing to notice is that the accuracy is decreasing (there's no improvement in validation accuracy) as we increase the epochs.
The accuracy can increase in such cases if you (1) reduce batch size, or (2) use a better optimizer like Adam. And check the learning rate.
These changes can help the shift and oscillation, as well.
Additionally, Running average of the accuracy can be plotted to avoid the oscillation. This is again a mitigation scheme rather than a correction scheme. But, what it does is removes the order (partition of the data) and mixes the nearby data.
Lastly, I would also reshuffle the data and normalize after each layer. See if that helps.

Generally, sharp jumps and flat lines in the accuracy usually mean that a group of examples is classified as a given class at a same time. If your dataset contains, say, 50 examples with the same combination of 7 features then they would go into the same class at the same time. This is what probably causes sharp jumps - identical or similar examples clustered together.
So for example, if you have 50 men aged 64, and a decision boundary to classify them as more prone to an illness shifts from >65 to >63, then accuracy changes rapidly as all of them change classification at the same time.
Regarding the oscillation of the curve - due to the fact above, oscillation will be amplified by small changes in learning. Your network learns based on cross entropy, which means that it minimizes the difference between target and your predictions. This means that it operates on the difference between probability and target (say, 0.3 vs class 0) instead of class and target like accuracy (so, 0 vs 0) in the same example. Cross entropy is much more smooth as it is not affected by the issue outlined above.

RuntimeError: The size of tensor a (549) must match the size of tensor b (512) at non-singleton dimension 1

GLUE_TASKS = ["cola", "mnli", "mnli-mm", "mrpc", "qnli", "qqp", "rte", "sst2", "stsb", "wnli"]
#useful in preprocessing, this sets what each task does.
task_to_keys = {
"cola": ("sentence", None),
"mnli": ("premise", "hypothesis"),
"mnli-mm": ("premise", "hypothesis"),
"mrpc": ("sentence1", "sentence2"),
"qnli": ("question", "sentence"),
"qqp": ("question1", "question2"),
"rte": ("sentence1", "sentence2"),
"sst2": ("sentence", None),
"stsb": ("sentence1", "sentence2"),
"wnli": ("sentence1", "sentence2"),
}
def set_module_grad_status(module, flag=False):
if isinstance(module, list):
for m in module:
set_module_grad_status(m, flag)
else:
for p in module.parameters():
p.requires_grad = flag
def compute_metrics(eval_pred):
predictions, labels = eval_pred
if task != "stsb":
predictions = np.argmax(predictions, axis=1)
else:
predictions = predictions[:, 0]
return metric.compute(predictions=predictions, references=labels)
#setting some model parameters
task = "qnli"
sentence1_key, sentence2_key = task_to_keys[task]
# model_checkpoint = "test-glue/basemodel"
num_labels = 3 if task.startswith("mnli") else 1 if task=="stsb" else 2
metric_name = "pearson" if task == "stsb" else "matthews_correlation" if task == "cola" else "accuracy"
validation_key = "validation_mismatched" if task == "mnli-mm" else "validation_matched" if task == "mnli" else "validation"
model_checkpoint = "google/mobilebert-uncased"
tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
model = AutoModelForSequenceClassification.from_pretrained(model_checkpoint, num_labels=num_labels)
model = model.to(device)
args = TrainingArguments(
"test-glue",
evaluation_strategy = "epoch",
learning_rate=2e-5,
per_device_train_batch_size=batch_size,
per_device_eval_batch_size=batch_size,
num_train_epochs=epochs,
weight_decay=0.01,
save_steps=0
# load_best_model_at_end=True,
# metric_for_best_model=metric_name,
)
#load dataset
actual_task = "mnli" if task == "mnli-mm" else task #some error checking..
dataset = load_dataset("glue", actual_task)
metric = load_metric('glue', actual_task)
# tokenizer = AutoTokenizer.from_pretrained(model_checkpoint, use_fast=True)
def preprocess_function(examples):
if sentence2_key is None:
return tokenizer(examples[sentence1_key], truncation=True)
return tokenizer(examples[sentence1_key], examples[sentence2_key], truncation=True)
encoded_dataset = dataset.map(preprocess_function, batched=True)
trainer = Trainer(
model,
args,
train_dataset=encoded_dataset["train"],
eval_dataset=encoded_dataset[validation_key],
tokenizer=tokenizer,
compute_metrics=compute_metrics
)
trainer.train()
I not quite sure why I am getting the error "RuntimeError: The size of tensor a (549) must match the size of tensor b (512) at non-singleton dimension 1" at the like "trainer.train()". Like isn't the each sample supposed to be consistently size of 512. This only happens with MobileBERT model and not on DistilBERT model. The code that I am using is mostly based on the huggingface colab tutorial https://colab.research.google.com/github/huggingface/notebooks/blob/master/examples/text_classification.ipynb#scrollTo=7k8ge1L1IrJk

Keras Double DQN average reward decreases over time and is unable to converge

I am attempting to teach a Double DQN agent to run a gridworld where there is one seeker (the agent) who will try to collect all the hiders which are randomly spawned. Every step has a path_cost of -0.1 and if a hider is collected a reward of 1 is received. The DQN net receives an array with the shape (world_width,world_height,1) as the state which is a complete translation of the environment viewed from above where empty space is described as 0, seeker as 2, and hider as 3. The agent is then supposed to choose one action, either left, up, right, or down. An example configuration of the environment is shown in the image below.
gridworld
However, when training my agent the reward initially decreases in correlation to the decreasing exploration and therefore it can be assumed that when the agent follows the DQN net it will perform worse than when choosing actions randomly. Here are a few examples of the reward graphs I have received when training with different hyperparameters (y-axis is total steps where each episode is 100 steps unless it finishes).
Reward Graph
As seen the agent becomes worse at solving the environment and it is approximately when epsilon becomes equal to my min_epsilon the curve stabilizes (meaning almost no exploration or random moves).
I have tried different hyperparameters but without any apparent differences in results and would there appreciate it if someone could give me a pointer to where the problem might be.
The hyperparameters I have been mostly using is:
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000
And here is my code:
import tensorflow as tf
from tensorflow.keras.layers import Input, Dense, Conv2D, MaxPooling2D, Flatten
from tensorflow.keras.optimizers import Adam
from collections import deque
from termcolor import colored
import wandb
from wandb.keras import WandbCallback
import numpy as np
import copy, os, random
from argparse import ArgumentParser
from plotter import plotter
from HNS import HNS
tf.keras.backend.set_floatx('float64')
wandb.init(name=name, project=project)
wandb.env.name = "HNS"
wandb.env.world_size = (8, 8)
wandb.env.state_dim = (8, 8, 1)
wandb.env.hider_count = 2
wandb.env.action_dim = 4
wandb.env.random_spawn = True
wandb.env.max_steps = 100
wandb.config.node = node
wandb.config.epsilon = 1.0
wandb.config.epsilon_decay = 0.99
wandb.config.batch_size = 32
wandb.config.learning_rate = 1e-3
wandb.config.gamma = 0.8
wandb.config.min_epsilon = 1e-1
wandb.config.buffersize = 10000
wandb.config.epochs = 1
wandb.config.reward_discount = 0.01
wandb.config.episodes = 1000
wandb.config.conv1_kernel = (8,8)
wandb.config.conv1_filters = 16
wandb.config.conv1_strides = 4
wandb.config.conv1_activation = "relu"
wandb.config.conv1_padding = "same"
wandb.config.conv2_kernel = (4,4)
wandb.config.conv2_filters = 32
wandb.config.conv2_strides = 4
wandb.config.conv2_activation = "relu"
wandb.config.conv2_padding = "same"
wandb.config.dense1_neurons = 16
wandb.config.dense1_activation = "relu"
wandb.config.loss = "mse"
parser = ArgumentParser()
parser.add_argument('--hider_count', type=int, default=wandb.env.hider_count)
parser.add_argument('--max_steps', type=int, default=wandb.env.max_steps)
parser.add_argument('--epsilon_decay', type=float, default=wandb.config.epsilon_decay)
parser.add_argument('--min_epsilon', type=float, default=wandb.config.min_epsilon)
parser.add_argument('--learning_rate', type=float, default=wandb.config.learning_rate)
parser.add_argument('--gamma', type=float, default=wandb.config.gamma)
parser.add_argument('--reward_discount', type=float, default=wandb.config.reward_discount)
parser.add_argument('--episodes', type=int, default=wandb.config.episodes)
parser.add_argument('--batch_size', type=int, default=wandb.config.batch_size)
args, unknown = parser.parse_known_args()
wandb.config.update(args, allow_val_change=True)
class ReplayBuffer:
def __init__(self):
self.buffer = deque(maxlen=wandb.config.buffersize)
def put(self, state, action, reward, next_state, done):
self.buffer.append([state, action, reward, next_state, done])
def sample(self):
sample = random.sample(self.buffer, wandb.config.batch_size)
states, actions, rewards, next_states, done = map(np.asarray, zip(*sample))
return states, actions, rewards, next_states, done
def size(self):
return len(self.buffer)
class ActionStatemodel:
def __init__(self):
self.epsilon = wandb.config.epsilon
self.model = self.create_model()
def create_model(self):
# Init model
model = tf.keras.Sequential()
# Set up layers
model.add(Conv2D(filters=wandb.config.conv1_filters, kernel_size=wandb.config.conv1_kernel, activation=wandb.config.conv1_activation,
strides=wandb.config.conv1_strides, padding=wandb.config.conv1_padding, name="conv_1", input_shape=wandb.env.state_dim))
model.add(Conv2D(filters=wandb.config.conv2_filters, kernel_size=wandb.config.conv2_kernel, activation=wandb.config.conv2_activation,
strides=wandb.config.conv2_strides, padding=wandb.config.conv2_padding, name="conv_2"))
model.add(Flatten())
model.add(Dense(units=wandb.config.dense1_neurons, activation=wandb.config.dense1_activation, name="dense_1"))
model.add(Dense(wandb.env.action_dim, name="dense_2"))
# Finalize model
model.compile(loss=wandb.config.loss, optimizer=Adam(wandb.config.learning_rate))
model.summary()
return model
# Get q-values from state
def predict(self, state):
return self.model.predict(state)
# Get action from
def get_action(self, state):
# Predict action
state = np.expand_dims(state, axis=0)
q_value = self.predict(state)
if np.random.random() < self.epsilon: return random.randint(0, wandb.env.action_dim - 1), 1
else: return np.argmax(q_value), 0
def train(self, states, targets):
history = self.model.fit(states, targets, epochs=wandb.config.epochs, callbacks=[WandbCallback()], verbose=2, use_multiprocessing=True)
return history.history["loss"][0]
class Agent:
def __init__(self, env):
self.env = env
self.predict_net = ActionStatemodel()
self.target_net = ActionStatemodel()
self.target_update()
self.buffer = ReplayBuffer()
# Copy weights from model to target_model
def target_update(self):
weights = self.predict_net.model.get_weights()
self.target_net.model.set_weights(weights)
def replay(self):
loss = 0
for _ in range(5):
states, actions, rewards, next_states, done = self.buffer.sample()
# Collect predicted actions from predict_net
predicted_q_values = self.predict_net.predict(next_states)
predicted_actions = np.argmax(predicted_q_values, axis=1)
# Get q values from target_net of above predicted actions
target_q_values = self.target_net.predict(next_states)
target_action_q_values = [np.take(target_q_values[i], predicted_actions[i]) for i in range(len(target_q_values))]
# Create targets based on q values, reward and done
targets = predicted_q_values.copy()
targets[range(wandb.config.batch_size), actions] = rewards + (1 - done) * target_action_q_values * args.gamma
loss += self.predict_net.train(states, targets)
return loss
def train(self):
# Save weights for heatmap rendering
# Main training loop
for ep in range(wandb.config.episodes):
# Initialization
done, total_reward, step, loss, exploration = False, 0, 0, 0, 0
state = self.env.reset()
while not done and step < wandb.env.max_steps:
# Predict and perform action
action, e = self.predict_net.get_action(state)
exploration += e
next_state, reward, done, _ = self.env.step(action)
self.buffer.put(state, action, reward * wandb.config.reward_discount, next_state, done)
total_reward += reward
if self.buffer.size() >= 1000 and step % 10 == 0:
loss = self.replay()
state = next_state
step += 1
self.target_update()
# Update epsilon
self.predict_net.epsilon = max(wandb.config.epsilon_decay * self.predict_net.epsilon, wandb.config.min_epsilon)
# Calculate weights change and log weights
pre_weights = self.get_weights(self.predict_net.model.layers)
tar_weights = self.get_weights(self.target_net.model.layers)
# LOG
print(colored("EP" + str(ep) + "-Reward: " + str(total_reward) + " Done: " + str(done), "green"))
wandb.log({"episode" : ep,
"buffersize" : self.buffer.size(),
"EpReward" : total_reward,
"epsilon" : self.predict_net.epsilon,
"done" : int(done),
"Exploration" : exploration / _,
"loss" : loss,
"pre_weights" : pre_weights,
"tar_weights" : tar_weights
})
# "weigthUpdate" : wandb.Image(neuron_map),
# Get weights and names for every layer of nn model
def get_weights(self, layers):
weigths = []
names = []
for layer in layers:
wb = layer.get_weights()
if wb:
weigths.append(wb[0].flatten())
names.append(layer.name)
return weigths, names
if __name__ == "__main__":
env = HNS(random_spawn=wandb.env.random_spawn, world_size=wandb.env.world_size, hider_count=wandb.env.hider_count)
agent = Agent(env=env)
agent.train()
agent.target_net.model.save(os.path.join(wandb.run.dir, "model.h5"))

How do you use a TensorFlow model to classify a sound using this code

I have been following this tutorial for creating a TensorFlow Audio Recognition model.
I have completed training of my model (which is based upon a smaller set of simple data from the content in the example).
Now I have built the model, what is the lines of code I need in order to actually classify against the created model?
Please see the following code which has built my model which is a slightly altered version of the article's code:
train.py:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import numpy as np
from six.moves import xrange # pylint: disable=redefined-builtin
import tensorflow as tf
import input_data
import models
from tensorflow.python.platform import gfile
FLAGS = None
def main(_):
# Set the verbosity based on flags (default is INFO, so we see all messages)
tf.compat.v1.logging.set_verbosity(FLAGS.verbosity)
# Start a new TensorFlow session.
sess = tf.compat.v1.InteractiveSession()
# Begin by making sure we have the training data we need. If you already have
# training data of your own, use `--data_url= ` on the command line to avoid
# downloading.c
model_settings = models.prepare_model_settings(
len(input_data.prepare_words_list(FLAGS.wanted_words.split(','))),
FLAGS.sample_rate, FLAGS.clip_duration_ms, FLAGS.window_size_ms,
FLAGS.window_stride_ms, FLAGS.feature_bin_count, FLAGS.preprocess)
audio_processor = input_data.AudioProcessor(
FLAGS.data_url, FLAGS.data_dir,
FLAGS.silence_percentage, FLAGS.unknown_percentage,
FLAGS.wanted_words.split(','), FLAGS.validation_percentage,
FLAGS.testing_percentage, model_settings, FLAGS.summaries_dir)
fingerprint_size = model_settings['fingerprint_size']
label_count = model_settings['label_count']
time_shift_samples = int((FLAGS.time_shift_ms * FLAGS.sample_rate) / 1000)
# Figure out the learning rates for each training phase. Since it's often
# effective to have high learning rates at the start of training, followed by
# lower levels towards the end, the number of steps and learning rates can be
# specified as comma-separated lists to define the rate at each stage. For
# example --how_many_training_steps=10000,3000 --learning_rate=0.001,0.0001
# will run 13,000 training loops in total, with a rate of 0.001 for the first
# 10,000, and 0.0001 for the final 3,000.
training_steps_list = list(map(int, FLAGS.how_many_training_steps.split(',')))
learning_rates_list = list(map(float, FLAGS.learning_rate.split(',')))
if len(training_steps_list) != len(learning_rates_list):
raise Exception(
'--how_many_training_steps and --learning_rate must be equal length '
'lists, but are %d and %d long instead' % (len(training_steps_list),
len(learning_rates_list)))
input_placeholder = tf.compat.v1.placeholder(
tf.float32, [None, fingerprint_size], name='fingerprint_input')
if FLAGS.quantize:
fingerprint_min, fingerprint_max = input_data.get_features_range(
model_settings)
fingerprint_input = tf.quantization.fake_quant_with_min_max_args(
input_placeholder, fingerprint_min, fingerprint_max)
else:
fingerprint_input = input_placeholder
logits, dropout_prob = models.create_model(
fingerprint_input,
model_settings,
FLAGS.model_architecture,
is_training=True)
# Define loss and optimizer
ground_truth_input = tf.compat.v1.placeholder(
tf.int64, [None], name='groundtruth_input')
# Optionally we can add runtime checks to spot when NaNs or other symptoms of
# numerical errors start occurring during training.
control_dependencies = []
if FLAGS.check_nans:
checks = tf.compat.v1.add_check_numerics_ops()
control_dependencies = [checks]
# Create the back propagation and training evaluation machinery in the graph.
with tf.compat.v1.name_scope('cross_entropy'):
cross_entropy_mean = tf.compat.v1.losses.sparse_softmax_cross_entropy(
labels=ground_truth_input, logits=logits)
if FLAGS.quantize:
tf.contrib.quantize.create_training_graph(quant_delay=0)
with tf.compat.v1.name_scope('train'), tf.control_dependencies(
control_dependencies):
learning_rate_input = tf.compat.v1.placeholder(
tf.float32, [], name='learning_rate_input')
train_step = tf.compat.v1.train.GradientDescentOptimizer(
learning_rate_input).minimize(cross_entropy_mean)
predicted_indices = tf.argmax(input=logits, axis=1)
correct_prediction = tf.equal(predicted_indices, ground_truth_input)
confusion_matrix = tf.math.confusion_matrix(labels=ground_truth_input,
predictions=predicted_indices,
num_classes=label_count)
evaluation_step = tf.reduce_mean(input_tensor=tf.cast(correct_prediction,
tf.float32))
with tf.compat.v1.get_default_graph().name_scope('eval'):
tf.compat.v1.summary.scalar('cross_entropy', cross_entropy_mean)
tf.compat.v1.summary.scalar('accuracy', evaluation_step)
global_step = tf.compat.v1.train.get_or_create_global_step()
increment_global_step = tf.compat.v1.assign(global_step, global_step + 1)
saver = tf.compat.v1.train.Saver(tf.compat.v1.global_variables())
# Merge all the summaries and write them out to /tmp/retrain_logs (by default)
merged_summaries = tf.compat.v1.summary.merge_all(scope='eval')
train_writer = tf.compat.v1.summary.FileWriter(FLAGS.summaries_dir + '/train',
sess.graph)
validation_writer = tf.compat.v1.summary.FileWriter(
FLAGS.summaries_dir + '/validation')
tf.compat.v1.global_variables_initializer().run()
start_step = 1
if FLAGS.start_checkpoint:
models.load_variables_from_checkpoint(sess, FLAGS.start_checkpoint)
start_step = global_step.eval(session=sess)
tf.compat.v1.logging.info('Training from step: %d ', start_step)
# Save graph.pbtxt.
tf.io.write_graph(sess.graph_def, FLAGS.train_dir,
FLAGS.model_architecture + '.pbtxt')
# Save list of words.
with gfile.GFile(
os.path.join(FLAGS.train_dir, FLAGS.model_architecture + '_labels.txt'),
'w') as f:
f.write('\n'.join(audio_processor.words_list))
# Training loop.
training_steps_max = np.sum(training_steps_list)
for training_step in xrange(start_step, training_steps_max + 1):
# Figure out what the current learning rate is.
training_steps_sum = 0
for i in range(len(training_steps_list)):
training_steps_sum += training_steps_list[i]
if training_step <= training_steps_sum:
learning_rate_value = learning_rates_list[i]
break
# Pull the audio samples we'll use for training.
train_fingerprints, train_ground_truth = audio_processor.get_data(
FLAGS.batch_size, 0, model_settings, FLAGS.background_frequency,
FLAGS.background_volume, time_shift_samples, 'training', sess)
# Run the graph with this batch of training data.
train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
[
merged_summaries,
evaluation_step,
cross_entropy_mean,
train_step,
increment_global_step,
],
feed_dict={
fingerprint_input: train_fingerprints,
ground_truth_input: train_ground_truth,
learning_rate_input: learning_rate_value,
dropout_prob: 0.5
})
train_writer.add_summary(train_summary, training_step)
tf.compat.v1.logging.info(
'Step #%d: rate %f, accuracy %.1f%%, cross entropy %f' %
(training_step, learning_rate_value, train_accuracy * 100,
cross_entropy_value))
is_last_step = (training_step == training_steps_max)
if (training_step % FLAGS.eval_step_interval) == 0 or is_last_step:
set_size = audio_processor.set_size('validation')
total_accuracy = 0
total_conf_matrix = None
for i in xrange(0, set_size, FLAGS.batch_size):
validation_fingerprints, validation_ground_truth = (
audio_processor.get_data(FLAGS.batch_size, i, model_settings, 0.0,
0.0, 0, 'validation', sess))
# Run a validation step and capture training summaries for TensorBoard
# with the `merged` op.
validation_summary, validation_accuracy, conf_matrix = sess.run(
[merged_summaries, evaluation_step, confusion_matrix],
feed_dict={
fingerprint_input: validation_fingerprints,
ground_truth_input: validation_ground_truth,
dropout_prob: 1.0
})
validation_writer.add_summary(validation_summary, training_step)
batch_size = min(FLAGS.batch_size, set_size - i)
total_accuracy += (validation_accuracy * batch_size) / set_size
if total_conf_matrix is None:
total_conf_matrix = conf_matrix
else:
total_conf_matrix += conf_matrix
tf.compat.v1.logging.info('Confusion Matrix:\n %s' % (total_conf_matrix))
tf.compat.v1.logging.info('Step %d: Validation accuracy = %.1f%% (N=%d)' %
(training_step, total_accuracy * 100, set_size))
# Save the model checkpoint periodically.
if (training_step % FLAGS.save_step_interval == 0 or
training_step == training_steps_max):
checkpoint_path = os.path.join(FLAGS.train_dir,
FLAGS.model_architecture + '.ckpt')
tf.compat.v1.logging.info('Saving to "%s-%d"', checkpoint_path,
training_step)
saver.save(sess, checkpoint_path, global_step=training_step)
set_size = audio_processor.set_size('testing')
tf.compat.v1.logging.info('set_size=%d', set_size)
total_accuracy = 0
total_conf_matrix = None
for i in xrange(0, set_size, FLAGS.batch_size):
test_fingerprints, test_ground_truth = audio_processor.get_data(
FLAGS.batch_size, i, model_settings, 0.0, 0.0, 0, 'testing', sess)
test_accuracy, conf_matrix = sess.run(
[evaluation_step, confusion_matrix],
feed_dict={
fingerprint_input: test_fingerprints,
ground_truth_input: test_ground_truth,
dropout_prob: 1.0
})
batch_size = min(FLAGS.batch_size, set_size - i)
total_accuracy += (test_accuracy * batch_size) / set_size
if total_conf_matrix is None:
total_conf_matrix = conf_matrix
else:
total_conf_matrix += conf_matrix
tf.compat.v1.logging.warn('Confusion Matrix:\n %s' % (total_conf_matrix))
tf.compat.v1.logging.warn('Final test accuracy = %.1f%% (N=%d)' %
(total_accuracy * 100, set_size))
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--data_dir',
type=str,
default='Audio/training_data',
help="""\
Where to download the speech training data to.
""")
parser.add_argument(
'--background_volume',
type=float,
default=0.1,
help="""\
How loud the background noise should be, between 0 and 1.
""")
parser.add_argument(
'--background_frequency',
type=float,
default=0.0,
help="""\
How many of the training samples have background noise mixed in.
""")
parser.add_argument(
'--silence_percentage',
type=float,
default=10.0,
help="""\
How much of the training data should be silence.
""")
parser.add_argument(
'--unknown_percentage',
type=float,
default=10.0,
help="""\
How much of the training data should be unknown words.
""")
parser.add_argument(
'--time_shift_ms',
type=float,
default=100.0,
help="""\
Range to randomly shift the training audio by in time.
""")
parser.add_argument(
'--testing_percentage',
type=int,
default=10,
help='What percentage of wavs to use as a test set.')
parser.add_argument(
'--validation_percentage',
type=int,
default=10,
help='What percentage of wavs to use as a validation set.')
parser.add_argument(
'--sample_rate',
type=int,
default=16000,
help='Expected sample rate of the wavs',)
parser.add_argument(
'--clip_duration_ms',
type=int,
default=1000,
help='Expected duration in milliseconds of the wavs',)
parser.add_argument(
'--window_size_ms',
type=float,
default=30.0,
help='How long each spectrogram timeslice is.',)
parser.add_argument(
'--window_stride_ms',
type=float,
default=10.0,
help='How far to move in time between spectogram timeslices.',)
parser.add_argument(
'--feature_bin_count',
type=int,
default=40,
help='How many bins to use for the MFCC fingerprint',
)
parser.add_argument(
'--how_many_training_steps',
type=str,
default='200,50',
help='How many training loops to run',)
parser.add_argument(
'--eval_step_interval',
type=int,
default=50,
help='How often to evaluate the training results.')
parser.add_argument(
'--learning_rate',
type=str,
default='0.001,0.0001',
help='How large a learning rate to use when training.')
parser.add_argument(
'--batch_size',
type=int,
default=10,
help='How many items to train with at once',)
parser.add_argument(
'--summaries_dir',
type=str,
default='logs/retrain_logs',
help='Where to save summary logs for TensorBoard.')
parser.add_argument(
'--wanted_words',
type=str,
default='splash,footsteps,enemy',
help='Words to use (others will be added to an unknown label)',)
parser.add_argument(
'--train_dir',
type=str,
default='logs/commands_train',
help='Directory to write event logs and checkpoint.')
parser.add_argument(
'--save_step_interval',
type=int,
default=10,
help='Save model checkpoint every save_steps.')
parser.add_argument(
'--start_checkpoint',
type=str,
default='',
help='If specified, restore this pretrained model before any training.')
parser.add_argument(
'--model_architecture',
type=str,
default='conv',
help='What model architecture to use')
parser.add_argument(
'--check_nans',
type=bool,
default=False,
help='Whether to check for invalid numbers during processing')
parser.add_argument(
'--quantize',
type=bool,
default=False,
help='Whether to train the model for eight-bit deployment')
parser.add_argument(
'--preprocess',
type=str,
default='mfcc',
help='Spectrogram processing mode. Can be "mfcc", "average", or "micro"')
parser.add_argument(
'--data_url',
type=str,
default='Audio/training_data',
help='Directory where training data resides')
# Function used to parse --verbosity argument
def verbosity_arg(value):
"""Parses verbosity argument.
Args:
value: A member of tf.logging.
Raises:
ArgumentTypeError: Not an expected value.
"""
value = value.upper()
if value == 'INFO':
return tf.compat.v1.logging.INFO
elif value == 'DEBUG':
return tf.compat.v1.logging.DEBUG
elif value == 'ERROR':
return tf.compat.v1.logging.ERROR
elif value == 'FATAL':
return tf.compat.v1.logging.FATAL
elif value == 'WARN':
return tf.compat.v1.logging.WARN
else:
raise argparse.ArgumentTypeError('Not an expected value')
parser.add_argument(
'--verbosity',
type=verbosity_arg,
default=tf.compat.v1.logging.INFO,
help='Log verbosity. Can be "INFO", "DEBUG", "ERROR", "FATAL", or "WARN"')
FLAGS, unparsed = parser.parse_known_args()
tf.compat.v1.app.run(main=main, argv=[sys.argv[0]] + unparsed)
My question is, how do I actually classify a wav file against this model?

it's a simple argmax classfy you just need to get the softmax predict output by add it to sess run params
pred , train_summary, train_accuracy, cross_entropy_value, _, _ = sess.run(
[
predicted_indices ,
merged_summaries,
evaluation_step,
cross_entropy_mean,
train_step,
increment_global_step,
],
then compare the pred array with train_ground_truth which should be the dict or array container audio label.

Tensorflow feed forward network session doesn't stop

I am trying to build a simple feed forward neural network using TensorFlow and its tfr formatting. I have been using TensorFlow's tutorials and examples as a reference:
https://github.com/tensorflow/tensorflow/tree/master/tensorflow/examples/how_tos/reading_data
Given "food" float values, I want to predict the "happiness" float value that it produces.
food_test.json is a a JSON file that contains the "food" value and its associated "happiness" value. This is the format the data is stored in.
food_to_record.py is based off of tensorflow's convert_to_records.py. It reads in the food_test.json and converts it to a food_record.tfr file.
food_reader.py is based off of tensorflow's fully_connected_reader.py. It reads in the food_record.tfr file and runs the data through a neural network.
I run the program in this order:
1. food_to_record.py
2. food_reader.py
When food_reader.py is run, it starts a TensorFlow session, but the session never terminates, does anyone know the reason for this?
food_test.json:
[
{
"food": 1.0,
"happiness": 2.0
},
{
"food": 1.4,
"happiness": 5.4
}
]
food_to_record.py:
#based off of tensorflow's convert_to_records.py
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os
import sys
import json
import tensorflow as tf
FLAGS = None
#feature for integers
def _int64_feature(value):
return tf.train.Feature(int64_list=tf.train.Int64List(value=[value]))
#feature for floats
def _float_feature(value):
return tf.train.Feature(float_list = tf.train.FloatList(value= [value]))
#feature for strings and others
def _bytes_feature(value):
return tf.train.Feature(bytes_list=tf.train.BytesList(value=[value]))
def main(unused_argv):
print("food_to_record:main")
script_dir = os.path.dirname(__file__)
file_path = os.path.join(script_dir, 'food_test.json')
with open(file_path) as data_file:
data = json.load(data_file)
print(data)
num_examples = 2
name = 'food_record'
filename = os.path.join(FLAGS.directory, name + '.tfrecords')
print('Writing', filename)
writer = tf.python_io.TFRecordWriter(filename)
for index in range(num_examples):
example = tf.train.Example(features=tf.train.Features(feature={
'food': _float_feature(data[index]['food']),
'happiness': _float_feature(data[index]['happiness'])
}))
writer.write(example.SerializeToString())
writer.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--directory',
type=str,
default='.',
help='Directory to download data files and write the converted result'
)
parser.add_argument(
'--validation_size',
type=int,
default=5000,
help="""\
Number of examples to separate from the training data for the validation
set.\
"""
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
food_reader.py:
#based off of tensorflow's fully_connected_reader
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import os.path
import sys
import time
import tensorflow as tf
# Basic model parameters as external flags.
FLAGS = None
# Constants used for dealing with the files
TRAIN_FILE = 'food_record.tfrecords'
# For simple testing purposes, use training file for validation
VALIDATION_FILE = 'food_record.tfrecords'
def read_and_decode(filename_queue):
reader = tf.TFRecordReader()
_, serialized_example = reader.read(filename_queue)
features = tf.parse_single_example(
serialized_example,
# Defaults are not specified since both keys are required.
features={
'food': tf.FixedLenFeature([], tf.float32),
'happiness': tf.FixedLenFeature([], tf.float32)
})
food = tf.cast(features['food'], tf.float32)
happiness = tf.cast(features['happiness'], tf.float32)
food = tf.expand_dims(food, -1)
print("food shape: ", tf.shape(food))
print("happiness shape: ", tf.shape(happiness))
return food, happiness
def inputs(train, batch_size, num_epochs):
"""Reads input data num_epochs times.
Args:
train: Selects between the training (True) and validation (False) data.
batch_size: Number of examples per returned batch.
num_epochs: Number of times to read the input data, or 0/None to
train forever.
Returns:
A tuple (images, labels), where:
* images is a float tensor with shape [batch_size, mnist.IMAGE_PIXELS]
in the range [-0.5, 0.5].
* labels is an int32 tensor with shape [batch_size] with the true label,
a number in the range [0, mnist.NUM_CLASSES).
Note that an tf.train.QueueRunner is added to the graph, which
must be run using e.g. tf.train.start_queue_runners().
"""
if not num_epochs: num_epochs = None
filename = os.path.join(FLAGS.train_dir,
TRAIN_FILE if train else VALIDATION_FILE)
with tf.name_scope('input'):
filename_queue = tf.train.string_input_producer(
[filename], num_epochs=num_epochs)
# Even when reading in multiple threads, share the filename
# queue.
food, happiness = read_and_decode(filename_queue)
# Shuffle the examples and collect them into batch_size batches.
# (Internally uses a RandomShuffleQueue.)
# We run this in two threads to avoid being a bottleneck.
foods, happinesses= tf.train.shuffle_batch(
[food, happiness], batch_size=batch_size, num_threads=2,
capacity=1000 + 3 * batch_size,
# Ensures a minimum amount of shuffling of examples.
min_after_dequeue=1000)
return foods, happinesses
def main(_):
with tf.Graph().as_default():
# Input images and labels.
foods, happinesses = inputs(train=True, batch_size=FLAGS.batch_size,
num_epochs=FLAGS.num_epochs)
HIDDEN_UNITS = 4
INPUTS = 1
OUTPUTS = 1
weights_1 = tf.Variable(tf.truncated_normal([INPUTS, HIDDEN_UNITS]))
biases_1 = tf.Variable(tf.zeros([HIDDEN_UNITS]))
layer_1_outputs = tf.nn.sigmoid(tf.matmul(foods, weights_1) + biases_1)
weights_2 = tf.Variable(tf.truncated_normal([HIDDEN_UNITS, OUTPUTS]))
biases_2 = tf.Variable(tf.zeros([OUTPUTS]))
logits = tf.nn.sigmoid(tf.matmul(layer_1_outputs, weights_2) + biases_2)
#loss = tf.reduce_mean(logits)
labels = tf.to_int64(happinesses)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
labels=labels, logits=logits, name='xentropy')
#loss = tf.reduce_mean(cross_entropy, name='xentropy_mean')
loss = tf.reduce_sum(tf.sub(logits, happinesses))
learning_rate = 0.01
optimizer = tf.train.GradientDescentOptimizer(learning_rate)
train_op = optimizer.minimize(loss)
init_op = tf.group(tf.global_variables_initializer(),
tf.local_variables_initializer())
sess = tf.Session()
sess.run(init_op)
print('staring iteration', 0)
_, loss = sess.run([train_op, loss])
print(loss)
sess.close()
if __name__ == '__main__':
parser = argparse.ArgumentParser()
parser.add_argument(
'--learning_rate',
type=float,
default=0.01,
help='Initial learning rate.'
)
parser.add_argument(
'--num_epochs',
type=int,
default=2,
help='Number of epochs to run trainer.'
)
parser.add_argument(
'--hidden1',
type=int,
default=128,
help='Number of units in hidden layer 1.'
)
parser.add_argument(
'--hidden2',
type=int,
default=32,
help='Number of units in hidden layer 2.'
)
parser.add_argument(
'--batch_size',
type=int,
default=100,
help='Batch size.'
)
parser.add_argument(
'--train_dir',
type=str,
default='.',
help='Directory with the training data.'
)
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)

You must call tf.train.start_queue_runners to populate the queue before you call run or eval to execute reading files. Otherwise, reading will block while it waits for filenames from the queue. Pleases check run_training method from the original example, or tensorflow's documentation about how_tos/reading_data.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Fine tune GPT-2 Text Prediction for Conversational AI - python

Related

Why does an ANN validation accuracy oscillate?

RuntimeError: The size of tensor a (549) must match the size of tensor b (512) at non-singleton dimension 1

Keras Double DQN average reward decreases over time and is unable to converge

How do you use a TensorFlow model to classify a sound using this code

Tensorflow feed forward network session doesn't stop

Categories

Resources