Debugging tensorflow fit not making sense - python

I was getting different results between my self-implemented code and TensorFlow, so I wanted to check each value (loss, gradients, optimizer, etc.) to find where my error was.
Therefore I wrote a test script like the one in this repo, inspired by the Fashion MNIST example. For simplicity I will copy-paste it at the end of the question.
Logic:
Basically, I run 1 epoch on 1 batch, and then save:
Weights before training
Gradients
Weights after only one epoch and batch.
As I use TensorFlow's default SGD optimizer, the saved gradients should be equal to (initial_weights - final_weights) / 0.01. This idea was taken from here.
However, this does not happen. What's more, the results get closer if I divide by 0.0001 instead of 0.01, which, strangely enough, is 0.01^2.
Is there an error in my logic or in my testing code? I cannot find it.
PS: I tried using tf version 2.2.0 and 2.4.1 on Linux.
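To make the relation concrete, here is a tiny self-contained sketch (toy numbers, not the actual model) of the update rule I am assuming: with plain SGD, w_new = w_old - lr * grad, so grad should be recoverable as (w_old - w_new) / lr.

import numpy as np

# Toy check of the plain SGD relation (illustration only, not the model below)
lr = 0.01
w_old = np.array([0.5, -0.3, 1.2])
grad = np.array([0.2, 0.1, -0.4])
w_new = w_old - lr * grad
print(np.allclose(grad, (w_old - w_new) / lr))  # True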
import tensorflow as tf
import numpy as np
from pdb import set_trace

def get_dataset():
    fashion_mnist = tf.keras.datasets.fashion_mnist
    (train_images, train_labels), (test_images, test_labels) = fashion_mnist.load_data()
    return (train_images, train_labels), (test_images, test_labels)

def get_model(init1='glorot_uniform', init2='glorot_uniform'):
    tf.random.set_seed(1)
    model = tf.keras.Sequential([
        tf.keras.layers.Flatten(input_shape=(28, 28)),
        tf.keras.layers.Dense(128, activation='relu', kernel_initializer=init1),
        tf.keras.layers.Dense(10, kernel_initializer=init2)
    ])
    model.compile(optimizer='sgd',
                  loss=tf.keras.losses.CategoricalCrossentropy(from_logits=False),
                  metrics=['accuracy'])
    return model

def train(model, x_fit, y_fit):
    np.save("initial_weights.npy", np.array(model.get_weights()))
    with tf.GradientTape() as g:
        y_pred = model(x_fit)
        loss = tf.keras.losses.categorical_crossentropy(y_pred=y_pred, y_true=y_fit)
        np.save("loss.npy", np.array(loss))
    gradients = g.gradient(loss, model.trainable_weights)
    np.save("gradients.npy", np.array(gradients))
    model.fit(x_fit, y_fit, epochs=1, batch_size=100)
    np.save("final_weights.npy", np.array(model.get_weights()))

if __name__ == "__main__":
    (train_images, train_labels), (test_images, test_labels) = get_dataset()
    model = get_model()
    y_fit = np.zeros((100, 10))
    for i, val in enumerate(train_labels[:100]):
        y_fit[i][val] = 1.
    train(model, train_images[:100], y_fit)
    results = {
        "loss": np.load("loss.npy", allow_pickle=True),
        "init_weights": np.load("initial_weights.npy", allow_pickle=True),
        "gradients": np.load("gradients.npy", allow_pickle=True),
        "final_weights": np.load("final_weights.npy", allow_pickle=True)
    }
    for i_w, f_w, gr in zip(results["init_weights"], results["final_weights"], results["gradients"]):
        gr = gr.numpy()
        print(np.allclose(gr, (i_w - f_w) / 0.01))
    # set_trace()

It looks like the call to fit is averaging the gradient over the batch size. I don't know whether it's a bug or whether it's by design.
Since you compute your gradients manually anyway, you can just call model.optimizer.apply_gradients to update your weights; you should then get the correct results.
def train(model, x_fit, y_fit):
    np.save("initial_weights.npy", np.array(model.get_weights()))
    with tf.GradientTape() as g:
        y_pred = model(x_fit)
        loss = tf.keras.losses.categorical_crossentropy(y_pred=y_pred, y_true=y_fit)
        np.save("loss.npy", np.array(loss))
    gradients = g.gradient(loss, model.trainable_weights)
    np.save("gradients.npy", np.array(gradients))
    model.optimizer.apply_gradients(zip(gradients, model.trainable_weights))
    np.save("final_weights.npy", np.array(model.get_weights()))

Related

Struggling to implement SMILES2vec RNN from DeepChem for fitting on FreeSolv Dataset leading to negative R^2

I'm trying to use a SMILES2vec model from DeepChem in order to reproduce the regression results from the original paper (https://arxiv.org/pdf/1712.02034.pdf). To that end, rather than using the model directly from DeepChem, I threw the model together with TensorFlow's Sequential API, using the architecture that feeds the embeddings into a 1D convolution and 2 LSTMs. I don't get any errors, but I've used the coefficient of determination as my error metric and it comes out negative. This happened regardless of whether I tried Bidirectional() on the LSTMs or switched from MSE to MAE loss, and I'm still not sure what to do. The dataset I'm training on is FreeSolv from DeepChem.
!pip install --pre deepchem
!pip install rdkit-pypi
!pip install tensorflow-addons

# imports assumed by the snippet below (not shown in the original excerpt)
import itertools
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras.callbacks import ReduceLROnPlateau, ModelCheckpoint
import deepchem as dc

tasks, dataset, transformers = dc.molnet.load_freesolv()
train_dataset, valid_dataset, test_dataset = dataset
smiles_list = [x for x in itertools.chain(train_dataset.ids, valid_dataset.ids, test_dataset.ids)]
charset = set("".join(list(smiles_list)) + "!E")
char_to_int = dict((c, i) for i, c in enumerate(charset))
int_to_char = dict((i, c) for i, c in enumerate(charset))
embed = max([len(smile) for smile in smiles_list]) + 5

## converts SMILES strings to embedding or vector
def vectorize(smiles):
    one_hot = np.zeros((smiles.shape[0], embed, len(charset)), dtype=np.int8)
    for i, smile in enumerate(smiles):
        # encode the start char
        one_hot[i, 0, char_to_int["!"]] = 1
        # encode the rest of the chars
        for j, c in enumerate(smile):
            one_hot[i, j + 1, char_to_int[c]] = 1
        # encode the end char
        one_hot[i, len(smile) + 1:, char_to_int["E"]] = 1
    return one_hot[:, 0:-1, :], one_hot[:, 1:, :]

def get_lr_metric(optimizer):
    def lr(y_true, y_pred):
        return optimizer.lr
    return lr

# Prepare features for SMILES2vec
X_train, _ = vectorize(train_dataset.ids)
X_valid, _ = vectorize(valid_dataset.ids)
X_test, _ = vectorize(test_dataset.ids)
Y_train = train_dataset.y
Y_valid = valid_dataset.y
Y_test = test_dataset.y
vocab_size = len(charset)

## Build model
model = tf.keras.Sequential()
model.add(tf.keras.layers.Embedding(vocab_size, 50, input_length=embed - 1))
model.add(tf.keras.layers.Conv1D(192, 10, activation='relu'))
model.add(tf.keras.layers.BatchNormalization())
model.add(keras.layers.LSTM(224, return_sequences=True))
model.add(keras.layers.LSTM(384, return_sequences=True))
model.add(tf.keras.layers.Flatten())
model.add(tf.keras.layers.Dense(100, activation='relu'))
model.add(tf.keras.layers.Dropout(0.4))
model.add(tf.keras.layers.Dense(1, activation='linear'))

## Coefficient of determination metric
def coeff_determination(y_true, y_pred):
    from keras import backend as K
    SS_res = K.sum(K.square(y_true - y_pred))
    SS_tot = K.sum(K.square(y_true - K.mean(y_true)))
    return 1 - SS_res / (SS_tot + K.epsilon())

optimizer = tf.keras.optimizers.RMSprop()
lr_metric = get_lr_metric(optimizer)
model.compile(loss="mae", optimizer=optimizer,
              metrics=[tf.keras.metrics.RootMeanSquaredError(), coeff_determination, lr_metric])

callbacks_list = [
    ReduceLROnPlateau(monitor='val_loss', factor=0.5, patience=5, min_lr=1e-3, verbose=1, mode='auto', cooldown=0),
    ModelCheckpoint(filepath="weights.best.hdf5", monitor='val_loss', save_best_only=True, verbose=1, mode='auto')
]

history = model.fit(x=np.argmax(X_train, axis=2), y=Y_train,
                    batch_size=32,
                    epochs=50,
                    validation_data=(np.argmax(X_valid, axis=2), Y_valid),
                    callbacks=callbacks_list)
If it helps at all, I'm running this in a Google Colab notebook: https://colab.research.google.com/drive/1pJ25THeefBWUpe73cL_1LNnq45Pd95XZ?usp=sharing
As to why I'm not using the DeepChem implementation of SMILES2vec: I wanted to 1. get more hands-on experience building models with TensorFlow, and 2. I struggled to get the DeepChem implementation running just from using DeepChem's documentation (https://deepchem.readthedocs.io/en/latest/api_reference/models.html). However, I want to focus on why my coefficient of determination is reaching negative scores. Additionally, I've been using this notebook for reference, which uses a 'proof of concept' implementation of SMILES2vec where the LSTMs are replaced with 1D conv layers (https://github.com/Abdulk084/Smiles2vec/blob/master/smiles2vec.ipynb).
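For reference, the per-batch Keras metric above can legitimately go negative whenever the model predicts worse than that batch's mean. A sketch of one sanity check (assuming the trained model and the vectorized arrays from the code above) is to compute R² once over the whole validation split with scikit-learn instead of per batch:

# Hypothetical check, not part of the original notebook: evaluate R^2 on the
# full validation split after training, using the same integer-encoded inputs.
from sklearn.metrics import r2_score

val_pred = model.predict(np.argmax(X_valid, axis=2))
print("validation R^2:", r2_score(Y_valid, val_pred))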

Why does my TensorFlow model lose its accuracy after loading

So I am training on the Fashion-MNIST dataset, and the code is down below.
The issue is that on the first run it trains the model and gives me a fair accuracy.
But on the second run (when it's supposed to load from the saved file) the accuracy drops considerably.
Is there something wrong with my code, or are there any practices I'm not following here?
import tensorflow as tf
from tensorflow import keras
import numpy as np
import matplotlib.pyplot as plt
from os import environ, sep

environ['TF_CPP_MIN_LOG_LEVEL'] = '3'

MODELFILENAME = 'TF_ZTH_02_model' + sep

labels = {
    0: 'T-shirt/Top',
    1: 'Trouser',
    2: 'Pullover',
    3: 'Dress',
    4: 'Coat',
    5: 'Sandal',
    6: 'Shirt',
    7: 'Sneaker',
    8: 'Bag',
    9: 'Ankle Boot'
}

def main():
    fashionmnist = keras.datasets.fashion_mnist
    (trainimages, trainlabels), (testimages, testlabels) = fashionmnist.load_data()
    trainimages, testimages = trainimages/255., testimages/255.

    try:
        #try load model
        model = keras.models.load_model(MODELFILENAME)
    except:
        #file doesn't exist, train model
        #activation functions
        #relu - rectified linear unit - return value if its greater than 0 or 0
        #softmax - picks biggest number in set
        model = keras.Sequential([
            keras.layers.Flatten(input_shape=(28, 28)), #size of image
            keras.layers.Dense(128, activation=tf.nn.relu),
            keras.layers.Dense(10, activation=tf.nn.softmax) #ten clothing
        ])
        model.compile(
            optimizer = 'adam',
            loss = 'sparse_categorical_crossentropy',
            metrics = 'accuracy'
        )
        model.fit(trainimages, trainlabels, epochs=5)
        #save to file
        model.save(MODELFILENAME)

    testloss, testacc = model.evaluate(testimages, testlabels)
    print('\nEvaluation, loss and accuracy : ', testloss, testacc)

    predictions = model.predict(testimages)
    # predictions = model.predict(np.asarray([testimages[0]]))

    while True:
        x = int(input('\nEnter image number (<%d) : ' % len(testimages)))
        print('\nPredictions : ',
              predictions[x],
              predictions[x].argmax(),
              labels[predictions[x].argmax()]
              )
        print('Actual : ', testlabels[x], labels[testlabels[x]])
        plt.ioff()
        plt.imshow(testimages[x])
        plt.title(labels[predictions[x].argmax()])
        plt.show()

#but this ds has objects centered
#in the case of an unprocessed ds, you'd need to SPOT FEATURES
#with the help of convolutional networks

try:
    main()
except Exception as e:
    print(e)
finally:
    input()
Output on First Run
Output on Second Run
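One way to narrow this down (a sketch reusing model, testimages, testlabels, and MODELFILENAME from the code above; not part of the original script) is to evaluate right before saving and right after reloading in the same process. If the two numbers match, the save/load round trip is fine and the drop must come from something else that differs between runs.

# Hypothetical sanity check: the reloaded model should score the same as the original.
_, acc_before = model.evaluate(testimages, testlabels, verbose=0)
model.save(MODELFILENAME)
reloaded = keras.models.load_model(MODELFILENAME)
_, acc_after = reloaded.evaluate(testimages, testlabels, verbose=0)
print('accuracy before save:', acc_before, ' after reload:', acc_after)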

PyTorch linear MNIST model training error

I am creating a binary classifier based on the MNIST dataset using PyTorch. I want my classifier to classify only 0s and 1s; however, when I train it, the error doesn't decrease and the loss becomes negative.
Here's the error and loss at the first few iterations:
I was obviously expecting better results.
Here is the code I am using:
# Loading the MNIST data reduced to the 0/1 examples
from torchvision import datasets, transforms
from torch.utils.data import DataLoader

mnist_train = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor())

train_idx = mnist_train.train_labels <= 1
try:
    mnist_train.train_data = mnist_train.train_data[train_idx]
except AttributeError:
    mnist_train._train_data = mnist_train.train_data[train_idx]
try:
    mnist_train.train_labels = mnist_train.train_labels[train_idx]
except AttributeError:
    mnist_train._train_labels = mnist_train.train_labels[train_idx]

test_idx = mnist_test.test_labels <= 1
try:
    mnist_test.test_data = mnist_test.test_data[test_idx]
except AttributeError:
    mnist_test._test_data = mnist_test.test_data[test_idx]
try:
    mnist_test.test_labels = mnist_test.test_labels[test_idx]
except AttributeError:
    mnist_test._test_labels = mnist_test.test_labels[test_idx]

train_loader = DataLoader(mnist_train, batch_size=100, shuffle=True)
test_loader = DataLoader(mnist_test, batch_size=100, shuffle=False)

# Creating a simple linear classifier
import torch
import torch.nn as nn
import torch.optim as optim

# do a single pass over the data
def epoch(loader, model, opt=None):
    total_loss, total_err = 0., 0.
    for X, y in loader:
        yp = model(X.view(X.shape[0], -1))[:, 0]
        loss = nn.BCEWithLogitsLoss()(yp, y.float())
        if opt:
            opt.zero_grad()
            loss.backward()
            opt.step()
        total_err += ((yp > 0) * (y == 0) + (yp < 0) * (y == 1)).sum().item()
        total_loss += loss.item() * X.shape[0]
    return total_err / len(loader.dataset), total_loss / len(loader.dataset)

model = nn.Linear(784, 1)
opt = optim.SGD(model.parameters(), lr=1)

print("Train Err", "Train Loss", "Test Err", "Test Loss", sep="\t")
for i in range(10):
    train_err, train_loss = epoch(train_loader, model, opt)
    test_err, test_loss = epoch(test_loader, model)
    print(*("{:.6f}".format(i) for i in (train_err, train_loss, test_err, test_loss)), sep="\t")
I don't know why my error does not decrease nor why my loss keeps getting more negative. Does anyone spot the error?
Since the MNIST data has 10 different classes, change the model's output size to 10:
model = nn.Linear(784, 10)
Also change the loss to cross-entropy loss, reduce the learning rate to some smaller value (0.001), and use a much deeper model.
The above changes should probably solve your problem.
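A minimal sketch of the changes this answer suggests (the specific loss and learning-rate values here are taken from the text above; everything else is an assumption, not code from the answer):

import torch.nn as nn
import torch.optim as optim

# multi-class setup as suggested: 10 outputs, cross-entropy loss, smaller learning rate
model = nn.Linear(784, 10)
criterion = nn.CrossEntropyLoss()              # expects integer class labels, not one-hot
opt = optim.SGD(model.parameters(), lr=0.001)

# inside the training loop, the loss call would then be:
# loss = criterion(model(X.view(X.shape[0], -1)), y)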
I found the error. My initial code to select only 1s and 0s from the MNIST dataset didn't work. So obviously, applying BCELoss to a non-binary dataset was making the model fail.
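For anyone hitting the same issue: a sketch of one way to do the 0/1 filtering that works on recent torchvision versions (using the targets attribute and Subset; the data/label attribute names used in the question changed across versions):

import torch
from torch.utils.data import DataLoader, Subset
from torchvision import datasets, transforms

mnist_train = datasets.MNIST("./data", train=True, download=True, transform=transforms.ToTensor())
mnist_test = datasets.MNIST("./data", train=False, download=True, transform=transforms.ToTensor())

# keep only the examples whose label is 0 or 1
train_idx = torch.where(mnist_train.targets <= 1)[0].tolist()
test_idx = torch.where(mnist_test.targets <= 1)[0].tolist()

train_loader = DataLoader(Subset(mnist_train, train_idx), batch_size=100, shuffle=True)
test_loader = DataLoader(Subset(mnist_test, test_idx), batch_size=100, shuffle=False)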

skopt's gp_minimize() function raises ValueError: array must not contain infs or NaNs

I am currently using the skopt (scikit-optimize) package for hyperparameter tuning of a neural network (I am trying to minimize -1 * accuracy). It seems to run fine (and successfully prints to the console) for several iterations before it raises ValueError: array must not contain infs or NaNs.
What are some possible causes of this? My data does not contain infs or NaNs and neither do my search parameter ranges. The neural network code is quite long, so for brevity, I will paste the relevant sections:
Imports:
import pandas as pd
import numpy as np
from skopt import gp_minimize
from skopt.utils import use_named_args
from skopt.space import Real, Categorical, Integer
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
import tensorflow
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Conv1D, Dropout, MaxPooling1D, Flatten
from keras import backend as K
Creation of search parameters:
dim_num_filters_L1 = Integer(low=1, high=50, name='num_filters_L1')
#dim_kernel_size_L1 = Integer(low=1, high=70, name='kernel_size_L1')
dim_activation_L1 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L1')
dim_num_filters_L2 = Integer(low=1, high=50, name='num_filters_L2')
#dim_kernel_size_L2 = Integer(low=1, high=70, name='kernel_size_L2')
dim_activation_L2 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L2')
dim_num_dense_nodes = Integer(low=1, high=28, name='num_dense_nodes')
dim_activation_L3 = Categorical(categories=['relu', 'linear', 'softmax'], name='activation_L3')
dim_dropout_rate = Real(low = 0, high = 0.5, name = 'dropout_rate')
dim_learning_rate = Real(low=1e-4, high=1e-2, name='learning_rate')
dimensions = [dim_num_filters_L1,
              #dim_kernel_size_L1,
              dim_activation_L1,
              dim_num_filters_L2,
              #dim_kernel_size_L2,
              dim_activation_L2,
              dim_num_dense_nodes,
              dim_activation_L3,
              dim_dropout_rate,
              dim_learning_rate,
              ]
Function that creates all models that will be tested:
def create_model(num_filters_L1, #kernel_size_L1,
                 activation_L1,
                 num_filters_L2, #kernel_size_L2,
                 activation_L2,
                 num_dense_nodes, activation_L3,
                 dropout_rate,
                 learning_rate):
    input_shape = (X_train.shape[1], 1)
    model = Sequential()
    model.add(Conv1D(num_filters_L1, kernel_size=40, activation=activation_L1, input_shape=input_shape))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Conv1D(num_filters_L2, kernel_size=20, activation=activation_L2))
    model.add(MaxPooling1D(pool_size=2))
    model.add(Flatten())
    model.add(Dense(num_dense_nodes, activation=activation_L3))
    model.add(Dropout(dropout_rate))
    model.add(Dense(y_train.shape[1], activation='linear'))
    adam = tensorflow.keras.optimizers.Adam(learning_rate=learning_rate)
    model.compile(optimizer=adam, loss='mean_squared_error', metrics=['accuracy'])
    return model
Define fitness function:
@use_named_args(dimensions=dimensions)
def fitness(num_filters_L1, #kernel_size_L1,
            activation_L1,
            num_filters_L2, #kernel_size_L2,
            activation_L2,
            num_dense_nodes, activation_L3,
            dropout_rate,
            learning_rate):

    model = create_model(num_filters_L1, #kernel_size_L1,
                         activation_L1,
                         num_filters_L2, #kernel_size_L2,
                         activation_L2,
                         num_dense_nodes, activation_L3,
                         dropout_rate,
                         learning_rate)

    history_opt = model.fit(x=X_train,
                            y=y_train,
                            validation_data=(X_val, y_val),
                            shuffle=True,
                            verbose=2,
                            epochs=10
                            )

    # return the validation accuracy for the last epoch.
    accuracy_opt = model.evaluate(X_test, y_test)[1]

    # Print the classification accuracy:
    print("Experimental Model Accuracy: {0:.2%}".format(accuracy_opt))

    # Delete the Keras model with these hyper-parameters from memory:
    del model

    # Clear the Keras session, otherwise it will keep adding new models to the same
    # TensorFlow graph each time we create a model with a different set of hyper-parameters.
    K.clear_session()
    ops.reset_default_graph()

    # the optimizer aims for the lowest score, so return negative accuracy:
    return -accuracy_opt  # or sum(RMSE)?
Run hyperparameter search:
gp_result = gp_minimize(func=fitness,
                        dimensions=dimensions)

print("best accuracy was " + str(round(gp_result.fun * -100, 2)) + "%.")
Your activation function is not converging in a random acquisition function call. I encountered this problem and removed the 'relu' function from the search space.
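A sketch of what that suggestion would look like with the search space defined in the question (just dropping 'relu' from the Categorical dimensions and rebuilding the dimensions list with them; this illustrates the answer, it is not a guaranteed fix):

# drop 'relu' from the activation choices so gp_minimize never samples it
dim_activation_L1 = Categorical(categories=['linear', 'softmax'], name='activation_L1')
dim_activation_L2 = Categorical(categories=['linear', 'softmax'], name='activation_L2')
dim_activation_L3 = Categorical(categories=['linear', 'softmax'], name='activation_L3')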

Exploding Gradient on fully connected layer

I am trying to train a deep learning model on the MNIST dataset.
My model has the following structure:
Input (28*28*1)
Conv2d (14*14*32)
Conv2d (7*7*64) - flatten
FC (3136*1024)
FC (1024*10)
10-class prediction of MNIST
%matplotlib inline
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np

mnist = tf.keras.datasets.mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
x_train, x_test = x_train / 255.0, x_test / 255.0
y_train, y_test = one_hot(y_train), one_hot(y_test)
x_train = np.reshape(x_train, [x_train.shape[0], 28, 28, 1])
x_test = np.reshape(x_test, [x_test.shape[0], 28, 28, 1])

x_dataset = tf.data.Dataset.from_tensor_slices(x_train)
y_dataset = tf.data.Dataset.from_tensor_slices(y_train)
train_dataset = tf.data.Dataset.zip((x_dataset, y_dataset)).repeat().batch(50)
iterator = tf.data.Iterator.from_structure(train_dataset.output_types, train_dataset.output_shapes)
next_element = iterator.get_next()
training_init_op = iterator.make_initializer(train_dataset)

x_testds = tf.data.Dataset.from_tensor_slices(x_test)
y_testds = tf.data.Dataset.from_tensor_slices(y_test)
testds = tf.data.Dataset.zip((x_testds, y_testds)).repeat().batch(2000)
valid_inti_op = iterator.make_initializer(testds)
##%%##
def one_hot(y_train):
    y_train1 = np.zeros((y_train.shape[0], 10))
    for i in range(y_train.shape[0]):
        y_train1[i][y_train[i]] = 1
    return y_train1

def conv_layer(input, channels_in, channels_out, name="conv"):
    with tf.name_scope(name):
        input = tf.cast(input, tf.float32)
        w = tf.Variable(tf.truncated_normal([5, 5, channels_in, channels_out], stddev=0.1), name="W")
        b = tf.Variable(tf.truncated_normal([channels_out], stddev=0.1), name="B")
        conv = tf.nn.conv2d(input, w, strides=[1, 1, 1, 1], padding="SAME")
        act = tf.nn.relu(conv + b)
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activation", act)
        return act

def fc_layer(input, channels_in, channels_out, name="fc"):
    with tf.name_scope(name):
        w = tf.Variable(tf.truncated_normal([channels_in, channels_out], stddev=0.1), name="W")
        b = tf.Variable(tf.zeros([channels_out]), name="B")
        act = tf.nn.relu(tf.matmul(input, w) + b)
        tf.summary.histogram("weights", w)
        tf.summary.histogram("biases", b)
        tf.summary.histogram("activation", act)
        return act

conv1 = conv_layer(next_element[0], 1, 32, "conv1")
pool1 = tf.nn.max_pool(conv1, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME", name="pool1")
conv2 = conv_layer(pool1, 32, 64, "conv2")
pool2 = tf.nn.max_pool(conv2, ksize=[1, 2, 2, 1], strides=[1, 2, 2, 1], padding="SAME", name="pool2")
flattened = tf.reshape(pool2, [-1, 7*7*64])
fc1 = fc_layer(flattened, 7*7*64, 1024, "fc1")
logits = fc_layer(fc1, 1024, 10, "fc2")
##%%##
with tf.name_scope("cross_entropy"):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits, labels=next_element[1]))
with tf.name_scope("train"):
    train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
with tf.name_scope("accuracy"):
    correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(next_element[1], 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
##%%
sess = tf.Session()
tf.summary.scalar('cross_entropy', cross_entropy)
tf.summary.scalar('accuracy', accuracy)
tf.summary.image('input', next_element[0])
merged_summary = tf.summary.merge_all()
writer = tf.summary.FileWriter("D:/work/ML/tensorboard/MNIST/deep/4")
writer.add_graph(sess.graph)
##%%
sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    s = sess.run(merged_summary)
    if i % 5 == 0:
        writer.add_summary(s, i)
    print(i, end="\r")

sess.run(valid_inti_op)
for i in range(1, 6):
    s1 = sess.run(merged_summary)
    writer.add_summary(s1, 601 + i)
My accuracy and cross_entropy are stuck. After looking at TensorBoard, the issue seems to be that the weights of my FC layers are stuck at very large values, even though I have initialised them to 0. If this really is the error, then I don't know how to fix it, and if it isn't, then I don't know what the error is.
Adding a sess.run(train_step) after feeding in the data fixed my code.
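A sketch of what that fix presumably looks like in the training loop above (running the train op together with the summaries; the exact placement is an assumption based on the answer):

sess.run(tf.global_variables_initializer())
sess.run(training_init_op)
for i in range(600):
    # run the optimizer step so the weights actually get updated,
    # and fetch the summaries in the same call
    _, s = sess.run([train_step, merged_summary])
    if i % 5 == 0:
        writer.add_summary(s, i)
    print(i, end="\r")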
