My neural network doesn't improve its accuracy - python

I'm training a neural network to recognize characters using the notMNIST dataset, but once I run it, its accuracy stays relatively constant after each iteration.
I've tried lowering the learning rate, but it didn't make any difference. What might be the problem?
I think the issue might be in how I use the tf.nn.relu() method and how I calculate the predictions, since I'm fairly new to TensorFlow and neural networks.
Here is a screenshot of my program running; you can see that the accuracies on the training set, validation set, and test set are all pretty bad.
num_steps = 801

def accuracy(predictions, labels):
    return (100.0 * np.sum(np.argmax(predictions, 1) == np.argmax(labels, 1))
            / predictions.shape[0])

with tf.Session(graph=graph) as session:
    #this is a one-time operation which ensure the parameters get initialized
    #we described in the graph: random weights for the matrix, zeros for the
    #biases.
    tf.global_variables_initializer().run()
    print("initialized")
    for step in range(num_steps):
        #run the computations. we tell .run() that we want to run the optimizer,
        #and get the loss value and the training predictions returned as numpy
        #arrays.
        _, l, predictions = session.run([optimizer, loss, train_prediction])
        if (step % 100 == 0):
            print("loss at step %d: %f" % (step, l))
            print("Training accuracy: %.1f%%" % accuracy(
                predictions, train_labels[:train_subset, :]))
            #calling .eval() on valid_prediction is basically like calling run(), but
            #just to get that one numpy array. Note that it recomputes all its graph
            #dependencies.
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
batch_size = 128
hidden_nodes = 1024

graph = tf.Graph()
with graph.as_default():
    #input data. For the training data, we use a placeholder that will be fed
    #at run time with a training minibatch
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size*image_size), name="td")
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels), name="tl")
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)
    #variables
    weights1 = tf.Variable(
        tf.truncated_normal([image_size*image_size, hidden_nodes]))
    biases1 = tf.Variable(tf.zeros([hidden_nodes]))
    weights2 = tf.Variable(
        tf.truncated_normal([hidden_nodes, num_labels]))
    biases2 = tf.Variable(tf.zeros([num_labels]))
    #training computation.
    relu1 = tf.nn.relu(tf.matmul(tf_train_dataset, weights1) + biases1)
    relu_out = tf.nn.relu(tf.matmul(relu1, weights2) + biases2)
    loss = tf.reduce_mean(
        tf.nn.softmax_cross_entropy_with_logits(logits=relu_out, labels=tf_train_labels))
    #optimizer
    optimizer = tf.train.GradientDescentOptimizer(0.25).minimize(loss)
    #predictions for the training, validation, and test data
    train_prediction = relu_out
    valid_prediction = tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
    test_prediction = tf.nn.relu(tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)

num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.global_variables_initializer().run()
    print("initialized")
    for step in range(num_steps):
        #pick an offset within the training data, which has been randomized.
        #note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        #generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        #prepare a dictionary telling the session where to feed the minibatch.
        #the key of the dictionary is the placeholder node of the graph to be fed,
        #and the value is the numpy array to feed to it
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("minibatch loss at step %d: %f" % (step, l))
            print("minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))

As I thought, the problem was how I used the relu() method.
In the training computation I was applying relu() twice where I should have applied it only once. After the change it ended up looking like this:
logits_1 = tf.matmul(tf_train_dataset, weights1) + biases1
relu1 = tf.nn.relu(logits_1)
logits_2 = tf.matmul(relu1, weights2) + biases2
And I changed the parameter logits in the loss variable from relu_out to logits_2.
loss = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(logits=logits_2, labels=tf_train_labels))
And finally I changed the prediction variables in order to compute using logits_2 and not relu_out.
train_prediction = tf.nn.softmax(logits_2)
valid_prediction = tf.nn.softmax(
    tf.matmul(tf.nn.relu(tf.matmul(tf_valid_dataset, weights1) + biases1), weights2) + biases2)
test_prediction = tf.nn.softmax(
    tf.matmul(tf.nn.relu(tf.matmul(tf_test_dataset, weights1) + biases1), weights2) + biases2)
As you can see, the accuracy now gets up to around 90%.
Although I'm still not sure why applying the relu() method twice was a problem. If I'm not wrong, relu() returns either 0 or the value of the parameter it's given, so shouldn't it be the same?
If anyone knows, feel free to answer.
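One quick way to see that the two versions are not equivalent: relu() clamps every negative logit to zero before the softmax, so a class the network votes strongly against becomes indistinguishable from a mildly unlikely one, and the clamped entries pass no gradient back through the cross-entropy. A toy sketch with made-up logits (not from the notMNIST model):

import tensorflow as tf

# Hypothetical logits with negative entries, just to compare the two variants.
logits = tf.constant([[2.0, -1.0, -3.0]])

with tf.Session() as sess:
    print(sess.run(tf.nn.softmax(logits)))              # approx. [[0.95, 0.05, 0.01]]
    print(sess.run(tf.nn.softmax(tf.nn.relu(logits))))  # approx. [[0.79, 0.11, 0.11]]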

Related

Tensorflow to PyTorch - model.predict equivalent

I am attempting to retrieve the mean squared error of my training. The original code is based on TensorFlow, and I am moving it over to PyTorch (for research reasons).
The original TensorFlow code:
print("Calculating threshold")
x_opt_predictions = model.predict(x_opt)
print("Calculating MSE on optimization set...")
mse = np.mean(np.power(x_opt - x_opt_predictions, 2), axis=1)
print("mean is %.5f" % mse.mean())
print("min is %.5f" % mse.min())
print("max is %.5f" % mse.max())
print("std is %.5f" % mse.std())
tr = mse.mean() + mse.std()
The training method in PyTorch:
def train(net, x_train, x_opt, BATCH_SIZE, EPOCHS, input_dim):
    outputs = 0
    mse = 0
    optimizer = optim.SGD(net.parameters(), lr=0.001)
    loss_function = nn.MSELoss()
    loss = 0
    for epoch in range(EPOCHS):
        for i in tqdm(range(0, len(x_train), BATCH_SIZE)):
            batch_y = x_opt[i:i + BATCH_SIZE]
            net.zero_grad()
            outputs = net(batch_y)
            loss = loss_function(outputs, batch_y)
            loss.backward()
            optimizer.step()
        print(f"Epoch: {epoch}. Loss: {loss}")
    print("opt", x_opt.size(), "output", outputs.__sizeof__())
    # VVVVVVVVVVVVVVVVVVVVVVVVVVVVVV
    return np.mean(np.power(x_opt - outputs, 2), axis=1)
    # ^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^
As seen above, outputs is not a NumPy array of predictions, and I need the equivalent of model.predict here to generate the threshold.
If there are any other (better) ways to acquire this value, thanks in advance.
The variable outputs is a PyTorch tensor. To convert it to NumPy, all you have to change is this line:
return np.mean(np.power(x_opt - outputs, 2), axis=1)
to this:
return np.mean(np.power(x_opt - outputs.cpu().data.numpy(), 2), axis=1)
That will convert the tensor to a NumPy array. If you are not using CUDA with your network, you do not need the .cpu() part.
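For completeness, a minimal sketch of the same conversion using .detach(), which is the more common spelling than .data in recent PyTorch; the helper name and the assumption that both tensors live on the same device are mine:

import numpy as np
import torch

def mse_per_sample(x_opt: torch.Tensor, outputs: torch.Tensor) -> np.ndarray:
    # Detach from the autograd graph, move to CPU, then convert to NumPy
    # before doing NumPy math on the result.
    x = x_opt.detach().cpu().numpy()
    y = outputs.detach().cpu().numpy()
    return np.mean(np.power(x - y, 2), axis=1)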

Tensorflow: Training doesn't improve accuracy

I have just begun to learn TensorFlow and am writing a model to exercise on MNIST. I am following a book, but there is still a problem; could you please help me with it?
Following is my code, with the problem description in it. Thank you very much!
x = tf.placeholder(tf.float32,[None,INPUT_NODE],name='input')
y_ = tf.placeholder(tf.float32,[None,OUTPUT_NODE],name='output')
weights1 = tf.Variable(tf.truncated_normal([INPUT_NODE,LAYER1_NODE],stddev=0.1))
biases1 = tf.Variable(tf.constant(0.1,shape=[LAYER1_NODE]))
weights2 = tf.Variable(tf.truncated_normal([LAYER1_NODE,OUTPUT_NODE],stddev=0.1))
biases2 = tf.Variable(tf.constant(0.1,shape=[OUTPUT_NODE]))
The next line, y = ..., defines forward propagation without using the moving-average model:
y = inference(x,None,weights1,biases1,weights2,biases2)
global_step = tf.Variable(0,trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY,global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())
The next line, average_y = ..., defines forward propagation using the moving-average model:
average_y = inference(x,variable_averages,weights1,biases1,weights2,biases2)
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y,labels=tf.arg_max(y_,1))
cross_entropy_mean = tf.reduce_mean(cross_entropy)
regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
regularization = regularizer(variable_averages.average(weights1)) + \
                 regularizer(variable_averages.average(weights2))
loss = cross_entropy_mean + regularization
learning_rate = tf.train.exponential_decay(
    LEARNING_RATE_BASE,
    global_step,
    mnist.train.num_examples / BATCH_SIZE,
    LEARNING_RATE_DECAY
)
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss,global_step=global_step)
train_op = tf.group(train_step,variables_averages_op)
The problem is that when I use average_y to calculate the accuracy, it seems like training doesn't help at all:
After 0 training steps, acc in validatation is 0.0742
After 1000 training steps, acc in validatation is 0.0924
After 2000 training steps, acc in validatation is 0.0924
When I use y instead of average_y, everything is good. This really confuses me:
After 0 training steps, acc in validatation is 0.0686
After 1000 training steps, acc in validatation is 0.9716
After 2000 training steps, acc in validatation is 0.9768
#correct_prediction = tf.equal(tf.arg_max(y,1),tf.arg_max(y_,1))
correct_prediction = tf.equal(tf.arg_max(average_y,1),tf.arg_max(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
with tf.Session() as sess:
    tf.initialize_all_variables().run()
    validate_feed = {
        x: mnist.validation.images,
        y_: mnist.validation.labels
    }
    test_feed = {
        x: mnist.test.images,
        y_: mnist.test.labels
    }
    for i in range(TRAINING_STEPS):
        if i % 1000 == 0:
            validate_acc = sess.run(accuracy, feed_dict=validate_feed)
            print("After %d training steps, acc in validatation is %g" % (i, validate_acc))
        xs, ys = mnist.train.next_batch(BATCH_SIZE)
        sess.run([train_op, global_step], feed_dict={x: xs, y_: ys})
    test_acc = sess.run(accuracy, feed_dict=test_feed)
    print("After %d training steps, acc in test is %g" % (TRAINING_STEPS, test_acc))
From your code snippet, you are training the classification loss with respect to the y logits instead of average_y, so the inference graph with the exponential moving average is actually not trained:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.arg_max(y_, 1))
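If it helps, here is a minimal sketch (made-up variable and stand-in loss) of the usual TF 1.x exponential-moving-average pattern: the optimizer updates the raw variables, the grouped EMA op maintains shadow copies, and the evaluation graph reads those shadow copies via average():

import tensorflow as tf

w = tf.Variable(tf.truncated_normal([10, 2], stddev=0.1))
loss = tf.reduce_sum(tf.square(w))          # stand-in loss over the raw variable

ema = tf.train.ExponentialMovingAverage(0.99)
ema_op = ema.apply([w])                     # creates/updates the shadow copy of w

train_step = tf.train.GradientDescentOptimizer(0.1).minimize(loss)
train_op = tf.group(train_step, ema_op)     # run both so the shadow copy tracks training

w_avg = ema.average(w)                      # shadow copy, used only for evaluation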

Getting error in epoch_cost "operands could not be broadcast together with shapes (32,) (5,) (32,) " while applying minibatches in neural net

I am doing a project in which I apply minibatches in a neural network and calculate the epoch cost:
def model(X_train, Y_train, X_test, Y_test, learning_rate = 0.0001, num_epochs = 1500,
          minibatch_size = 32, print_cost = True):
    ops.reset_default_graph()
    tf.set_random_seed(1)
    seed = 3
    costs = []
    (n_x, m) = X_train.shape
    n_y = Y_train.shape[0]
    #create placeholder
    X, Y = create_placeholder(n_x, n_y)
    # init parameter
    parameters = init_parameter()
    # forward prop
    Z3 = forward_prop(X, parameters)
    # compute cost
    cost = compute_cost(Z3, Y)
    # optimizer
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
    # Initialize all variables
    init = tf.global_variables_initializer()
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(num_epochs):
            epoch_cost = 0
            num_minibatche = int(m / minibatch_size)
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train,
                                              minibatch_size, seed)
            for minibatch in minibatches:
                (minibatch_X, minibatch_Y) = minibatch
                _, minibatch_cost = sess.run([optimizer, cost],
                                             feed_dict={X: minibatch_X, Y: minibatch_Y})
                epoch_cost += minibatch_cost / num_minibatche
            # Print the cost every epoch
            if print_cost == True and epoch % 100 == 0:
                print("Cost after epoch ", epoch, np.mean(epoch_cost))
            if print_cost == True and epoch % 5 == 0:
                costs.append(epoch_cost)
        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per tens)')
        plt.title("Learning rate =" + str(learning_rate))
        plt.show()
        # save the parameters
        parameters = sess.run(parameters)
        print("Parameters have been trained!")
        # Calculate the correct predictions
        correct_prediction = tf.equal(tf.argmax(Z3), tf.argmax(Y))
        # Calculate accuracy on the test set
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train}))
        print("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test}))
        return parameters
So when I run this code, I get the following error on the line:
---->epoch_cost += minibatch_cost / num_minibatche
---->ValueError: operands could not be broadcast together with shapes (32,) (5,) (32,)
I took minibatch_size = 32 and the number of training examples = 1381.
But I am totally confused about why I am getting this error.
The code you've posted is missing a lot of parts, like the helper functions that model() calls (create_placeholder(), forward_prop(), compute_cost(), random_mini_batches()), so it's difficult to debug. But based on just what we have, some things here that are supposed to be scalars are in fact arrays.
epoch_cost starts out as a scalar zero with epoch_cost = 0. Then you add some value to it, then try to print np.mean( epoch_cost ). Why do you take the mean of a scalar? Looks like the code was different earlier, and the migration to a scalar epoch_cost was not successful.
It is easy to imagine that minibatch_cost is returned as an array from TensorFlow - one cost value for each member of the batch. In that case you would need to apply np.mean() right there, like
epoch_cost += np.mean( minibatch_cost ) / num_minibatche
Maybe even num_minibatche somehow became a vector. It comes from
num_minibatche = int(m/ minibatch_size)
and minibatch_size is supposedly 32, so that's all right. But m comes from
(n_x, m) = X_train.shape
and we know nothing of X_train. Maybe m somehow became a vector, and in turn num_minibatche too. You will need to print the value for num_minibatche once calculated and make sure it's what it's supposed to be.
Hope this helps. If you post the whole code, I can help you more.
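A plausible reconstruction of the error, assuming compute_cost() returns one value per example rather than a scalar: with 1381 examples and minibatch_size = 32, the last minibatch has only 5 examples, so once epoch_cost has silently become a (32,) array it can no longer absorb the (5,) cost of that final batch:

import numpy as np

num_minibatche = 1381 // 32                 # 43 full minibatches, plus one of size 5

epoch_cost = 0
full_batch_cost = np.random.rand(32)        # per-example cost of a full minibatch
last_batch_cost = np.random.rand(5)         # per-example cost of the final minibatch

epoch_cost += full_batch_cost / num_minibatche   # epoch_cost is now a (32,) array
epoch_cost += last_batch_cost / num_minibatche   # ValueError: operands could not be
                                                 # broadcast together with shapes (32,) (5,) (32,)

# Reducing each minibatch cost to a scalar first avoids this:
# epoch_cost += np.mean(minibatch_cost) / num_minibatche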

Multilevel neural network

I am attempting to complete the following tensorflow tutorial (specifically problem 4): https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/3_regularization.ipynb
However, I think I might be setting up the arrays of weights below incorrectly. As soon as I change hidden_layer to [image_size * image_size,1024,num_labels] (i.e. just one hidden layer), this works fine. Currently I am getting NaNs for the loss.
One possible cause is that the block
for i in range(1,len(weights)-1):
    relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]),p_hide)
is causing the problems, since I am destroying the past value of relus and neural nets need those values to do backpropagation. In fact, when there is only one hidden layer this block does not get executed.
batch_size = 128
hidden_layer = [image_size * image_size, 1024, 300, num_labels]
l2_regulariser = 0.005
p_hide = 0.5

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables.
    weights = [None]*(len(hidden_layer)-1)
    biases = [None]*(len(hidden_layer)-1)
    for i in range(len(weights)):
        weights[i] = tf.Variable(tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]]))
        biases[i] = tf.Variable(tf.zeros([hidden_layer[i+1]]))

    # Training computation.
    relus = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, weights[0]) + biases[0]), p_hide)
    for i in range(1, len(weights)-1):
        relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]), p_hide)
    logits = tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1]

    loss = 0
    for weight in weights:
        loss += tf.nn.l2_loss(weight)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + l2_regulariser*loss

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps=20, decay_rate=0.9)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(logits)

    relus = tf.nn.relu(tf.matmul(tf_valid_dataset, weights[0]) + biases[0])
    for i in range(1, len(weights)-1):
        relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
    valid_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])

    relus = tf.nn.relu(tf.matmul(tf_test_dataset, weights[0]) + biases[0])
    for i in range(1, len(weights)-1):
        relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
    test_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])

######################
# The NN training part
######################
num_steps = 3001

with tf.Session(graph=graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, global_step : int(step)}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
You should initialize your weights better:
tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]], stddev=0.1)
And most of all, you should lower your learning rate to something around 0.01 or 0.001.
I think you get a loss of NaN because the learning rate is too high and it breaks the network (you get exploding weights).
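A minimal sketch of both suggestions applied to the variable setup from the question (it reuses hidden_layer, global_step, and loss from the posted code, and the 0.01 starting rate is just an example value):

import tensorflow as tf

# Smaller initial weights...
weights = [None] * (len(hidden_layer) - 1)
biases = [None] * (len(hidden_layer) - 1)
for i in range(len(weights)):
    weights[i] = tf.Variable(
        tf.truncated_normal([hidden_layer[i], hidden_layer[i + 1]], stddev=0.1))
    biases[i] = tf.Variable(tf.zeros([hidden_layer[i + 1]]))

# ...and a lower starting learning rate for the decay schedule.
learning_rate = tf.train.exponential_decay(0.01, global_step, decay_steps=20, decay_rate=0.9)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)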

Adding multiple layers to TensorFlow causes loss function to become Nan

I'm writing a neural-network classifier in TensorFlow/Python for the notMNIST dataset. I've implemented l2 regularization and dropout on the hidden layers. It works fine as long as there is only one hidden layer, but when I added more layers (to improve accuracy), the loss function increases rapidly at each step, becoming NaN by step 5. I tried temporarily disabling Dropout and L2 regularization, but I get the same behavior as long as there are 2+ layers. I even rewrote my code from scratch (doing some refactoring to make it more flexible), but with the same results. The number and size of layers is controlled by hidden_layer_spec. What am I missing?
#works for np.array([1024]) with about 96.1% accuracy
hidden_layer_spec = np.array([1024, 300])
num_hidden_layers = hidden_layer_spec.shape[0]
batch_size = 256
beta = 0.0005
epochs = 100
stepsPerEpoch = float(train_dataset.shape[0]) / batch_size
num_steps = int(math.ceil(float(epochs) * stepsPerEpoch))

l2Graph = tf.Graph()
with l2Graph.as_default():
    #with tf.device('/cpu:0'):
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32,
                                      shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    weights = []
    biases = []
    for hi in range(0, num_hidden_layers + 1):
        width = image_size * image_size if hi == 0 else hidden_layer_spec[hi - 1]
        height = num_labels if hi == num_hidden_layers else hidden_layer_spec[hi]
        weights.append(tf.Variable(tf.truncated_normal([width, height]), name = "w" + `hi + 1`))
        biases.append(tf.Variable(tf.zeros([height]), name = "b" + `hi + 1`))
        print(`width` + 'x' + `height`)

    def logits(input, addDropoutLayer = False):
        previous_layer = input
        for hi in range(0, hidden_layer_spec.shape[0]):
            previous_layer = tf.nn.relu(tf.matmul(previous_layer, weights[hi]) + biases[hi])
            if addDropoutLayer:
                previous_layer = tf.nn.dropout(previous_layer, 0.5)
        return tf.matmul(previous_layer, weights[num_hidden_layers]) + biases[num_hidden_layers]

    # Training computation.
    train_logits = logits(tf_train_dataset, True)

    l2 = tf.nn.l2_loss(weights[0])
    for hi in range(1, len(weights)):
        l2 = l2 + tf.nn.l2_loss(weights[0])
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(train_logits, tf_train_labels)) + beta * l2

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, int(stepsPerEpoch) * 2, 0.96, staircase = True)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)

    # Predictions for the training, validation, and test data.
    train_prediction = tf.nn.softmax(train_logits)
    valid_prediction = tf.nn.softmax(logits(tf_valid_dataset))
    test_prediction = tf.nn.softmax(logits(tf_test_dataset))
    saver = tf.train.Saver()

with tf.Session(graph=l2Graph) as session:
    tf.initialize_all_variables().run()
    print("Initialized")
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Learning rate: " % learning_rate)
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
    save_path = saver.save(session, "l2_degrade.ckpt")
    print("Model save to " + `save_path`)
Turns out this was not so much a coding issue as a deep-learning issue. The extra layer made the gradients too unstable, and that led to the loss function quickly devolving to NaN. The best way to fix this is to use Xavier initialization. Otherwise, the variance of the initial values will tend to be too high, causing instability. Also, decreasing the learning rate may help.
I had the same problem and reducing the batch size and learning rate worked for me.
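A minimal sketch of the Xavier (Glorot) initialization suggested above, using the TF 1.x contrib initializer; the layer-size logic is copied from the posted code, and only the initializer call is new:

import tensorflow as tf

initializer = tf.contrib.layers.xavier_initializer()

weights = []
biases = []
for hi in range(0, num_hidden_layers + 1):
    width = image_size * image_size if hi == 0 else hidden_layer_spec[hi - 1]
    height = num_labels if hi == num_hidden_layers else hidden_layer_spec[hi]
    # Xavier scales the initial variance by the layer's fan-in/fan-out,
    # instead of the fixed variance of tf.truncated_normal.
    weights.append(tf.Variable(initializer([width, height]), name="w" + str(hi + 1)))
    biases.append(tf.Variable(tf.zeros([height]), name="b" + str(hi + 1)))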
