I am trying to build a deep network using TF after using Martin Gorner's video as a reference. I has some success with the shallow network example; however the deep network's accuracy is collapsing after reaching around 98% accuracy for some reason.
The network implemented is used to recognise MNIST numerical characters using a five layer network. I am training with batches of 100 for 10000 iterations. The accuracy steadily increases until it reaches around 98%, then collapses completely to 9.8%.
Any ideas please?
"""Tensor flow character recognition of Numerals"""
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
# layer K will have 200 neuron and so on
K = 200
L = 100
M = 60
N = 30
# ----- Initialization -----
# the None will become the batch size of 100
# 28 by 28 grayscale images described by a single byte
X = tf.placeholder(tf.float32, [None, 784])
# training will require computing variables W and b
W1 = tf.Variable(tf.truncated_normal([28*28, K], stddev=0.1))
B1 = tf.Variable(tf.zeros([K]))
W2 = tf.Variable(tf.truncated_normal([K, L], stddev=0.1))
B2 = tf.Variable(tf.zeros([L]))
W3 = tf.Variable(tf.truncated_normal([L, M], stddev=0.1))
B3 = tf.Variable(tf.zeros([M]))
W4 = tf.Variable(tf.truncated_normal([M, N], stddev=0.1))
B4 = tf.Variable(tf.zeros([N]))
W5 = tf.Variable(tf.truncated_normal([N, 10], stddev=0.1))
B5 = tf.Variable(tf.zeros([10]))
init = tf.global_variables_initializer()
# ----- Model -----
# the model Y = WX+b
# reshape is used to flatted the image into a 1D array of 784 locations
# -1 is used to tell python to figure the reshape as there's only on solution
#Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, 784]), W) + b)
Y1 = tf.nn.relu(tf.matmul(X, W1) + B1)
Y2 = tf.nn.relu(tf.matmul(Y1, W2) + B2)
Y3 = tf.nn.relu(tf.matmul(Y2, W3) + B3)
Y4 = tf.nn.relu(tf.matmul(Y3, W4) + B4)
Y5 = tf.nn.softmax(tf.matmul(Y4, W5) + B5)
# placeholder for correct answers
# e.g. correct answer for 2 will be [0 0 1 0 0 0 0 0 0 0 ]
Y_ = tf.placeholder(tf.float32, [None, 10])
# the loss function
cross_entropy = tf.reduce_sum(Y_ * tf.log(Y5)) * -1
# ----- Success Metrics -----
# calculate the % of correct answers found in batch
is_correct = tf.equal(tf.argmax(Y5, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
# ----- Training Step -----
# pick an optimizer and tell it to minimize the cross entropy loss function
optimizer = tf.train.GradientDescentOptimizer(0.003)
train_step = optimizer.minimize(cross_entropy)
# create the execution session
sess = tf.Session()
sess.run(init)
for i in range(10000):
# load a batch of images from mnist
batch_X, batch_Y = mnist.train.next_batch(100)
train_data = {X: batch_X, Y_: batch_Y}
# ----- Execution -----
# train
sess.run(train_step, feed_dict=train_data)
# test for success
a, c = sess.run([accuracy, cross_entropy], feed_dict=train_data)
# this is only to display information
if i%100 == 0:
# check for success on whole data set
test_data = {X: mnist.test.images, Y_:mnist.test.labels}
a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
print(a)
It is the accuracy on the validation set which collapses. right ?
so, you may be dramatically overfitting.
98% is possibly the best you can achieve with a network with such a capacity/structure.
Related
I had a curious experience with Keras.
Info: input dataset shapes
16 features, 5000 observations
target variable: 1 dimension
Problem: Regression
While writing code for students I developed a toy network using tf using the following code (I know is not a complete example but I hope it will give you enough information)
n1 = 15 # Number of neurons in layer 1
n2 = 15 # Number of neurons in layer 2
n3 = 15
nx = number_of_x_points
n_dim = nx
n4 = 1
stddev_f = 0.1
tf.set_random_seed(5)
X = tf.placeholder(tf.float32, [n_dim, None])
Y = tf.placeholder(tf.float32, [10, None])
W1 = tf.Variable(tf.random_normal([n1, n_dim], stddev=stddev_f))
b1 = tf.Variable(tf.constant(0.0, shape = [n1,1]) )
W2 = tf.Variable(tf.random_normal([n2, n1], stddev=stddev_f))
b2 = tf.Variable(tf.constant(0.0, shape = [n2,1]))
W3 = tf.Variable(tf.random_normal([n3,n2], stddev = stddev_f))
b3 = tf.Variable(tf.constant(0.0, shape = [n3,1]))
W4 = tf.Variable(tf.random_normal([n4,n3], stddev = stddev_f))
b4 = tf.Variable(tf.constant(0.0, shape = [n4,1]))
X = tf.placeholder(tf.float32, [nx, None]) # Inputs
Y = tf.placeholder(tf.float32, [1, None]) # Labels
Z1 = tf.nn.sigmoid(tf.matmul(W1, X) + b1) # n1 x n_dim * n_dim x n_obs = n1 x n_obs
Z2 = tf.nn.sigmoid(tf.matmul(W2, Z1) + b2) # n2 x n1 * n1 * n_obs = n2 x n_obs
Z3 = tf.nn.sigmoid(tf.matmul(W3, Z2) + b3)
Z4 = tf.matmul(W4, Z3) + b4
y_ = tf.sigmoid(Z4)
cost = tf.reduce_mean(tf.square(y_-Y))
learning_rate = 0.005
training_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
training_epochs = 1000
cost_history = np.empty(shape=[1], dtype = float)
cost_meas_history = np.empty(shape=[1], dtype = float)
train_x = np.transpose(data)
train_y = np.transpose(targets)
cost_history = []
for epoch in range(training_epochs+1):
for i in range(0, train_x.shape[0], batch_size):
x_batch = train_x[i:i + batch_size,:]
y_batch = train_y[i:i + batch_size,:]
sess.run(training_step, feed_dict = {X: x_batch, Y: y_batch})
cost_ = sess.run(cost, feed_dict={ X:train_x, Y: train_y})
cost_history = np.append(cost_history, cost_)
if (epoch % 5000 == 0):
print("Reached epoch",epoch,"cost J =", cost_)
this code is working quite well and it takes on my laptop for 1000 epochs 5 sec. Now I developed the same network with keras with the code
model = tf.keras.Sequential()
model.add(layers.Dense(15, input_dim=16, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=tf.train.AdamOptimizer(0.005),
loss='mse',
metrics=['mae'])
# Training Phase
model.fit(train_x.transpose(), train_y.transpose()/100.0, epochs=1000, batch_size=100,verbose = 0)
This code takes 43 sec. Has anyone any idea what this is the case? Now I expected Keras to be slower but not that much slower. What am I missing?
Thanks, Umberto
Ok I found the reason... It was my mistake. Due to a series of mistakes, due to programming at night after midnight (...), I realized I was comparing batch GD and mini-batch GD. My apologies to everyone and thanks to today that noticed my mistake...
If someone thinks this should be deleted is fine with me.
Now Keras and plain TF are taking exactly the same time. Thanks everyone for reading.
Best, Umberto
import numpy as np
import tensorflow as tf
import pandas as pd
data = pd.read_csv('mnist_train.csv')
X = data.drop('label', axis=1).values
y = data['label'].values
with tf.Session() as sess:
Y = tf.one_hot(y, 10).eval()
hidden = [5, 4, 3]
def costa(y, yhat):
loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat, labels=y)
loss = tf.reduce_sum(loss)
return loss
def train(cost):
train_op = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)
return train_op
with tf.Graph().as_default():
X1 = tf.placeholder(tf.float32, [None, 784])
y1 = tf.placeholder(tf.float32, [None, 10])
w1 = tf.Variable(tf.random_normal((784, hidden[0])))
w2 = tf.Variable(tf.random_normal((hidden[0], hidden[1])))
w3 = tf.Variable(tf.random_normal((hidden[1], hidden[2])))
wo = tf.Variable(tf.random_normal((hidden[2], 10)))
b1 = tf.Variable(tf.random_normal((1, hidden[0])))
b2 = tf.Variable(tf.random_normal((1, hidden[1])))
b3 = tf.Variable(tf.random_normal((1, hidden[2])))
bo = tf.Variable(tf.random_normal((1, 10)))
layer1 = tf.nn.relu(tf.matmul(X1, w1) + b1)
layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2)
layer3 = tf.nn.relu(tf.matmul(layer2, w3) + b3)
layerout = (tf.matmul(layer3, wo) + bo)
yhat = layerout
cost = costa(y1, yhat)
train_op = train(cost)
init_op = tf.global_variables_initializer()
for epoch in range(1000):
with tf.Session() as sess:
sess.run(init_op)
sess.run(train_op, feed_dict={X1:X, y1:Y})
loss = sess.run(cost, feed_dict={X1:X, y1:Y})
print("Loss for epoch {}: {}".format(epoch, loss))
The loss stays around the same, jumps up and down a lot, but does not decrease accordingly.
I can't seem to find what is going wrong here, any help would be appeciated.
Is it the activations to the layers or am I getting the cost function wrong?
There are a couple of issues here:
You are running sess.run(init_op) every epoch. This means that the model parameters are being reset to random numbers every epoch, and therefore will be unable to learn. Try putting this op before for epoch in range(1000)
You are creating a new session every epoch. Change your code so it looks like this:
with tf.Session() as sess:
sess.run(init_op)
for epoch in range(1000):
sess.run(train_op, feed_dict={X1:X, y1:Y})
loss = sess.run(cost, feed_dict={X1:X, y1:Y})
print("Loss for epoch {}: {}".format(epoch, loss))
Initialising weights with a standard deviation of (2.0/neurons_in_prev_layer)**0.5 worked like a charm for me!
Also changed the hidden layers to, 2 hidden layers of 256, 256 neurons.
Okay one little tweak did the trick, I used RMSPropOptimizer instead and the loss started decreasing as expected.
I still have to figure out as to why this works, I’m still learning, but for now this is the solution I have.
Although the loss decreases very slowly.
I try to build a model that will identify by data and try to see the LOSS function
loss =tf.reduce_mean(-(y_ * tf.log(y)+(1- y_)* tf.log (1-y)))
But as of now I only get NAN on the prediction and printing NAN in the LOSS function
np_labels = np.array(labels)
np_labels = np_labels.reshape([np_labels.shape[0], 1])
features = 910
hidden_layer_nodes = 100
x = tf.placeholder(tf.float32, [None, features])
y_ = tf.placeholder(tf.float32, [None, 1])
W1 = tf.Variable(tf.truncated_normal([features,hidden_layer_nodes], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, shape=[hidden_layer_nodes]))
z1 = tf.add(tf.matmul(x,W1),b1)
a1 = tf.nn.relu(z1)
W2 = tf.Variable(tf.truncated_normal([hidden_layer_nodes,1], stddev=0.1))
b2 = tf.Variable(0.)
z2 = tf.matmul(a1,W2) + b2
y = 1 / (1.0 + tf.exp(-z2))
loss =tf.reduce_mean(-(y_ * tf.log(y)+(1- y_)* tf.log (1-y)))
update = tf.train.AdamOptimizer(0.01).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(0,50):
sess.run(update, feed_dict = {x:fvecs, y_:np_labels})
print(sess.run(loss, feed_dict={x: fvecs, y_: np_labels}))
# sess.run(update, feed_dict = {x:data_x, y_:data_y})
# print(sess.run(loss, feed_dict={x: data_x, y_: data_y}))
print('prediction: ', y.eval(session=sess, feed_dict = {x:[[493.9, 702.6, .....
i want to print the loss
Thanks
This is not a TensorFlow-Issue. This results from the very bad idea of implementing the loss-function yourself.
import tensorflow as tf
z2 = tf.random_normal([8, 10]) * 20
y_ = tf.random_uniform([8, 1], minval=0, maxval=10, dtype=tf.float32)
y = 1 / (1.0 + tf.exp(-z2))
loss = tf.reduce_mean(-(y_ * tf.log(y)+(1- y_)* tf.log (1-y)))
with tf.Session() as sess:
print sess.run(loss) # will always fail with high prob
Will give Inf just because of missing the log-sum-exp trick which then causes your implementation to fail due to numerical instabilities (a folklore example which produces overflows). Just run this code several times and you get either NaN or Inf.
Solution would be:
replace y = tf.sigmoid(-z2) by y = tf.identity(z2) to just get the untransformed logits
replace loss = .. by loss = tf.nn.sigmoid_cross_entropy_with_logits(...) to use the numerical stable way
See the docs of sigmoid_cross_entropy_with_logits which explicitly describes this issue.
I'm building a simple neural network that takes 3 values and gives 2 outputs.
I'm getting an accuracy of 67.5% and an average cost of 0.05
I have a training dataset of 1000 examples and 500 testing examples. I plan on making a larger dataset in the near future.
A little while ago I managed to get an accuracy of about 82% and sometimes a bit higher, but the cost was quite high.
I've been experimenting with adding another layer which is currently in the model and that is the reason I have got the loss under 1.0
I'm not sure what is going wrong, I'm new to Tensorflow and NNs in general.
Here is my code:
import tensorflow as tf
import numpy as np
import sys
sys.path.insert(0, '.../Dataset/Testing/')
sys.path.insert(0, '.../Dataset/Training/')
#other files
from TestDataNormaliser import *
from TrainDataNormaliser import *
learning_rate = 0.01
trainingIteration = 10
batchSize = 100
displayStep = 1
x = tf.placeholder("float", [None, 3])
y = tf.placeholder("float", [None, 2])
#layer 1
w1 = tf.Variable(tf.truncated_normal([3, 4], stddev=0.1))
b1 = tf.Variable(tf.zeros([4]))
y1 = tf.matmul(x, w1) + b1
#layer 2
w2 = tf.Variable(tf.truncated_normal([4, 4], stddev=0.1))
b2 = tf.Variable(tf.zeros([4]))
#y2 = tf.nn.sigmoid(tf.matmul(y1, w2) + b2)
y2 = tf.matmul(y1, w2) + b2
w3 = tf.Variable(tf.truncated_normal([4, 2], stddev=0.1))
b3 = tf.Variable(tf.zeros([2]))
y3 = tf.nn.sigmoid(tf.matmul(y2, w3) + b3) #sigmoid
#output
#wO = tf.Variable(tf.truncated_normal([2, 2], stddev=0.1))
#bO = tf.Variable(tf.zeros([2]))
a = y3 #tf.nn.softmax(tf.matmul(y2, wO) + bO) #y2
a_ = tf.placeholder("float", [None, 2])
#cost function
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(a)))
#cross_entropy = -tf.reduce_sum(y*tf.log(a))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cross_entropy)
#training
init = tf.global_variables_initializer() #initialises tensorflow
with tf.Session() as sess:
sess.run(init) #runs the initialiser
writer = tf.summary.FileWriter(".../Logs")
writer.add_graph(sess.graph)
merged_summary = tf.summary.merge_all()
for iteration in range(trainingIteration):
avg_cost = 0
totalBatch = int(len(trainArrayValues)/batchSize) #1000/100
#totalBatch = 10
for i in range(batchSize):
start = i
end = i + batchSize #100
xBatch = trainArrayValues[start:end]
yBatch = trainArrayLabels[start:end]
#feeding training data
sess.run(optimizer, feed_dict={x: xBatch, y: yBatch})
i += batchSize
avg_cost += sess.run(cross_entropy, feed_dict={x: xBatch, y: yBatch})/totalBatch
if iteration % displayStep == 0:
print("Iteration:", '%04d' % (iteration + 1), "cost=", "{:.9f}".format(avg_cost))
#
print("Training complete")
predictions = tf.equal(tf.argmax(a, 1), tf.argmax(y, 1))
accuracy = tf.reduce_mean(tf.cast(predictions, "float"))
print("Accuracy:", accuracy.eval({x: testArrayValues, y: testArrayLabels}))
A few important notes:
You don't have non-linearities between your layers. This means you're training a network which is equivalent to a single-layer network, just with a lot of wasted computation. This is easily solved by adding a simple non-linearity, e.g. tf.nn.relu after each matmul/+ bias line, e.g. y2 = tf.nn.relu(y2) for all bar the last layer.
You are using a numerically unstable cross entropy implementation. I'd encourage you to use tf.nn.sigmoid_cross_entropy_with_logits, and removing your explicit sigmoid call (the input to your sigmoid function is what is generally referred to as the logits, or 'logistic units').
It seems you are not shuffling your dataset as you go. This could be particularly bad given your choice of optimizer, which leads us to...
Stochastic gradient descent is not great. For a boost without adding too much complication, consider using MomentumOptimizer instead. AdamOptimizer is my go-to, but play around with them.
When it comes to writing clean, maintainable code, I'd also encourage you to consider the following:
Use higher level APIs, e.g. tf.layers. It's good you know what's going on at a variable level, but it's easy to make a mistake with all that replicated code, and the default values with the layer implementations are generally pretty good
Consider using the tf.data.Dataset API for your data input. It's a bit scary at first, but it handles a lot of things like batching, shuffling, repeating epochs etc. very nicely
Consider using something like the tf.estimator.Estimator API for handling session runs, summary writing and evaluation.
With all those changes, you might have something that looks like the following (I've left your code in so you can roughly see the equivalent lines).
For graph construction:
def get_logits(features):
"""tf.layers API is cleaner and has better default values."""
# #layer 1
# w1 = tf.Variable(tf.truncated_normal([3, 4], stddev=0.1))
# b1 = tf.Variable(tf.zeros([4]))
# y1 = tf.matmul(x, w1) + b1
x = tf.layers.dense(features, 4, activation=tf.nn.relu)
# #layer 2
# w2 = tf.Variable(tf.truncated_normal([4, 4], stddev=0.1))
# b2 = tf.Variable(tf.zeros([4]))
# y2 = tf.matmul(y1, w2) + b2
x = tf.layers.dense(x, 4, activation=tf.nn.relu)
# w3 = tf.Variable(tf.truncated_normal([4, 2], stddev=0.1))
# b3 = tf.Variable(tf.zeros([2]))
# y3 = tf.nn.sigmoid(tf.matmul(y2, w3) + b3) #sigmoid
# N.B Don't take a non-linearity here.
logits = tf.layers.dense(x, 1, actiation=None)
# remove unnecessary final dimension, batch_size * 1 -> batch_size
logits = tf.squeeze(logits, axis=-1)
return logits
def get_loss(logits, labels):
"""tf.nn.sigmoid_cross_entropy_with_logits is numerically stable."""
# #cost function
# cross_entropy = tf.reduce_mean(-tf.reduce_sum(y * tf.log(a)))
return tf.nn.sigmoid_cross_entropy_with_logits(
logits=logits, labels=labels)
def get_train_op(loss):
"""There are better options than standard SGD. Try the following."""
learning_rate = 1e-3
# optimizer = tf.train.GradientDescentOptimizer(learning_rate)
optimizer = tf.train.MomentumOptimizer(learning_rate)
# optimizer = tf.train.AdamOptimizer(learning_rate)
return optimizer.minimize(loss)
def get_inputs(feature_data, label_data, batch_size, n_epochs=None,
shuffle=True):
"""
Get features and labels for training/evaluation.
Args:
feature_data: numpy array of feature data.
label_data: numpy array of label data
batch_size: size of batch to be returned
n_epochs: number of epochs to train for. None will result in repeating
forever/until stopped
shuffle: bool flag indicating whether or not to shuffle.
"""
dataset = tf.data.Dataset.from_tensor_slices(
(feature_data, label_data))
dataset = dataset.repeat(n_epochs)
if shuffle:
dataset = dataset.shuffle(len(feature_data))
dataset = dataset.batch(batch_size)
features, labels = dataset.make_one_shot_iterator().get_next()
return features, labels
For session running you could use this like you have (what I'd call 'the hard way')...
features, labels = get_inputs(
trainArrayValues, trainArrayLabels, batchSize, n_epochs, shuffle=True)
logits = get_logits(features)
loss = get_loss(logits, labels)
train_op = get_train_op(loss)
init = tf.global_variables_initializer()
# monitored sessions have the `should_stop` method, which works with datasets
with tf.train.MonitoredSession() as sess:
sess.run(init)
while not sess.should_stop():
# get both loss and optimizer step in the same session run
loss_val, _ = sess.run([loss, train_op])
print(loss_val)
# save variables etc, do evaluation in another graph with different inputs?
but I think you're better off using a tf.estimator.Estimator, though some people prefer tf.keras.Models.
def model_fn(features, labels, mode):
logits = get_logits(features)
loss = get_loss(logits, labels)
train_op = get_train_op(loss)
predictions = tf.greater(logits, 0)
accuracy = tf.metrics.accuracy(labels, predictions)
return tf.estimator.EstimatorSpec(
mode=mode, loss=loss, train_op=train_op,
eval_metric_ops={'accuracy': accuracy}, predictions=predictions)
def train_input_fn():
return get_inputs(trainArrayValues, trainArrayLabels, batchSize)
def eval_input_fn():
return get_inputs(
testArrayValues, testArrayLabels, batchSize, n_epochs=1, shuffle=False)
# Where variables and summaries will be saved to
model_dir = './model'
estimator = tf.estimator.Estimator(model_fn, model_dir)
estimator.train(train_input_fn, max_steps=max_steps)
estimator.evaluate(eval_input_fn)
Note if you use estimators the variables will be saved after training, so you won't need to re-train each time. If you want to reset, just delete the model_dir.
I see that you are using a softmax loss with sigmoidal activation functions in the last layer. Now let me explain the difference between softmax activations and sigmoidal.
You are now allowing the output of the network to be y=(0, 1), y=(1, 0), y=(0, 0) and y=(1, 1). This is because your sigmoidal activations "squish" each element in y between 0 and 1. Your loss function, however, assumes that your y vector sums to one.
What you need to do here is either to penalise the sigmoidal cross entropy function, which looks like this:
-tf.reduce_sum(y*tf.log(a))-tf.reduce_sum((1-y)*tf.log(1-a))
Or, if you want a to sum to one, you need to use softmax activations in your final layer (to get your a's) instead of sigmoids, which is implemented like this
exp_out = tf.exp(y3)
a = exp_out/tf reduce_sum(exp_out)
Ps. I'm using my phone on a train so please excuse typos
I am learning TensorFlow and was implementing a simple neural network as explained in MNIST for Beginners in TensorFlow docs. Here is the link. The accuracy was about 80-90 %, as expected.
Then following the same article was MNIST for Experts using ConvNet. Instead of implementing that I decided to improve the beginner part. I know about Neural Nets and how they learn and the fact that deep networks can perform better than shallow networks. I modified the original program in MNIST for Beginner to implement a Neural network with 2 hidden layers each of 16 neurons.
It looks something like this :
Image of Network
Code for it
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
x = tf.placeholder(tf.float32, [None, 784], 'images')
y = tf.placeholder(tf.float32, [None, 10], 'labels')
# We are going to make 2 hidden layer neurons with 16 neurons each
# All the weights in network
W0 = tf.Variable(dtype=tf.float32, name='InputLayerWeights', initial_value=tf.zeros([784, 16]))
W1 = tf.Variable(dtype=tf.float32, name='HiddenLayer1Weights', initial_value=tf.zeros([16, 16]))
W2 = tf.Variable(dtype=tf.float32, name='HiddenLayer2Weights', initial_value=tf.zeros([16, 10]))
# All the biases for the network
B0 = tf.Variable(dtype=tf.float32, name='HiddenLayer1Biases', initial_value=tf.zeros([16]))
B1 = tf.Variable(dtype=tf.float32, name='HiddenLayer2Biases', initial_value=tf.zeros([16]))
B2 = tf.Variable(dtype=tf.float32, name='OutputLayerBiases', initial_value=tf.zeros([10]))
def build_graph():
"""This functions wires up all the biases and weights of the network
and returns the last layer connections
:return: returns the activation in last layer of network/output layer without softmax
"""
A1 = tf.nn.relu(tf.matmul(x, W0) + B0)
A2 = tf.nn.relu(tf.matmul(A1, W1) + B1)
return tf.matmul(A2, W2) + B2
def print_accuracy(sx, sy, tf_session):
"""This function prints the accuracy of a model at the time of invocation
:return: None
"""
correct_prediction = tf.equal(tf.argmax(y), tf.argmax(tf.nn.softmax(build_graph())))
correct_prediction_float = tf.cast(correct_prediction, dtype=tf.float32)
accuracy = tf.reduce_mean(correct_prediction_float)
print(accuracy.eval(feed_dict={x: sx, y: sy}, session=tf_session))
y_predicted = build_graph()
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_predicted))
model = tf.train.GradientDescentOptimizer(0.03).minimize(cross_entropy)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for _ in range(1000):
batch_x, batch_y = mnist.train.next_batch(50)
if _ % 100 == 0:
print_accuracy(batch_x, batch_y, sess)
sess.run(model, feed_dict={x: batch_x, y: batch_y})
The Output expected was supposed to be better than what could be achieved with when just a single layer (Assumed that W0 has shape of [784,10] and B0 has shape of [10])
def build_graph():
return tf.matmul(x,W0) + B0
Instead, the output says that network was not training at all. The Accuracy was not crossing 20% in any iteration.
Output
Extracting MNIST_data/train-images-idx3-ubyte.gz
Extracting MNIST_data/train-labels-idx1-ubyte.gz
Extracting MNIST_data/t10k-images-idx3-ubyte.gz
Extracting MNIST_data/t10k-labels-idx1-ubyte.gz
0.1
0.1
0.1
0.1
0.1
0.1
0.1
0.1
0.1
0.1
My Question
What is wrong with the above program that it does not generalize at all? How can I improve it more without using convolutional neural networks?
Your main mistake is network symmetry, because you initialized all weights to zeros. As a result, the weights are never updated. Change it to small random numbers and it will start learning. It's ok to initialize biases with zeros.
Another issue is pure technical: print_accuracy function is creating new nodes in the computational graph, and since you're calling it in the loop, the graph gets bloated and eventually will use up all memory.
You might also want to play with hyper-parameters and make the network bigger to increase its capacity.
Edit: I also spotted a bug in your accuracy calculation. It should be
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_predicted, 1))
Here's a complete code:
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
x = tf.placeholder(tf.float32, [None, 784], 'images')
y = tf.placeholder(tf.float32, [None, 10], 'labels')
W0 = tf.Variable(dtype=tf.float32, name='InputLayerWeights', initial_value=tf.truncated_normal([784, 16]) * 0.001)
W1 = tf.Variable(dtype=tf.float32, name='HiddenLayer1Weights', initial_value=tf.truncated_normal([16, 16]) * 0.001)
W2 = tf.Variable(dtype=tf.float32, name='HiddenLayer2Weights', initial_value=tf.truncated_normal([16, 10]) * 0.001)
B0 = tf.Variable(dtype=tf.float32, name='HiddenLayer1Biases', initial_value=tf.ones([16]))
B1 = tf.Variable(dtype=tf.float32, name='HiddenLayer2Biases', initial_value=tf.ones([16]))
B2 = tf.Variable(dtype=tf.float32, name='OutputLayerBiases', initial_value=tf.ones([10]))
A1 = tf.nn.relu(tf.matmul(x, W0) + B0)
A2 = tf.nn.relu(tf.matmul(A1, W1) + B1)
y_predicted = tf.matmul(A2, W2) + B2
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_predicted, 1))
correct_prediction_float = tf.cast(correct_prediction, dtype=tf.float32)
accuracy = tf.reduce_mean(correct_prediction_float)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=y_predicted))
optimizer = tf.train.AdamOptimizer(0.001).minimize(cross_entropy)
mnist = input_data.read_data_sets('mnist', one_hot=True)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(20000):
batch_x, batch_y = mnist.train.next_batch(64)
_, cost_val, acc_val = sess.run([optimizer, cross_entropy, accuracy], feed_dict={x: batch_x, y: batch_y})
if i % 100 == 0:
print('cost=%.3f accuracy=%.3f' % (cost_val, acc_val))