TensorFlow for binary classification - python

I am trying to adapt this MNIST example to binary classification.
But when changing my NLABELS from NLABELS=2 to NLABELS=1, the loss function always returns 0 (and accuracy 1).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Number of output units. NLABELS = 2 trains normally; NLABELS = 1 is the
# configuration in which the loss is always 0 and the accuracy always 1.
NLABELS = 1

# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
sess = tf.InteractiveSession()

# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.Variable(tf.zeros([784, NLABELS]), name='weights')
# `name` is an argument of tf.Variable, not of the tf.zeros initial value.
b = tf.Variable(tf.zeros([NLABELS]), name='bias')
# NOTE: softmax normalises across the NLABELS outputs, so with a single
# output unit y is 1.0 for every example; log(y) is then 0 and the
# cross-entropy below is identically 0.
y = tf.nn.softmax(tf.matmul(x, W) + b)

# Add summary ops to collect data
_ = tf.histogram_summary('weights', W)
_ = tf.histogram_summary('biases', b)
_ = tf.histogram_summary('y', y)

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')

# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
    cross_entropy = -tf.reduce_mean(y_ * tf.log(y))
    _ = tf.scalar_summary('cross entropy', cross_entropy)
with tf.name_scope('train'):
    train_step = tf.train.GradientDescentOptimizer(10.).minimize(cross_entropy)
with tf.name_scope('test'):
    # NOTE: with one column, argmax over axis 1 is always 0 for both y and
    # y_, so every prediction counts as correct.
    correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    _ = tf.scalar_summary('accuracy', accuracy)

# Merge all the summaries and write them out to the 'logs' directory
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter('logs', sess.graph_def)

# Variables must be initialized before any op that reads them is run.
tf.initialize_all_variables().run()

# Train the model; every 10 steps also evaluate on the test data and record
# summaries.
for i in range(1000):
    if i % 10 == 0:  # Record summary data and the accuracy
        # Keep only the first NLABELS columns of the one-hot labels.
        labels = mnist.test.labels[:, 0:NLABELS]
        feed = {x: mnist.test.images, y_: labels}
        summary_str, acc, loss = sess.run(
            [merged, accuracy, cross_entropy], feed_dict=feed)
        writer.add_summary(summary_str, i)
        print('Accuracy at step %s: %s - loss: %f' % (i, acc, loss))
    batch_xs, batch_ys = mnist.train.next_batch(100)
    batch_ys = batch_ys[:, 0:NLABELS]
    feed = {x: batch_xs, y_: batch_ys}
    sess.run(train_step, feed_dict=feed)
I have checked the dimensions of both batch_ys (fed into y_) and y, and they are both 1xN matrices when NLABELS=1, so the problem seems to occur before that point. Maybe it has something to do with the matrix multiplication?
I actually have got this same problem in a real project, so any help would be appreciated... Thanks!

The original MNIST example uses a one-hot encoding to represent the labels in the data: this means that if there are NLABELS = 10 classes (as in MNIST), the target output is [1 0 0 0 0 0 0 0 0 0] for class 0, [0 1 0 0 0 0 0 0 0 0] for class 1, etc. The tf.nn.softmax() operator converts the logits computed by tf.matmul(x, W) + b into a probability distribution across the different output classes, which is then compared to the fed-in value for y_.
If NLABELS = 1, this acts as if there were only a single class, and the tf.nn.softmax() op would compute a probability of 1.0 for that class, leading to a cross-entropy of 0.0, since tf.log(1.0) is 0.0 for all of the examples.
There are (at least) two approaches you could try for binary classification:
The simplest would be to set NLABELS = 2 for the two possible classes, and encode your training data as [1 0] for label 0 and [0 1] for label 1. This answer has a suggestion for how to do that.
You could keep the labels as integers 0 and 1 and use tf.nn.sparse_softmax_cross_entropy_with_logits(), as suggested in this answer.

I've been looking for good examples of how to implement binary classification in TensorFlow in a similar manner to the way it would be done in Keras. I didn't find any, but after digging through the code a bit, I think I have it figured out. I modified the problem here to implement a solution that uses sigmoid_cross_entropy_with_logits the way Keras does under the hood.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# A single sigmoid output unit is enough for binary classification.
NLABELS = 1

# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
sess = tf.InteractiveSession()

# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.get_variable('weights', [784, NLABELS],
                    initializer=tf.truncated_normal_initializer()) * 0.1
# `name` is an argument of tf.Variable, not of the tf.zeros initial value.
b = tf.Variable(tf.zeros([NLABELS]), name='bias')
logits = tf.matmul(x, W) + b

# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')

# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
    # Under the hood this is
    #   -mean(y_ * log(sigmoid(logits)) + (1 - y_) * log(1 - sigmoid(logits)))
    # but do not compute it manually: it will have gradient problems.
    entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits)
    loss = tf.reduce_mean(entropy, name='loss')

with tf.name_scope('train'):
    # Using Adam instead of plain gradient descent.
    train_step = tf.train.AdamOptimizer(learning_rate=0.002).minimize(loss)

with tf.name_scope('test'):
    # Threshold the predicted probability, not the raw logit: a probability
    # of 0.5 corresponds to a logit of 0, so `logits > 0.5` would misclassify
    # examples whose logit lies between 0 and 0.5.
    preds = tf.cast(tf.sigmoid(logits) > 0.5, tf.float32)
    correct_prediction = tf.equal(preds, y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

# Variables must be initialized before any op that reads them is run.
sess.run(tf.global_variables_initializer())

# Train the model; every 100 steps also report loss and accuracy on the
# test data.
for i in range(2000):
    if i % 100 == 0:
        # Keep only the first NLABELS columns of the one-hot labels.
        labels = mnist.test.labels[:, 0:NLABELS]
        feed = {x: mnist.test.images, y_: labels}
        loss_val, acc_val = sess.run([loss, accuracy], feed_dict=feed)
        print('Accuracy at step %s: %s - loss: %f' % (i, acc_val, loss_val))
    batch_xs, batch_ys = mnist.train.next_batch(100)
    batch_ys = batch_ys[:, 0:NLABELS]
    feed = {x: batch_xs, y_: batch_ys}
    sess.run(train_step, feed_dict=feed)
Accuracy at step 0: 0.7373 - loss: 0.758670
Accuracy at step 100: 0.9017 - loss: 0.423321
Accuracy at step 200: 0.9031 - loss: 0.322541
Accuracy at step 300: 0.9085 - loss: 0.255705
Accuracy at step 400: 0.9188 - loss: 0.209892
Accuracy at step 500: 0.9308 - loss: 0.178372
Accuracy at step 600: 0.9453 - loss: 0.155927
Accuracy at step 700: 0.9507 - loss: 0.139031
Accuracy at step 800: 0.9556 - loss: 0.125855
Accuracy at step 900: 0.9607 - loss: 0.115340
Accuracy at step 1000: 0.9633 - loss: 0.106709
Accuracy at step 1100: 0.9667 - loss: 0.099286
Accuracy at step 1200: 0.971 - loss: 0.093048
Accuracy at step 1300: 0.9714 - loss: 0.087915
Accuracy at step 1400: 0.9745 - loss: 0.083300
Accuracy at step 1500: 0.9745 - loss: 0.079019
Accuracy at step 1600: 0.9761 - loss: 0.075164
Accuracy at step 1700: 0.9768 - loss: 0.071803
Accuracy at step 1800: 0.9777 - loss: 0.068825
Accuracy at step 1900: 0.9788 - loss: 0.066270


Making batch tensorflow

I have a problem with creating batches in my code. I tried to search for how batching is done, but all I found were examples that use a helper such as next_batch from the MNIST sample program. I would really appreciate it if someone could give me some tips on how to create batches in my program below.
import tensorflow as tf
import numpy as np
from sklearn import cross_validation
import pandas as pd
# Load the input data and the labels. According to the author, data is
# 7x86594 and label is 5x86594; the placeholders below accept rows of
# 7 features and 5 class scores respectively.
data2 = pd.read_csv('rawdata.csv', sep=',', header=None)
data = np.array(data2)
label2=pd.read_csv('class.csv', sep='\t', header=None)
# NOTE(review): train_x, train_t, test_x and test_t are used below but are
# not defined in this excerpt -- presumably produced by a train/test split
# of data and label2; confirm against the full script.
# Number of units in the hidden layer.
num_units = 15
# x: input features, t: target class scores.
x = tf.placeholder(tf.float32, [None, 7])
t = tf.placeholder(tf.float32, [None, 5])
# Hidden layer: 7 -> num_units with ReLU activation.
w1 = tf.Variable(tf.truncated_normal([7, num_units], mean=0.0, stddev=0.05))
b1 = tf.Variable(tf.zeros([num_units]))
hidden1 = tf.nn.relu(tf.matmul(x, w1) + b1)
# Output layer: num_units -> 5 with softmax.
w0 = tf.Variable(tf.zeros([num_units, 5]))
b0 = tf.Variable(tf.zeros([5]))
p = tf.nn.softmax(tf.matmul(hidden1, w0) + b0)
# Cross-entropy summed over the batch; p is clipped to [1e-10, 1.0] so that
# log() never receives 0.
loss = -tf.reduce_sum(t * tf.log(tf.clip_by_value(p,1e-10,1.0)))
train_step = tf.train.AdamOptimizer().minimize(loss)
# Fraction of rows whose highest-scoring class matches the target.
correct_prediction = tf.equal(tf.argmax(p, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess = tf.InteractiveSession()
# NOTE(review): no variable initializer is run in the visible code.
# The author's attempt at batching.
# NOTE(review): the indentation of this excerpt has been lost, so it cannot
# be determined whether the 4000-step loop below was nested inside the
# `for j` loop or ran after it on the last slice only.
batch_size = 100
# Slice the training data into consecutive chunks of batch_size rows.
for j in range(0, 86594, batch_size):
xs,ys= train_x[j:j+batch_size], train_t[j:j+batch_size]
# i counts training steps so that progress is reported every 100 steps.
i = 0
for _ in range(4000):
i += 1
sess.run(train_step, feed_dict={x: xs, t: ys})
if i % 100 == 0:
# Evaluate on the held-out test arrays.
loss_val, acc_val = sess.run([loss, accuracy],feed_dict={x:test_x, t: test_t})
print ('Step: %d, Loss: %f, Accuracy: %f'% (i, loss_val, acc_val))
The result of this program, of course, isn't right.
Keep extracting the batches of your data and keep feeding them to the network for training. In each epoch, all the samples of your training dataset should be run once. So you can rewrite your code like this:
Required part of code only:
epochs = 4000
batch_size = 100
# Derive the sample count from the data rather than hard-coding 86594, so
# the loop stays correct if the training set changes size.
num_samples = len(train_x)

# One epoch = one full pass over the training set, one mini-batch at a time.
for epoch_no in range(epochs):
    for index, offset in enumerate(range(0, num_samples, batch_size)):
        # Slicing past the end is safe: the last batch is simply shorter.
        xs = train_x[offset:offset + batch_size]
        ys = train_t[offset:offset + batch_size]
        sess.run(train_step, feed_dict={x: xs, t: ys})
        if index % 100 == 0:
            # Report progress on the test set every 100 batches.
            loss_val, acc_val = sess.run([loss, accuracy],
                                         feed_dict={x: test_x, t: test_t})
            print('Epoch %d, Step: %d, Loss: %f, Accuracy: %f'
                  % (epoch_no, index, loss_val, acc_val))

unable to get the updated value of tensor in tensorflow

I used the code below for simple logistic regression. I was able to get the updated value of b: the values of b.eval() before/after training are different. However, the value of W.eval() remains the same. I was wondering what mistake I made? Thank you!
from __future__ import print_function
import tensorflow as tf
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
# Parameters
learning_rate = 0.01
training_epochs = 20
batch_size = 100
display_step = 1

# tf Graph Input
x = tf.placeholder(tf.float32, [None, 784])  # mnist data image of shape 28*28=784
y = tf.placeholder(tf.float32, [None, 10])  # 0-9 digits recognition => 10 classes

# Set model weights
W = tf.Variable(tf.random_normal([784, 10]))
b = tf.Variable(tf.zeros([10]))

# Construct model
pred = tf.nn.softmax(tf.matmul(x, W) + b)  # Softmax

# Minimize error using cross entropy
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices=1))
# Gradient Descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    # Variables hold no value until the initializer op has been run.
    sess.run(init)

    # Parameter values before training.
    print('W is:')
    print(W.eval())
    print('b is:')
    print(b.eval())

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(mnist.train.num_examples/batch_size)
        # Loop over all batches
        for i in range(total_batch):
            batch_xs, batch_ys = mnist.train.next_batch(batch_size)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_xs,
                                                          y: batch_ys})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if (epoch+1) % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Parameter values after training.
    print('W is:')
    print(W.eval())
    print('b is:')
    print(b.eval())

    # Test model
    correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    print("Accuracy:", accuracy.eval({x: mnist.test.images, y: mnist.test.labels}))
When a NumPy array is printed, only its first and last values are shown. In the case of MNIST, the weights at those indices are never updated, because the corresponding pixels are constant across images: all digits are written in the centre of the image, not along its border.
The pixels that actually vary from one input sample to another are the centre pixels, so only the corresponding weight elements get updated.
To compare weights before and after training you can use numpy.array_equal(w1, w2)
or, you can print whole numpy array by doing:
import sys

import numpy

# Raise the summarisation threshold so that arrays are printed in full
# instead of being abbreviated with "...".
numpy.set_printoptions(threshold=sys.maxsize)
or, you can compare element by element, and print only those values of array which differ by a certain threshold

TensorFlow: Regression using Deep Neural Network

I am a novice in TensorFlow but have a fair understanding of ML algorithms. I have a project to model the error characteristic of time-of-flight (ToF) cameras. As ground truth, I have acquired a set of depth images (512x424 pixels) from a stereo setup. The range images (512x424 pixels) from the ToF camera need to be compared with the reference depth images. In order to learn the error characteristic, I am implementing a deep neural network with the reference image pixels as training input data (median pixels as features) and the difference between the reference image and the range camera image as the training output value. There are 3 pairs of images to train on and 1 pair of images to test on. I have flattened the image matrices, so the training input data are 3-element lists of arrays of size 217088.
My code works without any errors but the results are ugly:
The cost reduces nicely after the first epoch but does not change much after the second epoch.
The accuracy of the test phase is horrendous.
The code is extremely slow. It takes almost 2 hours for a complete run. Maybe it has to do with the hardware: I am running it on a Core i3.
My code:
import tensorflow as tf
import numpy
import cv2
import matplotlib.pyplot as plt
import glob
# Raw strings keep the Windows backslashes literal instead of relying on
# unrecognised escape sequences such as "\I" or "\S".
refDepthImgLoc = r'M:\Internship\Scan\png\scan_dist*.png'
tofDepthImgLoc = r'M:\Internship\Scan\png\kinect_distance*.png'
numImg = 4

# Load the reference and ToF depth images as grayscale arrays.
refDepthImg = []
tofDepthImg = []
# Glob the patterns defined above (the original referenced undefined names).
# glob returns matches in arbitrary order; sorting both lists keeps the
# reference/ToF pairing produced by zip() deterministic.
refLoc = sorted(glob.glob(refDepthImgLoc))
tofLoc = sorted(glob.glob(tofDepthImgLoc))
for refImg, tofImg in zip(refLoc, tofLoc):
    refDepthImg.append(cv2.imread(refImg, 0))
    tofDepthImg.append(cv2.imread(tofImg, 0))

# Build one flattened feature array and one flattened label array per image.
trainData_median = []
trainLabel = []
for ref, tof in zip(refDepthImg, tofDepthImg):
    # Feature: 3x3 median-filtered reference depth.
    trainData_median.append(cv2.medianBlur(ref, 3).flatten())
    # Label: reference minus ToF depth. Cast to float first -- assuming the
    # images are loaded as unsigned 8-bit, a direct subtraction would wrap
    # around for negative differences (TODO confirm the loaded dtype).
    diff = ref.astype(numpy.float32) - tof.astype(numpy.float32)
    trainLabel.append(diff.flatten())

# Network dimensions and learning rate.
n_nodes_hl1 = 100
n_nodes_hl2 = 100
n_nodes_hl3 = 100
n_input = 1
n_output = 1
learning_rate = 0.01

# Placeholders for one input value and its target value.
x = tf.placeholder('float')
y = tf.placeholder('float')
def neural_network_model(data):
    """Build a three-hidden-layer ReLU network and return its output tensor.

    `data` is multiplied by an [n_input, n_nodes_hl1] weight matrix and then
    passed through two further fully connected ReLU layers. The returned
    value is the sum of all activations of the third layer.

    NOTE: because the output sums ReLU activations it can never be negative,
    and the sum runs over every element of the last layer (including any
    batch dimension).
    """
    # Each layer reads both a 'weights' and a 'biases' entry below, so both
    # are defined here.
    hidden_1_layer = {'weights': tf.Variable(tf.random_normal([n_input, n_nodes_hl1])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl1]))}
    hidden_2_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl2]))}
    hidden_3_layer = {'weights': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
                      'biases': tf.Variable(tf.random_normal([n_nodes_hl3]))}

    l1 = tf.add(tf.matmul(data, hidden_1_layer['weights']), hidden_1_layer['biases'])
    l1 = tf.nn.relu(l1)

    l2 = tf.add(tf.matmul(l1, hidden_2_layer['weights']), hidden_2_layer['biases'])
    l2 = tf.nn.relu(l2)

    l3 = tf.add(tf.matmul(l2, hidden_3_layer['weights']), hidden_3_layer['biases'])
    l3 = tf.nn.relu(l3)

    output = tf.reduce_sum(l3)
    return output
def train_neural_network(x):
    """Train the network on the first numImg - 1 images and test on the last.

    Training feeds one (input, label) pair per step, for hm_epochs passes
    over the training images, and prints the accumulated cost per epoch.
    Testing counts a sample as correct when the absolute prediction error is
    below 0.5 and prints the fraction of correct samples.

    NOTE: one sess.run call per pixel makes every epoch very slow.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_sum(tf.square(prediction - y)) / ((numImg - 1) * len(trainLabel[0]))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    hm_epochs = 10
    with tf.Session() as sess:
        # Variables must be initialized before the first training step.
        sess.run(tf.global_variables_initializer())

        for epoch in range(hm_epochs):
            tempLoss = 0
            for i in range(numImg - 1):
                for (X, Y) in zip(trainData_median[i], trainLabel[i]):
                    _, c = sess.run([optimizer, cost], feed_dict={x: [[X]], y: [[Y]]})
                    tempLoss += c
            print('Epoch', (epoch+1), 'completed out of',hm_epochs,'loss:',tempLoss)

        print("Testing starts now")
        # Absolute error between the prediction and the fed target.
        test = tf.abs(prediction - y)
        pred = numpy.zeros(len(trainLabel[0]))
        result = numpy.zeros(len(trainLabel[0]))
        test_pairs = zip(trainData_median[numImg - 1], trainLabel[numImg - 1])
        for i, (X, Y) in enumerate(test_pairs):
            correct, pred[i] = sess.run([test, prediction], feed_dict={x: [[X]], y: [[Y]]})
            if correct < 0.5:
                result[i] = 1
        accuracy = tf.reduce_mean(tf.cast(result, 'float'))
        print('Accuracy:', accuracy.eval())
The output:
Epoch 1 completed out of 10 loss: 204681865.46
Epoch 2 completed out of 10 loss: 3188.81297796
Epoch 3 completed out of 10 loss: 3183.35926716
Epoch 4 completed out of 10 loss: 3181.37895241
Epoch 5 completed out of 10 loss: 3179.95276242
Epoch 6 completed out of 10 loss: 3178.51366003
Epoch 7 completed out of 10 loss: 3177.6227609
Epoch 8 completed out of 10 loss: 3176.69995104
Epoch 9 completed out of 10 loss: 3176.85162593
Epoch 10 completed out of 10 loss: 3177.04338937
Testing starts now
Accuracy: 0.00301721
Please comment if there is any inherent logical error in the code or the entire approach to the problem is incorrect. Should I try implementing it using CNNs? Please help me in making this work. Please let me know if any further information is required.

Super high cost Tensorflow

I'm trying to make some price prediction on a kaggle dataset with Tensorflow.
My neural network is learning, but my cost function is really high and my predictions are far from the real output.
I tried to change my network by adding or removing layers, neurons and activation functions.
I experimented a lot with my hyper-parameters, but that does not change things much.
I don't think the problem comes from my data: I checked on Kaggle, and it is the dataset that most people use.
If you have any idea why my cost is so high and how to reduce it, and if you could explain it to me, it would be really great!
Here's my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.utils import shuffle
# Load the data and keep the five feature columns plus the target column.
df = pd.read_csv(r"C:\Users\User\Documents\TENSORFLOW\Prediction prix\train2.csv", sep=';')
df = df.loc[:, ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'SalePrice']]
df = df.replace(np.nan, 0)

# `%matplotlib inline` is an IPython/Jupyter magic, not Python syntax; run it
# in a notebook cell if inline plots are wanted.
# The pairplot result gets its own name: rebinding `plt` would shadow the
# matplotlib.pyplot module imported at the top of the file.
pair_grid = sns.pairplot(df)

# Shuffle, then split into training and test rows.
# NOTE: row 1000 ends up in neither split (train is 0..999, test starts at 1001).
df = shuffle(df)
df_train = df[0:1000]
df_test = df[1001:1451]

# `.values` replaces DataFrame.as_matrix(), which newer pandas releases no
# longer provide; both return the underlying NumPy array.
inputX = df_train.drop('SalePrice', axis=1).values.astype(int)
inputY = df_train.loc[:, ['SalePrice']].values.astype(int)
inputX_test = df_test.drop('SalePrice', axis=1).values.astype(int)
inputY_test = df_test.loc[:, ['SalePrice']].values.astype(int)

# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 500
display_step = 50
n_samples = inputX.shape[0]

# x: five input features per row, y: the sale price.
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Return the output tensor of one fully connected layer.

    Args:
        inputs: tensor that is matrix-multiplied with the layer weights.
        in_size: number of input units (rows of the weight matrix).
        out_size: number of output units (columns of the weight matrix).
        activation_function: callable applied to the affine output, or None
            for a purely linear layer.
    """
    Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.1))
    biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    # Without this early return a linear layer would try to call None.
    if activation_function is None:
        return Wx_plus_b
    return activation_function(Wx_plus_b)
# Network: 5 inputs -> 3 ReLU units -> 1 linear output.
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)

# Mean squared error
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*n_samples)
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    # Variables hold no value until the initializer op has been run.
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = batch_size
        # Loop over all batches
        # NOTE: every iteration feeds the complete training set, so this is
        # batch_size full-batch gradient steps per epoch rather than a pass
        # over mini-batches.
        for i in range(total_batch):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: inputX,
                                                          y: inputY})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=",
                  "{:.9f}".format(avg_cost))

    print("Optimization Finished!")

    # Test model
    # NOTE: this compares floating-point predictions with the targets for
    # exact equality.
    correct_prediction = tf.equal(pred,y)
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: inputX, y: inputY}))
    print(sess.run(pred, feed_dict={x: inputX_test}))
Epoch: 0001 cost= 10142407502702304395526144.000000000
Epoch: 0051 cost= 3256106752.000019550
Epoch: 0101 cost= 3256106752.000019550
Epoch: 0151 cost= 3256106752.000019550
Epoch: 0201 cost= 3256106752.000019550
Thanks for your help !
I see a couple of problems with the implementation:
Inputs are not scaled.
Use sklearn's StandardScaler to scale the inputs inputX and inputY (and also inputX_test and inputY_test) so that they have zero mean and unit variance. You can use inverse_transform to convert the outputs back to the original scale.
from sklearn.preprocessing import StandardScaler

# Fit the scaler on the training inputs only, then apply the same
# transformation to the training and the test inputs.
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputX_test = sc.transform(inputX_test)
The batch_size is too large, you are passing the entire set as a single batch. This should not cause the particular problem you are facing, but for better convergence try with reduced batch size. Implement a get_batch() generator function and do the following:
# Iterate over mini-batches of the training arrays; these are named inputX
# and inputY in the script, not input_X / input_Y.
for batch_X, batch_Y in get_batch(inputX, inputY, batch_size):
    _, c = sess.run([optimizer, cost], feed_dict={x: batch_X,
                                                  y: batch_Y})
Try smaller Weights initialization (stddev) if you still see issues.
from sklearn.preprocessing import StandardScaler

# `.values` replaces DataFrame.as_matrix(), which newer pandas releases no
# longer provide; both return the underlying NumPy array.

# Training inputs, scaled with a scaler fitted on the training inputs.
inputX = df_train.drop('SalePrice', axis=1).values.astype(int)
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)

# Training targets, scaled with their own scaler; sc1.inverse_transform
# converts predictions back to prices.
inputY = df_train.loc[:, ['SalePrice']].values.astype(int)
sc1 = StandardScaler().fit(inputY)
inputY = sc1.transform(inputY)

# Test data reuses the scalers fitted on the training data.
inputX_test = df_test.drop('SalePrice', axis=1).values.astype(int)
inputX_test = sc.transform(inputX_test)
inputY_test = df_test.loc[:, ['SalePrice']].values.astype(int)
inputY_test = sc1.transform(inputY_test)

# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 50
display_step = 50
n_samples = inputX.shape[0]

# x: five input features per row, y: the scaled sale price.
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def get_batch(inputX, inputY, batch_size):
    """Yield consecutive (inputs, targets) mini-batches.

    Args:
        inputX: sliceable sequence of input rows.
        inputY: sliceable sequence of target rows, aligned with inputX.
        batch_size: maximum number of rows per batch.

    Yields:
        Tuples (inputX[start:stop], inputY[start:stop]). Every row is
        yielded exactly once; the final batch is shorter when len(inputX)
        is not a multiple of batch_size.
    """
    # Stepping over start offsets (instead of counting whole batches) keeps
    # the trailing partial batch, and still yields one batch when
    # batch_size exceeds the number of rows.
    for start in range(0, len(inputX), batch_size):
        stop = start + batch_size
        yield inputX[start:stop], inputY[start:stop]
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Return the output tensor of one fully connected layer.

    Args:
        inputs: tensor that is matrix-multiplied with the layer weights.
        in_size: number of input units (rows of the weight matrix).
        out_size: number of output units (columns of the weight matrix).
        activation_function: callable applied to the affine output, or None
            for a purely linear layer.
    """
    # Small initial weights (stddev=0.005) and zero biases.
    Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.005))
    biases = tf.Variable(tf.zeros([1, out_size]))
    Wx_plus_b = tf.matmul(inputs, Weights) + biases
    # Without this early return a linear layer would try to call None.
    if activation_function is None:
        return Wx_plus_b
    return activation_function(Wx_plus_b)
# Network: 5 inputs -> 3 ReLU units -> 1 linear output.
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)

# Mean squared error
cost = tf.reduce_mean(tf.pow(tf.subtract(pred, y), 2))
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# Initializing the variables
init = tf.global_variables_initializer()

# Launch the graph
with tf.Session() as sess:
    # Variables hold no value until the initializer op has been run.
    sess.run(init)

    # Training cycle
    for epoch in range(training_epochs):
        total_cost = 0.
        num_batches = 0
        # Loop over all batches
        for batch_x, batch_y in get_batch(inputX, inputY, batch_size):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost],
                            feed_dict={x: batch_x, y: batch_y})
            total_cost += c
            num_batches += 1
        # Average over the number of batches actually processed; dividing
        # by batch_size would only be right if the two happened to be equal.
        avg_cost = total_cost / max(num_batches, 1)
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f} ".format(avg_cost))

    print("Optimization Finished!")
I have already had a similar problem of a very high cost reached after a few training steps, and then the cost remaining constant there. For me it was a kind of overflow, with the gradients too big and creating Nan values quite early in training. I solved it by starting with a smaller learning rate (potentially much smaller), until the cost and gradients become more reasonable (a few dozen steps), and then back to a regular one (higher at the start, potentially decaying).
See my answer to this post for a similar case that was solved just by taking a smaller learning rate on start.
You can also clip your gradients to avoid this problem, using tf.clip_by_value. It sets a minimum and a maximum value for your gradients, which prevents huge gradients from sending your weights straight to NaN after the first few iterations. To use it (with min and max at -1 and 1, which is probably too tight), replace the single minimize() call with explicit gradient computation, clipping and application, as shown below:
# Before: a single minimize() call computes and applies the raw gradients.
#   optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)

# After: compute the gradients, clip each one to [-1, 1], then apply them.
opt = tf.train.GradientDescentOptimizer(learning_rate)
gvs = opt.compute_gradients(cost)
# compute_gradients() pairs a variable with None when the cost does not
# depend on it; tf.clip_by_value cannot take None, so skip those pairs.
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
              for grad, var in gvs if grad is not None]
optimizer = opt.apply_gradients(capped_gvs)

Multilevel neural network

I am attempting to complete the following tensorflow tutorial and (attempting problem 4): https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/3_regularization.ipynb
However, I think I might be setting up the arrays of weights below wrong. As soon as I change hidden_layer to [image_size * image_size,1024,num_labels] (i.e. just one hidden layer), this works fine. Currently I am getting NaNs for the loss.
One possible solution is that the block
# Chain the remaining hidden layers: each iteration feeds the previous
# layer's output into the next layer.
for i in range(1,len(weights)-1):
    relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]),p_hide)
is causing the problems since I am destroying the past value of relus and Neural Nets need them to do backpropagation. In fact when there is one hidden layer this block does not get executed.
batch_size = 128
# Layer sizes from input to output: input pixels, two hidden layers, labels.
hidden_layer = [image_size * image_size, 1024, 300, num_labels]
l2_regulariser = 0.005
p_hide = 0.5

graph = tf.Graph()
with graph.as_default():
    # Input data. For the training data, we use a placeholder that will be fed
    # at run time with a training minibatch.
    tf_train_dataset = tf.placeholder(tf.float32, shape=(batch_size, image_size * image_size))
    tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
    tf_valid_dataset = tf.constant(valid_dataset)
    tf_test_dataset = tf.constant(test_dataset)

    # Variables: one weight matrix and one bias vector per pair of
    # consecutive layer sizes.
    # NOTE: truncated_normal is called without stddev here, and the learning
    # rate below starts at 0.5; this is the configuration that yields NaN.
    weights = [None]*(len(hidden_layer)-1)
    biases = [None]*(len(hidden_layer)-1)
    for i in range(len(weights)):
        weights[i] = tf.Variable(tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]]))
        biases[i] = tf.Variable(tf.zeros([hidden_layer[i+1]]))

    # Training computation: hidden layers use ReLU followed by dropout, the
    # last layer produces the logits.
    relus = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, weights[0]) + biases[0]), p_hide)
    for i in range(1, len(weights)-1):
        relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]), p_hide)
    logits = tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1]

    # Loss: mean softmax cross-entropy plus an L2 penalty on every weight
    # matrix.
    loss = 0
    for weight in weights:
        loss += tf.nn.l2_loss(weight)
    loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels)) + l2_regulariser*loss

    # Optimizer.
    global_step = tf.Variable(0)  # count the number of steps taken.
    learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps=20, decay_rate=0.9)
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)

    # Predictions for the training, validation, and test data.
    # The validation and test passes reuse the same weights without dropout.
    train_prediction = tf.nn.softmax(logits)

    relus = tf.nn.relu(tf.matmul(tf_valid_dataset, weights[0]) + biases[0])
    for i in range(1, len(weights)-1):
        relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
    valid_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])

    relus = tf.nn.relu(tf.matmul(tf_test_dataset, weights[0]) + biases[0])
    for i in range(1, len(weights)-1):
        relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
    test_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])
# The NN training part
num_steps = 3001

with tf.Session(graph=graph) as session:
    # Variables must be initialized before the first training step.
    tf.global_variables_initializer().run()
    for step in range(num_steps):
        # Pick an offset within the training data, which has been randomized.
        # Note: we could use better randomization across epochs.
        offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
        # Generate a minibatch.
        batch_data = train_dataset[offset:(offset + batch_size), :]
        batch_labels = train_labels[offset:(offset + batch_size), :]
        # Prepare a dictionary telling the session where to feed the minibatch.
        # The key of the dictionary is the placeholder node of the graph to be fed,
        # and the value is the numpy array to feed to it.
        # global_step is fed as well, so the learning-rate decay follows the
        # loop counter.
        feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, global_step : int(step)}
        _, l, predictions = session.run(
            [optimizer, loss, train_prediction], feed_dict=feed_dict)
        if (step % 500 == 0):
            print("Minibatch loss at step %d: %f" % (step, l))
            print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
            print("Validation accuracy: %.1f%%" % accuracy(
                valid_prediction.eval(), valid_labels))
    print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
You should better initialize your weights:
tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]], stddev=0.1)
Most of all, you should lower your learning rate to something around 0.01 or 0.001.
I think you get a loss of NaN because the learning rate is too high and it breaks the network (you get exploding weights).

