I am a novice in TensorFlow but have a fair understanding of ML algorithms. I have a project to model the error characteristics of time-of-flight (ToF) cameras. As ground truth, I have acquired a set of depth images (512x424 pixels) from a stereo setup. The range images (512x424 pixels) from the ToF camera need to be compared with the reference depth images. In order to learn the error characteristics, I am implementing a deep neural network with the reference image pixels as training input data (median-filtered pixels as features) and the difference between the reference image and the range camera image as the training output value. There are 3 pairs of images to train on and 1 pair of images to test on. I have flattened the image matrices, so the training input data is a 3-element list of arrays of size 217088.
My code works without any errors but the results are ugly:
The cost reduces nicely after the first epoch but does not change much after the second epoch.
The accuracy of the test phase is horrendous.
The code is extremely slow. It takes almost 2 hours for a complete run. Maybe it has to do with the hardware; I am running it on a Core i3.
My code:
import tensorflow as tf
import numpy
import cv2
import matplotlib.pyplot as plt
import glob
# Glob patterns for the reference (stereo) and ToF depth images. Raw strings
# keep the Windows backslashes literal without relying on them not forming
# escape sequences.
refDepthImgLoc = r'M:\Internship\Scan\png\scan_dist*.png'
tofDepthImgLoc = r'M:\Internship\Scan\png\kinect_distance*.png'
numImg = 4
refDepthImg = []
tofDepthImg = []
# Sort both listings so the i-th reference image is paired with the i-th ToF
# image; glob.glob() makes no ordering guarantee.
# NOTE(review): assumes both file sets share the same numbering suffixes.
refLoc = sorted(glob.glob(refDepthImgLoc))
tofLoc = sorted(glob.glob(tofDepthImgLoc))
for refImg, tofImg in zip(refLoc, tofLoc):
    # Flag 0 loads the image as single-channel grayscale.
    img1 = cv2.imread(refImg, 0)
    img2 = cv2.imread(tofImg, 0)
    # cv2.imread() returns None instead of raising when a file cannot be read.
    if img1 is None or img2 is None:
        raise IOError('Could not read image pair: %s, %s' % (refImg, tofImg))
    refDepthImg.append(img1)
    tofDepthImg.append(img2)

# Features: 3x3 median-filtered reference pixels, flattened per image.
# Labels: signed per-pixel difference (reference - ToF), flattened per image.
trainData_median = []
trainLabel = []
for refImgData, tofImgData in zip(refDepthImg, tofDepthImg):
    trainData_median.append(cv2.medianBlur(refImgData, 3).ravel())
    # Widen to a signed type before subtracting: the images are loaded as
    # 8-bit unsigned, so a direct subtraction would wrap around modulo 256
    # wherever the ToF value exceeds the reference value.
    tempLabel = refImgData.astype(numpy.int32) - tofImgData
    trainLabel.append(tempLabel.ravel())

# Network hyperparameters: three hidden layers of 100 units, one input feature
# (a pixel value) and one regression output (the depth error).
n_nodes_hl1 = 100
n_nodes_hl2 = 100
n_nodes_hl3 = 100
n_input = 1
n_output = 1
learning_rate = 0.01
# Placeholders for the input pixel(s) and the target error value(s).
x = tf.placeholder('float')
y = tf.placeholder('float')
def neural_network_model(data):
    """Build a three-hidden-layer ReLU network with a linear regression output.

    Args:
        data: float tensor whose last dimension is ``n_input``; the training
            code feeds it as ``[[pixel]]``, i.e. one row per sample.

    Returns:
        A tensor of shape ``[batch, n_output]`` with one prediction per input
        row. The previous version had no output layer and returned
        ``tf.reduce_sum`` over every hidden unit (and every batch row), so the
        "prediction" was an unweighted sum of 100 activations and ``n_output``
        was never used.
    """
    layer_sizes = [n_input, n_nodes_hl1, n_nodes_hl2, n_nodes_hl3]
    activation = data
    # Hidden layers: affine transform followed by ReLU.
    for fan_in, fan_out in zip(layer_sizes[:-1], layer_sizes[1:]):
        weights = tf.Variable(tf.random_normal([fan_in, fan_out]))
        biases = tf.Variable(tf.random_normal([fan_out]))
        activation = tf.nn.relu(tf.add(tf.matmul(activation, weights), biases))
    # Linear output layer: no activation, because the target (a depth error)
    # can be negative as well as positive.
    output_weights = tf.Variable(tf.random_normal([n_nodes_hl3, n_output]))
    output_biases = tf.Variable(tf.random_normal([n_output]))
    output = tf.add(tf.matmul(activation, output_weights), output_biases)
    return output
def train_neural_network(x):
    """Train the network on the first ``numImg - 1`` images and test on the last.

    Args:
        x: the input placeholder fed with pixel values.

    Prints the accumulated loss after every epoch and, after training, the
    fraction of test pixels whose absolute prediction error is below 0.5.
    """
    prediction = neural_network_model(x)
    cost = tf.reduce_sum(tf.square(prediction - y)) / ((numImg - 1) * len(trainLabel[0]))
    optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
    # Build the evaluation op together with the rest of the graph rather than
    # after training has finished.
    abs_error = tf.abs(prediction - y)
    hm_epochs = 10
    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for epoch in range(hm_epochs):
            tempLoss = 0
            for i in range(numImg - 1):
                # NOTE: one sess.run() per pixel; this per-sample feeding is
                # what dominates the run time.
                for (X, Y) in zip(trainData_median[i], trainLabel[i]):
                    _, c = sess.run([optimizer, cost], feed_dict={x: [[X]], y: [[Y]]})
                    tempLoss += c
            print('Epoch', (epoch+1), 'completed out of',hm_epochs,'loss:',tempLoss)
        print("Testing starts now")
        test_inputs = trainData_median[numImg - 1]
        test_labels = trainLabel[numImg - 1]
        # Size the result buffers from the test image itself.
        pred = numpy.zeros(len(test_labels))
        result = numpy.zeros(len(test_labels))
        for i, (X, Y) in enumerate(zip(test_inputs, test_labels)):
            error, value = sess.run([abs_error, prediction], feed_dict={x: [[X]], y: [[Y]]})
            # squeeze() accepts both a scalar and a [1, 1] prediction.
            pred[i] = numpy.squeeze(value)
            if numpy.squeeze(error) < 0.5:
                result[i] = 1
        # The hit mask is already a NumPy array, so average it directly
        # instead of adding new ops to the TensorFlow graph.
        print('Accuracy:', result.mean())


train_neural_network(x)
The output:
Epoch 1 completed out of 10 loss: 204681865.46
Epoch 2 completed out of 10 loss: 3188.81297796
Epoch 3 completed out of 10 loss: 3183.35926716
Epoch 4 completed out of 10 loss: 3181.37895241
Epoch 5 completed out of 10 loss: 3179.95276242
Epoch 6 completed out of 10 loss: 3178.51366003
Epoch 7 completed out of 10 loss: 3177.6227609
Epoch 8 completed out of 10 loss: 3176.69995104
Epoch 9 completed out of 10 loss: 3176.85162593
Epoch 10 completed out of 10 loss: 3177.04338937
Testing starts now
Accuracy: 0.00301721
Please comment if there is any inherent logical error in the code or the entire approach to the problem is incorrect. Should I try implementing it using CNNs? Please help me in making this work. Please let me know if any further information is required.
Thanks.
Related
I have a discrepancy between the val_loss's produced by model.fit and model.test_on_batch.
For model.fit, after 1 epoch of batch size 4 and 50k training set size, this is the output
50000/50000 [==============================] - 508s 10ms/step - loss: 1.5587 - acc: 0.9442 - val_loss: 0.6883 - val_acc: 0.9721
Notice that val_loss = 0.6883.
I then stopped the training, and trained the model with model.train_on_batch, validating every 1k batches. I did not reset the model, so the weights are not changed. After 1k batches, I get this output:
Batch 1139: Train[0.539348,0.977112] ; Val[146.972092,0.972529] ; Duration=0.040436 s
Notice that here the validation loss is 146.97.... How is that possible? Does model.fit do some post-processing to the validation loss?
model.fit code
# Fit for up to 300 epochs with mini-batches of 4, validating on the first
# 1000 validation samples after every epoch.
batch_size = 4
epochs = 300
myhist = model.fit(
    x_test,
    y_test,
    batch_size=batch_size,
    epochs=epochs,
    shuffle=True,
    validation_data=(val_x[:1000,], val_y[:1000,]),
    callbacks=[plot_losses],
)
model.train_on_batch iteration
# Manual training loop: draws one mini-batch per iteration, calls
# model.train_on_batch on it, and periodically evaluates on a fixed slice of
# the validation set.
# NOTE(review): indentation was lost in this listing, so the nesting below is
# not visible; the checkpoint check presumably sits inside the validation
# branch — confirm against the original notebook.
n_batches = 500000
batch_size = 4
val_size = 1000
val_freq = 1000
val_loss,val_acc = 0,0
model_check = '17102019_1.hd5'
# Sentinel "best so far" validation loss; any smaller val_loss triggers a save.
val_loss_min = 1000000
for ib in range(n_batches):
batch_init = time.time()
batch_x,batch_y = generate_mini_batch(batch_size,x_test,y_test,linear_comb=False,trans=False)
train_loss,train_acc = model.train_on_batch(batch_x,batch_y)
# Despite its name, batch_end holds the elapsed time of this batch in seconds.
batch_end = time.time()-batch_init
clear_output(wait=True)
# Validate every val_freq batches, skipping batch 0.
if (ib % val_freq == 0) & (ib > 0):
# The whole val_size-sample slice is passed to test_on_batch as a single batch.
val_loss,val_acc = model.test_on_batch(val_x[:val_size,],val_y[:val_size,])
# Save the model whenever the validation loss improves on the best seen so far.
if val_loss < val_loss_min:
model.save(model_check)
val_loss_min = val_loss
print('Batch %i: Train[%f,%f] ; Val[%f,%f] ; Duration=%f s'%(ib,train_loss,train_acc,val_loss,val_acc,batch_end))
It seems like model.test_on_batch returns the sum of losses of the batch entries, while model.train_on_batch returns the average loss, so that solves the issue.
I am trying to start learning ML.
I wrote a simple example:
import numpy as np
# Prepare the data: y = x**2 + 2 sampled at the integers 0..99
# NOTE: the name `input` shadows Python's builtin input() from here on.
input = np.array(list(range(100)))
output = np.array([x**2 + 2 for x in list(range(100))])
# Visualize the data
import matplotlib.pyplot as plt
plt.plot(input, output, 'ro')
plt.show()
# Define the model
a = 1
b = 1
# Model: y_hat = a*x + b (a straight line with a bias term b)
# Train the model == optimize the parameters so that they give a small loss
for e in range(10):
for x, y in zip(input, output):
y_hat = a*x + b
loss = 0.5*(y_hat-y)**2
# Now that we have the loss, we want its gradient w.r.t. the parameters a and b:
# d(loss)/da = x*(y_hat - y) and d(loss)/db = (y_hat - y)
# Gradient descent would be: a = a - learning_rate * d(loss)/da
# NOTE(review): the two updates below multiply (y_hat - y) by (-x) and (-1),
# i.e. they step along +gradient rather than -gradient, with a step size of 0.1.
a = a - 0.1*(-x)*(y_hat-y)
b = b - 0.1*(-1)*(y_hat-y)
# Printed once per epoch (per the output shown), so this is the loss of the
# last sample seen in the epoch, not an average.
print("Epoch {0} Training loss = {1}".format(e, loss))
# Make predictions on new data
test_input = np.array(list(range(101,150)))
test_output = np.array([x**2.0 + 2 for x in list(range(101,150))])
model_predictions = np.array([a*x + b for x in list(range(101,150))])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
plt.show()
Now when I run the code:
ml_zero.py:22: RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y_hat-y)**2
Epoch 0 Training loss = inf
ml_zero.py:21: RuntimeWarning: overflow encountered in double_scalars
y_hat = a*x + b
Epoch 1 Training loss = inf
ml_zero.py:21: RuntimeWarning: invalid value encountered in double_scalars
y_hat = a*x + b
Epoch 2 Training loss = nan
Epoch 3 Training loss = nan
Epoch 4 Training loss = nan
Epoch 5 Training loss = nan
Epoch 6 Training loss = nan
Epoch 7 Training loss = nan
Epoch 8 Training loss = nan
Epoch 9 Training loss = nan
Why is the error nan? I wrote the simplest model, but with python I was getting:
Traceback (most recent call last):
File "ml_zero.py", line 20, in <module>
loss = (y_hat-y)**2
OverflowError: (34, 'Result too large')
Then I converted all Python lists to numpy arrays. Now I get NaN instead; I just don't understand why such small values are giving these errors.
With Daniele's answer to replace the loss with mean squared loss, i.e. dividing the loss by total number of inputs, I get this output:
Epoch 0 Training loss = 1.7942781420994678e+36
Epoch 1 Training loss = 9.232837400842652e+70
Epoch 2 Training loss = 4.751367833814119e+105
Epoch 3 Training loss = 2.4455835946216386e+140
Epoch 4 Training loss = 1.2585275201812707e+175
Epoch 5 Training loss = 6.4767849625200624e+209
Epoch 6 Training loss = 3.331617554363007e+244
Epoch 7 Training loss = 1.714758503849272e+279
ml_zero.py:22: RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y-y_hat)**2
Epoch 8 Training loss = inf
Epoch 9 Training loss = inf
At least it runs, but I am trying to learn the linear function using Stochastic gradient descent, which updates parameters after each point's loss.
I still don't understand how people work with these models: the loss should decrease, so why is it increasing with gradient descent?
You got the math wrong. When you compute the gradient update for GD you have to divide by the number of samples in your dataset: that's why it is called mean squared error and not just squared error.
Also, you might want to use smaller inputs: your target is quadratic in x, so its values (and the gradients) grow very quickly as x gets larger.
Look at this post for a good intro to LR and GD.
I took the liberty of rewriting your code a bit, this should work:
import numpy as np
import matplotlib.pyplot as plt
# Training data: 100 points of y = x**2 + 2 on [0, 10].
input_ = np.linspace(0, 10, 100)  # Don't assign user data to Python's input builtin
output = np.array([x**2 + 2 for x in input_])

# Linear model y_hat = a*x + b, both parameters starting at 1.
a = 1
b = 1

# Fit the parameters.
N = input_.shape[0]  # Number of samples
step = 0.1 * (2. / N)  # learning rate 0.1 scaled by 2/N
for e in range(10):
    loss = 0.
    for x, y in zip(input_, output):
        y_hat = a * x + b
        residual = y - y_hat
        # Both updates use the residual computed before either parameter moved.
        a = a - step * (-x) * residual
        b = b - step * (-1) * residual
        loss += 0.5 * (residual ** 2)
    loss /= N
    print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))

# Evaluate on [0, 15]: the training range plus an extrapolation region.
test_input = np.linspace(0, 15, 150)  # Training data [0-10] + test data [10 - 15]
test_output = np.array([x**2.0 + 2 for x in test_input])
model_predictions = np.array([a*x + b for x in test_input])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
plt.show()
This should give you as output something along these lines:
Epoch 0 Loss: 33.117127
Epoch 1 Loss: 42.949756
Epoch 2 Loss: 40.733332
Epoch 3 Loss: 38.657764
Epoch 4 Loss: 36.774646
Epoch 5 Loss: 35.067299
Epoch 6 Loss: 33.520409
Epoch 7 Loss: 32.119958
Epoch 8 Loss: 30.853112
Epoch 9 Loss: 29.708126
And this is the output plot:
Cheers
EDIT: OP was asking about SGD. The above answer is still valid code, but it's for standard GD (where you iterate on the whole dataset at the same time).
For SGD, the main loop must be slightly changed:
# Per-sample (stochastic) updates with an effective step of 0.01 * 2.
step = 0.01 * (2.)
for e in range(10):
    for x, y in zip(input_, output):
        y_hat = a * x + b
        residual = y - y_hat
        loss = 0.5 * (residual ** 2)
        a = a - step * (-x) * residual
        b = b - step * (-1) * residual
    # Reports the loss of the last sample of the epoch.
    print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))
Note that I had to lower the learning rate to avoid divergence. When you train with a batch size of 1 it becomes really important to avoid this kind of gradient explosions, because a single sample may substantially mess up your descent towards the optimum.
Example output:
Epoch 0 Loss: 0.130379
Epoch 1 Loss: 0.123007
Epoch 2 Loss: 0.117352
Epoch 3 Loss: 0.112991
Epoch 4 Loss: 0.109615
Epoch 5 Loss: 0.106992
Epoch 6 Loss: 0.104948
Epoch 7 Loss: 0.103353
Epoch 8 Loss: 0.102105
Epoch 9 Loss: 0.101127
I am trying to train the Alexnet upon the data that i collected.
It contains the images converted to grayscale and the associated key.
This is a program to simulate a self driving car.
The keys are :
w = [1,0,0]
a = [0,1,0]
d = [0,0,1]
This is my code
import numpy as np
from alexnet import alexnet
# Training configuration for the AlexNet driving model.
WIDTH = 100
HEIGHT = 80
LR = 1e-3
EPOCHS = 8
MODEL_NAME = 'Udacity Model Car NN'
model = alexnet(WIDTH,HEIGHT,LR)
# Each entry of train_data is indexed below as [0] (presumably the grayscale
# image) and [1] (presumably the one-hot key label).
train_data = np.load('data.npy',encoding="bytes")
# Hold out the last 200 samples for validation.
train = train_data[:-200]
test = train_data[-200:]
X = np.array([i[0] for i in train]).reshape(-1,WIDTH,HEIGHT,1)
Y = [i[1] for i in train]
test_X = np.array([i[0] for i in test]).reshape(-1,WIDTH,HEIGHT,1)
test_Y = [i[1] for i in test]
# NOTE(review): the second element of validation_set is {'targets:test_y'}, a
# set holding a single string, not a {'targets': test_Y} dict like the
# training targets.
model.fit({'input':X},{'targets':Y},n_epoch=EPOCHS,validation_set=({'input':test_X},{'targets:test_y'}),snapshot_step=500,show_metric=True,run_id=MODEL_NAME)
model.save(MODEL_NAME)
But after every epoch the validation accuracy remains 0, and the validation loss remains 0 as well.
Training Step: 104 | total loss: 1.31713 | time: 119.279s| Momentum |epoch: 008 | loss: 1.31713 - acc: 0.3878 | val_loss: 0.00000 - val_acc: 0.0000 -- iter: 801/801
This is probably a typo, look what you are passing as validation:
{'input':test_X},{'targets:test_y'}
\______________/ \_______________/
correct dict this is a set with a string!
while it should be
{'input':test_X},{'targets':test_Y}
I'm trying to make some price prediction on a kaggle dataset with Tensorflow.
My Neural network is learning, but, my cost function is really high and my predictions are far from the real output.
I tried to change my network by adding or removing some layers, neurons and activations functions.
I tried a lot of different hyper-parameter values, but that doesn't change things much.
I don't think the problem comes from my data: I checked on Kaggle, and it is the dataset that most people use.
If you have any idea why my cost is so high and how to reduce it and if you could explain it to me, it would be really great !
Here's my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.utils import shuffle
# Load the house-price training CSV (semicolon-separated).
df = pd.read_csv(r"C:\Users\User\Documents\TENSORFLOW\Prediction prix\train2.csv", sep=';')
df.head()
# Keep five predictor columns plus the SalePrice target.
df = df.loc[:, ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'SalePrice']]
# Missing values are replaced by 0.
df = df.replace(np.nan, 0)
df
%matplotlib inline
# NOTE: this rebinds the name `plt` (previously matplotlib.pyplot) to the
# object returned by sns.pairplot.
plt = sns.pairplot(df)
plt
# Shuffle the rows, then split into 1000 training rows and a test slice.
# NOTE: the slices [0:1000] and [1001:1451] leave row 1000 in neither split.
df = shuffle(df)
df_train = df[0:1000]
df_test = df[1001:1451]
# Feature matrices (all columns except SalePrice) and target columns, cast to
# int; no scaling is applied here.
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 500
display_step = 50
n_samples = inputX.shape[0]
# Placeholders: 5 input features per row, 1 target value per row.
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Append a fully connected layer to ``inputs`` and return its output.

    Args:
        inputs: tensor whose last dimension is ``in_size``.
        in_size: number of input features.
        out_size: number of units in the layer.
        activation_function: optional callable applied to the affine output;
            when None the layer is linear.

    Returns:
        ``activation_function(inputs @ W + b)``, or ``inputs @ W + b`` when no
        activation is given.
    """
    # Weights ~ N(0, 0.1**2); biases start at 0.1.
    kernel = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.1))
    offset = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    pre_activation = tf.matmul(inputs, kernel) + offset
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation
# Network: 5 inputs -> 3 ReLU units -> 1 linear output.
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Squared error summed over the fed rows, divided by 2 * n_samples
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*n_samples)
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
# NOTE: total_batch is set to batch_size, so the inner loop runs batch_size
# times per epoch.
total_batch = batch_size
# Loop over all batches
for i in range(total_batch):
# Run optimization op (backprop) and cost op (to get loss value)
# NOTE: every iteration feeds the complete inputX / inputY arrays, not a slice.
_, c = sess.run([optimizer, cost], feed_dict={x: inputX,
y: inputY})
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", \
"{:.9f}".format(avg_cost))
print("Optimization Finished!")
# Test model
# NOTE(review): tf.equal compares the float predictions with the targets for
# exact equality.
correct_prediction = tf.equal(pred,y)
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Accuracy is evaluated on the training arrays; predictions are then printed
# for the test inputs.
print("Accuracy:", accuracy.eval({x: inputX, y: inputY}))
print(sess.run(pred, feed_dict={x: inputX_test}))
Epoch: 0001 cost= 10142407502702304395526144.000000000
Epoch: 0051 cost= 3256106752.000019550
Epoch: 0101 cost= 3256106752.000019550
Epoch: 0151 cost= 3256106752.000019550
Epoch: 0201 cost= 3256106752.000019550
...
Thanks for your help !
I see a couple of problems with the implementation:
Inputs are not scaled.
Use sklearn's StandardScaler to scale inputX and inputY (and also inputX_test and inputY_test) so that they have zero mean and unit variance. You can use inverse_transform to convert the outputs back to the original scale.
# Fit the scaler on the training features only, then apply the same
# transformation to both the training and the test features.
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputX_test = sc.transform(inputX_test)
The batch_size is too large, you are passing the entire set as a single batch. This should not cause the particular problem you are facing, but for better convergence try with reduced batch size. Implement a get_batch() generator function and do the following:
# Feed one mini-batch per optimisation step. The training arrays are named
# inputX / inputY in the surrounding code; input_X / input_Y are not defined.
for batch_X, batch_Y in get_batch(inputX, inputY, batch_size):
    _, c = sess.run([optimizer, cost], feed_dict={x: batch_X,
                                                  y: batch_Y})
Try smaller Weights initialization (stddev) if you still see issues.
WORKING CODE BELOW:
# StandardScaler is used below and has to be imported explicitly.
from sklearn.preprocessing import StandardScaler

# Training features: every column except the target. `.values` and
# `drop(columns=...)` replace `as_matrix()` and the positional axis argument,
# both of which have been removed from pandas.
inputX = df_train.drop(columns='SalePrice').values
inputX = inputX.astype(int)
# Fit the feature scaler on the training set only.
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
# Training target, scaled with its own scaler so that predictions can be
# mapped back to prices with sc1.inverse_transform.
inputY = df_train.loc[:, ['SalePrice']].values
inputY = inputY.astype(int)
sc1 = StandardScaler().fit(inputY)
inputY = sc1.transform(inputY)
# Test features and target reuse the scalers fitted on the training data.
inputX_test = df_test.drop(columns='SalePrice').values
inputX_test = inputX_test.astype(int)
inputX_test = sc.transform(inputX_test)
inputY_test = df_test.loc[:, ['SalePrice']].values
inputY_test = inputY_test.astype(int)
inputY_test = sc1.transform(inputY_test)
# Hyperparameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 50
display_step = 50
n_samples = inputX.shape[0]
# Placeholders: 5 input features per row, 1 target value per row.
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def get_batch(inputX, inputY, batch_size, *, drop_last=True):
    """Yield consecutive ``(inputs, targets)`` mini-batches.

    Args:
        inputX: sliceable sequence of input samples.
        inputY: sliceable sequence of targets, aligned with ``inputX``.
        batch_size: number of samples per batch.
        drop_last: when True (the default, matching the previous behaviour)
            a trailing batch with fewer than ``batch_size`` samples is
            skipped; when False it is yielded as a final, shorter batch.

    Yields:
        ``(inputX[i:i + batch_size], inputY[i:i + batch_size])`` slices in order.

    Raises:
        ZeroDivisionError: if ``batch_size`` is 0 and ``drop_last`` is True.
        ValueError: if ``batch_size`` is 0 and ``drop_last`` is False.
        Either is raised when iteration starts, not when the function is called.
    """
    n_samples = len(inputX)
    # With drop_last the range stops at the last multiple of batch_size.
    stop = n_samples - n_samples % batch_size if drop_last else n_samples
    for start in range(0, stop, batch_size):
        yield inputX[start:start + batch_size], inputY[start:start + batch_size]
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Create a dense layer on top of ``inputs``.

    Args:
        inputs: tensor whose last dimension is ``in_size``.
        in_size: number of input features.
        out_size: number of units in the layer.
        activation_function: optional callable applied to the affine output;
            None keeps the layer linear.

    Returns:
        The layer output tensor.
    """
    # Small initial weights (stddev 0.005) and zero biases.
    kernel = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.005))
    offset = tf.Variable(tf.zeros([1, out_size]))
    affine = tf.matmul(inputs, kernel) + offset
    return affine if activation_function is None else activation_function(affine)
# Network: 5 inputs -> 3 ReLU units -> 1 linear output.
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_mean(tf.pow(tf.subtract(pred, y), 2))
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        batch_costs = []
        # One optimisation step per mini-batch.
        for batch_x, batch_y in get_batch(inputX, inputY, batch_size):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
            batch_costs.append(c)
        # Average over the batches actually processed; dividing by
        # batch_size (the number of samples per batch) would mis-scale the
        # reported cost.
        avg_cost = sum(batch_costs) / len(batch_costs) if batch_costs else float('nan')
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f} ".format(avg_cost))
    print("Optimization Finished!")
I have already had a similar problem of a very high cost reached after a few training steps, and then the cost remaining constant there. For me it was a kind of overflow, with the gradients too big and creating Nan values quite early in training. I solved it by starting with a smaller learning rate (potentially much smaller), until the cost and gradients become more reasonable (a few dozen steps), and then back to a regular one (higher at the start, potentially decaying).
See my answer to this post for a similar case that was solved just by taking a smaller learning rate on start.
You can also clip your gradients to avoid this problem, using tf.clip_by_value. It sets a minimum and maximum value for your gradients, which prevents huge gradients from sending your weights straight to NaN after the first few iterations. To use it (with min and max at -1 and 1, which is probably too tight), replace
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
by
# Split minimize() into its two steps so the gradients can be clipped in between.
opt = tf.train.GradientDescentOptimizer(learning_rate)
gvs = opt.compute_gradients(cost)
# Clip every gradient element to [-1, 1]. compute_gradients() pairs a variable
# with None when the loss does not depend on it; tf.clip_by_value cannot take
# None, so those pairs are skipped.
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var)
              for grad, var in gvs if grad is not None]
optimizer = opt.apply_gradients(capped_gvs)
I am trying to adapt this MNIST example to binary classification.
But when changing my NLABELS from NLABELS=2 to NLABELS=1, the loss function always returns 0 (and accuracy 1).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
# Number of label columns kept from the one-hot labels (see the slices below).
NLABELS = 2
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.Variable(tf.zeros([784, NLABELS]), name='weights')
# NOTE: name='bias' is passed to tf.zeros here, not to tf.Variable.
b = tf.Variable(tf.zeros([NLABELS], name='bias'))
# Softmax normalises across the NLABELS outputs of each row.
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Add summary ops to collect data
_ = tf.histogram_summary('weights', W)
_ = tf.histogram_summary('biases', b)
_ = tf.histogram_summary('y', y)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
# Cross-entropy computed by hand from the softmax output.
cross_entropy = -tf.reduce_mean(y_ * tf.log(y))
_ = tf.scalar_summary('cross entropy', cross_entropy)
with tf.name_scope('train'):
train_step = tf.train.GradientDescentOptimizer(10.).minimize(cross_entropy)
with tf.name_scope('test'):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
_ = tf.scalar_summary('accuracy', accuracy)
# Merge all the summaries and write them out to the 'logs' directory
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter('logs', sess.graph_def)
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
for i in range(1000):
if i % 10 == 0: # Record summary data and the accuracy
# Keep only the first NLABELS columns of the one-hot test labels.
labels = mnist.test.labels[:, 0:NLABELS]
feed = {x: mnist.test.images, y_: labels}
result = sess.run([merged, accuracy, cross_entropy], feed_dict=feed)
summary_str = result[0]
acc = result[1]
loss = result[2]
writer.add_summary(summary_str, i)
print('Accuracy at step %s: %s - loss: %f' % (i, acc, loss))
else:
batch_xs, batch_ys = mnist.train.next_batch(100)
# Same column slice for the training labels.
batch_ys = batch_ys[:, 0:NLABELS]
feed = {x: batch_xs, y_: batch_ys}
sess.run(train_step, feed_dict=feed)
I have checked the dimensions of both batch_ys (fed into y) and _y and they are both 1xN matrices when NLABELS=1 so the problem seems to be prior to that. Maybe something to do with the matrix multiplication?
I actually have got this same problem in a real project, so any help would be appreciated... Thanks!
The original MNIST example uses a one-hot encoding to represent the labels in the data: this means that if there are NLABELS = 10 classes (as in MNIST), the target output is [1 0 0 0 0 0 0 0 0 0] for class 0, [0 1 0 0 0 0 0 0 0 0] for class 1, etc. The tf.nn.softmax() operator converts the logits computed by tf.matmul(x, W) + b into a probability distribution across the different output classes, which is then compared to the fed-in value for y_.
If NLABELS = 1, this acts as if there were only a single class, and the tf.nn.softmax() op would compute a probability of 1.0 for that class, leading to a cross-entropy of 0.0, since tf.log(1.0) is 0.0 for all of the examples.
There are (at least) two approaches you could try for binary classification:
The simplest would be to set NLABELS = 2 for the two possible classes, and encode your training data as [1 0] for label 0 and [0 1] for label 1. This answer has a suggestion for how to do that.
You could keep the labels as integers 0 and 1 and use tf.nn.sparse_softmax_cross_entropy_with_logits(), as suggested in this answer.
I've been looking for good examples of how to implement binary classification in TensorFlow in a similar manner to the way it would be done in Keras. I didn't find any, but after digging through the code a bit, I think I have it figured out. I modified the problem here to implement a solution that uses sigmoid_cross_entropy_with_logits the way Keras does under the hood.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
# A single output unit: only the first one-hot column is kept as the binary
# target (see the label slices below).
NLABELS = 1
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
# Truncated-normal initial weights, scaled down by a factor of 10.
W = tf.get_variable('weights', [784, NLABELS],
                    initializer=tf.truncated_normal_initializer()) * 0.1
# The name belongs to the Variable, not to the tf.zeros initial value.
b = tf.Variable(tf.zeros([NLABELS]), name='bias')
logits = tf.matmul(x, W) + b
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
    # Equivalent manual formula, shown for reference only (it has gradient
    # problems, so do not use it):
    #   -mean(y_ * log(sigmoid(logits)) + (1 - y_) * log(1 - sigmoid(logits)))
    entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits)
    loss = tf.reduce_mean(entropy, name='loss')
with tf.name_scope('train'):
    # Adam is used instead of plain gradient descent.
    train_step = tf.train.AdamOptimizer(learning_rate=0.002).minimize(loss)
with tf.name_scope('test'):
    # Threshold the predicted probability, not the raw logit: a probability
    # of 0.5 corresponds to a logit of 0, so `logits > 0.5` would shift the
    # decision boundary.
    preds = tf.cast(tf.sigmoid(logits) > 0.5, tf.float32)
    correct_prediction = tf.equal(preds, y_)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# global_variables_initializer() replaces the deprecated
# initialize_all_variables().
tf.global_variables_initializer().run()
# Train the model, and evaluate on the test data every 100 steps
for i in range(2000):
    if i % 100 == 0:  # Report loss and accuracy on the test set
        labels = mnist.test.labels[:, 0:NLABELS]
        feed = {x: mnist.test.images, y_: labels}
        result = sess.run([loss, accuracy], feed_dict=feed)
        print('Accuracy at step %s: %s - loss: %f' % (i, result[1], result[0]))
    else:
        batch_xs, batch_ys = mnist.train.next_batch(100)
        batch_ys = batch_ys[:, 0:NLABELS]
        feed = {x: batch_xs, y_: batch_ys}
        sess.run(train_step, feed_dict=feed)
Training:
Accuracy at step 0: 0.7373 - loss: 0.758670
Accuracy at step 100: 0.9017 - loss: 0.423321
Accuracy at step 200: 0.9031 - loss: 0.322541
Accuracy at step 300: 0.9085 - loss: 0.255705
Accuracy at step 400: 0.9188 - loss: 0.209892
Accuracy at step 500: 0.9308 - loss: 0.178372
Accuracy at step 600: 0.9453 - loss: 0.155927
Accuracy at step 700: 0.9507 - loss: 0.139031
Accuracy at step 800: 0.9556 - loss: 0.125855
Accuracy at step 900: 0.9607 - loss: 0.115340
Accuracy at step 1000: 0.9633 - loss: 0.106709
Accuracy at step 1100: 0.9667 - loss: 0.099286
Accuracy at step 1200: 0.971 - loss: 0.093048
Accuracy at step 1300: 0.9714 - loss: 0.087915
Accuracy at step 1400: 0.9745 - loss: 0.083300
Accuracy at step 1500: 0.9745 - loss: 0.079019
Accuracy at step 1600: 0.9761 - loss: 0.075164
Accuracy at step 1700: 0.9768 - loss: 0.071803
Accuracy at step 1800: 0.9777 - loss: 0.068825
Accuracy at step 1900: 0.9788 - loss: 0.066270