So I have this problem of making batch in my code, the thing is, I tried to search how we do batching but all I found was using some method like next_batch in MNIST sample program. I would really appreciate if someone could actually give me some tips on how I should make batch in my program below.
import tensorflow as tf
import numpy as np
from sklearn import cross_validation
import pandas as pd
np.random.seed(20160612)
tf.set_random_seed(20160612)
#this is input data, data is a 7x86594 and label is a 5x86594
data2 = pd.read_csv('rawdata.csv', sep=',', header=None)
data = np.array(data2)
label2=pd.read_csv('class.csv', sep='\t', header=None)
label=np.array(label2)
train_x,test_x,train_t,test_t=cross_validation.train_test_split(data,label,test_size=0.1,random_state=None)
#this is supposed to be neural size in hidden layer
num_units = 15
x = tf.placeholder(tf.float32, [None, 7])
t = tf.placeholder(tf.float32, [None, 5])
w1 = tf.Variable(tf.truncated_normal([7, num_units], mean=0.0, stddev=0.05))
b1 = tf.Variable(tf.zeros([num_units]))
hidden1 = tf.nn.relu(tf.matmul(x, w1) + b1)
w0 = tf.Variable(tf.zeros([num_units, 5]))
b0 = tf.Variable(tf.zeros([5]))
p = tf.nn.softmax(tf.matmul(hidden1, w0) + b0)
loss = -tf.reduce_sum(t * tf.log(tf.clip_by_value(p,1e-10,1.0)))
train_step = tf.train.AdamOptimizer().minimize(loss)
correct_prediction = tf.equal(tf.argmax(p, 1), tf.argmax(t, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess = tf.InteractiveSession()
sess.run(tf.initialize_all_variables())
#this is how i think batching is
batch_size = 100
for j in range(0, 86594, batch_size):
xs,ys= train_x[j:j+batch_size], train_t[j:j+batch_size]
i = 0
for _ in range(4000):
i += 1
sess.run(train_step, feed_dict={x: xs, t: ys})
if i % 100 == 0:
loss_val, acc_val = sess.run([loss, accuracy],feed_dict={x:test_x, t: test_t})
print ('Step: %d, Loss: %f, Accuracy: %f'% (i, loss_val, acc_val))
The result of this program, of course, isn't right.
Keep extracting the batches of your data and keep feeding them to the network for training. In each epoch, all the samples of your training dataset should be run once. So you can rewrite your code like this:
Required part of code only:
epochs = 4000
batch_size = 100
for epoch_no in range(epochs):
for index, offset in enumerate(range(0, 86594, batch_size)):
xs, ys = train_x[offset: offset + batch_size], train_t[offset: offset + batch_size]
sess.run(train_step, feed_dict={x: xs, t: ys})
if index % 100 == 0:
loss_val, acc_val = sess.run([loss, accuracy], feed_dict = {x: test_x, t: test_t})
print ('Epoch %d, Step: %d, Loss: %f, Accuracy: %f'% (epoch_no, index, loss_val, acc_val))
Related
as the title says, I am trying to train a neural network to predict outcomes, and I can't figure out what is wrong with my model. I keep getting the exact same accuracy level, and the loss is Nan. I'm so confused... I have looked at other similar questions and still can't seem to get it working. My code for the model and training is below:
import numpy as np
import pandas as pd
import tensorflow as tf
import urllib.request as request
import matplotlib.pyplot as plt
from FlowersCustom import get_MY_data
def get_data():
IRIS_TRAIN_URL = "http://download.tensorflow.org/data/iris_training.csv"
IRIS_TEST_URL = "http://download.tensorflow.org/data/iris_test.csv"
names = ['sepal-length', 'sepal-width', 'petal-length', 'petal-width', 'species']
train = pd.read_csv(IRIS_TRAIN_URL, names=names, skiprows=1)
test = pd.read_csv(IRIS_TEST_URL, names=names, skiprows=1)
# Train and test input data
Xtrain = train.drop("species", axis=1)
Xtest = test.drop("species", axis=1)
# Encode target values into binary ('one-hot' style) representation
ytrain = pd.get_dummies(train.species)
ytest = pd.get_dummies(test.species)
return Xtrain, Xtest, ytrain, ytest
def create_graph(hidden_nodes):
# Reset the graph
tf.reset_default_graph()
# Placeholders for input and output data
X = tf.placeholder(shape=Xtrain.shape, dtype=tf.float64, name='X')
y = tf.placeholder(shape=ytrain.shape, dtype=tf.float64, name='y')
# Variables for two group of weights between the three layers of the network
print(Xtrain.shape, ytrain.shape)
W1 = tf.Variable(np.random.rand(Xtrain.shape[1], hidden_nodes), dtype=tf.float64)
W2 = tf.Variable(np.random.rand(hidden_nodes, ytrain.shape[1]), dtype=tf.float64)
# Create the neural net graph
A1 = tf.sigmoid(tf.matmul(X, W1))
y_est = tf.sigmoid(tf.matmul(A1, W2))
# Define a loss function
deltas = tf.square(y_est - y)
loss = tf.reduce_sum(deltas)
# Define a train operation to minimize the loss
# optimizer = tf.train.GradientDescentOptimizer(0.005)
optimizer = tf.train.AdamOptimizer(0.001)
opt = optimizer.minimize(loss)
return opt, X, y, loss, W1, W2, y_est
def train_model(hidden_nodes, num_iters, opt, X, y, loss, W1, W2, y_est):
# Initialize variables and run session
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
losses = []
# Go through num_iters iterations
for i in range(num_iters):
sess.run(opt, feed_dict={X: Xtrain, y: ytrain})
local_loss = sess.run(loss, feed_dict={X: Xtrain.values, y: ytrain.values})
losses.append(local_loss)
weights1 = sess.run(W1)
weights2 = sess.run(W2)
y_est_np = sess.run(y_est, feed_dict={X: Xtrain.values, y: ytrain.values})
correct = [estimate.argmax(axis=0) == target.argmax(axis=0)
for estimate, target in zip(y_est_np, ytrain.values)]
acc = 100 * sum(correct) / len(correct)
if i % 10 == 0:
print('Epoch: %d, Accuracy: %.2f, Loss: %.2f' % (i, acc, local_loss))
print("loss (hidden nodes: %d, iterations: %d): %.2f" % (hidden_nodes, num_iters, losses[-1]))
sess.close()
return weights1, weights2
def test_accuracy(weights1, weights2):
X = tf.placeholder(shape=Xtest.shape, dtype=tf.float64, name='X')
y = tf.placeholder(shape=ytest.shape, dtype=tf.float64, name='y')
W1 = tf.Variable(weights1)
W2 = tf.Variable(weights2)
A1 = tf.sigmoid(tf.matmul(X, W1))
y_est = tf.sigmoid(tf.matmul(A1, W2))
# Calculate the predicted outputs
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
y_est_np = sess.run(y_est, feed_dict={X: Xtest, y: ytest})
# Calculate the prediction accuracy
correct = [estimate.argmax(axis=0) == target.argmax(axis=0)
for estimate, target in zip(y_est_np, ytest.values)]
accuracy = 100 * sum(correct) / len(correct)
print('final accuracy: %.2f%%' % accuracy)
def get_inputs_and_outputs(train, test, output_column_name):
Xtrain = train.drop(output_column_name, axis=1)
Xtest = test.drop(output_column_name, axis=1)
ytrain = pd.get_dummies(getattr(train, output_column_name))
ytest = pd.get_dummies(getattr(test, output_column_name))
return Xtrain, Xtest, ytrain, ytest
if __name__ == '__main__':
train, test = get_MY_data('output')
Xtrain, Xtest, ytrain, ytest = get_inputs_and_outputs(train, test, 'output')#get_data()
# Xtrain, Xtest, ytrain, ytest = get_data()
hidden_layers = 10
num_epochs = 500
opt, X, y, loss, W1, W2, y_est = create_graph(hidden_layers)
w1, w2 = train_model(hidden_layers, num_epochs, opt, X, y, loss, W1, W2, y_est)
# test_accuracy(w1, w2)
Here is a screenshot of what the training is printing out:
And this is a screenshot of the Pandas Dataframe that I am using for the input data (5 columns of floats):
And finally, here is the Pandas Dataframe that I am using for the expected outputs (1 column of either -1 or 1):
This is almost always a problem with the input data.
I would suggest inspecting in detail the values you are feeding into the model to make sure the model is receiving what you think it is.
I am currently facing an issue with the next_batch function. When I try to implement it with the code I have below, I get this error.
AttributeError: 'str' object has no attribute 'nextBatch'
I am suspecting that the issue is stemming from the next_batch function. I have tried to create my own implementation of the function from the tensorflow example. I have based my code for this Logistic Regression model on the example below.
https://github.com/aymericdamien/TensorFlow-Examples/blob/master/examples/2_BasicModels/logistic_regression.py
I have based the code for the next_batch function on a previous StackOverflow post.
how to implement tensorflow's next_batch for own data
Thanks for the help in advance.
from __future__ import print_function
import tensorflow as tf
import os as os
from LogRegModel import csvToTrainTF, csvtoTestTF
dir_path = os.path.dirname(os.path.realpath(__file__))
filename1 = dir_path + "/TrainData2.csv"
filename2 = dir_path + "/TestData2.csv"
filenameTrain = 'TrainData2.tfrecords'
filenameTest = 'TestData2.tfrecords'
trainData = csvToTrainTF(filename1, filenameTrain)
testData = csvtoTestTF(filename2, filenameTest)
learning_rate = 0.01
training_epochs = 20
batch_size = 32
display_step = 1
x = tf.placeholder(tf.float32, [None, 9])
y = tf.placeholder(tf.float32, [None, 2])
W = tf.Variable(tf.zeros([9,2]))
b = tf.Variable(tf.zeros([2]))
pred = tf.nn.softmax(tf.matmul(x,W) + b)
cost = tf.reduce_mean(-tf.reduce_sum(y*tf.log(pred), reduction_indices = 1))
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
start = 0
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = int(992/batch_size)
for i in range(total_batch):
batch_xs, batch_ys = trainData.nextBatch(batch_size) # Error is occurring in the train.next_batch call? Could you please tell me how to write out the function?
_, c = sess.run([optimizer, cost], feed_dict={x: batch_xs, y: batch_ys})
avg_cost += c/total_batch
if (epoch+1) % display_step == 0:
print("Epoch:", '%40d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
print("Optimization Finished")
# Test model
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(y, 1))
# Calculate accuracy
#I am not sure if this is the correct way to print out the accuracy. Could you please check this?
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print("Accuracy:", accuracy.eval({x: trainData[0:9], y: trainData[10]}))
Above is the code for the main function
Below is code for next_batch
def next_batch(self,batch_size,shuffle = True):
start = self._index_in_epoch
if start == 0 and self._epochs_completed == 0:
idx = np.arange(0, self._num_examples) # get all possible indexes
np.random.shuffle(idx) # shuffle indexe
self._data = self.data[idx] # get list of `num` random samples
# go to the next batch
if start + batch_size > self._num_examples:
self._epochs_completed += 1
rest_num_examples = self._num_examples - start
data_rest_part = self.data[start:self._num_examples]
idx0 = np.arange(0, self._num_examples) # get all possible indexes
np.random.shuffle(idx0) # shuffle indexes
self._data = self.data[idx0] # get list of `num` random samples
start = 0
self._index_in_epoch = batch_size - rest_num_examples #avoid the case where the #sample != integar times of batch_size
end = self._index_in_epoch
data_new_part = self._data[start:end]
return np.concatenate((data_rest_part, data_new_part), axis=0)
else:
self._index_in_epoch += batch_size
end = self._index_in_epoch
return self._data[start:end]
I am new to tensorflow. I am trying to train a rnn(for phoneme tagging). When I run the code, the loss is not lowing down at all.(My batch size is one because each sequence has different length). I am wondering if I am doing something wrong? Can I optimize the model by feeding the data through a for loop like this? I thought each time I use sess.run for optimization, it will update the variable from it's previous value. Am I wrong?
with tf.variable_scope("foo",reuse=True):
x = tf.placeholder(tf.float32, [1, None, 69])
y = tf.placeholder(tf.int32, [None])
cell = tf.nn.rnn_cell.LSTMCell(48)
outputs, state = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
loss =tf.reduce_mean(tf.nn.sparse_softmax_cross_entropy_with_logits(labels = y, logits = tf.reshape(outputs,[-1, 48])))
optimizer = tf.train.AdamOptimizer(learning_rate=0.1).minimize(loss)
saver = tf.train.Saver()
with tf.Session() as sess:
init = tf.global_variables_initializer()
sess.run(init)
for epoch in range(1):
for index, ID in enumerate(list(X.keys())):
x_temp = np.array(X[ID])[np.newaxis,:,:]
y_temp = np.array(Y_int[ID])
_ , _loss = sess.run([optimizer,loss], feed_dict = {x: x_temp, y : y_temp } )
if index % 10 == 0:
print("epoch "+str(epoch)+" samples "+str(index)+" loss: "+str(_loss))
prediction = []
save_path = saver.save(sess, "/tmp/model.ckpt")
for index, ID in enumerate(list(X.keys())):
if index % 10 == 0:
print("epoch "+str(epoch)+" samples "+str(index))
x_temp = np.array(X[ID])[np.newaxis,:,:]
y_temp = np.array(Y_int[ID])
out = sess.run(outputs, feed_dict = {x: x_temp, y : y_temp })
prediction.append(out)
I'm trying to make some price prediction on a kaggle dataset with Tensorflow.
My Neural network is learning, but, my cost function is really high and my predictions are far from the real output.
I tried to change my network by adding or removing some layers, neurons and activations functions.
I tried a lot with my hyper-parameters but that don't change so much things.
I don't think that the problem come from my datas, I checked on kaggle and that's the ones that most people uses.
If you have any idea why my cost is so high and how to reduce it and if you could explain it to me, it would be really great !
Her's my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.utils import shuffle
df = pd.read_csv(r"C:\Users\User\Documents\TENSORFLOW\Prediction prix\train2.csv", sep=';')
df.head()
df = df.loc[:, ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'SalePrice']]
df = df.replace(np.nan, 0)
df
%matplotlib inline
plt = sns.pairplot(df)
plt
df = shuffle(df)
df_train = df[0:1000]
df_test = df[1001:1451]
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 500
display_step = 50
n_samples = inputX.shape[0]
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def add_layer(inputs, in_size, out_size, activation_function=None):
Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.1))
biases = tf.Variable(tf.zeros([1, out_size]) + 0.1)
Wx_plus_b = tf.matmul(inputs, Weights) + biases
if activation_function is None:
output = Wx_plus_b
else:
output = activation_function(Wx_plus_b)
return output
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*n_samples)
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = batch_size
# Loop over all batches
for i in range(total_batch):
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost], feed_dict={x: inputX,
y: inputY})
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", \
"{:.9f}".format(avg_cost))
print("Optimization Finished!")
# Test model
correct_prediction = tf.equal(pred,y)
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print("Accuracy:", accuracy.eval({x: inputX, y: inputY}))
print(sess.run(pred, feed_dict={x: inputX_test}))
Epoch: 0001 cost= 10142407502702304395526144.000000000
Epoch: 0051 cost= 3256106752.000019550
Epoch: 0101 cost= 3256106752.000019550
Epoch: 0151 cost= 3256106752.000019550
Epoch: 0201 cost= 3256106752.000019550
...
Thanks for your help !
I see couple of problems with the implementation:
Inputs are not scaled.
Use sklearn StandardScaler to scale the inputs inputX, inputY (and also inputX_text and inputY_text) to make it zero mean and unit variance. You can use the inverse_transform to convert the outputs back to proper scale again.
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputX_test = sc.transform(inputX_test)
The batch_size is too large, you are passing the entire set as a single batch. This should not cause the particular problem you are facing, but for better convergence try with reduced batch size. Implement a get_batch() generator function and do the following:
for batch_X, batch_Y in get_batch(input_X, input_Y, batch_size):
_, c = sess.run([optimizer, cost], feed_dict={x: batch_X,
y: batch_Y})
Try smaller Weights initialization (stddev) if you still see issues.
WORKING CODE BELOW:
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
sc1 = StandardScaler().fit(inputY)
inputY = sc1.transform(inputY)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputX_test = sc.transform(inputX_test)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
inputY_test = sc1.transform(inputY_test)
learning_rate = 0.01
training_epochs = 1000
batch_size = 50
display_step = 50
n_samples = inputX.shape[0]
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def get_batch(inputX, inputY, batch_size):
duration = len(inputX)
for i in range(0,duration//batch_size):
idx = i*batch_size
yield inputX[idx:idx+batch_size], inputY[idx:idx+batch_size]
def add_layer(inputs, in_size, out_size, activation_function=None):
Weights = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.005))
biases = tf.Variable(tf.zeros([1, out_size]))
Wx_plus_b = tf.matmul(inputs, Weights) + biases
if activation_function is None:
output = Wx_plus_b
else:
output = activation_function(Wx_plus_b)
return output
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_mean(tf.pow(tf.subtract(pred, y), 2))
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = batch_size
# Loop over all batches
#for i in range(total_batch):
for batch_x, batch_y in get_batch(inputX, inputY, batch_size):
# Run optimization op (backprop) and cost op (to get loss value)
_, c, _l1, _pred = sess.run([optimizer, cost, l1, pred], feed_dict={x: batch_x, y: batch_y})
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f} ".format(avg_cost))
#print(_l1, _pred)
print("Optimization Finished!")
I have already had a similar problem of a very high cost reached after a few training steps, and then the cost remaining constant there. For me it was a kind of overflow, with the gradients too big and creating Nan values quite early in training. I solved it by starting with a smaller learning rate (potentially much smaller), until the cost and gradients become more reasonable (a few dozen steps), and then back to a regular one (higher at the start, potentially decaying).
See my answer to this post for a similar case that was solved just by taking a smaller learning rate on start.
You can also clip your gradients to avoid this problem, using tf.clip_by_value. It sets a minimum and maximum value to your gradients, which avoids to have huge ones that send your weights straight to Nan after the first few iterations. To use it (with min and max at -1 and 1, which is probably too tight), replace
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
by
opt= tf.train.GradientDescentOptimizer(learning_rate)
gvs = opt.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)
I am trying to adapt this MNIST example to binary classification.
But when changing my NLABELS from NLABELS=2 to NLABELS=1, the loss function always returns 0 (and accuracy 1).
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
NLABELS = 2
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.Variable(tf.zeros([784, NLABELS]), name='weights')
b = tf.Variable(tf.zeros([NLABELS], name='bias'))
y = tf.nn.softmax(tf.matmul(x, W) + b)
# Add summary ops to collect data
_ = tf.histogram_summary('weights', W)
_ = tf.histogram_summary('biases', b)
_ = tf.histogram_summary('y', y)
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
cross_entropy = -tf.reduce_mean(y_ * tf.log(y))
_ = tf.scalar_summary('cross entropy', cross_entropy)
with tf.name_scope('train'):
train_step = tf.train.GradientDescentOptimizer(10.).minimize(cross_entropy)
with tf.name_scope('test'):
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
_ = tf.scalar_summary('accuracy', accuracy)
# Merge all the summaries and write them out to /tmp/mnist_logs
merged = tf.merge_all_summaries()
writer = tf.train.SummaryWriter('logs', sess.graph_def)
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
for i in range(1000):
if i % 10 == 0: # Record summary data and the accuracy
labels = mnist.test.labels[:, 0:NLABELS]
feed = {x: mnist.test.images, y_: labels}
result = sess.run([merged, accuracy, cross_entropy], feed_dict=feed)
summary_str = result[0]
acc = result[1]
loss = result[2]
writer.add_summary(summary_str, i)
print('Accuracy at step %s: %s - loss: %f' % (i, acc, loss))
else:
batch_xs, batch_ys = mnist.train.next_batch(100)
batch_ys = batch_ys[:, 0:NLABELS]
feed = {x: batch_xs, y_: batch_ys}
sess.run(train_step, feed_dict=feed)
I have checked the dimensions of both batch_ys (fed into y) and _y and they are both 1xN matrices when NLABELS=1 so the problem seems to be prior to that. Maybe something to do with the matrix multiplication?
I actually have got this same problem in a real project, so any help would be appreciated... Thanks!
The original MNIST example uses a one-hot encoding to represent the labels in the data: this means that if there are NLABELS = 10 classes (as in MNIST), the target output is [1 0 0 0 0 0 0 0 0 0] for class 0, [0 1 0 0 0 0 0 0 0 0] for class 1, etc. The tf.nn.softmax() operator converts the logits computed by tf.matmul(x, W) + b into a probability distribution across the different output classes, which is then compared to the fed-in value for y_.
If NLABELS = 1, this acts as if there were only a single class, and the tf.nn.softmax() op would compute a probability of 1.0 for that class, leading to a cross-entropy of 0.0, since tf.log(1.0) is 0.0 for all of the examples.
There are (at least) two approaches you could try for binary classification:
The simplest would be to set NLABELS = 2 for the two possible classes, and encode your training data as [1 0] for label 0 and [0 1] for label 1. This answer has a suggestion for how to do that.
You could keep the labels as integers 0 and 1 and use tf.nn.sparse_softmax_cross_entropy_with_logits(), as suggested in this answer.
I've been looking for good examples of how to implement binary classification in TensorFlow in a similar manner to the way it would be done in Keras. I didn't find any, but after digging through the code a bit, I think I have it figured out. I modified the problem here to implement a solution that uses sigmoid_cross_entropy_with_logits the way Keras does under the hood.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
# Import data
mnist = input_data.read_data_sets('data', one_hot=True)
NLABELS = 1
sess = tf.InteractiveSession()
# Create the model
x = tf.placeholder(tf.float32, [None, 784], name='x-input')
W = tf.get_variable('weights', [784, NLABELS],
initializer=tf.truncated_normal_initializer()) * 0.1
b = tf.Variable(tf.zeros([NLABELS], name='bias'))
logits = tf.matmul(x, W) + b
# Define loss and optimizer
y_ = tf.placeholder(tf.float32, [None, NLABELS], name='y-input')
# More name scopes will clean up the graph representation
with tf.name_scope('cross_entropy'):
#manual calculation : under the hood math, don't use this it will have gradient problems
# entropy = tf.multiply(tf.log(tf.sigmoid(logits)), y_) + tf.multiply((1 - y_), tf.log(1 - tf.sigmoid(logits)))
# loss = -tf.reduce_mean(entropy, name='loss')
entropy = tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=logits)
loss = tf.reduce_mean(entropy, name='loss')
with tf.name_scope('train'):
# Using Adam instead
# train_step = tf.train.GradientDescentOptimizer(learning_rate=0.001).minimize(loss)
train_step = tf.train.AdamOptimizer(learning_rate=0.002).minimize(loss)
with tf.name_scope('test'):
preds = tf.cast((logits > 0.5), tf.float32)
correct_prediction = tf.equal(preds, y_)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
tf.initialize_all_variables().run()
# Train the model, and feed in test data and record summaries every 10 steps
for i in range(2000):
if i % 100 == 0: # Record summary data and the accuracy
labels = mnist.test.labels[:, 0:NLABELS]
feed = {x: mnist.test.images, y_: labels}
result = sess.run([loss, accuracy], feed_dict=feed)
print('Accuracy at step %s: %s - loss: %f' % (i, result[1], result[0]))
else:
batch_xs, batch_ys = mnist.train.next_batch(100)
batch_ys = batch_ys[:, 0:NLABELS]
feed = {x: batch_xs, y_: batch_ys}
sess.run(train_step, feed_dict=feed)
Training:
Accuracy at step 0: 0.7373 - loss: 0.758670
Accuracy at step 100: 0.9017 - loss: 0.423321
Accuracy at step 200: 0.9031 - loss: 0.322541
Accuracy at step 300: 0.9085 - loss: 0.255705
Accuracy at step 400: 0.9188 - loss: 0.209892
Accuracy at step 500: 0.9308 - loss: 0.178372
Accuracy at step 600: 0.9453 - loss: 0.155927
Accuracy at step 700: 0.9507 - loss: 0.139031
Accuracy at step 800: 0.9556 - loss: 0.125855
Accuracy at step 900: 0.9607 - loss: 0.115340
Accuracy at step 1000: 0.9633 - loss: 0.106709
Accuracy at step 1100: 0.9667 - loss: 0.099286
Accuracy at step 1200: 0.971 - loss: 0.093048
Accuracy at step 1300: 0.9714 - loss: 0.087915
Accuracy at step 1400: 0.9745 - loss: 0.083300
Accuracy at step 1500: 0.9745 - loss: 0.079019
Accuracy at step 1600: 0.9761 - loss: 0.075164
Accuracy at step 1700: 0.9768 - loss: 0.071803
Accuracy at step 1800: 0.9777 - loss: 0.068825
Accuracy at step 1900: 0.9788 - loss: 0.066270