I am new to TensorFlow and my code is below:
import tensorflow as tf
logdir="/tmp/mnist_tutorial5/"
mnist = tf.contrib.learn.datasets.mnist.read_data_sets(train_dir=logdir+"data",one_hot = True)
tf.reset_default_graph()
sess = tf.Session()
writer = tf.summary.FileWriter(logdir)
def model(input):
    w = tf.Variable(tf.truncated_normal([784,10], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0.1, shape=[10]), name="B")
    act = tf.matmul(input,w) + b
    tf.summary.histogram("weights",w)
    tf.summary.histogram("biases",b)
    tf.summary.histogram("activations",act)
    return act

def train():
    x = tf.placeholder(tf.float32, shape=[None, 784], name="input_img")
    y = tf.placeholder(tf.float32, shape=[None, 10], name="labels")
    my_label = model(x)
    print("linear_regression is completed")
    mean_error = tf.reduce_mean(tf.reduce_sum(tf.square(my_label-y)))
    tf.summary.scalar("loss", mean_error)
    train_step = tf.train.GradientDescentOptimizer(0.0003).minimize(mean_error)
    correct_prediction = tf.equal(tf.argmax(my_label, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("accuracy", accuracy)
    sess.run(tf.global_variables_initializer())
    summ = tf.summary.merge_all()
    for i in range(2000):
        batch = mnist.train.next_batch(100)
        train_accuracy = sess.run(train_step, feed_dict={x: batch[0], y: batch[1]})
        print("%s th iteration"%i)
        if i%500==0:
            print("over 2")
            summarys = sess.run(summ, {x: batch[0], y: batch[1]})  ##i'm getting error here
            print("over 3")
            writer.add_summary(summarys,i)
            print("one over")

train()
writer.add_graph(sess.graph)
And this is the error I am getting:
InvalidArgumentError (see above for traceback): Nan in summary histogram for: weights
[[Node: weights = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/tag, W/read)]]
First of all, it is just a one-layer network with no hidden layer,
and you have not applied any kind of activation function.
No activation means your outputs are never squashed.
Because of that, your gradients explode, the weights blow up during the updates, and the result can be NaN.
You are also training for 2000 iterations, so after a few hundred updates the weights will inevitably become NaN.
Try using an activation function like sigmoid and add at least one hidden layer, and you will be fine.
Also reduce the number of iterations: for MNIST classification you do not need to train for 2000 iterations, it is a waste of time.
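For example, here is a minimal, untested sketch of that suggestion. It keeps the question's mnist, sess, x and y objects; the hidden-layer size of 128, the stddev of 0.1 and the learning rate of 0.5 are arbitrary choices, and the summed squared error is swapped for an averaged softmax cross-entropy as an extra safeguard against exploding gradients:

def model(input):
    # one hidden layer, squashed with a sigmoid as suggested above
    w1 = tf.Variable(tf.truncated_normal([784, 128], stddev=0.1), name="W1")
    b1 = tf.Variable(tf.constant(0.1, shape=[128]), name="B1")
    hidden = tf.nn.sigmoid(tf.matmul(input, w1) + b1)
    w2 = tf.Variable(tf.truncated_normal([128, 10], stddev=0.1), name="W2")
    b2 = tf.Variable(tf.constant(0.1, shape=[10]), name="B2")
    logits = tf.matmul(hidden, w2) + b2
    tf.summary.histogram("weights", w1)
    tf.summary.histogram("activations", hidden)
    return logits

# inside train(): a loss that is averaged per example and bounded below
my_label = model(x)
mean_error = tf.reduce_mean(
    tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=my_label))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(mean_error)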
I have 3 folders for my CNN model: train_data, val_data and test_data.
When I train my model, I find that the accuracy varies and sometimes the last epoch does not give the best accuracy. For example, the last epoch's accuracy is 71%, but an earlier epoch reached a better accuracy. I want to save the checkpoint of the epoch with the highest accuracy and then use that checkpoint to predict on test_data.
I trained my model on train_data, evaluated it on val_data, and saved the checkpoint of the model like below:
print("{} Saving checkpoint of model...". format(datetime.now()))
checkpoint_path = os.path.join(checkpoint_dir, 'model_epoch' + str(epoch) + '.ckpt')
save_path = saver.save(session, checkpoint_path)
and before starting the tf.Session() I have this line:
saver = tf.train.Saver()
I want to know how I can save the checkpoint of the epoch with the highest accuracy and then use this checkpoint for my test_data.
The tf.train.Saver() documentation describes the following:
saver.save(sess, 'my-model', global_step=0) ==> filename: 'my-model-0'
...
saver.save(sess, 'my-model', global_step=1000) ==> filename: 'my-model-1000'
Note that if you pass global_step to the saver, you will generate checkpoint files that contain the global step number. I generally save checkpoints every X minutes, then come back, review the results and choose a checkpoint at the appropriate step value. If you're using TensorBoard you'll find this intuitive, since all your graphs can be displayed by global step as well.
https://www.tensorflow.org/api_docs/python/tf/train/Saver
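For the "only keep the best epoch" part of your question, a minimal sketch built on the same Saver API (compute_val_accuracy is a hypothetical placeholder for the validation-accuracy computation you already do each epoch; saver, session, checkpoint_dir and num_epochs are the objects from your code):

best_accuracy = 0.0
best_checkpoint = os.path.join(checkpoint_dir, 'best_model.ckpt')
for epoch in range(num_epochs):
    # ... your existing training code for one epoch goes here ...
    epoch_accuracy = compute_val_accuracy()  # hypothetical: however you already score val_data
    if epoch_accuracy > best_accuracy:
        best_accuracy = epoch_accuracy
        saver.save(session, best_checkpoint)  # overwrite the previous best checkpoint
# for test_data, restore the best weights instead of the last ones
saver.restore(session, best_checkpoint)
# ... run your prediction on test_data here ...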
You can use a tf.train.CheckpointSaverListener:
from __future__ import print_function
import glob
import os

import tensorflow as tf
from sacred import Experiment

# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data

ex = Experiment('test-07-05-2018')
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
checkpoint_path = "/tmp/checkpoints/"

class ExampleCheckpointSaverListener(tf.train.CheckpointSaverListener):
    def begin(self):
        print('Starting the session.')
        self.prev_accuracy = 0.0
        self.acc = 0.0  # updated from the training loop via listener.acc = acc

    def after_save(self, session, global_step_value):
        print('Only keep this checkpoint if it is better than the previous one')
        if self.acc < self.prev_accuracy:
            # a checkpoint consists of several files sharing one prefix, so remove them all
            latest = tf.train.latest_checkpoint(checkpoint_path)
            if latest is not None:
                for f in glob.glob(latest + '*'):
                    os.remove(f)
        else:
            self.prev_accuracy = self.acc

    def end(self, session, global_step_value):
        print('Done with the session.')

@ex.config
def my_config():
    pass

@ex.automain
def main():
    # Build the graph of vanilla multiclass logistic regression
    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y_pred = tf.nn.softmax(tf.matmul(x, W) + b)
    loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_pred), reduction_indices=1))
    # CheckpointSaverHook needs a global step tensor in the graph
    global_step = tf.train.get_or_create_global_step()
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss, global_step=global_step)
    y_pred_cls = tf.argmax(y_pred, dimension=1)
    y_true_cls = tf.argmax(y, dimension=1)
    correct_prediction = tf.equal(y_pred_cls, y_true_cls)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

    listener = ExampleCheckpointSaverListener()
    saver_hook = tf.train.CheckpointSaverHook(
        checkpoint_path, save_steps=500, listeners=[listener])
    # MonitoredTrainingSession initializes the variables and runs the hook,
    # which saves checkpoints and calls the listener after every save
    with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]) as sess:
        for epoch in range(25):
            avg_loss = 0.
            total_batch = int(mnist.train.num_examples / 100)
            # Loop over all batches
            for i in range(total_batch):
                batch_xs, batch_ys = mnist.train.next_batch(100)
                _, l, acc = sess.run([optimizer, loss, accuracy],
                                     feed_dict={x: batch_xs, y: batch_ys})
                avg_loss += l / total_batch
                listener.acc = acc  # share the latest accuracy with the listener
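A couple of notes on the sketch above: CheckpointSaverHook requires a global step tensor in the graph (hence the get_or_create_global_step call) and either save_steps or save_secs to be set; save_steps=500 and the glob-based file removal are just illustrative choices, since a checkpoint is stored as several files sharing one prefix.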
I'm comparing the performance of Tensorflow with sklearn on two datasets:
A toy dataset in sklearn
MNIST dataset
Here is my code (Python):
from __future__ import print_function
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf
from sklearn.datasets import load_digits
import numpy as np
# digits = load_digits()
# data = digits.data
# labels = digits.target
# convert to binary labels
# y = np.zeros((labels.shape[0],10))
# y[np.arange(labels.shape[0]),labels] = 1
x_train = mnist.train.images
y_train = mnist.train.labels
x_test = mnist.test.images
y_test = mnist.test.labels
n_train = mnist.train.images.shape[0]
# import pdb;pdb.set_trace()
# Parameters
learning_rate = 1e-3
lambda_val = 1e-5
training_epochs = 30
batch_size = 200
display_step = 1
# Network Parameters
n_hidden_1 = 300 # 1st layer number of neurons
n_input = x_train.shape[1] # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_classes])
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Create model
def multilayer_perceptron(x):
    # Hidden fully connected layer with 300 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Activation
    layer_1_relu = tf.nn.relu(layer_1)
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_1_relu, weights['out']) + biases['out']
    return out_layer
# Construct model
logits = multilayer_perceptron(X)
# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)) + lambda_val*tf.nn.l2_loss(weights['h1']) + lambda_val*tf.nn.l2_loss(weights['out'])
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Test model
pred = tf.nn.softmax(logits) # Apply softmax to logits
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Initializing the variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_train/batch_size)
        # Loop over all batches
        ptr = 0
        for i in range(total_batch):
            next_ptr = ptr + batch_size
            if next_ptr > len(x_train):
                next_ptr = len(x_train)
            batch_x, batch_y = x_train[ptr:next_ptr], y_train[ptr:next_ptr]
            ptr += batch_size
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x,
                                                            Y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(avg_cost))
    print("Optimization Finished!")
    print("Accuracy on training set: ", accuracy.eval({X: x_train, Y: y_train}))
    print("Accuracy on testing set:", accuracy.eval({X: x_test, Y: y_test}))
print("Experimenting sklearn...")
# now experiment with sklearn
from sklearn.datasets import load_digits
import numpy as np
from sklearn.neural_network import MLPClassifier
import time
# use MLP
t_start = time.time()
print('fitting MLP...')
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(300,),max_iter=training_epochs)
clf.fit(x_train,y_train)
print('fitted MLP in {:.2f} seconds'.format(time.time() - t_start))
print('predicting...')
labels_predicted = clf.predict(x_test)
print('accuracy: {:.2f} %'.format(np.mean(np.argmax(y_test,axis=1) == np.argmax(labels_predicted,axis=1)) * 100))
The code is adapted from a github repository. For this testing, I'm using a traditional neural network (MLP) with only one hidden layer of size 300.
Here are the results for both datasets:
sklearn digits: ~83% (tensorflow), ~90% (sklearn)
MNIST: ~94% (tensorflow), ~97% (sklearn)
I'm using the same model for both libraries. All the parameters (number of hidden layers, number of hidden units, learning_rate, l2 regularization constant, number of training epochs, batch size) and optimization algorithms are the same (Adam optimizer, beta parameters for Adam optimizer, no momentum, etc).
Is sklearn doing some magic in its implementation that TensorFlow isn't? Can anyone help explain the difference?
Thank you very much.
I have just begun to study TensorFlow and I want to create a DNN for MNIST. In the tutorial, there is a very simple neural network with 784 input nodes, 10 output nodes and no hidden nodes. I tried to modify that code to create a DNN. Here is my code. I think I just added a hidden layer with 500 nodes between the input and output layers, but the test accuracy is only 10%, which means it is not being trained. Do you know what's wrong with my code?
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
os.chdir('../')
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x=tf.placeholder(tf.float32,[None,784])
W_h1=tf.Variable(tf.zeros([784,500]))
B_h1=tf.Variable(tf.zeros([500]))
h1=tf.nn.relu(tf.matmul(x,W_h1)+B_h1)
'''
W_h2=tf.Variable(tf.zeros([5,5]))
B_h2=tf.Variable(tf.zeros([5]))
h2=tf.nn.relu(tf.matmul(h1,W_h2)+B_h2)
'''
B_o=tf.Variable(tf.zeros([10]))
W_o=tf.Variable(tf.zeros([500,10]))
y=tf.nn.relu(tf.matmul(h1,W_o)+B_o)
y_=tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.05).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
number_steps = 10000
batch_size = 100
for _ in range(number_steps):
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    train = sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
# Print classifier's accuracy
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
OK, following lejlot's suggestion, I changed my code as follows.
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
import os
os.chdir('../')
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x=tf.placeholder(tf.float32,[None,784])
W_h1=tf.Variable(tf.random_normal([784,500]))
B_h1=tf.Variable(tf.random_normal([500]))
h1=tf.nn.relu(tf.matmul(x,W_h1)+B_h1)
'''
W_h2=tf.Variable(tf.random_normal([500,500]))
B_h2=tf.Variable(tf.random_normal([500]))
h2=tf.nn.relu(tf.matmul(h1,W_h2)+B_h2)
'''
B_o=tf.Variable(tf.random_normal([10]))
W_o=tf.Variable(tf.random_normal([500,10]))
y= tf.matmul(h1,W_o)+B_o # notice no activation
y_=tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.nn.log_softmax(y),  # notice log_softmax
                                              reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
sess = tf.InteractiveSession()
tf.global_variables_initializer().run()
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
number_steps = 10000
batch_size = 100
for i in range(number_steps):
    batch_xs, batch_ys = mnist.train.next_batch(batch_size)
    train = sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
    if i % 1000 == 0:
        acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels})
        print('Current loop %d, Accuracy: %g' % (i, acc))
# Print classifier's accuracy
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
There are two modifications:
change the initial values of W_h1 and B_h1 to tf.random_normal
change the definitions of y and cross_entropy
The modifications do work, but I still don't know what was wrong with my original code. I call tf.global_variables_initializer().run(), and I thought this function would randomize the values of W_h1 and B_h1. Besides, if I define y and cross_entropy as follows, it doesn't work.
y= tf.nn.softmax(tf.matmul(h1,W_o)+B_o)
y_=tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y),reduction_indices=[1]))
First of all, this is not a valid classifier model.
y=tf.nn.relu(tf.matmul(h1,W_o)+B_o)
y_=tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
You are using the explicit equation for cross entropy, which requires y to be a (row-wise) probability distribution, yet you produce y by applying relu, meaning that you are simply outputting some non-negative numbers. In fact, if you ever output zeros, your code will produce NaNs and fail (as the log of 0 is minus infinity).
You should use
y = tf.nn.softmax(tf.matmul(h1,W_o)+B_o)
instead. Or even better (for better numerical stability):
y= tf.matmul(h1,W_o)+B_o # notice no activation
y_=tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(
-tf.reduce_sum(y_ * tf.nn.log_softmax(y), # notice log_softmax
reduction_indices=[1]))
Update
The second issue is initialisation: you cannot initialise neural network weights to zeros; they have to be random numbers, typically sampled from low-variance zero-mean Gaussians. The global initialiser does not randomise the weights, it simply runs all the initialisation ops; if the initialisation ops are constant ones (like zeros), it simply makes sure these zeros are assigned to the variables, nothing else (thus it can be used to reset the network, etc.). Zero initialisation works only for convex problems, such as logistic regression, but it cannot work for a complex model like a neural network.
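Concretely, for the hidden layer in the question this could look like the following (stddev=0.1 is just a commonly used small value, not a tuned choice; the biases can typically stay at zero):

W_h1 = tf.Variable(tf.random_normal([784, 500], stddev=0.1))  # low-variance, zero-mean Gaussian
B_h1 = tf.Variable(tf.zeros([500]))                           # zero biases are fine once the weights are random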
I have a dataframe of shape (38307, 26) with a timestamp as index.
I'm trying to implement an LSTM classifier, but I'm struggling to feed the data into the TensorFlow graph.
The final arrays I'm trying to feed are of shape X_train = (38307, 25) and y_train = (38307, 2).
I have added the code in case it helps.
# Parametres
learning_rate = 0.001
training_epochs = 100
batch_size = 128
display_step = 10
# Network Parameters
n_input = 25 # features= 25
n_steps = 28 # timesteps
n_hidden = 128 # hidden layer num of features
n_classes = 2 # Binary classification
# TF Graph input
x = tf.placeholder("float32", [None, n_steps, n_input])
y = tf.placeholder("float32", [None, n_classes])
# TF Weights
weights = {
'out': tf.Variable(tf.random_normal([n_hidden, n_classes]))
}
biases = {
'out': tf.Variable(tf.random_normal([n_classes]))
}
pred = RNN(x, weights, biases)
# Initialize the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
    sess.run(init)
    step = 1
    # Keep training until reach max iterations
    for epoch in range(training_epochs):
        avg_cost = 0
        total_batch = int(len(X_train)/batch_size)
        X_batches = np.array_split(X_train, total_batch)
        Y_batches = np.array_split(y_train, total_batch)
        # Loop over all batches
        for i in range(total_batch):
            batch_x, batch_y = X_batches[i], Y_batches[i]
            # batch_y.shape = (batch_y.shape[0], 1)
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x,
                                                          y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch + 1), "cost=", "{:.9f}".format(avg_cost))
    print('Optimization finished')
    # Store session for analysis with TensorBoard
    writer = tf.summary.FileWriter("/tmp/test", sess.graph)
    # Test model
    print("Accuracy:", accuracy.eval({x: X_test, y: y_test}))
    global result
    result = tf.argmax(pred, 1).eval({x: X_test, y: y_test})
EDIT: here is the RNN function:
def RNN(x, weights, biases):
    # Prepare data shape to match 'rnn' function requirements
    # Current data input shape: (batch_size, n_steps, n_input)
    # Required shape: 'n_steps' tensors list of shape (batch_size, n_input)
    # Permuting batch_size and n_steps
    x = tf.transpose(x, [1, 0, 2])
    # Reshaping to (n_steps*batch_size, n_input)
    x = tf.reshape(x, [-1, n_input])
    # Split to get a list of 'n_steps' tensors of shape (batch_size, n_input)
    x = tf.split(0, n_steps, x)
    # x = tf.split(x, n_steps, 0)  # Syntax change in this version
    # LSTM cell using rnn from tensorflow.contrib
    lstm_cell = rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
    # Get LSTM cell output
    outputs, states = rnn.rnn(lstm_cell, x, dtype=tf.float32)
    # Linear activation, using rnn inner loop last output
    return tf.matmul(outputs[-1], weights['out']) + biases['out']
Unfortunately, the most important part of your code is hidden in the RNN function.
Some tips to help you out: I guess you are trying to build a dynamic RNN (is that correct?). In that case, a common mistake I see is that people confuse the time-major and batch-major setting of these RNNs. In other words, is your input data [batch, time, variables] or [time, batch, variables]?
More about this can be found here: https://github.com/tensorflow/tensorflow/blob/master/tensorflow/g3doc/api_docs/python/functions_and_classes/shard8/tf.nn.dynamic_rnn.md
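For illustration, here is a minimal sketch of the batch-major case with tf.nn.dynamic_rnn. It reuses the question's n_steps, n_input, n_hidden, weights and biases, but it is not a drop-in replacement for your RNN function:

# Batch-major input: [batch, time, variables]; this is dynamic_rnn's default (time_major=False)
x = tf.placeholder(tf.float32, [None, n_steps, n_input])
cell = tf.nn.rnn_cell.BasicLSTMCell(n_hidden, forget_bias=1.0)
outputs, states = tf.nn.dynamic_rnn(cell, x, dtype=tf.float32)
# outputs has shape [batch, time, n_hidden]; take the last time step for classification
last_output = outputs[:, -1, :]
logits = tf.matmul(last_output, weights['out']) + biases['out']
# With time_major=True the same call would instead expect x shaped [time, batch, variables].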
I was going through this TensorFlow tutorial:
https://www.tensorflow.org/versions/r0.9/tutorials/mnist/beginners/index.html
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data/", one_hot=True)
x = tf.placeholder(tf.float32, [None, 784])
W = tf.Variable(tf.zeros([784, 10])) #weights
b = tf.Variable(tf.zeros([10])) #bias
y = tf.nn.softmax(tf.matmul(x, W) + b)
y_ = tf.placeholder(tf.float32, [None, 10])
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y), reduction_indices=[1]))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
for i in range(1000):
    batch_xs, batch_ys = mnist.train.next_batch(100)
    sess.run(train_step, feed_dict={x: batch_xs, y_: batch_ys})
correct_prediction = tf.equal(tf.argmax(y,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
print(sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}))
Towards the very end, we pass the test data into the placeholders. y_ is the matrix containing the true values, and y is the matrix with the predicted values. My question is: when is y computed for the test data? The W matrix has been trained by backpropagation, but this trained matrix must be multiplied with the new input x (the test data) to give the prediction y. Where does this happen?
Normally I have seen sequential execution of code, and in the last few lines y isn't called explicitly.
accuracy depends on correct_prediction, which depends on y.
So when you call sess.run(accuracy, feed_dict={x: mnist.test.images, y_: mnist.test.labels}), y is computed before accuracy is computed. All of this happens inside the TensorFlow graph.
The TensorFlow graph is the same for training and testing. The only difference is the data you feed into the placeholders x and y_.
y is computed here:
y = tf.nn.softmax(tf.matmul(x, W) + b) # Line 7
Specifically, what you are looking for is within that line:
tf.matmul(x, W) + b
the output of which is put through the softmax function to identify the class.
This is computed in each of the 1000 passes through the graph; each time, the variables W and b are updated by gradient descent, and y is computed and compared against y_ to determine the loss.
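If you want to see this step happen explicitly, you can ask the session for y yourself; a small sketch using the same graph:

# Explicitly evaluate the predictions for the test images; this runs the same
# matmul + softmax node that accuracy depends on.
test_predictions = sess.run(y, feed_dict={x: mnist.test.images})
print(test_predictions.shape)        # (10000, 10): one probability row per test image
print(test_predictions[0].argmax())  # predicted class for the first test image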