save checkpoint with Tensorflow - python

I have 3 folders for my CNN model: train_data, val_data and test_data.
While training my model, I noticed that the accuracy varies between epochs and the last epoch does not always give the best result. For example, the last epoch reaches 71% accuracy, but an earlier epoch reached a higher value. I want to save the checkpoint of the epoch with the highest accuracy and then use that checkpoint to run predictions on test_data.
I trained my model on train_data, made predictions on val_data, and saved the checkpoint of the model like below:
print("{} Saving checkpoint of model...". format(datetime.now()))
checkpoint_path = os.path.join(checkpoint_dir, 'model_epoch' + str(epoch) + '.ckpt')
save_path = saver.save(session, checkpoint_path)
And before starting the tf.Session() I have this line:
saver = tf.train.Saver()
How can I save the checkpoint of the epoch with the highest accuracy and then use that checkpoint on my test_data?

The tf.train.Saver() documentation describes the following:
saver.save(sess, 'my-model', global_step=0) ==> filename: 'my-model-0'
...
saver.save(sess, 'my-model', global_step=1000) ==> filename: 'my-model-1000'
Note that if you pass global_step to the saver, you will generate checkpoint files that contain the global step number. I generally save checkpoints every X minutes, then come back, review the results, and choose a checkpoint at the appropriate step value. If you're using TensorBoard you'll find this intuitive, since all your graphs can be displayed by global step as well.
https://www.tensorflow.org/api_docs/python/tf/train/Saver
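
If you would rather not pick a checkpoint by hand, a simple alternative is to track the best validation accuracy yourself and only write a checkpoint when it improves. Below is a minimal sketch, assuming the question's saver, session, checkpoint_dir and datetime import; evaluate_on_val_data() is a hypothetical helper that returns the accuracy on val_data, and num_epochs stands for your epoch count:
best_accuracy = 0.0
best_checkpoint_path = os.path.join(checkpoint_dir, 'best_model.ckpt')

for epoch in range(num_epochs):
    # ... run one epoch of training on train_data here ...
    val_accuracy = evaluate_on_val_data(session)  # hypothetical helper, returns a float

    if val_accuracy > best_accuracy:
        best_accuracy = val_accuracy
        print("{} New best accuracy {:.4f}, saving checkpoint...".format(datetime.now(), best_accuracy))
        saver.save(session, best_checkpoint_path)

# later: restore the best checkpoint and run your prediction ops on test_data
# saver.restore(session, best_checkpoint_path)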

You can use a tf.train.CheckpointSaverListener, for example:
from __future__ import print_function
import tensorflow as tf
import os
import glob
from sacred import Experiment
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data

ex = Experiment('test-07-05-2018')
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
checkpoint_path = "/tmp/checkpoints/"

class ExampleCheckpointSaverListener(tf.train.CheckpointSaverListener):
    def begin(self):
        print('Starting the session.')
        self.prev_accuracy = 0
        self.acc = 0

    def after_save(self, session, global_step_value):
        print('Only keep this checkpoint if it is better than the previous one')
        if self.acc < self.prev_accuracy:
            # a checkpoint consists of several files, so remove everything with its prefix
            for f in glob.glob(tf.train.latest_checkpoint(checkpoint_path) + '*'):
                os.remove(f)
        else:
            self.prev_accuracy = self.acc

    def end(self, session, global_step_value):
        print('Done with the session.')

@ex.config
def my_config():
    pass

@ex.automain
def main():
    # build the graph of vanilla multiclass logistic regression
    x = tf.placeholder(tf.float32, [None, 784])
    y = tf.placeholder(tf.float32, [None, 10])
    W = tf.Variable(tf.zeros([784, 10]))
    b = tf.Variable(tf.zeros([10]))
    y_pred = tf.nn.softmax(tf.matmul(x, W) + b)
    loss = tf.reduce_mean(-tf.reduce_sum(y * tf.log(y_pred), reduction_indices=1))
    optimizer = tf.train.GradientDescentOptimizer(0.01).minimize(loss)
    init = tf.global_variables_initializer()
    y_pred_cls = tf.argmax(y_pred, dimension=1)
    y_true_cls = tf.argmax(y, dimension=1)
    correct_prediction = tf.equal(y_pred_cls, y_true_cls)
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    saver = tf.train.Saver()
    listener = ExampleCheckpointSaverListener()
    # save (and trigger the listener) every 1000 steps
    saver_hook = tf.train.CheckpointSaverHook(checkpoint_path, save_steps=1000, listeners=[listener])
    with tf.train.MonitoredTrainingSession(chief_only_hooks=[saver_hook]) as sess:
        sess.run(init)
        for epoch in range(25):
            avg_loss = 0.
            total_batch = int(mnist.train.num_examples / 100)
            # Loop over all batches
            for i in range(total_batch):
                batch_xs, batch_ys = mnist.train.next_batch(100)
                _, l, acc = sess.run([optimizer, loss, accuracy], feed_dict={x: batch_xs, y: batch_ys})
                avg_loss += l / total_batch
                listener.acc = acc  # give the listener the latest accuracy to compare against
            saver.save(sess, checkpoint_path)
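After training, whichever checkpoint remains in checkpoint_path is the best one the listener kept, and you can restore it to evaluate on your test set. A minimal sketch, assuming the graph above (x, y, accuracy, saver) is still in scope:
with tf.Session() as sess:
    saver.restore(sess, tf.train.latest_checkpoint(checkpoint_path))
    test_acc = sess.run(accuracy, feed_dict={x: mnist.test.images, y: mnist.test.labels})
    print('Test accuracy: {:.4f}'.format(test_acc))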

Related

Tensorflow Histogram error

I am new to tensorflow and my code is below:
import tensorflow as tf
logdir="/tmp/mnist_tutorial5/"
mnist = tf.contrib.learn.datasets.mnist.read_data_sets(train_dir=logdir+"data",one_hot = True)
tf.reset_default_graph()
sess = tf.Session()
writer = tf.summary.FileWriter(logdir)
def model(input):
    w = tf.Variable(tf.truncated_normal([784, 10], stddev=0.1), name="W")
    b = tf.Variable(tf.constant(0.1, shape=[10]), name="B")
    act = tf.matmul(input, w) + b
    tf.summary.histogram("weights", w)
    tf.summary.histogram("biases", b)
    tf.summary.histogram("activations", act)
    return act

def train():
    x = tf.placeholder(tf.float32, shape=[None, 784], name="input_img")
    y = tf.placeholder(tf.float32, shape=[None, 10], name="labels")
    my_label = model(x)
    print("linear_regression is completed")
    mean_error = tf.reduce_mean(tf.reduce_sum(tf.square(my_label - y)))
    tf.summary.scalar("loss", mean_error)
    train_step = tf.train.GradientDescentOptimizer(0.0003).minimize(mean_error)
    correct_prediction = tf.equal(tf.argmax(my_label, 1), tf.argmax(y, 1))
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
    tf.summary.scalar("accuracy", accuracy)
    sess.run(tf.global_variables_initializer())
    summ = tf.summary.merge_all()
    for i in range(2000):
        batch = mnist.train.next_batch(100)
        train_accuracy = sess.run(train_step, feed_dict={x: batch[0], y: batch[1]})
        print("%s th iteration" % i)
        if i % 500 == 0:
            print("over 2")
            summarys = sess.run(summ, {x: batch[0], y: batch[1]})  ## i'm getting error here
            print("over 3")
            writer.add_summary(summarys, i)
            print("one over")

train()
writer.add_graph(sess.graph)
And this is the error I am getting:
InvalidArgumentError (see above for traceback): Nan in summary histogram for: weights
[[Node: weights = HistogramSummary[T=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"](weights/tag, W/read)]]
First of all, it is just a one-layer network with no hidden layer.
You have not applied any kind of activation function.
No activation means your outputs are not being squashed.
Your gradients explode, so the weight updates explode as well, which can result in NaN.
You are training for 2000 iterations, so after a few iterations the weights will be NaN.
Try using an activation function such as sigmoid and add at least one hidden layer, and you should be fine.
Also reduce the number of iterations; for MNIST classification you do not need to train the model for 2000 iterations, it is a waste of time.
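For example, a minimal sketch of the suggested change to model(), adding one hidden layer with a sigmoid activation; the hidden size of 128 is my own arbitrary choice, not something from the question:
def model(input):
    # hidden layer with a sigmoid activation so the activations are squashed
    w1 = tf.Variable(tf.truncated_normal([784, 128], stddev=0.1), name="W1")
    b1 = tf.Variable(tf.constant(0.1, shape=[128]), name="B1")
    hidden = tf.nn.sigmoid(tf.matmul(input, w1) + b1)
    # output layer (logits)
    w2 = tf.Variable(tf.truncated_normal([128, 10], stddev=0.1), name="W2")
    b2 = tf.Variable(tf.constant(0.1, shape=[10]), name="B2")
    act = tf.matmul(hidden, w2) + b2
    tf.summary.histogram("weights", w1)
    tf.summary.histogram("biases", b1)
    tf.summary.histogram("activations", act)
    return act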

Inferior performance of Tensorflow compared to sklearn

I'm comparing the performance of Tensorflow with sklearn on two datasets:
A toy dataset in sklearn
MNIST dataset
Here is my code (Python):
from __future__ import print_function
# Import MNIST data
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("/tmp/data/", one_hot=True)
import tensorflow as tf
from sklearn.datasets import load_digits
import numpy as np
# digits = load_digits()
# data = digits.data
# labels = digits.target
# convert to binary labels
# y = np.zeros((labels.shape[0],10))
# y[np.arange(labels.shape[0]),labels] = 1
x_train = mnist.train.images
y_train = mnist.train.labels
x_test = mnist.test.images
y_test = mnist.test.labels
n_train = mnist.train.images.shape[0]
# import pdb;pdb.set_trace()
# Parameters
learning_rate = 1e-3
lambda_val = 1e-5
training_epochs = 30
batch_size = 200
display_step = 1
# Network Parameters
n_hidden_1 = 300 # 1st layer number of neurons
n_input = x_train.shape[1] # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# tf Graph input
X = tf.placeholder("float", [None, n_input])
Y = tf.placeholder("float", [None, n_classes])
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Create model
def multilayer_perceptron(x):
    # Hidden fully connected layer with 300 neurons
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    # Activation
    layer_1_relu = tf.nn.relu(layer_1)
    # Output fully connected layer with a neuron for each class
    out_layer = tf.matmul(layer_1_relu, weights['out']) + biases['out']
    return out_layer
# Construct model
logits = multilayer_perceptron(X)
# Define loss and optimizer
loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=Y)) + lambda_val*tf.nn.l2_loss(weights['h1']) + lambda_val*tf.nn.l2_loss(weights['out'])
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
# Test model
pred = tf.nn.softmax(logits) # Apply softmax to logits
correct_prediction = tf.equal(tf.argmax(pred, 1), tf.argmax(Y, 1))
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
# Initializing the variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(n_train/batch_size)
        # Loop over all batches
        ptr = 0
        for i in range(total_batch):
            next_ptr = ptr + batch_size
            if next_ptr > len(x_train):
                next_ptr = len(x_train)
            batch_x, batch_y = x_train[ptr:next_ptr], y_train[ptr:next_ptr]
            ptr += batch_size
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([train_op, loss_op], feed_dict={X: batch_x,
                                                            Y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost={:.9f}".format(avg_cost))
    print("Optimization Finished!")
    print("Accuracy on training set:", accuracy.eval({X: x_train, Y: y_train}))
    print("Accuracy on testing set:", accuracy.eval({X: x_test, Y: y_test}))
print("Experimenting sklearn...")
# now experiment with sklearn
from sklearn.datasets import load_digits
import numpy as np
from sklearn.neural_network import MLPClassifier
import time
# use MLP
t_start = time.time()
print('fitting MLP...')
clf = MLPClassifier(solver='adam', alpha=1e-5, hidden_layer_sizes=(300,),max_iter=training_epochs)
clf.fit(x_train,y_train)
print('fitted MLP in {:.2f} seconds'.format(time.time() - t_start))
print('predicting...')
labels_predicted = clf.predict(x_test)
print('accuracy: {:.2f} %'.format(np.mean(np.argmax(y_test,axis=1) == np.argmax(labels_predicted,axis=1)) * 100))
The code is adapted from a github repository. For this testing, I'm using a traditional neural network (MLP) with only one hidden layer of size 300.
The following are the results for both datasets:
sklearn digits: ~83% (tensorflow), ~90% (sklearn)
MNIST: ~94% (tensorflow), ~97% (sklearn)
I'm using the same model for both libraries. All the parameters (number of hidden layers, number of hidden units, learning_rate, l2 regularization constant, number of training epochs, batch size) and optimization algorithms are the same (Adam optimizer, beta parameters for Adam optimizer, no momentum, etc).
I wonder if sklearn has some magic in its implementation that beats tensorflow here? Can anyone help answer?
Thank you very much.

TensorFlow loss returns NaN

I'm trying to write my first simple neural net with my own data, based on various tutorials and information. I think I have the model prepared and I'm trying to run it, but when I print how the cost function changes in every epoch, it returns NaN.
My code is:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
df = pd.read_excel("mydataset.xlsx")
# Preparing the dataset, doing some stuff here
df2 = df.dropna(subset=['wl'])
df2 = df2.sample(frac=1)
df2_X = df2[['param1','param2','param3','param4','param5','param6','param7']]
df2_y = df2[['numerical_result_param']]
# Spliting the dataset...
train_X, test_X, train_y, test_y = df2_X[:210], df2_X[210:], df2_y[:210], df2_y[210:]
# Creating model:
X = tf.placeholder("float", shape=[None, train_X.shape[1]])
y = tf.placeholder("float", shape=[None, train_y.shape[1]])
hl_size = 256 # Number of neurons in hidden layer
weights = {
'hl': tf.Variable(tf.random_normal([train_X.shape[1], hl_size])),
'out': tf.Variable(tf.random_normal([hl_size, train_y.shape[1]]))
}
biases = {
'hl': tf.Variable(tf.random_normal([hl_size])),
'out': tf.Variable(tf.random_normal([train_y.shape[1]]))
}
def multilayer_perceptron(x):
    hl_layer = tf.add(tf.matmul(x, weights['hl']), biases['hl'])
    hl_layer = tf.nn.relu(hl_layer)
    out_layer = tf.matmul(hl_layer, weights['out']) + biases['out']
    return out_layer
logits = multilayer_perceptron(X)
hm_epochs = 100 # Number of epochs
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y, logits=logits))
optimizer = tf.train.AdamOptimizer(0.01).minimize(cost) # Training optimizer
# Running the session
with tf.Session() as sess:
    init = tf.global_variables_initializer()
    sess.run(init)
    for epoch in range(hm_epochs):
        epoch_loss = 0
        _, c = sess.run([optimizer, cost], feed_dict={X: train_X, y: train_y})
        epoch_loss += c
        print('Epoch', epoch, 'out of', hm_epochs, 'loss:', epoch_loss)
And it returns:
Epoch 0 out of 100 loss: nan
Epoch 1 out of 100 loss: nan
etc.
I'd appreciate any help and ideas about what I did wrong!

Neural network for regression not learning in tensorflow

I've modified a TensorFlow example to fit my data, given here: data
But my neural network is not learning at all. I tried different numbers of hidden layers, learning rates, and optimization functions, but it didn't help. My code is given below:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import tensorflow as tf
from tensorflow.contrib import learn
import matplotlib.pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn import datasets, linear_model
from sklearn import cross_validation
from sklearn import preprocessing
import numpy as np
filename_queue = tf.train.string_input_producer(["file0.csv"])
reader = tf.TextLineReader()
key, value = reader.read(filename_queue)
# Default values, in case of empty columns. Also specifies the type of the
# decoded result.
record_defaults = [[0], [0], [0], [0]]
col1, col2, col3, col4 = tf.decode_csv(
value, record_defaults=record_defaults)
features = tf.stack([col1, col2, col3])
with tf.Session() as sess:
    # Start populating the filename queue.
    coord = tf.train.Coordinator()
    threads = tf.train.start_queue_runners(coord=coord)
    x = np.zeros(shape=(1813, 3))
    y = np.zeros(shape=(1813))
    for i in range(1813):
        # Retrieve a single instance:
        x1, y1 = sess.run([features, col4])
        x[i] = x1
        y[i] = y1
    coord.request_stop()
    coord.join(threads)
#standard_scaler = preprocessing.StandardScaler()
#x = standard_scaler.fit_transform(x)
# Split in test and train data
X_train, X_test, Y_train, Y_test = cross_validation.train_test_split(x, y, test_size=0.2)
total_len = X_train.shape[0]
# Parameters
learning_rate = 0.001
training_epochs = 500
batch_size = 5
display_step = 1
# Network Parameters
n_hidden_1 = 50
n_input = X_train.shape[1]
n_classes = 1
# tf Graph input
x = tf.placeholder("float", [None, 3])
y = tf.placeholder("float", [None])
# Create model
def multilayer_perceptron(x, weights, biases):
    # Hidden layer with RELU activation
    layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
    layer_1 = tf.nn.relu(layer_1)
    # Output layer with linear activation
    out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
    return out_layer
# Store layers weight & bias
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1], 0, 0.1)),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes], 0, 0.1))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1], 0, 0.1)),
'out': tf.Variable(tf.random_normal([n_classes], 0, 0.1))
}
# Construct model
pred = multilayer_perceptron(x, weights, biases)
#reshape(pred, [-1])
tf.shape(pred)
tf.shape(y)
print("Prediction matrix:", pred)
print("Output matrix:", y)
# Define loss and optimizer
cost = tf.reduce_mean(tf.square(pred-y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost)
# Launch the graph
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    # Training cycle
    for epoch in range(training_epochs):
        avg_cost = 0.
        total_batch = int(total_len/batch_size)
        print(total_batch)
        # Loop over all batches
        for i in range(total_batch-1):
            batch_x = X_train[i*batch_size:(i+1)*batch_size]
            batch_y = Y_train[i*batch_size:(i+1)*batch_size]
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c, p = sess.run([optimizer, cost, pred], feed_dict={x: batch_x,
                                                                   y: batch_y})
            # Compute average loss
            avg_cost += c / total_batch
            # sample prediction
            label_value = batch_y
            estimate = p
            err = label_value - estimate
            print("num batch:", total_batch)
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f}".format(avg_cost))
            print("[*]----------------------------")
            for i in xrange(5):
                print("label value:", label_value[i], "estimated value:", estimate[i])
            print("[*]============================")
    print("Optimization Finished!")
    # Test model
    correct_prediction = tf.equal(tf.argmax(pred), tf.argmax(y))
    # Calculate accuracy
    accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
    print("Accuracy:", accuracy.eval({x: X_test, y: Y_test}))
and the result looks like this (label value = expected result):
result

Does anyone know how to allocate more memory for the graph? Tensorflow: "ValueError: GraphDef cannot be larger than 2GB."

I am learning how to do transfer learning from an AlexNet already trained on images. I am stripping off the last fully connected layer in AlexNet and creating my own layer with the number of classes in my traffic sign labels. I don't want to retrain the layers before this new layer, only the new layer itself.
I am having trouble finding the bug that is causing this error. I have searched the web for solutions, but I don't believe any of them apply.
Please take a moment and see if you can find where I am creating the extra nodes that are causing the Graph to grow. Thanks!
import pickle
import tensorflow as tf
from sklearn.model_selection import train_test_split
from alexnet import AlexNet
import os
import numpy as np
# TODO: Load traffic signs data.
# Load pickled data
# Where training and testing data is saved.
training_file = os.getcwd()+"/train.p"
print(training_file)
with open(training_file, mode='rb') as f:
    train = pickle.load(f)
X_train, y_train = train['features'], train['labels']
print('Train data shape = ', X_train.shape)
X_train_original = np.copy(X_train)
# TODO: Split data into training and validation sets.
X_train, X_valid, y_train, y_valid = train_test_split(X_train,
y_train,
train_size=.8,
random_state=42)
# TODO: Define placeholders and resize operation.
nb_classes = 43
x = tf.placeholder(tf.float32, (None, 32, 32, 3), name='x')
y = tf.placeholder(tf.int32, (None), name='y')
one_hot_y = tf.one_hot(y, 43, name='one_hot_y')
resized = tf.image.resize_images(x, (227, 227))
print('Resized data shape = ', resized.shape)
# TODO: pass placeholder as first argument to `AlexNet`.
fc7 = AlexNet(resized, feature_extract=True)
# NOTE: `tf.stop_gradient` prevents the gradient from flowing backwards
# past this point, keeping the weights before and up to `fc7` frozen.
# This also makes training faster, less work to do!
fc7 = tf.stop_gradient(fc7)
# TODO: Add the final layer for traffic sign classification.
shape = (fc7.get_shape().as_list()[-1], nb_classes) # use this shape for the weight matrix
#fc8W = np.random.normal(0, .15, size=(shape[0], shape[1])).astype(np.float32)
#print(fc8W)
#fc8b = np.random.normal(0, .15, size=(shape[1])).astype(np.float32)
#print(fc8b)
mu = 0
sigma = 0.05
fc8W = tf.Variable(tf.truncated_normal(shape, stddev=1e-2))
fc8b = tf.Variable(tf.zeros(nb_classes))
#logits = tf.matmul(fc7, fc8W) + fc8b; print(logits)
logits = tf.nn.xw_plus_b(fc7, fc8W, fc8b)
#probs = tf.nn.softmax(logits)
# TODO: Define loss, training, accuracy operations.
# HINT: Look back at your traffic signs project solution, you may
# be able to reuse some the code.
rate = 0.001
cross_entropy = tf.nn.softmax_cross_entropy_with_logits(logits=logits, labels=one_hot_y)
loss_operation = tf.reduce_mean(cross_entropy)
optimizer = tf.train.AdamOptimizer(learning_rate = rate)
training_operation = optimizer.minimize(loss_operation)
# TODO: Train and evaluate the feature extraction model.
#Shuffle data
from sklearn.utils import shuffle
X_train, y_train = shuffle(X_train, y_train)
#Epochs for training and batch sizes defined.
EPOCHS = 10
BATCH_SIZE = 128
### Evaluation function.
correct_prediction = tf.equal(tf.argmax(logits, 1), tf.argmax(one_hot_y, 1))
accuracy_operation = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
#saver = tf.train.Saver()
def evaluate(X_data, y_data):
    num_examples = len(X_data)
    total_accuracy = 0
    sess = tf.get_default_session()
    for offset in range(0, num_examples, BATCH_SIZE):
        batch_x, batch_y = X_data[offset:offset+BATCH_SIZE], y_data[offset:offset+BATCH_SIZE]
        accuracy = sess.run(accuracy_operation, feed_dict={x: batch_x, y: batch_y, keep_prob: 1})
        total_accuracy += (accuracy * len(batch_x))
    return total_accuracy / num_examples
### Training function.
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    num_examples = len(X_train)
    print("Training...")
    print()
    for i in range(EPOCHS):
        X_train, y_train = shuffle(X_train, y_train)
        for offset in range(0, num_examples, BATCH_SIZE):
            end = offset + BATCH_SIZE
            batch_x, batch_y = X_train[offset:end], y_train[offset:end]
            sess.run(training_operation, feed_dict={x: batch_x, y: batch_y})
        train_accuracy = evaluate(X_train, y_train)
        validation_accuracy = evaluate(X_valid, y_valid)
        print("EPOCH {} ...".format(i+1))
        print('Training Accuracy = {:.3f}'.format(train_accuracy))
        print("Validation Accuracy = {:.3f}".format(validation_accuracy))
        print()
