I edited the TensorFlow MNIST sample, which reached ~90% accuracy on my PC, and tried to use similar code on the CIFAR-10 dataset. However, the accuracy only ranged from 0-15% and never got above 20%.
import six.moves.cPickle as cPickle
from pprint import pprint
def unpickle():
dict=[]
fo = open(r'C:\train\cifar-10-batches-py\data_batch_1', 'rb')
dict.append(cPickle.load(fo, encoding='latin1'))
fo.close()
return dict
def testpickle():
afo = open(r'C:\train\cifar-10-batches-py\test_batch', 'rb')
adict = cPickle.load(afo, encoding='latin1')
afo.close()
return adict
dt=unpickle()
import tensorflow as tf
import numpy as np
datadt=np.empty([5,10000,1024])
####to arrange input data properly####
for p in range(len(dt)):
print(p)
for i in range(len(dt[p]["labels"])):
a=dt[p]["labels"][i]
dt[p]["labels"][i]=[0,0,0,0,0,0,0,0,0,0]
dt[p]["labels"][i][a]=1
datadt[p][i]=(dt[p]["data"][i].tolist()[:1024])
tdt=testpickle()
###arrange test data properly###
testdt=np.empty([10000,1024])
for i in range(len(tdt["labels"])):
a=tdt["labels"][i]
tdt["labels"][i]=[0,0,0,0,0,0,0,0,0,0]
tdt["labels"][i][a]=1
testdt[i]=(tdt["data"][i].tolist()[:1024])
sess = tf.InteractiveSession()
x = tf.placeholder(tf.float32, shape=[None, 1024])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
W_conv1=weight_variable([5,5,1,8])
b_conv1=bias_variable([8])
x_image=tf.reshape(x,[-1,32,32,1])
h_conv1=tf.nn.relu(conv2d(x_image,W_conv1)+b_conv1)
h_pool1=max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 8, 16])
b_conv2 = bias_variable([16])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([8 * 8 * 16, 32])
b_fc1 = bias_variable([32])
h_pool2_flat = tf.reshape(h_pool2, [-1, 8*8*16])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([32, 10])
b_fc2 = bias_variable([10])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(0.5).minimize(cross_entropy)
sess.run(tf.global_variables_initializer())
tshaped_x=testdt
tshaped_y=tdt["labels"]
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
k=100
import random
for i in range(len(dt)):
for u in range(99):
shaped_x=datadt[i][(u*k):(u*k+k)]#np.reshape(dt["data"][i], (-1,3072))
shaped_y=dt[i]["labels"][(u*k):(u*k+k)]#np.reshape(dt["labels"][i], (-1,10))
train_step.run(feed_dict={x: shaped_x, y_:shaped_y,keep_prob:0.5})
r=random.randint(0,9000)
print(accuracy.eval(feed_dict={x:tshaped_x[r:r+50], y_:tshaped_y[r:r+50],keep_prob:1.0}))
The neural network part of the code is very similar to the sample; however, these are the results:
0.08
0.06
0.12
0.2
0.14
0.14
0.1
0.12
0.1
0.1
0.04
0.14
0.14
(For convenience I just used the red channel of each picture as input - the original 3072 ints encode R, G, and B, and I used the first 1024 ints, as shown by dt[p]["data"][i].tolist()[:1024].)
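For reference, the full 3072-int row can be turned back into a 32x32 colour image like this (just a sketch of the standard CIFAR-10 layout; the code above does not do this):
import numpy as np
# CIFAR-10 stores each image as 1024 red, then 1024 green, then 1024 blue values
row = dt[0]["data"][0]                 # shape (3072,)
img = np.reshape(row, (3, 32, 32))     # channels-first planes: R, G, B
img = np.transpose(img, (1, 2, 0))     # -> (32, 32, 3) image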
I've been looking for answers on different sites but sadly failed. As a beginner to TensorFlow, sorry for being naive. Thanks in advance for your generous help!
P.S. No matter how I altered the AdamOptimizer learning rate, from 0.0001 to 999, the results are the same (very similar).
When initializing the weights, lower the standard deviation to around 0.01, or experiment with it further. Your network will start to learn then!
Refer to this: https://stats.stackexchange.com/questions/198840/cnn-xavier-weight-initialization
Keep in mind that the values given there are variances, and we need to feed a standard deviation, so take the square root of them.
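For example, a rough Xavier-style helper (my own sketch, not taken from the question's code) would derive the stddev from the layer shape:
import numpy as np
import tensorflow as tf

def xavier_weight_variable(shape):
    # works for a conv kernel [h, w, in_ch, out_ch] or a dense weight [in, out]
    fan_in = np.prod(shape[:-1])
    fan_out = shape[-1]
    # the linked answer quotes the variance 2/(fan_in + fan_out);
    # truncated_normal expects a standard deviation, hence the square root
    stddev = np.sqrt(2.0 / (fan_in + fan_out))
    return tf.Variable(tf.truncated_normal(shape, stddev=stddev))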
Hi, I'm new to neural networks and I'm currently working with TensorFlow.
First I did the MNIST tutorial, which worked quite well. Now I want to deepen my understanding by building my own network for CIFAR-10 in Google Colab. For this purpose I wrote the following code:
def conv2d(input, size, inputDim, outputCount):
with tf.variable_scope("conv2d"):
## -> This area causes problems <- ##
##########variant1
weight = tf.Variable(tf.truncated_normal([size, size, inputDim, outputCount], stddev=0.1),name="weight")
bias = tf.Variable( tf.constant(0.1, shape=[outputCount]),name="bias")
##########variant2
weight = tf.get_variable("weight", tf.truncated_normal([size, size, inputDim, outputCount], stddev=0.1))
bias = tf.get_variable("bias", tf.constant(0.1, shape=[outputCount]))
##################
conv = tf.nn.relu(tf.nn.conv2d(input, weight, strides=[1, 1, 1, 1], padding='SAME') + bias)
return conv
def maxPool(conv2d):....
def fullyConnect(input, inputSize, outputCount, relu):
with tf.variable_scope("fullyConnect"):
## -> This area causes problems <- ##
##########variant1
weight = tf.Variable( tf.truncated_normal([inputSize, outputCount], stddev=0.1),name="weight")
bias = tf.Variable( tf.constant(0.1, shape=[outputCount]),name="bias")
##########variant2
weight = tf.get_variable("weight", tf.truncated_normal([inputSize, outputCount], stddev=0.1))
bias = tf.get_variable("bias", tf.constant(0.1, shape=[outputCount]))
##################
fullyIn = tf.reshape(input, [-1, inputSize])
fullyCon = fullyIn
if relu:
fullyCon = tf.nn.relu(tf.matmul(fullyIn, weight) + bias)
return fullyCon
#Model Def.
def getVGG16A(grafic,width,height,dim):
with tf.name_scope("VGG16A"):
img = tf.reshape(grafic, [-1,width,height,dim])
with tf.name_scope("Layer1"):
with tf.variable_scope("Layer1"):
with tf.variable_scope("conv1"):
l1_c = conv2d(img,3, dim, 64)
with tf.variable_scope("mp1"):
l1_mp = maxPool(l1_c) #32 > 16
with tf.name_scope("Layer2"):
with tf.variable_scope("Layer2"):
with tf.variable_scope("conv1"):
l2_c = conv2d(l1_mp,3, 64, 128)
with tf.variable_scope("mp1"):
l2_mp = maxPool(l2_c) #16 > 8
with tf.name_scope("Layer6"):
with tf.variable_scope("Layer6"):
with tf.variable_scope("fully1"):
L6_fc1 = fullyConnect(l2_mp, 8*8*128 , 1024, True)
with tf.variable_scope("fully2"):
L6_fc2 = fullyConnect(L6_fc1, 1024, 1024, True)
keep_prob = tf.placeholder(tf.float32)
drop = tf.nn.dropout(L6_fc2, keep_prob)
with tf.variable_scope("fully3"):
L6_fc3 = fullyConnect(drop,1024, 3, False)
return L6_fc3, keep_prob
x = tf.placeholder(tf.float32, [None, 3072]) #input
y_ = tf.placeholder(tf.float32, [None, 3]) #output
# Build the graph for the deep net
y_conv, keep_prob = getVGG16A(x,32,32,3) #create Model
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
train_step = tf.train.AdamOptimizer(1e-3).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for batch in getBatchData(prep_filter_dataBatch1,2): #a self-written method for custom batch return
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.8})
print('test accuracy %g' % accuracy.eval(feed_dict={
x: mnist.test.images, y_: mnist.test.labels, keep_prob: 1.0}))
For the definition of the TensorFlow variables I first used variant1 (tf.Variable).
This caused the graphics memory to overflow after repeated execution.
Then I used variant2 (tf.get_variable). If I have understood the documentation correctly, this should reuse already existing variables if they exist.
But as soon as I do this I get the following error message:
TypeError: Tensor objects are not iterable when eager execution is not enabled. To iterate over this tensor use tf.map_fn.
I've been searching the whole day, but I haven't found an explanation for this.
I hope someone here can explain to me why this is not possible, or where I can find further information. The error message is getting me nowhere. I don't just want a solution, because I want to and have to understand this; I plan to write my bachelor thesis in the field of CNNs.
Why can I use tf.Variable but not tf.get_variable, which should do the same thing?
Thanks for the help,
best regards, Pascal :)
I found my mistake.
I forgot the initializer keyword.
The correct line looks like this:
weight = tf.get_variable("weight",initializer=tf.truncated_normal([size, size, inputDim, outputCount], stddev=anpassung))
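Presumably the bias line needs the same treatment (my sketch, reusing the outputCount shape from the question):
bias = tf.get_variable("bias", initializer=tf.constant(0.1, shape=[outputCount]))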
In the following code, I have modified the Deep MNIST example from the official TensorFlow tutorials.
Modifications: I added weight decay to the loss function and also modified the weight variables accordingly. (If this is incorrect, please do let me know.)
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import argparse
import sys
from tensorflow.examples.tutorials.mnist import input_data
import tensorflow as tf
from hyperopt import STATUS_OK, STATUS_FAIL
Flags2=None
def build_and_optimize(hp_space):
global Flags2
Flags2 = {}
Flags2['dp'] = hp_space['dropout_global']
Flags2['wd'] = hp_space['wd']
res = main(Flags2)
results = {
'loss': res,
'status': STATUS_OK
}
return results
def deepnn(x):
"""deepnn builds the graph for a deep net for classifying digits.
args:
x: an input tensor with the dimensions (N_examples, 784), where 784 is the number of pixels in a standard MNIST image.
returns:
a tuple (y, keep_prob). y is a tensor of shape (N_examples, 10), with values equal to the logits of classifying the digit into one of 10 classes (the digits 0-9). keep_prob is a scalar placeholder for the probability of dropout.
"""
# reshape to use within a convolutional neural net
# last dimension is for "features" - there is only one here, since images are
# grayscale -- it would be 3 for RGB, 4 for RGBA, etc.
x_image = tf.reshape(x, [-1, 28, 28, 1])
wd = tf.placeholder(tf.float32)
# first convolutional layer - maps one grayscale image to 32 feature maps
W_conv1 = weight_variable([5, 5, 1, 32], wd)
b_conv1 = bias_variable([32])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
# pooling layer - downsamples by 2X
h_pool1 = max_pool_2X2(h_conv1)
# second convolutional layer --maps 32 feature maps to 64
W_conv2 = weight_variable([5, 5, 32, 64], wd)
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
# second pooling layer - downsamples by 2X
h_pool2 = max_pool_2X2(h_conv2)
# fully connected layer 1 -- after 2 rounds of downsampling, our 28x28 image
# is down to 7x7x64 feature maps -- map this to 1024 features.
W_fc1 = weight_variable([7*7*64, 1024], wd)
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
# dropout - controls the complexity of the model, prevents co-adaptation of features.
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
# map the 1024 features to 10 classes, one for each digit
W_fc2 = weight_variable([1024, 10], wd)
b_fc2 = bias_variable([10])
y_conv = tf.matmul(h_fc1_drop, W_fc2) + b_fc2
return y_conv, keep_prob, wd
def conv2d(x, W):
"""conv2d returns a 2d convolution layer with full stride."""
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2X2(x):
"""max_pool_2x2 downsamples a feature map by 2X."""
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
def weight_variable(shape, wd = None):
"""weight_variable generates a weight variable of a given shape."""
initial = tf.truncated_normal(shape, stddev=0.1)
# weight decay
if wd is not None:
weight_decay = tf.multiply(tf.nn.l2_loss(initial), wd, name = 'weight_loss')
tf.add_to_collection('losses', weight_decay)
return tf.Variable(initial)
def bias_variable(shape):
"""bias_variable generates a bias variable of a given shape."""
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
def main(_):
global Flags2
if Flags2 is None:
Flags2 = {}
if 'keep_prob' not in Flags2:
Flags2 = {}
Flags2['dp'] = 1.0
Flags2['wd'] = 0.0
print(Flags2)
# import data
mnist = input_data.read_data_sets('/tmp/tensorflow/mnist/input_data', one_hot=True)
# create the model
x = tf.placeholder(tf.float32, [None, 784])
y_ = tf.placeholder(tf.float32, [None, 10])
# build the graph for the deep net
y_conv, keep_prob, wd = deepnn(x)
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_, logits=y_conv))
# adding weight decay
tf.add_to_collection('losses', cross_entropy)
total_loss = tf.add_n(tf.get_collection('losses'), name='total_loss')
train_step = tf.train.AdamOptimizer(1e-4).minimize(total_loss)
correct_prediction = tf.equal(tf.argmax(y_conv, 1), tf.argmax(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(1000):
batch =mnist.train.next_batch(200)
if i % 100 == 0:
train_accuracy = accuracy.eval(feed_dict={
x: batch[0], y_:batch[1], keep_prob: Flags2['dp'], wd: Flags2['wd']})
print('step %d, training accuracy %g' %(i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: Flags2['dp'], wd: Flags2['wd']})
test_accuracy = accuracy.eval(feed_dict={x:mnist.test.images, y_:mnist.test.labels, keep_prob:1.0, wd: Flags2['wd']})
print('test accuracy %g' % test_accuracy)
return test_accuracy
if __name__ == "__main__":
parser = argparse.ArgumentParser()
parser.add_argument('--data_dir', type=str,
default='/tmp/tensorflow/mnist/input_data',
help='directory for storing input data')
FLAGS, unparsed = parser.parse_known_args()
tf.app.run(main=main, argv=[sys.argv[0]] + unparsed)
Hyperopt is used to tune the hyper-parameters (weight decay factor and dropout probability).
from hyperopt import fmin, tpe, hp, Trials
import pickle
import traceback
from my_mnist_convnet import build_and_optimize
space = {
'dropout_global': hp.uniform('conv_dropout_prob', 0.4, 0.6),
'wd': hp.uniform('wd', 0.0, 0.01)
}
def run_a_trail():
"""Run one TPE meta optimisation step and save its results."""
max_evals = nb_evals = 3
print("Attempt to resume a past training if it exists:")
try:
trials = pickle.load(open("results.pkl", "rb"))
print("Found saved Trials! Loading...")
max_evals = len(trials.trials) + nb_evals
print("Rerunning from {} trials to add another one.".format(
len(trials.trials)))
except:
trials = Trials()
print("Starting from scratch: new trials.")
best = fmin(
build_and_optimize,
space,
algo=tpe.suggest,
trials=trials,
max_evals=max_evals
)
pickle.dump(trials, open("results.pkl", "wb"))
print(best)
return
def plot_base_and_best_models():
return
if __name__ == "__main__":
"""plot the model and run the optimisation forever (and save results)."""
run_a_trail()
When the hyperopt code is used, it runs fine for a single TPE trial; however, if the number of trials is increased, it reports the following error.
self._traceback = _extract_stack()
InvalidArgumentError (see above for traceback): Shape [-1,784] has negative dimensions
[[Node: Placeholder = Placeholder[dtype=DT_FLOAT, shape=[?,784], _device="/job:localhost/replica:0/task:0/gpu:0"]()]]
This problem most likely arises because each call to build_and_optimize() adds nodes to the same TensorFlow graph, so the tf.train.AdamOptimizer ends up trying to optimize the variables created in all of the previous trials in addition to the current one. To work around this, modify build_and_optimize() so that it runs main() in a separate TensorFlow graph, using the following change:
def build_and_optimize(hp_space):
global Flags2
Flags2 = {}
Flags2['dp'] = hp_space['dropout_global']
Flags2['wd'] = hp_space['wd']
# Create a new, empty graph for each trial to avoid interference from
# previous trials.
with tf.Graph().as_default():
res = main(Flags2)
results = {
'loss': res,
'status': STATUS_OK
}
return results
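An alternative, if you prefer not to nest main() under a with block, is to clear the default graph at the start of each trial (a sketch, assuming no other code holds on to tensors from earlier trials):
def build_and_optimize(hp_space):
    global Flags2
    Flags2 = {'dp': hp_space['dropout_global'], 'wd': hp_space['wd']}
    tf.reset_default_graph()  # drop nodes and variables left over from earlier trials
    res = main(Flags2)
    return {'loss': res, 'status': STATUS_OK}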
I have made a convolutional neural network model using TensorFlow to recognize handwriting, referring to the TensorFlow tutorials [1]. This model uses convolutional filter1: [5,5,1,16], filter2: [5,5,16,32], fully connected layers [7*7*32,1024] and [1024,10], and then uses softmax to convert the output to probabilities. I ran this model and it failed: the loss never decreased and all of the outputs were [0,0,1,0,0,0,0,0,0,0].
Then I reduced the number of filters and neurons, and it succeeded; the accuracy reached about 97%.
Why can't I train successfully when I build the model with the same (larger) number of filters and neurons?
Here is my failed model. (I used "mnist.csv".)
x = tf.placeholder(tf.float32,[None,28*28])
t = tf.placeholder(tf.float32,[None,10])
def weight(shape):
init = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(init)
def bias(shape):
init = tf.constant(0.1, shape=shape)
return tf.Variable(init)
def conv2d(x,W):
return tf.nn.conv2d(x,W,strides=[1,1,1,1],padding="SAME")
def max_pool_22(x):
return tf.nn.max_pool(x,ksize=[1,2,2,1],strides=[1,2,2,1],padding="SAME")
W_conv1 = weight([5,5,1,16])
b_conv1 = bias([16])
x_image = tf.reshape(x,[-1,28,28,1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_22(h_conv1)
print(h_pool1.shape)
W_conv2 = weight([5,5,16,64])
b_conv2 = bias([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1,W_conv2) + b_conv2)
h_pool2 = max_pool_22(h_conv2)
W_fc1 = weight([7*7*64,1024])
b_fc1 = bias([1024])
h_pool2_flat = tf.reshape(h_pool2,[-1,7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat,W_fc1) + b_fc1)
W_fc2 = weight([1024,10])
b_fc2 = bias([10])
prediction = tf.nn.softmax(tf.matmul(h_fc1,W_fc2) + b_fc2)
cross_entropy=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=t,logits=prediction))
train_step = tf.train.AdamOptimizer().minimize(cross_entropy)
correct_prediction =tf.equal(tf.argmax(prediction,1),tf.argmax(t,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction,tf.float32))
sess = tf.InteractiveSession()
sess.run(tf.global_variables_initializer())
for epoch in range(20):
avg_loss = 0.
avg_accuracy = 0.
for i in range(1000):
ind = np.random.choice(len(x_train),50)
x_train_batch = x_train[ind]
t_train_batch = t_train[ind]
_, loss, a = sess.run([train_step,cross_entropy, accuracy],feed_dict={x:x_train_batch,t:t_train_batch})
avg_loss += loss/1000
avg_accuracy += a/1000
if epoch % 1 == 0:
print("Step:{0} Loss:{1} TrainAccuracy:{2}".format(epoch,avg_loss,avg_accuracy))
print("test_accuracy:{0}".format(accuracy.eval(feed_dict={x:x_test,t:t_test})))
[1]: https://www.tensorflow.org/get_started/mnist/pros
You are calling softmax_cross_entropy_with_logits on the output of softmax. This applies softmax twice, leading to wrong results. softmax_cross_entropy_with_logits should be called on the linear output of the last layer, before applying softmax:
y = tf.matmul(h_fc1,W_fc2) + b_fc2
cross_entropy=tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=t, logits=y))
prediction_probabilities = tf.nn.softmax(y)
prediction_class = tf.argmax(y, 1)
The prediction_probabilities tensor above is only needed if you need the probabilities of each class. Otherwise, you can call argmax on y directly to get the predicted class.
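As a side note (not part of the fix above), if the labels were plain integer class ids instead of one-hot vectors, the sparse variant could be used directly; t_int here is a hypothetical placeholder, not from the question's code:
t_int = tf.placeholder(tf.int64, [None])   # hypothetical integer labels, shape [batch]
cross_entropy = tf.reduce_mean(
    tf.nn.sparse_softmax_cross_entropy_with_logits(labels=t_int, logits=y))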
I am currently working on creating visualizations for a maximal input image given the kernel/filters generated by a Convolutional Neural Network.
Keras had a blog post here that does something similar, but the results were questionable at best when using anything but the supplied dataset, so I thought I might give it a try with TensorFlow directly. [I will try to edit my post later with the images from it; they are not available on this computer.]
Using the MNIST dataset along with the Tensorflow tutorial and Keras blog post as reference, I have generated the following code in attempts to create said visualizations. I am not sure if my methodology is correct, especially with how/when to normalize my results to visualize them.
import tensorflow as tf
import numpy as np
from tensorflow.examples.tutorials.mnist import input_data
import copy
from scipy.misc import imsave
#~~~~~~~~~~~~~~~~~~~~~~~~~ CNN ~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~
#Most of the CNN section directly from the tutorial
mnist = input_data.read_data_sets('MNIST_data', one_hot=True)
img_width = 28
img_height = 28
n = 3
remove_negatives = False
normalize = True
use = 'layer'
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return(tf.Variable(initial))
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return(tf.Variable(initial))
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
W_conv1 = weight_variable([5, 5, 1, 32])
b_conv1 = bias_variable([32])
x_image = tf.reshape(x, [-1,28,28,1])
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 64])
b_conv2 = bias_variable([64])
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([7 * 7 * 64, 1024])
b_fc1 = bias_variable([1024])
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
keep_prob = tf.placeholder(tf.float32)
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([1024, 10])
b_fc2 = bias_variable([10])
y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
cross_entropy = tf.reduce_mean(-tf.reduce_sum(y_ * tf.log(y_conv), reduction_indices=[1]))
train_step = tf.train.AdamOptimizer(1e-4).minimize(cross_entropy)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for i in range(5000):
batch = mnist.train.next_batch(50)
if i%100 == 0:
train_accuracy = accuracy.eval(feed_dict={x:batch[0], y_: batch[1], keep_prob: 1.0})
print("step %d, training accuracy %g"%(i, train_accuracy))
train_step.run(feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5})
layer = sess.run(W_conv1[:,:,:,:])
bias = sess.run(b_conv1)
layer2 = sess.run(W_conv2[:,:,:,:])
bias2 = sess.run(b_conv2)
#~~~~~~~~~~~~~~~ Begin Visualization Code ~~~~~~~~~~~~~~~~
kept_filters = []
layer_use = layer
bias_use = bias
k=1
#toggle between layer 1 and layer 2 based on variable defined at beginning
if use != 'layer':
k = np.shape(layer2[:,:,:,:])[2]
layer_use = layer2
bias_use = bias2
#loop through kernels/feature maps and maximize each one's input image
for fmap in range(len(layer[0,0,0,:])):
feat_map = fmap
#randomized white-noise input image that will be max'ed
noise_mat = weight_variable([1,28,28,k])
#load kernel as a constant
single_layer = tf.constant(layer_use[:,:,0:k,feat_map-1:feat_map] + bias_use[feat_map],dtype=tf.float32)
conv = conv2d(noise_mat,single_layer)
#Use mean of the image matrix as the "loss" - is this the proper way to do this?
loss = -tf.reduce_mean(conv)
train_step = tf.train.GradientDescentOptimizer(.5).minimize(loss,var_list=[noise_mat])
#the training/maximizing
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
updatelist = [np.sum(sess.run(noise_mat)[0,:,:,0])]
noise_mat_begin = sess.run(noise_mat[0,:,:,0])
conv_saved = sess.run(conv)
for __ in range(5000):
train_step.run()
if __%200 == 0:
updatelist = updatelist + [np.sum(sess.run(noise_mat)[0,:,:,0])]
noise_mat_end = sess.run(noise_mat)[0,:,:,0]
noise_mat_normed = copy.deepcopy(noise_mat_end)
#not sure the best way to normalize?
if remove_negatives:
noise_mat_normed[noise_mat_normed <= 0] = 0
if normalize:
std = np.std(noise_mat_normed)
mean = np.mean(noise_mat_normed)
def full_norm(val):
return((val - mean)/std)
vnew = np.vectorize(full_norm)
noise_mat_normed = vnew(noise_mat_normed)
else:
oldmax = np.max(noise_mat_normed)
oldmin = np.min(noise_mat_normed)
def new_range(val,OldMax,OldMin):
return((((val - OldMin) * 255) / (OldMax - OldMin)))
vnew = np.vectorize(new_range)
noise_mat_normed = vnew(noise_mat_normed,oldmax,oldmin)
#negative sums generally imply a lack of convergence due to my loss metric, so remove them
if np.sum(noise_mat_normed) > 0:
kept_filters += [noise_mat_normed]
#visualize results in a grid format, similar to the blog post
kept_filters = kept_filters[:n * n]
margin = 5
width = n * img_width + (n - 1) * margin
height = n * img_height + (n - 1) * margin
stitched_filters = np.zeros((width, height))
for i in range(n):
for j in range(n):
img = kept_filters[i * n + j]
stitched_filters[(img_width + margin) * i: (img_width + margin) * i + img_width,
(img_height + margin) * j: (img_height + margin) * j + img_height] = img
imsave('TF_vis_%dx%d.png' % (n, n), stitched_filters)
This produces results like so (from convolutional layer 1):
I'm not sure if this is at all correct, especially since layer 2 doesn't seem much different. Do my results and/or methodology seem reasonable? Has anyone else done this using the MNIST dataset? As an aside, validation accuracy was >95%.
EDIT: I must have been doing something wrong originally; I re-did and re-ran the code from the blog post, and now the results from my own TensorFlow code look about the same as the blog post method's output, so that's good. However, the main concerns still stand:
Why am I not getting more obvious or distinct outputs? I know they won't be as specific as the filters themselves, but these images don't seem to portray anything, unlike their blog post counterparts. Is there just not enough variation in the original dataset?
Shouldn't I be getting at least SOME things that aren't just glorified bordered images, like diagonals or curves?
Shouldn't the second layer look like a more complex iteration of the first?
I'm trying to implement an asynchronous parameter server, DistBelief-style, using TensorFlow. I found that minimize() is split into two functions, compute_gradients and apply_gradients, so my plan is to insert a network boundary between them. I have a question about how to evaluate all the gradients simultaneously and pull them out all at once. I understand that eval only evaluates the subgraph necessary, but it also only returns one tensor, not the whole chain of tensors required to compute that tensor.
How can I do this more efficiently? I took the Deep MNIST example as a starting point:
import tensorflow as tf
import download_mnist
def weight_variable(shape, name):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial, name=name)
def bias_variable(shape, name):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial, name=name)
def conv2d(x, W):
return tf.nn.conv2d(x, W, strides=[1, 1, 1, 1], padding='SAME')
def max_pool_2x2(x):
return tf.nn.max_pool(x, ksize=[1, 2, 2, 1],
strides=[1, 2, 2, 1], padding='SAME')
mnist = download_mnist.read_data_sets('MNIST_data', one_hot=True)
session = tf.InteractiveSession()
x = tf.placeholder("float", shape=[None, 784], name='x')
x_image = tf.reshape(x, [-1,28,28,1], name='reshape')
y_ = tf.placeholder("float", shape=[None, 10], name='y_')
W_conv1 = weight_variable([5, 5, 1, 32], 'W_conv1')
b_conv1 = bias_variable([32], 'b_conv1')
h_conv1 = tf.nn.relu(conv2d(x_image, W_conv1) + b_conv1)
h_pool1 = max_pool_2x2(h_conv1)
W_conv2 = weight_variable([5, 5, 32, 64], 'W_conv2')
b_conv2 = bias_variable([64], 'b_conv2')
h_conv2 = tf.nn.relu(conv2d(h_pool1, W_conv2) + b_conv2)
h_pool2 = max_pool_2x2(h_conv2)
W_fc1 = weight_variable([7 * 7 * 64, 1024], 'W_fc1')
b_fc1 = bias_variable([1024], 'b_fc1')
h_pool2_flat = tf.reshape(h_pool2, [-1, 7*7*64])
h_fc1 = tf.nn.relu(tf.matmul(h_pool2_flat, W_fc1) + b_fc1)
keep_prob = tf.placeholder("float", name='keep_prob')
h_fc1_drop = tf.nn.dropout(h_fc1, keep_prob)
W_fc2 = weight_variable([1024, 10], 'W_fc2')
b_fc2 = bias_variable([10], 'b_fc2')
y_conv=tf.nn.softmax(tf.matmul(h_fc1_drop, W_fc2) + b_fc2)
loss = -tf.reduce_sum(y_ * tf.log(y_conv))
optimizer = tf.train.AdamOptimizer(1e-4)
correct_prediction = tf.equal(tf.argmax(y_conv,1), tf.argmax(y_,1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
compute_gradients = optimizer.compute_gradients(loss)
session.run(tf.initialize_all_variables())
batch = mnist.train.next_batch(50)
feed_dict={x: batch[0], y_: batch[1], keep_prob: 0.5}
gradients = []
for grad_var in compute_gradients:
grad = grad_var[0].eval(feed_dict=feed_dict)
var = grad_var[1]
gradients.append((grad, var))
I think this last for loop is actually recalculating the last gradient several times, whereas the first gradient is computed only once? How can I grab all the gradients without recomputing them?
Here is a simple example. Understand it, then try it out on your specific task.
Initialize required symbols.
x = tf.Variable(0.5)
y = x*x
opt = tf.train.AdagradOptimizer(0.1)
grads = opt.compute_gradients(y)
grad_placeholder = [(tf.placeholder("float", shape=grad[1].get_shape()), grad[1]) for grad in grads]
apply_placeholder_op = opt.apply_gradients(grad_placeholder)
transform_grads = [(function1(grad[0]), grad[1]) for grad in grads]
apply_transform_op = opt.apply_gradients(transform_grads)
Initialize
sess = tf.Session()
sess.run(tf.initialize_all_variables())
Get all gradients
grad_vals = sess.run([grad[0] for grad in grads])
Apply gradients
feed_dict = {}
for i in xrange(len(grad_placeholder)):
feed_dict[grad_placeholder[i][0]] = function2(grad_vals[i])
sess.run(apply_placeholder_op, feed_dict=feed_dict)
sess.run(apply_transform_op)
Note: I haven't tested the code myself, but I believe it is valid apart from minor errors.
Note: function1 and function2 are arbitrary computations applied to the gradients, such as 2*x, x^e or e^x, and so on.
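Applied to the code in the question, the essential point is to fetch every gradient tensor in a single session.run call, so the graph is evaluated only once (a sketch reusing the question's compute_gradients, session, and feed_dict; not run against that exact script):
# one run of the graph computes every gradient at once
grad_tensors = [g for g, v in compute_gradients]
grad_vals = session.run(grad_tensors, feed_dict=feed_dict)
gradients = [(g_val, v) for g_val, (g, v) in zip(grad_vals, compute_gradients)]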
Refer: TensorFlow apply_gradients remotely
I coded up a very simple example with comments (inspired by the above answer) that is runnable, so you can see gradient descent in action:
import tensorflow as tf
#function to transform gradients
def T(g, decay=1.0):
#return decayed gradient
return decay*g
# x variable
x = tf.Variable(10.0,name='x')
# b placeholder (simulates the "data" part of the training)
b = tf.placeholder(tf.float32)
# make model (1/2)(x-b)^2
xx_b = 0.5*tf.pow(x-b,2)
y=xx_b
learning_rate = 1.0
opt = tf.train.GradientDescentOptimizer(learning_rate)
# gradient variable list = [ (gradient,variable) ]
gv = opt.compute_gradients(y,[x])
# transformed gradient variable list = [ (T(gradient),variable) ]
decay = 0.1 # decay the gradient for the sake of the example
tgv = [(T(g,decay=decay),v) for (g,v) in gv] #list [(grad,var)]
# apply transformed gradients (this case no transform)
apply_transform_op = opt.apply_gradients(tgv)
with tf.Session() as sess:
sess.run(tf.initialize_all_variables())
epochs = 10
for i in range(epochs):
b_val = 1.0 #fake data (in SGD it would be different on every epoch)
print '----'
x_before_update = x.eval()
print 'before update',x_before_update
# compute gradients
grad_vals = sess.run([g for (g,v) in gv], feed_dict={b: b_val})
print 'grad_vals: ',grad_vals
# applies the gradients
result = sess.run(apply_transform_op, feed_dict={b: b_val})
print 'value of x should be: ', x_before_update - T(grad_vals[0], decay=decay)
x_after_update = x.eval()
print 'after update', x_after_update
You can observe the change in the variable as it is trained, and also the value of the gradient. Note that the only reason T decays the gradient is that otherwise the quadratic reaches the global minimum in one step: with learning rate 1.0 and no decay, the update x - (x - b) lands exactly on b.
As an extra bonus, if you want to see it work with TensorBoard, here you go! :)
## run cmd to collect model: python quadratic_minimizer.py --logdir=/tmp/quaratic_temp
## show board on browser run cmd: tensorboard --logdir=/tmp/quaratic_temp
## browser: http://localhost:6006/
import tensorflow as tf
#function to transform gradients
def T(g, decay=1.0):
#return decayed gradient
return decay*g
# x variable
x = tf.Variable(10.0,name='x')
# b placeholder (simulates the "data" part of the training)
b = tf.placeholder(tf.float32)
# make model (1/2)(x-b)^2
xx_b = 0.5*tf.pow(x-b,2)
y=xx_b
learning_rate = 1.0
opt = tf.train.GradientDescentOptimizer(learning_rate)
# gradient variable list = [ (gradient,variable) ]
gv = opt.compute_gradients(y,[x])
# transformed gradient variable list = [ (T(gradient),variable) ]
decay = 0.9 # decay the gradient for the sake of the example
tgv = [ (T(g,decay=decay), v) for (g,v) in gv] #list [(grad,var)]
# apply transformed gradients (this case no transform)
apply_transform_op = opt.apply_gradients(tgv)
(dydx,_) = tgv[0]
x_scalar_summary = tf.scalar_summary("x", x)
grad_scalar_summary = tf.scalar_summary("dydx", dydx)
with tf.Session() as sess:
merged = tf.merge_all_summaries()
tensorboard_data_dump = '/tmp/quaratic_temp'
writer = tf.train.SummaryWriter(tensorboard_data_dump, sess.graph)
sess.run(tf.initialize_all_variables())
epochs = 14
for i in range(epochs):
b_val = 1.0 #fake data (in SGD it would be different on every epoch)
print '----'
x_before_update = x.eval()
print 'before update',x_before_update
# get gradients
#grad_list = [g for (g,v) in gv]
(summary_str_grad,grad_val) = sess.run([merged] + [dydx], feed_dict={b: b_val})
grad_vals = sess.run([g for (g,v) in gv], feed_dict={b: b_val})
print 'grad_vals: ',grad_vals
writer.add_summary(summary_str_grad, i)
# applies the gradients
[summary_str_apply_transform,_] = sess.run([merged,apply_transform_op], feed_dict={b: b_val})
writer.add_summary(summary_str_apply_transform, i)
print 'value of x after update should be: ', x_before_update - T(grad_vals[0], decay=decay)
x_after_update = x.eval()
print 'after update', x_after_update