tensorflow NaN loss during training CNN model image classification - python

I was following the CNN MNIST tutorial at https://www.tensorflow.org/tutorials/layers for my personal image classification task. My input image size is 224 * 224 * 3 instead of the tutorial's 28 * 28, and I have only 5 classes rather than 10. I read previous posts on this problem, and many people pointed out that either a too-large learning rate or the use of cross_entropy_loss could be the problem, but I am not sure if that is the case here.
When I started training, I immediately got this NaN loss error:
ERROR:tensorflow:Model diverged with loss = NaN.
Traceback (most recent call last):
  File "cnn_model.py", line 75, in <module>
    main(sys.argv[1], sys.argv[2])
  File "cnn_model.py", line 68, in main
    classifier.train(input_fn = train_input_fn, steps = 2000, hooks = [logging_hook])
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 241, in train
    loss = self._train_model(input_fn=input_fn, hooks=hooks)
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\estimator\estimator.py", line 612, in _train_model
    _, loss = mon_sess.run([estimator_spec.train_op, estimator_spec.loss])
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 505, in run
    run_metadata=run_metadata)
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 842, in run
    run_metadata=run_metadata)
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 798, in run
    return self._sess.run(*args, **kwargs)
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\monitored_session.py", line 960, in run
    run_metadata=run_metadata))
  File "C:\Users\sz\AppData\Local\Continuum\anaconda3\lib\site-packages\tensorflow\python\training\basic_session_run_hooks.py", line 477, in after_run
    raise NanLossDuringTrainingError
tensorflow.python.training.basic_session_run_hooks.NanLossDuringTrainingError: NaN loss during training.
And below is the model code:
import tensorflow as tf
from helper import load_data_and_label
import cv2
import sys
import math

def cnn_model_fn(features, labels, mode):
    # input layer
    input_layer = tf.reshape(features['x'], [-1, 224, 224, 3])
    # conv layer 1
    conv1 = tf.layers.conv2d(inputs = input_layer, filters = 32, kernel_size = [5,5], padding = 'same', activation = tf.nn.relu)
    # pooling layer 1
    pool1 = tf.layers.max_pooling2d(inputs = conv1, pool_size = [2,2], strides = 2)
    # conv2 and pool2 layers
    conv2 = tf.layers.conv2d(inputs = pool1, filters = 64, kernel_size = [5,5], padding = 'same', activation = tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(inputs = conv2, pool_size = [2,2], strides = 2)
    # conv3 and pool3 layers
    conv3 = tf.layers.conv2d(inputs = pool2, filters = 64, kernel_size = [5,5], padding = 'same', activation = tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(inputs = conv3, pool_size = [2,2], strides = 2)
    # conv4 and pool4 layers
    conv4 = tf.layers.conv2d(inputs = pool3, filters = 64, kernel_size = [5,5], padding = 'same', activation = tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(inputs = conv4, pool_size = [2,2], strides = 2)
    # conv5 and pool5 layers
    conv5 = tf.layers.conv2d(inputs = pool4, filters = 64, kernel_size = [5,5], padding = 'same', activation = tf.nn.relu)
    pool5 = tf.layers.max_pooling2d(inputs = conv5, pool_size = [2,2], strides = 2)
    # dense layer (224 halved by five pools gives 7, so pool5 is 7 x 7 x 64)
    pool5_flat = tf.reshape(pool5, [-1, 7 * 7 * 64])
    dense = tf.layers.dense(inputs = pool5_flat, units = 1024, activation = tf.nn.relu)
    dropout = tf.layers.dropout(inputs = dense, rate = 0.5, training = mode == tf.estimator.ModeKeys.TRAIN)
    # logits layer
    logits = tf.layers.dense(inputs = dropout, units = 5)
    predictions = {"classes": tf.argmax(input = logits, axis = 1),
                   "prob": tf.nn.softmax(logits, name = 'softmax_tensor')}
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode = mode, predictions = predictions)
    # calculate loss
    onehot_labels = tf.one_hot(indices = tf.cast(labels, tf.int32), depth = 5)
    loss = tf.losses.softmax_cross_entropy(onehot_labels = onehot_labels, logits = logits)
    # configure training operation
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001)
        train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op)
    # evaluation metrics (note: the EstimatorSpec keyword is eval_metric_ops, not eval_metrics_ops)
    eval_metric_ops = {"accuracy": tf.metrics.accuracy(labels = labels, predictions = predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops = eval_metric_ops)

def main(imagepath, labelpath):
    train_data, train_labels, eval_data, eval_labels = load_data_and_label(imagepath, labelpath)
    classifier = tf.estimator.Estimator(model_fn = cnn_model_fn, model_dir = "/tmp/retina_convnet_model")
    tensors_to_log = {"prob": "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors = tensors_to_log, every_n_iter = 50)
    # train the model
    train_input_fn = tf.estimator.inputs.numpy_input_fn(x = {"x": train_data}, y = train_labels,
                                                        batch_size = 32, num_epochs = None, shuffle = True)
    classifier.train(input_fn = train_input_fn, steps = 2000, hooks = [logging_hook])
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(x = {"x": eval_data}, y = eval_labels, num_epochs = 1, shuffle = False)
    eval_results = classifier.evaluate(input_fn = eval_input_fn)
    print(eval_results)

if __name__ == "__main__":
    main(sys.argv[1], sys.argv[2])
Thank you so much! Any help would be really appreciated!

Did you do any preprocessing on the images? If not, then maybe try to standardize the images in your helper function and see if that helps.
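For example, a minimal sketch of what standardization could look like before the arrays are handed to numpy_input_fn (the standardize helper below is hypothetical, since the load_data_and_label code isn't shown):

import numpy as np

def standardize(images):
    # Scale raw pixel values from [0, 255] to [0, 1] ...
    images = images.astype('float32') / 255.0
    # ... then center and scale per channel (the epsilon guards against division by zero).
    mean = images.mean(axis=(0, 1, 2), keepdims=True)
    std = images.std(axis=(0, 1, 2), keepdims=True)
    return (images - mean) / (std + 1e-7)

# Hypothetical usage in main(), before building the input functions:
# train_data = standardize(train_data)
# eval_data = standardize(eval_data)

Unscaled 0-255 inputs pushed through five ReLU conv blocks can produce very large logits, and that is one common way softmax cross-entropy overflows to NaN.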

Related

Error: Input to reshape is a tensor with 898880 values, but the requested shape requires a multiple of 33640

import tensorflow as tf
from tensorflow.keras.preprocessing.image import ImageDataGenerator

# Set the seed
tf.random.set_seed(42)

# Preprocess data (get all of the pixel values between 0 & 1, also called scaling / normalization)
train_datagen = ImageDataGenerator(rescale = 1./255)
valid_datagen = ImageDataGenerator(rescale = 1./255)

# Setup path to our data directories
train_dir = '/content/pizza_steak/train'
test_dir = 'pizza_steak/test'

# Import data from directories and turn it into batches
train_data = train_datagen.flow_from_directory(directory = train_dir,
                                               batch_size = 32,
                                               target_size = (224, 224),
                                               class_mode = 'binary',
                                               seed = 42)
valid_data = valid_datagen.flow_from_directory(directory = test_dir,
                                               batch_size = 32,
                                               target_size = (224, 224),
                                               class_mode = 'binary',
                                               seed = 42)

# Build a CNN model (same as the Tiny VGG on the CNN Explainer website)
model_1 = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters = 10,
                           kernel_size = 3,
                           activation = 'relu',
                           input_shape = (244, 244, 3)),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.MaxPool2D(pool_size = 2,
                              padding = 'valid'),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.MaxPool2D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(1, activation = 'sigmoid')
])

# Compile our CNN
model_1.compile(
    loss = 'binary_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

# Fit the model
history_1 = model_1.fit(train_data,
                        epochs = 5,
                        steps_per_epoch = len(train_data),
                        validation_data = valid_data,
                        validation_steps = len(valid_data))
You either need to change your input_shape to (224, 224, 3) or the target_size to (244, 244); the model will only accept images that match its declared input_shape. Here is a working example:
import tensorflow as tf
import matplotlib.pyplot as plt

BATCH_SIZE = 32

flowers = tf.keras.utils.get_file(
    'flower_photos',
    'https://storage.googleapis.com/download.tensorflow.org/example_images/flower_photos.tgz',
    untar=True)

train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(rescale=1./255)
train_data = train_datagen.flow_from_directory(directory = flowers,
                                               batch_size = 32,
                                               target_size = (224, 224),
                                               class_mode = 'sparse',
                                               seed = 42)

model_1 = tf.keras.models.Sequential([
    tf.keras.layers.Conv2D(filters = 10,
                           kernel_size = 3,
                           activation = 'relu',
                           input_shape = (224, 224, 3)),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.MaxPool2D(pool_size = 2,
                              padding = 'valid'),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.Conv2D(10, 3, activation = 'relu'),
    tf.keras.layers.MaxPool2D(2),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(5, activation = 'softmax')
])

model_1.compile(
    loss = 'sparse_categorical_crossentropy',
    optimizer = tf.keras.optimizers.Adam(),
    metrics = ['accuracy']
)

history_1 = model_1.fit(train_data,
                        steps_per_epoch = len(train_data),
                        epochs = 5)
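For reference, tracing shapes through the model shows where the two numbers in the error come from. With the declared input_shape of (244, 244, 3), the feature map shrinks through the valid-padding convs and pools as 244 -> 242 -> 240 -> 120 -> 118 -> 116 -> 58, so Flatten expects 58 * 58 * 10 = 33640 values per image. The images actually arriving are 224 x 224, giving 224 -> 222 -> 220 -> 110 -> 108 -> 106 -> 53, i.e. 53 * 53 * 10 = 28090 values per image; a batch of 32 then yields 28090 * 32 = 898880, which is not a multiple of 33640, hence the reshape error.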

How to load a tf.keras model which is trained with pure tensorflow

I have trained a model (built with tf.keras) with pure tensorflow, and I saved it using model.save('model_150.h5') (as it is a Keras model).
Here is my model:
conv_1_1 = Conv2D(filters = 64, kernel_size = 3, activation='relu', padding='same')(input_img)
conv_1_1_bn = BatchNormalization()(conv_1_1)
conv_1_1_do = Dropout(droprate)(conv_1_1_bn)
pool_1 = MaxPooling2D(pool_size= 2, strides = 2)(conv_1_1_do)
conv_4_1 = SeparableConv2D(filters = 512, kernel_size = 3, activation='relu', padding='same')(pool_1)
conv_4_1_bn = BatchNormalization()(conv_4_1)
conv_4_1_do = Dropout(droprate)(conv_4_1_bn)
pool_4 = MaxPooling2D(pool_size= 2, strides = 2)(conv_4_1_do)
conv_5_1 = SeparableConv2D(filters = 1024, kernel_size = 3, activation='relu', padding='same')(pool_4)
conv_5_1_bn = BatchNormalization()(conv_5_1)
conv_5_1_do = Dropout(droprate)(conv_5_1_bn)
upconv_1 = upconv_concat(conv_5_1_do, conv_4_1_do, n_filter=512, pool_size=2, stride=2)
conv_6_1 = SeparableConv2D(filters = 512, kernel_size = 3, activation='relu', padding='same')(upconv_1)
conv_6_1_bn = BatchNormalization()(conv_6_1)
conv_6_1_do = Dropout(droprate)(conv_6_1_bn)
upconv_2 = upconv_concat(conv_6_1_do, conv_1_1_do, n_filter=64, pool_size=2, stride=2)
conv_9_1 = SeparableConv2D(filters = 64, kernel_size = 3, activation='relu', padding='same')(upconv_2)
conv_9_1_bn = BatchNormalization()(conv_9_1)
conv_9_1_do = Dropout(droprate)(conv_9_1_bn)
ae_output = Conv2D(num_classes, kernel_size=1, strides = (1,1), activation="softmax")(conv_9_1_do)
I initially defined the model like this:
e_model = Model(input_img, ae_output)
Now I needed some custom training. So I trained the model with pure tensorflow like this:
Here is my loss function
def cut_loss(original_image):
    ypred = e_model(original_image)
    ...
    ...
    # do some computations and calculate some custom loss
    ...
    return loss
Here is my optimizer
#optimizer
opt = tf.train.AdamOptimizer(learning_rate=e_lr).minimize(cut_loss(original_image))
Here is my training loop
with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    for epoch in range(num_epochs):
        print("epoch:", epoch)
        count = 0
        batch_start_index = 0
        while (count != num_batches):
            X_train_batch = X_train[batch_start_index : batch_start_index + batch_size] # send a batch of input images of size (batchsize, 224, 224, 1)
            _, train_loss = sess.run([opt, loss], feed_dict={original_image: X_train_batch})
            batch_start_index += batch_size
            count += 1
        print("Train loss after ", str(epoch), "is", str(train_loss))
After the training, I saved the model and restarted my Jupyter kernel. When I tried loading the model from the h5 file like this,
from tensorflow.keras.models import load_model
e_model = load_model('model_150.h5')
I'm getting the following warning:
UserWarning: No training configuration found in save file: the model was *not* compiled. Compile it manually
How can I get rid of this warning?
Is it just bad practice to train the model with pure tensorflow and then save it using the model.save() function from tf.keras?
Please let me know if any additional details are required. Thanks!
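One thing worth noting: the message quoted above is a UserWarning, not a hard error; the architecture and weights do load. Since training was done with a raw tf.train.AdamOptimizer outside Keras, no Keras training configuration (optimizer, loss, metrics) was ever attached to the model, which is exactly what the warning reports. A minimal sketch of working around it (the optimizer and loss below are placeholders, not the original training setup):

from tensorflow.keras.models import load_model

# compile=False skips restoring the (missing) training configuration,
# which suppresses the warning; architecture and weights still load.
e_model = load_model('model_150.h5', compile=False)

# Only needed if you want to train or evaluate with Keras APIs afterwards:
e_model.compile(optimizer='adam', loss='categorical_crossentropy')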

Loss value becomes constant after some training steps while training a CNN in Tensorflow

I'm trying to develop a convolutional neural network for image classification.
Currently I am working on classifying a set of about 1000 cat and dog images.
However, I'm stuck in the training process.
First, I tried to develop my own network, preprocessing and labeling the images myself and testing different architectures and hyperparameters using TensorFlow.
As I didn't obtain good results, I tried to create a similar network with Keras, obtaining better results.
In the following code I create the training and validation sets for the TensorFlow network:
def oneHot(img):
    label = img.split('.')[-3]
    if label == 'cat': return [1, 0]
    elif label == 'dog': return [0, 1]

def loadData(img_dir):
    global img_h
    global img_w
    data_set = []
    for img in tqdm(os.listdir(img_dir)):
        label = oneHot(img)
        path = os.path.join(img_dir, img)
        img = cv2.imread(path)
        img = cv2.resize(img, (img_h, img_w))
        data_set.append([np.array(img/255, dtype='float32'), np.array(label)])
    shuffle(data_set)
    return data_set

def divideSet(data_set, train_size):
    len_train = int(len(data_set)*train_size)
    train_set = data_set[:len_train]
    valid_set = data_set[len_train:]
    return train_set, valid_set

def separateArgLabel(data_set):
    arg = np.array([i[0] for i in data_set])
    label = np.array([i[1] for i in data_set])
    return arg, label

train_set = loadData(train_dir)
train_data, valid_data = divideSet(train_set, 0.8)
x_train, y_train = separateArgLabel(train_data)
x_valid, y_valid = separateArgLabel(valid_data)
And the code that I used to build and train my model in tensorflow:
def flattenLayer(x):
    layer_shape = x.get_shape()
    n_input = layer_shape[1:4].num_elements()
    flat_layer = tf.reshape(x, [-1, n_input])
    return flat_layer

def getRandomBatch(x, y, size):
    rnd_idx = np.random.choice(len(x), size)
    x_batch = x[rnd_idx]
    y_batch = y[rnd_idx]
    return x_batch, y_batch

with tf.Session() as sess:
    x = tf.placeholder(tf.float32, shape=[None, img_w, img_h, img_c])
    y = tf.placeholder(tf.float32, shape=[None, 2])

    conv1 = tf.layers.conv2d(x, 32, [5,5], strides=1, padding='same',
                             activation=tf.nn.relu)
    pool1 = tf.layers.max_pooling2d(conv1, pool_size=[2,2], strides=2)
    conv2 = tf.layers.conv2d(pool1, 64, [5,5], strides=1, padding='same',
                             activation=tf.nn.relu)
    pool2 = tf.layers.max_pooling2d(conv2, pool_size=[2,2], strides=2)
    conv3 = tf.layers.conv2d(pool2, 128, [5,5], strides=1, padding='same',
                             activation=tf.nn.relu)
    pool3 = tf.layers.max_pooling2d(conv3, pool_size=[2,2], strides=2)
    conv4 = tf.layers.conv2d(pool3, 64, [5,5], strides=1, padding='same',
                             activation=tf.nn.relu)
    pool4 = tf.layers.max_pooling2d(conv4, pool_size=[2,2], strides=2)
    conv5 = tf.layers.conv2d(pool4, 32, [5,5], strides=1, padding='same',
                             activation=tf.nn.relu)
    pool5 = tf.layers.max_pooling2d(conv5, pool_size=[2,2], strides=2)

    flatten = flattenLayer(pool5)
    fc1 = tf.layers.dense(flatten, 1024, activation=tf.nn.relu)
    logits = tf.layers.dense(fc1, 2, activation=tf.nn.relu)
    y_pred = tf.nn.softmax(logits)

    cross_entropy = losses.categorical_crossentropy(y, y_pred)
    loss = tf.reduce_mean(cross_entropy)
    optimizer = tf.train.AdamOptimizer(0.0005)
    grads = optimizer.compute_gradients(loss)
    train = optimizer.apply_gradients(grads)

    y_cls = tf.arg_max(y, 1)
    y_pred_cls = tf.arg_max(y_pred, 1)
    correct = tf.equal(y_pred_cls, y_cls)
    accuracy = tf.reduce_mean(tf.cast(correct, tf.float32))

    init = tf.global_variables_initializer()
    sess.run(init)

    for epoch in range(10):
        sum_loss_train = 0
        sum_acc_train = 0
        for i in range(100):
            batch_x, batch_y = getRandomBatch(x_train, y_train, 32)
            feed_dict_train = {x: batch_x, y: batch_y}
            _, loss_train, acc_train = sess.run([train, loss, accuracy],
                                                feed_dict=feed_dict_train)
            sum_loss_train += loss_train
            sum_acc_train += acc_train
            sys.stdout.write('\r' + str(i+1) + '/' + str(100) + '\t' + 'loss: ' +
                             str(sum_loss_train/(i+1)) + ' accuracy: ' + str(acc_train))
            sys.stdout.flush()
        mean_loss_train = sum_loss_train/(i+1)
        mean_acc_train = sum_acc_train/(i+1)
        print("\nEpoch: " + str(epoch+1) + " ===========> Epoch loss: " + "{:.4f}".format(mean_loss_train))
        print("\tEpoch accuracy: " + "{:.2f} %".format(mean_acc_train*100))

        sum_loss_val = 0
        sum_acc_val = 0
        for j in range(50):
            batch_x_val, batch_y_val = getRandomBatch(x_valid, y_valid, 32)
            feed_dict_valid = {x: batch_x_val, y: batch_y_val}
            loss_val, acc_val = sess.run([loss, accuracy],
                                         feed_dict=feed_dict_valid)
            sum_acc_val += acc_val
            sum_loss_val += loss_val
        mean_acc_val = sum_acc_val/(j+1)
        mean_loss_val = sum_loss_val/(j+1)
        print("\nValidation loss: " + "{:.4f}".format(mean_loss_val))
        print("\tValidation accuracy: " + "{:.2f} %".format(mean_acc_val*100))
When I run the model, after some iterations the gradients always become zero and the loss gets stuck at a constant value.
At first I thought the network had stopped learning for lack of images, but when I trained a network built in Keras on the same dataset, the results were pretty good.
I used the same number of layers and the same hyperparameters, and I processed the images the same way in both cases. Although the weight initialization may differ, the results make me think there is some error in the code above.
Could someone help me with this issue?
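One detail that stands out in the code above, offered as an observation rather than a confirmed diagnosis: the logits layer uses a relu activation, and the loss is then computed from an explicit softmax. Relu clamps all negative pre-activations to zero, and once the logits for a batch are all clamped the gradient through that layer vanishes, which matches the "gradients become zero" symptom; Keras Dense layers default to no activation, which may explain why the Keras version trains fine. A sketch of the more conventional setup, using the fused loss on raw logits (y here is the one-hot placeholder from the code above):

logits = tf.layers.dense(fc1, 2)  # no activation on the logits layer
loss = tf.losses.softmax_cross_entropy(onehot_labels=y, logits=logits)
y_pred = tf.nn.softmax(logits)  # probabilities kept only for predictions/accuracy

The fused loss is also numerically safer than applying categorical_crossentropy to already-softmaxed values.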

How do I correctly reshape my data in TensorFlow?

I have been trying to make the CNN from the TensorFlow CNN tutorial work with 3D data, but I always get a dimension error. I apologize in advance, because this is my first time asking a question on Stack Overflow, so this might look a little messy. Here is the code:
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function

import numpy as np
import tensorflow as tf

IMG_SIZE_PX = 50
SLICE_COUNT = 20
n_classes = 2
keep_rate = 0.8

tf.logging.set_verbosity(tf.logging.INFO)

def neural_network(features, labels, mode):
    input_layer = tf.reshape(features["x"], shape = [-1, IMG_SIZE_PX, IMG_SIZE_PX, SLICE_COUNT, 1])
    conv1 = tf.layers.conv3d(inputs = input_layer, filters = 32, kernel_size = [1,2,2,2,1], strides = [1,2,2,2,1],
                             padding = "same", activation = tf.nn.relu)
    pool1 = tf.layers.max_pooling3d(inputs = conv1, pool_size = [1,2,2,2,1], strides = [1,2,2,2,1], padding = "same")
    conv2 = tf.layers.conv3d(inputs = pool1, filters = 64, kernel_size = [1,2,2,2,1], strides = [1,2,2,2,1],
                             padding = "same", activation = tf.nn.relu)
    pool2 = tf.layers.max_pooling3d(inputs = conv2, pool_size = [1,2,2,2,1], strides = [1,2,2,2,1], padding = "same")
    pool2_flat = tf.reshape(pool2, [-1, 50000])
    dense = tf.layers.dense(inputs = pool2_flat, units = 1024, activation = tf.nn.relu)
    dropout = tf.layers.dropout(inputs = dense, rate = 0.4, training = mode == tf.estimator.ModeKeys.TRAIN)
    logits = tf.layers.dense(inputs = dropout, units = 2)
    predictions = {
        "classes" : tf.argmax(input = logits, axis = 1),
        "probabilities" : tf.nn.softmax(logits, name = "softmax_tensor")
    }
    if mode == tf.estimator.ModeKeys.PREDICT:
        return tf.estimator.EstimatorSpec(mode = mode, predictions = predictions)
    onehot_labels = tf.one_hot(indices = tf.cast(labels, tf.int32), depth = 2)
    loss = tf.losses.softmax_cross_entropy(onehot_labels = onehot_labels, logits = logits)
    if mode == tf.estimator.ModeKeys.TRAIN:
        optimizer = tf.train.GradientDescentOptimizer(learning_rate = 0.001)
        train_op = optimizer.minimize(loss = loss, global_step = tf.train.get_global_step())
        return tf.estimator.EstimatorSpec(mode = mode, loss = loss, train_op = train_op)
    eval_metric_ops = {"accuracy" : tf.metrics.accuracy(labels = labels, predictions = predictions["classes"])}
    return tf.estimator.EstimatorSpec(mode = mode, loss = loss, eval_metric_ops = eval_metric_ops)

# main
def main(unused_argv):
    training_data = train_data[0]
    training_labels = train_data[1]
    eval_data = train_data[0]
    eval_labels = train_data[1]
    lung_classifier = tf.estimator.Estimator(model_fn = neural_network,
                                             model_dir = "SOME_DIR")
    tensors_to_log = {"probabilities" : "softmax_tensor"}
    logging_hook = tf.train.LoggingTensorHook(tensors = tensors_to_log, every_n_iter = 50)
    train_input_fn = tf.estimator.inputs.numpy_input_fn(
        x = {"x" : np.array(training_data)},
        y = np.array(training_labels),
        batch_size = 50,
        num_epochs = None,
        shuffle = True)
    lung_classifier.train(input_fn = train_input_fn, steps = 20000, hooks = [logging_hook])
    eval_input_fn = tf.estimator.inputs.numpy_input_fn(
        x = {"x" : eval_data},
        y = eval_labels,
        num_epochs = 1,
        shuffle = False)
    eval_results = lung_classifier.evaluate(input_fn = eval_input_fn)
    print(eval_results)

if __name__ == "__main__":
    with tf.device("/gpu:0"):
        tf.app.run()
I get this Error Message:
ValueError: Dimension size must be evenly divisible by 50000 but is 50 for
'Reshape' (op: 'Reshape') with input shapes: [50], [5] and with input
tensors computed as partial shapes: input[1] = [?,50,50,20,1].
The error message is pretty long, so I'll just show the line it originates from:
input_layer = tf.reshape(features["x"], shape = [-1, IMG_SIZE_PX, IMG_SIZE_PX, SLICE_COUNT, 1])
It seems that your features["x"] has dimensions 50x5, and you are trying to give it a shape of batch_size x 50 x 50 x 20 x 1, which is not possible.
Ensure that you are reading your data correctly by printing out its shape.
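A quick way to check, as a sketch (train_data below refers to the same object used in main(), whose construction isn't shown):

import numpy as np

training_data = np.array(train_data[0])
print(training_data.shape)
# numpy_input_fn feeds one row of this array per example, so the reshape
# to [-1, 50, 50, 20, 1] can only succeed if each example holds
# 50 * 50 * 20 = 50000 values, i.e. the expected shape is
# (num_samples, 50, 50, 20) or (num_samples, 50000).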

TensorFlow CNN on multiple GPUs

I am trying to parallelize my code to have my tensorflow model run on multiple GPUs. For some reason, the code I wrote to parallelize the training works for a standard deep neural net, but throws errors when using a convolutional neural net.
Here is my code to compute the average gradients:
def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Note that each grad_and_vars looks like the following:
        #   ((grad0_gpu0, var0_gpu0), ... , (grad0_gpuN, var0_gpuN))
        grads = []
        for g, _ in grad_and_vars:
            # Add 0 dimension to the gradients to represent the tower.
            expanded_g = tf.expand_dims(g, 0)
            # Append on a 'tower' dimension which we will average over below.
            grads.append(expanded_g)
        # Average over the 'tower' dimension.
        grad = tf.concat(axis=0, values=grads)
        grad = tf.reduce_mean(grad, 0)
        # Keep in mind that the Variables are redundant because they are shared
        # across towers. So .. we will just return the first tower's pointer to
        # the Variable.
        v = grad_and_vars[0][1]
        grad_and_var = (grad, v)
        average_grads.append(grad_and_var)
    return average_grads
This is my deep neural net architecture: (this works)
def neuralNet(data):
    hl_1 = {'weights': tf.get_variable('Weights1', [TF_SHAPE, n_nodes_hl1], initializer=tf.random_normal_initializer()),
            'biases': tf.get_variable('Biases1', [n_nodes_hl1], initializer=tf.random_normal_initializer())}
    hl_2 = {'weights': tf.get_variable('Weights2', [n_nodes_hl1, n_nodes_hl2], initializer=tf.random_normal_initializer()),
            'biases': tf.get_variable('Biases2', [n_nodes_hl2], initializer=tf.random_normal_initializer())}
    hl_3 = {'weights': tf.get_variable('Weights3', [n_nodes_hl2, n_nodes_hl3], initializer=tf.random_normal_initializer()),
            'biases': tf.get_variable('Biases3', [n_nodes_hl3], initializer=tf.random_normal_initializer())}
    hl_4 = {'weights': tf.get_variable('Weights4', [n_nodes_hl3, n_nodes_hl4], initializer=tf.random_normal_initializer()),
            'biases': tf.get_variable('Biases4', [n_nodes_hl4], initializer=tf.random_normal_initializer())}
    hl_5 = {'weights': tf.get_variable('Weights5', [n_nodes_hl4, n_nodes_hl5], initializer=tf.random_normal_initializer()),
            'biases': tf.get_variable('Biases5', [n_nodes_hl5], initializer=tf.random_normal_initializer())}
    output_layer = {'weights': tf.get_variable('Weights-outputlayer', [n_nodes_hl5, n_classes], initializer=tf.random_normal_initializer()),
                    'biases': tf.get_variable('Biases-outputlayer', [n_classes], initializer=tf.random_normal_initializer())}
    l1 = tf.add(tf.matmul(data, hl_1['weights']), hl_1['biases'])
    l1 = tf.nn.sigmoid(l1, name='op1')
    l2 = tf.add(tf.matmul(l1, hl_2['weights']), hl_2['biases'])
    l2 = tf.nn.sigmoid(l2, name='op2')
    l3 = tf.add(tf.matmul(l2, hl_3['weights']), hl_3['biases'])
    l3 = tf.nn.sigmoid(l3, name='op3')
    l4 = tf.add(tf.matmul(l3, hl_4['weights']), hl_4['biases'])
    l4 = tf.nn.sigmoid(l4, name='op4')
    l5 = tf.add(tf.matmul(l4, hl_5['weights']), hl_5['biases'])
    l5 = tf.nn.sigmoid(l5, name='op5')
    dropout = tf.nn.dropout(l5, keep_prob, name='op6')
    ol = tf.add(tf.matmul(dropout, output_layer['weights']), output_layer['biases'], name='op7')
    return ol
This is my convnet: (this does not work)
def conv2d(x, W):
    return tf.nn.conv2d(x, W, strides=[1,1,1,1], padding='SAME')

def maxpool2d(x):
    return tf.nn.max_pool(x, ksize=[1,2,2,1], strides=[1,2,2,1], padding="SAME")

def convNeuralNet(x):
    weights = {'w_conv1': tf.get_variable('w_conv1', [7,7,1,2], initializer=tf.random_normal_initializer()),
               'w_conv2': tf.get_variable('w_conv2', [7,7,2,4], initializer=tf.random_normal_initializer()),
               'w_conv3': tf.get_variable('w_conv3', [7,7,4,8], initializer=tf.random_normal_initializer()),
               'w_conv4': tf.get_variable('w_conv4', [7,7,8,16], initializer=tf.random_normal_initializer()),
               'w_conv5': tf.get_variable('w_conv5', [7,7,16,32], initializer=tf.random_normal_initializer()),
               'w_conv6': tf.get_variable('w_conv6', [7,7,32,64], initializer=tf.random_normal_initializer()),
               'w_conv7': tf.get_variable('w_conv7', [7,7,64,128], initializer=tf.random_normal_initializer()),
               'w_conv8': tf.get_variable('w_conv8', [7,7,128,256], initializer=tf.random_normal_initializer()),
               'w_conv9': tf.get_variable('w_conv9', [7,7,256,512], initializer=tf.random_normal_initializer()),
               'w_fc1': tf.get_variable('w_fc1', [512,1024], initializer=tf.random_normal_initializer()),
               'w_fc2': tf.get_variable('w_fc2', [1024,2048], initializer=tf.random_normal_initializer()),
               'w_fc3': tf.get_variable('w_fc3', [2048,4096], initializer=tf.random_normal_initializer()),
               'out': tf.get_variable('w_out', [4096,n_classes], initializer=tf.random_normal_initializer())}
    biases = {'b_conv1': tf.get_variable('b_conv1', [2], initializer=tf.random_normal_initializer()),
              'b_conv2': tf.get_variable('b_conv2', [4], initializer=tf.random_normal_initializer()),
              'b_conv3': tf.get_variable('b_conv3', [8], initializer=tf.random_normal_initializer()),
              'b_conv4': tf.get_variable('b_conv4', [16], initializer=tf.random_normal_initializer()),
              'b_conv5': tf.get_variable('b_conv5', [32], initializer=tf.random_normal_initializer()),
              'b_conv6': tf.get_variable('b_conv6', [64], initializer=tf.random_normal_initializer()),
              'b_conv7': tf.get_variable('b_conv7', [128], initializer=tf.random_normal_initializer()),
              'b_conv8': tf.get_variable('b_conv8', [256], initializer=tf.random_normal_initializer()),
              'b_conv9': tf.get_variable('b_conv9', [512], initializer=tf.random_normal_initializer()),
              'b_fc1': tf.get_variable('b_fc1', [1024], initializer=tf.random_normal_initializer()),
              'b_fc2': tf.get_variable('b_fc2', [2048], initializer=tf.random_normal_initializer()),
              'b_fc3': tf.get_variable('b_fc3', [4096], initializer=tf.random_normal_initializer()),
              'out': tf.get_variable('b_out', [n_classes], initializer=tf.random_normal_initializer())}
    x = tf.reshape(x, shape=[-1,7,len_puzzle,1])
    conv1 = conv2d(x, weights['w_conv1'])
    conv1 = maxpool2d(conv1)
    conv2 = conv2d(conv1, weights['w_conv2'])
    conv2 = maxpool2d(conv2)
    conv3 = conv2d(conv2, weights['w_conv3'])
    conv3 = maxpool2d(conv3)
    conv4 = conv2d(conv3, weights['w_conv4'])
    conv4 = maxpool2d(conv4)
    conv5 = conv2d(conv4, weights['w_conv5'])
    conv5 = maxpool2d(conv5)
    conv6 = conv2d(conv5, weights['w_conv6'])
    conv6 = maxpool2d(conv6)
    conv7 = conv2d(conv6, weights['w_conv7'])
    conv7 = maxpool2d(conv7)
    conv8 = conv2d(conv7, weights['w_conv8'])
    conv8 = maxpool2d(conv8)
    conv9 = conv2d(conv8, weights['w_conv9'])
    conv9 = maxpool2d(conv9)
    fc1 = tf.reshape(conv9, [-1,512])
    fc1 = tf.nn.sigmoid(tf.add(tf.matmul(fc1, weights['w_fc1']), biases['b_fc1']))
    fc2 = tf.nn.sigmoid(tf.add(tf.matmul(fc1, weights['w_fc2']), biases['b_fc2']))
    fc3 = tf.nn.sigmoid(tf.add(tf.matmul(fc2, weights['w_fc3']), biases['b_fc3']))
    last = tf.nn.dropout(fc3, keep_prob)
    output = tf.add(tf.matmul(last, weights['out']), biases['out'], name='op7')
    return output
This is the code which runs the session:
def train(x):
    tower_grads = []
    opt = tf.train.AdamOptimizer(learning_rate)
    for i in xrange(2):
        with tf.device('/gpu:%d' % i):
            with tf.variable_scope('NN', reuse=i>0):
                prediction = convNeuralNet(x)
                cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=prediction, labels=y))
                tf.summary.scalar('cross_entropy', cost)
                grads = opt.compute_gradients(cost)
                tower_grads.append(grads)
                print grads
                print len(grads)
                #scope.reuse_variables()
    grads = average_gradients(tower_grads)
    apply_gradient_op = opt.apply_gradients(grads)
    train_op = tf.group(apply_gradient_op)
    correct = tf.equal(tf.argmax(prediction,1), tf.argmax(y,1))
    accuracy = tf.reduce_mean(tf.cast(correct, 'float'))
    tf.summary.scalar('accuracy', accuracy)
    num_epochs = ne
    with tf.Session(config=tf.ConfigProto(allow_soft_placement=True, log_device_placement=True)) as sess:
        saver = tf.train.Saver()
        # UNCOMMENT THIS WHEN RESTARTING FROM A CHECKPOINT
        #saver.restore(sess, tf.train.latest_checkpoint(os.getcwd()+'/models/base/.'))
        sess.run(tf.global_variables_initializer())
        merged_summary = tf.summary.merge_all()
        for epoch in range(num_epochs):
            epoch_loss = 0
            for i in range(int(real_X_9.shape[0])/batch_size):  #mnist.train.num_examples/batch_size # X.shape[0]
                randidx = np.random.choice(real_X_9.shape[0], batch_size, replace=False)
                epoch_x, epoch_y = real_X_9[randidx,:], real_y_9[randidx,:]  #mnist.train.next_batch(batch_size) # X,y
                j, c = sess.run([train_op, cost], feed_dict={x:epoch_x, y:epoch_y, keep_prob:TRAIN_KEEP_PROB})
                if i == 0:
                    [ta] = sess.run([accuracy], feed_dict={x:epoch_x, y:epoch_y, keep_prob:TRAIN_KEEP_PROB})
                    print 'Train Accuracy', ta
                epoch_loss += c
            print '\n', 'Epoch', epoch + 1, 'completed out of', num_epochs, '\nLoss:', epoch_loss
            #saver.save(sess, os.getcwd()+'/models/base/baseDNN7')
            #saver.export_meta_graph(os.getcwd()+'/models/base/baseDNN7.meta')
        print '\n', 'Train Accuracy', accuracy.eval(feed_dict={x:real_X_9, y:real_y_9, keep_prob:TRAIN_KEEP_PROB})
        print '\n', 'Test Accuracy', accuracy.eval(feed_dict={x:test_real_X, y:test_real_y, keep_prob:1.0})  #X, y #mnist.test.images, mnist.test.labels

train(x)
This is the error:
Traceback (most recent call last):
  File "CNN_gpu.py", line 393, in <module>
    train(x)
  File "CNN_gpu.py", line 311, in train
    grads = average_gradients(tower_grads)
    expanded_g = tf.expand_dims(g, 0)
  File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/ops/array_ops.py", line 170, in expand_dims
    return gen_array_ops._expand_dims(input, axis, name)
  File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/ops/gen_array_ops.py", line 900, in _expand_dims
    result = _op_def_lib.apply_op("ExpandDims", input=input, dim=dim, name=name)
  File "/share/sw/free/tensorflow.1/1.1.0/lib/python2.7/site-packages/tensorflow/python/framework/op_def_library.py", line 509, in apply_op
    (input_name, err))
ValueError: Tried to convert 'input' to a tensor and failed. Error: None values not supported.
I'm really confused. Parallelization across multiple GPUs should work regardless of the type of neural net being used.
Any help here would be appreciated.
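The traceback points at tf.expand_dims(g, 0) inside average_gradients, which fails when opt.compute_gradients returns a None gradient for some variable; that happens for any variable that is not connected to that tower's loss. A defensive variant of the function, offered as a sketch rather than a confirmed fix for this setup:

def average_gradients(tower_grads):
    average_grads = []
    for grad_and_vars in zip(*tower_grads):
        # Skip towers that produced no gradient for this variable.
        grads = [tf.expand_dims(g, 0) for g, _ in grad_and_vars if g is not None]
        if not grads:
            continue  # variable received no gradient on any tower
        grad = tf.reduce_mean(tf.concat(axis=0, values=grads), 0)
        average_grads.append((grad, grad_and_vars[0][1]))
    return average_grads

If Nones do show up here, it is worth printing which variables they belong to; a common underlying cause is variables that end up outside the shared tf.variable_scope('NN', reuse=i>0) and are therefore only reachable from one tower's loss.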
