I have this problem that after one iteration nearly all my parameters (cost function, weights, hypothesis function, etc.) output 'NaN'. My code is similar to the tensorflow tutorial MNIST-Expert (https://www.tensorflow.org/versions/r0.9/tutorials/mnist/pros/index.html). I looked for solutions already and so far I tried: reducing the learning rate to nearly zero and setting it to zero, using AdamOptimizer instead of gradient descent, using sigmoid function for the hypothesis function in the last layer and using only numpy functions. I have some negative and zero values in my input data, so I can't use the logarithmic cross entropy instead of the quadratic cost function. The result is the same, butMy input data consist of stresses and strains of soils.
import tensorflow as tf
import Datafiles3_pv_complete as soil
import numpy as np
m_training = int(18.0)
m_cv = int(5.0)
m_test = int(5.0)
total_examples = 28
" range for running "
range_training = xrange(0,m_training)
range_cv = xrange(m_training,(m_training+m_cv))
range_test = xrange((m_training+m_cv),total_examples)
""" Using interactive Sessions"""
sess = tf.InteractiveSession()
""" creating input and output vectors """
x = tf.placeholder(tf.float32, shape=[None, 11])
y_true = tf.placeholder(tf.float32, shape=[None, 3])
""" Standard Deviation Calculation"""
stdev = np.divide(2.0,np.sqrt(np.prod(x.get_shape().as_list()[1:])))
""" Weights and Biases """
def weights(shape):
initial = tf.truncated_normal(shape, stddev=stdev)
return tf.Variable(initial)
def bias(shape):
initial = tf.truncated_normal(shape, stddev=1.0)
return tf.Variable(initial)
""" Creating weights and biases for all layers """
theta1 = weights([11,7])
bias1 = bias([1,7])
theta2 = weights([7,7])
bias2 = bias([1,7])
"Last layer"
theta3 = weights([7,3])
bias3 = bias([1,3])
""" Hidden layer input (Sum of weights, activation functions and bias)
z = theta^T * activation + bias
"""
def Z_Layer(activation,theta,bias):
return tf.add(tf.matmul(activation,theta),bias)
""" Creating the sigmoid function
sigmoid = 1 / (1 + exp(-z))
"""
def Sigmoid(z):
return tf.div(tf.constant(1.0),tf.add(tf.constant(1.0), tf.exp(tf.neg(z))))
""" hypothesis functions - predicted output """
' layer 1 - input layer '
hyp1 = x
' layer 2 '
z2 = Z_Layer(hyp1, theta1, bias1)
hyp2 = Sigmoid(z2)
' layer 3 '
z3 = Z_Layer(hyp2, theta2, bias2)
hyp3 = Sigmoid(z3)
' layer 4 - output layer '
zL = Z_Layer(hyp3, theta3, bias3)
hypL = tf.add( tf.add(tf.pow(zL,3), tf.pow(zL,2) ), zL)
""" Cost function """
cost_function = tf.mul( tf.div(0.5, m_training), tf.pow( tf.sub(hypL, y_true), 2))
#cross_entropy = -tf.reduce_sum(y_true*tf.log(hypL) + (1-y_true)*tf.log(1-hypL))
""" Gradient Descent """
train_step = tf.train.GradientDescentOptimizer(learning_rate=0.003).minimize(cost_function)
""" Training and Evaluation """
correct_prediction = tf.equal(tf.arg_max(hypL, 1), tf.arg_max(y_true, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
sess.run(tf.initialize_all_variables())
keep_prob = tf.placeholder(tf.float32)
""" Testing - Initialise lists """
hyp1_test = []
z2_test = []
hyp2_test = []
z3_test = []
hyp3_test = []
zL_test = []
hypL_test = []
cost_function_test =[]
complete_error_test = []
theta1_test = []
theta2_test = []
theta3_test = []
bias1_test = []
bias2_test = []
bias3_test = []
""" ------------------------- """
complete_error_init = tf.abs(tf.reduce_mean(tf.sub(hypL,y_true),1))
training_error=[]
for j in range_training:
feedj = {x: soil.input_scale[j], y_true: soil.output_scale[j] , keep_prob: 1.0}
""" ------------------------- """
'Testing - adding to list'
z2_init = z2.eval(feed_dict=feedj)
z2_test.append(z2_init)
hyp2_init = hyp2.eval(feed_dict=feedj)
hyp2_test.append(hyp2_init)
z3_init = z3.eval(feed_dict=feedj)
z3_test.append(z3_init)
hyp3_init = hyp3.eval(feed_dict=feedj)
hyp3_test.append(hyp3_init)
zL_init = zL.eval(feed_dict=feedj)
zL_test.append(zL_init)
hypL_init = hypL.eval(feed_dict=feedj)
hypL_test.append(hypL_init)
cost_function_init = cost_function.eval(feed_dict=feedj)
cost_function_test.append(cost_function_init)
complete_error = complete_error_init.eval(feed_dict=feedj)
complete_error_test.append(complete_error)
print 'number iterations: %g, error (S1, S2, S3): %g, %g, %g' % (j, complete_error[0], complete_error[1], complete_error[2])
theta1_init = theta1.eval()
theta1_test.append(theta1_init)
theta2_init = theta2.eval()
theta2_test.append(theta2_init)
theta3_init = theta3.eval()
theta3_test.append(theta3_init)
bias1_init = bias1.eval()
bias1_test.append(bias1_init)
bias2_init = bias2.eval()
bias2_test.append(bias2_init)
bias3_init = bias3.eval()
bias3_test.append(bias3_init)
""" ------------------------- """
train_accuracy = accuracy.eval(feed_dict=feedj)
print("step %d, training accuracy %g" % (j, train_accuracy))
train_step.run(feed_dict=feedj)
training_error.append(1 - train_accuracy)
cv_error=[]
for k in range_cv:
feedk = {x: soil.input_scale[k], y_true: soil.output_scale[k] , keep_prob: 1.0}
cv_accuracy = accuracy.eval(feed_dict=feedk)
print("cross-validation accuracy %g" % cv_accuracy)
cv_error.append(1-cv_accuracy)
for l in range_test:
print("test accuracy %g" % accuracy.eval(feed_dict={x: soil.input_matrixs[l], y_true: soil.output_matrixs[l], keep_prob: 1.0}))
The last weeks I was working on a Unit-model for this problem, but the same output occurred. I have no idea what to try next. Hope someone can help me.
Edit:
I checked some parameters in detail again. The hypothesis function (hyp) and activation function (z) for layer 3 and 4 (last layer) have the same entries for each data point, i.e. the same value in each line for one column.
1e^-3 is still fairly high, for the classifier you've described. NaN actually means that the weights have tended to infinity, so I would suggest exploring even lower learning rates, around 1e^-7 specifically. If it continues to diverge, multiply your learning rate by 0.1, and repeat until the weights are finite-valued.
Finally, no more NaN values. The solution is to scale my input and output data. The result (accuracy) is still not good, but at least I get some real values for the parameters. I tried feature scaling before in other attempts (where I probably had some other mistakes as well) and assumed it wouldn't help with my problem either.
Related
I'm trying to build a neural network on the Mnist dataset for a HW assignment. I'm not asking anyone to DO the assignment for me, I'm just having trouble figuring out why the Training accuracy and Test Accuracy seem to be static for every epoch?
It's as if my way of updating weights is not working.
Epoch: 0, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 1, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 2, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 3, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
.
.
.
However, when I run the actual forward and backprop lines in a loop without any 'fluff' of classes or methods the cost goes down. I just can't seem to get it working in the current class setup.
I've tried building my own methods that pass the weights and biases between the backprop and feed-forward methods explicitly, however, those changes haven't done anything to fix this gradient descent issue.
I'm pretty sure it has to do with the definition of the backprop method in the NeuralNetwork class below. I've been struggling to find a way to update the weights by accessing the weight and bias variables in the main training loop.
def backward(self, Y_hat, Y):
'''
Backward pass through network. Update parameters
INPUT
Y_hat: Network predicted
shape: (?, 10)
Y: Correct target
shape: (?, 10)
RETURN
cost: calculate J for errors
type: (float)
'''
#Naked Backprop
dJ_dZ2 = Y_hat - Y
dJ_dW2 = np.matmul(np.transpose(X2), dJ_dZ2)
dJ_db2 = Y_hat - Y
dJ_dX2 = np.matmul(dJ_db2, np.transpose(NeuralNetwork.W2))
dJ_dZ1 = dJ_dX2 * d_sigmoid(Z1)
inner_mat = np.matmul(Y-Y_hat,np.transpose(NeuralNetwork.W2))
dJ_dW1 = np.matmul(np.transpose(X),inner_mat) * d_sigmoid(Z1)
dJ_db1 = np.matmul(Y - Y_hat, np.transpose(NeuralNetwork.W2)) * d_sigmoid(Z1)
lr = 0.1
# weight updates here
#just line 'em up and do lr * the dJ_.. vars you found above
NeuralNetwork.W2 = NeuralNetwork.W2 - lr * dJ_dW2
NeuralNetwork.b2 = NeuralNetwork.b2 - lr * dJ_db2
NeuralNetwork.W1 = NeuralNetwork.W1 - lr * dJ_dW1
NeuralNetwork.b1 = NeuralNetwork.b1 - lr * dJ_db1
# calculate the cost
cost = -1 * np.sum(Y * np.log(Y_hat))
# calc gradients
# weight updates
return cost#, W1, W2, b1, b2
I'm really at a loss here, any help is appreciated!
Full code is shown here...
import keras
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
np.random.seed(0)
"""### Load MNIST Dataset"""
(x_train, y_train), (x_test, y_test) = mnist.load_data()
X = x_train[0].reshape(1,-1)/255.; Y = y_train[0]
zeros = np.zeros(10); zeros[Y] = 1
Y = zeros
#Here we implement the forward pass for the network using the single example, $X$, from above
### Initialize weights and Biases
num_hidden_nodes = 200
num_classes = 10
# init weights
#first set of weights (these are what the input matrix is multiplied by)
W1 = np.random.uniform(-1e-3,1e-3,size=(784,num_hidden_nodes))
#this is the first bias layer and i think it's a 200 dimensional vector of the biases that go into each neuron before the sigmoid function.
b1 = np.zeros((1,num_hidden_nodes))
#again this are the weights for the 2nd layer that are multiplied by the activation output of the 1st layer
W2 = np.random.uniform(-1e-3,1e-3,size=(num_hidden_nodes,num_classes))
#these are the biases that are added to each neuron before the final softmax activation.
b2 = np.zeros((1,num_classes))
# multiply input with weights
Z1 = np.add(np.matmul(X,W1), b1)
def sigmoid(z):
return 1 / (1 + np.exp(- z))
def d_sigmoid(g):
return sigmoid(g) * (1. - sigmoid(g))
# activation function of Z1
X2 = sigmoid(Z1)
Z2 = np.add(np.matmul(X2,W2), b2)
# softmax
def softmax(z):
# subracting the max adds numerical stability
shiftx = z - np.max(z)
exps = np.exp(shiftx)
return exps / np.sum(exps)
def d_softmax(Y_hat, Y):
return Y_hat - Y
# the hypothesis,
Y_hat = softmax(Z2)
"""Initially the network guesses all categories equally. As we perform backprop the network will get better at discerning images and their categories."""
"""### Calculate Cost"""
cost = -1 * np.sum(Y * np.log(Y_hat))
#so i think the main thing here is like a nested chain rule thing, where we find the change in the cost with respec to each
# set of matrix weights and biases?
#here is probably the order of how we do things based on whats in math below...
'''
1. find the partial deriv of the cost function with respect to the output of the second layer, without the softmax it looks like for some reason?
2. find the partial deriv of the cost function with respect to the weights of the second layer, which is dope cause we can re-use the partial deriv from step 1
3. this one I know intuitively we're looking for the parial deriv of cost with respect to the bias term of the second layer, but how TF does that math translate into
numpy? is that the same y_hat - Y from the first step? where is there anyother Y_hat - y?
4. This is also confusing cause I know where to get the weights for layer 2 from and how to transpose them, but again, where is the Y_hat - Y?
5. Here we take the missing partial deriv from step 4 and multiply it by the d_sigmoid function of the first layer outputs before activations.
6. In this step we multiply the first layer weights (transposed) by the var from 5
7. And this is weird too, this just seems like the same step as number 5 repeated for some reason but with y-y_hat instead of y_hat-y
'''
#look at tutorials like this https://www.youtube.com/watch?v=7qYtIveJ6hU
#I think the most backprop layer steps are fine without biases but how do we find the bias derivatives
#maybe just the hypothesis matrix minus the actual y matrix?
dJ_dZ2 = Y_hat - Y
#find partial deriv of cost w respect to 2nd layer weights
dJ_dW2 = np.matmul(np.transpose(X2), dJ_dZ2)
#finding the partial deriv of cost with respect to the 2nd layer biases
#I'm still not 100% sure why this is here and why it works out to Y_hat - Y
dJ_db2 = Y_hat - Y
#finding the partial deriv of cost with respect to 2nd layer inputs
dJ_dX2 = np.matmul(dJ_db2, np.transpose(W2))
#finding the partial deriv of cost with respect to Activation of layer 1
dJ_dZ1 = dJ_dX2 * d_sigmoid(Z1)
#y-yhat matmul 2nd layer weights
#I added the transpose to the W2 var because the matrices were not compaible sizes without it
inner_mat = np.matmul(Y-Y_hat,np.transpose(W2))
dJ_dW1 = np.matmul(np.transpose(X),inner_mat) * d_sigmoid(Z1)
class NeuralNetwork:
# set learning rate
lr = 0.01
# init weights
W1 = np.random.uniform(-1e-3,1e-3,size=(784,num_hidden_nodes))
b1 = np.zeros((1,num_hidden_nodes))
W2 = np.random.uniform(-1e-3,1e-3,size=(num_hidden_nodes,num_classes))
b2 = np.zeros((1,num_classes))
def __init__(self, num_hidden_nodes, num_classes, lr=0.01):
'''
# set learning rate
lr = lr
# init weights
W1 = np.random.uniform(-1e-3,1e-3,size=(784,num_hidden_nodes))
b1 = np.zeros((1,num_hidden_nodes))
W2 = np.random.uniform(-1e-3,1e-3,size=(num_hidden_nodes,num_classes))
b2 = np.zeros((1,num_classes))
'''
def forward(self, X1):
'''
Forward pass through the network
INPUT
X: input to network
shape: (?, 784)
RETURN
Y_hat: prediction from output of network
shape: (?, 10)
'''
Z1 = np.add(np.matmul(X,W1), b1)
X2 = sigmoid(Z1)# activation function of Z1
Z2 = np.add(np.matmul(X2,W2), b2)
Y_hat = softmax(Z2)
#return the hypothesis
return Y_hat
# store input for backward pass
# you can basically copy and past what you did in the forward pass above here
# think about what you need to store for the backward pass
return
def backward(self, Y_hat, Y):
'''
Backward pass through network. Update parameters
INPUT
Y_hat: Network predicted
shape: (?, 10)
Y: Correct target
shape: (?, 10)
RETURN
cost: calculate J for errors
type: (float)
'''
#Naked Backprop
dJ_dZ2 = Y_hat - Y
dJ_dW2 = np.matmul(np.transpose(X2), dJ_dZ2)
dJ_db2 = Y_hat - Y
dJ_dX2 = np.matmul(dJ_db2, np.transpose(NeuralNetwork.W2))
dJ_dZ1 = dJ_dX2 * d_sigmoid(Z1)
inner_mat = np.matmul(Y-Y_hat,np.transpose(NeuralNetwork.W2))
dJ_dW1 = np.matmul(np.transpose(X),inner_mat) * d_sigmoid(Z1)
dJ_db1 = np.matmul(Y - Y_hat, np.transpose(NeuralNetwork.W2)) * d_sigmoid(Z1)
lr = 0.1
# weight updates here
#just line 'em up and do lr * the dJ_.. vars you found above
NeuralNetwork.W2 = NeuralNetwork.W2 - lr * dJ_dW2
NeuralNetwork.b2 = NeuralNetwork.b2 - lr * dJ_db2
NeuralNetwork.W1 = NeuralNetwork.W1 - lr * dJ_dW1
NeuralNetwork.b1 = NeuralNetwork.b1 - lr * dJ_db1
# calculate the cost
cost = -1 * np.sum(Y * np.log(Y_hat))
# calc gradients
# weight updates
return cost#, W1, W2, b1, b2
nn = NeuralNetwork(200,10,lr=.01)
num_train = float(len(x_train))
num_test = float(len(x_test))
for epoch in range(10):
train_correct = 0; train_cost = 0
# training loop
for i in range(len(x_train)):
x = x_train[i]; y = y_train[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass through network
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was accurate
if pred_num == y:
train_correct += 1
# make a one hot categorical vector; same as keras.utils.to_categorical()
zeros = np.zeros(10); zeros[y] = 1
Y = zeros
# compute gradients and update weights
train_cost += nn.backward(Y_hat, Y)
test_correct = 0
# validation loop
for i in range(len(x_test)):
x = x_test[i]; y = y_test[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was correct
if pred_num == y:
test_correct += 1
# no backward pass here!
# compute average metrics for train and test
train_correct = round(100*(train_correct/num_train), 2)
test_correct = round(100*(test_correct/num_test ), 2)
train_cost = round( train_cost/num_train, 2)
# print status message every epoch
log_message = 'Epoch: {epoch}, Train Accuracy: {train_acc}%, Train Cost: {train_cost}, Test Accuracy: {test_acc}%'.format(
epoch=epoch,
train_acc=train_correct,
train_cost=train_cost,
test_acc=test_correct
)
print (log_message)
also, The project is in this colab & ipynb notebook
I believe this is pretty clear, in this part of your loop:
for epoch in range(10):
train_correct = 0; train_cost = 0
# training loop
for i in range(len(x_train)):
x = x_train[i]; y = y_train[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass through network
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was accurate
if pred_num == y:
train_correct += 1
# make a one hot categorical vector; same as keras.utils.to_categorical()
zeros = np.zeros(10); zeros[y] = 1
Y = zeros
# compute gradients and update weights
train_cost += nn.backward(Y_hat, Y)
test_correct = 0
# validation loop
for i in range(len(x_test)):
x = x_test[i]; y = y_test[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was correct
if pred_num == y:
test_correct += 1
# no backward pass here!
# compute average metrics for train and test
train_correct = round(100*(train_correct/num_train), 2)
test_correct = round(100*(test_correct/num_test ), 2)
train_cost = round( train_cost/num_train, 2)
# print status message every epoch
log_message = 'Epoch: {epoch}, Train Accuracy: {train_acc}%, Train Cost: {train_cost}, Test Accuracy: {test_acc}%'.format(
epoch=epoch,
train_acc=train_correct,
train_cost=train_cost,
test_acc=test_correct
)
print (log_message)
For every epoch of the 10 epochs in your loop, you are setting your train_correct and train_cost to 0, hence there is no updating after each epoch
I am trying to create a simple 3D U-net for image segmentation, just to learn how to use the layers. Therefore I do a 3D convolution with stride 2 and then a transpose deconvolution to get back the same image size. I am also overfitting to a small set (test set) just to see if my network is learning.
I created the same net in Keras and it works just fine. Now I want to create in tensorflow but I been having trouble with it.
The cost changes slightly but no matter what I do (reduce learning rate, add more epochs, add more layers, change batch size...) the output is always the same. I believe the net is not updating the weights. I am sure I am doing something wrong but I can find what it is. Any help would be greatly appreciate it.
Here is my code:
def forward_propagation(X):
if ( mode == 'train'): print(" --------- Net --------- ")
# Convolutional Layer 1
with tf.variable_scope('CONV1'):
Z1 = tf.layers.conv3d(X, filters = 16, kernel =[3,3,3], strides = [ 2, 2, 2], padding='SAME', name = 'S2/conv3d')
A1 = tf.nn.relu(Z1, name = 'S2/ReLU')
if ( mode == 'train'): print("Convolutional Layer 1 S2 " + str(A1.get_shape()))
# DEConvolutional Layer 1
with tf.variable_scope('DeCONV1'):
output_deconv1 = tf.stack([X.get_shape()[0] , X.get_shape()[1], X.get_shape()[2], X.get_shape()[3], 1])
dZ1 = tf.nn.conv3d_transpose(A1, filters = 1, kernel =[3,3,3], strides = [2, 2, 2], padding='SAME', name = 'S2/conv3d_transpose')
dA1 = tf.nn.relu(dZ1, name = 'S2/ReLU')
if ( mode == 'train'): print("Deconvolutional Layer 1 S1 " + str(dA1.get_shape()))
return dA1
def compute_cost(output, target, method = 'dice_hard_coe'):
with tf.variable_scope('COST'):
if (method == 'sigmoid_cross_entropy') :
# Make them vectors
output = tf.reshape( output, [-1, output.get_shape().as_list()[0]] )
target = tf.reshape( target, [-1, target.get_shape().as_list()[0]] )
loss = tf.nn.sigmoid_cross_entropy_with_logits(logits = output, labels = target)
cost = tf.reduce_mean(loss)
return cost
and the main function for the model:
def model(X_h5, Y_h5, learning_rate = 0.009,
num_epochs = 100, minibatch_size = 64, print_cost = True):
ops.reset_default_graph() # to be able to rerun the model without overwriting tf variables
#tf.set_random_seed(1) # to keep results consistent (tensorflow seed)
#seed = 3 # to keep results consistent (numpy seed)
(m, n_D, n_H, n_W, num_channels) = X_h5["test_data"].shape #TTT
num_labels = Y_h5["test_mask"].shape[4] #TTT
img_size = Y_h5["test_mask"].shape[1] #TTT
costs = [] # To keep track of the cost
accuracies = [] # To keep track of the accuracy
# Create Placeholders of the correct shape
X, Y = create_placeholders(n_H, n_W, n_D, minibatch_size)
# Forward propagation: Build the forward propagation in the tensorflow graph
nn_output = forward_propagation(X)
prediction = tf.nn.sigmoid(nn_output)
# Cost function: Add cost function to tensorflow graph
cost_method = 'sigmoid_cross_entropy'
cost = compute_cost(nn_output, Y, cost_method)
# Backpropagation: Define the tensorflow optimizer. Use an AdamOptimizer that minimizes the cost.
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost)
# Initialize all the variables globally
init = tf.global_variables_initializer()
# Start the session to compute the tensorflow graph
with tf.Session() as sess:
print('------ Training ------')
# Run the initialization
tf.local_variables_initializer().run(session=sess)
sess.run(init)
# Do the training loop
for i in range(num_epochs*m):
# ----- TRAIN -------
current_epoch = i//m
patient_start = i-(current_epoch * m)
patient_end = patient_start + minibatch_size
current_X_train = np.zeros((minibatch_size, n_D, n_H, n_W,num_channels))
current_X_train[:,:,:,:,:] = np.array(X_h5["test_data"][patient_start:patient_end,:,:,:,:]) #TTT
current_X_train = np.nan_to_num(current_X_train) # make nan zero
current_Y_train = np.zeros((minibatch_size, n_D, n_H, n_W, num_labels))
current_Y_train[:,:,:,:,:] = np.array(Y_h5["test_mask"][patient_start:patient_end,:,:,:,:]) #TTT
current_Y_train = np.nan_to_num(current_Y_train) # make nan zero
feed_dict = {X: current_X_train, Y: current_Y_train}
_ , temp_cost = sess.run([optimizer, cost], feed_dict=feed_dict)
# ----- TEST -------
# Print the cost every 1/5 epoch
if ((i % (num_epochs*m/5) )== 0):
# Calculate the predictions
test_predictions = np.zeros(Y_h5["test_mask"].shape)
for j in range(0, X_h5["test_data"].shape[0], minibatch_size):
patient_start = j
patient_end = patient_start + minibatch_size
current_X_test = np.zeros((minibatch_size, n_D, n_H, n_W, num_channels))
current_X_test[:,:,:,:,:] = np.array(X_h5["test_data"][patient_start:patient_end,:,:,:,:])
current_X_test = np.nan_to_num(current_X_test) # make nan zero
current_Y_test = np.zeros((minibatch_size, n_D, n_H, n_W, num_labels))
current_Y_test[:,:,:,:,:] = np.array(Y_h5["test_mask"][patient_start:patient_end,:,:,:,:])
current_Y_test = np.nan_to_num(current_Y_test) # make nan zero
feed_dict = {X: current_X_test, Y: current_Y_test}
_, current_prediction = sess.run([cost, prediction], feed_dict=feed_dict)
test_predictions[j:j + minibatch_size,:,:,:,:] = current_prediction
costs.append(temp_cost)
print ("[" + str(current_epoch) + "|" + str(num_epochs) + "] " + "Cost : " + str(costs[-1]))
display_progress(X_h5["test_data"], Y_h5["test_mask"], test_predictions, 5, n_H, n_W)
# plot the cost
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('epochs')
plt.show()
return
I call the model with:
model(hdf5_data_file, hdf5_mask_file, num_epochs = 500, minibatch_size = 1, learning_rate = 1e-3)
These are the results that I am currently getting:
Edit:
I have tried reducing the learning rate and it doesn't help. I also tried using tensorboard debug and the weights are not being updated:
I am not sure why this is happening.
I Created the same simple model in keras and it works fine. I am not sure what I am doing wrong in tensorflow.
Not sure if you are still looking for help, as I am answering this question half a year later your posted date. :) I've listed my observations and also some suggestions for you to try below. It my primary observation is right... then you probably just need a coffee break / a night of good sleep.
primary observation:
tf.reshape( output, [-1, output.get_shape().as_list()[0]] ) seems wrong. If you prefer to flatten the vector, it should be something like tf.reshape(output,[-1,np.prod(image_shape_list)]).
other observations:
With such a shallow network, I doubt the network have enough spatial resolution to differentiate tumor voxels from non-tumor voxels. Can you show the keras implementation and the performance compared to a pure tf implementation? I would probably go with 2+ layers, let's .
say with 3 layers, with a stride of 2 per layer, and an input image width of 256, you will end with a width of 32 at your deepest encoder layer. (If you have a limited GPU memory, downsample the input image.)
if changing the loss computation does not work, as #bremen_matt mentioned, reduce LR to say maybe 1e-5.
after the basic architecture tweaks and you "feel" that the network is sort of learning and not stuck, try augmenting the training data, add dropout, batch norm during training, and then maybe fancy up your loss by adding a discriminator.
I am performing regression on the iris data set to predict its type. I have successfully performed classification using the same data and same neural network. For classification, I have used tanh as the activation function in all layers. But for regression, I am using tanh function in the hidden layer and identity function in the output layer.
import numpy as np
class BackPropagation:
weight =[]
output =[]
layers =0
eta = 0.1
def __init__(self, x):
self.layers = len(x)
for i in range(self.layers-2):
w = np.random.randn(x[i]+1,x[i+1]+1)
self.weight.append(w)
w = w = np.random.randn(x[-2]+1,x[-1])
self.weight.append(w)
def tanh(self,x):
return np.tanh(x)
def deriv_tanh(self,x):
return 1.0-(x**2)
def linear(self,x):
return x
def deriv_linear(self,x):
return 1
def training(self,in_data,target,epoch=100):
bias = np.atleast_2d(np.ones(in_data.shape[0])*(-1)).T
in_data = np.hstack((in_data,bias))
print("Training Starts ......")
while epoch!=0:
epoch-=1
self.output=[]
self.output.append(in_data)
# FORWARD PHASE
for j in range(self.layers-2):
y_in = np.dot(self.output[j],self.weight[j])
y_out = self.tanh(y_in)
self.output.append(y_out)
y_in = np.dot(self.output[-1],self.weight[-1])
y_out = self.linear(y_in)
self.output.append(y_out)
print("Weight Is")
for i in self.weight:
print(i)
# BACKWARD PHASE
error = self.output[-1]-target
print("ERROR IS")
print(np.mean(0.5*error*error))
delta=[]
delta_o = error * self.deriv_linear(self.output[-1])
delta.append(delta_o)
for k in reversed(range(self.layers-2)):
delta_h = np.dot(delta[-1],self.weight[k+1].T) * self.deriv_tanh(self.output[k+1])
delta.append(delta_h)
delta.reverse()
# WEIGHT UPDATE
for i in range(self.layers-1):
self.weight[i] -= (self.eta * np.dot(self.output[i].T, delta[i]))
print("Training complete !")
print("ACCURACY IS")
acc = (1.0-(0.5*error*error))*100
print(np.mean(acc))
def recall(self,in_data):
in_data = np.atleast_2d(in_data)
bias = np.atleast_2d(np.ones(in_data.shape[0])*(-1)).T
in_data = np.hstack((in_data,bias))
y_out = in_data.copy()
for i in range(self.layers-2):
y_in = np.dot(y_out,self.weight[i])
y_out = self.tanh(y_in).copy()
y_in = np.dot(y_out,self.weight[-1])
y_out = self.linear(y_in).copy()
return y_out
# MAIN
data = np.loadtxt("iris.txt",delimiter=",")
obj = BackPropagation([4,2,1])
in_data = data[:rows,:cols].copy()
target = data[:rows,cols:].copy()
obj.training(in_data,target)
print("ANSWER IS")
print(obj.recall(in_data))
The data set is something like this. Here, first 4 columns are features and last column contains the target value. There are 150 records like this in the data set.
5.1,3.5,1.4,0.2,0
4.9,3.0,1.4,0.2,0
5.0,3.6,1.4,0.2,0
5.4,3.9,1.7,0.4,0
4.6,3.4,1.4,0.3,0
7.0,3.2,4.7,1.4,1
6.4,3.2,4.5,1.5,1
6.9,3.1,4.9,1.5,1
5.5,2.3,4.0,1.3,1
6.3,3.3,6.0,2.5,2
5.8,2.7,5.1,1.9,2
7.1,3.0,5.9,2.1,2
6.3,2.9,5.6,1.8,2
After every epoch, the predicted value is increasing exponentially. And, within 50 epochs, the code gives INF or -INF as output. Instead of identity function, I also tried leaky ReLU, but still the output was INF. I have also tried varying learning rate , number of neurons in hidden layers, number of hidden layers, initial weight values, number of iterations etc.
So, how can I perform regression using neural network with back propagation of error ?
Use the mean squared error function for regression tasks. For classification tasks, one usually uses a softmax layer as output and optimizes the cross-entry cost function.
I Wrote a Neural Network in TensorFlow for the XOR input. I have used 1 hidden layer with 2 units and softmax classification. The input is of the form <1, x_1, x_2, zero, one> , where
1 is the bias
x_1 and x_2 are either between 0 and 1 for all the combination {00, 01, 10, 11}. Selected to be normally distributed around 0 or 1
zero: is 1 if the output is zero
one: is 1 if the output is one
The accuracy is always around 0.5. What has gone wrong? Is the architecture of the neural network wrong, or is there something with the code?
import tensorflow as tf
import numpy as np
from random import randint
DEBUG=True
def init_weights(shape):
return tf.Variable(tf.random_normal(shape, stddev=0.01))
def model(X, weight_hidden, weight_output):
# [1,3] x [3,n_hiddent_units] = [1,n_hiddent_units]
hiddern_units_output = tf.nn.sigmoid(tf.matmul(X, weight_hidden))
# [1,n_hiddent_units] x [n_hiddent_units, 2] = [1,2]
return hiddern_units_output
#return tf.matmul(hiddern_units_output, weight_output)
def getHiddenLayerOutput(X, weight_hidden):
hiddern_units_output = tf.nn.sigmoid(tf.matmul(X, weight_hidden))
return hiddern_units_output
total_inputs = 100
zeros = tf.zeros([total_inputs,1])
ones = tf.ones([total_inputs,1])
around_zeros = tf.random_normal([total_inputs,1], mean=0, stddev=0.01)
around_ones = tf.random_normal([total_inputs,1], mean=1, stddev=0.01)
batch_size = 10
n_hiddent_units = 2
X = tf.placeholder("float", [None, 3])
Y = tf.placeholder("float", [None, 2])
weight_hidden = init_weights([3, n_hiddent_units])
weight_output = init_weights([n_hiddent_units, 2])
hiddern_units_output = getHiddenLayerOutput(X, weight_hidden)
py_x = model(X, weight_hidden, weight_output)
#cost = tf.square(Y - py_x)
cost = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits=py_x, labels=Y))
train_op = tf.train.GradientDescentOptimizer(0.05).minimize(cost)
with tf.Session() as sess:
tf.global_variables_initializer().run()
trX_0_0 = sess.run(tf.concat([ones, around_zeros, around_zeros, ones, zeros], axis=1))
trX_0_1 = sess.run(tf.concat([ones, around_zeros, around_ones, zeros, ones], axis=1))
trX_1_0 = sess.run(tf.concat([ones, around_ones, around_zeros, zeros, ones], axis=1))
trX_1_1 = sess.run(tf.concat([ones, around_ones, around_ones, ones, zeros], axis=1))
trX = sess.run(tf.concat([trX_0_0, trX_0_1, trX_1_0, trX_1_1], axis=0))
trX = sess.run(tf.random_shuffle(trX))
print(trX)
for i in range(10):
for start, end in zip(range(0, len(trX), batch_size), range(batch_size, len(trX) + 1, batch_size)):
trY = tf.identity(trX[start:end,3:5])
trY = sess.run(tf.reshape(trY,[batch_size, 2]))
sess.run(train_op, feed_dict={ X: trX[start:end,0:3], Y: trY })
start_index = randint(0, (total_inputs*4)-batch_size)
y_0 = sess.run(py_x, feed_dict={X: trX[start_index:start_index+batch_size,0:3]})
print("iteration :",i, " accuracy :", np.mean(np.absolute(trX[start_index:start_index+batch_size,3:5]-y_0)),"\n")
Check the comments section for the updated code
The problem was with the randomly assigned weights. Here is the modified version, obtained after a series of trail-and-error.
I am trying to make a simple MLP to predict values of a pixel of an image - original blog .
Here's my earlier attempt using Keras in python - link
I've tried to do the same in tensorflow, but I am getting very large output values (~10^12) when they should be less than 1.
Here's my code:
import numpy as np
import cv2
from random import shuffle
import tensorflow as tf
'''
Image preprocessing
'''
image_file = cv2.imread("Mona Lisa.jpg")
h = image_file.shape[0]
w = image_file.shape[1]
preX = []
preY = []
for i in xrange(h):
for j in xrange(w):
preX.append([i,j])
preY.append(image_file[i,j,:].astype('float32')/255.0)
print preX[:5], preY[:5]
zipped = [i for i in zip(preX,preY)]
shuffle(zipped)
X_train = np.array([i for (i,j) in zipped]).astype('float32')
Y_train = np.array([j for (i,j) in zipped]).astype('float32')
print X_train[:10], Y_train[:10]
'''
Tensorflow code
'''
def weight_variable(shape):
initial = tf.truncated_normal(shape, stddev=0.1)
return tf.Variable(initial)
def bias_variable(shape):
initial = tf.constant(0.1, shape=shape)
return tf.Variable(initial)
x = tf.placeholder(tf.float32, shape=[None,2])
y = tf.placeholder(tf.float32, shape=[None,3])
'''
Layers
'''
w1 = weight_variable([2,300])
b1 = bias_variable([300])
L1 = tf.nn.relu(tf.matmul(X_train,w1)+b1)
w2 = weight_variable([300,3])
b2 = bias_variable([3])
y_model = tf.matmul(L1,w2)+b2
'''
Training
'''
# criterion
MSE = tf.reduce_mean(tf.square(tf.sub(y,y_model)))
# trainer
train_op = tf.train.GradientDescentOptimizer(learning_rate = 0.01).minimize(MSE)
nb_epochs = 10
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
cost = 0
for i in range(nb_epochs):
sess.run(train_op, feed_dict ={x: X_train, y: Y_train})
cost += sess.run(MSE, feed_dict ={x: X_train, y: Y_train})
cost /= nb_epochs
print cost
'''
Prediction
'''
pred = sess.run(y_model,feed_dict = {x:X_train})*255.0
print pred[:10]
output_image = []
index = 0
h = image_file.shape[0]
w = image_file.shape[1]
for i in xrange(h):
row = []
for j in xrange(w):
row.append(pred[index])
index += 1
row = np.array(row)
output_image.append(row)
output_image = np.array(output_image)
output_image = output_image.astype('uint8')
cv2.imwrite('out_mona_300x3_tf.png',output_image)
First of all, I think that instead of running the train_op and then the MSE
you can run both ops in a list and reduce your computational cost significantly.
for i in range(nb_epochs):
cost += sess.run([MSE, train_op], feed_dict ={x: X_train, y: Y_train})
Secondly, I suggest always writing out your cost function so you can see what is going on during the training phase. Either manually print it out or use tensorboard to log your cost and plot it (you can find examples on the official tf page).
You can also monitor your weights to see that they aren't blowing up.
A few things you can try:
Reduce learning rate, add regularization to weights.
Check that your training set (pixels) really consist of the values that
you expect them to.
You give the input layer weights and the output layer weights the same names w and b, so it seems something goes wrong in the gradient-descent procedure. Actually I'm surprised tensorflow doesn't issue an error or at leas a warning (or am I missing something?)