I am trying to build a model that classifies my data, and I want to watch the loss function during training:
loss = tf.reduce_mean(-(y_ * tf.log(y) + (1 - y_) * tf.log(1 - y)))
But right now I only get NaN for the prediction, and the loss prints NaN as well.
import numpy as np
import tensorflow as tf

np_labels = np.array(labels)
np_labels = np_labels.reshape([np_labels.shape[0], 1])
features = 910
hidden_layer_nodes = 100
x = tf.placeholder(tf.float32, [None, features])
y_ = tf.placeholder(tf.float32, [None, 1])
W1 = tf.Variable(tf.truncated_normal([features, hidden_layer_nodes], stddev=0.1))
b1 = tf.Variable(tf.constant(0.1, shape=[hidden_layer_nodes]))
z1 = tf.add(tf.matmul(x, W1), b1)
a1 = tf.nn.relu(z1)
W2 = tf.Variable(tf.truncated_normal([hidden_layer_nodes, 1], stddev=0.1))
b2 = tf.Variable(0.)
z2 = tf.matmul(a1, W2) + b2
y = 1 / (1.0 + tf.exp(-z2))
loss = tf.reduce_mean(-(y_ * tf.log(y) + (1 - y_) * tf.log(1 - y)))
update = tf.train.AdamOptimizer(0.01).minimize(loss)
sess = tf.Session()
sess.run(tf.global_variables_initializer())
for i in range(0, 50):
    sess.run(update, feed_dict={x: fvecs, y_: np_labels})
    print(sess.run(loss, feed_dict={x: fvecs, y_: np_labels}))
    # sess.run(update, feed_dict = {x:data_x, y_:data_y})
    # print(sess.run(loss, feed_dict={x: data_x, y_: data_y}))
print('prediction: ', y.eval(session=sess, feed_dict = {x:[[493.9, 702.6, .....
I want to print the loss.
Thanks
This is not a TensorFlow issue. It results from the very bad idea of implementing the loss function yourself.
import tensorflow as tf

z2 = tf.random_normal([8, 10]) * 20
y_ = tf.random_uniform([8, 1], minval=0, maxval=10, dtype=tf.float32)
y = 1 / (1.0 + tf.exp(-z2))
loss = tf.reduce_mean(-(y_ * tf.log(y) + (1 - y_) * tf.log(1 - y)))

with tf.Session() as sess:
    print(sess.run(loss))  # will fail with high probability
This will give inf simply because the implementation is missing the log-sum-exp trick, which makes it fail due to numerical instabilities (a folklore example that produces overflows). Run this code several times and you will get either NaN or inf.
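To see the saturation concretely, here is a tiny NumPy sketch of the same effect (independent of TensorFlow): once |z| is large, float32 sigmoid rounds to exactly 0 or 1, and the log of that is -inf (NumPy will emit overflow warnings, which is exactly the point).

import numpy as np

z = np.float32(-100.0)
p = np.float32(1.0) / (np.float32(1.0) + np.exp(-z))  # exp(100) overflows float32 -> inf
print(p)          # 0.0  -- the sigmoid has saturated to exactly zero
print(np.log(p))  # -inf -- so the hand-written loss turns into inf/NaN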
The solution would be:
replace y = 1 / (1.0 + tf.exp(-z2)) (i.e. tf.sigmoid(z2)) with y = tf.identity(z2) to just get the untransformed logits
replace loss = ... with loss = tf.nn.sigmoid_cross_entropy_with_logits(...) to use the numerically stable formulation
See the docs of sigmoid_cross_entropy_with_logits, which explicitly describe this issue.
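Applied to the graph in the question, a minimal sketch of that change (keeping the variable names from the question) could look like this:

z2 = tf.matmul(a1, W2) + b2          # raw logits; no sigmoid applied here
y = tf.sigmoid(z2)                   # only used to read out predictions
loss = tf.reduce_mean(
    tf.nn.sigmoid_cross_entropy_with_logits(labels=y_, logits=z2))
update = tf.train.AdamOptimizer(0.01).minimize(loss)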
Related
I have written a script that demonstrates the linear regression algorithm, as follows:
import numpy as np
import tensorflow as tf

training_epochs = 100
learning_rate = 0.01
# the training set
x_train = np.linspace(0, 10, 100)
y_train = x_train + np.random.normal(0, 1, 100)
# set up placeholders for input and output
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)
# set up variables for weights
w0 = tf.Variable(0.0, name="w0")
w1 = tf.Variable(0.0, name="w1")
y_predicted = X*w1 + w0
# Define the cost function
costF = 0.5*tf.square(Y-y_predicted)
# Define the operation that will be called on each iteration
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(costF)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Loop through the training data
for epoch in range(training_epochs):
    for (x, y) in zip(x_train, y_train):
        sess.run(train_op, feed_dict={X: x, Y: y})
# get the values of the final weights
w_val_0, w_val_1 = sess.run([w0, w1])
sess.close()
With the script above, I can compute w_val_0 and w_val_1 easily. But if I change y_predicted to a quadratic model:
w0 = tf.Variable(0.0, name="w0")
w1 = tf.Variable(0.0, name="w1")
w2 = tf.Variable(0.0, name="w2")
y_predicted = X*X*w2 + X*w1 + w0
...
w_val_0,w_val_1,w_val_2 = sess.run([w0,w1,w2])
then I cannot compute w_val_0, w_val_1, and w_val_2. Please help me!
When you use X*X, the weights (w2, w1, and w0) grow rapidly, reaching inf, which produces NaN values in the loss, and no training happens. As a rule of thumb, always normalize the data to zero mean and unit variance.
Fixed code
import numpy as np
import tensorflow as tf
import matplotlib.pyplot as plt

training_epochs = 100
learning_rate = 0.01
# the training set
x_train = np.linspace(0, 10, 100)
y_train = x_train + np.random.normal(0, 1, 100)
# Normalize the data
x_mean = np.mean(x_train)
x_std = np.std(x_train)
x_train_ = (x_train - x_mean) / x_std
X = tf.placeholder(tf.float32)
Y = tf.placeholder(tf.float32)
# set up variables for weights
w0 = tf.Variable(0.0, name="w0")
w1 = tf.Variable(0.0, name="w1")
w2 = tf.Variable(0.0, name="w2")
y_predicted = X*X*w1 + X*w2 + w0
# Define the cost function
costF = 0.5*tf.square(Y-y_predicted)
# Define the operation that will be called on each iteration
train_op = tf.train.GradientDescentOptimizer(learning_rate).minimize(costF)
sess = tf.Session()
init = tf.global_variables_initializer()
sess.run(init)
# Loop through the training data
for epoch in range(training_epochs):
    for (x, y) in zip(x_train_, y_train):
        sess.run(train_op, feed_dict={X: x, Y: y})
y_hat = sess.run(y_predicted, feed_dict={X: x_train_})
print(sess.run([w0, w1, w2]))
sess.close()
plt.plot(x_train, y_train)
plt.plot(x_train, y_hat)
plt.show()
output:
[4.9228806, -0.08735728, 3.029659]
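Those numbers are roughly what you would expect: the data are generated as y ≈ x plus noise, and with x standardized the intercept should recover the mean of x, the linear coefficient its standard deviation, and the quadratic coefficient should stay near zero. A quick check:

import numpy as np

x_train = np.linspace(0, 10, 100)
print(np.mean(x_train))  # ~5.0 -> compare with w0 ~ 4.92
print(np.std(x_train))   # ~2.9 -> compare with w2 ~ 3.03, the linear term above
# w1, the quadratic term, should stay close to 0 (here -0.087)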
I had a curious experience with Keras.
Info: input dataset shapes
16 features, 5000 observations
target variable: 1 dimension
Problem: Regression
While writing code for students, I developed a toy network in plain TF using the following code (I know it is not a complete example, but I hope it gives you enough information):
n1 = 15 # Number of neurons in layer 1
n2 = 15 # Number of neurons in layer 2
n3 = 15
nx = number_of_x_points
n_dim = nx
n4 = 1
stddev_f = 0.1
tf.set_random_seed(5)
X = tf.placeholder(tf.float32, [n_dim, None])
Y = tf.placeholder(tf.float32, [10, None])
W1 = tf.Variable(tf.random_normal([n1, n_dim], stddev=stddev_f))
b1 = tf.Variable(tf.constant(0.0, shape = [n1,1]) )
W2 = tf.Variable(tf.random_normal([n2, n1], stddev=stddev_f))
b2 = tf.Variable(tf.constant(0.0, shape = [n2,1]))
W3 = tf.Variable(tf.random_normal([n3,n2], stddev = stddev_f))
b3 = tf.Variable(tf.constant(0.0, shape = [n3,1]))
W4 = tf.Variable(tf.random_normal([n4,n3], stddev = stddev_f))
b4 = tf.Variable(tf.constant(0.0, shape = [n4,1]))
X = tf.placeholder(tf.float32, [nx, None]) # Inputs
Y = tf.placeholder(tf.float32, [1, None]) # Labels
Z1 = tf.nn.sigmoid(tf.matmul(W1, X) + b1) # n1 x n_dim * n_dim x n_obs = n1 x n_obs
Z2 = tf.nn.sigmoid(tf.matmul(W2, Z1) + b2) # n2 x n1 * n1 * n_obs = n2 x n_obs
Z3 = tf.nn.sigmoid(tf.matmul(W3, Z2) + b3)
Z4 = tf.matmul(W4, Z3) + b4
y_ = tf.sigmoid(Z4)
cost = tf.reduce_mean(tf.square(y_-Y))
learning_rate = 0.005
training_step = tf.train.AdamOptimizer(learning_rate).minimize(cost)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
training_epochs = 1000
cost_history = np.empty(shape=[1], dtype = float)
cost_meas_history = np.empty(shape=[1], dtype = float)
train_x = np.transpose(data)
train_y = np.transpose(targets)
cost_history = []
for epoch in range(training_epochs + 1):
    for i in range(0, train_x.shape[0], batch_size):
        x_batch = train_x[i:i + batch_size, :]
        y_batch = train_y[i:i + batch_size, :]
        sess.run(training_step, feed_dict={X: x_batch, Y: y_batch})
    cost_ = sess.run(cost, feed_dict={X: train_x, Y: train_y})
    cost_history = np.append(cost_history, cost_)
    if epoch % 5000 == 0:
        print("Reached epoch", epoch, "cost J =", cost_)
This code works quite well and takes about 5 seconds for 1000 epochs on my laptop. I then developed the same network with Keras using this code:
from tensorflow.keras import layers

model = tf.keras.Sequential()
model.add(layers.Dense(15, input_dim=16, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(15, activation='sigmoid'))
model.add(layers.Dense(1, activation='sigmoid'))
model.compile(optimizer=tf.train.AdamOptimizer(0.005),
              loss='mse',
              metrics=['mae'])
# Training phase
model.fit(train_x.transpose(), train_y.transpose() / 100.0,
          epochs=1000, batch_size=100, verbose=0)
This code takes 43 seconds. Does anyone have any idea why this is the case? I expected Keras to be slower, but not that much slower. What am I missing?
Thanks, Umberto
OK, I found the reason... It was my mistake. Due to a series of errors caused by programming at night after midnight (...), I realized I was comparing batch GD and mini-batch GD. My apologies to everyone, and thanks to Today, who noticed my mistake...
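For reference, a batch-size mismatch alone accounts for a gap of this size: the run with the smaller batches performs far more parameter updates per epoch. A rough count, assuming the 5000 observations and batch size 100 from this thread, and that the plain-TF loop was effectively running full-batch:

observations = 5000   # dataset size mentioned above
batch_size = 100      # batch size passed to model.fit
epochs = 1000

updates_minibatch = epochs * (observations // batch_size)  # 50,000 optimizer steps
updates_fullbatch = epochs * 1                             # 1,000 optimizer steps
print(updates_minibatch, updates_fullbatch)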
If someone thinks this should be deleted, that is fine with me.
Now Keras and plain TF are taking exactly the same time. Thanks everyone for reading.
Best, Umberto
import numpy as np
import tensorflow as tf
import pandas as pd

data = pd.read_csv('mnist_train.csv')
X = data.drop('label', axis=1).values
y = data['label'].values

with tf.Session() as sess:
    Y = tf.one_hot(y, 10).eval()

hidden = [5, 4, 3]

def costa(y, yhat):
    loss = tf.nn.softmax_cross_entropy_with_logits_v2(logits=yhat, labels=y)
    loss = tf.reduce_sum(loss)
    return loss

def train(cost):
    train_op = tf.train.GradientDescentOptimizer(0.0001).minimize(cost)
    return train_op

with tf.Graph().as_default():
    X1 = tf.placeholder(tf.float32, [None, 784])
    y1 = tf.placeholder(tf.float32, [None, 10])
    w1 = tf.Variable(tf.random_normal((784, hidden[0])))
    w2 = tf.Variable(tf.random_normal((hidden[0], hidden[1])))
    w3 = tf.Variable(tf.random_normal((hidden[1], hidden[2])))
    wo = tf.Variable(tf.random_normal((hidden[2], 10)))
    b1 = tf.Variable(tf.random_normal((1, hidden[0])))
    b2 = tf.Variable(tf.random_normal((1, hidden[1])))
    b3 = tf.Variable(tf.random_normal((1, hidden[2])))
    bo = tf.Variable(tf.random_normal((1, 10)))
    layer1 = tf.nn.relu(tf.matmul(X1, w1) + b1)
    layer2 = tf.nn.relu(tf.matmul(layer1, w2) + b2)
    layer3 = tf.nn.relu(tf.matmul(layer2, w3) + b3)
    layerout = (tf.matmul(layer3, wo) + bo)
    yhat = layerout
    cost = costa(y1, yhat)
    train_op = train(cost)
    init_op = tf.global_variables_initializer()
    for epoch in range(1000):
        with tf.Session() as sess:
            sess.run(init_op)
            sess.run(train_op, feed_dict={X1: X, y1: Y})
            loss = sess.run(cost, feed_dict={X1: X, y1: Y})
            print("Loss for epoch {}: {}".format(epoch, loss))
The loss stays around the same value, jumping up and down a lot, but it does not decrease as expected.
I can't seem to find what is going wrong here; any help would be appreciated.
Is it the activations of the layers, or am I getting the cost function wrong?
There are a couple of issues here:
You are running sess.run(init_op) every epoch. This means that the model parameters are reset to random numbers every epoch and therefore cannot learn anything. Try running this op once, before the for epoch in range(1000) loop.
You are creating a new session every epoch. Change your code so it looks like this:
with tf.Session() as sess:
    sess.run(init_op)
    for epoch in range(1000):
        sess.run(train_op, feed_dict={X1: X, y1: Y})
        loss = sess.run(cost, feed_dict={X1: X, y1: Y})
        print("Loss for epoch {}: {}".format(epoch, loss))
Initialising weights with a standard deviation of (2.0/neurons_in_prev_layer)**0.5 worked like a charm for me!
I also changed the hidden layers to two hidden layers of 256 neurons each.
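For concreteness, a minimal sketch of that initialization applied to the weights from the question, assuming the two 256-unit hidden layers mentioned above:

# stddev = sqrt(2 / fan_in), i.e. (2.0 / neurons_in_prev_layer) ** 0.5
hidden = [256, 256]
w1 = tf.Variable(tf.random_normal((784, hidden[0]), stddev=(2.0 / 784) ** 0.5))
w2 = tf.Variable(tf.random_normal((hidden[0], hidden[1]), stddev=(2.0 / hidden[0]) ** 0.5))
wo = tf.Variable(tf.random_normal((hidden[1], 10), stddev=(2.0 / hidden[1]) ** 0.5))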
Okay, one little tweak did the trick: I used RMSPropOptimizer instead, and the loss started decreasing as expected.
I still have to figure out why this works; I'm still learning, but for now this is the solution I have.
Although the loss decreases very slowly.
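For anyone following along, that swap amounts to changing one line in the train helper from the question (keeping the same learning rate):

def train(cost):
    # RMSProp instead of plain gradient descent; learning rate kept from the question
    train_op = tf.train.RMSPropOptimizer(0.0001).minimize(cost)
    return train_op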
I am trying to build a deep network using TF, with Martin Gorner's video as a reference. I had some success with the shallow network example; however, the deep network's accuracy collapses after reaching around 98% for some reason.
The network recognises MNIST numerical characters using five layers. I am training with batches of 100 for 10,000 iterations. The accuracy steadily increases until it reaches around 98%, then collapses completely to 9.8%.
Any ideas please?
"""Tensor flow character recognition of Numerals"""
import tensorflow as tf
from tensorflow.examples.tutorials.mnist import input_data
mnist = input_data.read_data_sets("MNIST_data", one_hot=True)
# layer K will have 200 neurons, and so on
K = 200
L = 100
M = 60
N = 30
# ----- Initialization -----
# the None will become the batch size of 100
# 28 by 28 grayscale images described by a single byte
X = tf.placeholder(tf.float32, [None, 784])
# training will require computing variables W and b
W1 = tf.Variable(tf.truncated_normal([28*28, K], stddev=0.1))
B1 = tf.Variable(tf.zeros([K]))
W2 = tf.Variable(tf.truncated_normal([K, L], stddev=0.1))
B2 = tf.Variable(tf.zeros([L]))
W3 = tf.Variable(tf.truncated_normal([L, M], stddev=0.1))
B3 = tf.Variable(tf.zeros([M]))
W4 = tf.Variable(tf.truncated_normal([M, N], stddev=0.1))
B4 = tf.Variable(tf.zeros([N]))
W5 = tf.Variable(tf.truncated_normal([N, 10], stddev=0.1))
B5 = tf.Variable(tf.zeros([10]))
init = tf.global_variables_initializer()
# ----- Model -----
# the model Y = WX+b
# reshape is used to flatten the image into a 1D array of 784 locations
# -1 tells TensorFlow to infer that dimension, as there's only one solution
#Y = tf.nn.softmax(tf.matmul(tf.reshape(X, [-1, 784]), W) + b)
Y1 = tf.nn.relu(tf.matmul(X, W1) + B1)
Y2 = tf.nn.relu(tf.matmul(Y1, W2) + B2)
Y3 = tf.nn.relu(tf.matmul(Y2, W3) + B3)
Y4 = tf.nn.relu(tf.matmul(Y3, W4) + B4)
Y5 = tf.nn.softmax(tf.matmul(Y4, W5) + B5)
# placeholder for correct answers
# e.g. correct answer for 2 will be [0 0 1 0 0 0 0 0 0 0 ]
Y_ = tf.placeholder(tf.float32, [None, 10])
# the loss function
cross_entropy = tf.reduce_sum(Y_ * tf.log(Y5)) * -1
# ----- Success Metrics -----
# calculate the % of correct answers found in batch
is_correct = tf.equal(tf.argmax(Y5, 1), tf.argmax(Y_, 1))
accuracy = tf.reduce_mean(tf.cast(is_correct, tf.float32))
# ----- Training Step -----
# pick an optimizer and tell it to minimize the cross entropy loss function
optimizer = tf.train.GradientDescentOptimizer(0.003)
train_step = optimizer.minimize(cross_entropy)
# create the execution session
sess = tf.Session()
sess.run(init)
for i in range(10000):
    # load a batch of images from mnist
    batch_X, batch_Y = mnist.train.next_batch(100)
    train_data = {X: batch_X, Y_: batch_Y}
    # ----- Execution -----
    # train
    sess.run(train_step, feed_dict=train_data)
    # test for success
    a, c = sess.run([accuracy, cross_entropy], feed_dict=train_data)
    # this is only to display information
    if i % 100 == 0:
        # check for success on whole data set
        test_data = {X: mnist.test.images, Y_: mnist.test.labels}
        a, c = sess.run([accuracy, cross_entropy], feed_dict=test_data)
        print(a)
It is the accuracy on the validation set that collapses, right?
So you may be dramatically overfitting.
98% is possibly the best you can achieve with a network of this capacity/structure.
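A minimal way to check that hypothesis is to log the training-batch accuracy next to the test accuracy, reusing the ops already defined in the question:

if i % 100 == 0:
    train_a, train_c = sess.run([accuracy, cross_entropy], feed_dict=train_data)
    test_a, test_c = sess.run([accuracy, cross_entropy],
                              feed_dict={X: mnist.test.images, Y_: mnist.test.labels})
    print(i, "train acc:", train_a, "test acc:", test_a,
          "train loss:", train_c, "test loss:", test_c)

If training accuracy stays near 100% while test accuracy drops, that points to overfitting; if both collapse together, the optimization itself is diverging.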
I've been struggling with this pet problem for a while now, so any help would be appreciated!
I have a CSV file with a few random columns and a final column that is based on the sum of the last few values from the first column. I'm trying to use an LSTM model to capture this structure, i.e. to predict the last column given the first few.
Here's the model I've been using:
# Generate test data
train_input = train_input.reshape(m, n_input, 1)  # m is the number of rows, n_input is the number of input columns
NUM_EXAMPLES = int(m * training_size)
test_input = train_input[NUM_EXAMPLES:]
test_output = train_output[NUM_EXAMPLES:]
train_input = train_input[:NUM_EXAMPLES]
train_output = train_output[:NUM_EXAMPLES]
# Design model
data = tf.placeholder(tf.float32, [None, n_input, 1])
target = tf.placeholder(tf.float32, [None, n_classes])
num_hidden = 24
cell = tf.contrib.rnn.LSTMCell(num_hidden, state_is_tuple=True)
val, state = tf.nn.dynamic_rnn(cell, data, dtype=tf.float32)
val = tf.transpose(val, [1, 0, 2])
last = tf.gather(val, int(val.get_shape()[0]) - 1)
weight = tf.Variable(tf.truncated_normal([num_hidden, int(target.get_shape()[1])]))
bias = tf.Variable(tf.constant(0.1, shape=[target.get_shape()[1]]))
prediction = tf.nn.softmax(tf.matmul(last, weight) + bias)
cross_entropy = -tf.reduce_sum(target * tf.log(tf.clip_by_value(prediction,1e-10,1.0)))
optimizer = tf.train.AdamOptimizer()
minimize = optimizer.minimize(cross_entropy)
mistakes = tf.not_equal(tf.argmax(target, 1), tf.argmax(prediction, 1))
error = tf.reduce_mean(tf.cast(mistakes, tf.float32))
init_op = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init_op)
no_of_batches = int(len(train_input)/batch_size)
for i in range(epoch):
    ptr = 0
    for j in range(no_of_batches):
        inp, out = train_input[ptr:ptr + batch_size], train_output[ptr:ptr + batch_size]
        ptr += batch_size
        sess.run(minimize, {data: inp, target: out})
    print("Epoch - {}".format(i))
    incorrect = sess.run(error, {data: test_input, target: test_output})
    print('Epoch {:2d} error {:3.1f}%'.format(i + 1, 100 * incorrect))
sess.close()
I've tried several spreadsheets with random numbers, and I consistently get around an 83% error rate. On the other hand, this algorithm CAN learn if the target column is not sequential.
Thanks in advance!
I can't quite follow your description. Do you mean you have a CSV file like this?
x1 x2 x3 x4 ... xn
v11 v21 v31 v41 ... vn1
v12 v22 v32 v42 ... vn2
...
v1n v2n v3n v4n ... vnn
y1 y2 y3 y4 ... yn
And yn is based on sum(vn1 + ... + vnn)? Something like a * sum(V) + b?