Related
I am building several simple networks to predict the bike rentals at 500 stations in the upcoming hour, given rentals at all stations in the previous 24 hours. I am working with two architectures, one with a graph convolution (which amounts to updating each station with a learned linear combination of other stations, at each hour) and a FNN layer to prediction, and a second with a graph convolution -> LSTM -> FNN to prediction.
Before I describe more, I'm getting poorer performance for my model which includes an LSTM unit, which is confusing me.
See these two images for a description of each architecture, for each architecture I also add hourly meta-data (weather, time, etc) as variation, they are in the images in red, and not relevant to my question. Image links at the bottom of the post.
[Architecture 1: GCNN + FNN][1]
[Architecture 2: GCNN + LSTM + FNN][2]
Confusingly, the test RMSE for the first model is 3.46, for the second model its 3.57. Could someone please explain to me why the second wouldn't be lower, as it seems to be running the exact same processes, except with an additional LSTM unit.
Here are relevant snippets of my code for the GCNN+FNN model:
def gcnn_ddgf(hidden_layer, node_num, feature_in, horizon, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training, Y_training, X_val, Y_val, X_test, Y_test, scaler, display_step):
n_output_vec = node_num * horizon # length of output vector at the final layer
early_stop_k = 0 # early stop patience
best_val = 10000
traing_error = 0
test_error = 0
pred_Y = []
tf.reset_default_graph()
batch_size = batch_size
early_stop_th = early_stop_th
training_epochs = training_epochs
# tf Graph input and output
X = tf.placeholder(tf.float32, [None, node_num, feature_in]) # X is the input signal
Y = tf.placeholder(tf.float32, [None, n_output_vec]) # y is the regression output
# define dictionaries to store layers weight & bias
weights_hidden = {}
weights_A = {}
biases = {}
vec_length = feature_in
weights_hidden['h1'] = tf.Variable(tf.random_normal([vec_length, hidden_layer], stddev=0.5))
biases['b1'] = tf.Variable(tf.random_normal([1, hidden_layer], stddev=0.5))
weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
weights_hidden['out'] = tf.Variable(tf.random_normal([hidden_layer, horizon], stddev=0.5))
biases['bout'] = tf.Variable(tf.random_normal([1, horizon], stddev=0.5))
# Construct model
pred= gcn(X, weights_hidden, weights_A, biases, node_num, horizon) #see below
pred = scaler.inverse_transform(pred)
Y_original = scaler.inverse_transform(Y)
cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y_original, 2)))
#optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(training_epochs):
avg_cost_sq = 0.
num_train = X_training.shape[0]
total_batch = int(num_train/batch_size)
for i in range(total_batch):
_, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
Y: Y_training[i*batch_size:(i+1)*batch_size,]})
avg_cost_sq += np.square(c) * batch_size #/ total_batch
# rest part of training dataset
if total_batch * batch_size != num_train:
_, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
Y: Y_training[total_batch*batch_size:num_train,]})
avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)
avg_cost = np.sqrt(avg_cost_sq / num_train)
# validation
c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
if c_val < best_val:
# testing
c_tes, preds, Y_true = sess.run([cost, pred, Y_original], feed_dict={X: X_test,Y: Y_test})
best_val = c_val
test_error = c_tes
traing_error = avg_cost
pred_Y = preds
early_stop_k = 0 # reset to 0
# update early stopping patience
if c_val >= best_val:
early_stop_k += 1
# threshold
if early_stop_k == early_stop_th:
break
if epoch % display_step == 0:
print ("Epoch:", '%04d' % (epoch+1), "Training RMSE: ","{:.9f}".format(avg_cost))
print("Validation RMSE: ", c_val)
print("Lowest test RMSE: ", test_error)
print("epoch is ", epoch)
print("training RMSE is ", traing_error)
print("Optimization Finished! the lowest validation RMSE is ", best_val)
print("The test RMSE is ", test_error)
return best_val, pred_Y ,Y_true,test_error
# code that creates the model
def gcn(signal_in, weights_hidden, weights_A, biases, node_num, horizon):
signal_in = tf.transpose(signal_in, [1, 0, 2]) # node_num, batch, feature_in
feature_len = signal_in.shape[2] # feature vector length at the node of the input graph
signal_in = tf.reshape(signal_in, [node_num, -1]) # node_num, batch*feature_in
Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
Adj = normalize_adj(Adj)
Z = tf.matmul(Adj, signal_in) # node_num, batch*feature_in
Z = tf.reshape(Z, [-1, int(feature_len)]) # node_num * batch, feature_in
signal_output = tf.add(tf.matmul(Z, weights_hidden['h1']), biases['b1'])
signal_output = tf.nn.relu(signal_output) # node_num * batch, hidden_vec
final_output = tf.add(tf.matmul(signal_output, weights_hidden['out']), biases['bout']) # node_num * batch, horizon
# final_output = tf.nn.relu(final_output)
final_output = tf.reshape(final_output, [node_num, -1, horizon]) # node_num, batch, horizon
final_output = tf.transpose(final_output, [1, 0, 2]) # batch, node_num, horizon
final_output = tf.reshape(final_output, [-1, node_num*horizon]) # batch, node_num*horizon
return final_output
And the code for the GCNN+LSTM+FNN model:
def gcnn_ddgf_lstm(node_num, feature_in, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training,
Y_training, X_val, Y_val, X_test, Y_test, scaler, lstm_layer):
n_output_vec = node_num # length of output vector at the final layer
early_stop_k = 0 # early stop patience
display_step = 1 # frequency of printing results
best_val = 10000
traing_error = 0
test_error = 0
predic_res = []
tf.reset_default_graph()
batch_size = batch_size
early_stop_th = early_stop_th
training_epochs = training_epochs
# tf Graph input and output
X = tf.placeholder(tf.float32, [None, node_num, feature_in]) # X is the input signal
Y = tf.placeholder(tf.float32, [None, n_output_vec]) # y is the regression output
lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_layer, state_is_tuple=True)
# define dictionaries to store layers weight & bias
weights_hidden = {}
weights_A = {}
biases = {}
weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
weights_hidden['h1'] = tf.Variable(tf.random_normal([lstm_layer, node_num], stddev=0.5))
biases['h1'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
weights_hidden['out'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
biases['bout'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
# Construct model
pred= gcn_lstm(X, weights_hidden, weights_A, biases, node_num, lstm_cell)
# pred = scaler.inverse_transform(pred)
# Y_original = scaler.inverse_transform(Y)
cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y, 2)))
#optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for epoch in range(training_epochs):
avg_cost_sq = 0.
num_train = X_training.shape[0]
total_batch = int(num_train/batch_size)
for i in range(total_batch):
_, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
Y: Y_training[i*batch_size:(i+1)*batch_size,]})
avg_cost_sq += np.square(c) * batch_size #/ total_batch
# rest part of training dataset
if total_batch * batch_size != num_train:
_, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
Y: Y_training[total_batch*batch_size:num_train,]})
avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)
avg_cost = np.sqrt(avg_cost_sq / num_train)
# validation
c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
if c_val < best_val:
c_tes, preds = sess.run([cost, pred], feed_dict={X: X_test,Y: Y_test})
best_val = c_val
# save model
#saver.save(sess, './bikesharing_gcnn_ddgf')
test_error = c_tes
traing_error = avg_cost
early_stop_k = 0 # reset to 0
# update early stopping patience
if c_val >= best_val:
early_stop_k += 1
# threshold
if early_stop_k == early_stop_th:
pred_Y = scaler.inverse_transform(preds)
Y_true = scaler.inverse_transform(Y_test)
test_err = tf.sqrt(tf.reduce_mean(tf.pow(pred_Y - Y_true, 2)))
break
if epoch % display_step == 0:
print ("Epoch:", '%04d' % (epoch+1), "Training RMSE: ","{:.9f}".format(avg_cost))
print("Validation RMSE: ", c_val)
print("Lowest test RMSE: ", test_error)
print("epoch is ", epoch)
print("training RMSE is ", traing_error)
print("Optimization Finished! the lowest validation RMSE is ", best_val)
print("The scaled test RMSE is ", test_error)
return pred_Y, Y_true
def gcn_lstm(signal_in, weights_hidden, weights_A, biases, node_num, lstm_cell):
signal_in = tf.transpose(signal_in, [1, 0, 2]) # node_num, batch, feature_in
feature_len = signal_in.shape[2] # feature vector length at the node of the input graph
signal_in = tf.reshape(signal_in, [node_num, -1]) # node_num, batch*feature_in
Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
Adj = normalize_adj(Adj)
Z = tf.matmul(Adj, signal_in) # node_num, batch*feature_in
Z = tf.reshape(Z, [node_num, -1, int(feature_len)]) # node_num, batch, feature_in
Z = tf.transpose(Z,[1,2,0]) # batch, feature_in, node_num
# init_state = cell.zero_state(batch_size, tf.float32)
_, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype = tf.float32) # init_state?
dense_output = tf.add(tf.matmul(Z[1], weights_hidden['h1']), biases['h1'])
dense_output = tf.nn.relu(dense_output)
final_output = tf.add(tf.matmul(dense_output, weights_hidden['out']), biases['bout']) # batch, node_num*horizon
return final_output
In particular, should I be weary that _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype = tf.float32) causes my variables defined elsewhere not to train?
Thanks a lot for any help :)
[1]: https://i.stack.imgur.com/MAO2t.png
[2]: https://i.stack.imgur.com/UDjHw.png
I resolved this.
I have three years of bike use data to make the prediction, and was using the ~last three months as my validation/test set. The last few months were winter with lower bike use. I got expected results (GCNN+LSTM outperforms GCNN, though not by much) when I shuffled my training data prior to allocating to sets (with sequences preserved for LSTM)
I am trying to set an instance so that dropout is compute only during the training session, but somehow it seems that the model doesn't see the dropout layer, as when modifying the probabilities nothing happens. I suspect it's a logic issue in my code, but I can't spot where. Also, I'm relatively new to this world, so please cope with my inexperience. Any help will be much appreciated.
Here's the code. I first create a Boolean placeholder
Train = tf.placeholder(tf.bool,shape=())
which will be then passed into a dictionary value as true(training) or False(test). Then I implemented the forward propagation as follows.
def forward_prop_cost(X, parameters,string,drop_probs,Train):
"""
Implements the forward propagation for the model: LINEAR -> RELU -> LINEAR -> RELU -> LINEAR -> SOFTMAX
Arguments:
X -- input dataset placeholder, of shape (input size, number of examples)
parameters -- python dictionary containing your parameters "W1", "b1", ...
string - ReLU or tanh
drop_probs = drop probabilities for each layer. First and last == 0
Train = boolean
Returns:
ZL -- the output of the last LINEAR unit
"""
L = len(drop_probs)-1
activations = []
activations.append(X)
if string == 'ReLU':
for i in range(1,L):
Zi = tf.matmul(parameters['W'+str(i)],activations[i-1]) + parameters['b'+str(i)]
if (Train == True and drop_probs[i] != 0):
Ai = tf.nn.dropout(tf.nn.relu(Zi),drop_probs[i])
else:
Ai = tf.nn.relu(Zi)
activations.append(Ai)
elif string == 'tanh': #needs update!
for i in range(1,L):
Zi = tf.matmul(parameters['W'+str(i)],activations[i-1]) + parameters['b'+str(i)]
Ai = tf.nn.dropout(tf.nn.tanh(Zi),drop_probs[i])
activations.append(Ai)
ZL = tf.matmul(parameters['W'+str(L)],activations[L-1]) + parameters['b'+str(L)]
logits = tf.transpose(ZL)
labels = tf.transpose(Y)
return ZL
Then I call the model function, where just at the end I pass the values of the Train as true or false, depending on the data set I'm using.
def model(X_train, Y_train, X_test, Y_test,hidden = [12288,25,12,6], string = 'ReLU',drop_probs = [0.,0.4,0.2,0.],
regular_param = 0.0, starter_learning_rate = 0.0001,
num_epochs = 1500, minibatch_size = 32, print_cost = True, learning_decay = False):
'''
Returns:
parameters -- parameters learnt by the model. They can then be used to predict.
'''
ops.reset_default_graph()
tf.set_random_seed(1)
seed = 3
(n_x, m) = X_train.shape # (n_x: input size, m : number of examples in the train set)
n_y = Y_train.shape[0] # n_y : output size
costs = [] # To keep track of the cost
graph = tf.Graph()
X, Y ,Train = create_placeholders(n_x, n_y)
parameters = initialize_parameters(hidden)
#print([n.name for n in tf.get_default_graph().as_graph_def().node])
ZL = forward_prop_cost(X, parameters,'ReLU',drop_probs,Train)
#cost = forward_prop_cost(X, parameters,'ReLU',drop_probs,regular_param )
cost = compute_cost(ZL,Y,parameters,regular_param)
#optimizer = tf.train.AdamOptimizer(learning_rate = starter_learning_rate).minimize(cost)
if learning_decay == True:
increasing = tf.Variable(0, trainable=False)
learning_rate = tf.train.exponential_decay(starter_learning_rate,increasing * minibatch_size,m, 0.95, staircase=True)
optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost,global_step=increasing)
else:
optimizer = tf.train.AdamOptimizer(learning_rate = starter_learning_rate).minimize(cost)
# Initialize all the variables
init = tf.global_variables_initializer()
# Start the session to compute the tensorflow graph
with tf.Session() as sess:
# Run the initialization
sess.run(init, { Train: True } )
# Do the training loop
for epoch in range(num_epochs):
epoch_cost = 0.
num_minibatches = int(m / minibatch_size)
seed = seed + 1
minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)
for minibatch in minibatches:
(minibatch_X, minibatch_Y) = minibatch
_ , minibatch_cost = sess.run([optimizer, cost], feed_dict={X: minibatch_X, Y: minibatch_Y})
epoch_cost += minibatch_cost / num_minibatches
# Print the cost every 100 epoch
if print_cost == True and epoch % 100 == 0:
print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
if print_cost == True and epoch % 5 == 0:
costs.append(epoch_cost)
# plot the cost
plt.plot(np.squeeze(costs))
plt.ylabel('cost')
plt.xlabel('iterations (per fives)')
plt.title("Learning rate =" + str(learning_rate))
plt.show()
parameters = sess.run(parameters)
print ("Parameters have been trained!")
# Calculate accuracy on the test set
correct_prediction = tf.equal(tf.argmax(ZL), tf.argmax(Y))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train, Train: True}))
print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test, Train: False}))
return parameters
I am currently taking the Deep Learning specialization by Deeplearning.ai on Coursera and am on the first assignment that requires implementing Neural Network with Logistic Regression mindset. The problem is that the assignment is implementation of Neural Network as Logistic Regression function for UNSTRUCTURED DATA (IMAGES). I have successfully completed the assignment, getting all the expected outputs. However, I am now trying to use the coded Neural Network for STRUCTURE DATA but come across broadcast error. Part of the code is as below :
The dataset code
path_train = r'C:\Users\Ahmed Ismail Khalid\Desktop\Research Paper\Research Paper Feature Sets\Balanced Feature Sets\Balanced Train combined scores.csv'
path_test = r'C:\Users\Ahmed Ismail Khalid\Desktop\Research Paper\Research Paper Feature Sets\Balanced Feature Sets\Balanced Test combined scores.csv'
df_train = pd.read_csv(path_train)
#df_train = df_train.to_numpy()
df_test = pd.read_csv(path_test)
#df_test = df_test.to_numpy()
x_train = df_train.iloc[:,1:19]
x_train = x_train.to_numpy()
x_train = x_train.T
y_train = df_train.iloc[:,19]
y_train = y_train.to_numpy()
y_train = y_train.reshape(y_train.shape[0],1)
y_train = y_train.T
x_test = df_test.iloc[:,1:19]
x_test = x_test.to_numpy()
x_test = x_test.T
y_test = df_test.iloc[:,19]
y_test = y_test.to_numpy()
y_test = y_test.reshape(y_test.shape[0],1)
y_test = y_test.T
print ("Number of training examples: m_train = " + str(m_train))
print ("Number of testing examples: m_test = " + str(m_test))
print ("train_set_x shape: " + str(x_train.shape))
print ("train_set_y shape: " + str(y_train.shape))
print ("test_set_x shape: " + str(x_test.shape))
print ("test_set_y shape: " + str(y_test.shape))
Output of Dataset Code
Number of training examples: df_train = 713
Number of testing examples: df_test = 237
x_train shape: (18, 713)
y_train shape: (1, 713)
x_test shape: (18, 237)
y_test shape: (1, 237)
The propagate function code
def propagate(w,b,X,Y) :
m = X.shape[1]
A = sigmoid((w.T * X) + b)
cost = (- 1 / m) * np.sum(np.dot(Y,np.log(A)) + np.dot((1 - Y), np.log(1 - A)))
dw = (1 / m) * np.dot((X,(A - Y)).T)
db = (1 / m) * np.sum(A - Y)
assert(dw.shape == w.shape)
assert(db.dtype == float)
cost = np.squeeze(cost)
assert(cost.shape == ())
grads = {"dw": dw,
"db": db}
return grads, cost
The optimize and model functions
**def optimize**(w,b,X,Y,num_iterations,learning_rate,print_cost) :
costs = []
for i in range(num_iterations) :
# Cost and gradient calculation
grads, cost = propagate(w,b,X,Y)
# Retrieve derivatives from gradients
dw = grads['dw']
db = grads['db']
# Update w and b
w = w - learning_rate * dw
b = b - learning_rate * db
if i % 100 == 0:
costs.append(cost)
# Print the cost every 100 training iterations
if print_cost and i % 100 == 0:
print ("Cost after iteration %i: %f" %(i, cost))
params = {"w": w,
"b": b}
grads = {"dw": dw,
"db": db}
return params, grads, costs
**def model**(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False) :
# initialize parameters with zero
w, b = initialize_with_zeros(X_train.shape[0])
# Gradient descent (≈ 1 line of code)
parameters, grads, costs = optimize(w,b,X_train,Y_train,num_iterations,learning_rate,print_cost)
# Retrieve parameters w and b from dictionary "parameters"
w = parameters["w"]
b = parameters["b"]
# Predict train/test set examples (≈ 2 lines of code)
Y_prediction_train = predict(w,b,X_train)
Y_prediction_test = predict(w,b,X_test)
# Print train/test Errors
print("train accuracy: {} %".format(100 - np.mean(abs(Y_prediction_train - Y_train)) * 100))
print("test accuracy: {} %".format(100 - np.mean(abs(Y_prediction_test - Y_test)) * 100))
d = {"costs": costs,
"Y_prediction_test": Y_prediction_test,
"Y_prediction_train" : Y_prediction_train,
"w" : w,
"b" : b,
"learning_rate" : learning_rate,
"num_iterations": num_iterations}
return d
Model Function output
Cost after iteration 0: 0.693147
train accuracy: -0.1402524544179613 %
test accuracy: 0.4219409282700326 %
When I run the code, I get ValueError: operands could not be broadcast together with shapes (1,713) (713,18) at A = sigmoid((w.T * X) + b). I am pretty new to neural networks and usage of numpy, so I can't figure out the problem. Any and all help would be really appreciated. The entire .ipynb file containing the entire code can be downloaded from here
Thanks
The * operator is elementwise multiplication, and your arrays have incompatible shapes. You want matrix multiplication, which you can do with np.matmul() or with the # operator:
A = sigmoid(w.T # X + b)
A lot of ML, especially neural nets, is about keeping the shapes of things straight. Check the shapes of your w, X, and Y — they should be: (features, 1), (features, m), (1, m) respectively, where features is 18 for you, and m is 713.
You should also then be able to make sure that the shape of A matches Y.
I am attempting to complete the following tensorflow tutorial and (attempting problem 4): https://github.com/tensorflow/tensorflow/blob/master/tensorflow/examples/udacity/3_regularization.ipynb
However, I think I might be setting up the arrays of weights below wrong. As soon as I change hidden_layer to [image_size * image_size,1024,num_labels] (i.e. just one hidden layer), this works fine. Currently I am getting NaNs for the loss.
One possible solution is that the block
for i in range(1,len(weights)-1):
relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]),p_hide)
is causing the problems since I am destroying the past value of relus and Neural Nets need them to do backpropagation. In fact when there is one hidden layer this block does not get executed.
batch_size = 128
hidden_layer = [image_size * image_size,1024,300,num_labels]
l2_regulariser = 0.005
p_hide = 0.5
graph = tf.Graph()
with graph.as_default():
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
# Variables.
weights = [None]*(len(hidden_layer)-1)
biases = [None]*(len(hidden_layer)-1)
for i in range(len(weights)):
weights[i] = tf.Variable(tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]]))
biases[i] = tf.Variable(tf.zeros([hidden_layer[i+1]]))
# Training computation.
relus = tf.nn.dropout(tf.nn.relu(tf.matmul(tf_train_dataset, weights[0]) + biases[0]),p_hide)
for i in range(1,len(weights)-1):
relus = tf.nn.dropout(tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i]),p_hide)
logits = tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1]
loss = 0
for weight in weights:
loss += tf.nn.l2_loss(weight)
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(logits, tf_train_labels))+ l2_regulariser*loss
# Optimizer.
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, decay_steps=20, decay_rate=0.9)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(logits)
relus = tf.nn.relu(tf.matmul(tf_valid_dataset, weights[0]) + biases[0])
for i in range(1,len(weights)-1):
relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
valid_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])
relus = tf.nn.relu(tf.matmul(tf_test_dataset, weights[0]) + biases[0])
for i in range(1,len(weights)-1):
relus = tf.nn.relu(tf.matmul(relus, weights[i]) + biases[i])
test_prediction = tf.nn.softmax(tf.matmul(relus, weights[len(weights)-1]) + biases[len(weights)-1])
######################
# The NN training part
######################
num_steps = 3001
with tf.Session(graph=graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels, global_step : int(step)}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
You should better initialize your weights:
tf.truncated_normal([hidden_layer[i], hidden_layer[i+1]], stddev=0.1)
And most of all, you should lower your learning rate to something around 0.01, 0.001...
I think your get a loss of NaN because the learning rate is too high and it breaks the network (you get exploding weights).
I'm writing a neural-network classifier in TensorFlow/Python for the notMNIST dataset. I've implemented l2 regularization and dropout on the hidden layers. It works fine as long as there is only one hidden layer, but when I added more layers (to improve accuracy), the loss function increases rapidly at each step, becoming NaN by step 5. I tried temporarily disabling Dropout and L2 regularization, but I get the same behavior as long as there are 2+ layers. I even rewrote my code from scratch (doing some refactoring to make it more flexible), but with the same results. The number and size of layers is controlled by hidden_layer_spec. What am I missing?
#works for np.array([1024]) with about 96.1% accuracy
hidden_layer_spec = np.array([1024, 300])
num_hidden_layers = hidden_layer_spec.shape[0]
batch_size = 256
beta = 0.0005
epochs = 100
stepsPerEpoch = float(train_dataset.shape[0]) / batch_size
num_steps = int(math.ceil(float(epochs) * stepsPerEpoch))
l2Graph = tf.Graph()
with l2Graph.as_default():
#with tf.device('/cpu:0'):
# Input data. For the training data, we use a placeholder that will be fed
# at run time with a training minibatch.
tf_train_dataset = tf.placeholder(tf.float32,
shape=(batch_size, image_size * image_size))
tf_train_labels = tf.placeholder(tf.float32, shape=(batch_size, num_labels))
tf_valid_dataset = tf.constant(valid_dataset)
tf_test_dataset = tf.constant(test_dataset)
weights = []
biases = []
for hi in range(0, num_hidden_layers + 1):
width = image_size * image_size if hi == 0 else hidden_layer_spec[hi - 1]
height = num_labels if hi == num_hidden_layers else hidden_layer_spec[hi]
weights.append(tf.Variable(tf.truncated_normal([width, height]), name = "w" + `hi + 1`))
biases.append(tf.Variable(tf.zeros([height]), name = "b" + `hi + 1`))
print(`width` + 'x' + `height`)
def logits(input, addDropoutLayer = False):
previous_layer = input
for hi in range(0, hidden_layer_spec.shape[0]):
previous_layer = tf.nn.relu(tf.matmul(previous_layer, weights[hi]) + biases[hi])
if addDropoutLayer:
previous_layer = tf.nn.dropout(previous_layer, 0.5)
return tf.matmul(previous_layer, weights[num_hidden_layers]) + biases[num_hidden_layers]
# Training computation.
train_logits = logits(tf_train_dataset, True)
l2 = tf.nn.l2_loss(weights[0])
for hi in range(1, len(weights)):
l2 = l2 + tf.nn.l2_loss(weights[0])
loss = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(train_logits, tf_train_labels)) + beta * l2
# Optimizer.
global_step = tf.Variable(0) # count the number of steps taken.
learning_rate = tf.train.exponential_decay(0.5, global_step, int(stepsPerEpoch) * 2, 0.96, staircase = True)
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
# Predictions for the training, validation, and test data.
train_prediction = tf.nn.softmax(train_logits)
valid_prediction = tf.nn.softmax(logits(tf_valid_dataset))
test_prediction = tf.nn.softmax(logits(tf_test_dataset))
saver = tf.train.Saver()
with tf.Session(graph=l2Graph) as session:
tf.initialize_all_variables().run()
print("Initialized")
for step in range(num_steps):
# Pick an offset within the training data, which has been randomized.
# Note: we could use better randomization across epochs.
offset = (step * batch_size) % (train_labels.shape[0] - batch_size)
# Generate a minibatch.
batch_data = train_dataset[offset:(offset + batch_size), :]
batch_labels = train_labels[offset:(offset + batch_size), :]
# Prepare a dictionary telling the session where to feed the minibatch.
# The key of the dictionary is the placeholder node of the graph to be fed,
# and the value is the numpy array to feed to it.
feed_dict = {tf_train_dataset : batch_data, tf_train_labels : batch_labels}
_, l, predictions = session.run(
[optimizer, loss, train_prediction], feed_dict=feed_dict)
if (step % 500 == 0):
print("Minibatch loss at step %d: %f" % (step, l))
print("Learning rate: " % learning_rate)
print("Minibatch accuracy: %.1f%%" % accuracy(predictions, batch_labels))
print("Validation accuracy: %.1f%%" % accuracy(
valid_prediction.eval(), valid_labels))
print("Test accuracy: %.1f%%" % accuracy(test_prediction.eval(), test_labels))
save_path = saver.save(session, "l2_degrade.ckpt")
print("Model save to " + `save_path`)
Turns out this was not so much a coding issue as a Deep Learning Issue. The extra layer made the gradients too unstable, and that lead to the loss function quickly devolving to NaN. The best way to fix this is to use Xavier initialization. Otherwise, the variance of the initial values will tend to be too high, causing instability. Also, decreasing the learning rate may help.
I had the same problem and reducing the batch size and learning rate worked for me.