LSTM+FFN performs more poorly than FFN - python

I am building several simple networks to predict the bike rentals at 500 stations in the upcoming hour, given rentals at all stations in the previous 24 hours. I am working with two architectures, one with a graph convolution (which amounts to updating each station with a learned linear combination of other stations, at each hour) and a FNN layer to prediction, and a second with a graph convolution -> LSTM -> FNN to prediction.
Before I describe more, I'm getting poorer performance for my model which includes an LSTM unit, which is confusing me.
See these two images for a description of each architecture. For each architecture I also add hourly meta-data (weather, time, etc.) as a variation; these are shown in red in the images and are not relevant to my question. Image links are at the bottom of the post.
[Architecture 1: GCNN + FNN][1]
[Architecture 2: GCNN + LSTM + FNN][2]
Confusingly, the test RMSE for the first model is 3.46, while for the second model it's 3.57. Could someone please explain to me why the second wouldn't be lower, given that it seems to be running exactly the same processes, just with an additional LSTM unit?
Here are relevant snippets of my code for the GCNN+FNN model:
def gcnn_ddgf(hidden_layer, node_num, feature_in, horizon, learning_rate, beta, batch_size, early_stop_th,
              training_epochs, X_training, Y_training, X_val, Y_val, X_test, Y_test, scaler, display_step):
    """Train the GCNN + feed-forward model with early stopping on validation RMSE.

    The network is built by ``gcn``. Predictions and targets are both passed
    through ``scaler.inverse_transform`` before the RMSE cost is formed, so all
    RMSE values reported here are in the scaler's original units.

    Args:
        hidden_layer: width of the hidden dense layer.
        node_num: number of graph nodes (second axis of the input placeholder).
        feature_in: length of the input feature vector of each node.
        horizon: number of values predicted per node.
        learning_rate: learning rate passed to Adam.
        beta: value passed to Adam as ``beta1``.
        batch_size: number of samples per optimisation step.
        early_stop_th: number of consecutive epochs without validation
            improvement after which training stops.
        training_epochs: maximum number of epochs.
        X_training, Y_training: training inputs / targets.
        X_val, Y_val: validation inputs / targets (used for model selection).
        X_test, Y_test: test inputs / targets, evaluated each time the
            validation RMSE improves.
        scaler: object whose ``inverse_transform`` is applied to the
            prediction and target tensors.
        display_step: progress is printed every ``display_step`` epochs.

    Returns:
        Tuple ``(best_val, pred_Y, Y_true, test_error)``: the lowest
        validation RMSE, the test predictions and test targets fetched at
        that epoch, and the test RMSE at that epoch. ``pred_Y`` and
        ``Y_true`` are empty lists if the validation RMSE never improved.
    """
    n_output_vec = node_num * horizon  # length of output vector at the final layer
    early_stop_k = 0  # epochs since the last validation improvement
    best_val = 10000
    traing_error = 0
    test_error = 0
    pred_Y = []
    Y_true = []  # stays empty if validation never improves
    epoch = -1  # keeps the summary prints valid when training_epochs == 0

    tf.reset_default_graph()

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output

    # dictionaries storing the layers' weights & biases
    # (variables are created in the same order as before: h1, b1, A1, out, bout)
    weights_hidden = {}
    weights_A = {}
    biases = {}
    weights_hidden['h1'] = tf.Variable(tf.random_normal([feature_in, hidden_layer], stddev=0.5))
    biases['b1'] = tf.Variable(tf.random_normal([1, hidden_layer], stddev=0.5))
    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([hidden_layer, horizon], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, horizon], stddev=0.5))

    # Construct model; the cost is the RMSE in un-scaled units
    pred = gcn(X, weights_hidden, weights_A, biases, node_num, horizon)
    pred = scaler.inverse_transform(pred)
    Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y_original, 2)))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    num_train = X_training.shape[0]
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            # One pass over the training set; the last batch may be shorter.
            for start in range(0, num_train, batch_size):
                stop = min(start + batch_size, num_train)
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[start:stop,],
                                                              Y: Y_training[start:stop,]})
                # c is a per-batch RMSE: weight its square by the batch length
                # so the epoch value is the RMSE over all training samples.
                avg_cost_sq += np.square(c) * (stop - start)
            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
            if c_val < best_val:
                # testing
                c_tes, preds, Y_true = sess.run([cost, pred, Y_original], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                test_error = c_tes
                traing_error = avg_cost
                pred_Y = preds
                early_stop_k = 0  # reset to 0
            else:
                # Count only epochs that did NOT improve. The previous
                # `if c_val >= best_val` test ran after best_val had been
                # overwritten, so it also fired on improving epochs.
                early_stop_k += 1
                if early_stop_k == early_stop_th:
                    break

            if epoch % display_step == 0:
                print ("Epoch:", '%04d' % (epoch+1), "Training RMSE: ","{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

        print("epoch is ", epoch)
        print("training RMSE is ", traing_error)
        print("Optimization Finished! the lowest validation RMSE is ", best_val)
        print("The test RMSE is ", test_error)
    return best_val, pred_Y, Y_true, test_error
# code that creates the model
def gcn(signal_in, weights_hidden, weights_A, biases, node_num, horizon):
    """Graph-convolution layer followed by a two-layer dense head.

    The learned matrix ``weights_A['A1']`` is symmetrised, normalised with
    ``normalize_adj`` and used to mix the node signals; the mixed features
    then go through a ReLU hidden layer and a linear output layer.

    Returns a tensor reshaped to ``[-1, node_num * horizon]``.
    """
    # Bring the node axis to the front so a single matmul mixes all nodes.
    nodes_first = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    n_features = int(nodes_first.shape[2])  # feature vector length per node
    flat_signal = tf.reshape(nodes_first, [node_num, -1])  # node_num, batch*feature_in

    learned_adj = weights_A['A1']
    sym_adj = normalize_adj(0.5 * (learned_adj + tf.transpose(learned_adj)))

    mixed = tf.matmul(sym_adj, flat_signal)  # node_num, batch*feature_in
    mixed = tf.reshape(mixed, [-1, n_features])  # node_num * batch, feature_in

    hidden = tf.nn.relu(
        tf.add(tf.matmul(mixed, weights_hidden['h1']), biases['b1'])
    )  # node_num * batch, hidden_vec
    per_node_out = tf.add(
        tf.matmul(hidden, weights_hidden['out']), biases['bout']
    )  # node_num * batch, horizon

    # Undo the node-first layout and flatten node/horizon into one axis.
    per_node_out = tf.reshape(per_node_out, [node_num, -1, horizon])  # node_num, batch, horizon
    batch_first = tf.transpose(per_node_out, [1, 0, 2])  # batch, node_num, horizon
    return tf.reshape(batch_first, [-1, node_num * horizon])  # batch, node_num*horizon
And the code for the GCNN+LSTM+FNN model:
def gcnn_ddgf_lstm(node_num, feature_in, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training,
                   Y_training, X_val, Y_val, X_test, Y_test, scaler, lstm_layer):
    """Train the GCNN + LSTM + feed-forward model with early stopping.

    The network is built by ``gcn_lstm``. Unlike ``gcnn_ddgf``, the cost here
    is the RMSE between the raw network output and ``Y`` (no inverse
    scaling inside the graph), so the RMSE values printed during training
    are in scaled units. Only the returned arrays are inverse-transformed.

    Args:
        node_num: number of graph nodes; also the length of the output vector.
        feature_in: length of the input feature vector of each node.
        learning_rate: learning rate passed to Adam.
        beta: value passed to Adam as ``beta1``.
        batch_size: number of samples per optimisation step.
        early_stop_th: number of consecutive epochs without validation
            improvement after which training stops.
        training_epochs: maximum number of epochs.
        X_training, Y_training: training inputs / targets.
        X_val, Y_val: validation inputs / targets (used for model selection).
        X_test, Y_test: test inputs / targets, evaluated each time the
            validation RMSE improves.
        scaler: object whose ``inverse_transform`` is applied to the best
            test predictions and to ``Y_test``.
        lstm_layer: number of units of the LSTM cell.

    Returns:
        Tuple ``(pred_Y, Y_true)``: inverse-transformed test predictions from
        the epoch with the lowest validation RMSE, and inverse-transformed
        ``Y_test``. ``pred_Y`` is an empty list if validation never improved.
    """
    n_output_vec = node_num  # length of output vector at the final layer
    early_stop_k = 0  # epochs since the last validation improvement
    display_step = 1  # frequency of printing results
    best_val = 10000
    traing_error = 0
    test_error = 0
    preds = None  # test predictions at the best validation epoch
    epoch = -1  # keeps the summary prints valid when training_epochs == 0

    tf.reset_default_graph()

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output
    lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_layer, state_is_tuple=True)

    # dictionaries storing the layers' weights & biases
    # (variables are created in the same order as before)
    weights_hidden = {}
    weights_A = {}
    biases = {}
    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['h1'] = tf.Variable(tf.random_normal([lstm_layer, node_num], stddev=0.5))
    biases['h1'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))

    # Construct model; the cost is the RMSE in scaled units
    pred = gcn_lstm(X, weights_hidden, weights_A, biases, node_num, lstm_cell)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y, 2)))
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    num_train = X_training.shape[0]
    with tf.Session() as sess:
        sess.run(init)
        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            # One pass over the training set; the last batch may be shorter.
            for start in range(0, num_train, batch_size):
                stop = min(start + batch_size, num_train)
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[start:stop,],
                                                              Y: Y_training[start:stop,]})
                # c is a per-batch RMSE: weight its square by the batch length
                # so the epoch value is the RMSE over all training samples.
                avg_cost_sq += np.square(c) * (stop - start)
            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})
            if c_val < best_val:
                c_tes, preds = sess.run([cost, pred], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                test_error = c_tes
                traing_error = avg_cost
                early_stop_k = 0  # reset to 0
            else:
                # Count only epochs that did NOT improve. The previous
                # `if c_val >= best_val` test ran after best_val had been
                # overwritten, so it also fired on improving epochs.
                early_stop_k += 1
                if early_stop_k == early_stop_th:
                    break

            if epoch % display_step == 0:
                print ("Epoch:", '%04d' % (epoch+1), "Training RMSE: ","{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

        print("epoch is ", epoch)
        print("training RMSE is ", traing_error)
        print("Optimization Finished! the lowest validation RMSE is ", best_val)
        print("The scaled test RMSE is ", test_error)

    # Build the return values unconditionally. They used to be assigned only
    # inside the early-stop branch, so a run that used up all epochs failed
    # at `return` with an unbound local.
    Y_true = scaler.inverse_transform(Y_test)
    pred_Y = scaler.inverse_transform(preds) if preds is not None else []
    return pred_Y, Y_true
def gcn_lstm(signal_in, weights_hidden, weights_A, biases, node_num, lstm_cell):
    """Graph convolution -> LSTM -> two dense layers.

    The learned matrix ``weights_A['A1']`` is symmetrised, normalised with
    ``normalize_adj`` and used to mix the node signals. The mixed signal is
    then fed to ``tf.nn.dynamic_rnn`` and the returned state (not the
    per-step outputs) is passed through a ReLU dense layer and a linear
    output layer.

    Args:
        signal_in: input tensor; transposed with ``[1, 0, 2]`` so the node
            axis comes first.
        weights_hidden: dict providing ``'h1'`` and ``'out'`` weight matrices.
        weights_A: dict providing the learned ``'A1'`` matrix.
        biases: dict providing ``'h1'`` and ``'bout'`` bias vectors.
        node_num: number of graph nodes.
        lstm_cell: RNN cell handed to ``tf.nn.dynamic_rnn``.

    Returns:
        The output tensor of the final linear layer.
    """
    signal_in = tf.transpose(signal_in, [1, 0, 2]) # node_num, batch, feature_in
    feature_len = signal_in.shape[2] # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1]) # node_num, batch*feature_in
    # Average A1 with its transpose so the matrix used for mixing is symmetric.
    Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in) # node_num, batch*feature_in
    Z = tf.reshape(Z, [node_num, -1, int(feature_len)]) # node_num, batch, feature_in
    # After this transpose the feature_in axis is the one dynamic_rnn steps over.
    Z = tf.transpose(Z,[1,2,0]) # batch, feature_in, node_num
    # init_state = cell.zero_state(batch_size, tf.float32)
    # dynamic_rnn returns (outputs, state); the outputs are discarded and Z is
    # rebound to the final state.
    _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype = tf.float32) # init_state?
    # Z[1] is the second element of that state -- presumably the hidden state h
    # of a tuple-state LSTM cell; confirm against the cell passed by the caller.
    dense_output = tf.add(tf.matmul(Z[1], weights_hidden['h1']), biases['h1'])
    dense_output = tf.nn.relu(dense_output)
    final_output = tf.add(tf.matmul(dense_output, weights_hidden['out']), biases['bout']) # batch, node_num*horizon
    return final_output
In particular, should I be wary that _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype = tf.float32) causes my variables defined elsewhere not to train?
Thanks a lot for any help :)
[1]: https://i.stack.imgur.com/MAO2t.png
[2]: https://i.stack.imgur.com/UDjHw.png

I resolved this.
I have three years of bike use data to make the prediction, and was using the ~last three months as my validation/test set. The last few months were winter with lower bike use. I got expected results (GCNN+LSTM outperforms GCNN, though not by much) when I shuffled my training data prior to allocating to sets (with sequences preserved for LSTM)

Related

Dropout not computed in tensorflow

I am trying to set things up so that dropout is computed only during the training session, but somehow it seems that the model doesn't see the dropout layer: when I modify the probabilities, nothing happens. I suspect it's a logic issue in my code, but I can't spot where. Also, I'm relatively new to this world, so please bear with my inexperience. Any help will be much appreciated.
Here's the code. I first create a Boolean placeholder
Train = tf.placeholder(tf.bool,shape=())
which will then be passed in the feed dictionary as True (training) or False (test). Then I implemented the forward propagation as follows.
def forward_prop_cost(X, parameters,string,drop_probs,Train):
    """
    Implements the forward propagation for the model:
    LINEAR -> ACT -> (DROPOUT) -> ... -> LINEAR, where ACT is ReLU or tanh.

    Arguments:
    X -- input dataset placeholder, of shape (input size, number of examples)
    parameters -- python dictionary containing your parameters "W1", "b1", ...
    string -- 'ReLU' or 'tanh', the activation used on the hidden layers
    drop_probs -- drop probabilities, one entry per layer index; only the
                  entries of the hidden layers (indices 1 .. L-1) are read
    Train -- dropout is added to the graph only when this compares equal to
             the Python value True while the graph is being built

    Returns:
    ZL -- the output of the last LINEAR unit

    Raises:
    ValueError -- if there are hidden layers and `string` is not a known
                  activation name
    """
    activation_fns = {'ReLU': tf.nn.relu, 'tanh': tf.nn.tanh}

    L = len(drop_probs)-1
    activation = activation_fns.get(string)
    if activation is None and L > 1:
        raise ValueError("string must be 'ReLU' or 'tanh', got %r" % (string,))

    # NOTE: this is a plain Python comparison evaluated once, at graph
    # construction time. A tf.bool placeholder passed as `Train` therefore
    # never enables dropout here, whatever is fed later. Switching at run
    # time would need tf.cond on the placeholder, and the placeholder would
    # then have to be fed on every session.run call.
    training = (Train == True)

    A_prev = X
    for i in range(1, L):
        Zi = tf.matmul(parameters['W'+str(i)], A_prev) + parameters['b'+str(i)]
        Ai = activation(Zi)
        if training and drop_probs[i] != 0:
            # drop_probs holds DROP probabilities, while the TF1 dropout op
            # takes the probability of KEEPING a unit.
            Ai = tf.nn.dropout(Ai, keep_prob=1.0 - drop_probs[i])
        A_prev = Ai

    ZL = tf.matmul(parameters['W'+str(L)], A_prev) + parameters['b'+str(L)]
    return ZL
Then I call the model function, where just at the end I pass the values of the Train as true or false, depending on the data set I'm using.
def model(X_train, Y_train, X_test, Y_test,hidden = [12288,25,12,6], string = 'ReLU',drop_probs = [0.,0.4,0.2,0.],
          regular_param = 0.0, starter_learning_rate = 0.0001,
          num_epochs = 1500, minibatch_size = 32, print_cost = True, learning_decay = False):
    '''
    Builds the graph, trains it with Adam on random mini-batches, plots the
    cost curve and prints train / test accuracy.

    Arguments:
    X_train, Y_train -- training data; X_train has shape (input size, number of examples)
    X_test, Y_test -- test data, used only for the final accuracy print
    hidden -- layer sizes handed to initialize_parameters
    string -- activation name forwarded to forward_prop_cost
    drop_probs -- per-layer drop probabilities forwarded to forward_prop_cost
    regular_param -- regularisation parameter forwarded to compute_cost
    starter_learning_rate -- (initial) learning rate for Adam
    num_epochs -- number of passes over the training data
    minibatch_size -- size requested from random_mini_batches
    print_cost -- print the cost every 100 epochs and record it every 5
    learning_decay -- if true, decay the learning rate exponentially

    Returns:
    parameters -- parameters learnt by the model. They can then be used to predict.
    '''
    ops.reset_default_graph()
    tf.set_random_seed(1)
    seed = 3
    (n_x, m) = X_train.shape    # (n_x: input size, m : number of examples in the train set)
    n_y = Y_train.shape[0]      # n_y : output size
    costs = []                  # To keep track of the cost

    X, Y, Train = create_placeholders(n_x, n_y)
    parameters = initialize_parameters(hidden)

    # Forward the caller's activation choice; it used to be hard-coded to
    # 'ReLU', which silently ignored the `string` argument.
    ZL = forward_prop_cost(X, parameters, string, drop_probs, Train)
    cost = compute_cost(ZL, Y, parameters, regular_param)

    if learning_decay:
        increasing = tf.Variable(0, trainable=False)
        learning_rate = tf.train.exponential_decay(starter_learning_rate, increasing * minibatch_size, m, 0.95, staircase=True)
        optimizer = tf.train.AdamOptimizer(learning_rate = learning_rate).minimize(cost, global_step=increasing)
    else:
        optimizer = tf.train.AdamOptimizer(learning_rate = starter_learning_rate).minimize(cost)

    # Initialize all the variables
    init = tf.global_variables_initializer()

    # Normaliser for the running epoch cost; at least 1 so that a training
    # set smaller than one mini-batch does not divide by zero.
    num_minibatches = max(1, int(m / minibatch_size))

    # Start the session to compute the tensorflow graph
    with tf.Session() as sess:
        # Run the initialization
        sess.run(init)

        # Do the training loop
        for epoch in range(num_epochs):
            epoch_cost = 0.
            seed = seed + 1
            minibatches = random_mini_batches(X_train, Y_train, minibatch_size, seed)
            for minibatch_X, minibatch_Y in minibatches:
                # Train is fed on every optimisation step so that a graph
                # which depends on it is always run in training mode.
                _, minibatch_cost = sess.run([optimizer, cost],
                                             feed_dict={X: minibatch_X, Y: minibatch_Y, Train: True})
                epoch_cost += minibatch_cost / num_minibatches

            # Print the cost every 100 epoch
            if print_cost and epoch % 100 == 0:
                print ("Cost after epoch %i: %f" % (epoch, epoch_cost))
            if print_cost and epoch % 5 == 0:
                costs.append(epoch_cost)

        # plot the cost
        plt.plot(np.squeeze(costs))
        plt.ylabel('cost')
        plt.xlabel('iterations (per fives)')
        # `learning_rate` only exists when learning_decay is on (and is a
        # tensor then), so the title reports the starting rate instead.
        plt.title("Learning rate =" + str(starter_learning_rate))
        plt.show()

        parameters = sess.run(parameters)
        print ("Parameters have been trained!")

        # Calculate accuracy; both sets are evaluated with Train=False so
        # that dropout noise does not distort the reported numbers.
        correct_prediction = tf.equal(tf.argmax(ZL), tf.argmax(Y))
        accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
        print ("Train Accuracy:", accuracy.eval({X: X_train, Y: Y_train, Train: False}))
        print ("Test Accuracy:", accuracy.eval({X: X_test, Y: Y_test, Train: False}))
    return parameters

How to make Feed Forward NN more accurate?

I just finished writing my first ever neural network and it finally works, but it works really badly. I get about 0.37 accuracy. Any tips on how to make it more accurate? I have already tried different learning rates and also different numbers of hidden layer units, but I never get above 0.37 accuracy. I'm trying to classify data into one of the 3 classes 0, 1 or 2. I use a one-hot matrix as my Y. How could I improve my code?
X = data[1:, 2:]
m, n = X.shape
labels = data[1:, 1]
Y = np.zeros((m,3))
i = 0
for label in labels:
if label == 0:
Y[i,0] = 1
elif label == 1:
Y[i,1] = 1
elif label == 2:
Y[i,2] = 1
i += 1
slice_size = math.floor(m/5)
X_test = X[-slice_size:, :]
Y_test = Y[-slice_size:]
X_train = X[:slice_size, :]
Y_train = Y[:slice_size]
learning_rate = 0.00001
num_steps = 200
batch_size = 100
display_step = 2
n_nodes_hl1 = 5
n_nodes_hl2 = 5
n_nodes_hl3 = 5
n_classes = 3
n_inputs = 16
training_epochs = 500
x = tf.placeholder('float32', [None,n])
y = tf.placeholder('float32', [None, n_classes])
weights = {
'h1': tf.Variable(tf.random_normal([n_inputs, n_nodes_hl1])),
'h2': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
'h3': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
'out': tf.Variable(tf.random_normal([n_nodes_hl1, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_nodes_hl1])),
'b2': tf.Variable(tf.random_normal([n_nodes_hl2])),
'b3': tf.Variable(tf.random_normal([n_nodes_hl3])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
def neural_network(data):
    """Three hidden layers with ReLU, followed by a linear output layer.

    Reads the module-level ``weights`` and ``biases`` dictionaries and
    returns the un-normalised class scores (logits) for ``data``.
    """
    # Without a non-linearity between them, stacked matmul+bias layers are
    # equivalent to a single linear layer, so each hidden layer gets a ReLU.
    layer_1 = tf.nn.relu(tf.add(tf.matmul(data, weights['h1']), biases['b1']))
    layer_2 = tf.nn.relu(tf.add(tf.matmul(layer_1, weights['h2']), biases['b2']))
    layer_3 = tf.nn.relu(tf.add(tf.matmul(layer_2, weights['h3']), biases['b3']))
    # The output stays linear: softmax is applied by the loss / prediction ops.
    output = tf.matmul(layer_3, weights['out']) + biases['out']
    return output
logits = neural_network(x)
prediction = tf.nn.softmax(logits)
loss_op =
tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)
correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y_train, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))
# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for step in range(1, num_steps+1):
x_step = np.asarray(X_train[step,:])
y_step = np.asarray(Y_train[step])
x_step = np.reshape(x_step, (1, n))
y_step = np.reshape(y_step, (1,n_classes))
sess.run(train_op, feed_dict={x:x_step , y:y_step})
if step % display_step == 0 or step == 1:
#Calculate batch loss and accuracy
loss, acc = sess.run([loss_op, accuracy], feed_dict={x: x_step,
y: y_step})
print("Step " + str(step) + ", Minibatch Loss= " +
"{:.4f}".format(loss) + ", Training Accuracy= " +
"{:.3f}".format(acc))
x_step_test = np.asarray(X_test)
y_step_test = np.asarray(Y_test)
x_step_test = np.reshape(x_step, (1, n))
y_step_test = np.reshape(y_step, (1,n_classes))
print("Optimization Finished!")
print("Testing Accuracy:",
sess.run(accuracy, feed_dict={x: x_step_test,
y: y_step_test}))
1.
x_step_test = np.asarray(X_test)
y_step_test = np.asarray(Y_test)
x_step_test = np.reshape(x_step, (1, n))
y_step_test = np.reshape(y_step, (1,n_classes))
Shouldn't this be:
x_step_test = np.asarray(X_test)
y_step_test = np.asarray(Y_test)
x_step_test = np.reshape(x_step_test, (1, n))
y_step_test = np.reshape(y_step_test, (1,n_classes))
Also check how you are taking the batches; there might be some problem.
Use train_test_split from sklearn.model_selection; it splits your data into train and test sets after shuffling. Not shuffling your data might create a problem if your data has some pattern, e.g. you have 99 data points where the first 33 are dogs, the next 33 are cats and the last 33 are mice: your neural net will train only on the 66 dog and cat images and won't learn to recognise a mouse.
Increase the learning rate; AdamOptimizer already decays the lr, so use something like 0.1 or 0.01.
I guess tensorflow part is correct.

TensorFlow multiple values for loss

I'm working through this RNN tutorial to get a general idea of how to write an RNN using the lower level TensorFlow API. While I've gotten everything to work, I am getting different values for my total_loss depending on how I evaluate it within the session.
What is the difference in how the losses below are calculated? Why does running the train step together with other nodes in the graph (i.e. in the same run statement) result in different loss values than running the train step and the other nodes separately (i.e. in different run statements)?
Here is the graph:
X = tf.placeholder(tf.int32, [batch_size, num_steps], name = 'X')
Y = tf.placeholder(tf.int32, [batch_size, num_steps], name = 'Y')
initial_state = tf.zeros([batch_size, state_size])
X_one_hot = tf.one_hot(X, num_classes)
rnn_inputs = tf.unstack(X_one_hot, axis = 1)
Y_one_hot = tf.one_hot(Y, num_classes)
Y_one_hot_list = tf.unstack(Y_one_hot, axis = 1)
with tf.variable_scope('RNN_cell'):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer = tf.constant_initializer(0.0))
tf.summary.histogram('RNN_cell/weights', W)
# define the RNN cell
def RNNCell(rnn_input, state, activation = tf.tanh):
    """Apply one step of a basic RNN cell.

    Re-opens the 'RNN_cell' variable scope with reuse enabled to fetch the
    shared weight 'W' and bias 'b', then returns
    ``activation([rnn_input, state] @ W + b)``.
    """
    with tf.variable_scope('RNN_cell', reuse = True):
        cell_weights = tf.get_variable('W', [num_classes + state_size, state_size])
        cell_bias = tf.get_variable('b', [state_size], initializer = tf.constant_initializer(0))
    # Input and previous state are joined along the feature axis so that one
    # matmul covers both.
    joined = tf.concat([rnn_input, state], axis = 1)
    return activation(tf.matmul(joined, cell_weights) + cell_bias)
# add RNN cells to the computational graph
state = initial_state
rnn_outputs = []
for rnn_input in rnn_inputs:
state = RNNCell(rnn_input, state, tf.tanh)
rnn_outputs.append(state)
final_state = rnn_outputs[-1]
# set up the softmax output layer
with tf.variable_scope('softmax_output'):
W = tf.get_variable('W', [state_size, num_classes])
b = tf.get_variable('b', [num_classes], initializer = tf.constant_initializer(0.0))
tf.summary.histogram('softmax_output/weights', W)
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
probabilties = [tf.nn.softmax(logit) for logit in logits]
predictions = [tf.argmax(logit, 1) for logit in logits]
# set up loss function
losses = [tf.nn.softmax_cross_entropy_with_logits(labels = label, logits = logit) for
logit, label in zip(logits, Y_one_hot_list)]
total_loss = tf.reduce_mean(losses)
# set up the optimizer
train_step = tf.train.AdamOptimizer(learning_rate).minimize(total_loss)
tf.summary.scalar('loss', total_loss)
This version of the session evaluates the training loss, takes a train_step, and then evaluates the loss again.
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
train_writer = tf.summary.FileWriter( './RNN_Tutorial/temp1', sess.graph)
summary = tf.summary.merge_all()
for index, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_state = np.zeros((batch_size, state_size))
for step, (x, y) in enumerate(epoch):
training_loss1 = sess.run(total_loss, feed_dict = {X: x, Y: y, initial_state: training_state})
sess.run(train_step, feed_dict = {X: x, Y: y, initial_state: training_state})
training_loss2 = sess.run(total_loss, feed_dict = {X: x, Y: y, initial_state: training_state})
if step % 1 == 0:
train_writer.add_summary(summary_str, global_step = step)
print(step, training_loss1, training_loss2)
The output looks like the model is not really learning. Here is the (partial) output, which doesn't really change through all 1000 iterations. It just sticks around 0.65 - 0.7
0 0.6757775 0.66556937
1 0.6581067 0.6867344
2 0.70850086 0.66878074
3 0.67115635 0.68184483
4 0.67868954 0.6858209
5 0.6853568 0.66989964
6 0.672376 0.6554015
7 0.66563135 0.6655373
8 0.660332 0.6666234
9 0.6514224 0.6536864
10 0.65912485 0.6518013
And here is the session when I run total_loss, losses, and final_state with the train_step:
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
train_writer = tf.summary.FileWriter( './RNN_Tutorial/temp1', sess.graph)
summary = tf.summary.merge_all()
for index, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_state = np.zeros((batch_size, state_size))
for step, (x, y) in enumerate(epoch):
training_loss1 = sess.run(total_loss, feed_dict = {X: x, Y: y, initial_state: training_state})
tr_losses, training_loss_, training_state, _, summary_str = \
sess.run([losses,
total_loss,
final_state,
train_step,
summary], feed_dict={X:x, Y:y, initial_state:training_state})
training_loss2 = sess.run(total_loss, feed_dict = {X: x, Y: y, initial_state: training_state})
if step % 1 == 0:
train_writer.add_summary(summary_str, global_step = step)
print(step, training_loss1, training_loss_, training_loss2)
In this output, however, the total_loss calculated before the train step and the total loss calculated with train step have a steady decline and then plateau around 0.53 while the loss calculated after the train step (training_loss2) still fluctuates around 0.65 - 0.7 in the same way the first session did. Below is another partial output:
900 0.50464576 0.50464576 0.6973026
901 0.51603603 0.51603603 0.7115394
902 0.5465342 0.5465342 0.74994177
903 0.50591564 0.50591564 0.69172275
904 0.54837495 0.54837495 0.7333309
905 0.51697487 0.51697487 0.674438
906 0.5259896 0.5259896 0.70118546
907 0.5242365 0.5242365 0.71549624
908 0.50699174 0.50699174 0.7007787
909 0.5292892 0.5292892 0.7045353
910 0.49432433 0.49432433 0.73515224
I would think that the training loss would be the same for both versions of the session block. Why does using sess.run(total_loss, ...) then sess.run(train_step, ...) alone (i.e. in the first version) result in different loss values than when using sess.run([losses, total_loss, final_state, train_step], ...)?
Figured it out. Running the session without fetching and updating training_state = final_state within the second for loop was the issue. Without that, the model doesn't learn the longer dependencies built into the generated data.

Some questions about the function run_epoch in ptb_word_lm.py of the TensorFlow RNN tutorial

TensorFlow RNN PTB language model tutorial: https://github.com/tensorflow/models/tree/master/tutorials/rnn/ptb
I have two questions about the run_epoch function in ptb_word_lm.py (CPU device only):
for step in range(model.input.epoch_size):
feed_dict = {}
for i, (c, h) in enumerate(model.initial_state):
feed_dict[c] = state[i].c
feed_dict[h] = state[i].h
vals = session.run(fetches, feed_dict)
cost = vals["cost"]
state = vals["final_state"]
Question 1: why do we need to create a feed_dict for the session here? I think the initial state for the LSTM network has already been created in the class PTBModel:
cell = tf.contrib.rnn.MultiRNNCell(
[cell for _ in range(config.num_layers)], state_is_tuple=True)
self._initial_state = cell.zero_state(config.batch_size, data_type())
state = self._initial_state
outputs = []
with tf.variable_scope("RNN"):
for time_step in range(self.num_steps):
if time_step > 0: tf.get_variable_scope().reuse_variables()
(cell_output, state) = cell(inputs[:, time_step, :], state)
outputs.append(cell_output)
Question 2: why can session.run(fetches, feed_dict) return values here? When I try this in a test code, it returns None:
import tensorflow as tf
# Model parameters
W = tf.Variable([.3], dtype=tf.float32)
b = tf.Variable([-.3], dtype=tf.float32)
# Model input and output
x = tf.placeholder(tf.float32)
linear_model = W * x + b
y = tf.placeholder(tf.float32)
# loss
loss = tf.reduce_sum(tf.square(linear_model - y)) # sum of the squares
# optimizer
optimizer = tf.train.GradientDescentOptimizer(0.01)
# train = optimizer.minimize(loss)
# training data
x_train = [1, 2, 3, 4]
y_train = [0, -1, -2, -3]
# training loop
init = tf.global_variables_initializer()
tvars = tf.trainable_variables()
grads = tf.gradients(loss, tvars)
train = optimizer.apply_gradients(
zip(grads, tvars))
sess = tf.Session()
sess.run(init) # reset values to wrong
for i in range(100):
print sess.run(train, {x: x_train, y: y_train})
it just print
None
None
None
..
..
thank you!

Super high cost Tensorflow

I'm trying to make some price prediction on a kaggle dataset with Tensorflow.
My neural network is learning, but my cost function is really high and my predictions are far from the real output.
I tried to change my network by adding or removing some layers, neurons and activation functions.
I tried a lot of hyper-parameter settings, but that doesn't change things much.
I don't think the problem comes from my data; I checked on Kaggle and it's the dataset that most people use.
If you have any idea why my cost is so high and how to reduce it, and if you could explain it to me, that would be really great!
Here's my code:
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
import tensorflow as tf
from sklearn.utils import shuffle
df = pd.read_csv(r"C:\Users\User\Documents\TENSORFLOW\Prediction prix\train2.csv", sep=';')
df.head()
df = df.loc[:, ['OverallQual', 'GrLivArea', 'GarageCars', 'TotalBsmtSF', 'FullBath', 'SalePrice']]
df = df.replace(np.nan, 0)
df
%matplotlib inline
plt = sns.pairplot(df)
plt
df = shuffle(df)
df_train = df[0:1000]
df_test = df[1001:1451]
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
# Parameters
learning_rate = 0.01
training_epochs = 1000
batch_size = 500
display_step = 50
n_samples = inputX.shape[0]
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Add a fully connected layer and return its output tensor.

    Creates a weight matrix of shape [in_size, out_size] drawn from a normal
    distribution (stddev 0.1) and a bias row initialised to 0.1, then applies
    ``activation_function`` to ``inputs @ W + b`` when one is given.
    """
    kernel = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.1))
    offset = tf.Variable(tf.zeros([1, out_size]) + 0.1)
    pre_activation = tf.matmul(inputs, kernel) + offset
    # A layer without an activation is purely linear.
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_sum(tf.pow(pred-y, 2))/(2*n_samples)
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
with tf.Session() as sess:
sess.run(init)
# Training cycle
for epoch in range(training_epochs):
avg_cost = 0.
total_batch = batch_size
# Loop over all batches
for i in range(total_batch):
# Run optimization op (backprop) and cost op (to get loss value)
_, c = sess.run([optimizer, cost], feed_dict={x: inputX,
y: inputY})
# Compute average loss
avg_cost += c / total_batch
# Display logs per epoch step
if epoch % display_step == 0:
print("Epoch:", '%04d' % (epoch+1), "cost=", \
"{:.9f}".format(avg_cost))
print("Optimization Finished!")
# Test model
correct_prediction = tf.equal(pred,y)
# Calculate accuracy
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))
print("Accuracy:", accuracy.eval({x: inputX, y: inputY}))
print(sess.run(pred, feed_dict={x: inputX_test}))
Epoch: 0001 cost= 10142407502702304395526144.000000000
Epoch: 0051 cost= 3256106752.000019550
Epoch: 0101 cost= 3256106752.000019550
Epoch: 0151 cost= 3256106752.000019550
Epoch: 0201 cost= 3256106752.000019550
...
Thanks for your help !
I see a couple of problems with the implementation:
Inputs are not scaled.
Use sklearn StandardScaler to scale the inputs inputX, inputY (and also inputX_test and inputY_test) so that they have zero mean and unit variance. You can use inverse_transform to convert the outputs back to the proper scale again.
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputX_test = sc.transform(inputX_test)
The batch_size is too large, you are passing the entire set as a single batch. This should not cause the particular problem you are facing, but for better convergence try with reduced batch size. Implement a get_batch() generator function and do the following:
for batch_X, batch_Y in get_batch(input_X, input_Y, batch_size):
_, c = sess.run([optimizer, cost], feed_dict={x: batch_X,
y: batch_Y})
Try smaller Weights initialization (stddev) if you still see issues.
WORKING CODE BELOW:
inputX = df_train.drop('SalePrice', 1).as_matrix()
inputX = inputX.astype(int)
sc = StandardScaler().fit(inputX)
inputX = sc.transform(inputX)
inputY = df_train.loc[:, ['SalePrice']].as_matrix()
inputY = inputY.astype(int)
sc1 = StandardScaler().fit(inputY)
inputY = sc1.transform(inputY)
inputX_test = df_test.drop('SalePrice', 1).as_matrix()
inputX_test = inputX_test.astype(int)
inputX_test = sc.transform(inputX_test)
inputY_test = df_test.loc[:, ['SalePrice']].as_matrix()
inputY_test = inputY_test.astype(int)
inputY_test = sc1.transform(inputY_test)
learning_rate = 0.01
training_epochs = 1000
batch_size = 50
display_step = 50
n_samples = inputX.shape[0]
x = tf.placeholder(tf.float32, [None, 5])
y = tf.placeholder(tf.float32, [None, 1])
def get_batch(inputX, inputY, batch_size):
    """Yield consecutive ``(X, Y)`` mini-batches covering the whole input.

    Batches are taken in order, ``batch_size`` samples at a time. The last
    batch is shorter when ``len(inputX)`` is not a multiple of
    ``batch_size``, so no trailing samples are dropped. An input shorter
    than one batch is yielded as a single batch.

    Raises:
        ValueError: if ``batch_size`` is not positive.
    """
    if batch_size <= 0:
        raise ValueError("batch_size must be a positive integer")
    for start in range(0, len(inputX), batch_size):
        stop = start + batch_size
        yield inputX[start:stop], inputY[start:stop]
def add_layer(inputs, in_size, out_size, activation_function=None):
    """Add a fully connected layer and return its output tensor.

    Creates a weight matrix of shape [in_size, out_size] drawn from a normal
    distribution (stddev 0.005) and a zero-initialised bias row, then applies
    ``activation_function`` to ``inputs @ W + b`` when one is given.
    """
    kernel = tf.Variable(tf.random_normal([in_size, out_size], stddev=0.005))
    offset = tf.Variable(tf.zeros([1, out_size]))
    pre_activation = tf.matmul(inputs, kernel) + offset
    # A layer without an activation is purely linear.
    if activation_function is not None:
        return activation_function(pre_activation)
    return pre_activation
l1 = add_layer(x, 5, 3, activation_function=tf.nn.relu)
pred = add_layer(l1, 3, 1)
# Mean squared error
cost = tf.reduce_mean(tf.pow(tf.subtract(pred, y), 2))
# Gradient descent
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
# Initializing the variables
init = tf.global_variables_initializer()
# Launch the graph
# Launch the graph and train with mini-batches.
with tf.Session() as sess:
    sess.run(init)
    # Training cycle
    for epoch in range(training_epochs):
        epoch_cost = 0.
        n_batches = 0
        # Loop over all batches
        for batch_x, batch_y in get_batch(inputX, inputY, batch_size):
            # Run optimization op (backprop) and cost op (to get loss value)
            _, c = sess.run([optimizer, cost], feed_dict={x: batch_x, y: batch_y})
            epoch_cost += c
            n_batches += 1
        # Average over the number of batches actually processed. The loss
        # used to be divided by batch_size, which is a sample count, not a
        # batch count.
        avg_cost = epoch_cost / max(n_batches, 1)
        # Display logs per epoch step
        if epoch % display_step == 0:
            print("Epoch:", '%04d' % (epoch+1), "cost=", "{:.9f} ".format(avg_cost))
    print("Optimization Finished!")
I have had a similar problem before: a very high cost reached after a few training steps, with the cost then remaining constant. For me it was a kind of overflow, with gradients that were too big and created NaN values quite early in training. I solved it by starting with a smaller learning rate (potentially much smaller) until the cost and gradients became more reasonable (a few dozen steps), and then going back to a regular one (higher at the start, potentially decaying).
See my answer to this post for a similar case that was solved just by taking a smaller learning rate on start.
You can also clip your gradients to avoid this problem, using tf.clip_by_value. It sets a minimum and maximum value for your gradients, which prevents huge ones from sending your weights straight to NaN after the first few iterations. To use it (with min and max at -1 and 1, which is probably too tight), replace
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
by
opt= tf.train.GradientDescentOptimizer(learning_rate)
gvs = opt.compute_gradients(cost)
capped_gvs = [(tf.clip_by_value(grad, -1., 1.), var) for grad, var in gvs]
optimizer = opt.apply_gradients(capped_gvs)

Categories

Resources