I am building several simple networks to predict bike rentals at 500 stations in the upcoming hour, given the rentals at all stations over the previous 24 hours. I am working with two architectures: one with a graph convolution (which amounts to updating each station with a learned linear combination of the other stations, at each hour) followed by an FNN layer for the prediction, and a second with a graph convolution -> LSTM -> FNN for the prediction.
Before I describe further: I am getting poorer performance from the model that includes the LSTM unit, which confuses me.
See these two images for a description of each architecture. For each architecture I also add hourly metadata (weather, time, etc.) as a variation; this is shown in red in the images and is not relevant to my question. Image links are at the bottom of the post.
[Architecture 1: GCNN + FNN][1]
[Architecture 2: GCNN + LSTM + FNN][2]
Confusingly, the test RMSE for the first model is 3.46; for the second model it is 3.57. Could someone please explain why the second wouldn't be lower, since it appears to run the exact same process, except with an additional LSTM unit?
Here are relevant snippets of my code for the GCNN+FNN model:
def gcnn_ddgf(hidden_layer, node_num, feature_in, horizon, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training, Y_training, X_val, Y_val, X_test, Y_test, scaler, display_step):
    n_output_vec = node_num * horizon  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    best_val = 10000
    traing_error = 0
    test_error = 0
    pred_Y = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # Y is the regression output

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    vec_length = feature_in
    weights_hidden['h1'] = tf.Variable(tf.random_normal([vec_length, hidden_layer], stddev=0.5))
    biases['b1'] = tf.Variable(tf.random_normal([1, hidden_layer], stddev=0.5))
    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([hidden_layer, horizon], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, horizon], stddev=0.5))

    # Construct model
    pred = gcn(X, weights_hidden, weights_A, biases, node_num, horizon)  # see below
    pred = scaler.inverse_transform(pred)
    Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y_original, 2)))
    # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train / batch_size)

            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  # / total_batch

            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)

            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})

            if c_val < best_val:
                # testing
                c_tes, preds, Y_true = sess.run([cost, pred, Y_original], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                test_error = c_tes
                traing_error = avg_cost
                pred_Y = preds
                early_stop_k = 0  # reset to 0

            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1

            # threshold
            if early_stop_k == early_stop_th:
                break

            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The test RMSE is ", test_error)

    return best_val, pred_Y, Y_true, test_error
# code that creates the model
def gcn(signal_in, weights_hidden, weights_A, biases, node_num, horizon):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5 * (weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [-1, int(feature_len)])  # node_num * batch, feature_in

    signal_output = tf.add(tf.matmul(Z, weights_hidden['h1']), biases['b1'])
    signal_output = tf.nn.relu(signal_output)  # node_num * batch, hidden_vec

    final_output = tf.add(tf.matmul(signal_output, weights_hidden['out']), biases['bout'])  # node_num * batch, horizon
    # final_output = tf.nn.relu(final_output)
    final_output = tf.reshape(final_output, [node_num, -1, horizon])  # node_num, batch, horizon
    final_output = tf.transpose(final_output, [1, 0, 2])  # batch, node_num, horizon
    final_output = tf.reshape(final_output, [-1, node_num*horizon])  # batch, node_num*horizon
    return final_output
And the code for the GCNN+LSTM+FNN model:
def gcnn_ddgf_lstm(node_num, feature_in, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training,
                   Y_training, X_val, Y_val, X_test, Y_test, scaler, lstm_layer):
    n_output_vec = node_num  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    display_step = 1  # frequency of printing results
    best_val = 10000
    traing_error = 0
    test_error = 0
    predic_res = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # Y is the regression output

    lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_layer, state_is_tuple=True)

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['h1'] = tf.Variable(tf.random_normal([lstm_layer, node_num], stddev=0.5))
    biases['h1'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))

    # Construct model
    pred = gcn_lstm(X, weights_hidden, weights_A, biases, node_num, lstm_cell)
    # pred = scaler.inverse_transform(pred)
    # Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y, 2)))
    # optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train / batch_size)

            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  # / total_batch

            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)

            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})

            if c_val < best_val:
                c_tes, preds = sess.run([cost, pred], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                # save model
                # saver.save(sess, './bikesharing_gcnn_ddgf')
                test_error = c_tes
                traing_error = avg_cost
                early_stop_k = 0  # reset to 0

            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1

            # threshold
            if early_stop_k == early_stop_th:
                pred_Y = scaler.inverse_transform(preds)
                Y_true = scaler.inverse_transform(Y_test)
                test_err = tf.sqrt(tf.reduce_mean(tf.pow(pred_Y - Y_true, 2)))
                break

            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The scaled test RMSE is ", test_error)

    return pred_Y, Y_true
def gcn_lstm(signal_in, weights_hidden, weights_A, biases, node_num, lstm_cell):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5 * (weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [node_num, -1, int(feature_len)])  # node_num, batch, feature_in
    Z = tf.transpose(Z, [1, 2, 0])  # batch, feature_in, node_num

    # init_state = cell.zero_state(batch_size, tf.float32)
    _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype=tf.float32)  # init_state?

    dense_output = tf.add(tf.matmul(Z[1], weights_hidden['h1']), biases['h1'])
    dense_output = tf.nn.relu(dense_output)
    final_output = tf.add(tf.matmul(dense_output, weights_hidden['out']), biases['bout'])  # batch, node_num*horizon
    return final_output
In particular, should I be wary that _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype=tf.float32) causes my variables defined elsewhere not to train?
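One sanity check I could run (a minimal sketch, assuming it sits inside gcnn_ddgf_lstm after cost is defined; the loop and the printed messages are mine) is to ask TensorFlow for the gradients of the cost with respect to the earlier variables and confirm none of them come back as None:
check_vars = [weights_A['A1'], weights_hidden['h1'], weights_hidden['out']]
grads = tf.gradients(cost, check_vars)  # None for any variable the cost does not depend on
for v, g in zip(check_vars, grads):
    print(v.name, 'NO gradient' if g is None else 'receives a gradient')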
Thanks a lot for any help :)
[1]: https://i.stack.imgur.com/MAO2t.png
[2]: https://i.stack.imgur.com/UDjHw.png
I resolved this.
I have three years of bike-use data for the prediction, and was using roughly the last three months as my validation/test sets. Those last few months were winter, with lower bike use. I got the expected result (GCNN+LSTM outperforms GCNN, though not by much) when I shuffled my data prior to allocating it to the sets (with the 24-hour sequences preserved for the LSTM).
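For reference, a minimal sketch of what I mean by shuffling before allocating to sets; the array names are made up, and each row already holds a complete 24-hour input window, so shuffling rows does not break the sequences the LSTM sees:
import numpy as np

# X_windows: (num_samples, node_num, 24) input windows; y_next: (num_samples, node_num) next-hour rentals
rng = np.random.default_rng(42)
idx = rng.permutation(X_windows.shape[0])  # shuffle whole windows so every season reaches every set
X_shuf, y_shuf = X_windows[idx], y_next[idx]

n = X_shuf.shape[0]
n_train, n_val = int(0.8 * n), int(0.9 * n)
X_train, y_train = X_shuf[:n_train], y_shuf[:n_train]
X_val, y_val = X_shuf[n_train:n_val], y_shuf[n_train:n_val]
X_test, y_test = X_shuf[n_val:], y_shuf[n_val:]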
I created a model class that is a subclass of keras.Model. While training the model, I want to change the weights of the loss functions after some epochs. To do that, I added boolean variables to my model indicating that it should start training with the additional loss function. The pseudocode below mainly shows what I am trying to achieve.
class MyModel(keras.Model):

    def __init__(self):
        super().__init__()
        self.start_loss_2 = False

    def train_step(self, data):
        # Check whether training with loss_2 has started
        weight_loss_2 = 0.0
        if self.start_loss_2:
            weight_loss_2 = 0.5

        # Pass the data through the model
        # Calculate the two loss values
        total_loss = loss_1 + weight_loss_2 * loss_2
        # Calculate gradients with tf.GradientTape
        # Update variables

    # This is called via a Callback after each epoch
    def epoch_finished(self, epoch_num):
        if epoch_num > START_LOSS_2:
            self.start_loss_2 = True
My question is:
Is it valid to use an if-else statement whose condition changes after some time? If it is not, how can I achieve this?
Yes. You can create a tf.Variable and then assign a new value to it based on some training criteria.
Example:
import numpy as np
import tensorflow as tf

# simple toy network
x_in = tf.keras.Input((10))
x = tf.keras.layers.Dense(25)(x_in)
x_out = tf.keras.layers.Dense(1)(x)

# model
m = tf.keras.Model(x_in, x_out)

# fake data
X = tf.random.normal((100, 10))
y0 = tf.random.normal((100, ))
y1 = tf.random.normal((100, ))

# optimizer
m_opt = tf.keras.optimizers.Adam(1e-2)

# prep data
ds = tf.data.Dataset.from_tensor_slices((X, y0, y1))
ds = ds.repeat().batch(5)
train_iter = iter(ds)

# toy loss function that uses a weight
def loss_fn(y_true0, y_true1, y_pred, weight):
    mse = tf.keras.losses.MSE
    mse_0 = tf.math.reduce_mean(mse(y_true0, y_pred))
    mse_1 = tf.math.reduce_mean(mse(y_true1, y_pred))
    return mse_0 + weight * mse_1

NUM_EPOCHS = 4
NUM_BATCHES_PER_EPOCH = 10
START_NEW_LOSS_AT_GLOBAL_STEP = 20

# the weight variable set to 0 initially and then
# will be changed after a certain number of steps
# (or some other training criteria)
w = tf.Variable(0.0, trainable=False)

for epoch in range(NUM_EPOCHS):
    losses = []
    for batch in range(NUM_BATCHES_PER_EPOCH):
        X_train, y0_train, y1_train = next(train_iter)
        with tf.GradientTape() as tape:
            y_hat = m(X_train)
            loss = loss_fn(y0_train, y1_train, y_hat, w)
        losses.append(loss)
        m_vars = m.trainable_variables
        m_grads = tape.gradient(loss, m_vars)
        m_opt.apply_gradients(zip(m_grads, m_vars))

    print(f"epoch: {epoch}\tloss: {np.mean(losses):.4f}")
    losses = []

    # if the criteria is met assign a huge number to see if the
    # loss spikes up
    if (epoch + 1) * (batch + 1) >= START_NEW_LOSS_AT_GLOBAL_STEP:
        w.assign(10000.0)

# epoch: 0  loss: 1.8226
# epoch: 1  loss: 1.1143
# epoch: 2  loss: 8788.2227  <= looks like assign worked
# epoch: 3  loss: 10999.5449
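The same idea should carry over to the keras.Model subclass from the question: keep the weight as a non-trainable tf.Variable on the model, read it inside train_step, and assign to it from a callback. A rough sketch under those assumptions (the layer, the stacked targets, and the callback name are made up; this is not code from the question):
import tensorflow as tf

class MyModel(tf.keras.Model):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
        self.dense = tf.keras.layers.Dense(1)
        # weight of the second loss term; non-trainable, assigned from a callback
        self.weight_loss_2 = tf.Variable(0.0, trainable=False)

    def call(self, inputs):
        return self.dense(inputs)

    def train_step(self, data):
        x, y = data                         # y stacks both targets: shape (batch, 2)
        y1, y2 = y[:, 0:1], y[:, 1:2]
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss_1 = tf.reduce_mean(tf.keras.losses.MSE(y1, y_pred))
            loss_2 = tf.reduce_mean(tf.keras.losses.MSE(y2, y_pred))
            total_loss = loss_1 + self.weight_loss_2 * loss_2
        grads = tape.gradient(total_loss, self.trainable_variables)
        self.optimizer.apply_gradients(zip(grads, self.trainable_variables))
        return {"loss": total_loss}

class StartLoss2(tf.keras.callbacks.Callback):
    def __init__(self, start_epoch):
        super().__init__()
        self.start_epoch = start_epoch

    def on_epoch_end(self, epoch, logs=None):
        if epoch + 1 > self.start_epoch:
            self.model.weight_loss_2.assign(0.5)

# usage sketch
model = MyModel()
model.compile(optimizer="adam")
X = tf.random.normal((64, 10))
Y = tf.random.normal((64, 2))               # column 0 -> target for loss_1, column 1 -> target for loss_2
model.fit(X, Y, epochs=5, callbacks=[StartLoss2(start_epoch=2)])
Because the weight is a variable rather than a plain Python float, the callback's assignment is picked up even after train_step has been traced by tf.function.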
I'm trying to implement linear regression with the RMSProp optimizer from scratch.
Code:
EPOCHS = 100

w3 = tf.Variable(w_vector, dtype=tf.float32)
w4 = tf.Variable(0, dtype=tf.float32)

lr = 1e-5
beta = 0.9
epilson = 1e-7
momentum = 0.0

for epoch in range(1, EPOCHS + 1):
    mom_w = 0
    mom_b = 0
    mean_square_w = 0
    mean_gradient_w = 0
    mean_square_b = 0
    mean_gradient_b = 0

    y_pred1 = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    dw3, dw4 = gradients_mse(x, y, y_pred1)

    # My equations for RMS prop
    mean_square_w = beta * mean_square_w + (1 - beta) * dw3 ** 2
    mean_gradient_w = beta * mean_gradient_w + (1 - beta) * dw3
    mom_w = momentum * mom_w + lr * (dw3 / (tf.sqrt(mean_square_w + epilson - mean_gradient_w ** 2)))

    mean_square_b = beta * mean_square_b + (1 - beta) * dw4 ** 2
    mean_gradient_b = beta * mean_gradient_b + (1 - beta) * dw4
    mom_b = momentum * mom_b + lr * (dw4 / (tf.sqrt(mean_square_b + epilson - mean_gradient_b ** 2)))

    w3.assign_sub(mom_w)
    w4.assign_sub(mom_b)

print('w3 : {}'.format(w3.numpy()))
print('w4 : {}'.format(w4.numpy()))
Output:
w3 : [[-1.2507935]]
w4 : 0.0033333366736769676
Now I create a neural network with a single layer and a single neuron and no activation function, assign the same initial weights to its neuron, and use RMSProp as the optimizer. I get different final weights. However, this was not the case with the SGD optimizer.
Code:
# using keras to get same results
def create_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units=1, name='d1', input_shape=(x.shape[1],)))
    model.compile(optimizer=tf.keras.optimizers.RMSprop(
        learning_rate=1e-5, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False),
        loss="mse")
    return model

model = create_model()
d1 = model.get_layer('d1')
d1_weights = [tf.constant(w_vector, dtype=tf.float32), tf.constant(np.array([0]), dtype=tf.float32)]
d1.set_weights(d1_weights)
model.fit(x, y, epochs=100)

d1 = model.get_layer('d1')
print('w3 = {}'.format(d1.weights[0].numpy()))
print('w4 = {}'.format(d1.weights[1].numpy()[0]))
Output:
w3 = [[-1.2530397]]
w4 = 0.0010913893347606063
My gradients are calculated correctly for the MSE loss function. I have cross-checked them with TensorFlow's built-in gradient computation, tf.GradientTape.
Code:
# Computing gradients
def gradients_mse(X, Y, Y_PREDS):
DW1 = tf.matmul(X, tf.reshape(Y-Y_PREDS, (X.shape[0],1)), transpose_a = True) * (-2/X.shape[0])
DW0 = (-2 / X.shape[0]) * tf.reduce_sum(Y - Y_PREDS)
return DW1, DW0
The only thing I think can go wrong in this implementation is the calculation of mom_w and mom_b using incorrect equations.
x.shape = [10, 1]
The default batch size is 32, which is larger than my 10 samples, so it should have no effect on the weight updates. The same code gives perfectly matching output when I use simple gradient descent instead of RMSProp.
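For reference, my understanding of the plain (centered=False) update that the Keras optimizer above is configured to use, following the documented ApplyRMSProp op: there is no mean_gradient term in the denominator, and the mean_square accumulator persists across steps instead of being reset. A sketch for w3 only (w4 would be handled analogously); this is my reading of the algorithm, not verified against the Keras source:
mean_square_w = tf.Variable(tf.zeros_like(w3), trainable=False)  # lives outside the loop, never reset

for epoch in range(1, EPOCHS + 1):
    y_pred1 = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    dw3, _ = gradients_mse(x, y, y_pred1)
    mean_square_w.assign(beta * mean_square_w + (1 - beta) * dw3 ** 2)
    w3.assign_sub(lr * dw3 / tf.sqrt(mean_square_w + epilson))  # no mean_gradient subtraction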
I am able to perform classification with this code. It outputs the probability for each output label, but I need to convert this so that it predicts values; that is, I want to add a regression layer at the end instead of softmax. How can I achieve this? Let's say, for example, I trained the model on labels 1, 2, 3, 4, 5, but I want the model to predict values beyond those 5 labels. Given the input, the model might predict 1.3 or 2.5, etc. I want a continuous output rather than a discrete output.
Update
I am trying to achieve a solution suggested in this question:
Here
Let's say I have training data. I train the model on whole-number temperatures like 1, 2, 3, 4, 5 degrees; basically, those output temperatures are the labels. How can I predict values that lie between two temperatures, like 2.5 degrees? It is not possible to train on every possible temperature value. How can I achieve this?
My model gives the probability of each predicted class:
Temp    Probability
   1        .01
   2        .05
   3        .56
   4        .24
   5        .14
I want my model to predict the temperature values like 1.2, 2.7, etc. instead of predicting the probability of each class.
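Just to illustrate with the numbers above (this is only one way to turn the current probabilistic output into a continuous value, not necessarily the right approach): the probability-weighted average of the class temperatures already gives a value between the labels.
import numpy as np

temps = np.array([1, 2, 3, 4, 5], dtype=np.float32)
probs = np.array([0.01, 0.05, 0.56, 0.24, 0.14], dtype=np.float32)
expected_temp = float(np.dot(temps, probs))  # 1*.01 + 2*.05 + 3*.56 + 4*.24 + 5*.14
print(expected_temp)                         # ~3.45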
input_height = 1      # 1-dimensional convolution
input_width = 90      # window
num_labels = 5        # output labels
num_channels = 8      # input columns
batch_size = 10
kernel_size = 60
depth = 60
num_hidden = 1000

learning_rate = 0.0001
training_epochs = 8

total_batches = train_x.shape[0] # batch_size

X = tf.placeholder(tf.float32, shape=[None, input_height, input_width, num_channels], name="input")
# X = tf.placeholder(tf.float32, shape=[None, input_width * num_channels], name="input")
# X_reshaped = tf.reshape(X, [-1, 1, 90, 3])
Y = tf.placeholder(tf.float32, shape=[None, num_labels])

c = apply_depthwise_conv(X, kernel_size, num_channels, depth)
p = apply_max_pool(c, 20, 2)
c = apply_depthwise_conv(p, 6, depth * num_channels, depth // 10)

shape = c.get_shape().as_list()
c_flat = tf.reshape(c, [-1, shape[1] * shape[2] * shape[3]])

f_weights_l1 = weight_variable([shape[1] * shape[2] * depth * num_channels * (depth // 10), num_hidden])
f_biases_l1 = bias_variable([num_hidden])
f = tf.nn.tanh(tf.add(tf.matmul(c_flat, f_weights_l1), f_biases_l1))

out_weights = weight_variable([num_hidden, num_labels])
out_biases = bias_variable([num_labels])
y_ = tf.nn.softmax(tf.matmul(f, out_weights) + out_biases, name="y_")

loss = -tf.reduce_sum(Y * tf.log(y_))
optimizer = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

correct_prediction = tf.equal(tf.argmax(y_, 1), tf.argmax(Y, 1))  # difference between correct output and expected output
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

cost_history = np.empty(shape=[1], dtype=float)

with tf.Session() as session:
    tf.global_variables_initializer().run()
    for epoch in range(training_epochs):
        for b in range(total_batches):
            offset = (b * batch_size) % (train_y.shape[0] - batch_size)
            batch_x = train_x[offset:(offset + batch_size), :, :, :]
            batch_y = train_y[offset:(offset + batch_size), :]
            _, c = session.run([optimizer, loss], feed_dict={X: batch_x, Y: batch_y})
            cost_history = np.append(cost_history, c)
        print("Epoch: ", epoch, " Training Loss: ", c,
              " Training Accuracy: ", session.run(accuracy, feed_dict={X: train_x, Y: train_y}))
    print("Testing Accuracy:", session.run(accuracy, feed_dict={X: test_x, Y: test_y}))
If you want to predict which class is detected, just take an argmax over the output. The class with the highest probability is the detected class.
predict = tf.argmax(y_, 1)
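If a genuinely continuous prediction is the goal, as the question asks, one common change (a sketch that reuses f, weight_variable, bias_variable, num_hidden and learning_rate from the question's code; it is not the only option) is to replace the softmax output and cross-entropy loss with a single linear output unit trained with MSE against real-valued targets:
# regression head (sketch): one linear output, MSE loss on real-valued targets such as 2.5
Y_reg = tf.placeholder(tf.float32, shape=[None, 1])
reg_weights = weight_variable([num_hidden, 1])
reg_biases = bias_variable([1])
y_value = tf.matmul(f, reg_weights) + reg_biases          # no softmax, no activation
reg_loss = tf.reduce_mean(tf.square(y_value - Y_reg))
reg_optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(reg_loss)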
Problem Summary:
The issue is that even after running this code for multiple epochs, the cost isn't reducing much (I have tried this with a variety of starting learning rates). The equation I am trying to optimize is m * pow(length, u) * pow(start_y, t) + c, where length and start_y are the inputs and u, t, m and c are learnable parameters. I observed (my dataset is quite small) that length * sqrt(start_y) is almost constant, and thought that TensorFlow would be able to help me find the values of the variables.
This is my TensorFlow code. combined_vehicles is an array with 129 rows and 2 columns (2 features); combined_labels is an array of labels corresponding to each example in combined_vehicles.
u = tf.Variable(0.0, dtype="float32")
t = tf.Variable(0.0, dtype="float32")
c = tf.Variable(0.0, dtype="float32")
m = tf.Variable(0.0, dtype="float32")

length = tf.placeholder(dtype="float32", shape=[combined_vehicles.shape[0], 1], name="length")
start_y = tf.placeholder(dtype="float32", shape=[combined_vehicles.shape[0], 1], name="start_y")
labels = tf.placeholder(dtype="float32", shape=[combined_vehicles.shape[0], 1], name="labels")

output = tf.add(tf.multiply(tf.multiply(tf.pow(length, u), tf.pow(start_y, t)), m), c)
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits=output, labels=labels))

global_step = tf.Variable(0, trainable=False, name='global_step')
start_learning_rate = 0.0001
decay_steps = 100
learning_rate = tf.train.exponential_decay(start_learning_rate, global_step, decay_steps, 0.1, staircase=True)

result_output = output > 0.5
result_label = combined_labels > 0.5
correct_prediction = tf.equal(result_output, result_label)
accuracy = tf.reduce_mean(tf.cast(correct_prediction, "float"))

optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(cost, global_step=global_step)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    epochs = 100
    sess.run(init)
    for i in range(epochs):
        _, cost_estimate = sess.run(
            [optimizer, cost],
            feed_dict={length: combined_vehicles[:, 0].reshape([combined_vehicles.shape[0], 1]),
                       start_y: combined_vehicles[:, 1].reshape([combined_vehicles.shape[0], 1]),
                       labels: combined_labels})
    total_accuracy = accuracy.eval({length: combined_vehicles[:, 0].reshape([combined_vehicles.shape[0], 1]),
                                    start_y: combined_vehicles[:, 1].reshape([combined_vehicles.shape[0], 1]),
                                    labels: combined_labels})
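A small diagnostic I might add inside the training loop (a sketch using only names from the code above) to see whether the parameters move at all while the cost stays flat:
# inside the for-loop over epochs, after the optimizer step:
if i % 10 == 0:
    u_val, t_val, m_val, c_val, lr_val = sess.run([u, t, m, c, learning_rate])
    print('epoch', i, 'cost', cost_estimate, 'u', u_val, 't', t_val, 'm', m_val, 'c', c_val, 'lr', lr_val)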