Python - TensorFlow, binary classification, always predicting 0

I am just starting out with Tensorflow, trying to create a classic neural net for binary classification.
# Loading Dependencies
import math
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow.python.framework import ops
from sklearn.model_selection import train_test_split
import matplotlib.pyplot as plt
seed = 1234
tf.set_random_seed(seed)
np.random.seed(seed)
# Load and Split data
data = pd.read_json(file)
X = data["X"]
y = data["y"]
X = X.astype(np.float32)
y = y.astype(np.float32)
X_train, X_valid, y_train, y_valid = train_test_split(X, y, test_size = 0.3)
X_train = X_train.reshape(X_train.shape[0], -1).T
y_train = y_train.values.reshape((1, y_train.shape[0]))
X_valid = X_valid.reshape(X_valid.shape[0], -1).T
y_valid = y_valid.values.reshape((1, y_valid.shape[0]))
print("X Train: ", X_train.shape)
print("y Train: ", y_train.shape)
print("X Dev: ", X_valid.shape)
print("y Dev: ", y_valid.shape)
X Train: (16875, 1122)
y Train: (1, 1122)
X Dev: (16875, 482)
y Dev: (1, 482)
The training data contains float numbers, while the labels are just 0 or 1. However, these are also converted to float because I had some issues in the past.
Initializing the parameters
def initialize_parameters(layer_dimensions):
    tf.set_random_seed(seed)
    layers_count = len(layer_dimensions)
    parameters = {}

    for layer in range(1, layers_count):
        parameters['W' + str(layer)] = tf.get_variable('W' + str(layer),
                                                       [layer_dimensions[layer], layer_dimensions[layer - 1]],
                                                       initializer = tf.contrib.layers.xavier_initializer(seed = seed))
        parameters['b' + str(layer)] = tf.get_variable('b' + str(layer),
                                                       [layer_dimensions[layer], 1],
                                                       initializer = tf.zeros_initializer())

    return parameters
Shapes are:
W1 - (50, 16875)
W2 - (25, 50)
W3 - (10, 25)
W4 - (5, 10)
W5 - (1, 5)
b1 - (50, 1)
b2 - (25, 1)
b3 - (10, 1)
b4 - (5, 1)
b5 - (1, 1)
I specify the number and dimension of each layer when calling the model (see below).
Forward Propagation
def forward_propagation(X, parameters):
    parameters_count = len(parameters) // 2
    A = X

    for layer in range(1, parameters_count):
        W = parameters['W' + str(layer)]
        b = parameters['b' + str(layer)]

        Z = tf.add(tf.matmul(W, A), b)
        A = tf.nn.relu(Z)

    W = parameters['W' + str(parameters_count)]
    b = parameters['b' + str(parameters_count)]

    Z = tf.add(tf.matmul(W, A), b)

    return Z
Compute the cost (I am using the sigmoid function since we are dealing with binary classification)
def compute_cost(Z, Y):
    logits = tf.transpose(Z)
    labels = tf.transpose(Y)

    cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(logits = logits, labels = labels))

    return cost
Putting it together
def model(X_train, y_train, X_valid, y_valid, layer_dimensions, alpha = 0.0001, epochs = 10):
    ops.reset_default_graph()
    tf.set_random_seed(seed)

    (x_rows, m) = X_train.shape
    y_rows = y_train.shape[0]
    costs = []

    X = tf.placeholder(tf.float32, shape=(x_rows, None), name="X")
    y = tf.placeholder(tf.float32, shape=(y_rows, None), name="y")

    parameters = initialize_parameters(layer_dimensions)
    Z = forward_propagation(X, parameters)
    cost = compute_cost(Z, y)

    optimizer = tf.train.AdamOptimizer(learning_rate = alpha).minimize(cost)
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(epochs):
            _ , epoch_cost = sess.run([optimizer, cost], feed_dict={X: X_train, y: y_train})

            print("Cost after epoch %i: %f" % (epoch + 1, epoch_cost))
            costs.append(epoch_cost)

        parameters = sess.run(parameters)

        correct_predictions = tf.equal(tf.argmax(Z), tf.argmax(y))
        accuracy = tf.reduce_mean(tf.cast(correct_predictions, "float"))

        print("Train Accuracy:", accuracy.eval({X: X_train, y: y_train}))
        print("Test Accuracy:", accuracy.eval({X: X_valid, y: y_valid}))

    return parameters
Now when I try to train my model, it appears to reach an optimum from the second epoch, and the cost changes very little from that point on:
parameters = model(X_train, y_train, X_valid, y_valid, [X_train.shape[0], 50, 25, 10, 5, 1])
Cost after epoch 1: 8.758244
Cost after epoch 2: 0.693096
Cost after epoch 3: 0.692992
Cost after epoch 4: 0.692737
Cost after epoch 5: 0.697333
Cost after epoch 6: 0.693062
Cost after epoch 7: 0.693151
Cost after epoch 8: 0.693152
Cost after epoch 9: 0.693152
Cost after epoch 10: 0.693155
Now for the predictions
def predict(X, parameters):
    parameters_count = len(parameters) // 2
    params = {}

    for layer in range(1, parameters_count + 1):
        params['W' + str(layer)] = tf.convert_to_tensor(parameters['W' + str(layer)])
        params['b' + str(layer)] = tf.convert_to_tensor(parameters['b' + str(layer)])

    (x_columns, x_rows) = X.shape
    X_test = tf.placeholder(tf.float32, shape=(x_columns, x_rows))

    Z = forward_propagation(X_test, params)
    p = tf.argmax(Z)

    sess = tf.Session()
    prediction = sess.run(p, feed_dict = {X_test: X})

    return prediction
However, this will predict 0 in every case:
predictions = predict(X_valid, parameters)
predictions
array([0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
0, 0, 0, 0, 0....

X Train: (16875, 1122)
You have 16875 features for each sample, but only 1122 training samples.
I think this may not be enough.
The sample code in the TensorFlow Get Started tutorial uses only 784 features.
x = tf.placeholder(tf.float32, shape=[None, 784])
y_ = tf.placeholder(tf.float32, shape=[None, 10])
The MNIST data is split into three parts: 55,000 data points of training data (mnist.train), 10,000 points of test data (mnist.test), and 5,000 points of validation data (mnist.validation). This split is very important: it's essential in machine learning that we have separate data which we don't learn from so that we can make sure that what we've learned actually generalizes!
https://www.tensorflow.org/get_started/mnist/beginners
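If you do want to try the model with far fewer inputs, a minimal sketch of a dimensionality-reduction step with scikit-learn's PCA could look like this (the 256-component count and random_state are arbitrary assumptions; the transposes keep the samples-as-columns layout used in the question):
from sklearn.decomposition import PCA

# Hypothetical preprocessing step: project the 16875 features onto 256
# principal components before training; the component count is only an example.
pca = PCA(n_components=256, random_state=0)

# scikit-learn expects (samples, features); the model above uses (features, samples).
X_train_reduced = pca.fit_transform(X_train.T).T   # shape (256, 1122)
X_valid_reduced = pca.transform(X_valid.T).T       # shape (256, 482)

# The first entry of layer_dimensions then becomes X_train_reduced.shape[0] instead of 16875.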

Related

Why am I getting precision and recall of zero in an ANFIS model using TensorFlow in Python

I have built an ANFIS model with TensorFlow for a classification problem. For every epoch I am getting precision and recall of zero. I am using a Gaussian membership function, but when I print sigma it gives 0. I used the code below for training.
# Imports assumed by this snippet
import numpy as np
import tensorflow as tf
from sklearn.metrics import precision_score, recall_score

## settings
n = X_train.shape[1]  # no of input features
m = 2*n               # number of fuzzy rules
learning_rate = 0.01
epochs = 1000

################################ train
X_train_t = tf.placeholder(tf.float32, shape=[None, n])  # Train input
y_train_t = tf.placeholder(tf.float32, shape=None)       # Train output

mu = tf.get_variable(name="mu", shape=[m * n], initializer=tf.random_normal_initializer(0, 1))        # mean of Gaussian MFS
sigma = tf.get_variable(name="sigma", shape=[m * n], initializer=tf.random_normal_initializer(0, 1))  # std_dev of Gaussian MFS
w = tf.get_variable(name="w", shape=[1, m], initializer=tf.random_normal_initializer(0, 1))

rula = tf.reduce_prod(tf.reshape(tf.exp(-0.5 * ((tf.tile(X_train_t, (1, m)) - mu)**2) / (sigma**2)),
                                 (-1, m, n)), axis=2)  # rule activations
Y_train_t = tf.reduce_sum(rula*w, axis=1) / tf.clip_by_value(tf.reduce_sum(rula, axis=1), 1e-8, 1e8)

#loss = tf.losses.log_loss(y_train, Y_train)  # loss function
loss = tf.losses.sigmoid_cross_entropy(y_train_t, Y_train_t)  # loss function
#loss = tf.sqrt(tf.losses.mean_squared_error(y_train, Y_train))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate).minimize(loss)  # optimizer

################################ test
X_test_t = tf.placeholder(tf.float32, shape=[None, n])  # Test input
y_test_t = tf.placeholder(tf.float32, shape=None)       # Test output

rula_test = tf.reduce_prod(tf.reshape(tf.exp(-0.5 * ((tf.tile(X_test_t, (1, m)) - mu)**2) / (sigma**2)),
                                      (-1, m, n)), axis=2)  # rule activation
Y_test_t = tf.reduce_sum(rula_test*w, axis=1) / tf.clip_by_value(tf.reduce_sum(rula_test, axis=1), 1e-8, 1e8)
loss_test = tf.losses.sigmoid_cross_entropy(y_test_t, Y_test_t)  # loss function

################################ start session
x_axis = []
tr_loss, te_loss = [], []
tr_prec, te_prec = [], []
tr_rec, te_rec = [], []

init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    for e in range(epochs):
        Y_train, loss_tr, _ = sess.run([Y_train_t, loss, optimizer], feed_dict={X_train_t: X_train, y_train_t: y_train})
        Y_test, loss_te = sess.run([Y_test_t, loss_test], feed_dict={X_test_t: X_test, y_test_t: y_test})

        if (e+1) % 10 == 0:
            x_axis.append(e+1)
            tr_loss.append(loss_tr)
            te_loss.append(loss_te)

            Y_train = np.where(Y_train > 0, 1, 0)
            Y_test = np.where(Y_test > 0, 1, 0)

            prec_tr = precision_score(y_train, Y_train)
            prec_te = precision_score(y_test, Y_test)
            rec_tr = recall_score(y_train, Y_train)
            rec_te = recall_score(y_test, Y_test)

            tr_prec.append(prec_tr)
            te_prec.append(prec_te)
            tr_rec.append(rec_tr)
            te_rec.append(rec_te)
The code is referenced from https://github.com/subhalingamd/ANFIS-diabetes-prediction/blob/main/main.py
I am new to this algorithm. Please help me figure out where I have gone wrong.

LSTM+FFN performs more poorly than FFN

I am building several simple networks to predict the bike rentals at 500 stations in the upcoming hour, given the rentals at all stations in the previous 24 hours. I am working with two architectures: one with a graph convolution (which amounts to updating each station with a learned linear combination of the other stations, at each hour) followed by an FNN layer for the prediction, and a second with a graph convolution -> LSTM -> FNN for the prediction.
Before I describe more: I'm getting poorer performance from the model that includes an LSTM unit, which is confusing me.
See these two images for a description of each architecture. For each architecture I also add hourly meta-data (weather, time, etc.) as a variation; these are shown in red in the images and are not relevant to my question. Image links are at the bottom of the post.
[Architecture 1: GCNN + FNN][1]
[Architecture 2: GCNN + LSTM + FNN][2]
Confusingly, the test RMSE for the first model is 3.46, while for the second model it's 3.57. Could someone please explain to me why the second wouldn't be lower, as it seems to run the exact same process, except with an additional LSTM unit?
Here are relevant snippets of my code for the GCNN+FNN model:
def gcnn_ddgf(hidden_layer, node_num, feature_in, horizon, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training, Y_training, X_val, Y_val, X_test, Y_test, scaler, display_step):
    n_output_vec = node_num * horizon  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    best_val = 10000
    traing_error = 0
    test_error = 0
    pred_Y = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    vec_length = feature_in
    weights_hidden['h1'] = tf.Variable(tf.random_normal([vec_length, hidden_layer], stddev=0.5))
    biases['b1'] = tf.Variable(tf.random_normal([1, hidden_layer], stddev=0.5))
    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([hidden_layer, horizon], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, horizon], stddev=0.5))

    # Construct model
    pred = gcn(X, weights_hidden, weights_A, biases, node_num, horizon)  # see below
    pred = scaler.inverse_transform(pred)
    Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y_original, 2)))

    #optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train/batch_size)

            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  #/ total_batch

            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)

            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})

            if c_val < best_val:
                # testing
                c_tes, preds, Y_true = sess.run([cost, pred, Y_original], feed_dict={X: X_test, Y: Y_test})

                best_val = c_val
                test_error = c_tes
                traing_error = avg_cost
                pred_Y = preds
                early_stop_k = 0  # reset to 0

            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1

            # threshold
            if early_stop_k == early_stop_th:
                break

            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The test RMSE is ", test_error)

    return best_val, pred_Y, Y_true, test_error
# code that creates the model
def gcn(signal_in, weights_hidden, weights_A, biases, node_num, horizon):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [-1, int(feature_len)])  # node_num * batch, feature_in

    signal_output = tf.add(tf.matmul(Z, weights_hidden['h1']), biases['b1'])
    signal_output = tf.nn.relu(signal_output)  # node_num * batch, hidden_vec

    final_output = tf.add(tf.matmul(signal_output, weights_hidden['out']), biases['bout'])  # node_num * batch, horizon
    # final_output = tf.nn.relu(final_output)
    final_output = tf.reshape(final_output, [node_num, -1, horizon])  # node_num, batch, horizon
    final_output = tf.transpose(final_output, [1, 0, 2])  # batch, node_num, horizon
    final_output = tf.reshape(final_output, [-1, node_num*horizon])  # batch, node_num*horizon

    return final_output
And the code for the GCNN+LSTM+FNN model:
def gcnn_ddgf_lstm(node_num, feature_in, learning_rate, beta, batch_size, early_stop_th, training_epochs, X_training,
                   Y_training, X_val, Y_val, X_test, Y_test, scaler, lstm_layer):
    n_output_vec = node_num  # length of output vector at the final layer
    early_stop_k = 0  # early stop patience
    display_step = 1  # frequency of printing results
    best_val = 10000
    traing_error = 0
    test_error = 0
    predic_res = []

    tf.reset_default_graph()
    batch_size = batch_size
    early_stop_th = early_stop_th
    training_epochs = training_epochs

    # tf Graph input and output
    X = tf.placeholder(tf.float32, [None, node_num, feature_in])  # X is the input signal
    Y = tf.placeholder(tf.float32, [None, n_output_vec])  # y is the regression output
    lstm_cell = tf.nn.rnn_cell.LSTMCell(lstm_layer, state_is_tuple=True)

    # define dictionaries to store layers weight & bias
    weights_hidden = {}
    weights_A = {}
    biases = {}

    weights_A['A1'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    weights_hidden['h1'] = tf.Variable(tf.random_normal([lstm_layer, node_num], stddev=0.5))
    biases['h1'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))
    weights_hidden['out'] = tf.Variable(tf.random_normal([node_num, node_num], stddev=0.5))
    biases['bout'] = tf.Variable(tf.random_normal([1, node_num], stddev=0.5))

    # Construct model
    pred = gcn_lstm(X, weights_hidden, weights_A, biases, node_num, lstm_cell)
    # pred = scaler.inverse_transform(pred)
    # Y_original = scaler.inverse_transform(Y)
    cost = tf.sqrt(tf.reduce_mean(tf.pow(pred - Y, 2)))

    #optimizer = tf.train.RMSPropOptimizer(learning_rate, decay).minimize(cost)
    optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate, beta1=beta).minimize(cost)

    # Initializing the variables
    init = tf.global_variables_initializer()

    with tf.Session() as sess:
        sess.run(init)

        for epoch in range(training_epochs):
            avg_cost_sq = 0.
            num_train = X_training.shape[0]
            total_batch = int(num_train/batch_size)

            for i in range(total_batch):
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[i*batch_size:(i+1)*batch_size,],
                                                              Y: Y_training[i*batch_size:(i+1)*batch_size,]})
                avg_cost_sq += np.square(c) * batch_size  #/ total_batch

            # rest part of training dataset
            if total_batch * batch_size != num_train:
                _, c = sess.run([optimizer, cost], feed_dict={X: X_training[total_batch*batch_size:num_train,],
                                                              Y: Y_training[total_batch*batch_size:num_train,]})
                avg_cost_sq += np.square(c) * (num_train - total_batch*batch_size)

            avg_cost = np.sqrt(avg_cost_sq / num_train)

            # validation
            c_val, = sess.run([cost], feed_dict={X: X_val, Y: Y_val})

            if c_val < best_val:
                c_tes, preds = sess.run([cost, pred], feed_dict={X: X_test, Y: Y_test})
                best_val = c_val
                # save model
                #saver.save(sess, './bikesharing_gcnn_ddgf')
                test_error = c_tes
                traing_error = avg_cost
                early_stop_k = 0  # reset to 0

            # update early stopping patience
            if c_val >= best_val:
                early_stop_k += 1

            # threshold
            if early_stop_k == early_stop_th:
                pred_Y = scaler.inverse_transform(preds)
                Y_true = scaler.inverse_transform(Y_test)
                test_err = tf.sqrt(tf.reduce_mean(tf.pow(pred_Y - Y_true, 2)))
                break

            if epoch % display_step == 0:
                print("Epoch:", '%04d' % (epoch+1), "Training RMSE: ", "{:.9f}".format(avg_cost))
                print("Validation RMSE: ", c_val)
                print("Lowest test RMSE: ", test_error)

    print("epoch is ", epoch)
    print("training RMSE is ", traing_error)
    print("Optimization Finished! the lowest validation RMSE is ", best_val)
    print("The scaled test RMSE is ", test_error)

    return pred_Y, Y_true
def gcn_lstm(signal_in, weights_hidden, weights_A, biases, node_num, lstm_cell):
    signal_in = tf.transpose(signal_in, [1, 0, 2])  # node_num, batch, feature_in
    feature_len = signal_in.shape[2]  # feature vector length at the node of the input graph
    signal_in = tf.reshape(signal_in, [node_num, -1])  # node_num, batch*feature_in

    Adj = 0.5*(weights_A['A1'] + tf.transpose(weights_A['A1']))
    Adj = normalize_adj(Adj)
    Z = tf.matmul(Adj, signal_in)  # node_num, batch*feature_in
    Z = tf.reshape(Z, [node_num, -1, int(feature_len)])  # node_num, batch, feature_in
    Z = tf.transpose(Z, [1, 2, 0])  # batch, feature_in, node_num

    # init_state = cell.zero_state(batch_size, tf.float32)
    _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype=tf.float32)  # init_state?

    dense_output = tf.add(tf.matmul(Z[1], weights_hidden['h1']), biases['h1'])
    dense_output = tf.nn.relu(dense_output)

    final_output = tf.add(tf.matmul(dense_output, weights_hidden['out']), biases['bout'])  # batch, node_num*horizon

    return final_output
In particular, should I be wary that _, Z = tf.nn.dynamic_rnn(lstm_cell, Z, dtype = tf.float32) causes my variables defined elsewhere not to train?
Thanks a lot for any help :)
[1]: https://i.stack.imgur.com/MAO2t.png
[2]: https://i.stack.imgur.com/UDjHw.png
I resolved this.
I have three years of bike-use data to make the prediction, and was using roughly the last three months as my validation/test set. Those last few months were winter, with lower bike use. I got the expected results (GCNN+LSTM outperforms GCNN, though not by much) when I shuffled my training data prior to allocating it to sets (with sequences preserved for the LSTM).
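A minimal sketch of that kind of shuffling (the window length, array names, and split fractions below are illustrative assumptions): build the (24-hour input, next-hour target) windows first, then shuffle whole windows before splitting, so the sequence inside each window stays intact.
import numpy as np

def make_windows(rentals, history=24):
    # Build (24-hour input, next-hour target) pairs from an (hours, stations) array.
    X, Y = [], []
    for t in range(history, rentals.shape[0]):
        X.append(rentals[t - history:t])   # shape (history, stations)
        Y.append(rentals[t])               # shape (stations,)
    return np.array(X), np.array(Y)

# Stand-in data: 1000 hourly snapshots for 500 stations.
rentals = np.random.rand(1000, 500).astype(np.float32)
X_all, Y_all = make_windows(rentals)

# Shuffle whole windows (each window's internal 24-hour sequence is preserved),
# then split into train / validation / test.
perm = np.random.RandomState(0).permutation(X_all.shape[0])
X_all, Y_all = X_all[perm], Y_all[perm]

n = X_all.shape[0]
X_training, Y_training = X_all[:int(0.8 * n)], Y_all[:int(0.8 * n)]
X_val, Y_val = X_all[int(0.8 * n):int(0.9 * n)], Y_all[int(0.8 * n):int(0.9 * n)]
X_test, Y_test = X_all[int(0.9 * n):], Y_all[int(0.9 * n):]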

Output vector of the final layer of a neural net for a classification problem stuck at 0.5

The output layer is stuck at the vector [0.5, 0.5]. Can anyone help me understand whether there is a problem with the code?
The neural net I'm trying to train is an XOR gate, so the output vector should be close to the one-hot vector representing the correct class (0 or 1), but the output vector after all epochs still stays at [0.5, 0.5].
# Imports assumed by this snippet: rnd is numpy.random and expit comes from scipy.special
import numpy as np
from numpy import random as rnd
from scipy.special import expit

class Backpropogation:
    def setupWeightsBiases(self):
        for i in range(1, self.num_layers):
            self.weights_dict[i] = rnd.rand(self.layer_spec[i], self.layer_spec[i - 1])
            self.bias_dict[i] = rnd.rand(self.layer_spec[i], 1)

    def __init__(self, hidden_layer_neurons_tuple, train_data, num_output_classes, output_layer_func='sigmoid'):
        self.train_input = train_data[0]
        self.input_layer_size = self.train_input[0].size
        self.train_input = self.train_input.reshape(self.train_input.shape[0], self.input_layer_size).T

        self.output_layer_size = num_output_classes
        self.train_output = train_data[1]
        print(self.train_output.shape)

        num_hidden_layer = len(hidden_layer_neurons_tuple)
        self.hidden_layer_neurons_tuple = hidden_layer_neurons_tuple
        self.layer_spec = [self.input_layer_size] + \
                          list(hidden_layer_neurons_tuple) + \
                          [num_output_classes]
        self.layer_spec = tuple(self.layer_spec)

        self.num_layers = num_hidden_layer + 2
        self.train_data = train_data

        self.activation_layer_gradient_dict = {}
        self.preactivation_layer_gradient_dict = {}
        self.weights_gradient_dict = {}
        self.bias_gradient_dict = {}

        self.curr_input = None
        self.curr_output = None

        self.weights_dict = {}
        self.preactivation_layer_dict = {}
        self.activation_layer_dict = {}
        self.bias_dict = {}
        self.setupWeightsBiases()

        self.output = None
        self.output_diff = None
        self.num_output_classes = num_output_classes

    def predictClass(self):
        return np.argmax(self.activation_layer_dict[self.num_layers - 1])

    def forwardPropogation(self, input):
        # Load h[0] as the input data
        self.activation_layer_dict[0] = input
        '''
        load input data into h[0]
        for i in (1,L):
            a[k] = W[k] * h[k-1] + b[k]
        and finally calculate the Lth layer output with the special activation function
        '''
        for i in range(1, self.num_layers):
            self.preactivation_layer_dict[i] = \
                np.matmul(self.weights_dict[i], self.activation_layer_dict[i - 1]) + \
                self.bias_dict[i]
            # print(self.preactivation_layer_dict[i])
            vec = self.preactivation_layer_dict[i]
            self.activation_layer_dict[i] = self.activationFunction(vec)

        # This will change h[L] to y'
        self.activation_layer_dict[self.num_layers - 1] = self.outputFunction()

    def findGradients(self, index):
        class_label = self.train_output[index]
        output_one_hot_vector = np.zeros((self.num_output_classes, 1))
        output_one_hot_vector[class_label] = 1

        output = self.activation_layer_dict[self.num_layers - 1]
        self.preactivation_layer_gradient_dict[self.num_layers - 1] = -1 * (output_one_hot_vector - output)

        for layer in reversed(range(1, self.num_layers)):
            self.weights_gradient_dict[layer] = np.matmul(self.preactivation_layer_gradient_dict[layer],
                                                          self.activation_layer_dict[layer - 1].T)
            self.bias_gradient_dict[layer] = self.preactivation_layer_gradient_dict[layer]
            self.activation_layer_gradient_dict[layer - 1] = np.matmul(self.weights_dict[layer].T,
                                                                       self.preactivation_layer_gradient_dict[layer])
            if layer != 1:
                self.preactivation_layer_gradient_dict[layer - 1] = np.multiply(
                    self.activation_layer_gradient_dict[layer - 1],
                    self.outputFunctionDiff(layer - 1))

    def activationFunction(self, vec, type='sigmoid'):
        if type == 'sigmoid':
            return 1 / (1 + expit(-vec))
        else:
            print('Please select correct output function')
            exit()

    def outputFunction(self, type='sigmoid'):
        if type == 'sigmoid':
            return 1 / (1 + expit(-self.preactivation_layer_dict[self.num_layers - 1]))
        else:
            print('Please select correct output function')
            exit()

    def outputFunctionDiff(self, layer, type='sigmoid'):
        op_layer = self.num_layers - 1
        if type == 'sigmoid':
            vec = self.preactivation_layer_dict[layer]
            return np.multiply(self.activationFunction(vec), 1 - self.activationFunction(vec))
        else:
            print('Please select correct output function')
            exit()

    def updateWeightsAndBiases(self, learning_rate):
        for layer in range(1, self.num_layers):
            self.weights_dict[layer] = self.weights_dict[layer] - learning_rate * self.weights_gradient_dict[layer]
            self.preactivation_layer_dict[layer] = self.preactivation_layer_dict[layer] - \
                learning_rate * self.preactivation_layer_gradient_dict[layer]
            if not (layer == self.num_layers - 1):
                self.activation_layer_dict[layer] = self.activation_layer_dict[layer] - \
                    learning_rate * self.activation_layer_gradient_dict[layer]
            self.bias_dict[layer] = self.bias_dict[layer] - learning_rate * self.bias_gradient_dict[layer]

    def getLoss(self, index):
        return np.log2(self.activation_layer_dict[self.num_layers - 1][self.train_output[index], 0])

    def train(self, learning_rate, num_epochs):
        for curr_epoch in range(num_epochs):
            print('Evaluating at ' + str(curr_epoch))
            index_array = list(np.arange(0, self.train_input.shape[1]))
            np.random.shuffle(index_array)
            for train_data_index in index_array:
                test_input = self.train_input[:, [train_data_index]]
                self.forwardPropogation(test_input)
                # print(self.activation_layer_dict[self.num_layers - 1])
                self.findGradients(train_data_index)
                self.updateWeightsAndBiases(learning_rate)
                print('Loss ' + str(self.getLoss(train_data_index)))

    # Assumes a 2D array of 784xN array as test input
    # This will return output classes of the data
    def test(self, test_data):
        index_range = test_data.shape[1]
        test_class_list = []
        for index in range(index_range):
            self.forwardPropogation(test_data[:, [index]])
            test_class_list.append(self.predictClass())
        return test_class_list

# train the NN with BP
train_data = (np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0]))
b = Backpropogation((2, 2), train_data, 2)
The following code (check this for the implementation and this for the theory) implements a neural net with backpropagation from scratch, using a single output unit with sigmoid activation (otherwise it looks similar to your implementation).
Using this, the XOR function can be learnt with an appropriate learning rate and number of epochs (although it can sometimes get stuck in local minima; you can consider implementing regularizers such as drop-out). Also, you can convert it to a 2-output (softmax?) version like yours. Can you figure out any issue in your implementation? For example, you can look at the following pointers:
batch update of the parameters during backpropagation instead of stochastic updates
running for enough epochs
changing the learning rate
using ReLU activation instead of sigmoid for the hidden layers (to cope with vanishing gradients)
etc.
import numpy as np
from sklearn.metrics import accuracy_score, mean_squared_error

class FFSNNetwork:
    def __init__(self, n_inputs, hidden_sizes=[2]):
        # intialize the inputs
        self.nx = n_inputs
        self.ny = 1  # number of neurons in the output layer
        self.nh = len(hidden_sizes)
        self.sizes = [self.nx] + hidden_sizes + [self.ny]

        self.W = {}
        self.B = {}
        for i in range(self.nh+1):
            self.W[i+1] = np.random.rand(self.sizes[i], self.sizes[i+1])
            self.B[i+1] = np.random.rand(1, self.sizes[i+1])

    def sigmoid(self, x):
        return 1.0/(1.0 + np.exp(-x))

    def forward_pass(self, x):
        self.A = {}
        self.H = {}
        self.H[0] = x.reshape(1, -1)
        for i in range(self.nh+1):
            self.A[i+1] = np.matmul(self.H[i], self.W[i+1]) + self.B[i+1]
            self.H[i+1] = self.sigmoid(self.A[i+1])
        return self.H[self.nh+1]

    def grad_sigmoid(self, x):
        return x*(1-x)

    def grad(self, x, y):
        self.forward_pass(x)
        self.dW = {}
        self.dB = {}
        self.dH = {}
        self.dA = {}
        L = self.nh + 1
        self.dA[L] = (self.H[L] - y)
        for k in range(L, 0, -1):
            self.dW[k] = np.matmul(self.H[k-1].T, self.dA[k])
            self.dB[k] = self.dA[k]
            self.dH[k-1] = np.matmul(self.dA[k], self.W[k].T)
            self.dA[k-1] = np.multiply(self.dH[k-1], self.grad_sigmoid(self.H[k-1]))

    def fit(self, X, Y, epochs=1, learning_rate=1, initialize=True):
        # initialize w, b
        if initialize:
            for i in range(self.nh+1):
                self.W[i+1] = np.random.randn(self.sizes[i], self.sizes[i+1])
                self.B[i+1] = np.zeros((1, self.sizes[i+1]))

        for e in range(epochs):
            dW = {}
            dB = {}
            for i in range(self.nh+1):
                dW[i+1] = np.zeros((self.sizes[i], self.sizes[i+1]))
                dB[i+1] = np.zeros((1, self.sizes[i+1]))
            for x, y in zip(X, Y):
                self.grad(x, y)
                for i in range(self.nh+1):
                    dW[i+1] += self.dW[i+1]
                    dB[i+1] += self.dB[i+1]

            m = X.shape[1]
            for i in range(self.nh+1):
                self.W[i+1] -= learning_rate * dW[i+1] / m
                self.B[i+1] -= learning_rate * dB[i+1] / m

            Y_pred = self.predict(X)
            print('loss at epoch {} = {}'.format(e, mean_squared_error(Y_pred, Y)))

    def predict(self, X):
        Y_pred = []
        for x in X:
            y_pred = self.forward_pass(x)
            Y_pred.append(y_pred)
        return np.array(Y_pred).squeeze()
Now, train the network:
#train the network with two hidden layers - 2 neurons and 2 neurons
ffsnn = FFSNNetwork(2, [2, 2])
# XOR data
X_train, y_train = np.array([[0, 0], [0, 1], [1,0], [1, 1]]), np.array([0, 1, 1, 0])
ffsnn.fit(X_train, y_train, epochs=5000, learning_rate=.15)
Next, predict with the network:
y_pred_prob = ffsnn.predict(X_train) # P(y = 1)
y_pred = (y_pred_prob >= 0.5).astype("int").ravel() # threshold = 0.5
X_train
# array([[0, 0], [0, 1], [1, 0], [1, 1]])
y_train
# array([0, 1, 1, 0])
y_pred_prob
# array([0.00803102, 0.99439243, 0.99097831, 0.00664639])
y_pred
# array([0, 1, 1, 0])
accuracy_score(y_train, y_pred)
# 1.0
Note that here the MSE between the true and predicted y values is used to plot the loss function; you can plot the BCE (binary cross-entropy) loss function too.
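For instance, a minimal sketch of computing the BCE from the predicted probabilities shown above (values rounded here for brevity) with scikit-learn:
from sklearn.metrics import log_loss

# Binary cross-entropy between the true XOR labels and the predicted probabilities.
y_true = [0, 1, 1, 0]
y_prob = [0.008, 0.994, 0.991, 0.007]
print('BCE =', log_loss(y_true, y_prob, labels=[0, 1]))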
Finally, the following animations show how the loss function is minimized and also how the decision boundary is learnt:
The green and red points represent the positive (label 1) and negative (label 0) training data points, respectively, in the above animation. Notice how they become separated by the decision boundaries during the final phase of the training epochs (darker region for negative and lighter region for positive datapoints, corresponding to XOR).
You could implement the same with a high-level deep learning library such as Keras in a few lines of code:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers  # needed for layers.Dense below

inputs = keras.Input(shape=(2,), name="in")
x = layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = layers.Dense(1, activation="sigmoid", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

X_train, y_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0])

model.compile(
    optimizer=keras.optimizers.Adam(),  # Optimizer
    # Loss function to minimize
    loss=tf.keras.losses.BinaryCrossentropy(),
    # List of metrics to monitor
    metrics=[keras.metrics.BinaryAccuracy(name="accuracy")],
)

print("Fit model on training data")
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=1000)
# ...
# Epoch 371/1000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.5178 - accuracy: 0.7500
# Epoch 372/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5169 - accuracy: 0.7500
# Epoch 373/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5160 - accuracy: 1.0000
# Epoch 374/1000
# 4/4 [==============================] - 0s 499us/sample - loss: 0.5150 - accuracy: 1.0000
# ...
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.1260240525007248, 1.0]
The following figure shows the loss / accuracy during training epochs.
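A short sketch of how such a plot can be produced from the history object returned by model.fit above (the dictionary keys follow the loss and the metric name "accuracy" used in the compile call):
import matplotlib.pyplot as plt

# history.history maps the loss and each metric name to a list of per-epoch values.
plt.plot(history.history['loss'], label='loss')
plt.plot(history.history['accuracy'], label='accuracy')
plt.xlabel('epoch')
plt.legend()
plt.show()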
Finally, with keras and softmax (instead of sigmoid):
from keras.utils import to_categorical

X_train, y_train = np.array([[0, 0], [0, 1], [1, 0], [1, 1]]), np.array([0, 1, 1, 0])
y_train = to_categorical(y_train, num_classes=2)

inputs = keras.Input(shape=(2,), name="in")
x = layers.Dense(4, activation="relu", name="dense_1")(inputs)
x = layers.Dense(4, activation="relu", name="dense_2")(x)
outputs = layers.Dense(2, activation="softmax", name="out")(x)
model = keras.Model(inputs=inputs, outputs=outputs)

model.compile(
    optimizer='rmsprop',
    loss='categorical_crossentropy',
    metrics=['acc']
)

print("Fit model on training data")
history = model.fit(
    X_train,
    y_train,
    batch_size=4,
    epochs=2000)
# Epoch 663/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3893 - acc: 0.7500
# Epoch 664/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3888 - acc: 1.0000
# Epoch 665/2000
# 4/4 [==============================] - 0s 500us/sample - loss: 0.3878 - acc: 1.0000
print("Evaluate")
results = model.evaluate(X_train, y_train, batch_size=4)
print("loss, acc:", results)
# loss, acc: [0.014970880933105946, 1.0]
with the following loss / accuracy convergence:

CNN with Tensorflow, low accuracy on CIFAR-10 and not improving

On running the first training epoch of a 3-layer convnet on CIFAR-10, I am neither able to achieve a high enough validation accuracy nor minimize the objective function.
Specifically, the accuracy varies on the first iteration, and then settles at 8.7% for the following iterations. What's peculiar is that I've also trained a 2-layer, fully-connected network which does substantially better, consistently getting around 43% accuracy on the validation set.
NOTE: Bulk of the code is from a Jupyter notebook designed as an introduction to barebones Tensorflow (and Keras) provided as part of an assignment for Stanford's CS231n Convolutional Neural Networks for Visual Recognition and although I am neither a student of the course nor of the university, I am doing this purely for experiential purposes and out of my newborn interests in CV / deep learning.
My contribution is only the implementation of the forward pass and the initialization of the network's parameters.
The author of the notebook left a comment stating that when correctly implemented this model should achieve above 40% accuracy after the first epoch without any hyperparameter tuning.
Implementation Notes
49,000 / 1000 : train/validation split, batch size = 64
Weights are initialized using Kaiming normalization, bias initialized with 0s
learning rate = 3e-3
Here is each layer of the convnet in detail:
Convolutional layer (with bias) with 32 5x5 filters, with zero-padding 2
ReLU
Convolutional layer (with bias) with 16 3x3 filters, with zero-padding 1
ReLU
Fully-connected layer (with bias) to compute scores for 10 classes
Code
( mine is written between the 'TODO' comment blocks )
import tensorflow as tf
import numpy as np
def load_cifar10(num_training=49000, num_validation=1000, num_test=10000):
    cifar10 = tf.keras.datasets.cifar10.load_data()
    (X_train, y_train), (X_test, y_test) = cifar10
    X_train = np.asarray(X_train, dtype=np.float32)
    y_train = np.asarray(y_train, dtype=np.int32).flatten()
    X_test = np.asarray(X_test, dtype=np.float32)
    y_test = np.asarray(y_test, dtype=np.int32).flatten()

    mask = range(num_training, num_training + num_validation)
    X_val = X_train[mask]
    y_val = y_train[mask]
    mask = range(num_training)
    X_train = X_train[mask]
    y_train = y_train[mask]
    mask = range(num_test)
    X_test = X_test[mask]
    y_test = y_test[mask]

    mean_pixel = X_train.mean(axis=(0, 1, 2), keepdims=True)
    std_pixel = X_train.std(axis=(0, 1, 2), keepdims=True)
    X_train = (X_train - mean_pixel) / std_pixel
    X_val = (X_val - mean_pixel) / std_pixel
    X_test = (X_test - mean_pixel) / std_pixel

    return X_train, y_train, X_val, y_val, X_test, y_test

class Dataset(object):
    def __init__(self, X, y, batch_size, shuffle=False):
        assert X.shape[0] == y.shape[0], 'Got different numbers of data and labels'
        self.X, self.y = X, y
        self.batch_size, self.shuffle = batch_size, shuffle

    def __iter__(self):
        N, B = self.X.shape[0], self.batch_size
        idxs = np.arange(N)
        if self.shuffle:
            np.random.shuffle(idxs)
        return iter((self.X[i:i+B], self.y[i:i+B]) for i in range(0, N, B))

def flatten(x):
    N = tf.shape(x)[0]
    return tf.reshape(x, (N, -1))
def three_layer_convnet(x, params):
    conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
    scores = None
    ############################################################################
    # TODO: Implement the forward pass for the three-layer ConvNet.            #
    ############################################################################
    h1_conv = tf.nn.conv2d(x,
                           conv_w1 + conv_b1,
                           strides=[1, 1, 1, 1],
                           padding='SAME'
                           )
    h1 = tf.nn.relu(h1_conv)

    h2_conv = tf.nn.conv2d(h1,
                           conv_w2 + conv_b2,
                           strides=[1, 1, 1, 1],
                           padding='SAME'
                           )
    h2 = tf.nn.relu(h2_conv)

    fc_params = flatten(fc_w + fc_b)
    h2 = flatten(h2)
    scores = tf.matmul(h2, fc_params)
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return scores
def training_step(scores, y, params, learning_rate):
    losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
    loss = tf.reduce_mean(losses)
    grad_params = tf.gradients(loss, params)

    new_weights = []
    for w, grad_w in zip(params, grad_params):
        new_w = tf.assign_sub(w, learning_rate * grad_w)
        new_weights.append(new_w)

    with tf.control_dependencies(new_weights):
        return tf.identity(loss)

def check_accuracy(sess, dset, x, scores, is_training=None):
    num_correct, num_samples = 0, 0
    for x_batch, y_batch in dset:
        feed_dict = {x: x_batch, is_training: 0}
        scores_np = sess.run(scores, feed_dict=feed_dict)
        y_pred = scores_np.argmax(axis=1)
        num_samples += x_batch.shape[0]
        num_correct += (y_pred == y_batch).sum()
    acc = float(num_correct) / num_samples
    print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))

def kaiming_normal(shape):
    if len(shape) == 2:
        fan_in, fan_out = shape[0], shape[1]
    elif len(shape) == 4:
        fan_in, fan_out = np.prod(shape[:3]), shape[3]
    return tf.random_normal(shape) * np.sqrt(2.0 / fan_in)

def three_layer_convnet_init():
    params = None
    ############################################################################
    # TODO: Initialize the parameters of the three-layer network.              #
    ############################################################################
    conv_w1 = tf.Variable(kaiming_normal((5, 5, 3, 32)))
    conv_b1 = tf.Variable(tf.zeros((32,)))
    conv_w2 = tf.Variable(kaiming_normal((3, 3, 32, 16)))
    conv_b2 = tf.Variable(tf.zeros((16,)))
    fc_w = tf.Variable(kaiming_normal((32 * 32 * 16, 10)))
    fc_b = tf.Variable(tf.zeros((10,)))
    params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
    ############################################################################
    #                             END OF YOUR CODE                             #
    ############################################################################
    return params
def main():
    learning_rate = 3e-3
    tf.reset_default_graph()
    is_training = tf.placeholder(tf.bool, name='is_training')

    X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()
    train_dset = Dataset(X_train, y_train, batch_size=64, shuffle=True)
    test_dset = Dataset(X_test, y_test, batch_size=64)
    val_dset = Dataset(X_val, y_val, batch_size=64, shuffle=False)

    print('Train data shape: ', X_train.shape)
    print('Train labels shape: ', y_train.shape, y_train.dtype)
    print('Validation data shape: ', X_val.shape)
    print('Validation labels shape: ', y_val.shape)
    print('Test data shape: ', X_test.shape)
    print('Test labels shape: ', y_test.shape)

    device = '/cpu:0'
    with tf.device(device):
        x = tf.placeholder(tf.float32, [None, 32, 32, 3])
        y = tf.placeholder(tf.int32, [None])
        params = three_layer_convnet_init()
        scores = three_layer_convnet(x, params)
        loss = training_step(scores, y, params, learning_rate)

    with tf.Session() as sess:
        sess.run(tf.global_variables_initializer())
        for t, (x_np, y_np) in enumerate(train_dset):
            feed_dict = {x: x_np, y: y_np}
            loss_np = sess.run(loss, feed_dict=feed_dict)
            if t % 100 == 0:
                print('Iteration %d, loss = %.4f' % (t, loss_np))
                check_accuracy(sess, val_dset, x, scores, is_training)

if __name__ == "__main__":
    main()
EDIT: removed unnecessary comments and code
The problem is here:
h1_conv = tf.nn.conv2d(x,
                       conv_w1 + conv_b1,
                       strides=[1, 1, 1, 1],
                       padding='SAME'
                       )
This is wrong: here you are adding the bias values (conv_b1) to the filter conv_w1, but the bias has to be added to the output of the conv layer. The right way would be something like this:
h1_conv = tf.nn.conv2d(x,
                       conv_w1,
                       strides=[1, 1, 1, 1],
                       padding='SAME'
                       )
h1_bias = tf.nn.bias_add(h1_conv, conv_b1)
h1 = tf.nn.relu(h1_bias)
And correct it for h2 too.
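For completeness, a sketch of the same correction applied to the second layer (based only on the snippet above):
h2_conv = tf.nn.conv2d(h1,
                       conv_w2,
                       strides=[1, 1, 1, 1],
                       padding='SAME'
                       )
h2_bias = tf.nn.bias_add(h2_conv, conv_b2)
h2 = tf.nn.relu(h2_bias)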

How to make Feed Forward NN more accurate?

I just finished writing my first ever neural network and it finally works, but it works really badly. I get about 0.37 accuracy. Any tips on how to make it more accurate? I have already tried different learning rates and different numbers of hidden-layer units, but I never get above 0.37 accuracy. I'm trying to classify data into one of the 3 classes 0, 1 or 2. I use a one-hot matrix as my Y. How could I improve my code?
X = data[1:, 2:]
m, n = X.shape
labels = data[1:, 1]

Y = np.zeros((m, 3))
i = 0
for label in labels:
    if label == 0:
        Y[i, 0] = 1
    elif label == 1:
        Y[i, 1] = 1
    elif label == 2:
        Y[i, 2] = 1
    i += 1

slice_size = math.floor(m/5)
X_test = X[-slice_size:, :]
Y_test = Y[-slice_size:]
X_train = X[:slice_size, :]
Y_train = Y[:slice_size]

learning_rate = 0.00001
num_steps = 200
batch_size = 100
display_step = 2

n_nodes_hl1 = 5
n_nodes_hl2 = 5
n_nodes_hl3 = 5

n_classes = 3
n_inputs = 16

training_epochs = 500

x = tf.placeholder('float32', [None, n])
y = tf.placeholder('float32', [None, n_classes])

weights = {
    'h1': tf.Variable(tf.random_normal([n_inputs, n_nodes_hl1])),
    'h2': tf.Variable(tf.random_normal([n_nodes_hl1, n_nodes_hl2])),
    'h3': tf.Variable(tf.random_normal([n_nodes_hl2, n_nodes_hl3])),
    'out': tf.Variable(tf.random_normal([n_nodes_hl1, n_classes]))
}
biases = {
    'b1': tf.Variable(tf.random_normal([n_nodes_hl1])),
    'b2': tf.Variable(tf.random_normal([n_nodes_hl2])),
    'b3': tf.Variable(tf.random_normal([n_nodes_hl3])),
    'out': tf.Variable(tf.random_normal([n_classes]))
}

def neural_network(data):
    layer_1 = tf.add(tf.matmul(data, weights['h1']), biases['b1'])
    layer_2 = tf.add(tf.matmul(layer_1, weights['h2']), biases['b2'])
    layer_3 = tf.add(tf.matmul(layer_2, weights['h3']), biases['b3'])
    output = tf.matmul(layer_3, weights['out']) + biases['out']
    return output

logits = neural_network(x)
prediction = tf.nn.softmax(logits)

loss_op = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=logits,
                                                                    labels=y))
optimizer = tf.train.AdamOptimizer(learning_rate=learning_rate)
train_op = optimizer.minimize(loss_op)

correct_pred = tf.equal(tf.argmax(prediction, 1), tf.argmax(Y_train, 1))
accuracy = tf.reduce_mean(tf.cast(correct_pred, tf.float32))

# Initialize the variables (i.e. assign their default value)
init = tf.global_variables_initializer()

with tf.Session() as sess:
    sess.run(init)

    for step in range(1, num_steps+1):
        x_step = np.asarray(X_train[step, :])
        y_step = np.asarray(Y_train[step])
        x_step = np.reshape(x_step, (1, n))
        y_step = np.reshape(y_step, (1, n_classes))

        sess.run(train_op, feed_dict={x: x_step, y: y_step})
        if step % display_step == 0 or step == 1:
            # Calculate batch loss and accuracy
            loss, acc = sess.run([loss_op, accuracy], feed_dict={x: x_step,
                                                                 y: y_step})
            print("Step " + str(step) + ", Minibatch Loss= " +
                  "{:.4f}".format(loss) + ", Training Accuracy= " +
                  "{:.3f}".format(acc))

    x_step_test = np.asarray(X_test)
    y_step_test = np.asarray(Y_test)
    x_step_test = np.reshape(x_step, (1, n))
    y_step_test = np.reshape(y_step, (1, n_classes))

    print("Optimization Finished!")
    print("Testing Accuracy:",
          sess.run(accuracy, feed_dict={x: x_step_test,
                                        y: y_step_test}))
1.
x_step_test = np.asarray(X_test)
y_step_test = np.asarray(Y_test)
x_step_test = np.reshape(x_step, (1, n))
y_step_test = np.reshape(y_step, (1,n_classes))
Shouldn't this be:
x_step_test = np.asarray(X_test)
y_step_test = np.asarray(Y_test)
x_step_test = np.reshape(x_step_test, (1, n))
y_step_test = np.reshape(y_step_test, (1,n_classes))
Also check how you are taking the batches; there might be some problem there.
Use train_test_split from sklearn.model_selection; it splits your train and test data after shuffling (see the sketch at the end of this answer). Not shuffling your data can create problems if your data has some pattern, e.g. you have 99 data points, the first 33 are dogs, the next 33 are cats and the last 33 are mice: your neural net will train only on the 66 dog and cat samples and won't learn to recognise a mouse.
Increase the learning rate; AdamOptimizer already decays the learning rate, so use something like 0.1 or 0.01.
I guess the TensorFlow part is correct.
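A minimal sketch of the train_test_split suggestion above (the 20% test fraction and the random_state are assumed values, not taken from the question):
from sklearn.model_selection import train_test_split

# Shuffles the rows before splitting, so every class can appear in both sets.
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.2,
                                                    shuffle=True, random_state=42)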
