I'm trying to implement linear regression with the RMSprop optimizer from scratch.
Code:
EPOCHS = 100
w3 = tf.Variable(w_vector, dtype=tf.float32)
w4 = tf.Variable(0, dtype=tf.float32)
lr = 1e-5
beta = 0.9
epsilon = 1e-7
momentum = 0.0

for epoch in range(1, EPOCHS + 1):
    mom_w = 0
    mom_b = 0
    mean_square_w = 0
    mean_gradient_w = 0
    mean_square_b = 0
    mean_gradient_b = 0

    y_pred1 = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    dw3, dw4 = gradients_mse(x, y, y_pred1)

    # My equations for RMSprop
    mean_square_w = beta * mean_square_w + (1 - beta) * dw3 ** 2
    mean_gradient_w = beta * mean_gradient_w + (1 - beta) * dw3
    mom_w = momentum * mom_w + lr * (dw3 / (tf.sqrt(mean_square_w + epsilon - mean_gradient_w ** 2)))

    mean_square_b = beta * mean_square_b + (1 - beta) * dw4 ** 2
    mean_gradient_b = beta * mean_gradient_b + (1 - beta) * dw4
    mom_b = momentum * mom_b + lr * (dw4 / (tf.sqrt(mean_square_b + epsilon - mean_gradient_b ** 2)))

    w3.assign_sub(mom_w)
    w4.assign_sub(mom_b)

print('w3 : {}'.format(w3.numpy()))
print('w4 : {}'.format(w4.numpy()))
Output:
w3 : [[-1.2507935]]
w4 : 0.0033333366736769676
Now I create a single-layer, single-neuron neural network with no activation function, assign the same weights to its neuron, and use RMSprop as the optimizer, but I get different final weights. This was not the case for the SGD optimizer.
Code:
# using keras to get same results
def create_model():
    model = tf.keras.Sequential()
    model.add(tf.keras.layers.Dense(units=1, name='d1', input_shape=(x.shape[1],)))
    model.compile(optimizer=tf.keras.optimizers.RMSprop(
                      learning_rate=1e-5, rho=0.9, momentum=0.0, epsilon=1e-07, centered=False),
                  loss="mse")
    return model

model = create_model()
d1 = model.get_layer('d1')
d1_weights = [tf.constant(w_vector, dtype=tf.float32), tf.constant(np.array([0]), dtype=tf.float32)]
d1.set_weights(d1_weights)
model.fit(x, y, epochs=100)

d1 = model.get_layer('d1')
print('w3 = {}'.format(d1.weights[0].numpy()))
print('w4 = {}'.format(d1.weights[1].numpy()[0]))
Output:
w3 = [[-1.2530397]]
w4 = 0.0010913893347606063
My gradients are calculated correctly for the MSE loss function; I have cross-checked them against TensorFlow's built-in gradient computation, tf.GradientTape.
Code:
# Computing gradients
def gradients_mse(X, Y, Y_PREDS):
    DW1 = tf.matmul(X, tf.reshape(Y - Y_PREDS, (X.shape[0], 1)), transpose_a=True) * (-2 / X.shape[0])
    DW0 = (-2 / X.shape[0]) * tf.reduce_sum(Y - Y_PREDS)
    return DW1, DW0
The only thing I think could be going wrong in this implementation is the calculation of mom_w and mom_b with incorrect equations.
x.shape = [10,1]
The default batch size is 32, so with only 10 samples it has no effect on the weight updates. The same code gives perfectly matching output when I use plain gradient descent instead of RMSprop.
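For reference, the GradientTape cross-check mentioned above looks roughly like this (a minimal sketch; it assumes x, y, w3, w4 and gradients_mse are defined as in the code above):
import numpy as np
import tensorflow as tf

# Sketch of the GradientTape cross-check described above.
# Assumes x, y, w3, w4 and gradients_mse already exist as in the question.
with tf.GradientTape() as tape:
    y_pred = tf.squeeze(tf.matmul(w3, x, transpose_a=True, transpose_b=True) + w4)
    loss = tf.reduce_mean(tf.square(y - y_pred))
dw3_tape, dw4_tape = tape.gradient(loss, [w3, w4])

dw3_manual, dw4_manual = gradients_mse(x, y, y_pred)
print(np.allclose(dw3_tape.numpy(), dw3_manual.numpy()))
print(np.allclose(dw4_tape.numpy(), dw4_manual.numpy()))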
Related
So I made a simple neural network for MNIST (784 input neurons, 30 hidden neurons, and 10 output neurons), but the cost function (MSE) always increases to 4.5 and never decreases, and the output neurons eventually all just output 1. Here's the code:
np.set_printoptions(suppress=True)
epochs = 50
batch = 60000
learning_rate = 3
B1 = np.random.randn(30, 1)
B2 = np.random.randn(10, 1)
W1 = np.random.randn(784, 30)
W2 = np.random.randn(30, 10)
for i in range(epochs):
    X, Y = shuffle(X, Y)
    c_B1 = np.zeros(B1.shape)
    c_B2 = np.zeros(B2.shape)
    c_W1 = np.zeros(W1.shape)
    c_W2 = np.zeros(W2.shape)
    for b in range(0, np.size(X, 0), batch):
        inputs = X[b:b+batch]
        outputs = Y[b:b+batch]
        Z1 = nn_forward(inputs, W1.T, B1)
        A1 = sigmoid(Z1)
        Z2 = nn_forward(A1, W2.T, B2)
        A2 = sigmoid(Z2)
        e_L = (outputs - A2) * d_sig(Z2)
        e_1 = np.multiply(np.dot(e_L, W2.T), d_sig(Z1))
        d_B2 = np.sum(e_L, axis=0)
        d_B1 = np.sum(e_1, axis=0)
        d_W2 = np.dot(A1.T, e_L)
        d_W1 = np.dot(inputs.T, e_1)
        d_B2 = d_B2.reshape((np.size(B2, 0), 1))
        d_B1 = d_B1.reshape((np.size(B1, 0), 1))
        c_B1 = np.add(c_B1, d_B1)
        c_B2 = np.add(c_B2, d_B2)
        c_W1 = np.add(c_W1, d_W1)
        c_W2 = np.add(c_W2, d_W2)
    B1 = np.subtract(B1, (learning_rate/batch) * c_B1)
    B2 = np.subtract(B2, (learning_rate/batch) * c_B2)
    W1 = np.subtract(W1, (learning_rate/batch) * c_W1)
    W2 = np.subtract(W2, (learning_rate/batch) * c_W2)
    print(i, cost(outputs, A2))
What am I doing wrong?
Two things I notice right away:
Why do you use MSE as the loss function for a classification problem? MSE is usually used for regression; try cross-entropy instead.
You have sigmoid as the output activation, which maps its input to the interval (0, 1), so if you want to do classification you should take the argmax of the output vector and use that as the predicted class label.
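For illustration, a minimal NumPy sketch of what that could look like (it reuses Z2 and outputs from the code above and assumes outputs is one-hot encoded, as the 10 output neurons suggest; this is not the asker's code):
import numpy as np

def softmax(z):
    # subtract the row-wise max for numerical stability
    z = z - z.max(axis=1, keepdims=True)
    e = np.exp(z)
    return e / e.sum(axis=1, keepdims=True)

def cross_entropy(probs, targets, eps=1e-12):
    # targets is one-hot encoded, shape (batch, 10)
    return -np.mean(np.sum(targets * np.log(probs + eps), axis=1))

A2 = softmax(Z2)                         # replaces sigmoid(Z2) on the output layer
loss = cross_entropy(A2, outputs)        # replaces the MSE cost
predicted_class = np.argmax(A2, axis=1)  # class label per example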
I am trying to do polynomial regression with PyTorch. First I tried just linear regression (b + wx).
model_1 = RegressionModel()
W = torch.zeros(1, requires_grad=True)
b = torch.zeros(1, requires_grad = True)
optimizer_1 = torch.optim.SGD([W, b], lr = 0.001)
x_train = torch.FloatTensor(dataset.x_data['LSTAT'])
y_train = torch.FloatTensor(dataset.data['target'])
nb_epochs = 10000
for epoch in range(nb_epochs + 1):
    hypothesis = x_train * W + b
    cost = torch.nn.functional.mse_loss(hypothesis, y_train.float())
    optimizer_1.zero_grad()
    cost.backward()
    optimizer_1.step()
    print('Epoch {:4d}/{} W: {:.3f}, b: {:.3f}, Cost: {:.6f}'.format(
        epoch, nb_epochs, W.item(), b.item(), cost.item()))
Then I changed and added some variables to do polynomial regression (b + w1x + w2x^2)
model_2 = RegressionModel()
W1 = torch.zeros(1, requires_grad=True)
W2 = torch.zeros(1, requires_grad=True)
b = torch.zeros(1, requires_grad = True)
optimizer_2 = torch.optim.SGD([W2, W1, b], lr = 0.0000099)
x_train = torch.FloatTensor(dataset.x_data['LSTAT'])
y_train = torch.FloatTensor(dataset.data['target'])
nb_epochs = 10000
for epoch in range(nb_epochs + 1):
    hypothesis = b + x_train * W1 + x_train * x_train * W2
    cost = torch.nn.functional.mse_loss(hypothesis, y_train.float())
    optimizer_2.zero_grad()
    cost.backward()
    optimizer_2.step()
    print('Epoch {:4d}/{} W1: {:.3f}, W2: {:.3f}, b: {:.3f}, Cost: {:.6f}'.format(
        epoch, nb_epochs, W1.item(), W2.item(), b.item(), cost.item()))
Can I do polynomial regression like this? If not, I would really appreciate it if you let me know. I'm quite new to PyTorch...
Your code should work. When working with larger data, it will be more efficient if you do the regression in a single matrix operation. For that, you need to first pre-compute polynomials of your input features:
x_train_polynomial = torch.stack([x_train, x_train ** 2], dim=1)
To save some lines, you can rewrite the projection as a linear layer:
import torch.nn as nn
projection = nn.Linear(2, 1, bias=True)
In the training loop, you can call:
hypothesis = projection(x_train_polynomial)
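Putting it together, a minimal sketch of the full loop (assuming the same x_train and y_train as in the question, both 1-D float tensors; the learning rate here is only an example):
import torch
import torch.nn as nn
import torch.nn.functional as F

# Pre-compute the polynomial features once: shape (N, 2)
x_train_polynomial = torch.stack([x_train, x_train ** 2], dim=1)

projection = nn.Linear(2, 1, bias=True)
optimizer = torch.optim.SGD(projection.parameters(), lr=1e-5)

for epoch in range(10000):
    hypothesis = projection(x_train_polynomial).squeeze(1)  # shape (N,)
    cost = F.mse_loss(hypothesis, y_train)
    optimizer.zero_grad()
    cost.backward()
    optimizer.step()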
I'm trying to code a neural network from scratch in Python. To check whether everything works I wanted to overfit the network, but the loss seems to explode at first, then comes back to its initial value and stays there (it doesn't converge). I've checked my code but couldn't find the reason. I assume my understanding or implementation of backpropagation is incorrect, but there might be some other reason. Can anyone help me out or at least point me in the right direction?
# Initialize weights and biases given dimensions (for this example the dimensions are [12288, 64, 1])
def initialize_parameters(dims):
    # Initiate parameters
    parameters = {}
    L = len(dims)  # Number of layers in the network
    # Loop over the given dimensions. Initialize random weights and set biases to zero.
    for i in range(1, L):
        parameters["W" + str(i)] = np.random.randn(dims[i], dims[i-1]) * 0.01
        parameters["b" + str(i)] = np.zeros([dims[i], 1])
    return parameters
# Activation Functions
def relu(x, deriv=False):
    if deriv:
        return 1. * (x > 0)
    else:
        return np.maximum(0, x)

def sigmoid(x, deriv=False):
    if deriv:
        return x * (1 - x)
    else:
        return 1 / (1 + np.exp(-x))
# Forward and backward pass for a 2-layer neural network (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
    # Dictionary for storing gradients
    grads = {}
    # Get the number of examples
    m = Y.shape[1]

    # First layer
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)

    # Second layer
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    AL = sigmoid(Z2)

    # Compute cost
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))

    # Backpropagation
    # Second layer
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    dZ2 = dAL * sigmoid(AL, deriv=True)
    grads["dW2"] = np.dot(dZ2, A1.T) / m
    grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m

    # First layer
    dA1 = np.dot(parameters["W2"].T, dZ2)
    dZ1 = dA1 * relu(A1, deriv=True)
    grads["dW1"] = np.dot(dZ1, X.T)
    grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m

    return AL, grads, cost
# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1
# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []
# Train the network
for i in range(epoches):
    # Get X and Y
    x = np.array(train[0:10], ndmin=2).T
    y = np.array(labels[0:10], ndmin=2).T
    # Perform forward and backward pass
    AL, grads, cost = forward_backward(x, y, parameters)
    # Compute cost and append to the log_list
    log_list.append(cost)
    # Update parameters with computed gradients
    parameters = update_parameters(grads, parameters, learning_rate)

plt.plot(log_list)
plt.title("Loss of the network")
plt.show()
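(update_parameters is not shown in the question; a plain gradient-descent version would presumably look something like the sketch below, but this is an assumption, not the asker's code.)
def update_parameters(grads, parameters, learning_rate):
    # Hypothetical sketch of the missing helper: vanilla gradient-descent update
    # for the 2-layer network above.
    for l in (1, 2):
        parameters["W" + str(l)] -= learning_rate * grads["dW" + str(l)]
        parameters["b" + str(l)] -= learning_rate * grads["db" + str(l)]
    return parameters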
I am struggling to find the place where you calculate the error gradients, and a sample of the input training data would also help...
I don't know if this will help you, but I'll share my solution for a Python neural network that learns the XOR problem.
import numpy as np
def sigmoid_function(x, derivative=False):
    """
    Sigmoid function.
    "x" is the input and "y" the output; the nonlinear properties of this function mean that
    the rate of change is slower at the extremes and faster in the centre. Put plainly,
    we want the neuron to "make its mind up" instead of indecisively staying in the middle.
    :param x: Float
    :param derivative: Boolean
    :return: Float
    """
    if derivative:
        return x * (1 - x)  # Derivative using the chain rule.
    else:
        return 1 / (1 + np.exp(-x))
# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])
#initialize variables
learning_rate = 0.1
epoch = 50000 # number of iterations, basically - one round of forward and back propagation is called an epoch
# get the second element from the numpy array shape field to detect the count of features for input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3 #number of hidden layer neurons
output_layer_neurons = 1 #number of output layer neurons
#init weight & bias
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(size=(1, hidden_layer_neurons))
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(size=(1, output_layer_neurons))
for i in range(epoch):
    # forward propagation
    hidden_layer_input_temp = np.dot(input_data, weights_hidden)  # matrix dot product to adjust for weights in the layer
    hidden_layer_input = hidden_layer_input_temp + bias_hidden  # adjust for bias
    hidden_layer_activations = sigmoid_function(hidden_layer_input)  # use the activation function

    output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
    output_layer_input = output_layer_input_temp + bias_output
    output = sigmoid_function(output_layer_input)  # final output

    # backpropagation (where adjusting of the weights happens)
    error = ideal_output - output  # error gradient
    if i % 1000 == 0:
        print("Error: {}".format(np.mean(abs(error))))

    # use derivatives to compute slope of output and hidden layers
    slope_output_layer = sigmoid_function(output, derivative=True)
    slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)

    # calculate deltas
    delta_output = error * slope_output_layer
    error_hidden_layer = delta_output.dot(weights_output.T)  # calculates the error at the hidden layer
    delta_hidden = error_hidden_layer * slope_hidden_layer

    # change the weights
    weights_output += hidden_layer_activations.T.dot(delta_output) * learning_rate
    bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
    weights_hidden += input_data.T.dot(delta_hidden) * learning_rate
    bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate
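After training, the result can be checked with one more forward pass over the four XOR inputs (a small usage sketch, reusing the variables defined above):
# Check the trained network on the four XOR inputs.
hidden = sigmoid_function(np.dot(input_data, weights_hidden) + bias_hidden)
predictions = sigmoid_function(np.dot(hidden, weights_output) + bias_output)
print(np.round(predictions))  # should approach [[0], [1], [1], [0]]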
I am trying to use the tf.nn.weighted_cross_entropy_with_logits API, but I find I just cannot get the right result when the weight is not 1.0 (1.0 means no weighting).
import tensorflow as tf
import numpy as np
def my_binary_crossentropy_np(labels, output, weight=10.0):
    """
    Weighted binary crossentropy between an output tensor
    and a target tensor.
    """
    # transform back to logits
    epsilon = 1e-08
    np.clip(output, epsilon, 1.0 - epsilon, out=output)
    output = np.log(output / (1.0 - output))
    # https://www.tensorflow.org/api_docs/python/tf/nn/weighted_cross_entropy_with_logits
    # l = 1 + (q - 1) * z
    # (1 - z) * x + l * (log(1 + exp(-abs(x))) + max(-x, 0))
    l = 1.0 + (weight - 1.0) * labels
    loss1 = np.multiply(1.0 - labels, output)
    loss2 = np.multiply(l, np.log(1.0 + np.exp(-abs(output))))
    loss3 = np.maximum(-output, 0)
    loss = loss1 + loss2 + loss3
    return np.mean(loss)
def my_binary_crossentropy_tf(labels, output, weight=1.0):
    """
    Weighted binary crossentropy between an output tensor
    and a target tensor.
    """
    epsilon = 1e-08
    output = tf.clip_by_value(output, epsilon, 1.0 - epsilon)
    output = tf.log(output / (1.0 - output))
    # compute weighted loss
    #loss = tf.nn.sigmoid_cross_entropy_with_logits(labels=labels, logits=output)
    loss = tf.nn.weighted_cross_entropy_with_logits(targets=labels, logits=output, pos_weight=weight)
    return tf.reduce_mean(loss)
# generate random test data and random label
predict = np.random.rand(10, 8)
label = np.random.rand(10, 8)
label[label >= 0.5] = 1
label[label < 0.5] = 0
loss1 = my_binary_crossentropy_np(label, predict, 1.0)
print('loss1 = ', loss1)
loss1 = my_binary_crossentropy_np(label, predict, 10.0)
print('loss1 = ', loss1)
predict_tf = tf.convert_to_tensor(predict)
loss2 = my_binary_crossentropy_tf(label, predict_tf, 1.0)
loss2 = tf.Session().run(loss2)
print('loss2 = ', loss2)
loss2 = my_binary_crossentropy_tf(label, predict_tf, 10.0)
loss2 = tf.Session().run(loss2)
print('loss2 = ', loss2)
running result:
loss1 = 1.02193164517
loss1 = 1.96332399324
loss2 = 1.02193164517
loss2 = 4.80529539791
The implementation of my_binary_crossentropy_np is wrong.
Here is the right one:
l = (weight - 1.0) * labels + 1.0
loss1 = np.multiply(1.0 - labels, output)
loss2 = np.multiply(l, np.log(1.0 + np.exp(-abs(output))) + np.maximum(-output, 0))
loss = loss1 + loss2
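For completeness, a sketch of the full NumPy function with that fix folded in (same assumptions as the code in the question):
import numpy as np

def my_binary_crossentropy_np_fixed(labels, output, weight=10.0):
    """Weighted binary crossentropy with the correction above applied."""
    epsilon = 1e-08
    output = np.clip(output, epsilon, 1.0 - epsilon)
    output = np.log(output / (1.0 - output))  # transform back to logits
    # l = 1 + (q - 1) * z
    # loss = (1 - z) * x + l * (log(1 + exp(-|x|)) + max(-x, 0))
    l = (weight - 1.0) * labels + 1.0
    loss = (1.0 - labels) * output + l * (np.log(1.0 + np.exp(-np.abs(output))) + np.maximum(-output, 0))
    return np.mean(loss)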
I would like to implement YOLO (You Only Look Once) with TensorFlow.
But when I write the loss function and train the network, the loss value is very large, like this:
2016-09-11 13:55:03.753679: step 0, loss = 3371113119744.00 (3.9 examples/sec; 2.548 sec/batch)
2016-09-11 13:55:14.444871: step 10, loss = nan (19.8 examples/sec; 0.505 sec/batch)
The loss at step 0 is already huge, and at subsequent steps it becomes NaN.
I cannot figure out what the reason is.
Here is the loss function I wrote for YOLO:
def inference_loss(y_out, y_true):
    '''
    Args:
        y_true: Ground truth output
        y_out: Predicted output
    The form of the ground truth vector is:
    ######################################
    ## 1225 values in total: 7*7 = 49 cells
    ## each cell vector has 25 values: bounding box (x, y, h, w), class one-hot vector (p1, p2, ..., p20), objectness score (0 or 1)
    ## 49 * 25 = 1225
    ######################################
    Returns:
        The loss caused by y_out
    '''
    lambda_coor = 5
    lambda_noobj = 0.5
    box_loss = 0.0
    score_loss = 0.0
    class_loss = 0.0
    for i in range(49):
        # the first bounding box
        y_out_box1 = y_out[:, i*30:i*30+4]
        # the second bounding box
        y_out_box2 = y_out[:, i*30+4:i*30+8]
        # ground truth bounding box
        y_true_box = y_true[:, i*25:i*25+4]
        # l2 loss of the predicted bounding boxes
        box_loss_piece = tf.reduce_sum(tf.square(y_true_box - y_out_box1), 1) + tf.reduce_sum(tf.square(y_true_box - y_out_box2), 1)
        # bounding box loss
        box_loss_piece = box_loss_piece * lambda_coor * y_true[:, i*25+24]
        box_loss = box_loss + box_loss_piece

        # predicted scores
        y_out_score1 = y_out[:, i*30+8]
        y_out_score2 = y_out[:, i*30+9]
        # ground truth score
        y_true_score = y_true[:, i*25+24]
        # the first score
        score_loss1_piece = tf.square(y_true_score - y_out_score1) + tf.square(y_true_score - y_out_score2)
        # the second score
        score_loss2_piece = lambda_noobj * score_loss1_piece
        # score loss
        score_loss1_piece = score_loss1_piece * y_true[:, i*25+24]
        score_loss2_piece = score_loss2_piece * (1 - y_true[:, i*25+24])
        score_loss = score_loss + score_loss1_piece + score_loss2_piece

        # one-hot predicted class vector and ground truth vector
        y_out_class = y_out[:, i*30+10:(i+1)*30]
        y_true_class = y_true[:, i*25+4:i*25+24]
        # class loss
        class_loss_piece = tf.reduce_sum(tf.square(y_true_class - y_out_class), 1)
        class_loss = class_loss + class_loss_piece * y_true[:, i*25+24]

    # total loss of one batch
    loss = tf.reduce_sum(box_loss + score_loss + class_loss, 0)
    return loss
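(A quick way to sanity-check a loss like this in isolation, before wiring it into the full graph, is to run it on random tensors of the expected shapes; a small sketch, assuming TF 1.x as in the rest of the code:)
import numpy as np
import tensorflow as tf

# Random stand-ins with the expected shapes: 49 cells * 30 predicted values,
# 49 cells * 25 ground-truth values.
y_out_fake = tf.constant(np.random.rand(4, 49 * 30), dtype=tf.float32)
y_true_fake = tf.constant(np.random.rand(4, 49 * 25), dtype=tf.float32)
loss_fake = inference_loss(y_out_fake, y_true_fake)

with tf.Session() as sess:
    print(sess.run(loss_fake))  # should be a finite scalar for random inputs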
And this is the training code I wrote:
def train_test():
    with tf.Graph().as_default():
        global_step = tf.Variable(0, trainable=False)
        data_batch_generator = yolo_inputs.generate_batch_data(voclabelpath, imagenamefile, BATCH_NUM, sample_number=10000, iteration=5000)
        training_image_batch = tf.placeholder(tf.float32, shape=[BATCH_NUM, 448, 448, 3])
        training_label_batch = tf.placeholder(tf.float32, shape=[BATCH_NUM, 1225])

        # inference and loss
        yolotinyinstance = yolo_tiny.YOLO()
        yolotinyinstance.build(training_image_batch)
        net_out = yolotinyinstance.fc12
        loss = inference_loss(net_out, training_label_batch)
        train_op = train(loss, global_step)

        saver = tf.train.Saver(tf.all_variables())
        summary_op = tf.merge_all_summaries()
        init = tf.initialize_all_variables()
        sess = tf.Session()
        sess.run(init)
        summary_writer = tf.train.SummaryWriter(TRAIN_DIR, sess.graph)

        step = 0
        for x, y in data_batch_generator:
            start_time = time.time()
            _, loss_value = sess.run([train_op, loss],
                                     feed_dict={training_image_batch: x,
                                                training_label_batch: y})
This has confused me for a while. Could anyone help?
Thank you very much.