Increasing training loss in neural network - Python

I am trying to implement a neural network from scratch, but the loss increases while training. The network consists of three layers (input, hidden, output). I have added regularization in the loss function compute_cost.
Here is the code.
import numpy as np
import random
import matplotlib.pyplot as plt  # needed for plot_loss below

# define neural network class
class Neuralnetwork():
    def __init__(self, X, Y, regulization_L2=False, regulization_L1=False, dropout_forward_bool=False):  # initialize parameters
        self.size = [32*32, 100, 10]  # size of input, hidden and output layer
        self.parameters = {}  # initialize parameters
        self.learning_rate = 0.0001  # learning rate
        self.num_iterations = 100
        self.X = None
        self.Y = None
        self.loss = []
        self.regulization_l2 = regulization_L2
        self.regulization_l1 = regulization_L1
        self.lambda_ = 0.000001
        self.dropout = 0.02
        self.dropout_forword_bool = dropout_forward_bool

    def initialize_parameters(self):  # initialize parameters for neural network
        np.random.seed(2)
        self.input_layer_size = self.size[0]   # size of input layer
        self.hidden_layer_size = self.size[1]  # size of hidden layer
        self.output_layer_size = self.size[2]  # size of output layer
        self.parameters['W1'] = np.random.randn(self.hidden_layer_size, self.input_layer_size) * 0.01  # initialize weights for hidden layer
        self.parameters['b1'] = np.zeros((self.hidden_layer_size, 1))  # initialize bias for hidden layer
        self.parameters['W2'] = np.random.randn(self.output_layer_size, self.hidden_layer_size) * 0.01  # initialize weights for output layer
        self.parameters['b2'] = np.zeros((self.output_layer_size, 1))  # initialize bias for output layer

    def sigmoid(self, Z):  # sigmoid function
        return 1 / (1 + np.exp(-Z))

    def relu(self, Z):  # relu function
        return np.maximum(0, Z)

    def sigmoid_backward(self, dA, Z):  # derivative of sigmoid function
        sig = self.sigmoid(Z)
        return dA * sig * (1 - sig)

    def relu_backward(self, dA, Z):  # derivative of relu function
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ

    def forward_propagation(self, X):  # forward propagation function for neural network
        # dropout the nodes by making the weights zero and bias zero for the nodes
        if self.dropout_forword_bool == True and self.dropout > 0:
            for node in range(self.parameters['W1'].shape[0]):
                for weight in range(self.parameters['W1'].shape[1]):
                    if random.uniform(0, 1) < self.dropout:
                        self.parameters['W1'][node][weight] = 0
                        self.parameters['b1'][node] = 0
        # calculate Z1, A1, Z2, A2
        Z1 = np.dot(self.parameters['W1'], X.T) + self.parameters['b1']  # calculate Z1
        A1 = self.relu(Z1)
        Z2 = np.dot(self.parameters['W2'], A1) + self.parameters['b2']
        A2 = self.sigmoid(Z2)
        return A2, Z2, A1, Z1

    def L1_regularization(self, weights):  # L1 regularization function to avoid overfitting
        sum = 0
        # calculate sum of absolute values of weights
        for i in range(weights.shape[0]):
            for j in range(weights.shape[1]):
                sum = sum + abs(weights[i][j])
        ans = self.lambda_ * sum  # multiply sum with lambda value to set the regularization strength
        return ans

    def L2_regularization(self, weights):
        sum = 0
        # calculate sum of squares of weights
        for i in range(weights.shape[0]):
            for j in range(weights.shape[1]):
                sum = sum + ((weights[i][j]) ** 2)
        ans = self.lambda_ * sum
        return ans

    def compute_cost(self, A2, Y):  # compute cost function
        m = len(Y)
        if self.regulization_l2 == True:  # if L2 regularization is true
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross entropy loss formula
            cost = np.squeeze(cost)  # squeeze the cost value to remove the extra dimension
            #cost = cost + (self.L2_regularization(self.parameters['W1']) + self.L2_regularization(self.parameters['W2']))/m  # add L2 regularization cost to the loss function
            return cost
        elif self.regulization_l1 == True:  # if L1 regularization is true
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross entropy loss formula
            cost = np.squeeze(cost)
            #cost = cost + (self.L1_regularization(self.parameters['W1']) + self.L1_regularization(self.parameters['W2']))/m  # add L1 regularization cost to the loss function
            return cost
        else:
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross entropy loss formula without regularization
            cost = np.squeeze(cost)
            return cost

    def backward_propagation(self, X, Y, A2, Z2, A1, Z1):
        # does the backpropagation and calculates the gradients of weights and biases
        m = len(Y)
        dZ2 = A2 - Y.T  # dZ2 is the derivative of the cost function with respect to Z2
        if self.regulization_l1 == True:
            # calculate dW2 and add L1 regularization to it
            dW2 = np.dot(dZ2, A1.T) / m + self.L1_regularization(self.parameters['W2']) / m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m + self.L1_regularization(self.parameters['W1']) / m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2
        elif self.regulization_l2 == True:
            # calculate dW2 and add L2 regularization to it
            dW2 = np.dot(dZ2, A1.T) / m + self.L2_regularization(self.parameters['W2']) / m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m + self.L2_regularization(self.parameters['W1']) / m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2
        else:
            # calculate dW2 and db2 without regularization
            dW2 = np.dot(dZ2, A1.T) / m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2

    def update_parameters(self, dW1, dW2, db1, db2):
        # update the parameters using the gradients calculated in backward propagation
        self.parameters['W1'] = self.parameters['W1'] - self.learning_rate * dW1
        self.parameters['W2'] = self.parameters['W2'] - self.learning_rate * dW2
        self.parameters['b1'] = self.parameters['b1'] - self.learning_rate * db1
        self.parameters['b2'] = self.parameters['b2'] - self.learning_rate * db2

    def fit(self, X, Y):
        # fit function is used to train the model
        self.initialize_parameters()  # initialize the parameters
        for i in range(self.num_iterations):  # loop over the number of iterations
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost = self.compute_cost(A2, Y)
            dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1)  # backward propagation to calculate the gradients
            # save the gradients in the parameters dictionary
            self.parameters['dW1'] = dW1
            self.parameters['dW2'] = dW2
            self.parameters['db1'] = db1
            self.parameters['db2'] = db2
            # update the parameters
            self.update_parameters(dW1, dW2, db1, db2)
            self.loss.append(cost)
            if i % 10 == 0:
                print(f'Cost after iteration {i}: {cost}')

    def predict(self, X):
        # predict function is used to predict the output for the given input
        A2, _, _, _ = self.forward_propagation(X)  # forward propagation
        predictions = np.round(A2)  # round the output to get the predictions
        return predictions

    def plot_loss(self):  # plot the loss
        plt.plot(self.loss)
        plt.xlabel('Iterations')
        plt.ylabel('Loss')
        plt.show()

    def accuracy(self, X, Y):
        predictions = self.predict(X)
        return (np.sum((predictions == Y) / Y.shape[1])) * 100

    def check_gradient(self, X, Y):  # check the gradient
        # calculate the gradient using backpropagation
        A2, Z2, A1, Z1 = self.forward_propagation(X)
        epsilon = 1e-7
        dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1)  # backward propagation to calculate the gradients
        for i in range(1, 3):
            # calculate the gradient numerically with a finite difference
            self.parameters[f'W{i}'] += epsilon  # add epsilon to the weights
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost_plus = self.compute_cost(A2, Y)
            self.parameters[f'W{i}'] -= 2 * epsilon  # subtract epsilon from the weights
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost_minus = self.compute_cost(A2, Y)
            gradient = (cost_plus - cost_minus) / (2 * epsilon)  # numerical gradient
            numerator = np.linalg.norm(gradient - self.parameters[f'dW{i}'])  # norm of the difference between the gradients
            denominator = np.linalg.norm(gradient) + np.linalg.norm(self.parameters[f'dW{i}'])  # sum of the norms of the gradients
            difference = numerator / denominator
            if difference > 1e-7:
                print("There is a mistake in the backward propagation! difference = " + str(difference))
When I call the model:
print('--------------- model with L2 Regularization --------------------')
model_L2 = Neuralnetwork(x_train, y_train, regulization_L2=True, regulization_L1=False, dropout_forward_bool=False)
model_L2.fit(x_train, y_train)
Here is the output:
--------------- model with L2 Regularization --------------------
Cost after iteration 0: 1430.3418627178976
Cost after iteration 10: 1446.5808681718697
Cost after iteration 20: 1459.8884483327824
How do I correct this, and why is the loss increasing?

Your example is missing x_train, so I couldn't reproduce your behavior - but it's not a tragedy, because I made something that might help you: a small notebook in which I implemented the backpropagation algorithm as an example to understand it myself. I have made 5 versions in which I go step by step from a simple numpy implementation to one with pytorch and cuda. For your problem, versions 1 to 3 of the implementation are the interesting ones. I hope this helps you.
backpropagation notebook
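In the same spirit as versions 1 to 3 of that notebook, here is a minimal plain-numpy sketch of a two-layer training loop (my own toy example with made-up shapes, not code from the notebook or from the question) that you can diff against your implementation. The line worth comparing closely is the ReLU backward step: the upstream gradient is multiplied by the mask exactly once, whereas your dZ1 = dA * self.relu_backward(dA, Z1) multiplies the upstream gradient in twice.

import numpy as np

rng = np.random.default_rng(0)
X = rng.standard_normal((64, 20))             # 64 toy samples, 20 features
Y = (rng.random((64, 1)) > 0.5).astype(float)

W1 = rng.standard_normal((20, 32)) * 0.01     # hidden layer weights
b1 = np.zeros((1, 32))
W2 = rng.standard_normal((32, 1)) * 0.01      # output layer weights
b2 = np.zeros((1, 1))
lr = 0.1

for step in range(100):
    # forward pass: ReLU hidden layer, sigmoid output
    Z1 = X @ W1 + b1
    A1 = np.maximum(0, Z1)
    Z2 = A1 @ W2 + b2
    A2 = 1 / (1 + np.exp(-Z2))
    loss = -np.mean(Y * np.log(A2) + (1 - Y) * np.log(1 - A2))
    # backward pass
    m = X.shape[0]
    dZ2 = (A2 - Y) / m                        # sigmoid + cross-entropy shortcut
    dW2 = A1.T @ dZ2
    db2 = dZ2.sum(axis=0, keepdims=True)
    dA1 = dZ2 @ W2.T
    dZ1 = dA1 * (Z1 > 0)                      # ReLU mask applied exactly once
    dW1 = X.T @ dZ1
    db1 = dZ1.sum(axis=0, keepdims=True)
    # gradient step
    W1 -= lr * dW1; b1 -= lr * db1
    W2 -= lr * dW2; b2 -= lr * db2

With this layout the loss should go down on the toy data; if a line-by-line diff against your class changes that behaviour, the diverging line is the likely culprit.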

Related

Simple Numpy MNIST Classifier Outputting Equal Probabilities

I'm working on building an MNIST classifier from scratch in numpy; however, after training, my model always outputs a 0.1 probability for every digit.
I'm using softmax + cross entropy loss, and outputting an even distribution of probabilities seems to make my loss go to 0 without the accuracy increasing.
So, the model is learning how to minimise loss, just in the wrong way XD
Here is my forward and backward pass:
def onehot(i):
    return np.eye(10)[i]

def softmax(x):
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp)

def relu(x):
    return np.maximum(x, 0)

def loss(x, y):
    return -np.sum(y * np.log(x))

def forward_backward(x, y, w1, w2):
    # forward
    l1 = x @ w1
    r = relu(l1)
    l2 = r @ w2
    out = softmax(l2)
    # loss
    l = loss(out, y)
    # backward
    dl2 = out - y
    dw2 = r.T @ dl2
    dr = dl2 @ w2.T
    dl1 = dr >= 0
    dw1 = x.T @ dl1
    return out, l, dw1, dw2
And my training:
w1 = np.random.randn(784, 128)
w2 = np.random.randn(128, 10)
losses = []
for i in range(batches):
    x = images[i]
    y = onehot(labels[i])
    out, l, dw1, dw2 = forward_backward(x, y, w1, w2)
    w1 -= dw1 * lr
    w2 -= dw2 * lr
    losses.append(l)
At the end, printing out gives me: array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]])
and my loss function decreases until exactly 0 and all outputs are even.
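For reference, the usual derivation of that backward pass multiplies the upstream gradient by the ReLU mask rather than replacing it with the boolean mask (dl1 = dr >= 0 throws away the gradient magnitudes entirely). A sketch of what forward_backward might look like with that one change - my own reconstruction assuming the (1, 784) / (1, 10) shapes used above, not a confirmed fix for the 0.1 outputs:

def forward_backward(x, y, w1, w2):
    # forward (unchanged)
    l1 = x @ w1
    r = np.maximum(l1, 0)
    l2 = r @ w2
    exp = np.exp(l2 - np.max(l2))
    out = exp / np.sum(exp)
    l = -np.sum(y * np.log(out))
    # backward
    dl2 = out - y                # softmax + cross-entropy gradient
    dw2 = r.T @ dl2
    dr = dl2 @ w2.T
    dl1 = dr * (l1 > 0)          # keep the gradient, zeroed where ReLU was inactive
    dw1 = x.T @ dl1
    return out, l, dw1, dw2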

Gradient descent for linear regression with numpy

I want to implement gradient descent with numpy for linear regression, but I get an error in this code:
import numpy as np

# Code Example
rng = np.random.RandomState(10)
X = 10*rng.rand(1000, 5)  # feature matrix
y = 0.9 + np.dot(X, [2.2, 4, -4, 1, 2])  # target vector

# GD implementation for linear regression
def GD(X, y, eta=0.1, n_iter=20):
    theta = np.zeros((X.shape[0], X.shape[1]))
    for i in range(n_iter):
        grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
        theta = theta - eta * grad
    return theta

# SGD implementation for linear regression
def SGD(X, y, eta=0.1, n_iter=20):
    theta = np.zeros(1, X.shape[1])
    for i in range(n_iter):
        for j in range(X.shape[0]):
            grad = 2 * np.mean((np.dot(theta.T, X[j,:]) - y[j]) * X[j,:])
            theta = theta - eta * grad
    return theta

# MSE loss for linear regression with numpy
def MSE(X, y, theta):
    return np.mean((X.dot(theta.T) - y)**2)

# linear regression with GD and MSE with numpy
theta_gd = GD(X, y)
theta_sgd = SGD(X, y)
print('MSE with GD: ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
The error is
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
ValueError: operands could not be broadcast together with shapes (5,5) (1000,)
and I can't solve it.
Minor changes in your code that resolve dimensionality issues during matrix multiplication make the code run successfully. In particular, note that a linear regression on a design matrix X of dimension Nxk has a parameter vector theta of size k.
In addition, I'd suggest some changes in SGD() that make it a proper stochastic gradient descent: namely, evaluating the gradient over random subsets of the data, realized by randomly shuffling the index set of the training data with np.random.shuffle() and looping through it. The batch_size determines the size of each subset after which the parameter estimate is updated. The argument seed ensures reproducibility.
# GD implementation for linear regression
def GD(X, y, eta=0.001, n_iter=100):
    theta = np.zeros(X.shape[1])
    for i in range(n_iter):
        for j in range(X.shape[0]):
            grad = (2 * np.mean(X[j,:] @ theta - y[j]) * X[j,:])  # changed line
            theta -= eta * grad
    return theta

# SGD implementation for linear regression
def SGD(X, y, eta=0.001, n_iter=1000, batch_size=25, seed=7678):
    theta = np.zeros(X.shape[1])
    indexSet = list(range(len(X)))
    np.random.seed(seed)
    for i in range(n_iter):
        np.random.shuffle(indexSet)  # random shuffle of index set
        for j in range(round(len(X) / batch_size) + 1):
            X_sub = X[indexSet[j*batch_size:(j+1)*batch_size],:]
            y_sub = y[indexSet[j*batch_size:(j+1)*batch_size]]
            if len(X_sub) > 0:
                grad = (2 * np.mean(X_sub @ theta - y_sub) * X_sub)  # changed line
                theta -= eta * np.mean(grad, axis=0)
    return theta
Running the code, I get
print('MSE with GD : ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
> MSE with GD : 0.07602
MSE with SGD: 0.05762
Each observation has 5 features, and X contains 1000 observations:
X = rng.rand(1000, 5) * 10 # X.shape == (1000, 5)
Create y which is perfectly linearly correlated with X (with no distortions):
real_weights = np.array([2.2, 4, -4, 1, 2]).reshape(-1, 1)
real_bias = 0.9
y = X @ real_weights + real_bias  # y.shape == (1000, 1)
G.D. implementation for linear regression:
Note:
w (weights) is your theta variable.
I have also added the calculation of b (bias).
def GD(X, y, eta=0.1, n_iter=20):
    # Initialize weights and a bias (all zeros):
    w = np.zeros((X.shape[1], 1))  # w.shape == (5, 1)
    b = 0
    # Gradient descent
    for i in range(n_iter):
        errors = X @ w + b - y  # errors.shape == (1000, 1)
        dw = 2 * np.mean(errors * X, axis=0).reshape(5, 1)
        db = 2 * np.mean(errors)
        w -= eta * dw
        b -= eta * db
    return w, b
Testing:
w, b = GD(X, y, eta=0.003, n_iter=5000)
print(w, b)
[[ 2.20464905]
[ 4.00510139]
[-3.99569374]
[ 1.00444026]
[ 2.00407476]] 0.7805448262466914
Notes:
Your function SGD also contains some errors.
I'm using the @ operator because it's just my preference over np.dot.
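Since the note above doesn't spell out the SGD fix, here is a minimal sketch of how a per-sample version could be written in the same w/b style (my own version, assuming one update per randomly visited sample, not code from the answer):

def SGD(X, y, eta=0.001, n_iter=50):
    # Stochastic gradient descent: one (w, b) update per randomly chosen sample.
    w = np.zeros((X.shape[1], 1))  # w.shape == (5, 1)
    b = 0.0
    for i in range(n_iter):
        for j in np.random.permutation(X.shape[0]):
            error = X[j:j+1] @ w + b - y[j]   # scalar error for sample j
            dw = 2 * error * X[j:j+1].T       # gradient of the squared error w.r.t. w
            db = 2 * error.item()             # gradient w.r.t. the bias
            w -= eta * dw
            b -= eta * db
    return w, b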

What's wrong with my backpropagation?

I'm trying to code a neural network from scratch in Python. To check whether everything works, I wanted to overfit the network, but the loss seems to explode at first and then comes back to the initial value and stops there (it doesn't converge). I've checked my code but couldn't find the reason. I assume my understanding or implementation of backpropagation is incorrect, but there might be some other reason. Can anyone help me out or at least point me in the right direction?
# Initialize weights and biases given dimensions (For this example the dimensions are set to [12288, 64, 1])
def initialize_parameters(dims):
    # Initiate parameters
    parameters = {}
    L = len(dims)  # Number of layers in the network
    # Loop over the given dimensions. Initialize random weights and set biases to zero.
    for i in range(1, L):
        parameters["W" + str(i)] = np.random.randn(dims[i], dims[i-1]) * 0.01
        parameters["b" + str(i)] = np.zeros([dims[i], 1])
    return parameters

# Activation Functions
def relu(x, deriv=False):
    if deriv:
        return 1. * (x > 0)
    else:
        return np.maximum(0, x)

def sigmoid(x, deriv=False):
    if deriv:
        return x * (1 - x)
    else:
        return 1 / (1 + np.exp(-x))

# Forward and backward pass for 2 layer neural network. (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
    # Array for storing gradients
    grads = {}
    # Get the length of examples
    m = Y.shape[1]
    # First layer
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)
    # Second layer
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    AL = sigmoid(Z2)
    # Compute cost
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
    # Backpropagation
    # Second layer
    dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
    dZ2 = dAL * sigmoid(AL, deriv=True)
    grads["dW2"] = np.dot(dZ2, A1.T) / m
    grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m
    # First layer
    dA1 = np.dot(parameters["W2"].T, dZ2)
    dZ1 = dA1 * relu(A1, deriv=True)
    grads["dW1"] = np.dot(dZ1, X.T)
    grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m
    return AL, grads, cost

# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1

# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []

# Train the network
for i in range(epoches):
    # Get X and Y
    x = np.array(train[0:10], ndmin=2).T
    y = np.array(labels[0:10], ndmin=2).T
    # Perform forward and backward pass
    AL, grads, cost = forward_backward(x, y, parameters)
    # Compute cost and append to the log_list
    log_list.append(cost)
    # Update parameters with computed gradients
    parameters = update_parameters(grads, parameters, learning_rate)

plt.plot(log_list)
plt.title("Loss of the network")
plt.show()
I am struggling to find the place where you calculate the error gradients, and a sample of the input training data would also help...
I don't know if this will help you, but I'll share my solution for a Python neural network that learns the XOR problem.
import numpy as np

def sigmoid_function(x, derivative=False):
    """
    Sigmoid function
    "x" is the input and "y" the output; the nonlinear properties of this function mean that
    the rate of change is slower at the extremes and faster in the centre. Put plainly,
    we want the neuron to "make its mind up" instead of indecisively staying in the middle.
    :param x: Float
    :param derivative: Boolean
    :return: Float
    """
    if derivative:
        return x * (1 - x)  # Derivative using the chain rule.
    else:
        return 1 / (1 + np.exp(-x))

# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])

# initialize variables
learning_rate = 0.1
epoch = 50000  # number of iterations - one round of forward and back propagation is called an epoch

# get the second element from the numpy array shape field to detect the count of features for the input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3  # number of hidden layer neurons
output_layer_neurons = 1  # number of output layer neurons

# init weights & biases
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(size=(1, hidden_layer_neurons))
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(size=(1, output_layer_neurons))

for i in range(epoch):
    # forward propagation
    hidden_layer_input_temp = np.dot(input_data, weights_hidden)  # matrix dot product to adjust for weights in the layer
    hidden_layer_input = hidden_layer_input_temp + bias_hidden  # adjust for bias
    hidden_layer_activations = sigmoid_function(hidden_layer_input)  # use the activation function
    output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
    output_layer_input = output_layer_input_temp + bias_output
    output = sigmoid_function(output_layer_input)  # final output
    # backpropagation (where adjusting of the weights happens)
    error = ideal_output - output  # error gradient
    if i % 1000 == 0:
        print("Error: {}".format(np.mean(abs(error))))
    # use derivatives to compute slope of output and hidden layers
    slope_output_layer = sigmoid_function(output, derivative=True)
    slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)
    # calculate deltas
    delta_output = error * slope_output_layer
    error_hidden_layer = delta_output.dot(weights_output.T)  # calculates the error at the hidden layer
    delta_hidden = error_hidden_layer * slope_hidden_layer
    # change the weights
    weights_output += hidden_layer_activations.T.dot(delta_output) * learning_rate
    bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
    weights_hidden += input_data.T.dot(delta_hidden) * learning_rate
    bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate
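A quick sanity check after training (my addition, not part of the original answer) is one more forward pass with the learned weights; the rounded outputs should match the XOR targets:

# final forward pass after training
hidden = sigmoid_function(np.dot(input_data, weights_hidden) + bias_hidden)
final = sigmoid_function(np.dot(hidden, weights_output) + bias_output)
print(np.round(final, 3))  # should be close to [[0.], [1.], [1.], [0.]]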

Using Backward Propagation in fmin_cg

I am trying to build an ANN in Python, and I've been able to get as far as the forward pass, but I run into a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is defined as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single value (the J value for my forward pass) and cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understanding how to use it and getting the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    '''
    Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
    '''
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))], (hidden_layer_size, (input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):], (num_labels, (hidden_layer_size+1))))
    m = X.shape[0]
    n = X.shape[1]
    # forward pass
    y_eye = eye(num_labels)
    y_new = np.zeros((y.shape[0], num_labels))
    for z in range(y.shape[0]):
        y_new[z,:] = y_eye[int(y[z])-1]
    y = y_new
    a_1 = c_[ones((m,1)), X]
    z_2 = tr(Theta1.dot(tr(a_1)))
    a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
    a_2 = c_[ones((a_2.shape[0],1)), a_2]
    a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))
    J_reg = lam/(2.*m) * (sum(sum(Theta1[:,1:]**2)) + sum(sum(Theta2[:,1:]**2)))
    J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg
    # Backprop
    d_3 = a_3 - y
    d_2 = d_3.dot(Theta2[:,1:])*sigmoidGradient(z_2)
    Theta1_grad = 1./m * tr(d_2).dot(a_1)
    Theta2_grad = 1./m * tr(d_3).dot(a_2)
    # Add regularization
    Theta1_grad[:,1:] = Theta1_grad[:,1:] + lam*1.0/m*Theta1[:,1:]
    Theta2_grad[:,1:] = Theta2_grad[:,1:] + lam*1.0/m*Theta2[:,1:]
    # Unroll gradients
    grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
    return J, grad

def nn_train(X, y, lam=1.0, hidden_layer_size=10):
    '''
    Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
    Return parameters Theta1, Theta2.
    '''
    # NN input and output layer sizes
    input_layer_size = X.shape[1]
    num_labels = unique(y).shape[0]  # output layer
    # Initialize NN parameters
    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    # Unroll parameters
    initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
    initial_nn_params = reshape(initial_nn_params, (len(initial_nn_params),))  # flatten into 1-d array
    # Find and print initial cost:
    J_init = nnCostFunction(initial_nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam)[0]
    grad_init = nnCostFunction(initial_nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam)[1]
    print('Initial J cost: ' + str(J_init))
    print('Initial grad cost: ' + str(grad_init))
    # Implement backprop and train network, run fmin
    print('Training Neural Network...')
    print('fmin results:')
    nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol=0.001, maxiter=40, full_output=1)[0, 1]
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))], (hidden_layer_size, (input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):], (num_labels, (hidden_layer_size+1))))
    return Theta1, Theta2
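For what it's worth, both optimizers can consume the gradient directly. This is a sketch based on the documented scipy API (it reuses nnCostFunction and initial_nn_params from the question, so treat the surrounding names as assumptions): fmin_cg takes a separate fprime callable, and scipy.optimize.minimize with jac=True expects the objective to return a (cost, gradient) pair.

import numpy as np
from scipy import optimize as op

cost_and_grad = lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size,
                                         num_labels, X, y, lam)

# Option 1: fmin_cg with an explicit gradient callable.
# (Calling cost_and_grad twice per step is wasteful but keeps the sketch short.)
nn_params = op.fmin_cg(lambda t: cost_and_grad(t)[0],            # cost only
                       initial_nn_params,
                       fprime=lambda t: np.ravel(cost_and_grad(t)[1]),  # gradient as 1-d array
                       gtol=0.001, maxiter=40)

# Option 2: minimize with jac=True, so cost and gradient come from one call.
res = op.minimize(lambda t: (cost_and_grad(t)[0], np.ravel(cost_and_grad(t)[1])),
                  initial_nn_params, method='CG', jac=True,
                  options={'gtol': 0.001, 'maxiter': 40})
nn_params, cost = res.x, res.fun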

Using Gradient Descent on Linear Regression Yields an Incorrect Bias

I've got a toy example set up of a linear regression model with one input variable and one output variable. The problem I'm encountering is that the output for the bias is far off from the generated data. If I manually set the bias, then it will produce a weight and bias close enough to the originals.
I've written two pieces of code: gen_data, which generates data, and gradientDescent2, which performs the gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points, 1))
    y = np.zeros(shape=(num_points, 1))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp) + np.random.normal(scale=3.0)
    return (x, y)

# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        loss = (theta * x + bias) - y
        cost = np.mean(loss**2) / 2
        # print('Iteration {} | Cost: {}'.format(i, cost))
        grad_b = np.mean(loss)
        grad_t = np.mean(loss*x)
        # updates
        bias -= learning_rate * grad_b
        theta -= learning_rate * grad_t
    return (theta, bias)
1. If you want to use batch updates, don't set your batch_size equal to your sample size. (I also believe that batch updates are not very suitable for this case.)
2. Your gradient calculation and parameter update are incorrect; the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always be trying to minimize the loss, so it should be
if loss > 0:
    bias -= learning_rate * grad_b
    theta -= learning_rate * grad_t
elif loss < 0:
    bias += learning_rate * grad_b
    theta += learning_rate * grad_t
After all that, below is the modified code, which works well.
import numpy as np
import sys

def gen_data(num_points=500, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points))
    y = np.zeros(shape=(num_points))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp)  # + np.random.normal(scale=3.0)
        # print('x:', x[i], ' y:', y[i])
    return (x, y)

def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        for j in range(len(x)):
            loss = (theta * x[j] + bias) - y[j]
            cost = np.mean(loss**2) / 2
            # print('Iteration {} | Cost: {}'.format(i, cost))
            grad_b = 1
            grad_t = x[j]
            if loss > 0:
                bias -= learning_rate * grad_b
                theta -= learning_rate * grad_t
            elif loss < 0:
                bias += learning_rate * grad_b
                theta += learning_rate * grad_t
    return (theta, bias)

def main():
    x, y = gen_data()
    ta, bias = gradientDescent2(x, y)
    print('theta:', ta)
    print('bias:', bias)

if __name__ == '__main__':
    sys.exit(int(main() or 0))
