Using Backward Propagation in fmin_cg - python

I am trying to build an ANN in python, and I've been able to get so far as to to forward pass, but I get a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is define as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single value (the J value for my forward pass) and cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understand how to use it and get the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
'''
Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
'''
Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
m = X.shape[0]
n = X.shape[1]
#forward pass
y_eye = eye(num_labels)
y_new = np.zeros((y.shape[0],num_labels))
for z in range(y.shape[0]):
y_new[z,:] = y_eye[int(y[z])-1]
y = y_new
a_1 = c_[ones((m,1)),X]
z_2 = tr(Theta1.dot(tr(a_1)))
a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
a_2 = c_[ones((a_2.shape[0],1)), a_2]
a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))
J_reg = lam/(2.*m) * (sum(sum(Theta1[:,1:]**2)) + sum(sum(Theta2[:,1:]**2)))
J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg
#Backprop
d_3 = a_3 - y
d_2 = d_3.dot(Theta2[:,1:])*sigmoidGradient(z_2)
Theta1_grad = 1./m * tr(d_2).dot(a_1)
Theta2_grad = 1./m * tr(d_3).dot(a_2)
#Add regularization
Theta1_grad[:,1:] = Theta1_grad[:,1:] + lam*1.0/m*Theta1[:,1:]
Theta2_grad[:,1:] = Theta2_grad[:,1:] + lam*1.0/m*Theta2[:,1:]
#Unroll gradients
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
return J, grad
def nn_train(X,y,lam = 1.0, hidden_layer_size = 10):
'''
Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
Return parameters Theta1, Theta2.
'''
# NN input and output layer sizes
input_layer_size = X.shape[1]
num_labels = unique(y).shape[0] #output layer
# Initialize NN parameters
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
# Unroll parameters
initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
initial_nn_params = reshape(initial_nn_params,(len(initial_nn_params),)) #flatten into 1-d array
# Find and print initial cost:
J_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[0]
grad_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[1]
print 'Initial J cost: ' + str(J_init)
print 'Initial grad cost: ' + str(grad_init)
# Implement backprop and train network, run fmin
print 'Training Neural Network...'
print 'fmin results:'
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
return Theta1, Theta2

Related

Increasing trainig losses in Neural network

I trying implement neural network from scratch .But the thing is loss increases while training the. The Neural network consist of three layer(input,hindden,output). I have added regualrisation in loss function Compute_loss.
Here is the code.
import numpy as np
import random
# define neural network class
class Neuralnetwork():
def __init__(self, X, Y,regulization_L2=False,regulization_L1=False,dropout_forward_bool=False): # initialize parameters
self.size=[32*32, 100, 10] # size of input, hidden and output layer
self.parameters = {} # initialize parameters
self.learning_rate = 0.0001 # learning rate
self.num_iterations = 100
self.X = None
self.Y = None
self.loss = []
self.regulization_l2=regulization_L2
self.regulization_l1=regulization_L1
self.lambda_=0.000001
self.dropout=0.02
self.dropout_forword_bool=dropout_forward_bool
def initialize_parameters(self): # initialize parameters for neural network
np.random.seed(2)
self.input_layer_size = self.size[0] # size of input layer
self.hidden_layer_size = self.size[1] # size of hidden layer
self.output_layer_size = self.size[2] # size of output layer
self.parameters['W1'] = np.random.randn(self.hidden_layer_size, self.input_layer_size) * 0.01 # initialize weights for hidden layer
self.parameters['b1'] = np.zeros((self.hidden_layer_size, 1)) # initialize bias for hidden layer
self.parameters['W2'] = np.random.randn(self.output_layer_size, self.hidden_layer_size) * 0.01 # initialize weights for output layer
self.parameters['b2'] = np.zeros((self.output_layer_size, 1)) # initialize bias for output layer
def sigmoid(self, Z): # sigmoid function
return 1 / (1 + np.exp(-Z))
def relu(self, Z): # relu function
return np.maximum(0, Z)
def sigmoid_backward(self, dA, Z): # derivative of sigmoid function
sig = self.sigmoid(Z)
return dA * sig * (1 - sig)
def relu_backward(self, dA, Z): # derivative of relu function
dZ = np.array(dA, copy=True)
dZ[Z <= 0] = 0
return dZ
def forward_propagation(self, X,): # forward propagation function for neural network
# dropout the nodes by making the weights zero and bias zero for the nodes
if self.dropout_forword_bool==True and self.dropout>0:
for node in range(self.parameters['W1'].shape[0]):
for weight in range(self.parameters['W1'].shape[1]):
if random.uniform(0,1) < self.dropout:
self.parameters['W1'][node][weight] = 0
self.parameters['b1'][node]=0
# calculate Z1, A1, Z2, A2
Z1 = np.dot(self.parameters['W1'], X.T) + self.parameters['b1'] # calculate Z1
A1 = self.relu(Z1)
Z2 = np.dot(self.parameters['W2'], A1) + self.parameters['b2']
A2 = self.sigmoid(Z2)
return A2, Z2, A1, Z1
def L1_regularization(self, weights): # L1 regularization function to avoid overfitting
sum=0
# calculate sum of absolute values of weights
for i in range(weights.shape[0]):
for j in range(weights.shape[1]):
sum=sum+abs(weights[i][j])
ans=self.lambda_ *sum # multiply sum with lambda value to set the regularization strength
return ans
def L2_regularization(self, weights):
sum=0
# calculate sum of square of weights
for i in range(weights.shape[0]):
for j in range(weights.shape[1]):
sum=sum+((weights[i][j])**2)
ans=self.lambda_ *sum
return ans
def compute_cost(self, A2, Y): # compute cost function
m = len(Y)
if self.regulization_l2==True: # if L2 regularization is true
cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2))) # using cross entropy loss function Formula
cost = np.squeeze(cost) # squeeze the cost value to remove the extra dimension
#cost=cost + (self.L2_regularization(self.parameters['W1']) + self.L2_regularization(self.parameters['W2']))/m # add L2 regularization cost to the loss function
return cost
elif self.regulization_l1==True: # if L1 regularization is true
cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2))) # using cross entropy loss function Formula
cost = np.squeeze(cost)
#cost=cost + (self.L1_regularization(self.parameters['W1']) + self.L1_regularization(self.parameters['W2']))/m # add L1 regularization cost to the loss function
return cost
else:
cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2))) # using cross entropy loss function Formula Without regularization
cost = np.squeeze(cost)
return cost
def backward_propagation(self, X, Y, A2, Z2, A1, Z1):
# backward propagation function for neural network does the backpropagation and calculates the gradients of weights and bias
m = len(Y)
dZ2 = A2 - Y.T # dz2 is the derivative of cost function with respect to z2
if self.regulization_l1==True:
# calculate dW2 and add L1 regularization to it
dW2 = np.dot(dZ2, A1.T) / m + self.L1_regularization(self.parameters['W2'])/m # calculate dW2 and add L1 regularization to it
db2 = np.sum(dZ2, axis=1, keepdims=True) / m
dA= np.dot(self.parameters['W2'].T, dZ2)
dZ1 = dA * self.relu_backward(dA, Z1)
dW1 = np.dot(dZ1, X) / m + self.L1_regularization(self.parameters['W1'])/m
db1 = np.sum(dZ1, axis=1, keepdims=True) / m
return dW1, dW2, db1, db2
elif self.regulization_l2==True:
# calculate dW2 and add L2 regularization to it
dW2 = np.dot(dZ2, A1.T) / m + self.L2_regularization(self.parameters['W2'])/m
db2 = np.sum(dZ2, axis=1, keepdims=True) / m
dA= np.dot(self.parameters['W2'].T, dZ2)
dZ1 = dA * self.relu_backward(dA, Z1)
dW1 = np.dot(dZ1, X) / m + self.L2_regularization(self.parameters['W1'])/m
db1 = np.sum(dZ1, axis=1, keepdims=True) / m
return dW1, dW2, db1, db2
else:
# calculate dW2 and db2 without regularization
dW2 = np.dot(dZ2, A1.T) / m
db2 = np.sum(dZ2, axis=1, keepdims=True) / m
dA= np.dot(self.parameters['W2'].T, dZ2)
dZ1 = dA * self.relu_backward(dA, Z1)
dW1 = np.dot(dZ1, X) / m
db1 = np.sum(dZ1, axis=1, keepdims=True) / m
return dW1, dW2, db1, db2
def update_parameters(self, dW1, dW2, db1, db2):
# update the parameters using the gradients calculated in backward propagation
self.parameters['W1'] = self.parameters['W1'] - self.learning_rate * dW1
self.parameters['W2'] = self.parameters['W2'] - self.learning_rate * dW2
self.parameters['b1'] = self.parameters['b1'] - self.learning_rate * db1
self.parameters['b2'] = self.parameters['b2'] - self.learning_rate * db2
def fit(self, X, Y):
# fit function is used to train the model
self.initialize_parameters() # initialize the parameters
for i in range(self.num_iterations): # loop over the number of iterations
A2, Z2, A1, Z1 = self.forward_propagation(X) # forward propagation
cost = self.compute_cost(A2, Y)
dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1) # backward propagation to calculate the gradients
# save the gradients in the parameters dictionary
self.parameters['dW1'] = dW1
self.parameters['dW2'] = dW2
self.parameters['db1'] = db1
self.parameters['db2'] = db2
# update the parameters
self.update_parameters(dW1, dW2, db1, db2)
self.loss.append(cost)
if i % 10 == 0:
print(f'Cost after iteration {i}: {cost}')
def predict(self, X):
# predict function is used to predict the output for the given input
A2, _, _, _ = self.forward_propagation(X) # forward propagation
predictions = np.round(A2) # round the output to get the predictions
return predictions
def plot_loss(self): # plot the loss
plt.plot(self.loss)
plt.xlabel('Iterations')
plt.ylabel('Loss')
plt.show()
def accuracy(self, X, Y):
predictions = self.predict(X)
return (np.sum((predictions == Y) / Y.shape[1])) * 100
def check_gradient(self, X, Y): # check the gradient
# calculate the gradient using backpropagation
A2, Z2, A1, Z1 = self.forward_propagation(X)
epsilon = 1e-7
dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1) # backward propagation to calculate the gradients
for i in range(1, 3):
# calculate the gradient using the formula
self.parameters[f'W{i}'] += epsilon # add epsilon to W1
A2, Z2, A1, Z1 = self.forward_propagation(X) # forward propagation
cost_plus = self.compute_cost(A2, Y)
self.parameters[f'W{i}'] -= 2 * epsilon # subtract epsilon from W1
A2, Z2, A1, Z1 = self.forward_propagation(X) # forward propagation
cost_minus = self.compute_cost(A2, Y)
gradient = (cost_plus - cost_minus) / (2 * epsilon) # calculate the gradient using the formula
numerator = np.linalg.norm(gradient - self.parameters[f'dW{i}']) # calculate the numerator using the norm of the difference between the gradients
denominator = np.linalg.norm(gradient) + np.linalg.norm(self.parameters[f'dW{i}']) # calculate the denominator using the norm of the gradients
difference = numerator / denominator
if(difference > 1e-7):
print("There is a mistake in the backward propagation! difference = " + str(difference))
when i call mdoel:
print('--------------- model with L2 Regularization --------------------')
model_L2 = Neuralnetwork(x_train, y_train,regulization_L2=True, regulization_L1=False,dropout_forward_bool=False)
model_L2.fit(x_train, y_train)
here is the output
--------------- model with L2 Regularization --------------------
Cost after iteration 0: 1430.3418627178976
Cost after iteration 10: 1446.5808681718697
Cost after iteration 20: 1459.8884483327824
How do i correct this and why is it increasing?
Your example is missing x_train so I couldn't reproduce your behavior - but it's not a tragedy because I made something that might help you - a small notebook in which I implemented the backprop. alg. as an example to understand it myself. I have made 5 versions in which I go step by step from a simple numpy implentation to one with pytorch and cuda. For your problem the versions 1 to 3 of the implementation are interesting. I hope this helps you.
backpropagationnotebook

Gradient descent for linear regression with numpy

I want to implement gradient descent with numpy for linear regression but I have some error in this code:
import numpy as np
# Code Example
rng = np.random.RandomState(10)
X = 10*rng.rand(1000, 5) # feature matrix
y = 0.9 + np.dot(X, [2.2, 4, -4, 1, 2]) # target vector
# GD implementation for linear regression
def GD(X, y, eta=0.1, n_iter=20):
theta = np.zeros((X.shape[0], X.shape[1]))
for i in range(n_iter):
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
theta = theta - eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.1, n_iter=20):
theta = np.zeros(1, X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = 2 * np.mean((np.dot(theta.T, X[j,:]) - y[j]) * X[j,:])
theta = theta - eta * grad
return theta
# MSE loss for linear regression with numpy
def MSE(X, y, theta):
return np.mean((X.dot(theta.T) - y)**2)
# linear regression with GD and MSE with numpy
theta_gd = GD(X, y)
theta_sgd = SGD(X, y)
print('MSE with GD: ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
The error is
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
ValueError: operands could not be broadcast together with shapes (5,5) (1000,)
and I can't solve it.
Minor changes in your code that resolve dimensionality issues during matrix multiplication make the code run successfully. In particular, note that a linear regression on a design matrix X of dimension Nxk has a parameter vector theta of size k.
In addition, I'd suggest some changes in SGD() that make it a proper stochastic gradient descent. Namely, evaluating the gradient over random subsets of the data realized as realized by randomly partitioning the index set of the train data with np.random.shuffle() and looping through it. The batch_size determines the size of each subset after which the parameter estimate is updated. The argument seed ensures reproducibility.
# GD implementation for linear regression
def GD(X, y, eta=0.001, n_iter=100):
theta = np.zeros(X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = (2 * np.mean(X[j,:] # theta - y[j]) * X[j,:]) # changed line
theta -= eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.001, n_iter=1000, batch_size=25, seed=7678):
theta = np.zeros(X.shape[1])
indexSet = list(range(len(X)))
np.random.seed(seed)
for i in range(n_iter):
np.random.shuffle(indexSet) # random shuffle of index set
for j in range(round(len(X) / batch_size)+1):
X_sub = X[indexSet[j*batch_size:(j+1)*batch_size],:]
y_sub = y[indexSet[j*batch_size:(j+1)*batch_size]]
if(len(X_sub) > 0):
grad = (2 * np.mean(X_sub # theta - y_sub) * X_sub) # changed line
theta -= eta * np.mean(grad, axis=0)
return theta
Running the code, I get
print('MSE with GD : ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
> MSE with GD : 0.07602
MSE with SGD: 0.05762
Each observation has 5 features, and X contains 1000 observations:
X = rng.rand(1000, 5) * 10 # X.shape == (1000, 5)
Create y which is perfectly linearly correlated with X (with no distortions):
real_weights = np.array([2.2, 4, -4, 1, 2]).reshape(-1, 1)
real_bias = 0.9
y = X # real_weights + real_bias # y.shape == (1000, 1)
G.D. implementation for linear regression:
Note:
w (weights) is your theta variable.
I have also added the calculation of b (bias).
def GD(X, y, eta=0.1, n_iter=20):
# Initialize weights and a bias (all zeros):
w = np.zeros((X.shape[1], 1)) # w.shape == (5, 1)
b = 0
# Gradient descent
for i in range(n_iter):
errors = X # w + b - y # errors.shape == (1000, 1)
dw = 2 * np.mean(errors * X, axis=0).reshape(5, 1)
db = 2 * np.mean(errors)
w -= eta * dw
b -= eta * db
return w, b
Testing:
w, b = GD(X, y, eta=0.003, n_iter=5000)
print(w, b)
[[ 2.20464905]
[ 4.00510139]
[-3.99569374]
[ 1.00444026]
[ 2.00407476]] 0.7805448262466914
Notes:
Your function SGD also contains some error..
I'm using the # operator because it's just my preference over np.dot.

What's wrong with my backpropagation?

I'm trying to code a neural network from scratch in python. To check whether everything works I wanted to overfit the network but the loss seems to explode at first and then comes back to the initial value and stops there (Doesn't converge). I've checked my code and could find the reason. I assume my understanding or implementation of backpropagation is incorrect but there might be some other reason. Can anyone help me out or at least point me in the right direction?
# Initialize weights and biases given dimesnsions (For this example the dimensions are set to [12288, 64, 1])
def initialize_parameters(dims):
# Initiate parameters
parameters = {}
L = len(dims) # Number of layers in the network
# Loop over the given dimensions. Initialize random weights and set biases to zero.
for i in range(1, L):
parameters["W" + str(i)] = np.random.randn(dims[i], dims[i-1]) * 0.01
parameters["b" + str(i)] = np.zeros([dims[i], 1])
return parameters
# Activation Functions
def relu(x, deriv=False):
if deriv:
return 1. * (x > 0)
else:
return np.maximum(0,x)
def sigmoid(x, deriv=False):
if deriv:
return x * (1-x)
else:
return 1/(1 + np.exp(-x))
# Forward and backward pass for 2 layer neural network. (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
# Array for storing gradients
grads = {}
# Get the length of examples
m = Y.shape[1]
# First layer
Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
A1 = relu(Z1)
# Second layer
Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
AL = sigmoid(Z2)
# Compute cost
cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
# Backpropagation
# Second Layer
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
dZ2 = dAL * sigmoid(AL, deriv=True)
grads["dW2"] = np.dot(dZ2, A1.T) / m
grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m
# First layer
dA1 = np.dot(parameters["W2"].T, dZ2)
dZ1 = dA1 * relu(A1, deriv=True)
grads["dW1"] = np.dot(dZ1, X.T)
grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m
return AL, grads, cost
# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1
# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []
# Train the network
for i in range(epoches):
# Get X and Y
x = np.array(train[0:10],ndmin=2).T
y = np.array(labels[0:10], ndmin=2).T
# Perform forward and backward pass
AL, grads, cost = forward_backward(x, y, parameters)
# Compute cost and append to the log_list
log_list.append(cost)
# Update parameters with computed gradients
parameters = update_parameters(grads, parameters, learning_rate)
plt.plot(log_list)
plt.title("Loss of the network")
plt.show()
I am struggling to find the place where you calculate the error gradients and the input training data sample would also help...
I don't know if this will help you, but I'll share my solution for Python neural network to learn XOR problem.
import numpy as np
def sigmoid_function(x, derivative=False):
"""
Sigmoid function
“x” is the input and “y” the output, the nonlinear properties of this function means that
the rate of change is slower at the extremes and faster in the centre. Put plainly,
we want the neuron to “make its mind up” instead of indecisively staying in the middle.
:param x: Float
:param Derivative: Boolean
:return: Float
"""
if (derivative):
return x * (1 - x) # Derivative using the chain rule.
else:
return 1 / (1 + np.exp(-x))
# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])
#initialize variables
learning_rate = 0.1
epoch = 50000 #number or iterations basically - One round of forward and back propagation is called an epoch
# get the second element from the numpy array shape field to detect the count of features for input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3 #number of hidden layer neurons
output_layer_neurons = 1 #number of output layer neurons
#init weight & bias
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(1, hidden_layer_neurons)
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(1, output_layer_neurons)
for i in range(epoch):
#forward propagation
hidden_layer_input_temp = np.dot(input_data, weights_hidden) #matrix dot product to adjust for weights in the layer
hidden_layer_input = hidden_layer_input_temp + bias_hidden #adjust for bias
hidden_layer_activations = sigmoid_function(hidden_layer_input) #use the activation function
output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
output_layer_input = output_layer_input_temp + bias_output
output = sigmoid_function(output_layer_input) #final output
#backpropagation (where adjusting of the weights happens)
error = ideal_output - output #error gradient
if (i % 1000 == 0):
print("Error: {}".format(np.mean(abs(error))))
#use derivatives to compute slope of output and hidden layers
slope_output_layer = sigmoid_function(output, derivative=True)
slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)
#calculate deltas
delta_output = error * slope_output_layer
error_hidden_layer = delta_output.dot(weights_output.T) #calculates the error at hidden layer
delta_hidden = error_hidden_layer * slope_hidden_layer
#change the weights
weights_output += hidden_layer_activations.T.dot(delta_output) * learning_rate
bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
weights_hidden += input_data.T.dot(delta_hidden) * learning_rate
bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate

Implementation of normalizing flows in Keras

I've been trying to implement a simple version of normalizing flows with Keras, as explained in this paper: https://arxiv.org/pdf/1505.05770.pdf
My problem is that the loss is always -infinity, and I can't get what I did wrong. Can anybody help me ?
Here is the procedure:
the encoder generates vectors of size latent_dim = 100. These are z_mean, z_log_var, u, b, w.
From z_mean and z_log_var, using the reparametrization trick I can sample z_0 ~ N(z_mean, z_log_var).
Then I can compute log(abs(1+u.T.dot(psi(z_0))))
Then I can compute z_1
Here is the code for those four steps:
def sampling(args):
z_mean, z_log_var = args
# sample epsilon according to N(O,I)
epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
std=epsilon_std)
# generate z0 according to N(z_mean, z_log_var)
z0 = z_mean + K.exp(z_log_var / 2) * epsilon
print('z0', z0)
return z0
def logdet_loss(args):
z0, w, u, b = args
b2 = K.squeeze(b, 1)
beta = K.sum(tf.multiply(w, z0), 1) # <w|z0>
linear_trans = beta + b2 # <w|z0> + b
# change u2 so that the transformation z0->z1 is invertible
alpha = K.sum(tf.multiply(w, u), 1) #
diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
gamma = K.sum(tf.multiply(w,u2), 1)
logdet = K.log(K.abs(1 + (1 - K.square(K.tanh(linear_trans)))*gamma) + 1e-6)
return logdet
def transform_z0(args):
z0, w, u, b = args
b2 = K.squeeze(b, 1)
beta = K.sum(tf.multiply(w, z0), 1)
# change u2 so that the transformation z0->z1 is invertible
alpha = K.sum(tf.multiply(w, u), 1)
diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
u2 = u + K.dot(diag1, w) / K.sum(K.square(w)+1e-7)
diag2 = tf.diag(K.tanh(beta + b2))
# generate z1
z1 = z0 + K.dot(diag2,u2)
return z1
Then here is the loss (where logdet is defined above)
def vae_loss(x, x_decoded_mean):
xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var, eps=1e-6), -1)
ln_pz1 = K.sum(log_stdnormal(z1), -1)
result = K.mean(logdet + ln_pz1 + xent_loss - ln_q0z0)
return result
I modified the Keras tutorial on VAE here : https://github.com/sbaurdlp/keras-iaf-mnist
If someone is interested to look...
Strangely adding more layers doesn't improve performance, and I can't see what is wrong in the code
As I couldn't make it work, I have tried to implement the normalizing flow described in this paper: Improved Variational Inference
with Inverse Autoregressive Flow
However I still meet the same problem of diverging loss (towards -infinity), which makes no sense. There must be a problem with my implementation.
Here are the important parts:
# the encoder
h = encoder_block(x) # a convnet taking proteins as input (matrices of size 400x22), I don't describe it since it isn't very important
z_log_var = Dense(latent_dim)(h)
z_mean = Dense(latent_dim)(h)
h_ = Dense(latent_dim)(h)
encoder = Model(x, [z_mean,z_log_var, h_])
# the latent variables (only one transformation to keep it simple)
latent_input = Input(shape=(latent_dim, 2), batch_shape=(batch_size, latent_dim, 2))
hl = Convolution1D(1, filter_length, activation="relu", border_mode="same")(latent_input)
hl = Reshape((latent_dim,))(hl)
mean_1 = Dense(latent_dim)(hl)
std_1 = Dense(latent_dim)(hl)
latent_model = Model(latent_input, [mean_1, std_1])
# the decoder
decoder_input = Input((latent_dim,), batch_shape=(batch_size, latent_dim))
decoder=decoder_block() # a convnet that I don't describe
x_decoded_mean = decoder(decoder_input)
generator = Model(decoder_input, x_decoded_mean)
# the VAE
z_mean, z_log_var, other = encoder(vae_input)
eps = Lambda(sample_eps, name='sample_eps')([z_mean, z_log_var, other])
z0 = Lambda(sample_z0, name='sample_z0')([z_mean, z_log_var, eps])
l = Lambda(sample_l, name='sample_l')([eps, z_log_var])
mean, std = latent_model(merge([Reshape((latent_dim,1))(z0), Reshape((latent_dim,1))(other)], mode="concat", concat_axis=-1))
z = Lambda(transform_z0)([z0, mean, std])
l = Lambda(transform_l)([l, std])
x_decoded_mean = generator(z)
vae = Model(vae_input, x_decoded_mean)
# and here is the loss
def vae_loss(x, x_decoded_mean):
xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var), -1)
ln_pz1 = K.sum(log_stdnormal(z), -1)
result = K.mean(l + ln_pz1 + xent_loss - ln_q0z0)
return result
Here are the utils functions I use above in the Lambda layers:
def sample_eps(args):
# sample epsilon according to N(O,I)
epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
std=epsilon_std)
return epsilon
def sample_z0(args):
z_mean, z_log_var, epsilon = args
# generate z0 according to N(z_mean, z_log_var)
z0 = z_mean + K.exp(z_log_var / 2) * epsilon
return z0
def sample_l(args):
epsilon, z_log_var = args
l = -0.5*K.sum(z_log_var + epsilon**2 + K.log(2*math.pi), -1)
return l
def transform_z0(args):
z0, mean, std = args
z = z0
sig_std = K.sigmoid(std)
z *= sig_std
z += (1-sig_std)*mean
return z
def transform_l(args):
l, std = args
sig_std = K.sigmoid(std)
l -= K.sum(K.log(sig_std+1e-8), -1)
return l

Use of scipy.optimize.minimize in Neural Network

Trying to use Backpropagation Neural Network for multiclass classification. I have found this code and try to adapt it. It is based on the lections of Machine Learning in Coursera from Andrew Ng.
I don't understand exactly the implementation of scipy.optimize.minimize function here. It is used just once in the code. Is it iteratively updating the weights of the network? How can I visualize (plot) it's performance to see when it converges?
Using this function what parameters I can adjust to achieve better performance? I found here a list common parameters:
Number of neurons in the hidden layer: this is hidden_layer_size=25 in my code
Learning rate: can I still adjust that using built-in minimization function?
Momentum: is that reg_lambda=0 in my case? Regularization parameter to avoid overfitting, right?
Epoch: maxiter=500
Here is my training data (target class is in the last column):
65535, 3670, 65535, 3885, -0.73, 1
65535, 3962, 65535, 3556, -0.72, 1
65535, 3573, 65535, 3529, -0.61, 1
3758, 3123, 4117, 3173, -0.21, 0
3906, 3119, 4288, 3135, -0.28, 0
3750, 3073, 4080, 3212, -0.26, 0
65535, 3458, 65535, 3330, -0.85, 2
65535, 3315, 65535, 3306, -0.87, 2
65535, 3950, 65535, 3613, -0.84, 2
65535, 32576, 65535, 19613, -0.35, 3
65535, 16657, 65535, 16618, -0.37, 3
65535, 16657, 65535, 16618, -0.32, 3
The dependencies are so obvious, I think it should be so easy to classify it...
But results are terrible. I get accuracy of 0.6 to 0.8. This is absolutely inappropriate for my application. I know I need more data normally, but I would be already happy when I could at least fit the training data (without taking into account potential overfitting)
Here is the code:
import numpy as np
from scipy import optimize
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
import math
class NN_1HL(object):
def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
self.reg_lambda = reg_lambda
self.epsilon_init = epsilon_init
self.hidden_layer_size = hidden_layer_size
self.activation_func = self.sigmoid
self.activation_func_prime = self.sigmoid_prime
self.method = opti_method
self.maxiter = maxiter
def sigmoid(self, z):
return 1 / (1 + np.exp(-z))
def sigmoid_prime(self, z):
sig = self.sigmoid(z)
return sig * (1 - sig)
def sumsqr(self, a):
return np.sum(a ** 2)
def rand_init(self, l_in, l_out):
self.epsilon_init = (math.sqrt(6))/(math.sqrt(l_in + l_out))
return np.random.rand(l_out, l_in + 1) * 2 * self.epsilon_init - self.epsilon_init
def pack_thetas(self, t1, t2):
return np.concatenate((t1.reshape(-1), t2.reshape(-1)))
def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
t1_start = 0
t1_end = hidden_layer_size * (input_layer_size + 1)
t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
return t1, t2
def _forward(self, X, t1, t2):
m = X.shape[0]
ones = None
if len(X.shape) == 1:
ones = np.array(1).reshape(1,)
else:
ones = np.ones(m).reshape(m,1)
# Input layer
a1 = np.hstack((ones, X))
# Hidden Layer
z2 = np.dot(t1, a1.T)
a2 = self.activation_func(z2)
a2 = np.hstack((ones, a2.T))
# Output layer
z3 = np.dot(t2, a2.T)
a3 = self.activation_func(z3)
return a1, z2, a2, z3, a3
def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
m = X.shape[0]
Y = np.eye(num_labels)[y]
_, _, _, _, h = self._forward(X, t1, t2)
costPositive = -Y * np.log(h).T
costNegative = (1 - Y) * np.log(1 - h).T
cost = costPositive - costNegative
J = np.sum(cost) / m
if reg_lambda != 0:
t1f = t1[:, 1:]
t2f = t2[:, 1:]
reg = (self.reg_lambda / (2 * m)) * (self.sumsqr(t1f) + self.sumsqr(t2f))
J = J + reg
return J
def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
m = X.shape[0]
t1f = t1[:, 1:]
t2f = t2[:, 1:]
Y = np.eye(num_labels)[y]
Delta1, Delta2 = 0, 0
for i, row in enumerate(X):
a1, z2, a2, z3, a3 = self._forward(row, t1, t2)
# Backprop
d3 = a3 - Y[i, :].T
d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2)
Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])
Theta1_grad = (1 / m) * Delta1
Theta2_grad = (1 / m) * Delta2
if reg_lambda != 0:
Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f
return self.pack_thetas(Theta1_grad, Theta2_grad)
def fit(self, X, y):
num_features = X.shape[0]
input_layer_size = X.shape[1]
num_labels = len(set(y))
theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
thetas0 = self.pack_thetas(theta1_0, theta2_0)
options = {'maxiter': self.maxiter}
_res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method,
args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)
np.savetxt("weights_t1.txt", self.t1, newline="\n")
np.savetxt("weights_t2.txt", self.t2, newline="\n")
def predict(self, X):
return self.predict_proba(X).argmax(0)
def predict_proba(self, X):
_, _, _, _, h = self._forward(X, self.t1, self.t2)
return h
##################
# IR data #
##################
values = np.loadtxt('infrared_data.txt', delimiter=', ', usecols=[0,1,2,3,4])
targets = np.loadtxt('infrared_data.txt', delimiter=', ', dtype=(int), usecols=[5])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(values, targets, test_size=0.4)
nn = NN_1HL()
nn.fit(values, targets)
print("Accuracy of classification: "+str(accuracy_score(y_test, nn.predict(X_test))))
In the given code scipy.optimize.minimize iteratively minimizes function given it's derivative (Jacobi's matrix). According to the documentation, use can specify callback argument to a function that will be called after each iteration — this will let you measure performance, though I'm not sure if it'll let you halt the optimization process.
All parameters you listed are hyperparameters, it's hard to optimize them directly:
Number of neurons in the hidden layer is a discrete valued parameters, and, thus, is not optimizable via gradient techniques. Moreover, it affects NeuralNet architecture, so you can't optimize it while training the net. What you can do, though, is to use some higher-level routine to search for possible options, like exhaustive grid search with cross-validation (for example look at GridSearchCV) or other tools for hyperparameter search (hyperopt, spearmint, MOE, etc).
Learning rate does not seem to be customizable for most of the optimization methods available. But, actually, learning rate in gradient descent is just a Newton's method with Hessian "approximated" by 1 / eta I — diagonal matrix with inverted learning rates on the major diagonal. So you can try hessian-based methods with this heuristic.
Momentum is completely unrelated to regularization. It's an optimization technique, and, since you use scipy for optimization, is unavailable for you.

Categories

Resources