Implementation of normalizing flows in Keras - python

I've been trying to implement a simple version of normalizing flows with Keras, as explained in this paper: https://arxiv.org/pdf/1505.05770.pdf
My problem is that the loss is always -infinity, and I can't figure out what I did wrong. Can anybody help me?
Here is the procedure:
the encoder generates vectors of size latent_dim = 100. These are z_mean, z_log_var, u, b, w.
From z_mean and z_log_var, using the reparametrization trick, I can sample z_0 ~ N(z_mean, exp(z_log_var)).
Then I can compute the log-determinant term log(abs(1 + u.T.dot(psi(z_0)))), where psi(z) = (1 - tanh(w.T.dot(z) + b)**2) * w.
Then I can compute z_1 = z_0 + u * tanh(w.T.dot(z_0) + b).
Here is the code for those four steps:
def sampling(args):
    z_mean, z_log_var = args
    # sample epsilon according to N(0, I)
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              std=epsilon_std)
    # generate z0 according to N(z_mean, z_log_var)
    z0 = z_mean + K.exp(z_log_var / 2) * epsilon
    print('z0', z0)
    return z0
def logdet_loss(args):
    z0, w, u, b = args
    b2 = K.squeeze(b, 1)
    beta = K.sum(tf.multiply(w, z0), 1)  # <w|z0>
    linear_trans = beta + b2             # <w|z0> + b
    # change u to u2 so that the transformation z0 -> z1 is invertible
    alpha = K.sum(tf.multiply(w, u), 1)  # <w|u>
    diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
    u2 = u + K.dot(diag1, w) / K.sum(K.square(w) + 1e-7)
    gamma = K.sum(tf.multiply(w, u2), 1)
    logdet = K.log(K.abs(1 + (1 - K.square(K.tanh(linear_trans))) * gamma) + 1e-6)
    return logdet
def transform_z0(args):
    z0, w, u, b = args
    b2 = K.squeeze(b, 1)
    beta = K.sum(tf.multiply(w, z0), 1)
    # change u to u2 so that the transformation z0 -> z1 is invertible
    alpha = K.sum(tf.multiply(w, u), 1)
    diag1 = tf.diag(K.softplus(alpha) - 1 - alpha)
    u2 = u + K.dot(diag1, w) / K.sum(K.square(w) + 1e-7)
    diag2 = tf.diag(K.tanh(beta + b2))
    # generate z1
    z1 = z0 + K.dot(diag2, u2)
    return z1
Then here is the loss (where logdet is defined above):
def vae_loss(x, x_decoded_mean):
    xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
    ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var, eps=1e-6), -1)
    ln_pz1 = K.sum(log_stdnormal(z1), -1)
    result = K.mean(logdet + ln_pz1 + xent_loss - ln_q0z0)
    return result
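A note on signs (my reading of Eq. 15 in the paper, not the asker's code): the free energy to minimize is E[log q0(z0)] - E[log|det|] - E[log p(x, z1)], so logdet and ln_pz1 would enter with negative signs; flipping them as above could drive the loss toward -infinity. A sketch with the same helper names (logdet, z0, z1 assumed in scope):
def vae_loss_flow(x, x_decoded_mean):
    # negative ELBO with one flow step: xent + log q0(z0) - log|det| - log p(z1)
    xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
    ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var, eps=1e-6), -1)
    ln_pz1 = K.sum(log_stdnormal(z1), -1)
    return K.mean(xent_loss + ln_q0z0 - logdet - ln_pz1)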

I modified the Keras tutorial on VAEs here: https://github.com/sbaurdlp/keras-iaf-mnist
If someone is interested in taking a look...
Strangely, adding more layers doesn't improve performance, and I can't see what is wrong in the code.

As I couldn't make it work, I have tried to implement the normalizing flow described in this paper: Improved Variational Inference with Inverse Autoregressive Flow.
However, I still run into the same problem of the loss diverging (towards -infinity), which makes no sense. There must be a problem with my implementation.
Here are the important parts:
# the encoder
h = encoder_block(x) # a convnet taking proteins as input (matrices of size 400x22), I don't describe it since it isn't very important
z_log_var = Dense(latent_dim)(h)
z_mean = Dense(latent_dim)(h)
h_ = Dense(latent_dim)(h)
encoder = Model(x, [z_mean,z_log_var, h_])
# the latent variables (only one transformation to keep it simple)
latent_input = Input(shape=(latent_dim, 2), batch_shape=(batch_size, latent_dim, 2))
hl = Convolution1D(1, filter_length, activation="relu", border_mode="same")(latent_input)
hl = Reshape((latent_dim,))(hl)
mean_1 = Dense(latent_dim)(hl)
std_1 = Dense(latent_dim)(hl)
latent_model = Model(latent_input, [mean_1, std_1])
# the decoder
decoder_input = Input((latent_dim,), batch_shape=(batch_size, latent_dim))
decoder=decoder_block() # a convnet that I don't describe
x_decoded_mean = decoder(decoder_input)
generator = Model(decoder_input, x_decoded_mean)
# the VAE
z_mean, z_log_var, other = encoder(vae_input)
eps = Lambda(sample_eps, name='sample_eps')([z_mean, z_log_var, other])
z0 = Lambda(sample_z0, name='sample_z0')([z_mean, z_log_var, eps])
l = Lambda(sample_l, name='sample_l')([eps, z_log_var])
mean, std = latent_model(merge([Reshape((latent_dim,1))(z0), Reshape((latent_dim,1))(other)], mode="concat", concat_axis=-1))
z = Lambda(transform_z0)([z0, mean, std])
l = Lambda(transform_l)([l, std])
x_decoded_mean = generator(z)
vae = Model(vae_input, x_decoded_mean)
# and here is the loss
def vae_loss(x, x_decoded_mean):
    xent_loss = K.mean(objectives.categorical_crossentropy(x, x_decoded_mean), -1)
    ln_q0z0 = K.sum(log_normal2(z0, z_mean, z_log_var), -1)
    ln_pz1 = K.sum(log_stdnormal(z), -1)
    result = K.mean(l + ln_pz1 + xent_loss - ln_q0z0)
    return result
Here are the utility functions used above in the Lambda layers:
def sample_eps(args):
    # sample epsilon according to N(0, I)
    epsilon = K.random_normal(shape=(batch_size, latent_dim), mean=0.,
                              std=epsilon_std)
    return epsilon

def sample_z0(args):
    z_mean, z_log_var, epsilon = args
    # generate z0 according to N(z_mean, z_log_var)
    z0 = z_mean + K.exp(z_log_var / 2) * epsilon
    return z0

def sample_l(args):
    epsilon, z_log_var = args
    l = -0.5 * K.sum(z_log_var + epsilon**2 + K.log(2*math.pi), -1)
    return l

def transform_z0(args):
    z0, mean, std = args
    z = z0
    sig_std = K.sigmoid(std)
    z *= sig_std
    z += (1 - sig_std) * mean
    return z

def transform_l(args):
    l, std = args
    sig_std = K.sigmoid(std)
    l -= K.sum(K.log(sig_std + 1e-8), -1)
    return l
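Since the transformation here is elementwise, its Jacobian is diagonal and the log-det is sum(log sigmoid(std)), which is exactly what transform_l subtracts. A small standalone numpy sketch (an illustration with hypothetical names, separate from the model) that checks this numerically:
import numpy as np

def iaf_step(z0, m, s):
    # z = sigmoid(s) * z0 + (1 - sigmoid(s)) * m, diagonal Jacobian sigmoid(s)
    sig = 1.0 / (1.0 + np.exp(-s))
    z = sig * z0 + (1.0 - sig) * m
    logdet = np.sum(np.log(sig + 1e-8))
    return z, logdet

rng = np.random.default_rng(0)
z0, m, s = rng.normal(size=(3, 4))
z, logdet = iaf_step(z0, m, s)
# finite-difference check of one diagonal entry of the Jacobian
eps = 1e-6
num = (iaf_step(z0 + eps * np.eye(4)[0], m, s)[0][0] - z[0]) / eps
print(num, 1.0 / (1.0 + np.exp(-s[0])))  # should agree to ~1e-6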

Related

Increasing training losses in a neural network

I am trying to implement a neural network from scratch, but the loss increases while training. The neural network consists of three layers (input, hidden, output). I have added regularisation in the loss function compute_cost.
Here is the code.
import numpy as np
import random
import matplotlib.pyplot as plt  # needed by plot_loss

# define neural network class
class Neuralnetwork():
    def __init__(self, X, Y, regulization_L2=False, regulization_L1=False, dropout_forward_bool=False):  # initialize parameters
        self.size = [32*32, 100, 10]  # sizes of input, hidden and output layer
        self.parameters = {}          # initialize parameters
        self.learning_rate = 0.0001   # learning rate
        self.num_iterations = 100
        self.X = None
        self.Y = None
        self.loss = []
        self.regulization_l2 = regulization_L2
        self.regulization_l1 = regulization_L1
        self.lambda_ = 0.000001
        self.dropout = 0.02
        self.dropout_forword_bool = dropout_forward_bool

    def initialize_parameters(self):  # initialize parameters for neural network
        np.random.seed(2)
        self.input_layer_size = self.size[0]   # size of input layer
        self.hidden_layer_size = self.size[1]  # size of hidden layer
        self.output_layer_size = self.size[2]  # size of output layer
        self.parameters['W1'] = np.random.randn(self.hidden_layer_size, self.input_layer_size) * 0.01  # initialize weights for hidden layer
        self.parameters['b1'] = np.zeros((self.hidden_layer_size, 1))  # initialize bias for hidden layer
        self.parameters['W2'] = np.random.randn(self.output_layer_size, self.hidden_layer_size) * 0.01  # initialize weights for output layer
        self.parameters['b2'] = np.zeros((self.output_layer_size, 1))  # initialize bias for output layer

    def sigmoid(self, Z):  # sigmoid function
        return 1 / (1 + np.exp(-Z))

    def relu(self, Z):  # relu function
        return np.maximum(0, Z)

    def sigmoid_backward(self, dA, Z):  # derivative of sigmoid function
        sig = self.sigmoid(Z)
        return dA * sig * (1 - sig)

    def relu_backward(self, dA, Z):  # derivative of relu function
        dZ = np.array(dA, copy=True)
        dZ[Z <= 0] = 0
        return dZ

    def forward_propagation(self, X):  # forward propagation function for neural network
        # dropout: zero out some weights and the corresponding bias
        if self.dropout_forword_bool == True and self.dropout > 0:
            for node in range(self.parameters['W1'].shape[0]):
                for weight in range(self.parameters['W1'].shape[1]):
                    if random.uniform(0, 1) < self.dropout:
                        self.parameters['W1'][node][weight] = 0
                        self.parameters['b1'][node] = 0
        # calculate Z1, A1, Z2, A2
        Z1 = np.dot(self.parameters['W1'], X.T) + self.parameters['b1']  # calculate Z1
        A1 = self.relu(Z1)
        Z2 = np.dot(self.parameters['W2'], A1) + self.parameters['b2']
        A2 = self.sigmoid(Z2)
        return A2, Z2, A1, Z1

    def L1_regularization(self, weights):  # L1 regularization function to avoid overfitting
        sum = 0
        # calculate sum of absolute values of weights
        for i in range(weights.shape[0]):
            for j in range(weights.shape[1]):
                sum = sum + abs(weights[i][j])
        ans = self.lambda_ * sum  # multiply sum with lambda value to set the regularization strength
        return ans

    def L2_regularization(self, weights):
        sum = 0
        # calculate sum of squares of weights
        for i in range(weights.shape[0]):
            for j in range(weights.shape[1]):
                sum = sum + ((weights[i][j])**2)
        ans = self.lambda_ * sum
        return ans

    def compute_cost(self, A2, Y):  # compute cost function
        m = len(Y)
        if self.regulization_l2 == True:  # if L2 regularization is true
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross-entropy loss formula
            cost = np.squeeze(cost)  # squeeze the cost value to remove the extra dimension
            #cost = cost + (self.L2_regularization(self.parameters['W1']) + self.L2_regularization(self.parameters['W2']))/m  # add L2 regularization cost to the loss function
            return cost
        elif self.regulization_l1 == True:  # if L1 regularization is true
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross-entropy loss formula
            cost = np.squeeze(cost)
            #cost = cost + (self.L1_regularization(self.parameters['W1']) + self.L1_regularization(self.parameters['W2']))/m  # add L1 regularization cost to the loss function
            return cost
        else:
            cost = -1/m * np.sum(np.multiply(Y, np.log(A2)) + np.multiply((1 - Y), np.log(1 - A2)))  # cross-entropy loss without regularization
            cost = np.squeeze(cost)
            return cost

    def backward_propagation(self, X, Y, A2, Z2, A1, Z1):
        # does the backpropagation and calculates the gradients of weights and bias
        m = len(Y)
        dZ2 = A2 - Y.T  # dZ2 is the derivative of the cost function with respect to Z2
        if self.regulization_l1 == True:
            # calculate dW2 and add L1 regularization to it
            dW2 = np.dot(dZ2, A1.T) / m + self.L1_regularization(self.parameters['W2'])/m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m + self.L1_regularization(self.parameters['W1'])/m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2
        elif self.regulization_l2 == True:
            # calculate dW2 and add L2 regularization to it
            dW2 = np.dot(dZ2, A1.T) / m + self.L2_regularization(self.parameters['W2'])/m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m + self.L2_regularization(self.parameters['W1'])/m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2
        else:
            # calculate dW2 and db2 without regularization
            dW2 = np.dot(dZ2, A1.T) / m
            db2 = np.sum(dZ2, axis=1, keepdims=True) / m
            dA = np.dot(self.parameters['W2'].T, dZ2)
            dZ1 = dA * self.relu_backward(dA, Z1)
            dW1 = np.dot(dZ1, X) / m
            db1 = np.sum(dZ1, axis=1, keepdims=True) / m
            return dW1, dW2, db1, db2

    def update_parameters(self, dW1, dW2, db1, db2):
        # update the parameters using the gradients calculated in backward propagation
        self.parameters['W1'] = self.parameters['W1'] - self.learning_rate * dW1
        self.parameters['W2'] = self.parameters['W2'] - self.learning_rate * dW2
        self.parameters['b1'] = self.parameters['b1'] - self.learning_rate * db1
        self.parameters['b2'] = self.parameters['b2'] - self.learning_rate * db2

    def fit(self, X, Y):
        # fit function is used to train the model
        self.initialize_parameters()  # initialize the parameters
        for i in range(self.num_iterations):  # loop over the number of iterations
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost = self.compute_cost(A2, Y)
            dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1)  # backward propagation to calculate the gradients
            # save the gradients in the parameters dictionary
            self.parameters['dW1'] = dW1
            self.parameters['dW2'] = dW2
            self.parameters['db1'] = db1
            self.parameters['db2'] = db2
            # update the parameters
            self.update_parameters(dW1, dW2, db1, db2)
            self.loss.append(cost)
            if i % 10 == 0:
                print(f'Cost after iteration {i}: {cost}')

    def predict(self, X):
        # predict function is used to predict the output for the given input
        A2, _, _, _ = self.forward_propagation(X)  # forward propagation
        predictions = np.round(A2)  # round the output to get the predictions
        return predictions

    def plot_loss(self):  # plot the loss
        plt.plot(self.loss)
        plt.xlabel('Iterations')
        plt.ylabel('Loss')
        plt.show()

    def accuracy(self, X, Y):
        predictions = self.predict(X)
        return (np.sum((predictions == Y) / Y.shape[1])) * 100

    def check_gradient(self, X, Y):  # check the gradient
        # calculate the gradient using backpropagation
        A2, Z2, A1, Z1 = self.forward_propagation(X)
        epsilon = 1e-7
        dW1, dW2, db1, db2 = self.backward_propagation(X, Y, A2, Z2, A1, Z1)  # backward propagation to calculate the gradients
        for i in range(1, 3):
            # calculate the gradient using the central difference formula
            self.parameters[f'W{i}'] += epsilon  # add epsilon to the weights
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost_plus = self.compute_cost(A2, Y)
            self.parameters[f'W{i}'] -= 2 * epsilon  # subtract epsilon from the weights
            A2, Z2, A1, Z1 = self.forward_propagation(X)  # forward propagation
            cost_minus = self.compute_cost(A2, Y)
            gradient = (cost_plus - cost_minus) / (2 * epsilon)  # numerical gradient
            numerator = np.linalg.norm(gradient - self.parameters[f'dW{i}'])  # norm of the difference between the gradients
            denominator = np.linalg.norm(gradient) + np.linalg.norm(self.parameters[f'dW{i}'])  # sum of the norms of the gradients
            difference = numerator / denominator
            if difference > 1e-7:
                print("There is a mistake in the backward propagation! difference = " + str(difference))
When I call the model:
print('--------------- model with L2 Regularization --------------------')
model_L2 = Neuralnetwork(x_train, y_train,regulization_L2=True, regulization_L1=False,dropout_forward_bool=False)
model_L2.fit(x_train, y_train)
Here is the output:
--------------- model with L2 Regularization --------------------
Cost after iteration 0: 1430.3418627178976
Cost after iteration 10: 1446.5808681718697
Cost after iteration 20: 1459.8884483327824
How do I correct this, and why is the loss increasing?
Your example is missing x_train, so I couldn't reproduce your behavior - but it's not a tragedy, because I made something that might help you: a small notebook in which I implemented the backpropagation algorithm as an example, to understand it myself. I made 5 versions, going step by step from a simple numpy implementation to one with PyTorch and CUDA. For your problem, versions 1 to 3 of the implementation are the interesting ones. I hope this helps you.
backpropagationnotebook
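Independently of the notebook, a per-entry finite-difference check against backward_propagation is a quick way to localize the bug. A sketch (a hypothetical helper added for illustration; it assumes dropout is off so the forward pass is deterministic):
import numpy as np

def grad_check_entry(model, X, Y, name, i, j, eps=1e-5):
    # compare the backprop gradient of one weight entry with a central difference
    A2, Z2, A1, Z1 = model.forward_propagation(X)
    dW1, dW2, db1, db2 = model.backward_propagation(X, Y, A2, Z2, A1, Z1)
    analytic = {'W1': dW1, 'W2': dW2}[name][i, j]
    W = model.parameters[name]
    W[i, j] += eps
    cost_plus = model.compute_cost(model.forward_propagation(X)[0], Y)
    W[i, j] -= 2 * eps
    cost_minus = model.compute_cost(model.forward_propagation(X)[0], Y)
    W[i, j] += eps  # restore the original weight
    numeric = (cost_plus - cost_minus) / (2 * eps)
    return numeric, analytic

# usage: the two numbers should agree to ~1e-6 if backprop is correct
# num, ana = grad_check_entry(model_L2, x_train, y_train, 'W1', 0, 0)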

Simple Numpy MNIST Classifier Outputting Equal Probabilities

I'm working on building an MNIST classifier from scratch in numpy; however, after training, my model always outputs a 0.1 probability for every digit.
I'm using softmax + cross-entropy loss, and outputting an even distribution of probabilities seems to make my loss go to 0 without the accuracy increasing.
So the model is learning how to minimise loss, just in the wrong way XD
Here is my forward and backward pass:
def onehot(i):
    return np.eye(10)[i]

def softmax(x):
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp)

def relu(x):
    return np.maximum(x, 0)

def loss(x, y):
    return -np.sum(y * np.log(x))

def forward_backward(x, y, w1, w2):
    # forward
    l1 = x @ w1
    r = relu(l1)
    l2 = r @ w2
    out = softmax(l2)
    # loss
    l = loss(out, y)
    # backward
    dl2 = out - y
    dw2 = r.T @ dl2
    dr = dl2 @ w2.T
    dl1 = dr >= 0
    dw1 = x.T @ dl1
    return out, l, dw1, dw2
And my training:
w1 = np.random.randn(784, 128)
w2 = np.random.randn(128, 10)
losses = []
for i in range(batches):
    x = images[i]
    y = onehot(labels[i])
    out, l, dw1, dw2 = forward_backward(x, y, w1, w2)
    w1 -= dw1 * lr
    w2 -= dw2 * lr
    losses.append(l)
At the end, printing out gives me: array([[0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1, 0.1]])
and my loss decreases until it is exactly 0 while all outputs stay even.
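One thing worth checking (a guess, using the variable names above): dl1 as written is a boolean mask rather than a gradient, so dw1 carries no error signal. The usual ReLU backward step multiplies the incoming gradient by the mask:
# sketch of the standard ReLU backward step, same names as above
dl1 = dr * (l1 > 0)   # gradient flows only where the ReLU was active
dw1 = x.T @ dl1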

Using Backward Propagation in fmin_cg

I am trying to build an ANN in Python, and I've been able to get as far as the forward pass, but I run into a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is defined as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem, because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single return value (the J value from my forward pass) and cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understanding how to use it and get the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    '''
    Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
    '''
    Theta1 = reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))], (hidden_layer_size, (input_layer_size+1)))
    Theta2 = reshape(nn_params[(hidden_layer_size*(input_layer_size+1)):], (num_labels, (hidden_layer_size+1)))
    m = X.shape[0]
    n = X.shape[1]
    # forward pass
    y_eye = eye(num_labels)
    y_new = np.zeros((y.shape[0], num_labels))
    for z in range(y.shape[0]):
        y_new[z, :] = y_eye[int(y[z])-1]
    y = y_new
    a_1 = c_[ones((m, 1)), X]
    z_2 = tr(Theta1.dot(tr(a_1)))
    a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
    a_2 = c_[ones((a_2.shape[0], 1)), a_2]
    a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))
    J_reg = lam/(2.*m) * (sum(sum(Theta1[:, 1:]**2)) + sum(sum(Theta2[:, 1:]**2)))
    J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg
    # Backprop
    d_3 = a_3 - y
    d_2 = d_3.dot(Theta2[:, 1:]) * sigmoidGradient(z_2)
    Theta1_grad = 1./m * tr(d_2).dot(a_1)
    Theta2_grad = 1./m * tr(d_3).dot(a_2)
    # Add regularization
    Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + lam*1.0/m*Theta1[:, 1:]
    Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + lam*1.0/m*Theta2[:, 1:]
    # Unroll gradients
    grad = tr(c_[Theta1_grad.swapaxes(1, 0).reshape(1, -1), Theta2_grad.swapaxes(1, 0).reshape(1, -1)])
    return J, grad

def nn_train(X, y, lam=1.0, hidden_layer_size=10):
    '''
    Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
    Return parameters Theta1, Theta2.
    '''
    # NN input and output layer sizes
    input_layer_size = X.shape[1]
    num_labels = unique(y).shape[0]  # output layer
    # Initialize NN parameters
    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    # Unroll parameters
    initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
    initial_nn_params = reshape(initial_nn_params, (len(initial_nn_params),))  # flatten into 1-d array
    # Find and print initial cost:
    J_init = nnCostFunction(initial_nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam)[0]
    grad_init = nnCostFunction(initial_nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam)[1]
    print('Initial J cost: ' + str(J_init))
    print('Initial grad cost: ' + str(grad_init))
    # Implement backprop and train network, run fmin
    print('Training Neural Network...')
    print('fmin results:')
    nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol=0.001, maxiter=40, full_output=1)[0, 1]
    Theta1 = reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))], (hidden_layer_size, (input_layer_size+1)))
    Theta2 = reshape(nn_params[(hidden_layer_size*(input_layer_size+1)):], (num_labels, (hidden_layer_size+1)))
    return Theta1, Theta2
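Since the question is how to hand grad to the optimizer: fmin_cg takes the gradient through its fprime argument, and scipy.optimize.minimize accepts jac=True when the objective returns a (cost, gradient) pair. A sketch under the definitions above (the gradient here is a column vector, so it is flattened with ravel()):
import numpy as np
from scipy import optimize

args = (input_layer_size, hidden_layer_size, num_labels, X, y, lam)

# Option 1: split cost and gradient for fmin_cg
cost_fn = lambda t: nnCostFunction(t, *args)[0]
grad_fn = lambda t: np.asarray(nnCostFunction(t, *args)[1]).ravel()
nn_params = optimize.fmin_cg(cost_fn, initial_nn_params, fprime=grad_fn,
                             gtol=0.001, maxiter=40)

# Option 2: one function returning both, passed to minimize with jac=True
def cost_and_grad(t):
    J, g = nnCostFunction(t, *args)
    return J, np.asarray(g).ravel()

res = optimize.minimize(cost_and_grad, initial_nn_params, jac=True,
                        method='CG', options={'gtol': 0.001, 'maxiter': 40})
nn_params, cost = res.x, res.fun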

Using Gradient Descent on Linear Regression Yields an Incorrect Bias

I've got a toy example set up of a linear regression model with one input variable and one output variable. The problem I'm encountering is that the output for the bias is far off from the generated data. If I manually set the bias, then it will produce a weight and bias close enough to the original.
I've written two pieces of code: gen_data, which generates data, and gradientDescent2, which performs the gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points, 1))
    y = np.zeros(shape=(num_points, 1))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp) + np.random.normal(scale=3.0)
    return (x, y)

# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        loss = (theta * x + bias) - y
        cost = np.mean(loss**2) / 2
        # print('Iteration {} | Cost: {}'.format(i, cost))
        grad_b = np.mean(loss)
        grad_t = np.mean(loss*x)
        # updates
        bias -= learning_rate * grad_b
        theta -= learning_rate * grad_t
    return (theta, bias)
1. If you want to use batch updates, don't set your batch_size equal to your sample size. (I also believe that batch update is not very suitable for this case.)
2. Your gradient calculation and parameter update are incorrect; the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always try to minimize the loss, so it should be:
if loss > 0:
    bias -= learning_rate * grad_b
    theta -= learning_rate * grad_t
elif loss < 0:
    bias += learning_rate * grad_b
    theta += learning_rate * grad_t
Finally, below is the modified code, which works well.
import numpy as np
import sys

def gen_data(num_points=500, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points))
    y = np.zeros(shape=(num_points))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp)  # + np.random.normal(scale=3.0)
        # print('x:', x[i], ' y:', y[i])
    return (x, y)

def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        for j in range(len(x)):
            loss = (theta * x[j] + bias) - y[j]
            cost = np.mean(loss**2) / 2
            # print('Iteration {} | Cost: {}'.format(i, cost))
            grad_b = 1
            grad_t = x[j]
            if loss > 0:
                bias -= learning_rate * grad_b
                theta -= learning_rate * grad_t
            elif loss < 0:
                bias += learning_rate * grad_b
                theta += learning_rate * grad_t
    return (theta, bias)

def main():
    x, y = gen_data()
    ta, bias = gradientDescent2(x, y)
    print('theta:', ta)
    print('bias:', bias)

if __name__ == '__main__':
    sys.exit(int(main() or 0))
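A side note on why the modified code behaves as it does (my interpretation, not stated in the answer): the sign-based update is per-sample gradient descent on the absolute error |theta*x + bias - y|, whose subgradients are sign(err) for the bias and sign(err)*x for the slope. The if/elif branches above are equivalent to this compact form:
# compact equivalent of the if/elif update inside the inner loop
err = (theta * x[j] + bias) - y[j]
bias -= learning_rate * np.sign(err)
theta -= learning_rate * np.sign(err) * x[j]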

Use of scipy.optimize.minimize in Neural Network

Trying to use a backpropagation neural network for multiclass classification. I found this code and tried to adapt it. It is based on the lectures of the Machine Learning course on Coursera by Andrew Ng.
I don't understand exactly the implementation of the scipy.optimize.minimize function here. It is used just once in the code. Is it iteratively updating the weights of the network? How can I visualize (plot) its performance to see when it converges?
Using this function, what parameters can I adjust to achieve better performance? I found this list of common parameters:
Number of neurons in the hidden layer: this is hidden_layer_size=25 in my code
Learning rate: can I still adjust that using built-in minimization function?
Momentum: is that reg_lambda=0 in my case? Regularization parameter to avoid overfitting, right?
Epoch: maxiter=500
Here is my training data (target class is in the last column):
65535, 3670, 65535, 3885, -0.73, 1
65535, 3962, 65535, 3556, -0.72, 1
65535, 3573, 65535, 3529, -0.61, 1
3758, 3123, 4117, 3173, -0.21, 0
3906, 3119, 4288, 3135, -0.28, 0
3750, 3073, 4080, 3212, -0.26, 0
65535, 3458, 65535, 3330, -0.85, 2
65535, 3315, 65535, 3306, -0.87, 2
65535, 3950, 65535, 3613, -0.84, 2
65535, 32576, 65535, 19613, -0.35, 3
65535, 16657, 65535, 16618, -0.37, 3
65535, 16657, 65535, 16618, -0.32, 3
The dependencies are so obvious, I think it should be very easy to classify...
But the results are terrible. I get an accuracy of 0.6 to 0.8. This is absolutely inappropriate for my application. I know I would normally need more data, but I would already be happy if I could at least fit the training data (without taking potential overfitting into account).
Here is the code:
import numpy as np
from scipy import optimize
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
import math

class NN_1HL(object):
    def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
        self.reg_lambda = reg_lambda
        self.epsilon_init = epsilon_init
        self.hidden_layer_size = hidden_layer_size
        self.activation_func = self.sigmoid
        self.activation_func_prime = self.sigmoid_prime
        self.method = opti_method
        self.maxiter = maxiter

    def sigmoid(self, z):
        return 1 / (1 + np.exp(-z))

    def sigmoid_prime(self, z):
        sig = self.sigmoid(z)
        return sig * (1 - sig)

    def sumsqr(self, a):
        return np.sum(a ** 2)

    def rand_init(self, l_in, l_out):
        self.epsilon_init = (math.sqrt(6)) / (math.sqrt(l_in + l_out))
        return np.random.rand(l_out, l_in + 1) * 2 * self.epsilon_init - self.epsilon_init

    def pack_thetas(self, t1, t2):
        return np.concatenate((t1.reshape(-1), t2.reshape(-1)))

    def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
        t1_start = 0
        t1_end = hidden_layer_size * (input_layer_size + 1)
        t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
        t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
        return t1, t2

    def _forward(self, X, t1, t2):
        m = X.shape[0]
        ones = None
        if len(X.shape) == 1:
            ones = np.array(1).reshape(1,)
        else:
            ones = np.ones(m).reshape(m, 1)
        # Input layer
        a1 = np.hstack((ones, X))
        # Hidden layer
        z2 = np.dot(t1, a1.T)
        a2 = self.activation_func(z2)
        a2 = np.hstack((ones, a2.T))
        # Output layer
        z3 = np.dot(t2, a2.T)
        a3 = self.activation_func(z3)
        return a1, z2, a2, z3, a3

    def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        Y = np.eye(num_labels)[y]
        _, _, _, _, h = self._forward(X, t1, t2)
        costPositive = -Y * np.log(h).T
        costNegative = (1 - Y) * np.log(1 - h).T
        cost = costPositive - costNegative
        J = np.sum(cost) / m
        if reg_lambda != 0:
            t1f = t1[:, 1:]
            t2f = t2[:, 1:]
            reg = (self.reg_lambda / (2 * m)) * (self.sumsqr(t1f) + self.sumsqr(t2f))
            J = J + reg
        return J

    def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
        t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
        m = X.shape[0]
        t1f = t1[:, 1:]
        t2f = t2[:, 1:]
        Y = np.eye(num_labels)[y]
        Delta1, Delta2 = 0, 0
        for i, row in enumerate(X):
            a1, z2, a2, z3, a3 = self._forward(row, t1, t2)
            # Backprop
            d3 = a3 - Y[i, :].T
            d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2)
            Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
            Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])
        Theta1_grad = (1 / m) * Delta1
        Theta2_grad = (1 / m) * Delta2
        if reg_lambda != 0:
            Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
            Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f
        return self.pack_thetas(Theta1_grad, Theta2_grad)

    def fit(self, X, y):
        num_features = X.shape[0]
        input_layer_size = X.shape[1]
        num_labels = len(set(y))
        theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
        theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
        thetas0 = self.pack_thetas(theta1_0, theta2_0)
        options = {'maxiter': self.maxiter}
        _res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method,
                                 args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
        self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)
        np.savetxt("weights_t1.txt", self.t1, newline="\n")
        np.savetxt("weights_t2.txt", self.t2, newline="\n")

    def predict(self, X):
        return self.predict_proba(X).argmax(0)

    def predict_proba(self, X):
        _, _, _, _, h = self._forward(X, self.t1, self.t2)
        return h

##################
#    IR data     #
##################
values = np.loadtxt('infrared_data.txt', delimiter=', ', usecols=[0, 1, 2, 3, 4])
targets = np.loadtxt('infrared_data.txt', delimiter=', ', dtype=(int), usecols=[5])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(values, targets, test_size=0.4)
nn = NN_1HL()
nn.fit(values, targets)
print("Accuracy of classification: " + str(accuracy_score(y_test, nn.predict(X_test))))
In the given code, scipy.optimize.minimize iteratively minimizes the function given its derivative (the Jacobian). According to the documentation, you can specify a callback argument, a function that will be called after each iteration; this will let you measure performance, though I'm not sure whether it will let you halt the optimization process.
All the parameters you listed are hyperparameters, and it's hard to optimize them directly:
Number of neurons in the hidden layer is a discrete-valued parameter and thus is not optimizable via gradient techniques. Moreover, it affects the network architecture, so you can't optimize it while training the net. What you can do, though, is use some higher-level routine to search over the possible options, like exhaustive grid search with cross-validation (for example, look at GridSearchCV) or other tools for hyperparameter search (hyperopt, spearmint, MOE, etc.).
Learning rate does not seem to be customizable for most of the available optimization methods. Actually, the learning rate in gradient descent is just Newton's method with the Hessian "approximated" by (1/eta) * I, a diagonal matrix with inverted learning rates on the major diagonal. So you can try Hessian-based methods with this heuristic.
Momentum is completely unrelated to regularization. It's an optimization technique, and since you use scipy for the optimization, it is unavailable to you.
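To make the callback suggestion concrete, here is a sketch (hypothetical variable names mirroring those inside NN_1HL.fit; method='CG' is used because CG reliably calls the callback after every iteration) that records the cost so convergence can be plotted:
from scipy import optimize

history = []
def record(thetas):
    # evaluate the unregularized cost at the current parameters
    history.append(nn.function(thetas, input_layer_size, nn.hidden_layer_size,
                               num_labels, X_train, y_train, 0))

_res = optimize.minimize(nn.function, thetas0, jac=nn.function_prime,
                         method='CG', callback=record,
                         args=(input_layer_size, nn.hidden_layer_size,
                               num_labels, X_train, y_train, 0),
                         options={'maxiter': nn.maxiter})
# then e.g. plt.plot(history) shows when the optimization converges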
