My Standard Neural Network Cost is Going Up - python

I have been trying to create a simple standard neural network from scratch, but I can't seem to get it to work normally. Sometimes the cost skyrockets; other times it doesn't change at all. I'm not sure what the problem is, and it would be really helpful if someone could take a look.
I have all of the information on GitHub. If any more information is needed, kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
    # get the number of full batches
    num_batches_norem = data.shape[1] // batch_size
    if data.shape[1] % batch_size == 0:
        remainder_quantity = 0
    else:
        remainder_size = data.shape[1] % batch_size
        remainder_quantity = 1
    num_batches = num_batches_norem + remainder_quantity
    changer = 0
    for mb in range(num_batches_norem):
        current_batch = data[:, changer:changer + batch_size]
        changer += batch_size
        yield current_batch
    for last_mb in range(remainder_quantity):
        last_batch = data[:, changer:changer + remainder_size]
        yield last_batch
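A quick sanity check of batcher on dummy data (my example, not from the repo): a (10, 7) array with batch_size=3 should come out as two full batches plus one remainder column.

import numpy as np

data = np.arange(70).reshape(10, 7)  # 10 features x 7 examples, hypothetical shapes
for batch in batcher(data, batch_size=3):
    print(batch.shape)
# (10, 3)
# (10, 3)
# (10, 1)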
Function for g(z):
def activationer(a, z):
    # ACTIVATION FUNCTIONS

    # Sigmoid Activation Function
    def sigmoid(z):
        g = scipy.special.expit(z)
        return g

    # Tanh (Hyperbolic Tangent Function) Activation Function
    def tanh(z):
        g = (np.exp(z) - np.exp(-1 * z)) / (np.exp(z) + np.exp(-1 * z))
        return g

    # ReLU (Rectified Linear Unit) Activation Function
    def ReLU(z):
        g = np.maximum(0, z)
        return g

    # Leaky ReLU (Leaky Rectified Linear Unit) Activation Function
    def Leaky_ReLU(z):
        g = np.maximum(0.01 * z, z)
        return g

    # Softmax Activation Function
    def softmax(z):
        z_exp = np.exp(z)
        g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
        return g

    if a == "sigmoid":
        res = sigmoid(z)
    elif a == "tanh":
        res = tanh(z)
    elif a == "relu":
        res = ReLU(z)
    elif a == "leaky relu":
        res = Leaky_ReLU(z)
    elif a == "softmax":
        res = softmax(z)
    return res
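A tiny illustrative check of activationer (my example, not from the repo): softmax is applied over axis 0, so each column of the output should sum to 1.

import numpy as np

z = np.array([[1.0, 2.0],
              [3.0, 0.5],
              [0.2, 0.1]])
probs = activationer("softmax", z)
print(probs.sum(axis=0))  # both values should be very close to 1.0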
NN Class:
class DeepNeuralNetwork:
    def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
        assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
        # Assign inputs to the self object
        self.n_x = n_x
        self.n_h = n_h
        self.n_y = n_y
        self.nl = nl
        self.activations = activations
        self.alpha = alpha

    # Initialize Parameters
    def initialize_parameters(self):
        n_x = self.n_x
        n_h = self.n_h
        n_y = self.n_y
        activations = self.activations
        parameters = []
        for l in range(self.nl):
            np.random.seed(8)
            if l == 0:
                if activations[l] in ("relu", "leaky relu"):
                    parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))])  # aka W1, b1 | He
                else:
                    parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))])  # aka W1, b1 | Xavier
            elif l == self.nl - 1:
                if activations[l] in ("relu", "leaky relu"):
                    parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))])  # aka WL, bL | He
                else:
                    parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))])  # aka WL, bL | Xavier
            else:
                if activations[l] in ("relu", "leaky relu"):
                    parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))])  # hidden params | He
                else:
                    parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))])  # hidden params | Xavier
        return parameters

    # Forward Propagation
    def forward_propagation(self, parameters, input_data):
        batch_size = input_data.shape[1]  # Get the number of examples in the batch
        caches = []
        self.caches = caches
        current_activation = input_data  # Set the first activation - A0 - to the input
        caches.append(current_activation)
        for l in range(self.nl):
            W, b = parameters[l][0], parameters[l][1]  # Get weights and biases for the current layer
            Z = W @ current_activation + b  # Compute the linear activation
            current_activation = activationer(self.activations[l], Z)  # Compute the full activation
            caches.append(current_activation)
        return current_activation

    # Compute Cost
    def compute_cost(self, yhat, y):
        batch_size = y.shape[1]  # Get the number of examples in the batch
        cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + (1 - y) * np.log(1 - yhat)))  # Compute the cross-entropy cost
        cost = np.squeeze(cost)  # Turn [[17]] into 17
        return cost

    # Backward Propagation
    def backward_propagation(self, parameters, y):
        caches = self.caches
        batch_size = y.shape[1]
        grads = []
        for l in reversed(range(1, self.nl + 1)):
            if l == self.nl:
                dZ = caches[l] - y
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
            else:
                dA = parameters[l][0].T @ dZ
                dZ = dA * np.multiply(caches[l], (1 - caches[l]))
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
        return grads

    # Update Parameters
    def update_parameters(self, parameters, gradients):
        for l in range(self.nl):
            parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
            parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
        return parameters
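One caveat worth flagging in compute_cost: if yhat ever reaches exactly 0 or 1, np.log returns -inf and the cost jumps to inf/nan, which looks exactly like a skyrocketing cost. A minimal sketch of a clipped variant (my addition, not part of the repo; the epsilon trick mirrors what the cat/dog question further down does with AL + epsilon):

import numpy as np

def compute_cost_clipped(yhat, y, eps=1e-8):
    # Cross-entropy cost with yhat clipped away from 0 and 1 to avoid log(0)
    batch_size = y.shape[1]
    yhat = np.clip(yhat, eps, 1 - eps)
    cost = -(1 / batch_size) * np.sum(y * np.log(yhat) + (1 - y) * np.log(1 - yhat))
    return float(np.squeeze(cost))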
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
for i, j in zip(train_x_batched, train_y_batched):
yhat = dnn.forward_propagation(params, i)
cost = dnn.compute_cost(yhat, j)
grads = dnn.backward_propagation(params, j)
params = update_parameters(params, grads)
print(cost) # This usually starts going down then skyrockets. Even if I lower the learning rate to 0.00001
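A standard way to pin down this kind of divergence is a numerical gradient check: compare what backward_propagation returns against a finite-difference estimate on a tiny batch. A sketch of mine, assuming the class above (it perturbs one weight matrix at a time and is slow, so use only a handful of examples):

import numpy as np

def numeric_grad_W(dnn, params, x, y, layer, eps=1e-6):
    # Finite-difference gradient of the cost w.r.t. params[layer][0]
    W = params[layer][0]
    grad = np.zeros_like(W)
    for idx in np.ndindex(*W.shape):
        old = W[idx]
        W[idx] = old + eps
        cost_plus = dnn.compute_cost(dnn.forward_propagation(params, x), y)
        W[idx] = old - eps
        cost_minus = dnn.compute_cost(dnn.forward_propagation(params, x), y)
        W[idx] = old  # restore the original weight
        grad[idx] = (cost_plus - cost_minus) / (2 * eps)
    return grad

# A large relative error between this and the matching entry of
# dnn.backward_propagation(params, y) points straight at the backprop bug.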
Thanks :)

Related

Asymmetric loss function from pytorch to tensorflow

I am converting the asymmetric loss function from this paper (ASL), which is written in PyTorch here: ASL github.
The PyTorch code looks like this:
class AsymmetricLoss(nn.Module):
    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
        super(AsymmetricLoss, self).__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
        self.eps = eps

    def forward(self, x, y):
        """
        Parameters
        ----------
        x: input logits
        y: targets (multi-label binarized vector)
        """
        # Calculating Probabilities
        x_sigmoid = torch.sigmoid(x)
        xs_pos = x_sigmoid
        xs_neg = 1 - x_sigmoid
        # Asymmetric Clipping
        if self.clip is not None and self.clip > 0:
            xs_neg = (xs_neg + self.clip).clamp(max=1)
        # Basic CE calculation
        los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
        los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
        loss = los_pos + los_neg
        # Asymmetric Focusing
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(False)
            pt0 = xs_pos * y
            pt1 = xs_neg * (1 - y)  # pt = p if t > 0 else 1-p
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
            one_sided_w = torch.pow(1 - pt, one_sided_gamma)
            if self.disable_torch_grad_focal_loss:
                torch.set_grad_enabled(True)
            loss *= one_sided_w
        return -loss.sum()
And here is what I have changed it to in TensorFlow:
class AsymmetricLoss(tf.keras.losses.Loss):
    def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_grad_focal_loss=False):
        super(AsymmetricLoss, self).__init__()
        self.gamma_neg = gamma_neg
        self.gamma_pos = gamma_pos
        self.clip = clip
        self.eps = eps
        self.disable_grad_focal_loss = disable_grad_focal_loss

    def call(self, y_true, y_pred):
        y_pred_sigmoid = tf.sigmoid(y_pred)
        y_preds_pos = y_pred_sigmoid
        y_preds_neg = 1 - y_pred_sigmoid
        y_true = tf.cast(y_true, tf.float32)
        if self.clip is not None and self.clip > 0:
            y_preds_neg = tf.clip_by_value(y_preds_neg + self.clip, clip_value_max=1, clip_value_min=-np.inf)
        print(y_preds_neg.dtype)
        print(y_preds_pos.dtype)
        print(y_true.dtype)
        los_pos = y_true * tf.math.log(tf.clip_by_value(y_preds_pos, clip_value_min=self.eps, clip_value_max=np.inf))
        los_neg = (1 - y_true) * tf.math.log(tf.clip_by_value(y_preds_neg, clip_value_min=self.eps, clip_value_max=np.inf))
        loss = los_pos + los_neg
        if self.gamma_neg > 0 or self.gamma_pos > 0:
            if self.disable_grad_focal_loss:
                # torch.set_grad_enabled(False) -- no TensorFlow equivalent used here
                pass
            pt0 = y_preds_pos * y_true
            pt1 = y_preds_neg * (1 - y_true)
            pt = pt0 + pt1
            one_sided_gamma = self.gamma_pos * y_true + self.gamma_neg * (1 - y_true)
            one_sided_w = tf.math.pow(1 - pt, one_sided_gamma)
            if self.disable_grad_focal_loss:
                # torch.set_grad_enabled(True)
                pass
            loss *= one_sided_w
        y_true = tf.cast(y_true, tf.int64)
        return -tf.math.reduce_sum(loss)
There are two things in particular I am unsure about; a sketch addressing both follows below.
The first is that the PyTorch code turns the gradient on and off with torch.set_grad_enabled(bool), for which I cannot find an equivalent in TensorFlow (tf.stop_gradient doesn't seem to be the same). I have also read that the gradient is not calculated in the call method, so it may not matter, but I am not sure about this.
The second is the sum in the return: does TensorFlow sum up the loss by itself, so that it is incorrect to have the sum in the loss function?
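Regarding both points, here is a minimal sketch (my reading, treat it as an assumption rather than the paper authors' port; the class name is hypothetical): tf.stop_gradient blocks gradient flow through the focal-weight term, which is what the torch.set_grad_enabled(False) block achieves in the original, and declaring a reduction in the constructor lets call return the per-element loss so that Keras, not the loss body, does the summing.

import tensorflow as tf

class AsymmetricLossSketch(tf.keras.losses.Loss):
    def __init__(self, gamma_neg=4.0, gamma_pos=1.0, clip=0.05, eps=1e-8):
        # Declare the reduction explicitly: call() returns the per-element
        # loss and Keras performs the single SUM, avoiding a double reduction.
        super().__init__(reduction=tf.keras.losses.Reduction.SUM)
        self.gamma_neg, self.gamma_pos, self.clip, self.eps = gamma_neg, gamma_pos, clip, eps

    def call(self, y_true, y_pred):
        y_true = tf.cast(y_true, tf.float32)
        xs_pos = tf.sigmoid(y_pred)
        xs_neg = 1.0 - xs_pos
        if self.clip is not None and self.clip > 0:
            xs_neg = tf.clip_by_value(xs_neg + self.clip, 0.0, 1.0)
        loss = y_true * tf.math.log(tf.maximum(xs_pos, self.eps)) \
             + (1.0 - y_true) * tf.math.log(tf.maximum(xs_neg, self.eps))
        # The focal weight is computed as a constant w.r.t. the gradient;
        # tf.stop_gradient plays the role of the set_grad_enabled(False) block.
        pt = xs_pos * y_true + xs_neg * (1.0 - y_true)
        gamma = self.gamma_pos * y_true + self.gamma_neg * (1.0 - y_true)
        weight = tf.stop_gradient(tf.pow(1.0 - pt, gamma))
        return -loss * weight  # per-element; Keras reduces with SUM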

XOR classification using multilayer perceptron

I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:
perceptron.py
import random
import numpy as np


class Perceptron:
    def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
        self.layer_sizes = layer_sizes
        if len(self.layer_sizes) - 1 != len(activation_functions):
            raise ValueError("...")
        self.activation_functions = activation_functions
        self.cost_function_deriv = cost_function_deriv
        self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
        self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]

    def train(self, training_data, test_data, epochs, mini_batch_size, lr):
        test_data_len = len(test_data)
        for epoch in range(epochs):
            random.shuffle(training_data)
            mini_batches = [training_data[x: x + mini_batch_size]
                            for x in range(0, len(training_data), mini_batch_size)]
            for mini_batch in mini_batches:
                mb_len = len(mini_batch)
                gradient_weights = [np.zeros(w.shape) for w in self.weights]
                gradient_biases = [np.zeros(b.shape) for b in self.biases]
                for x, y in mini_batch:
                    delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
                    gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
                    gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
                self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
                self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
            correct_answers = self.how_many_correct_answers(test_data)
            print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")

    def backpropagation(self, x, y):
        gradient_b = [np.zeros(b.shape) for b in self.biases]
        gradient_w = [np.zeros(w.shape) for w in self.weights]
        activations = [x]
        prev_activation = x
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
            activations.append(current_activation)
            prev_activation = current_activation
        delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
        gradient_b[-1] = delta
        gradient_w[-1] = np.dot(delta, activations[-2].T)
        for i in range(2, len(self.layer_sizes)):
            z = activations[-i]
            act_der = self.activation_functions[-i + 1].deriv(z)
            delta = np.dot(self.weights[-i + 1].T, delta) * act_der
            gradient_b[-i] = delta
            gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
        # Normal indexing variant:
        # for i in range(len(self.layers) - 1, 0, -1):
        #     z = activations[i]
        #     act_der = self.activation_functions[i].deriv(z)
        #     delta = np.dot(self.weights[i].T, delta) * act_der
        #     gradient_b[i - 1] = delta
        #     gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
        return gradient_b, gradient_w

    def feedforward(self, a):
        for i, (b, w) in enumerate(zip(self.biases, self.weights)):
            a = self.activation_functions[i](np.dot(w, a) + b)
        return a

    def how_many_correct_answers(self, test_data):
        k = 0
        for x, y in test_data:
            y_predict = np.argmax(self.feedforward(x))
            print(y_predict, y)
            k += int(y_predict == y)
        return k
main.py
from copy import deepcopy

import numpy as np

from perceptron import Perceptron


class Sigmoid:
    out_min_max = [0, 1]

    def __call__(self, x):
        return 1. / (1. + np.exp(-x))

    def deriv(self, y):
        # t = self(x)
        # return t * (1. - t)
        return y * (1. - y)


def cost_function_derivative(y_predict, y_true_label):
    label_vector = np.zeros(y_predict.shape)
    label_vector[y_true_label] = 1.0
    return y_predict - label_vector


def main():
    training_data = np.asarray([[[[0], [0]], 0],
                                [[[0], [1]], 1],
                                [[[1], [0]], 1],
                                [[[1], [1]], 0]])
    layer_sizes = [2, 8, 2]
    model = Perceptron(layer_sizes=layer_sizes,
                       activation_functions=[Sigmoid(), Sigmoid()],
                       cost_function_deriv=cost_function_derivative)
    model.train(deepcopy(training_data),
                deepcopy(training_data),
                epochs=10000,
                mini_batch_size=4,
                lr=0.01)


if __name__ == '__main__':
    main()
The final output, in the format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0
If I remove random.shuffle(training_data), then:
1 0
0 1
1 1
0 0
But never 0 1 1 0.
I figured it out. It requires the following:
mini_batch_size=1
# random.shuffle(training_data) -- commented out
epochs=10000
And it's better to use:
lr=0.1
The result in most cases is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0
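Putting those changes together, the training call in main.py becomes (a sketch of exactly the settings listed above):

model.train(deepcopy(training_data),
            deepcopy(training_data),
            epochs=10000,
            mini_batch_size=1,  # was 4
            lr=0.1)             # was 0.01
# with random.shuffle(training_data) commented out in Perceptron.train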

Increasing validation loss from the very beginning

I've been doing a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remains. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special

# Data Preprocessing (the same for the dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train', 'cats_train']
epsilon = 1e-8
for category in categories:
    path_animal = os.path.join(path, category)
    for img in os.listdir(path_animal):
        try:
            img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
            new_img_array = cv2.resize(img_array, (img_size, img_size))
            flattened_img_array = new_img_array.reshape(img_size * img_size)
            train_set.append([flattened_img_array, categories.index(category)])
        except:
            continue

import random
random.shuffle(train_set)

X_train = []
Y_train = []
for sample in train_set:
    X_train.append(sample[0])
    Y_train.append(sample[1])
X_train = (np.array(X_train).T) / 255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))

def create_mini_batches(X, Y, mini_batch_size):
    m = X.shape[1]
    mini_batches = []
    num_mini_batches = m // mini_batch_size
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    for i in range(num_mini_batches):
        select_X = shuffled_X[:, mini_batch_size * i : mini_batch_size * (i + 1)]
        select_Y = shuffled_Y[:, mini_batch_size * i : mini_batch_size * (i + 1)]
        mini_batch = (select_X, select_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        last_X = shuffled_X[:, mini_batch_size * num_mini_batches : m]
        last_Y = shuffled_Y[:, mini_batch_size * num_mini_batches : m]
        last_mini_batch = (last_X, last_Y)
        mini_batches.append(last_mini_batch)
    return mini_batches

def initialize_parameters(layers_dims):
    L = len(layers_dims)  # number of layers (including input layer), in this case L=4
    parameters = {}
    for l in range(1, L):  # range(1,4)
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2 / layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

def sigmoid(Z):
    A = special.expit(Z)
    return A, Z

def relu(Z):
    A = np.maximum(0.01 * Z, Z)
    return A, Z

def forward_propagation(X, parameters):
    caches = []  # list containing Z for every node
    A = X
    L = int(len(parameters) / 2)
    for l in range(1, L):
        A_prev = A
        W = parameters['W' + str(l)]
        b = parameters['b' + str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z)  # activation_cache contains z[l]
        linear_cache = (A_prev, W, b)  # linear_cache contains A[l-1], W[l], b[l]
        cache = (linear_cache, activation_cache)
        caches.append(cache)
    W = parameters['W' + str(L)]
    b = parameters['b' + str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    linear_cache = (A, W, b)
    cache = (linear_cache, activation_cache)
    caches.append(cache)
    return AL, caches

def compute_cost(AL, Y, parameters, lambd):
    m = Y.shape[1]  # number of examples
    L = int(len(parameters) / 2)  # [6400,100,20,1] L=3 (0,1,2)
    reg_cost = 0
    for l in range(L):
        W = parameters['W' + str(l + 1)]
        reg_cost += np.sum(np.square(W))
    J = (-1 / m) * (np.sum(Y * np.log(AL + epsilon) + (1 - Y) * np.log(1 - AL + epsilon))) + (1 / m) * (lambd / 2) * reg_cost
    J = np.squeeze(J)
    return J

def linear_backward(dZ, linear_cache, lambd):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1 / m) * np.dot(dZ, A_prev.T) + (lambd / m) * W
    db = (1 / m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

def relu_gradient(Z):
    dZ = np.where(Z > 0, 1, 0.01)
    return dZ

def sigmoid_gradient(Z):
    dZ = special.expit(Z) * (1 - special.expit(Z))
    return dZ

def linear_activation_backward(dA, cache, lambd, A, Y, activation):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dA * relu_gradient(activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    elif activation == 'sigmoid':
        dZ = A - Y
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    return dA_prev, dW, db

def L_model_backward(AL, Y, caches, lambd):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    cache_final_layer = caches[L - 1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters

def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size)  # [(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
    costs_train = []
    costs_dev = []
    parameters = initialize_parameters(layers_dims)
    AL_dev, caches_dev = forward_propagation(X_dev, parameters)
    J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
    costs_dev.append(J_dev)
    for i in range(num_epoch):
        for mini_batch in mini_batches:
            (minibatch_X, minibatch_Y) = mini_batch
            AL, caches = forward_propagation(minibatch_X, parameters)
            J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = L_model_backward(AL, minibatch_Y, caches, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)
        if i % 10 == 0:
            costs_train.append(J_train)
            AL_dev, caches_dev = forward_propagation(X_dev, parameters)
            J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
            costs_dev.append(J_dev)
        if i % 100 == 0:
            print("Cost after epoch %i: %f" % (i, J_train))
        learning_rate = learning_rate * (k ** (i / 50))
    plt.plot(np.squeeze(costs_train), 'r')
    plt.plot(np.squeeze(costs_dev), 'b')
    plt.ylabel('cost')
    plt.xlabel('epochs (per thirties)')
    plt.show()
    return parameters, costs_train, costs_dev

parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful to anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice on how to address it? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When the validation loss starts to increase from the very beginning, as in the images you added, it means there is something wrong with the model.
It's not clear what it is, as you didn't show your model.
You could check the following links, which may help:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for over-fitting in the TF Tutorial
or add your full code.

Neural Network from scratch does not converge

I'm trying to implement a neural network from scratch in order to gain better insight into it, and I've run into a weird problem. When I use the ReLU function as the activation for the hidden layers, the model does not converge, whereas it does converge once the sigmoid function is used. Here is my vanilla code: when you change the first two layers' activation function from relu to sigmoid, you can see that it converges, though it may have a problem sometimes. Where could the problem be? It's been three days and I still couldn't find it, though I did find some little bugs. Thanks in advance.
Here is the toy dataset I've been using (just paste it to where this code is located):
Dataset
import numpy as np
import pandas as pd


class NeuralNetwork():
    def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
        # hyperparameters
        self.epoch = epoch
        self.alpha = alpha
        self.algorithm = algorithm
        # parameters
        self.params = {}
        self.layer_no = 1
        # logs
        self.cost_vals = []

    def createLayer(self, size, activation_func, randomness=True):
        if randomness == True:
            self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
        else:
            self.params["W" + str(self.layer_no)] = np.zeros(size)
        self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
        self.params["func" + str(self.layer_no)] = activation_func
        self.layer_no += 1

    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    def relu(self, X):
        return np.maximum(X, 0) * 0.01

    def tanh(self, X):
        return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))

    def derivative_sigmoid(self, X):
        der_x = self.sigmoid(X)
        return der_x * (1 - der_x)

    def derivative_relu(self, X):
        X[X <= 0] = 0
        X[X > 0] = 1
        return X

    def derivative_tanh(self, X):
        tanhx = self.tanh(X)
        return 1 - np.power(tanhx, 2)

    def activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.sigmoid(Zl)
        elif act_func_name == "relu":
            return self.relu(Zl)
        elif act_func_name == "tanh":
            return self.tanh(Zl)

    def derivative_activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.derivative_sigmoid(Zl)
        elif act_func_name == "relu":
            return self.derivative_relu(Zl)
        elif act_func_name == "tanh":
            return self.derivative_tanh(Zl)

    def train(self, X, Y):
        m = Y.shape[0]  # number of training examples
        self.params["A0"] = X
        self.params["Z0"] = None
        for i in range(self.epoch):
            # forward prop
            for l in range(1, self.layer_no):  # 1,2,3
                Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)]  # linear function of a layer, vectorized
                Al = self.activation_function(Zl, self.params["func" + str(l)])  # activated form of Zl
                self.params["Z" + str(l)] = Zl
                self.params["A" + str(l)] = Al
            # cost function
            cost_val = -1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
            cost_val = np.squeeze(cost_val)
            if i % 500 == 0:
                print(cost_val)
                self.cost_vals.append(cost_val)
            # backward prop
            dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))  # gradient of the last layer's A
            for l in reversed(range(1, self.layer_no)):  # 3,2,1
                dZl = np.multiply(dAl, self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)]))  # gradient of layer l's Z
                dAl1 = np.dot(self.params["W" + str(l)].T, dZl)  # gradient of the previous layer's A
                dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T)  # gradient of W in layer l
                dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True)  # gradient of b in layer l
                # update parameters
                self.params["W" + str(l)] -= self.alpha * dWl
                self.params["b" + str(l)] -= self.alpha * dbl
                dAl = dAl1  # carry the previous layer's A-gradient into the next step of backprop


def iris_data():
    from sklearn.model_selection import train_test_split
    datas = pd.read_csv('iris_nn.data').to_numpy()
    X = datas[:, 0:4].astype(float)
    Y = datas[:, 4:5]
    Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
    return X_train.T, Y_train.T


X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5, 4), "relu")
model.createLayer((7, 5), "relu")
model.createLayer((1, 7), "sigmoid")
model.train(X, Y)

nan in numpy dot product

Below is a piece of code implementing a 2-layer neural network in numpy for a fitting problem. The activation function is ReLU, the training algorithm is Adam, and the loss function is half of the mean squared error. However, when the batch size is large (e.g. 10000), the loss becomes nan after some iterations. The problem doesn't happen for small batch sizes. Could anyone help me explain why this may happen? (The data come from a MATLAB workspace: 6_final_mapping_pos.mat.) A small diagnostic sketch follows after the code.
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = sio.loadmat('6_final_mapping_pos.mat')


class NeuralNetwork():
    def __init__(self):
        self.batch_size = 256
        self.input_size = 5  # input dimension is 5
        self.hidden_layer1_size = 50
        self.output_size = 1  # output dimension is 1
        self.train_data = data['training_data_pos']
        self.df_traindata = pd.DataFrame(data=self.train_data)
        self.validation_data_num = 17142
        self.valid_data = data['validation_data_pos']
        self.df_validdata = pd.DataFrame(data=self.valid_data)
        # weight initialization for ReLU
        self.W1 = np.random.randn(self.input_size, self.hidden_layer1_size) / np.sqrt(self.input_size / 2)
        self.W2 = np.random.randn(self.hidden_layer1_size, self.output_size) / np.sqrt(self.hidden_layer1_size / 2)
        # bias initialization
        self.b1 = np.zeros((1, self.hidden_layer1_size))
        self.b2 = np.zeros((1, self.output_size))
        self.lr = 5e-3   # learning rate
        self.reg = 1e-3  # regularization strength
        self.p = 0.5     # dropout probability = 1-p
        self.first_moment_W3 = 0
        self.second_moment_W3 = 0
        self.first_moment_W2 = 0
        self.second_moment_W2 = 0
        self.first_moment_W1 = 0
        self.second_moment_W1 = 0
        self.first_moment_b3 = 0
        self.second_moment_b3 = 0
        self.first_moment_b2 = 0
        self.second_moment_b2 = 0
        self.first_moment_b1 = 0
        self.second_moment_b1 = 0

    def feedforward(self):
        # randomly selected mini-batch as inputs
        self.df_sample_t = self.df_traindata.sample(n=self.batch_size)
        self.train_input = self.df_sample_t.as_matrix(columns=[0, 1, 2, 3, 4])
        self.train_output = self.df_sample_t.as_matrix(columns=[5])
        # hidden layer with the dropout technique
        self.hidden_layer1 = np.maximum(0, (np.dot(self.train_input, self.W1) + self.b1))
        U1 = np.random.rand(*self.hidden_layer1.shape) < self.p  # drop mask
        self.hidden_layer1 *= U1  # drop!
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.train_output) ** 2) / self.batch_size
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss

    def backpropagation(self):
        self.d_output = (self.output_layer - self.train_output) / self.batch_size
        # data part
        self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
        self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
        self.dhidden1 = np.dot(self.d_output, self.W2.T)
        self.dhidden1[self.hidden_layer1 <= 0] = 0
        self.dW1 = np.dot(self.train_input.T, self.dhidden1)
        self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)
        # regularization part
        self.dW2 = self.dW2 + self.reg * self.W2
        self.dW1 = self.dW1 + self.reg * self.W1

    def Adam(self, epoch, dW2, dW1, db2, db1):
        beta1 = 0.9
        beta2 = 0.99
        self.first_moment_W2 = beta1 * self.first_moment_W2 + (1 - beta1) * dW2
        self.second_moment_W2 = beta2 * self.second_moment_W2 + (1 - beta2) * dW2 * dW2
        first_unbias_W2 = self.first_moment_W2 / (1 - beta1 ** epoch)
        second_unbias_W2 = self.second_moment_W2 / (1 - beta2 ** epoch)
        self.W2 -= self.lr * first_unbias_W2 / (np.sqrt(second_unbias_W2) + 1e-7)
        self.first_moment_W1 = beta1 * self.first_moment_W1 + (1 - beta1) * dW1
        self.second_moment_W1 = beta2 * self.second_moment_W1 + (1 - beta2) * dW1 * dW1
        first_unbias_W1 = self.first_moment_W1 / (1 - beta1 ** epoch)
        second_unbias_W1 = self.second_moment_W1 / (1 - beta2 ** epoch)
        self.W1 -= self.lr * first_unbias_W1 / (np.sqrt(second_unbias_W1) + 1e-7)
        self.first_moment_b2 = beta1 * self.first_moment_b2 + (1 - beta1) * db2
        self.second_moment_b2 = beta2 * self.second_moment_b2 + (1 - beta2) * db2 * db2
        first_unbias_b2 = self.first_moment_b2 / (1 - beta1 ** epoch)
        second_unbias_b2 = self.second_moment_b2 / (1 - beta2 ** epoch)
        self.b2 -= self.lr * first_unbias_b2 / (np.sqrt(second_unbias_b2) + 1e-7)
        self.first_moment_b1 = beta1 * self.first_moment_b1 + (1 - beta1) * db1
        self.second_moment_b1 = beta2 * self.second_moment_b1 + (1 - beta2) * db1 * db1
        first_unbias_b1 = self.first_moment_b1 / (1 - beta1 ** epoch)
        second_unbias_b1 = self.second_moment_b1 / (1 - beta2 ** epoch)
        self.b1 -= self.lr * first_unbias_b1 / (np.sqrt(second_unbias_b1) + 1e-7)

    def validation(self):
        self.df_sample_v = self.df_validdata.sample(n=self.validation_data_num)
        self.valid_input = self.df_sample_v.as_matrix(columns=[0, 1, 2, 3, 4])
        self.valid_output = self.df_sample_v.as_matrix(columns=[5])
        self.hidden_layer1 = np.maximum(0, np.dot(self.valid_input, self.W1) + self.b1) * self.p
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.valid_output) ** 2) / self.validation_data_num
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss


NN = NeuralNetwork()
num_iterations = 120
training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])
t = 1
T = np.array([range(1, num_iterations)]).T

# Training and validation
while t < num_iterations:
    NN.feedforward()
    NN.backpropagation()
    NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
    training_loss = np.append(training_loss, NN.total_loss)
    if t % 10 == 0:
        print("training: " + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    NN.validation()
    validation_loss = np.append(validation_loss, NN.total_loss)
    validation_dataloss = np.append(validation_dataloss, NN.data_loss)
    if t % 10 == 0:
        print("validation: " + "total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    t += 1
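Not an answer, but a small diagnostic that helps localize this kind of blow-up (a sketch of mine, assuming the NeuralNetwork class above): after each iteration, find the first tensor that has gone non-finite, which usually separates an exploding forward pass from an exploding Adam update.

import numpy as np

def first_nonfinite(named_arrays):
    # Return the name of the first array containing nan/inf, else None
    for name, arr in named_arrays.items():
        if not np.all(np.isfinite(arr)):
            return name
    return None

# Example use inside the training loop, right after NN.Adam(...):
# bad = first_nonfinite({"W1": NN.W1, "W2": NN.W2,
#                        "dW1": NN.dW1, "dW2": NN.dW2,
#                        "loss": np.array([NN.total_loss])})
# if bad is not None:
#     print(f"first non-finite value at iteration {t}: {bad}")
#     break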
