I want to implement a multi-layer perceptron.
I found some code on GitHub that classifies MNIST quite well (96%). However, for some reason, it does not cope with the XOR task.
I want to understand why.
Here is the code:
perceptron.py
import random
import numpy as np
class Perceptron:
def __init__(self, *, layer_sizes, activation_functions, cost_function_deriv):
self.layer_sizes = layer_sizes
if len(self.layer_sizes) - 1 != len(activation_functions):
raise ValueError("...")
self.activation_functions = activation_functions
self.cost_function_deriv = cost_function_deriv
self.biases = [np.random.randn(y, 1) for y in layer_sizes[1:]]
self.weights = [np.random.randn(y, x) for x, y in zip(layer_sizes[:-1], layer_sizes[1:])]
def train(self, training_data, test_data, epochs, mini_batch_size, lr):
test_data_len = len(test_data)
for epoch in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[x: x + mini_batch_size]
for x in range(0, len(training_data), mini_batch_size)]
for mini_batch in mini_batches:
mb_len = len(mini_batch)
gradient_weights = [np.zeros(w.shape) for w in self.weights]
gradient_biases = [np.zeros(b.shape) for b in self.biases]
for x, y in mini_batch:
delta_gradient_biases, delta_gradient_weights = self.backpropagation(np.array(x), y)
gradient_weights = [grad + delta for grad, delta in zip(gradient_weights, delta_gradient_weights)]
gradient_biases = [grad + delta for grad, delta in zip(gradient_biases, delta_gradient_biases)]
self.weights = [w - (lr / mb_len) * grad for w, grad in zip(self.weights, gradient_weights)]
self.biases = [b - (lr / mb_len) * grad for b, grad in zip(self.biases, gradient_biases)]
correct_answers = self.how_many_correct_answers(test_data)
print(f"Epoch number {epoch}: {correct_answers}/{test_data_len} correct answers")
def backpropagation(self, x, y):
gradient_b = [np.zeros(b.shape) for b in self.biases]
gradient_w = [np.zeros(w.shape) for w in self.weights]
activations = [x]
prev_activation = x
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
current_activation = self.activation_functions[i](np.dot(w, prev_activation) + b)
activations.append(current_activation)
prev_activation = current_activation
delta = self.cost_function_deriv(activations[-1], y) * self.activation_functions[-1].deriv(activations[-1])
gradient_b[-1] = delta
gradient_w[-1] = np.dot(delta, activations[-2].T)
for i in range(2, len(self.layer_sizes)):
z = activations[-i]
act_der = self.activation_functions[-i + 1].deriv(z)
delta = np.dot(self.weights[-i + 1].T, delta) * act_der
gradient_b[-i] = delta
gradient_w[-i] = np.dot(delta, activations[-i - 1].T)
# Normal indexing variant:
# for i in range(len(self.layers) - 1, 0, -1):
# z = activations[i]
# act_der = self.activation_functions[i].deriv(z)
# delta = np.dot(self.weights[i].T, delta) * act_der
# gradient_b[i - 1] = delta
# gradient_w[i - 1] = np.dot(delta, activations[i - 1].T)
return gradient_b, gradient_w
def feedforward(self, a):
for i, (b, w) in enumerate(zip(self.biases, self.weights)):
a = self.activation_functions[i](np.dot(w, a) + b)
return a
def how_many_correct_answers(self, test_data):
k = 0
for x, y in test_data:
y_predict = np.argmax(self.feedforward(x))
print(y_predict, y)
k += int(y_predict == y)
return k
main.py
from copy import deepcopy
import numpy as np
from perceptron import Perceptron
class Sigmoid:
out_min_max = [0, 1]
def __call__(self, x):
return 1. / (1. + np.exp(-x))
def deriv(self, y):
# t = self(x)
# return t * (1. - t)
return y * (1. - y)
def cost_function_derivative(y_predict, y_true_label):
label_vector = np.zeros(y_predict.shape)
label_vector[y_true_label] = 1.0
return y_predict - label_vector
def main():
training_data = np.asarray([[[[0], [0]], 0],
[[[0], [1]], 1],
[[[1], [0]], 1],
[[[1], [1]], 0]])
layer_sizes = [2, 8, 2]
model = Perceptron(layer_sizes=layer_sizes,
activation_functions=[Sigmoid(), Sigmoid()],
cost_function_deriv=cost_function_derivative)
model.train(deepcopy(training_data),
deepcopy(training_data),
epochs=10000,
mini_batch_size=4,
lr=0.01)
if __name__ == '__main__':
main()
The final output in format 'y_predict y_true' (after each epoch):
0 0
0 1
0 1
0 0
If remove random.shuffle(training_data) then:
1 0
0 1
1 1
0 0
But not 0 1 1 0
I figured it out. It requires the following.
mini_batch_size=1
# random.shuffle(training_data) -- comment
epochs=10000
And it's better to do this:
lr=0.1
The result in most cases is obtained after ~1000 epochs:
0 0
1 1
1 1
0 0
Related
I have written a Dense Class for a FC layer in a CNN but to test if it works simply as a FC ANN, I tried to train a dataset over it but the loss never falls. I cannot seem to find the issue.
Here's the code:
class Dense:
# Constructor
def __init__(self, size, in_size, activation = 'relu'):
# Assign vars
self.size = size; self.activation = activation
# Initialize Weights and Biases
weights_dims = (size, in_size)
self.weights = np.random.standard_normal(weights_dims) * 0.01
self.biases = np.zeros([size, 1])
# Initialize Accumulators
self.sigma_acc = self.biases * 0
self.delta_acc = self.weights * 0
# ReLU Activation Function
def relu(self, arr):
return arr * (arr > 0)
# Softmax Activation Function
def softmax(self, arr):
arr -= arr.max()
exp = np.exp(arr)
return exp / np.sum(exp)
# Activation Manager Function
def activate(self, arr):
if self.activation == 'relu': return self.relu(arr)
if self.activation == 'softmax': return self.softmax(arr)
# Forward Propagation
def step(self, vec):
# Assign Input
self._in = vec
# Dot
z = np.dot(self.weights, vec) + self.biases
a = self.activate(z)
# Return
self.out = a
return self.out
# Back Propagation
def back(self, grad):
# Calculate sigma
sigma = grad if self.activation == 'softmax' else grad * (self.out > 0)
# Calculate delta
delta = np.dot(sigma, self._in.T)
# Accumulate
self.sigma_acc += sigma
self.delta_acc += delta
# Return global gradient
global_grad = np.dot(self.weights.T, sigma)
return global_grad
# Train
def update(self, alpha, batch_size):
dw = self.delta_acc / batch_size; self.delta_acc *= 0
db = self.sigma_acc / batch_size; self.sigma_acc *= 0
self.weights -= alpha * dw
self.biases -= alpha * db
To connect them as a model, I just add instances of this Dense class into a list and loop through them forwards and backwards using the step() and back() functions respectively.
Kindly inform me if you see any issue! Thanks.
This is how I created my network maybe could help you.
import numpy as np
X = np.array(([0, 0, 0], [0, 0, 1], [0, 1, 0],
[0, 1, 1], [1, 0, 0], [1, 0, 1],
[1, 1, 0], [1, 1, 1]), dtype=float)
y = np.array(([1], [0], [0], [0], [0], [0], [0], [1]), dtype=float)
xPredicted = np.array(([0, 0, 1]), dtype=float)
X = X/np.amax(X, axis=0)
xPredicted = xPredicted/np.amax(X, axis=0)
lossFile = open("Enter file", "w")
class Neural_Network(object):
def __init__(self, inputLayerSize, outputLayerSize, hiddenLayerSize):
self.inputLayerSize = inputLayerSize
self.outputLayerSize = outputLayerSize
self.hiddenLayerSize = hiddenLayerSize
self.W1 = \
np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
self.W2 = \
np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
def feedForward(self, X):
self.z = np.dot(X, self.W1)
self.z2 = self.activationSigmoid(self.z)
self.z3 = np.dot(self.z2, self.W2)
o = self.activationSigmoid(self.z3)
return o
def backwardPropagate(self, X, y, o):
self.o_error = y - o
self.o_delta = self.o_error*self.activationSigmoidPrime(o)
self.z2_error = self.o_delta.dot(self.W2.T)
self.z2_delta = self.z2_error*self.activationSigmoidPrime(self.z2)
self.W1 += X.T.dot(self.z2_delta)
self.W2 += self.z2.T.dot(self.o_delta)
def trainNetwork(self, X, y):
o = self.feedForward(X)
self.backwardPropagate(X, y, o)
def activationSigmoid(self, s):
return 1/(1+np.exp(-s))
def activationSigmoidPrime(self, s):
return s * (1 - s)
def saveSumSquaredLossList(self, i, error):
lossFile.write(str(i)+","+str(error.tolist())+"\n")
def saveWeights(self):
np.savetxt("Enter file", self.W1, fmt="%s")
np.savetxt("Enter file",
self.W2, fmt="%s")
def predictOutput(self):
print("Predicted XOR output data based on trained weights: ")
print("Expected (X1-X3); \n" + str(X))
print("Output (Y1): \n" + str(self.feedForward(xPredicted)))
myNeuralNetwork = Neural_Network(3, 1, 4)
trainingEpochs = 1000
for i in range(trainingEpochs):
print("Epoch # " + str(i) + "\n")
print("Network Input : \n" + str(X))
print("Expected Output of XOR Gate Neural Network: \n" + str(y))
print("Actual Output from XOR Gate Neural Network: \n" +
str(myNeuralNetwork.feedForward(X)))
Loss = np.mean(np.square(y - myNeuralNetwork.feedForward(X)))
myNeuralNetwork.saveSumSquaredLossList(i, Loss)
print("Sum Squared Loss: \n" + str(Loss))
print("\n")
myNeuralNetwork.trainNetwork(X, y)
myNeuralNetwork.saveWeights()
myNeuralNetwork.predictOutput()
I've been doing a very simply binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special
#Data Preprocessing (the same for dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train','cats_train']
epsilon = 1e-8
for category in categories:
path_animal = os.path.join(path, category)
for img in os.listdir(path_animal):
try:
img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
new_img_array = cv2.resize(img_array, (img_size, img_size))
flattened_img_array = new_img_array.reshape(img_size*img_size)
train_set.append([flattened_img_array, categories.index(category)])
except:
continue
import random
random.shuffle(train_set)
X_train = []
Y_train = []
for sample in train_set:
X_train.append(sample[0])
Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))
def create_mini_batches(X, Y, mini_batch_size):
m = X.shape[1]
mini_batches = []
num_mini_batches = m // mini_batch_size
permutation = list(np.random.permutation(m))
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation]
for i in range(num_mini_batches):
select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
mini_batch = (select_X, select_Y)
mini_batches.append(mini_batch)
if m % mini_batch_size != 0:
last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
last_mini_batch = (last_X, last_Y)
mini_batches.append(last_mini_batch)
return mini_batches
def initialize_parameters(layers_dims):
L = len(layers_dims) # number of layers (including input layer), in this case L=4.
parameters = {}
for l in range(1,L): # range(1,4).
parameters['W' + str(l)] = np.random.randn(layers_dims[l],layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
parameters['b' + str(l)] = np.zeros((layers_dims[l],1))
return parameters
def sigmoid(Z):
A = special.expit(Z)
return A,Z
def relu(Z):
A = np.maximum(0.01*Z, Z)
return A,Z
def forward_propagation(X, parameters):
caches = [] #list containing Z for every node
A = X
L = int(len(parameters)/2)
for l in range(1,L):
A_prev = A
W = parameters['W'+str(l)]
b = parameters['b'+str(l)]
Z = np.dot(W, A_prev) + b
A, activation_cache = relu(Z) #activation_cache contains z[l].
linear_cache = (A_prev, W, b) #linear_cache contains A[l-1], W[l], b[l].
cache = (linear_cache, activation_cache)
caches.append(cache)
W = parameters['W'+str(L)]
b = parameters['b'+str(L)]
Z = np.dot(W, A) + b
AL, activation_cache = sigmoid(Z)
linear_cache = (A, W, b)
cache = (linear_cache, activation_cache)
caches.append(cache)
return AL, caches
def compute_cost(AL, Y, parameters, lambd):
m = Y.shape[1] # number of examples
L = int(len(parameters)/2) #[6400,100,20,1] L=3 (0,1,2)
reg_cost = 0
for l in range(L):
W = parameters['W' + str(l+1)]
reg_cost += np.sum(np.square(W))
J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
J = np.squeeze(J)
return J
def linear_backward(dZ, linear_cache, lambd):
A_prev, W, b = linear_cache
m = A_prev.shape[1]
dW = (1/m) * np.dot(dZ,A_prev.T) + (lambd/m)*W
db = (1/m) * np.sum(dZ,axis=1,keepdims=True)
dA_prev = np.dot(W.T,dZ)
return dA_prev, dW, db
def relu_gradient(Z):
dZ = np.where(Z > 0, 1, 0.01)
return dZ
def sigmoid_gradient(Z):
dZ = special.expit(Z)*(1-special.expit(Z))
return dZ
def linear_activation_backward(dA, cache, lambd, A, Y, activation):
linear_cache, activation_cache = cache
if activation == 'relu':
dZ = dA * relu_gradient(activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
elif activation == 'sigmoid':
dZ = A - Y
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
grads = {}
L = len(caches)
m = AL.shape[1]
Y = Y.reshape(AL.shape)
cache_final_layer = caches[L-1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
return grads
def update_parameters(parameters, grads, learning_rate):
L = len(parameters) // 2
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size) #[(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
costs_train = []
costs_dev = []
parameters = initialize_parameters(layers_dims)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
for i in range(num_epoch):
for mini_batch in mini_batches:
(minibatch_X, minibatch_Y) = mini_batch
AL, caches = forward_propagation(minibatch_X, parameters)
J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
grads = L_model_backward(AL, minibatch_Y, caches, lambd)
parameters = update_parameters(parameters, grads, learning_rate)
if i % 10 == 0:
costs_train.append(J_train)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
if i % 100 == 0:
print ("Cost after epoch %i: %f" %(i, J_train))
learning_rate = learning_rate * (k**(i/50))
plt.plot(np.squeeze(costs_train),'r')
plt.plot(np.squeeze(costs_dev),'b')
plt.ylabel('cost')
plt.xlabel('epochs (per thirties)')
plt.show()
return parameters, costs_train, costs_dev
parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful for anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice as to how to address this issue? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When Validation Loss starts to increase in early beginning like images you added, it means that there's there is something wrong in the model.
It's not clear what's it as you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code
I have been trying to create a simple standard neural network from scratch but I can't seem to get it to work normally. Sometimes the cost skyrockets, other times the cost doesn't even change. I'm not sure what the problem is but it would be really helpful if someone could help me.
I have all of the information on Github. If any more information is needed kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
# get the number of batches
num_batches_norem = data.shape[1] // batch_size
if data.shape[1] % batch_size == 0:
remainder_quantity = 0
else:
remainder_size = data.shape[1] % batch_size
remainder_quantity = 1
num_batches = num_batches_norem + remainder_quantity
changer = 0
for mb in range(num_batches_norem):
current_batch = data[:, changer:changer + batch_size]
changer += batch_size
yield current_batch
for last_mb in range(remainder_quantity):
last_batch = data[:, changer:changer + remainder_size]
yield last_batch
Function for g(z):
def activationer(a, z):
# ACTIVATION FUNCTIONS
# Sigmoid Activation Function
def sigmoid(z):
g = scipy.special.expit(z)
return g
# Tanh (Hyperbolic Tangent Function) Activation Function
def tanh(z):
g = (np.exp(z) - np.exp(-1 * z)) / ((np.exp(z) + np.exp(-1 * z)))
return g
# ReLU (Rectified Linear Unit) Activation Function
def ReLU(z):
g = np.maximum(0, z)
return g
# Leaky ReLU (Leaky Rectified Linear Unit) Activation Function
def Leaky_ReLU(z):
g = np.maximum(0.01 * z, z)
return g
def softmax(z):
z_exp = np.exp(z)
g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
return g
if a == "sigmoid":
res = sigmoid(z)
elif a == "tanh":
res = tanh(z)
elif a == "relu":
res = ReLU(z)
elif a == "leaky relu":
res = Leaky_ReLU(z)
elif a == "softmax":
res = softmax(z)
return res
NN Class:
class DeepNeuralNetwork:
def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
# Assign inputs to the self object
self.n_x = n_x
self.n_h = n_h
self.n_y = n_y
self.nl = nl
self.activations = activations
self.alpha = alpha
# Initialize Parameters
def initialize_parameters(self):
n_x = self.n_x
n_h = self.n_h
n_y = self.n_y
activations = self.activations
parameters = []
for l in range(self.nl):
np.random.seed(8)
if l == 0:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | Xavier
else:
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | He
elif l == self.nl - 1:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | Xavier
else:
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | He
else:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))]) # hidden params | Xavier
else:
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))]) # hidden params | He
return parameters
# Forward Propagation
def forward_propagation(self, parameters, input_data):
batch_size = input_data.shape[1] # Get the amount of examples in the batch
caches = []
self.caches = caches
current_activation = input_data # Set first activation - A0 - as the input
caches.append(current_activation)
for l in range(self.nl):
W, b = parameters[l][0], parameters[l][1] # Get weights and biases for current layer
Z = W # current_activation + b # Compute the linear activation
current_activation = activationer(self.activations[l], Z) # Compute the full activation
caches.append(current_activation)
return current_activation
# Compute Cost
def compute_cost(self, yhat, y):
batch_size = y.shape[1] # Get the amount of examples in the batch
cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + ((1 - y) * (np.log(1 - yhat))))) # Compute the cross-entropy cost
cost = np.squeeze(cost) # Turn [[17]] to 17
return cost
# Backward Propagation
def backward_propagation(self, parameters, y):
caches = self.caches
batch_size = y.shape[1]
grads = []
for l in reversed(range(1, self.nl + 1)):
if l == self.nl:
dZ = caches[l] - y
dW = (1 / batch_size) * dZ # caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
else:
dA = parameters[l][0].T # dZ
dZ = dA * np.multiply(caches[l], (1 - caches[l]))
dW = (1 / batch_size) * dZ # caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
return grads
# Update Parameters
def update_parameters(self, parameters, gradients):
for l in range(self.nl):
parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
return parameters
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
for i, j in zip(train_x_batched, train_y_batched):
yhat = dnn.forward_propagation(params, i)
cost = dnn.compute_cost(yhat, j)
grads = dnn.backward_propagation(params, j)
params = update_parameters(params, grads)
print(cost) # This usually starts going down then skyrockets. Even if I lower the learning rate to 0.00001
Thanks :)
I'm trying to implement a neural network from scratch in order to gain better insight about it and I run into a weird problem. When I use Relu function for hidden layers as an activation function, the model did not converge whereas it did converge once sigmoid function is used. Here is my vanilla code: When you change first 2 layers' activation function from relu to sigmoid, you can see that it converges, though it may have a problem sometimes. Where could be the problem? It's been three days and I still couldnt find it, though I found some little bugs. Thanks in advance.
Here is the toy dataset I've been using(just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd
class NeuralNetwork():
def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
# hyperparameters
self.epoch = epoch
self.alpha = alpha
self.algorithm = algorithm
# parameters
self.params = {}
self.layer_no = 1
# logs
self.cost_vals = []
def createLayer(self, size, activation_func, randomness=True):
if randomness == True:
self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
else:
self.params["W" + str(self.layer_no)] = np.zeros(size)
self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
self.params["func" + str(self.layer_no)] = activation_func
self.layer_no += 1
def sigmoid(self, X):
return 1 / (1 + np.exp(-X))
def relu(self, X):
return np.maximum(X, 0) * 0.01
def tanh(self, X):
return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
def derivative_sigmoid(self, X):
der_x = self.sigmoid(X)
return der_x * (1 - der_x)
def derivative_relu(self, X):
X[X<=0] = 0
X[X>0] = 1
return X
def derivative_tanh(self, X):
tanhx = self.tanh(X)
return 1 - np.power(tanhx, 2)
def activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.sigmoid(Zl)
elif act_func_name == "relu":
return self.relu(Zl)
elif act_func_name == "tanh":
return self.tanh(Zl)
def derivative_activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.derivative_sigmoid(Zl)
elif act_func_name == "relu":
return self.derivative_relu(Zl)
elif act_func_name == "tanh":
return self.derivative_tanh(Zl)
def train(self, X, Y):
m = Y.shape[0] # number of training examples
self.params["A0"] = X
self.params["Z0"] = None
for i in range(self.epoch):
# forward prop
for l in range(1, self.layer_no): # 1,2,3
Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)] # linear function of a layer with vectorization
Al = self.activation_function(Zl, self.params["func" + str(l)]) # activated form of Zl
self.params["Z" + str(l)] = Zl
self.params["A" + str(l)] = Al
# cost function
cost_val = - 1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
cost_val = np.squeeze(cost_val)
if i % 500 == 0:
print(cost_val)
self.cost_vals.append(cost_val)
# backward prop
dAl = - (np.divide(Y, Al) - np.divide(1 - Y, 1 - Al)) # gradiant of last layer of A
for l in reversed(range(1, self.layer_no)): # 3,2,1
# backward prop
dZl = np.multiply(dAl,
self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)])) # gradient of layer l of Z
dAl1 = np.dot(self.params["W" + str(l)].T, dZl) # gradient of previous layer of A
dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T) # gradient of parameters W in layer l
dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True) # gradient of parameters b in layer l
# update parameters
self.params["W" + str(l)] -= self.alpha * dWl
self.params["b" + str(l)] -= self.alpha * dbl
dAl = dAl1 # assign gradient of previous layer of A to the current one so as to use it while back-propagation
def iris_data():
from sklearn.model_selection import train_test_split
datas = pd.read_csv('iris_nn.data').to_numpy()
X = datas[:, 0:4].astype(float)
Y = datas[:, 4:5]
Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
return X_train.T, Y_train.T
X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5,4), "relu")
model.createLayer((7,5), "relu")
model.createLayer((1,7), "sigmoid")
model.train(X,Y)
#
I think I've split my training data in 5 kold, is there a way for me to label/identify each of the 5 splits so I can then send each into my algorithm to calculate their own accuracies?
from sklearn.model_selection import KFold
kf = KFold(n_splits=5)
splits=kf.get_n_splits(X_train)
print(splits)
Separately, I have also tried splitting my data to then run in my logistic regression but this outputs nan % accuracy:
X_train1 = X[0:84]
Y_train1 = Y[0:84]
X_train2 = X[85:170]
Y_train2 = Y[85:170]
X_train3 = X[171:255]
Y_train3 = Y[171:255]
X_train4 = X[256:340]
Y_train4 = Y[256:340]
X_train5 = X[341:426]
Y_train5 = Y[341:426]
def Sigmoid(z):
return 1/(1 + np.exp(-z))
def Hypothesis(theta, x):
return Sigmoid(x # theta)
def Cost_Function(X,Y,theta,m):
hi = Hypothesis(theta, x)
_y = Y.reshape(-1, 1)
J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Cost_Function_Regularisation(X,Y,theta,m,alpha):
hi = Hypothesis(theta,X)
_y = Y.reshape(-1, 1)
J = alpha/float(m) * X.T # (hi - _y)
return J
def Gradient_Descent(X,Y,theta,m,alpha):
new_theta = theta - Cost_Function_Regularisation(X,Y,theta,m,alpha)
return new_theta
def Accuracy(theta):
correct = 0
length = len(X_test)
prediction = (Hypothesis(theta, X_test) > 0.5)
_y = Y_test.reshape(-1, 1)
correct = prediction == _y
my_accuracy = (np.sum(correct) / length)*100
print ('LR Accuracy CV: ', my_accuracy, "%")
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
ep = .012
initial_theta = np.random.rand(X_train.shape[1],1) * 2 * ep - ep
alpha = 0.5
iterations = 10000
Logistic_Regression(X_train1,Y_train1,alpha,initial_theta,iterations)
Logistic_Regression(X_train2,Y_train2,alpha,initial_theta,iterations)
Logistic_Regression(X_train3,Y_train3,alpha,initial_theta,iterations)
Logistic_Regression(X_train4,Y_train4,alpha,initial_theta,iterations)
Logistic_Regression(X_train5,Y_train5,alpha,initial_theta,iterations
get_n_splits returns the "number of splits" you configured for skf.
Look at the documentation here for an example : http://scikit-learn.org/stable/modules/generated/sklearn.model_selection.StratifiedKFold.html