I've been programming a neural network composed of 4 layers: the first with 2 neurons, the second with 2, the third with 2, and the output layer with one neuron.
I made this schema to show what I'm trying to reproduce:
Here is the code; you can try to run it (Python 3.7):
import numpy as np
import matplotlib.pyplot as plt

# Compute the sigmoid
def sigmoid(z):
    return 1.0 / (1 + np.exp(-z))

# Compute the sigmoid derivative (y is the already-activated value)
def sigmoid_derivative(y):
    return y * (1.0 - y)

# Initialisation of the class (input, output, targets, weights, bias)
class NeuralNetwork:
    def __init__(self, x, y):
        self.input = x
        self.weights1 = np.random.rand(self.input.shape[1], 2)
        self.weights2 = np.random.rand(2, 2)
        self.weights3 = np.random.rand(2, 2)
        self.weights4 = np.random.rand(2, 1)
        self.y = y
        self.output = np.zeros(self.y.shape)
        self.bias1 = np.random.rand(1, 2)
        self.bias2 = np.random.rand(1, 2)
        self.bias3 = np.random.rand(1, 2)
        self.bias4 = np.random.rand(1, 1)
        self.learning_rate = 0.005

    # Simple feed forward
    def feedforward(self):
        self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
        self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
        self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
        self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)

    # Back propagation algorithm
    def backprop(self):
        # Application of the chain rule to find the derivative of the loss function
        # with respect to weights4, weights3, weights2, weights1 and the associated biases
        delta_4 = 2 * (self.y - self.output) * sigmoid_derivative(self.output)
        d_weights4 = np.dot(self.layer3.T, delta_4)
        d_bias4 = delta_4.mean(axis=0)
        delta_3 = np.dot(delta_4, self.weights4.T) * sigmoid_derivative(self.layer3)
        d_weights3 = np.dot(self.layer2.T, delta_3)
        d_bias3 = delta_3.mean(axis=0)
        delta_2 = np.dot(delta_3, self.weights3.T) * sigmoid_derivative(self.layer2)
        d_weights2 = np.dot(self.layer1.T, delta_2)
        d_bias2 = delta_2.mean(axis=0)
        delta_1 = np.dot(delta_2, self.weights2.T) * sigmoid_derivative(self.layer1)
        d_weights1 = np.dot(self.input.T, delta_1)
        d_bias1 = delta_1.mean(axis=0)
        # Update the weights with the derivative (slope) of the loss function
        self.weights1 += d_weights1 * self.learning_rate
        self.weights2 += d_weights2 * self.learning_rate
        self.weights3 += d_weights3 * self.learning_rate
        self.weights4 += d_weights4 * self.learning_rate
        self.bias1 += d_bias1 * self.learning_rate
        self.bias2 += d_bias2 * self.learning_rate
        self.bias3 += d_bias3 * self.learning_rate
        self.bias4 += d_bias4 * self.learning_rate

    def cost(self):
        return np.mean((self.output - self.y) ** 2)

if __name__ == "__main__":
    # Number of rows per class
    row_per_class = 200
    # Generate rows: create a data set that is hard to solve
    sick_people = np.random.randn(row_per_class, 2)
    row_sick = int(row_per_class / 8)
    healthy_people = 2 * np.random.randn(row_sick, 2) + np.array([0, 10])
    healthy_people2 = 2 * np.random.randn(row_sick, 2) + np.array([0, -10])
    healthy_people3 = 2 * np.random.randn(row_sick, 2) + np.array([10, 0])
    healthy_people4 = 2 * np.random.randn(row_sick, 2) + np.array([-10, 0])
    healthy_people5 = 2 * np.random.randn(row_sick, 2) + np.array([10, 10])
    healthy_people6 = 2 * np.random.randn(row_sick, 2) + np.array([10, -10])
    healthy_people7 = 2 * np.random.randn(row_sick, 2) + np.array([-10, 10])
    healthy_people8 = 2 * np.random.randn(row_sick, 2) + np.array([-10, -10])
    features = np.vstack([sick_people, healthy_people2, healthy_people, healthy_people3,
                          healthy_people4, healthy_people5, healthy_people6,
                          healthy_people7, healthy_people8])
    targets = np.concatenate((np.zeros(row_per_class), np.zeros(row_per_class) + 1))
    # Visualise the data set created just above
    plt.scatter(features[:, 0], features[:, 1], c=targets, cmap=plt.cm.Spectral)
    plt.show()
    targets = targets[np.newaxis].T
    # Initialise the neural network
    nn = NeuralNetwork(features, targets)
    # Test without training to see the initial accuracy
    nn.feedforward()
    predictions = np.around(nn.output)
    print("Accuracy", np.mean(predictions == nn.y))
    # Training part
    for i in range(30000):
        if i % 1000 == 0:
            print(nn.cost())
        nn.feedforward()
        nn.backprop()
    # Re-test the feedforward after training
    nn.feedforward()
    predictions = np.around(nn.output)
    print("Accuracy", np.mean(predictions == nn.y))
    predictions = np.around(np.squeeze(np.asarray(nn.output)))
    # Show on a graph how well the training went
    plt.scatter(features[:, 0], features[:, 1], c=predictions, cmap=plt.cm.Spectral)
    plt.show()
    # For a better view of the result, project thousands of random points
    # and plot the predicted classes
    row_per_class = 2000
    # Generate rows
    sick_people = np.random.randn(row_per_class, 2) * 4
    sick_people2 = np.random.randn(row_per_class, 2) * 4
    healthy_people = np.random.randn(row_per_class, 2) * 4
    healthy_people2 = np.random.randn(row_per_class, 2) * 4
    features = np.vstack([sick_people, sick_people2, healthy_people, healthy_people2])
    nn.input = features
    nn.feedforward()
    predictions = np.around(np.squeeze(np.asarray(nn.output)))
    plt.scatter(features[:, 0], features[:, 1], c=predictions, cmap=plt.cm.Spectral)
    plt.show()
It looks like I've respected the mathematical concept of back propagation, but neither the accuracy nor the cost ever gets good; they look essentially random.
Here is the tutorial I used to make this code (especially the back propagation):
https://theclevermachine.wordpress.com/2014/09/06/derivation-error-backpropagation-gradient-descent-for-neural-networks/
Thank you so much for your help!
The matrix connections in your feedforward function are wrong:
# Simple feed forward
def feedforward(self):
    self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
    self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
    self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
    self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)
must be
# Simple feed forward
def feedforward(self):
    self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
    self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
    self.layer3 = sigmoid(np.dot(self.layer2, self.weights3) + self.bias3)
    self.output = sigmoid(np.dot(self.layer3, self.weights4) + self.bias4)
I tried your code this way and it seems to work for me. (Because every hidden layer has exactly 2 neurons, the mis-wired dot products still had compatible shapes, which is why the bug never raised an error.)
Here is what the prediction looks like:
By the way, not that it makes a big difference here, but theoretically you should use a binary cross-entropy cost function rather than MSE, because your problem is logistic regression: with a sigmoid output, MSE can make the optimisation problem non-convex when it would otherwise be convex.
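For illustration, a minimal sketch of that change against the class above (the method name cross_entropy_cost and the clipping epsilon are my own additions, not from the original post):

# Sketch: binary cross-entropy cost for the NeuralNetwork class above.
# The clipping epsilon guards against log(0); it is an illustrative choice.
def cross_entropy_cost(self):
    eps = 1e-12
    p = np.clip(self.output, eps, 1 - eps)
    return -np.mean(self.y * np.log(p) + (1 - self.y) * np.log(1 - p))

A convenient side effect of pairing a sigmoid output with cross-entropy is that the output-layer error term simplifies to self.y - self.output, so the sigmoid_derivative factor would drop out of delta_4 in backprop().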
Related
I'm trying to implement a neural network from scratch in order to gain better insight into it, and I've run into a weird problem. When I use the ReLU function as the activation for the hidden layers, the model does not converge, whereas it does converge once the sigmoid function is used. Here is my vanilla code; when you change the first 2 layers' activation function from relu to sigmoid, you can see that it converges, though it may still have a problem sometimes. Where could the problem be? It's been three days and I still couldn't find it, though I did find some little bugs. Thanks in advance.
Here is the toy dataset I've been using (just paste it into the directory where this code is located).
Dataset
import numpy as np
import pandas as pd

class NeuralNetwork():
    def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
        # hyperparameters
        self.epoch = epoch
        self.alpha = alpha
        self.algorithm = algorithm
        # parameters
        self.params = {}
        self.layer_no = 1
        # logs
        self.cost_vals = []

    def createLayer(self, size, activation_func, randomness=True):
        if randomness == True:
            self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
        else:
            self.params["W" + str(self.layer_no)] = np.zeros(size)
        self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
        self.params["func" + str(self.layer_no)] = activation_func
        self.layer_no += 1

    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    def relu(self, X):
        return np.maximum(X, 0) * 0.01

    def tanh(self, X):
        return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))

    def derivative_sigmoid(self, X):
        der_x = self.sigmoid(X)
        return der_x * (1 - der_x)

    def derivative_relu(self, X):
        X[X <= 0] = 0
        X[X > 0] = 1
        return X

    def derivative_tanh(self, X):
        tanhx = self.tanh(X)
        return 1 - np.power(tanhx, 2)

    def activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.sigmoid(Zl)
        elif act_func_name == "relu":
            return self.relu(Zl)
        elif act_func_name == "tanh":
            return self.tanh(Zl)

    def derivative_activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.derivative_sigmoid(Zl)
        elif act_func_name == "relu":
            return self.derivative_relu(Zl)
        elif act_func_name == "tanh":
            return self.derivative_tanh(Zl)

    def train(self, X, Y):
        m = Y.shape[0]  # number of training examples
        self.params["A0"] = X
        self.params["Z0"] = None
        for i in range(self.epoch):
            # forward prop
            for l in range(1, self.layer_no):  # 1, 2, 3
                Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)]  # linear function of a layer, vectorised
                Al = self.activation_function(Zl, self.params["func" + str(l)])  # activated form of Zl
                self.params["Z" + str(l)] = Zl
                self.params["A" + str(l)] = Al
            # cost function
            cost_val = -1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
            cost_val = np.squeeze(cost_val)
            if i % 500 == 0:
                print(cost_val)
                self.cost_vals.append(cost_val)
            # backward prop
            dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))  # gradient of the last layer's A
            for l in reversed(range(1, self.layer_no)):  # 3, 2, 1
                dZl = np.multiply(dAl, self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)]))  # gradient of layer l's Z
                dAl1 = np.dot(self.params["W" + str(l)].T, dZl)  # gradient of the previous layer's A
                dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T)  # gradient of the W parameters in layer l
                dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True)  # gradient of the b parameters in layer l
                # update parameters
                self.params["W" + str(l)] -= self.alpha * dWl
                self.params["b" + str(l)] -= self.alpha * dbl
                dAl = dAl1  # carry the previous layer's gradient into the next iteration

def iris_data():
    from sklearn.model_selection import train_test_split
    datas = pd.read_csv('iris_nn.data').to_numpy()
    X = datas[:, 0:4].astype(float)
    Y = datas[:, 4:5]
    Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
    return X_train.T, Y_train.T

X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5, 4), "relu")
model.createLayer((7, 5), "relu")
model.createLayer((1, 7), "sigmoid")
model.train(X, Y)
We have to write a simple 3-layer NN that learns f(x) = x² with the softplus activation function at the end.
In my implementation the results are just rubbish and I don't know what I'm doing wrong.
import autograd.numpy as np
from autograd import grad
from autograd import elementwise_grad
from autograd import hessian
import random

class Neural_Net(object):
    def __init__(self, inputSize, hiddenSize, outputSize,
                 learning_rate=0.0001, epochs=100,
                 activation1="sigmoid", activation2="softplus"):
        self.inputSize = inputSize
        self.outputSize = outputSize
        self.hiddenSize = hiddenSize
        self.learning_rate = learning_rate
        self.epochs = epochs
        if activation1 == 'softplus':
            self.activation1 = softplus
            self.activation1_grad = softplus_grad
        if activation1 == 'sigmoid':
            self.activation1 = sigmoid
            self.activation1_grad = sigmoid_grad
        if activation1 == 'tanh':
            self.activation1 = np.tanh
            self.activation1_grad = tanh_grad
        if activation2 == 'softplus':
            self.activation2 = softplus
            self.activation2_grad = softplus_grad
        if activation2 == 'sigmoid':
            self.activation2 = sigmoid
            self.activation2_grad = sigmoid_grad
        if activation2 == 'tanh':
            self.activation2 = np.tanh
            self.activation2_grad = tanh_grad
        self.W1 = np.random.randn(self.inputSize, self.hiddenSize)
        self.b1 = np.ones((1, self.hiddenSize))
        self.W2 = np.random.randn(self.hiddenSize, self.outputSize)
        self.b2 = np.ones((1, self.outputSize))

    def forward_prop(self, X):
        self.Z1 = np.dot(X, self.W1) + self.b1
        self.A1 = self.activation1(self.Z1)
        self.Z2 = np.dot(self.A1, self.W2) + self.b2
        self.A2 = self.activation2(self.Z2)
        return self.A2

    def back_prop(self, X, Y):
        self.dA2 = (self.A2 - Y) * self.activation2_grad(self.Z2)
        self.dA1 = np.dot(self.dA2, self.W2.T) * self.activation1_grad(self.Z1)
        self.W1 -= self.learning_rate * X.T.dot(self.dA1)
        self.b1 -= self.learning_rate * self.dA1
        self.W2 -= self.learning_rate * np.dot(self.A1.T, self.dA2)
        self.b2 -= self.learning_rate * self.dA2

    def train(self, X, Y):
        self.forward_prop(X)
        self.back_prop(X, Y)

def softplus(x):
    return np.log(1 + np.exp(x))

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

softplus_grad = elementwise_grad(softplus)
sigmoid_grad = elementwise_grad(sigmoid)
tanh_grad = elementwise_grad(np.tanh)

NN1 = Neural_Net(inputSize=1, hiddenSize=1, outputSize=1, epochs=10000)
for epoch in range(NN1.epochs):
    X = np.array(([[random.randint(1, 100)]]))
    Y = np.square(X)
    A2 = NN1.forward_prop(X)
    print("Input: " + str(X))
    print("Actual Output: " + str(Y))
    print("Predicted Output: " + str(A2))
    print("Loss: " + str(np.mean(np.square(Y - A2))))
    print("\n")
    NN1.train(X, Y)
The predicted output just increases, and depending on which parameters I choose it becomes NaN or inf before training finishes.
Just like at the forward step you calculate Z1, then A1, then Z2, then A2, at the backward step you should calculate the gradients in the opposite order: dA2, then dZ2, then dA1, then dZ1. You don't calculate dZ2 or dZ1, therefore it cannot work. Maybe you have other problems as well, but this one is the most obvious.
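In the notation of the post, a sketch of that ordering for the squared-error loss might look like this (the names mirror the forward pass above; it is an illustration, not a drop-in patch):

# Backward pass in the conventional order dA2 -> dZ2 -> dA1 -> dZ1.
dA2 = self.A2 - Y                           # dL/dA2 for L = 0.5 * (A2 - Y)**2
dZ2 = dA2 * self.activation2_grad(self.Z2)  # dL/dZ2
dA1 = np.dot(dZ2, self.W2.T)                # dL/dA1
dZ1 = dA1 * self.activation1_grad(self.Z1)  # dL/dZ1
dW2 = np.dot(self.A1.T, dZ2)                # dL/dW2
db2 = dZ2.sum(axis=0, keepdims=True)        # dL/db2
dW1 = np.dot(X.T, dZ1)                      # dL/dW1
db1 = dZ1.sum(axis=0, keepdims=True)        # dL/db1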
To check that the gradients are correct, calculate them directly: for each weight or bias, increase it by a small value epsilon, see how much the error changes, and divide by epsilon. Such a direct calculation should be close to your analytic weight gradients. You don't calculate them explicitly, but you should for test purposes.
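A minimal sketch of such a check for W1, assuming the squared-error loss above (the helper function is hypothetical, not part of the original code):

def numerical_grad_W1(nn, X, Y, eps=1e-5):
    # Central-difference estimate of dL/dW1, one entry at a time.
    grad = np.zeros_like(nn.W1)
    for i in range(nn.W1.shape[0]):
        for j in range(nn.W1.shape[1]):
            nn.W1[i, j] += eps
            loss_plus = 0.5 * np.sum(np.square(nn.forward_prop(X) - Y))
            nn.W1[i, j] -= 2 * eps
            loss_minus = 0.5 * np.sum(np.square(nn.forward_prop(X) - Y))
            nn.W1[i, j] += eps  # restore the original weight
            grad[i, j] = (loss_plus - loss_minus) / (2 * eps)
    return grad

If the analytic gradients are right, this estimate should agree with them to several decimal places.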
More problems:
if hiddenSize=1, it means you have just one neuron in the middle. That is not enough to approximate x**2.
if the output activation is sigmoid, it can only output numbers from 0 to 1. How would you output numbers from 1 to 100 squared? 1**2=1, 2**2=4, ..., 100**2=10000, while your output unit is only able to produce values between 0 and 1. A sketch addressing both points follows below.
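For illustration, one way to act on both points, assuming the rest of the class stays unchanged (the hidden size, learning rate, and the scaling constant 100.0 are my own illustrative choices):

# Sketch: widen the hidden layer and rescale the data so the
# network operates in a numerically friendly range.
NN1 = Neural_Net(inputSize=1, hiddenSize=16, outputSize=1,
                 learning_rate=0.01, epochs=10000)
for epoch in range(NN1.epochs):
    x_raw = random.randint(1, 100)
    X = np.array([[x_raw / 100.0]])  # inputs scaled into (0, 1]
    Y = np.square(X)                 # targets then also lie in (0, 1]
    NN1.train(X, Y)                  # train() runs forward_prop then back_prop

To recover predictions on the original scale, multiply the network output by 100.0**2.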
Currently I am trying to implement a three-layer network (2 hidden layers and 1 output layer) and have it classify XOR. I am having an issue where my output layer always converges to around 0.5; I cannot figure out why this is happening and would like some guidance as to why.
import numpy as np
from matplotlib import pyplot as plt

# XOR
x = np.array(([0, 0], [0, 1], [1, 0], [1, 1]), dtype=float)
y = np.array(([0], [1], [1], [0]), dtype=float)

class NN:
    def __init__(self, x, y):
        self.weights1 = np.random.uniform(-0.5, 0.5, (2, 3))
        self.weights2 = np.random.uniform(-0.5, 0.5, (3, 4))
        self.weights3 = np.random.uniform(-0.5, 0.5, (4, 1))
        self.output = np.zeros(1)

    def forward_prop(self, training_data):
        self.layer1 = logistic_function(np.dot(training_data, self.weights1))
        self.layer2 = logistic_function(np.dot(self.layer1, self.weights2))
        self.output = logistic_function(np.dot(self.layer2, self.weights3))
        return self.output

    def back_prop(self, training_data, test_data):
        self.delta = loss_function(test_data, self.output) * logistic_deriv(self.output)
        self.e_weights3 = self.delta.dot(self.weights3.T)
        self.d_weights3 = self.e_weights3 * logistic_deriv(self.layer2)
        self.e_weights2 = self.d_weights3.dot(self.weights2.T)
        self.d_weights2 = self.e_weights2 * logistic_deriv(self.layer1)
        self.e_weights1 = self.d_weights2.dot(self.weights1.T)
        self.d_weights1 = self.e_weights1 * logistic_deriv(training_data)
        self.weights1 -= 0.01 * training_data.T.dot(self.d_weights1)
        self.weights2 -= 0.01 * self.layer1.T.dot(self.d_weights2)
        self.weights3 -= 0.01 * self.layer2.T.dot(self.d_weights3)

# Activation function
def logistic_function(z):
    return 1.0 / (1.0 + np.exp(-z))

# Derivative function
def logistic_deriv(z):
    return logistic_function(z) * (1.0 - logistic_function(z))

# Squared loss function
def loss_function(target_y, output_y):
    loss = target_y - output_y
    return 0.5 * np.power(loss, 2)

network = NN(x, y)
for i in range(1000):
    for j in range(0, len(x)):
        network.forward_prop(x[j])
        network.back_prop(x[j], y[j])

print(network.forward_prop(x[0]))
print(network.forward_prop(x[1]))
print(network.forward_prop(x[2]))
print(network.forward_prop(x[3]))
So basically, if the weights, which are initially assigned randomly, fall within a certain range (about -0.4 to 0.4 for w0, for example), then they change and the accuracy improves. However, if a weight is assigned a random value outside this range, it won't change at all. I can't figure it out. Any suggestions would be great :)
import numpy as np
from matplotlib import pyplot as plt

class NN_model:
    def __init__(self, data):
        self.data = data
        self.w0 = np.random.randn()
        self.w1 = np.random.randn()
        self.bias = np.random.randn()
        self.trained = False

    def sigmoid(self, x):
        return 1 / (1 + np.exp(-x))

    def sigmoid_p(self, x):
        return self.sigmoid(x) * (1 - self.sigmoid(x))

    def calculate_loss(self, pred, target):
        loss = np.square(target - pred)
        return loss

    def training_loop(self, data):
        costs = []
        learning_rate = 0.005
        print(self.w0)
        if self.trained == True:
            print('Model already trained')
            pass
        else:
            for i in range(50):
                for i in range(100000):
                    ri = np.random.randint(len(data))
                    point = data[ri]
                    sig_out = (self.w0 * point[0]) + (self.w1 * point[1]) + self.bias
                    pred = self.sigmoid(sig_out)
                    cost = self.calculate_loss(pred, point[2])
                    costs.append(cost)
                    dcost_dpred = 2 * (pred - point[2])
                    dpred_dsigout = self.sigmoid_p(sig_out)
                    dsigout_dw0 = point[0]
                    dsigout_dw1 = point[1]
                    dsigout_dbias = 1
                    dcost_dw0 = dcost_dpred * dpred_dsigout * dsigout_dw0
                    dcost_dw1 = dcost_dpred * dpred_dsigout * dsigout_dw1
                    dcost_dbias = dcost_dpred * dpred_dsigout * dsigout_dbias
                    self.w0 += -learning_rate * dcost_dw0
                    self.w1 += -learning_rate * dcost_dw0
                    self.bias += -learning_rate * dcost_dbias
            print(self.w0, self.w1, self.bias)
            # -0.0752623452445784 0.2447376547554179 4.032995041915469
            # -0.3042823068224879 0.015717693177505765 18.643149928253827
            self.trained = True
            plt.plot(costs)
            plt.show()

    def predict(self, test_data):
        if self.trained == True:
            pred = self.sigmoid((test_data[0] * self.w0) + (test_data[1] * self.w1) + self.bias)
            print(pred)
            if pred > 0.5:
                print('Woman')
            else:
                print('Man')
        else:
            print('Error: Model has not been trained yet')
You have to have an intuitive understanding of how neural networks work behind the scenes. There is no rule of thumb for choosing the best initial values for your weights.
It might be that some initial weights pull you toward good accuracy really fast, but in your example, with weights drawn from a wide range and a learning rate of 0.005, it will take some time until the weights are reduced to a useful scale. The learning rate therefore also plays a crucial role; try tweaking that as well.
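If you want to take the initial scale largely out of the picture, a common sketch is to draw the weights from a distribution scaled by the number of inputs (Xavier/Glorot-style initialisation); applied to this two-input model it could look like this (illustrative, not part of the original post):

# Sketch: inside NN_model.__init__, scale the initial weights by the
# number of inputs (2 here) so sig_out starts near sigmoid's sensitive
# region, where sigmoid_p(sig_out) is not vanishingly small.
n_inputs = 2
self.w0 = np.random.randn() / np.sqrt(n_inputs)
self.w1 = np.random.randn() / np.sqrt(n_inputs)
self.bias = 0.0

The reason large initial weights freeze learning here is that every gradient is multiplied by sigmoid_p(sig_out), which is nearly zero when |sig_out| is large, so the updates become negligible.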
Below is a piece of code implementing a 2-layer neural network for a fitting problem in numpy. The activation function is ReLU, the training algorithm is Adam, and the loss function is half of the mean squared error. However, when the batch size is large (e.g. 10000), the loss becomes NaN after some iterations. The problem doesn't happen for small batch sizes. Could anyone help me explain why this may happen? (Data are from the matlab workspace: 6_final_mapping_pos.mat)
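To be precise, by "half of the mean squared error" I mean L = (1/(2·batch_size)) · Σ (output − target)², whose gradient with respect to the output is (output − target)/batch_size; that is the d_output term in backpropagation() below.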
import scipy.io as sio
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd

data = sio.loadmat('6_final_mapping_pos.mat')

class NeuralNetwork():
    def __init__(self):
        self.batch_size = 256
        self.input_size = 5  # input dimension is 5
        self.hidden_layer1_size = 50
        self.output_size = 1  # output dimension is 1
        self.train_data = data['training_data_pos']
        self.df_traindata = pd.DataFrame(data=self.train_data)
        self.validation_data_num = 17142
        self.valid_data = data['validation_data_pos']
        self.df_validdata = pd.DataFrame(data=self.valid_data)
        # weight initialization for ReLU
        self.W1 = np.random.randn(self.input_size, self.hidden_layer1_size) / np.sqrt(self.input_size / 2)
        self.W2 = np.random.randn(self.hidden_layer1_size, self.output_size) / np.sqrt(self.hidden_layer1_size / 2)
        # bias initialization
        self.b1 = np.zeros((1, self.hidden_layer1_size))
        self.b2 = np.zeros((1, self.output_size))
        self.lr = 5e-3   # learning rate
        self.reg = 1e-3  # regularization strength
        self.p = 0.5     # dropout probability = 1 - p
        self.first_moment_W3 = 0
        self.second_moment_W3 = 0
        self.first_moment_W2 = 0
        self.second_moment_W2 = 0
        self.first_moment_W1 = 0
        self.second_moment_W1 = 0
        self.first_moment_b3 = 0
        self.second_moment_b3 = 0
        self.first_moment_b2 = 0
        self.second_moment_b2 = 0
        self.first_moment_b1 = 0
        self.second_moment_b1 = 0

    def feedforward(self):
        # randomly selected mini-batch as inputs
        self.df_sample_t = self.df_traindata.sample(n=self.batch_size)
        self.train_input = self.df_sample_t.as_matrix(columns=[0, 1, 2, 3, 4])
        self.train_output = self.df_sample_t.as_matrix(columns=[5])
        # hidden layer with the dropout technique
        self.hidden_layer1 = np.maximum(0, (np.dot(self.train_input, self.W1) + self.b1))
        U1 = np.random.rand(*self.hidden_layer1.shape) < self.p  # drop mask
        self.hidden_layer1 *= U1  # drop!
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.train_output) ** 2) / self.batch_size
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss

    def backpropagation(self):
        self.d_output = (self.output_layer - self.train_output) / self.batch_size
        # data part
        self.dW2 = np.dot(self.hidden_layer1.T, self.d_output)
        self.db2 = np.sum(self.d_output, axis=0, keepdims=True)
        self.dhidden1 = np.dot(self.d_output, self.W2.T)
        self.dhidden1[self.hidden_layer1 <= 0] = 0
        self.dW1 = np.dot(self.train_input.T, self.dhidden1)
        self.db1 = np.sum(self.dhidden1, axis=0, keepdims=True)
        # regularization part
        self.dW2 = self.dW2 + self.reg * self.W2
        self.dW1 = self.dW1 + self.reg * self.W1

    def Adam(self, epoch, dW2, dW1, db2, db1):
        beta1 = 0.9
        beta2 = 0.99
        self.first_moment_W2 = beta1 * self.first_moment_W2 + (1 - beta1) * dW2
        self.second_moment_W2 = beta2 * self.second_moment_W2 + (1 - beta2) * dW2 * dW2
        first_unbias_W2 = self.first_moment_W2 / (1 - beta1 ** epoch)
        second_unbias_W2 = self.second_moment_W2 / (1 - beta2 ** epoch)
        self.W2 -= self.lr * first_unbias_W2 / (np.sqrt(second_unbias_W2) + 1e-7)
        self.first_moment_W1 = beta1 * self.first_moment_W1 + (1 - beta1) * dW1
        self.second_moment_W1 = beta2 * self.second_moment_W1 + (1 - beta2) * dW1 * dW1
        first_unbias_W1 = self.first_moment_W1 / (1 - beta1 ** epoch)
        second_unbias_W1 = self.second_moment_W1 / (1 - beta2 ** epoch)
        self.W1 -= self.lr * first_unbias_W1 / (np.sqrt(second_unbias_W1) + 1e-7)
        self.first_moment_b2 = beta1 * self.first_moment_b2 + (1 - beta1) * db2
        self.second_moment_b2 = beta2 * self.second_moment_b2 + (1 - beta2) * db2 * db2
        first_unbias_b2 = self.first_moment_b2 / (1 - beta1 ** epoch)
        second_unbias_b2 = self.second_moment_b2 / (1 - beta2 ** epoch)
        self.b2 -= self.lr * first_unbias_b2 / (np.sqrt(second_unbias_b2) + 1e-7)
        self.first_moment_b1 = beta1 * self.first_moment_b1 + (1 - beta1) * db1
        self.second_moment_b1 = beta2 * self.second_moment_b1 + (1 - beta2) * db1 * db1
        first_unbias_b1 = self.first_moment_b1 / (1 - beta1 ** epoch)
        second_unbias_b1 = self.second_moment_b1 / (1 - beta2 ** epoch)
        self.b1 -= self.lr * first_unbias_b1 / (np.sqrt(second_unbias_b1) + 1e-7)

    def validation(self):
        self.df_sample_v = self.df_validdata.sample(n=self.validation_data_num)
        self.valid_input = self.df_sample_v.as_matrix(columns=[0, 1, 2, 3, 4])
        self.valid_output = self.df_sample_v.as_matrix(columns=[5])
        self.hidden_layer1 = np.maximum(0, np.dot(self.valid_input, self.W1) + self.b1) * self.p
        self.output_layer = np.dot(self.hidden_layer1, self.W2) + self.b2
        self.data_loss = np.sum(0.5 * (self.output_layer - self.valid_output) ** 2) / self.validation_data_num
        self.reg_loss = 0.5 * self.reg * np.sum(self.W1 * self.W1) + 0.5 * self.reg * np.sum(self.W2 * self.W2)
        self.total_loss = self.data_loss + self.reg_loss

NN = NeuralNetwork()
num_iterations = 120
training_loss = np.array([])
validation_loss = np.array([])
validation_dataloss = np.array([])
t = 1
T = np.array([range(1, num_iterations)]).T

# Training and validation
while t < num_iterations:
    NN.feedforward()
    NN.backpropagation()
    NN.Adam(t, NN.dW2, NN.dW1, NN.db2, NN.db1)
    training_loss = np.append(training_loss, NN.total_loss)
    if t % 10 == 0:
        print("training: total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    NN.validation()
    validation_loss = np.append(validation_loss, NN.total_loss)
    validation_dataloss = np.append(validation_dataloss, NN.data_loss)
    if t % 10 == 0:
        print("validation: total loss = %f, data loss = %f, regularization loss = %f" % (NN.total_loss, NN.data_loss, NN.reg_loss))
    t += 1