Currently I am trying to implement a three layer network (2 hidden layers and 1 output layer) and having this network classify XOR. I am having an issue where my output layer will always converge to around 0.5 - I cannot figure out why this is happening and would like some guidance as to why.
import numpy as np
from matplotlib import pyplot as plt
# XOR
x=np.array(([0,0],[0,1],[1,0],[1,1]), dtype=float)
y=np.array(([0],[1],[1],[0]), dtype=float)
class NN:
def __init__(self, x, y):
self.weights1 = np.random.uniform(-0.5, 0.5, (2,3))
self.weights2 = np.random.uniform(-0.5, 0.5, (3,4))
self.weights3 = np.random.uniform(-0.5, 0.5, (4,1))
self.output = np.zeros(1)
def forward_prop(self, training_data):
self.layer1 = logistic_function(np.dot(training_data, self.weights1))
self.layer2 = logistic_function(np.dot(self.layer1, self.weights2))
self.output = logistic_function(np.dot(self.layer2, self.weights3))
return self.output
def back_prop(self, training_data, test_data):
self.delta = loss_function(test_data, self.output) * logistic_deriv(self.output)
self.e_weights3 = self.delta.dot(self.weights3.T)
self.d_weights3 = self.e_weights3 * logistic_deriv(self.layer2)
self.e_weights2 = self.d_weights3.dot(self.weights2.T)
self.d_weights2 = self.e_weights2 * logistic_deriv(self.layer1)
self.e_weights1 = self.d_weights2.dot(self.weights1.T)
self.d_weights1 = self.e_weights1 * logistic_deriv(training_data)
self.weights1 -= 0.01 * training_data.T.dot(self.d_weights1)
self.weights2 -= 0.01 * self.layer1.T.dot(self.d_weights2)
self.weights3 -= 0.01 * self.layer2.T.dot(self.d_weights3)
# Activation function
def logistic_function(z):
return 1.0 / (1.0 + np.exp(-z))
# Derivative function
def logistic_deriv(z):
return logistic_function(z) * (1.0 - logistic_function(z))
# Squared loss function
def loss_function(target_y, output_y):
loss = target_y - output_y
return 0.5 * np.power(loss,2)
network = NN(x, y)
for i in range(1000):
for j in range(0,len(x)):
network.forward_prop(x[j])
network.back_prop(x[j],y[j])
print(network.forward_prop(x[0]))
print(network.forward_prop(x[1]))
print(network.forward_prop(x[2]))
print(network.forward_prop(x[3]))
Related
I am converting the asymmetric loss function from this paper ASL which is written in pytorch here: ASL github.
The pytorch code looks like this:
class AsymmetricLoss(nn.Module):
def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_torch_grad_focal_loss=True):
super(AsymmetricLoss, self).__init__()
self.gamma_neg = gamma_neg
self.gamma_pos = gamma_pos
self.clip = clip
self.disable_torch_grad_focal_loss = disable_torch_grad_focal_loss
self.eps = eps
def forward(self, x, y):
""""
Parameters
----------
x: input logits
y: targets (multi-label binarized vector)
"""
# Calculating Probabilities
x_sigmoid = torch.sigmoid(x)
xs_pos = x_sigmoid
xs_neg = 1 - x_sigmoid
# Asymmetric Clipping
if self.clip is not None and self.clip > 0:
xs_neg = (xs_neg + self.clip).clamp(max=1)
# Basic CE calculation
los_pos = y * torch.log(xs_pos.clamp(min=self.eps))
los_neg = (1 - y) * torch.log(xs_neg.clamp(min=self.eps))
loss = los_pos + los_neg
# Asymmetric Focusing
if self.gamma_neg > 0 or self.gamma_pos > 0:
if self.disable_torch_grad_focal_loss:
torch.set_grad_enabled(False)
pt0 = xs_pos * y
pt1 = xs_neg * (1 - y) # pt = p if t > 0 else 1-p
pt = pt0 + pt1
one_sided_gamma = self.gamma_pos * y + self.gamma_neg * (1 - y)
one_sided_w = torch.pow(1 - pt, one_sided_gamma)
if self.disable_torch_grad_focal_loss:
torch.set_grad_enabled(True)
loss *= one_sided_w
return -loss.sum()
And what I have changed it to in tensorflow:
class AsymmetricLoss(tf.keras.losses.Loss):
def __init__(self, gamma_neg=4, gamma_pos=1, clip=0.05, eps=1e-8, disable_grad_focal_loss=False):
super(AsymmetricLoss, self).__init__()
self.gamma_neg = gamma_neg
self.gamma_pos = gamma_pos
self.clip = clip
self.eps = eps
self.disable_grad_focal_loss = disable_grad_focal_loss
def call(self, y_true, y_pred):
y_pred_sigmoid = tf.sigmoid(y_pred)
y_preds_pos = y_pred_sigmoid
y_preds_neg = 1 - y_pred_sigmoid
y_true = tf.cast(y_true, tf.float32)
if self.clip is not None and self.clip > 0:
y_preds_neg = tf.clip_by_value(y_preds_neg + self.clip,clip_value_max=1, clip_value_min=-np.inf)
print(y_preds_neg.dtype)
print(y_preds_pos.dtype)
print(y_true.dtype)
los_pos = y_true * tf.math.log(tf.clip_by_value(y_preds_pos, clip_value_min=self.eps, clip_value_max=np.inf))
los_neg = (1-y_true) * tf.math.log(tf.clip_by_value(y_preds_neg, clip_value_min=self.eps, clip_value_max=np.inf))
loss = los_pos + los_neg
if self.gamma_neg > 0 or self.gamma_pos > 0:
if self.disable_grad_focal_loss:
#torch.set_set_grad_enable(False)
pass
pt0 = y_preds_pos * y_true
pt1 = y_preds_neg * (1-y_true)
pt = pt0 + pt1
one_sided_gamma = self.gamma_pos * y_true + self.gamma_neg *(1-y_true)
one_sided_w = tf.math.pow(1-pt, one_sided_gamma)
if self.disable_grad_focal_loss:
#torch.set_set_grad_enable(True)
pass
loss *= one_sided_w
y_true = tf.cast(y_true, tf.int64)
return -tf.math.reduce_sum(loss)
There is especially two things I am unsure about.
In the pytorch code the turns on and off the grad with torch.set_grad_enable(bool) which I can not find an equliant in tensorflow (tf.stop_gradient, doesnt seem to be the same). I have also read that the grad is not calculated in the call method so it doesnt matter, but this I am not sure about.
The second thing is the sum in the return. Is so that tensorflow sums up the loss by it self so its not correct to have it in the loss function?
I have been trying to create a simple standard neural network from scratch but I can't seem to get it to work normally. Sometimes the cost skyrockets, other times the cost doesn't even change. I'm not sure what the problem is but it would be really helpful if someone could help me.
I have all of the information on Github. If any more information is needed kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
# get the number of batches
num_batches_norem = data.shape[1] // batch_size
if data.shape[1] % batch_size == 0:
remainder_quantity = 0
else:
remainder_size = data.shape[1] % batch_size
remainder_quantity = 1
num_batches = num_batches_norem + remainder_quantity
changer = 0
for mb in range(num_batches_norem):
current_batch = data[:, changer:changer + batch_size]
changer += batch_size
yield current_batch
for last_mb in range(remainder_quantity):
last_batch = data[:, changer:changer + remainder_size]
yield last_batch
Function for g(z):
def activationer(a, z):
# ACTIVATION FUNCTIONS
# Sigmoid Activation Function
def sigmoid(z):
g = scipy.special.expit(z)
return g
# Tanh (Hyperbolic Tangent Function) Activation Function
def tanh(z):
g = (np.exp(z) - np.exp(-1 * z)) / ((np.exp(z) + np.exp(-1 * z)))
return g
# ReLU (Rectified Linear Unit) Activation Function
def ReLU(z):
g = np.maximum(0, z)
return g
# Leaky ReLU (Leaky Rectified Linear Unit) Activation Function
def Leaky_ReLU(z):
g = np.maximum(0.01 * z, z)
return g
def softmax(z):
z_exp = np.exp(z)
g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
return g
if a == "sigmoid":
res = sigmoid(z)
elif a == "tanh":
res = tanh(z)
elif a == "relu":
res = ReLU(z)
elif a == "leaky relu":
res = Leaky_ReLU(z)
elif a == "softmax":
res = softmax(z)
return res
NN Class:
class DeepNeuralNetwork:
def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
# Assign inputs to the self object
self.n_x = n_x
self.n_h = n_h
self.n_y = n_y
self.nl = nl
self.activations = activations
self.alpha = alpha
# Initialize Parameters
def initialize_parameters(self):
n_x = self.n_x
n_h = self.n_h
n_y = self.n_y
activations = self.activations
parameters = []
for l in range(self.nl):
np.random.seed(8)
if l == 0:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | Xavier
else:
parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))]) # aka W1, b1 | He
elif l == self.nl - 1:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | Xavier
else:
parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))]) # aka WL, bL | He
else:
if activations[l] == "relu" or "leaky relu":
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))]) # hidden params | Xavier
else:
parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))]) # hidden params | He
return parameters
# Forward Propagation
def forward_propagation(self, parameters, input_data):
batch_size = input_data.shape[1] # Get the amount of examples in the batch
caches = []
self.caches = caches
current_activation = input_data # Set first activation - A0 - as the input
caches.append(current_activation)
for l in range(self.nl):
W, b = parameters[l][0], parameters[l][1] # Get weights and biases for current layer
Z = W # current_activation + b # Compute the linear activation
current_activation = activationer(self.activations[l], Z) # Compute the full activation
caches.append(current_activation)
return current_activation
# Compute Cost
def compute_cost(self, yhat, y):
batch_size = y.shape[1] # Get the amount of examples in the batch
cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + ((1 - y) * (np.log(1 - yhat))))) # Compute the cross-entropy cost
cost = np.squeeze(cost) # Turn [[17]] to 17
return cost
# Backward Propagation
def backward_propagation(self, parameters, y):
caches = self.caches
batch_size = y.shape[1]
grads = []
for l in reversed(range(1, self.nl + 1)):
if l == self.nl:
dZ = caches[l] - y
dW = (1 / batch_size) * dZ # caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
else:
dA = parameters[l][0].T # dZ
dZ = dA * np.multiply(caches[l], (1 - caches[l]))
dW = (1 / batch_size) * dZ # caches[l - 1].T
db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
grads.append([dW, db])
return grads
# Update Parameters
def update_parameters(self, parameters, gradients):
for l in range(self.nl):
parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
return parameters
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
for i, j in zip(train_x_batched, train_y_batched):
yhat = dnn.forward_propagation(params, i)
cost = dnn.compute_cost(yhat, j)
grads = dnn.backward_propagation(params, j)
params = update_parameters(params, grads)
print(cost) # This usually starts going down then skyrockets. Even if I lower the learning rate to 0.00001
Thanks :)
I'm trying to implement a neural network from scratch in order to gain better insight about it and I run into a weird problem. When I use Relu function for hidden layers as an activation function, the model did not converge whereas it did converge once sigmoid function is used. Here is my vanilla code: When you change first 2 layers' activation function from relu to sigmoid, you can see that it converges, though it may have a problem sometimes. Where could be the problem? It's been three days and I still couldnt find it, though I found some little bugs. Thanks in advance.
Here is the toy dataset I've been using(just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd
class NeuralNetwork():
def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
# hyperparameters
self.epoch = epoch
self.alpha = alpha
self.algorithm = algorithm
# parameters
self.params = {}
self.layer_no = 1
# logs
self.cost_vals = []
def createLayer(self, size, activation_func, randomness=True):
if randomness == True:
self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
else:
self.params["W" + str(self.layer_no)] = np.zeros(size)
self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
self.params["func" + str(self.layer_no)] = activation_func
self.layer_no += 1
def sigmoid(self, X):
return 1 / (1 + np.exp(-X))
def relu(self, X):
return np.maximum(X, 0) * 0.01
def tanh(self, X):
return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))
def derivative_sigmoid(self, X):
der_x = self.sigmoid(X)
return der_x * (1 - der_x)
def derivative_relu(self, X):
X[X<=0] = 0
X[X>0] = 1
return X
def derivative_tanh(self, X):
tanhx = self.tanh(X)
return 1 - np.power(tanhx, 2)
def activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.sigmoid(Zl)
elif act_func_name == "relu":
return self.relu(Zl)
elif act_func_name == "tanh":
return self.tanh(Zl)
def derivative_activation_function(self, Zl, act_func_name):
if act_func_name == "sigmoid":
return self.derivative_sigmoid(Zl)
elif act_func_name == "relu":
return self.derivative_relu(Zl)
elif act_func_name == "tanh":
return self.derivative_tanh(Zl)
def train(self, X, Y):
m = Y.shape[0] # number of training examples
self.params["A0"] = X
self.params["Z0"] = None
for i in range(self.epoch):
# forward prop
for l in range(1, self.layer_no): # 1,2,3
Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)] # linear function of a layer with vectorization
Al = self.activation_function(Zl, self.params["func" + str(l)]) # activated form of Zl
self.params["Z" + str(l)] = Zl
self.params["A" + str(l)] = Al
# cost function
cost_val = - 1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
cost_val = np.squeeze(cost_val)
if i % 500 == 0:
print(cost_val)
self.cost_vals.append(cost_val)
# backward prop
dAl = - (np.divide(Y, Al) - np.divide(1 - Y, 1 - Al)) # gradiant of last layer of A
for l in reversed(range(1, self.layer_no)): # 3,2,1
# backward prop
dZl = np.multiply(dAl,
self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)])) # gradient of layer l of Z
dAl1 = np.dot(self.params["W" + str(l)].T, dZl) # gradient of previous layer of A
dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T) # gradient of parameters W in layer l
dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True) # gradient of parameters b in layer l
# update parameters
self.params["W" + str(l)] -= self.alpha * dWl
self.params["b" + str(l)] -= self.alpha * dbl
dAl = dAl1 # assign gradient of previous layer of A to the current one so as to use it while back-propagation
def iris_data():
from sklearn.model_selection import train_test_split
datas = pd.read_csv('iris_nn.data').to_numpy()
X = datas[:, 0:4].astype(float)
Y = datas[:, 4:5]
Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
return X_train.T, Y_train.T
X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5,4), "relu")
model.createLayer((7,5), "relu")
model.createLayer((1,7), "sigmoid")
model.train(X,Y)
#
I've been programming this neural network composed of 4 layers:
The first one with 2 neural, second with 2, third with 2 and the output one with one neurons
I made this schema to show what I'm trying to reproduce :
Here is the code, you can try to run it (python 3.7) :
import numpy as np
import matplotlib.pyplot as plt
#Calculus of the sigmoid
def sigmoid(z):
return 1.0/(1+ np.exp(-z))
#Calculus of the sigmoid derivation
def sigmoid_derivative(y):
return y * (1.0 - y)
#Initialisation of the class (input, output, targets, weights, biais)
class NeuralNetwork:
def __init__(self, x, y):
self.input = x
self.weights1 = np.random.rand(self.input.shape[1],2)
self.weights2 = np.random.rand(2,2)
self.weights3 = np.random.rand(2,2)
self.weights4 = np.random.rand(2,1)
self.y = y
self.output = np.zeros(self.y.shape)
self.bias1 = np.random.rand(1,2)
self.bias2 = np.random.rand(1,2)
self.bias3 = np.random.rand(1,2)
self.bias4 = np.random.rand(1,1)
self.learning_rate = 0.005
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)
#Back propagation algorithme
def backprop(self):
# application of the chain rule to find derivative of the loss function with respect to weights4, weights3, weights2, weights1 and the associated bias
delta_4 = 2*(self.y - self.output) * sigmoid_derivative(self.output)
d_weights4 = np.dot(self.layer3.T, delta_4)
d_bias4 = delta_4
d_bias4 = d_bias4.mean(axis=0)
delta_3 = np.dot(delta_4, self.weights4.T) * sigmoid_derivative(self.layer3)
d_weights3 = np.dot(self.layer2.T, delta_3)
d_bias3 = delta_3
d_bias3 = d_bias3.mean(axis=0)
delta_2 = np.dot(delta_3, self.weights3.T) * sigmoid_derivative(self.layer2)
d_weights2 = np.dot(self.layer1.T, delta_2)
d_bias2 = delta_2
d_bias2 = d_bias2.mean(axis=0)
delta_1 = np.dot(delta_2, self.weights2.T) * sigmoid_derivative(self.layer1)
d_weights1 = np.dot(self.input.T, delta_1)
d_bias1 = delta_1
d_bias1 = d_bias1.mean(axis=0)
# update the weights with the derivative (slope) of the loss function
self.weights1 += d_weights1 * self.learning_rate
self.weights2 += d_weights2 * self.learning_rate
self.weights3 += d_weights3 * self.learning_rate
self.weights4 += d_weights4 * self.learning_rate
self.bias1 += d_bias1 * self.learning_rate
self.bias2 += d_bias2 * self.learning_rate
self.bias3 += d_bias3 * self.learning_rate
self.bias4 += d_bias4 * self.learning_rate
def cost(self):
return np.mean((self.output - self.y)**2)
if __name__ == "__main__":
#Number of rows per class
row_per_class = 200
#generate rows
#Creating a data set hard to resolve
sick_people = (np.random.randn(row_per_class,2))
row_sick = int(row_per_class/8)
healthy_people = 2*(np.random.randn(row_sick,2)) + np.array([0,10])
healthy_people2 = 2*(np.random.randn(row_sick,2)) + np.array([0,-10])
healthy_people3 = 2*(np.random.randn(row_sick,2)) + np.array([10,0])
healthy_people4 = 2*(np.random.randn(row_sick,2)) + np.array([-10,0])
healthy_people5 = 2*(np.random.randn(row_sick,2)) + np.array([10,10])
healthy_people6 = 2*(np.random.randn(row_sick,2)) + np.array([10,-10])
healthy_people7 = 2*(np.random.randn(row_sick,2)) + np.array([-10,10])
healthy_people8 = 2*(np.random.randn(row_sick,2)) + np.array([-10,-10])
features = np.vstack([sick_people, healthy_people2, healthy_people, healthy_people3, healthy_people4, healthy_people5, healthy_people6, healthy_people7, healthy_people8])
targets = (np.concatenate((np.zeros(row_per_class), np.zeros(row_per_class)+1)))
#To have a good vision of the dataset created just above
plt.scatter(features[:,0], features[:,1], c=targets, cmap = plt.cm.Spectral)
plt.show()
targets = targets[np.newaxis].T
#Initialing the neural network
nn = NeuralNetwork(features,targets)
#Test without training, we can see the current accuracy
nn.feedforward()
predictions = np.around(nn.output)
print ("Accuracy", np.mean(predictions == nn.y))
#Training part
for i in range(30000):
if i % 1000 == 0:
print (nn.cost())
nn.feedforward()
nn.backprop()
# Re Testing of the feedforward after the training
nn.feedforward()
predictions = np.around(nn.output)
print ("Accuracy", np.mean(predictions == nn.y))
predictions = np.around(np.squeeze(np.asarray(nn.output)))
#Show on graph how well the training went
plt.scatter(features[:,0], features[:,1], c=predictions, cmap = plt.cm.Spectral)
plt.show()
# It allows us to have a better vision of the result, we project random point by thousands and
# see the graph
row_per_class = 2000
#generate rows
sick_people = (np.random.randn(row_per_class,2))*4
sick_people2 = (np.random.randn(row_per_class,2))*4
healthy_people = (np.random.randn(row_per_class,2))*4
healthy_people2 = (np.random.randn(row_per_class,2))*4
features = np.vstack([sick_people,sick_people2, healthy_people, healthy_people2])
nn.input = features
nn.feedforward()
predictions = np.around(np.squeeze(np.asarray(nn.output)))
plt.scatter(features[:,0], features[:,1], c=predictions, cmap = plt.cm.Spectral)
plt.show()
It looks like I've respected the mathematic concept of back propagation but the accuracy is never good neither the cost.
It looks like it is random.
Here is the tutorial I have used to make this code (especially the back propagation) :
https://theclevermachine.wordpress.com/2014/09/06/derivation-error-backpropagation-gradient-descent-for-neural-networks/
Thank you so much for your help !
Matrix connections in your feedforward function are wrong
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer1, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer2, self.weights4) + self.bias4)
must be
#simple feed forward
def feedforward(self):
self.layer1 = sigmoid(np.dot(self.input, self.weights1) + self.bias1)
self.layer2 = sigmoid(np.dot(self.layer1, self.weights2) + self.bias2)
self.layer3 = sigmoid(np.dot(self.layer2, self.weights3) + self.bias3)
self.output = sigmoid(np.dot(self.layer3, self.weights4) + self.bias4)
I tried your code this way and it seems to work for me
Here is how the prediction looks like
By the way, not that it makes a big difference but theoretically, you should use a binary cross entropy cost function rather than MSE because your problem here is logistic regression. MSE may make it non-convex that would otherwise be convex.
So basically, if the weights, which are initially assigned randomly, are within a range, of about 0.4 to -0.4 for w0 for example, then they will change and the accuracy will improve. However, if the weights are assigned a random number that is outside this range, then they won't be changed at all. I can't figure it out. Any suggestions would be great :)
import numpy as np
from matplotlib import pyplot as plt
class NN_model:
def __init__(self, data):
self.data = data
self.w0 = np.random.randn()
self.w1 = np.random.randn()
self.bias = np.random.randn()
self.trained = False
def sigmoid(self, x):
return 1 / (1 + np.exp(-x))
def sigmoid_p(self, x):
return self.sigmoid(x) * (1 - self.sigmoid(x))
def calculate_loss(self, pred, target):
loss = np.square(target - pred)
return loss
def training_loop(self, data):
costs = []
learning_rate = 0.005
print(self.w0)
if self.trained == True:
print('Model already trained')
pass
else:
for i in range(50):
for i in range(100000):
ri = np.random.randint(len(data))
point = data[ri]
sig_out = ((self.w0 * point[0]) + (self.w1 * point[1]) + self.bias)
pred = self.sigmoid(sig_out)
cost = self.calculate_loss(pred, point[2])
costs.append(cost)
dcost_dpred = 2 * (pred - point[2])
dpred_dsigout = self.sigmoid_p(sig_out)
dsigout_dw0 = point[0]
dsigout_dw1 = point[1]
dsigout_dbias = 1
dcost_dw0 = dcost_dpred * dpred_dsigout * dsigout_dw0
dcost_dw1 = dcost_dpred * dpred_dsigout * dsigout_dw1
dcost_dbias = dcost_dpred * dpred_dsigout * dsigout_dbias
self.w0 += - learning_rate * dcost_dw0
self.w1 += - learning_rate * dcost_dw0
self.bias += - learning_rate * dcost_dbias
print(self.w0, self.w1, self.bias)
#-0.0752623452445784 0.2447376547554179 4.032995041915469
#-0.3042823068224879 0.015717693177505765 18.643149928253827
self.trained = True
plt.plot(costs)
plt.show()
def predict(self, test_data):
if self.trained == True:
pred = self.sigmoid( (test_data[0] * self.w0) + (test_data[1] * self.w1) + self.bias )
print(pred)
if pred > 0.5:
print('Woman')
else:
print('Man')
else:
print('Error: Model has not been trained yet')
You have to have intuitive understanding of how the neural networks work behind the scene. There is no rule of thumb to decide the best values for your weights.
It might be possible that your initial weights pulls your near to accuracy really fast but in your example it might seem that while taking weights in a wide range with learning rate = 0.005, it will take some time until your weights are reduced. Therefore learning rates also play crucial role. Try tweaking that as well.