Related
I am building a custom loss function that needs to know whether the truth and the prediction have N pixels above a threshold. This is because the logic breaks if I supply an np.where() array which is empty. I can get around this issue by using try/else to return a 'flagged constant' in the case that the function fails on the empty set, but I'd like to do something different. Here is my current method.
def some_loss(cutoff=20, min_pix=10):
def gen_loss(y_true, y_pred):
trues = tf.map_fn(fn = lambda x: x, elems = y_true)
preds = tf.map_fn(fn = lambda x: x, elems = y_pred)
for idx in tf.range(tf.shape(y_true)[0]):
# binarize both by cutoff
true = y_true[idx]
pred = y_pred[idx]
true = tf.where(true < cutoff, 0.0, 1.0)
pred = tf.where(pred < cutoff, 0.0, 1.0)
# now I sum each to get the number of pixels above threshold
n_true, n_pred = tf.reduce_sum(true), tf.reduce_sum(pred)
# then I create a switch using tf.conditional
switch = tf.cond(tf.logical_or(n_true < min_pix, n_pred < min_pix), lambda: tf.zeros_like(true), lambda: tf.ones_like(true))
# this essentially allows me to turn off the loss if either condition is met
# so I then run the function
loss = get_loss(true, pred) # returns random constant if either is below threshold
loss += tf.reduce_sum(tf.math.multiply(loss, switch))
return loss
return gen_loss
This may work, it compiles and trains a convolutional model. However, I don't like that there are random constants wandering about my loss function, and I'd rather only operate the function get_loss() if both true and pred meet the minimum conditions.
I'd prefer to make two tensors, one with samples not meeting the condition, the other with samples meeting the condition.
Separately, I've tried to use tf.conditional to test for each case and call a separate loss function in either case. The code is repeated below.
def avgMED(scaler, cutoff=20, min_N=30,c=3):
def AVGmed(y_true, y_pred):
const = tf.constant([c],tf.float32) # constant c, multiplied by MED (
batch_size = tf.cast(tf.shape(y_true)[0], tf.float32)
MSE = tf.reduce_mean(tf.square(y_true-y_pred))
y_true = tf.reshape(y_true, shape=(tf.shape(y_true)[0], -1))
y_pred = tf.reshape(y_pred, shape=(tf.shape(y_pred)[0], -1))
loss, loss_med = tf.cast(0,dtype=tf.float32), tf.cast(0,dtype=tf.float32)
# rescale
y_true = y_true*scaler.scale_
y_true = y_true+scaler.mean_
y_pred = y_pred*scaler.scale_
y_pred = y_pred+scaler.mean_
trues = tf.map_fn(fn = lambda x: x, elems=y_true)
preds = tf.map_fn(fn = lambda x: x, elems=y_pred)
min_nonzero_pixels = tf.reduce_sum(tf.constant(min_N, dtype=tf.float32))
for idx in tf.range(batch_size):
idx = tf.cast(idx, tf.int32)
true = trues[idx]
pred = preds[idx]
MSE = tf.reduce_mean(tfm.square(tfm.subtract(true,pred)))
true = tf.where(true<cutoff,0.0,1.0)
pred = tf.where(pred<cutoff,0.0,1.0)
n_true = tf.reduce_sum(true)
n_pred = tf.reduce_sum(pred)
loss_TA = tf.cond(tf.logical_or(n_true < min_nonzero_pixels, n_pred < min_nonzero_pixels), get_zero(true,pred), get_MED(true,pred))
loss_med += loss_TA.read(0)
loss += loss_med + MSE # do we benefit from reducing across the batch dimension? we should be able to look at familiar batches and see the little increase due to the distance component
tf.print(n_true,n_pred)
tf.print(loss_med)
return loss # this is essentially MSE given c ~ 0. Thus, this will show if there are some weird gradients flowing through that are preventing the model from learning
return AVGmed
def get_MED(A,B):
# takes in binary tensors
indices_A, indices_B = tf.where(A), tf.where(B)
coordX_A_TA, coordY_A_TA = find_coord(indices_A) # finds x,y coordinates and returns tensor array
coordX_B_TA, coordY_B_TA = find_coord(indices_B)
mindists_AB_TA = find_min_distances(coordX_A_TA, coordY_A_TA, coordX_B_TA, coordY_B_TA)
mindists_BA_TA = find_min_distances(coordX_B_TA, coordY_B_TA, coordX_A_TA, coordY_A_TA)
# MED = mean error distance =
med_AB = tf.reduce_mean(mindists_AB_TA.read(0))
med_BA = tf.reduce_mean(mindists_BA_TA.read(0))
avg_med = tfm.divide(tfm.add(med_AB,med_BA),tf.constant(0.5))
loss_TA = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
loss_TA.write(loss_TA.size(), avg_med)
return loss_TA
def get_zero(A,B):
loss_TA = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
loss_TA.write(loss_TA.size(), 0)
return loss_TA
However, with this framework I am now getting new errors about my generator not having enough data, which is absurd given the batch size I test with is 10 and 1 step_per_epoch on a train size of 100. Got a warning about not closing the TensorArray, which I expect happens whether the conditional is true or false. Inching closer to a solution but could use some guidance on how problematic my tensorflow logic is.
This code is built up as follows: My robot takes a picture, some tf computer vision model calculates where in the picture the target object starts. This information (x1 and x2 coordinate) is passed to a pytorch model. It should learn to predict the correct motor activations, in order to get closer to the target. After the movement is executed, the robot takes a picture again and the tf cv model should calculate whether the motor activation brought the robot closer to the desired state (x1 at 10, x2 coordinate at at31)
However every time i run the code pytorch is not able to calculate the gradients.
I'm wondering if this is some data-type problem or if it is a more general one: Is it impossible to calculate the gradients if the loss is not calculated directly from the pytorch network's output?
Any help and suggestions will be greatly appreciated.
#define policy model (model to learn a policy for my robot)
import torch
import torch.nn as nn
import torch.nn.functional as F
class policy_gradient_model(nn.Module):
def __init__(self):
super(policy_gradient_model, self).__init__()
self.fc0 = nn.Linear(2, 2)
self.fc1 = nn.Linear(2, 32)
self.fc2 = nn.Linear(32, 64)
self.fc3 = nn.Linear(64,32)
self.fc4 = nn.Linear(32,32)
self.fc5 = nn.Linear(32, 2)
def forward(self,x):
x = self.fc0(x)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = F.relu(self.fc5(x))
return x
policy_model = policy_gradient_model().double()
print(policy_model)
optimizer = torch.optim.AdamW(policy_model.parameters(), lr=0.005, betas=(0.9,0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
#make robot move as predicted by pytorch network (not all code included)
def move(motor_controls):
#define curvature
# motor_controls[0] = sigmoid(motor_controls[0])
activation_left = 1+(motor_controls[0])*99
activation_right = 1+(1- motor_controls[0])*99
print("activation left:", activation_left, ". activation right:",activation_right, ". time:", motor_controls[1]*100)
#start movement
#main
import cv2
import numpy as np
import time
from torch.autograd import Variable
print("start training")
losses=[]
losses_end_of_epoch=[]
number_of_steps_each_epoch=[]
loss_function = nn.MSELoss(reduction='mean')
#each epoch
for epoch in range(2):
count=0
target_reached=False
while target_reached==False:
print("epoch: ", epoch, ". step:", count)
###process and take picture
indices = process_picture()
###binary_network(sliced)=indices as input for policy model
optimizer.zero_grad()
###output: 1 for curvature, 1 for duration of movement
motor_controls = policy_model(Variable(torch.from_numpy(indices))).detach().numpy()
print("NO TANH output for motor: 1)activation left, 2)time ", motor_controls)
motor_controls[0] = np.tanh(motor_controls[0])
motor_controls[1] = np.tanh(motor_controls[1])
print("TANH output for motor: 1)activation left, 2)time ", motor_controls)
###execute suggested action
move(motor_controls)
###take and process picture2 (after movement)
indices = (process_picture())
###loss=(binary_network(picture2) - desired
print("calculate loss")
print("idx", indices, type(torch.tensor(indices)))
# loss = 0
# loss = (indices[0]-10)**2+(indices[1]-31)**2
# loss = loss/2
print("shape of indices", indices.shape)
array=np.zeros((1,2))
array[0]=indices
print(array.shape, type(array))
array2 = torch.ones([1,2])
loss = loss_function(torch.tensor(array).double(), torch.tensor([[10.0,31.0]]).double()).float()
print("loss: ", loss, type(loss), loss.shape)
# array2[0] = loss_function(torch.tensor(array).double(),
torch.tensor([[10.0,31.0]]).double()).float()
losses.append(loss)
#start line causing the error-message (still part of main)
###calculate gradients
loss.backward()
#end line causing the error-message (still part of main)
###apply gradients
optimizer.step()
#Output (so far as intented) (not all included)
#calculate loss
idx [14. 15.] <class 'torch.Tensor'>
shape of indices (2,)
(1, 2) <class 'numpy.ndarray'>
loss: tensor(136.) <class 'torch.Tensor'> torch.Size([])
#Error Message:
Traceback (most recent call last):
File "/home/pi/Desktop/GradientPolicyLearning/PolicyModel.py", line 259, in <module>
array2.backward()
File "/home/pi/.local/lib/python3.7/site-packages/torch/tensor.py", line 134, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/pi/.local/lib/python3.7/site-packages/torch/autograd/__init__.py", line 99, in
backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
If you call .detach() on the prediction, that will delete the gradients. Since you are first getting indices from the model and then trying to backprop the error, I would suggest
prediction = policy_model(torch.from_numpy(indices))
motor_controls = prediction.clone().detach().numpy()
This would keep the predictions as it is with the calculated gradients that can be backproped.
Now you can do
loss = loss_function(prediction, torch.tensor([[10.0,31.0]]).double()).float()
Note, you might wanna call double of the prediction if it throws an error.
It is indeed impossible to calculate the gradients if the loss is not calculated directly from the PyTorch network's output because then you would not be able to apply the chain rule which is used to optimise the gradients.
simple solution, turn on the Context Manager that sets gradient calculation to ON, if it is off
torch.set_grad_enabled(True) # Context-manager
Make sure that all your inputs into the NN, the output of NN and ground truth/target values are all of type torch.tensor and not list, numpy.array or any other iterable.
Also, make sure that they are not converted to list or numpy.array at any point either.
In my case, I got this error because I performed list comprehension on the tensor containing predicted values from NN. I did this to get the max value in each row. Then, converted the list back to a torch.tensor. before calculating the loss.
This back and forth conversion disables the gradient calculations
In my case, I got past this error by specifying requires_grad=True when defining my input tensors
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
# define rosenbrock function and gradient
a = 1
b = 5
def f(x):
return (a - x[0]) ** 2 + b * (x[1] - x[0] ** 2) ** 2
def jac(x):
dx1 = -2 * a + 4 * b * x[0] ** 3 - 4 * b * x[0] * x[1] + 2 * x[0]
dx2 = 2 * b * (x[1] - x[0] ** 2)
return np.array([dx1, dx2])
# create stochastic rosenbrock function and gradient
def f_rand(x):
return f(x) * np.random.uniform(0.5, 1.5)
def jac_rand(x): return jac(x) * np.random.uniform(0.5, 1.5)
# use hand coded adam
x = np.array([0.1, 0.1])
x0 = x.copy()
j = jac_rand(x)
beta1=0.9
beta2=0.999
eps=1e-8
m = x * 0
v = x * 0
learning_rate = .1
for ii in range(200):
m = (1 - beta1) * j + beta1 * m # first moment estimate.
v = (1 - beta2) * (j ** 2) + beta2 * v # second moment estimate.
mhat = m / (1 - beta1 ** (ii + 1)) # bias correction.
vhat = v / (1 - beta2 ** (ii + 1))
x = x - learning_rate * mhat / (np.sqrt(vhat) + eps)
x -= learning_rate * v
j = jac_rand(x)
print('hand code finds optimal to be ', x, f(x))
# attempt to use pytorch
import torch
x_tensor = torch.tensor(x0, requires_grad=True)
optimizer = torch.optim.Adam([x_tensor], lr=learning_rate)
def closure():
optimizer.zero_grad()
loss = f_rand(x_tensor)
loss.backward()
return loss
for ii in range(200):
optimizer.step(closure)
print('My PyTorch attempt found ', x_tensor, f(x_tensor))
Following worked for me:
loss.requires_grad = True
loss.backward()
I am referring to this Variational Autoencoder code in github for CelebA dataset. There is an encoder and decoder as usual. The encoder outputs x which is used to get the log variance and mean of the posterior distribution. Now the reparameterization trick gives us the encoded
z = sigma * u + mean
where u comes from normal distribution with mean = 0 and variance = 1. I can figure all of this in the code. However in the 'ScaleShift' class, which is a new keras layer, I don't understand why we are using add_loss(lodget). I checked the Keras documentation for writing your own layers and there doesn't seem to be a requirement for defining loss for a layer. I am confused by what this line's functionality is.
#new keras layer
class ScaleShift(Layer):
def __init__(self, **kwargs):
super(ScaleShift, self).__init__(**kwargs)
def call(self, inputs):
z, shift, log_scale = inputs
#reparameterization trick
z = K.exp(log_scale) * z + shift
#what are the next two lines doing?
logdet = -K.sum(K.mean(log_scale, 0))
self.add_loss(logdet)
return z
#gets the mean parameter(z_shift) from x, and similarly the log variance parameter(z_log_scale)
z_shift = Dense(z_dim)(x)
z_log_scale = Dense(z_dim)(x)
#keras lambda layer, spits out random normal variable u of same dimension as z_shift
u = Lambda(lambda z: K.random_normal(shape=K.shape(z)))(z_shift)
z = ScaleShift()([u, z_shift, z_log_scale])
x_recon = decoder(z)
x_out = Subtract()([x_in, x_recon])
recon_loss = 0.5 * K.sum(K.mean(x_out**2, 0)) + 0.5 * np.log(2*np.pi) * np.prod(K.int_shape(x_out)[1:])
#KL divergence loss
z_loss = 0.5 * K.sum(K.mean(z**2, 0)) - 0.5 * K.sum(K.mean(u**2, 0))
vae_loss = recon_loss + z_loss
vae = Model(x_in, x_out)
#VAE loss to be optimised
vae.add_loss(vae_loss)
vae.compile(optimizer=Adam(1e-4))
I have the following function
def msfe(ys, ts):
ys=ys.detach().numpy() #output from the network
ts=ts.detach().numpy() #Target (true labels)
pred_class = (ys>=0.5)
n_0 = sum(ts==0) #Number of true negatives
n_1 = sum(ts==1) #Number of true positives
FPE = sum((ts==0)[[bool(p) for p in (pred_class==1)]])/n_0 #False positive error
FNE = sum((ts==1)[[bool(p) for p in (pred_class==0)]])/n_1 #False negative error
loss= FPE**2+FNE**2
loss=torch.tensor(loss,dtype=torch.float64,requires_grad=True)
return loss
and I wonder, if the autograd in Pytorch works properly, since ys and ts does not have the grad flag.
So my question is: do all the variables (FPE,FNE,ys,ts,n_1,n_0) have to be tensors, before optimizer.step() works, or is it okay that it is only the final function (loss) which is ?
All of the variables you want to optimise via optimizer.step() need to have gradient.
In your case it would be y predicted by network, so you shouldn't detach it (from graph).
Usually you don't change your targets, so those don't need gradients. You shouldn't have to detach them though, tensors by default don't require gradient and won't be backpropagated.
Loss will have gradient if it's ingredients (at least one) have gradient.
Overall you rarely need to take care of it manually.
BTW. don't use numpy with PyTorch, there is rarely ever the case to do so. You can perform most of the operations you can do on numpy array on PyTorch's tensor.
BTW2. There is no such thing as Variable in pytorch anymore, only tensors which require gradient and those that don't.
Non-differentiability
1.1 Problems with existing code
Indeed, you are using functions which are not differentiable (namely >= and ==). Those will give you trouble only in the case of your outputs, as those required gradient (you can use == and >= for targets though).
Below I have attached your loss function and outlined problems in it in the comments:
# Gradient can't propagate if you detach and work in another framework
# Most Python constructs should be fine, detaching will ruin it though.
def msfe(outputs, targets):
# outputs=outputs.detach().numpy() # Do not detach, no need to do that
# targets=targets.detach().numpy() # No need for numpy either
pred_class = outputs >= 0.5 # This one is non-differentiable
# n_0 = sum(targets==0) # Do not use sum, there is pytorch function for that
# n_1 = sum(targets==1)
n_0 = torch.sum(targets == 0) # Those are not differentiable, but...
n_1 = torch.sum(targets == 1) # It does not matter as those are targets
# FPE = sum((targets==0)[[bool(p) for p in (pred_class==1)]])/n_0 # Do not use Python bools
# FNE = sum((targets==1)[[bool(p) for p in (pred_class==0)]])/n_1 # Stay within PyTorch
# Those two below are non-differentiable due to == sign as well
FPE = torch.sum((targets == 0.0) * (pred_class == 1.0)).float() / n_0
FNE = torch.sum((targets == 1.0) * (pred_class == 0.0)).float() / n_1
# This is obviously fine
loss = FPE ** 2 + FNE ** 2
# Loss should be a tensor already, don't do things like that
# Gradient will not be propagated, you will have a new tensor
# Always returning gradient of `1` and that's all
# loss = torch.tensor(loss, dtype=torch.float64, requires_grad=True)
return loss
1.2 Possible solution
So, you need to get rid of 3 non-differentiable parts. You could in principle try to approximate it with continuous outputs from your network (provided you are using sigmoid as activation). Here is my take:
def msfe_approximation(outputs, targets):
n_0 = torch.sum(targets == 0) # Gradient does not flow through it, it's okay
n_1 = torch.sum(targets == 1) # Same as above
FPE = torch.sum((targets == 0) * outputs).float() / n_0
FNE = torch.sum((targets == 1) * (1 - outputs)).float() / n_1
return FPE ** 2 + FNE ** 2
Notice that to minimize FPE outputs will try to be zero on the indices where targets are zero. Similarly for FNE, if targets are 1, network will try to output 1 as well.
Notice similarity of this idea to BCELoss (Binary CrossEntropy).
And lastly, example you can run this on, just for sanity check:
if __name__ == "__main__":
model = torch.nn.Sequential(
torch.nn.Linear(30, 100),
torch.nn.ReLU(),
torch.nn.Linear(100, 200),
torch.nn.ReLU(),
torch.nn.Linear(200, 1),
torch.nn.Sigmoid(),
)
optimizer = torch.optim.Adam(model.parameters())
targets = torch.randint(high=2, size=(64, 1)) # random targets
inputs = torch.rand(64, 30) # random data
for _ in range(1000):
optimizer.zero_grad()
outputs = model(inputs)
loss = msfe_approximation(outputs, targets)
print(loss)
loss.backward()
optimizer.step()
print(((model(inputs) >= 0.5) == targets).float().mean())
I'm trying to code a neural network from scratch in python. To check whether everything works I wanted to overfit the network but the loss seems to explode at first and then comes back to the initial value and stops there (Doesn't converge). I've checked my code and could find the reason. I assume my understanding or implementation of backpropagation is incorrect but there might be some other reason. Can anyone help me out or at least point me in the right direction?
# Initialize weights and biases given dimesnsions (For this example the dimensions are set to [12288, 64, 1])
def initialize_parameters(dims):
# Initiate parameters
parameters = {}
L = len(dims) # Number of layers in the network
# Loop over the given dimensions. Initialize random weights and set biases to zero.
for i in range(1, L):
parameters["W" + str(i)] = np.random.randn(dims[i], dims[i-1]) * 0.01
parameters["b" + str(i)] = np.zeros([dims[i], 1])
return parameters
# Activation Functions
def relu(x, deriv=False):
if deriv:
return 1. * (x > 0)
else:
return np.maximum(0,x)
def sigmoid(x, deriv=False):
if deriv:
return x * (1-x)
else:
return 1/(1 + np.exp(-x))
# Forward and backward pass for 2 layer neural network. (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
# Array for storing gradients
grads = {}
# Get the length of examples
m = Y.shape[1]
# First layer
Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
A1 = relu(Z1)
# Second layer
Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
AL = sigmoid(Z2)
# Compute cost
cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL)) + np.multiply(1 - Y, np.log(1 - AL)))
# Backpropagation
# Second Layer
dAL = - (np.divide(Y, AL) - np.divide(1 - Y, 1 - AL))
dZ2 = dAL * sigmoid(AL, deriv=True)
grads["dW2"] = np.dot(dZ2, A1.T) / m
grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m
# First layer
dA1 = np.dot(parameters["W2"].T, dZ2)
dZ1 = dA1 * relu(A1, deriv=True)
grads["dW1"] = np.dot(dZ1, X.T)
grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m
return AL, grads, cost
# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1
# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []
# Train the network
for i in range(epoches):
# Get X and Y
x = np.array(train[0:10],ndmin=2).T
y = np.array(labels[0:10], ndmin=2).T
# Perform forward and backward pass
AL, grads, cost = forward_backward(x, y, parameters)
# Compute cost and append to the log_list
log_list.append(cost)
# Update parameters with computed gradients
parameters = update_parameters(grads, parameters, learning_rate)
plt.plot(log_list)
plt.title("Loss of the network")
plt.show()
I am struggling to find the place where you calculate the error gradients and the input training data sample would also help...
I don't know if this will help you, but I'll share my solution for Python neural network to learn XOR problem.
import numpy as np
def sigmoid_function(x, derivative=False):
"""
Sigmoid function
“x” is the input and “y” the output, the nonlinear properties of this function means that
the rate of change is slower at the extremes and faster in the centre. Put plainly,
we want the neuron to “make its mind up” instead of indecisively staying in the middle.
:param x: Float
:param Derivative: Boolean
:return: Float
"""
if (derivative):
return x * (1 - x) # Derivative using the chain rule.
else:
return 1 / (1 + np.exp(-x))
# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])
#initialize variables
learning_rate = 0.1
epoch = 50000 #number or iterations basically - One round of forward and back propagation is called an epoch
# get the second element from the numpy array shape field to detect the count of features for input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3 #number of hidden layer neurons
output_layer_neurons = 1 #number of output layer neurons
#init weight & bias
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(1, hidden_layer_neurons)
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(1, output_layer_neurons)
for i in range(epoch):
#forward propagation
hidden_layer_input_temp = np.dot(input_data, weights_hidden) #matrix dot product to adjust for weights in the layer
hidden_layer_input = hidden_layer_input_temp + bias_hidden #adjust for bias
hidden_layer_activations = sigmoid_function(hidden_layer_input) #use the activation function
output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
output_layer_input = output_layer_input_temp + bias_output
output = sigmoid_function(output_layer_input) #final output
#backpropagation (where adjusting of the weights happens)
error = ideal_output - output #error gradient
if (i % 1000 == 0):
print("Error: {}".format(np.mean(abs(error))))
#use derivatives to compute slope of output and hidden layers
slope_output_layer = sigmoid_function(output, derivative=True)
slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)
#calculate deltas
delta_output = error * slope_output_layer
error_hidden_layer = delta_output.dot(weights_output.T) #calculates the error at hidden layer
delta_hidden = error_hidden_layer * slope_hidden_layer
#change the weights
weights_output += hidden_layer_activations.T.dot(delta_output) * learning_rate
bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
weights_hidden += input_data.T.dot(delta_hidden) * learning_rate
bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate