I am currently in the process of trying to implement my own neural network from scratch to test my understanding of the method. I thought things were going well, as my network managed to approximate AND and XOR functions without an issue, but it turns out it is having a problem with learning to approximate a simple square function.
I have attempted to use a variety of different network configurations, with anywhere from 1 to 3 layers, and 1-64 nodes. I have varied the learning rate from 0.1 to 0.00000001, and implemented weight decay as I thought some regularisation might provide some insight as to what went wrong. I have also implemented gradient check, which is giving me conflicting answers, as it varies greatly from attempt to attempt, ranging from a dreadful 0.6 difference to a fantastic 1e-10. I am using the leaky ReLU activation function, and MSE as my cost function.
Could somebody help me spot what I am missing? Or is this purely down to optimising the hyper parameters?
My code is as follows:
import matplotlib.pyplot as plt
import numpy as np
import Sub_Script as ss
# Create sample data set using X**2
# X is laid out as (features, examples) so every column is one sample.
X = np.expand_dims(np.linspace(0, 1, 201), axis=0)
y = X**2
plt.plot(X.T, y.T)

# Hyper-parameters
layer_dims = [1, 64, 1]
learning_rate = 0.000001
iterations = 50000
decay = 0.00000001
num_ex = y.shape[1]

# Initializations
# Index 0 of both lists is a placeholder so that list index == layer number.
num_layers = len(layer_dims)
weights = [None] + [
    np.random.randn(layer_dims[layer], layer_dims[layer - 1])
    * np.sqrt(2 / layer_dims[layer - 1])
    for layer in range(1, num_layers)
]
biases = [None] + [np.zeros((layer_dims[layer], 1)) for layer in range(1, num_layers)]

# Gradient check on the freshly initialised parameters
dweights, dbiases, dw_approx, db_approx = ss.grad_check(weights, biases, num_layers, X, y, decay, num_ex)

# Main function: Iteration loop
# (`step` rather than `iter`, so the builtin `iter` is not shadowed)
for step in range(iterations):
    # Forward propagation, backward propagation, then one gradient step
    z_values, acts = ss.forward_propagation(weights, biases, num_layers, X)
    dweights, dbiases = ss.backward_propagation(weights, biases, num_layers, z_values, acts, y)
    weights, biases = ss.update_paras(weights, biases, dweights, dbiases, learning_rate, decay, num_ex)
    # Report the cost every 1001 iterations; `acts` was computed before the
    # update, so the printed cost lags the current parameters by one step.
    if step % (1000+1) == 0:
        print('Cost: ', ss.mse(acts[-1], y, weights, decay, num_ex))

# Gradient Checking (again, on the trained parameters)
dweights, dbiases, dw_approx, db_approx = ss.grad_check(weights, biases, num_layers, X, y, decay, num_ex)

# Visualization
plt.plot(X.T, acts[-1].T)
With Sub_Script containing the neural network functions:
import numpy as np
import copy as cp
# Construct sub functions, forward, backward propagation and cost and activation functions
# Leaky ReLU Activation Function
def relu(x):
    """Leaky ReLU: pass positive inputs through, scale negative inputs by 0.01."""
    positive_part = (x > 0) * x
    negative_part = (x < 0) * 0.01*x
    return positive_part + negative_part
# Leaky ReLU activation Function Gradient
def relu_grad(x):
    """Slope of the leaky ReLU: 1 where x > 0, 0.01 where x < 0, and 0 at exactly x == 0."""
    slope_where_positive = x > 0
    slope_where_negative = (x < 0) * 0.01
    return slope_where_positive + slope_where_negative
# MSE Cost Function
def mse(prediction, actual, weights, decay, num_ex):
    """Mean squared error plus an L2 weight-decay penalty.

    prediction, actual -- arrays whose second axis indexes the examples; the
                          squared error is summed and divided by actual.shape[1].
    weights            -- list of weight matrices; entry 0 is a placeholder and
                          is skipped.
    decay, num_ex      -- the penalty is (decay / (2 * num_ex)) * sum(w ** 2).

    The penalty uses the *squared* weights so that its gradient is
    (decay / num_ex) * w, which is the term update_paras applies. Summing the
    raw weights instead would let negative weights lower the cost.
    """
    data_cost = np.sum((actual - prediction) ** 2) / actual.shape[1]
    l2_penalty = (decay / (2 * num_ex)) * sum(np.sum(w ** 2) for w in weights[1:])
    return data_cost + l2_penalty
# MSE Cost Function Gradient
def mse_grad(prediction, actual):
    """Derivative of the (unregularised) mean squared error with respect to the prediction."""
    residual = actual - prediction
    n_examples = actual.shape[1]
    return -2 * residual/n_examples
# Forward Propagation
def forward_propagation(weights, biases, num_layers, act):
    """Run the network forward and keep every layer's intermediate values.

    weights, biases -- per-layer parameter lists; index 0 is a placeholder.
    num_layers      -- number of layers including the input layer.
    act             -- input activations, stored as acts[0].

    Returns (z_values, acts): the pre-activation and post-activation values of
    every layer, indexed by layer number. z_values[0] keeps its placeholder.

    NOTE(review): the leaky ReLU is applied to the output layer as well. If a
    linear output is wanted, backward_propagation must drop the matching
    relu_grad factor on the last layer at the same time.
    """
    acts = [[None] for _ in range(num_layers)]
    z_values = [[None] for _ in range(num_layers)]
    acts[0] = act
    for layer in range(1, num_layers):
        z_values[layer] = np.dot(weights[layer], acts[layer - 1]) + biases[layer]
        acts[layer] = relu(z_values[layer])
    return z_values, acts
# Backward Propagation
def backward_propagation(weights, biases, num_layers, z_values, acts, y):
    """Back-propagate the MSE gradient through every layer.

    z_values, acts -- the per-layer values returned by forward_propagation.
    y              -- target values, same shape as acts[-1].

    Returns (dweights, dbiases), indexed by layer number with a placeholder at
    index 0. The weight-decay term is not included here; update_paras adds it.
    `biases` is accepted for signature symmetry but is not read.
    """
    dweights = [[None] for _ in range(num_layers)]
    dbiases = [[None] for _ in range(num_layers)]

    # Output layer: dC/dz = dC/da * da/dz
    zgrad = mse_grad(acts[-1], y) * relu_grad(z_values[-1])
    dweights[-1] = np.dot(zgrad, acts[-2].T)
    dbiases[-1] = np.sum(zgrad, axis=1, keepdims=True)

    # Hidden layers, walking from the last hidden layer back to layer 1
    for layer in range(num_layers - 2, 0, -1):
        zgrad = np.dot(weights[layer + 1].T, zgrad) * relu_grad(z_values[layer])
        dweights[layer] = np.dot(zgrad, acts[layer - 1].T)
        dbiases[layer] = np.sum(zgrad, axis=1, keepdims=True)
    return dweights, dbiases
# Update Parameters with Regularization
def update_paras(weights, biases, dweights, dbiases, learning_rate, decay, num_ex):
    """Take one gradient-descent step and return new (weights, biases) lists.

    Weights receive an extra (decay / num_ex) * w shrinkage term; biases do
    not. Index 0 of each returned list is the None placeholder.
    """
    shrink = decay/num_ex
    new_weights = [None]
    for w, dw in zip(weights[1:], dweights[1:]):
        new_weights.append(w - learning_rate*(dw + shrink*w))
    new_biases = [None]
    for b, db in zip(biases[1:], dbiases[1:]):
        new_biases.append(b - learning_rate*db)
    return new_weights, new_biases
# Gradient Checking
def grad_check(weights, biases, num_layers, X, y, decay, num_ex):
    """Compare back-propagated gradients against central finite differences.

    Every weight and bias is perturbed by +/- epsilon in turn and the cost is
    re-evaluated, giving a numerical gradient. The relative difference between
    the analytic and numerical gradient vectors is printed.

    Returns (dweights, dbiases, dw_approx, db_approx).

    NOTE(review): the numerical cost comes from mse(), which includes the
    weight-decay penalty, while backward_propagation() returns gradients
    without it. With a non-zero `decay` the two are not expected to match
    exactly.
    """
    z_values, acts = forward_propagation(weights, biases, num_layers, X)
    dweights, dbiases = backward_propagation(weights, biases, num_layers, z_values, acts, y)
    epsilon = 1e-7
    dw_approx = cp.deepcopy(weights)
    db_approx = cp.deepcopy(biases)

    def cost_at(w, b):
        # Cost of the network for one particular set of parameters.
        _, layer_acts = forward_propagation(w, b, num_layers, X)
        return mse(layer_acts[-1], y, w, decay, num_ex)

    # Scratch copies: one entry at a time is perturbed and then restored, so
    # the caller's parameters are never modified and no per-entry deepcopy of
    # the whole parameter list is needed.
    w_probe = cp.deepcopy(weights)
    b_probe = cp.deepcopy(biases)

    for layer in range(1, num_layers):
        height, width = weights[layer].shape
        for i in range(height):
            for j in range(width):
                saved = w_probe[layer][i, j]
                w_probe[layer][i, j] = saved + epsilon
                cost_plus = cost_at(w_probe, biases)
                w_probe[layer][i, j] = saved - epsilon
                cost_minus = cost_at(w_probe, biases)
                w_probe[layer][i, j] = saved
                dw_approx[layer][i, j] = (cost_plus - cost_minus)/(2*epsilon)

            # One bias per output unit, i.e. per row i of the weight matrix
            saved = b_probe[layer][i, 0]
            b_probe[layer][i, 0] = saved + epsilon
            cost_plus = cost_at(weights, b_probe)
            b_probe[layer][i, 0] = saved - epsilon
            cost_minus = cost_at(weights, b_probe)
            b_probe[layer][i, 0] = saved
            db_approx[layer][i, 0] = (cost_plus - cost_minus)/(2*epsilon)

    # Flatten everything into two long vectors: weights first, then biases
    d_paras = np.concatenate(
        [dw.flatten() for dw in dweights[1:]] + [db.flatten() for db in dbiases[1:]],
        axis=None)
    d_approx_paras = np.concatenate(
        [dw.flatten() for dw in dw_approx[1:]] + [db.flatten() for db in db_approx[1:]],
        axis=None)

    # Relative error: ||a - b|| / (||a|| + ||b||)
    difference = np.linalg.norm(d_paras - d_approx_paras)/(
        np.linalg.norm(d_paras) + np.linalg.norm(d_approx_paras))
    if difference > 2e-7:
        print("\033[93m" + "There is a mistake in the backward propagation! difference = " + str(difference) + "\033[0m")
    else:
        print("\033[92m" + "Your backward propagation works perfectly fine! difference = " + str(difference) + "\033[0m")
    return dweights, dbiases, dw_approx, db_approx
Edit: Made some corrections to some old comments I had in the code, to avoid confusion
Edit 2: Thanks to @sid_508 for helping me find the main problem with my code! I also want to mention that I found a mistake in the way I had implemented the weight decay. After making the suggested changes and removing the weight decay entirely for now, the neural network appears to work!

I ran your code and this is the output it gave:
The issue is that you use ReLU for the final layer too, so you can't get the best fit. Use no activation in the final layer and it should produce much better results.
The final-layer activation usually differs from the one used in the hidden layers, and it depends on the type of output you want. For continuous outputs use a linear activation (i.e. no activation), and for classification use sigmoid/softmax.


how to use tensorboard to visualize functions

I am new to TensorFlow 2.9 and have just finished writing a function that performs linear regression, but I ran into problems when trying to visualize it with TensorBoard. I know how to record data, but I don't know how to generate a graph with tf.summary.trace_on.
Here is my code.
def linear_regression_1():
    """Fit a one-variable linear model with SGD and log histograms for TensorBoard.

    The training data is synthetic: targets are generated as 0.8 * x + 0.7, so
    the learned weight and bias can be compared against those two constants.
    Summaries are written under ./tmp/linear.
    """
    writer = tf.summary.create_file_writer("./tmp/linear")
    # 100 random samples with a single feature each
    x = tf.random.normal(shape=[100, 1])
    y_true = tf.matmul(x, [[0.8]]) + 0.7
    # Trainable parameters, randomly initialised
    weights = tf.Variable(initial_value=tf.random.normal(shape=[1, 1]))
    bias = tf.Variable(initial_value=tf.random.normal(shape=[1, 1]))
    optimizer = tf.keras.optimizers.SGD(learning_rate=0.01)
    with writer.as_default():
        for i in range(1000):
            # tf.print('weights:', weights)
            # tf.print('bias:', bias)
            # Record the parameters before this step's update
            tf.summary.histogram('weights', weights, i)
            tf.summary.histogram('bias', bias, i)
            # Forward pass under the tape so the loss can be differentiated
            with tf.GradientTape() as tape:
                y_predict = tf.matmul(x, weights) + bias
                error = tf.reduce_mean(tf.square(y_predict - y_true))
            tf.summary.histogram('error', error, i)
            gradients = tape.gradient(error, [weights, bias])
            optimizer.apply_gradients(zip(gradients, [weights, bias]))
    print('weights:', weights)
    print('bias:', bias)
When I put @tf.function before this function, it just reports errors.

Problem with implementation of Multilayer perceptron

I am trying to create a multi-layered perceptron to classify a dataset of hand-drawn digits obtained from the MNIST database. It has 2 hidden layers with a sigmoid activation function, while the output layer uses SoftMax. However, for whatever reason I am not able to get it to work. I have attached the training loop from my code below; I am confident this is where the problem stems from. Can anyone identify possible issues with my implementation of the perceptron?
def train(self, inputs, targets, eta, niterations):
    """Train the network with batch gradient descent plus momentum.

    inputs is a numpy array of shape (num_train, D) containing the training images
    consisting of num_train samples each of dimension D.
    targets is a numpy array containing the training labels, one row per
    sample and one column per output unit (same shape as the network output).
    eta is the learning rate for optimization
    niterations is the number of iterations for updating the weights

    The loss is the sum-of-squares error on the softmax outputs. The gradients
    are summed over the batch and not divided by the number of samples, so
    eta has to be scaled to the batch size.
    """
    ndata = np.shape(inputs)[0]  # number of data samples
    # adding the bias
    inputs = np.concatenate((inputs, -np.ones((ndata, 1))), axis=1)

    # numpy array to store the update weights
    updatew1 = np.zeros((np.shape(self.weights1)))
    updatew2 = np.zeros((np.shape(self.weights2)))
    updatew3 = np.zeros((np.shape(self.weights3)))

    for n in range(niterations):
        # forward phase
        self.outputs = self.forwardPass(inputs)

        # Error using the sum-of-squares error function
        error = 0.5*np.sum((self.outputs-targets)**2)
        if (np.mod(n, 100) == 0):
            print("Iteration: ", n, " Error: ", error)

        # backward phase
        # Softmax couples all outputs, so dE/dz_j = sum_k (o_k - t_k) * do_k/dz_j
        # with do_k/dz_j = o_k * (delta_kj - o_j). That collapses to
        #     o_j * ((o_j - t_j) - sum_k (o_k - t_k) * o_k).
        # Multiplying the residual element-wise by the row-sum of the Jacobian
        # (what the looped version computed) always yields zero, because the
        # softmax outputs of a row sum to one.
        residual = self.outputs - targets
        deltao = self.outputs * (
            residual - np.sum(residual * self.outputs, axis=1, keepdims=True))

        # compute the derivative of the second hidden layer
        deltah2 = np.dot(deltao, np.transpose(self.weights3))
        deltah2 = self.hidden2*self.beta*(1.0-self.hidden2)*deltah2

        # compute the derivative of the first hidden layer
        # (the last column of deltah2 belongs to the bias unit and is dropped)
        deltah1 = np.dot(deltah2[:, :-1], np.transpose(self.weights2))
        deltah1 = self.hidden1*self.beta*(1.0-self.hidden1)*deltah1

        # update the weights of the three layers: self.weights1, self.weights2 and self.weights3
        updatew1 = eta*(np.dot(np.transpose(inputs), deltah1[:, :-1])) + (self.momentum * updatew1)
        updatew2 = eta*(np.dot(np.transpose(self.hidden1), deltah2[:, :-1])) + (self.momentum * updatew2)
        updatew3 = eta*(np.dot(np.transpose(self.hidden2), deltao)) + (self.momentum * updatew3)
        self.weights1 -= updatew1
        self.weights2 -= updatew2
        self.weights3 -= updatew3
def forwardPass(self, inputs):
    """Propagate a batch through both sigmoid hidden layers and the softmax output.

    inputs is a numpy array with one sample per row; train() passes it with
    the bias column (-1) already appended.

    Stores the hidden activations, each with an appended bias column of -1, in
    self.hidden1 and self.hidden2, and returns the softmax outputs (one row
    per sample, each row summing to 1).
    """
    # layer 1
    # the forward pass on the first hidden layer with the sigmoid function
    self.hidden1 = np.dot(inputs, self.weights1)
    self.hidden1 = 1.0/(1.0+np.exp(-self.beta*self.hidden1))
    self.hidden1 = np.concatenate((self.hidden1, -np.ones((np.shape(self.hidden1)[0], 1))), axis=1)

    # layer 2
    # the forward pass on the second hidden layer with the sigmoid function
    self.hidden2 = np.dot(self.hidden1, self.weights2)
    self.hidden2 = 1.0/(1.0+np.exp(-self.beta*self.hidden2))
    self.hidden2 = np.concatenate((self.hidden2, -np.ones((np.shape(self.hidden2)[0], 1))), axis=1)

    # output layer
    # the forward pass on the output layer with softmax function
    outputs = np.dot(self.hidden2, self.weights3)
    # Subtracting the row maximum leaves the softmax unchanged but keeps
    # np.exp from overflowing to inf (which would turn the row into NaN).
    outputs = np.exp(outputs - np.max(outputs, axis=1, keepdims=True))
    outputs /= np.sum(outputs, axis=1, keepdims=True)
    return outputs
Update: I have since figured something out that I messed up during the backpropagation of the SoftMax algorithm. The actual deltao should be:
deltao = self.outputs - targets
placeholder = np.zeros(np.shape(self.outputs))
for j in range(np.shape(self.outputs)[1]):
y = self.outputs[:, j]
placeholder[:, j] = y * (1 - y)
# the counter for the for loop below used to also be named y causing confusion
for i in range(np.shape(self.outputs)[1]):
if not i == j:
placeholder[:, j] += -y * self.outputs[:, i]
deltao *= placeholder
After this correction the overflow errors seem to have sorted themselves out; however, there is now a new problem: no matter which variables I change, the accuracy of the perceptron does not exceed 15%.
Second Update: After a long time I have finally found a way to get my code to work. I had to change the backpropogation of SoftMax (in code this is called deltao) to the following:
deltao = np.exp(self.outputs)
deltao = deltao * (1 - deltao)
deltao *= (self.outputs - targets)/np.shape(inputs)[0]
The only problem is that I have no idea why this works as a derivative of SoftMax. Could anyone explain this?

How to correctly calculate gradients in neural network with numpy

I am trying to build a simple neural network class from scratch using numpy, and test it using the XOR problem. But the backpropagation function (backprop) does not seem to be working correctly.
In the class, I construct instances by passing in the size of each layer, and the activation functions to use at each layer. I assume that the final activation function is softmax, so that I can calculate the derivative of cross-entropy loss wrt to Z of the last layer. I also do not have a separate set of bias matrices in my class. I just include them in the weight matrices as an extra column at the end.
I know that my backprop function is not working correctly, because the neural network does not ever converge on a somewhat correct output. I also created a numerical gradient function, and when comparing the results of both. I get drastically different numbers.
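My understanding from what I have read is that the delta values of each layer (with L being the last layer, and i representing any other layer) should be:
$$\delta^{L} = a^{L} - y, \qquad \delta^{i} = \left( (W^{i+1})^{T} \delta^{i+1} \right) * \sigma'(z^{i})$$
And the respective gradients/weight-update of those layers should be:
$$\frac{\partial C}{\partial W^{L}} = \delta^{L} \, (a^{L-1})^{T}, \qquad \frac{\partial C}{\partial W^{i}} = \delta^{i} \, (a^{i-1})^{T}$$
Where * is the Hadamard product, a represents the activation of some layer, and z represents the nonactivated output of some layer.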
The sample data that I am using to test this is at the bottom of the file.
This is my first time trying to implement the backpropagation algorithm from scratch. So I am a bit lost on where to go from here.
import numpy as np
def sigmoid(n, deriv=False):
    """Logistic sigmoid of n.

    With deriv=True, returns n * (1 - n) instead, which is the sigmoid's
    derivative only when n is already a sigmoid *output*.
    """
    if not deriv:
        return 1 / (1 + np.exp(-n))
    one_minus_n = np.subtract(1, n)
    return np.multiply(n, one_minus_n)
def softmax(X, deriv=False):
    """Softmax over all entries of X, shifted by max(X) for numerical stability.

    Raises NotImplementedError when deriv=True: the derivative is not
    provided here, because the cross-entropy gradient with respect to the
    softmax input is computed directly by cross_entropy(..., deriv=True).
    """
    if deriv:
        # `Error` is not a defined name in Python, so the original raise
        # produced a NameError instead of a meaningful exception.
        raise NotImplementedError('Unimplemented')
    exps = np.exp(X - np.max(X))
    return exps / np.sum(exps)
def cross_entropy(y, p, deriv=False):
    """Cross-entropy between labels y and predicted probabilities p.

    When deriv=True, returns p - y, the derivative of the cost with respect
    to z (the input of a softmax output layer). Otherwise p is clipped away
    from 0 and 1 and the summed cost is divided by p.shape[0].
    """
    if deriv:
        return p - y
    clipped = np.clip(p, 1e-12, 1. - 1e-12)
    count = clipped.shape[0]
    total = np.sum(y*np.log(clipped))
    return -total/(count)
class NN:
def __init__(self, layers, activations):
"""random initialization of weights/biases
NOTE - biases are built into the standard weight matrices by adding an extra column
and multiplying it by one in every layer"""
self.activate_fns = activations
self.weights = [np.random.rand(layers[1], layers[0]+1)]
for i in range(1, len(layers)):
if i != len(layers)-1:
self.weights.append(np.random.rand(layers[i+1], layers[i]+1))
for j in range(layers[i+1]):
for k in range(layers[i]+1):
if np.random.rand(1,1)[0,0] > .5:
self.weights[-1][j,k] = -self.weights[-1][j,k]
def ff(self, X, get_activations=False):
activations, zs = [], []
for activate, w in zip(self.activate_fns, self.weights):
X = np.vstack([X, np.ones((1, 1))]) # adding bias
z =
X = activate(z)
if get_activations:
return (activations, zs) if get_activations else X
def grad_descent(self, data, epochs, learning_rate):
"""gradient descent
data - list of 2 item tuples, the first item being an input, and the second being its label"""
grad_w = [np.zeros_like(w) for w in self.weights]
for _ in range(epochs):
for x, y in data:
grad_w = [n+o for n, o in zip(self.backprop(x, y), grad_w)]
self.weights = [w-(learning_rate/len(data))*gw for w, gw in zip(self.weights, grad_w)]
def backprop(self, X, y):
"""perfoms backprop for one layer of a NN with softmax/cross_entropy output layer"""
(activations, zs) = self.ff(X, True)
activations.insert(0, X)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
grad_w[-1] =[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())
for i in range(len(self.weights)-2, -1, -1):
deltas[i] =[i+1][:, :-1].transpose(), deltas[i+1]) * self.activate_fns[i](zs[i], True)
grad_w[i] = np.hstack(([i], activations[max(0, i-1)].transpose()), deltas[i]))
# check gradient
num_gw = self.gradient_check(X, y, i)
print('numerical:', num_gw, '\nanalytic:', grad_w)
return grad_w
def gradient_check(self, x, y, i, epsilon=1e-4):
"""Numerically calculate the gradient in order to check analytical correctness"""
grad_w = [np.zeros_like(w) for w in self.weights]
for w, gw in zip(self.weights, grad_w):
for j in range(w.shape[0]):
for k in range(w.shape[1]):
w[j,k] += epsilon
out1 = cross_entropy(self.ff(x), y)
w[j,k] -= 2*epsilon
out2 = cross_entropy(self.ff(x), y)
gw[j,k] = np.float64(out1 - out2) / (2*epsilon)
w[j,k] += epsilon # return weight to original value
return grad_w
##### TESTING #####
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
data = []
for x, t in zip(X, y):
data.append((x, t))
def nn_test():
c = NN([2, 2, 2], [sigmoid, sigmoid, softmax])
c.grad_descent(data, 100, .01)
for x in X:
UPDATE: I found one small bug in the code, but it still does not converge correctly. I calculated/derived the gradients for both matrices by hand and found no errors in my implementation, so I still do not know what is wrong with it.
UPDATE #2: I created a procedural version of what I was using above with the following code. Upon testing I discovered that the NN was able to learn the correct weights for classifying each of the 4 cases in XOR separately, but when I try to train using all the training examples at once (as shown), the resultant weights almost always output something around .5 for both output nodes. Could someone please tell me why this is occurring?
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
weights = [np.random.rand(2, 3) for _ in range(2)]
for _ in range(1000):
for i in range(4):
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
# print('output:', a2, '\ncost:', cross_entropy(y[i], a2))
del1 = cross_entropy(y[i], a2, True)
dcdw1 =[a1, np.ones((1, 1))]).T)
del0 = weights[1][:, :-1]*sigmoid(z0, True)
dcdw0 =[a0, np.ones((1, 1))]).T)
weights[0] -= .03*weights[0]*dcdw0
weights[1] -= .03*weights[1]*dcdw1
i = 0
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
Softmax doesn't look right
Using cross entropy loss, the derivative for softmax is really nice (assuming you are using a 1 hot vector, where "1 hot" essentially means an array of all 0's except for a single 1, ie: [0,0,0,0,0,0,1,0,0])
For node y_n it ends up being y_n-t_n. So for a softmax with output:
[0.2, 0.2, 0.3, 0.3]
And desired output:
[0, 1, 0, 0]
The gradient at each of the softmax nodes is:
[0.2, -0.8, 0.3, 0.3]
It looks as if you are subtracting 1 from the entire array. The variable names aren't very clear, so if you could possibly rename them from L to what L represents, such as output_layer I'd be able to help more.
Also, for the other layers just to clear things up. When you say a^(L-1) as an example, do you mean "a to the power of (l-1)" or do you mean "a xor (l-1)"? Because in python ^ means xor.
I used this code and found the strange matrix dimensions (modified at line 69 in the function backprop)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
grad_w[-1] =[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())

Batch normalization in tensorflow: variables and performance

I would like to add conditional operations on the variables of a batch normalization layer. Specifically, train in float, then quantize in a fine-tuning secondary training phase. For this, I want to add a tf.cond operation on the variables (scale, shift and exp moving averages of mean and var).
I replaced the tf.layers.batch_normalization with a batchnorm layer I wrote (see below).
This function works perfectly (i.e. I get the same metrics with both functions), and I can add whatever pipeline to the variables (before the batchnorm operation). The problem is that the performance (runtime) dropped dramatically (i.e. there's a x2 factor by simply replacing the layers.batchnorm with my own function, as written below).
def batchnorm(self, x, name, epsilon=0.001, decay=0.99):
epsilon = tf.to_float(epsilon)
decay = tf.to_float(decay)
with tf.variable_scope(name):
shape = x.get_shape().as_list()
channels_num = shape[3]
# scale factor
gamma = tf.get_variable("gamma", shape=[channels_num], initializer=tf.constant_initializer(1.0), trainable=True)
# shift value
beta = tf.get_variable("beta", shape=[channels_num], initializer=tf.constant_initializer(0.0), trainable=True)
moving_mean = tf.get_variable("moving_mean", channels_num, initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", channels_num, initializer=tf.constant_initializer(1.0), trainable=False)
batch_mean, batch_var = tf.nn.moments(x, axes=[0, 1, 2]) # per channel
update_mean = moving_mean.assign((decay * moving_mean) + ((1. - decay) * batch_mean))
update_var = moving_var.assign((decay * moving_var) + ((1. - decay) * batch_var))
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_var)
bn_mean = tf.cond(self.is_training, lambda: tf.identity(batch_mean), lambda: tf.identity(moving_mean))
bn_var = tf.cond(self.is_training, lambda: tf.identity(batch_var), lambda: tf.identity(moving_var))
with tf.variable_scope(name + "_batchnorm_op"):
inv = tf.math.rsqrt(bn_var + epsilon)
inv *= gamma
output = ((x*inv) - (bn_mean*inv)) + beta
return output
I would appreciate help in any of the following questions:
Any ideas on how to improve the performance (reduce runtime) of my solution?
Is it possible to add my own operators to the variables pipeline of layers.batchnorm before the batchnorm operation?
Any other solution to the same problem?
tf.nn.fused_batch_norm is optimized and did the trick.
I had to create two subgraphs, one per mode, since fused_batch_norm's interface does not take a conditional training/test mode (is_training is bool and not a tensor, so it's graph is not conditional). I added the condition after (see below). However, even with the two subgraphs, this has about the same runtime of tf.layers.batch_normalization.
Here's the final solution (I'd still appreciate any comment or advice for improvements):
def batchnorm(self, x, name, epsilon=0.001, decay=0.99):
with tf.variable_scope(name):
shape = x.get_shape().as_list()
channels_num = shape[3]
# scale factor
gamma = tf.get_variable("gamma", shape=[channels_num], initializer=tf.constant_initializer(1.0), trainable=True)
# shift value
beta = tf.get_variable("beta", shape=[channels_num], initializer=tf.constant_initializer(0.0), trainable=True)
moving_mean = tf.get_variable("moving_mean", channels_num, initializer=tf.constant_initializer(0.0), trainable=False)
moving_var = tf.get_variable("moving_var", channels_num, initializer=tf.constant_initializer(1.0), trainable=False)
(output_train, batch_mean, batch_var) = tf.nn.fused_batch_norm(x,
beta, # pylint: disable=invalid-name
(output_test, _, _) = tf.nn.fused_batch_norm(x,
beta, # pylint: disable=invalid-name
output = tf.cond(self.is_training, lambda: tf.identity(output_train), lambda: tf.identity(output_test))
update_mean = moving_mean.assign((decay * moving_mean) + ((1. - decay) * batch_mean))
update_var = moving_var.assign((decay * moving_var) + ((1. - decay) * batch_var))
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_mean)
tf.add_to_collection(tf.GraphKeys.UPDATE_OPS, update_var)
return output

What's wrong with my backpropagation?

I'm trying to code a neural network from scratch in Python. To check whether everything works I wanted to overfit the network, but the loss seems to explode at first and then comes back to the initial value and stops there (it doesn't converge). I've checked my code and couldn't find the reason. I assume my understanding or implementation of backpropagation is incorrect, but there might be some other reason. Can anyone help me out or at least point me in the right direction?
# Initialize weights and biases given dimensions (For this example the dimensions are set to [12288, 64, 1])
def initialize_parameters(dims):
    """Build the parameter dictionary for a fully connected network.

    dims lists the layer sizes, input layer first. For every layer i >= 1 the
    result holds "W<i>" (small random normal values, shape
    (dims[i], dims[i-1])) and "b<i>" (zeros, shape (dims[i], 1)).
    """
    parameters = {}
    # Walk over consecutive (previous size, current size) pairs
    for index, (fan_in, fan_out) in enumerate(zip(dims[:-1], dims[1:]), start=1):
        suffix = str(index)
        parameters["W" + suffix] = np.random.randn(fan_out, fan_in) * 0.01
        parameters["b" + suffix] = np.zeros([fan_out, 1])
    return parameters
# Activation Functions
def relu(x, deriv=False):
    """ReLU of x, or with deriv=True its slope (1.0 where x > 0, else 0.0)."""
    if not deriv:
        return np.maximum(0, x)
    is_positive = x > 0
    return 1. * is_positive
def sigmoid(x, deriv=False):
    """Logistic sigmoid of x.

    With deriv=True, returns x * (1 - x); this is the sigmoid's slope only if
    x is already a sigmoid output.
    """
    if not deriv:
        return 1/(1 + np.exp(-x))
    complement = 1-x
    return x * complement
# Forward and backward pass for 2 layer neural network. (1st relu, 2nd sigmoid)
def forward_backward(X, Y, parameters):
    """One forward and backward pass of a 2-layer network (ReLU, then sigmoid).

    X          -- inputs, one example per column.
    Y          -- binary labels; Y.shape[1] is the number of examples.
    parameters -- dict with "W1", "b1", "W2", "b2".

    Returns (AL, grads, cost): the sigmoid outputs, a dict with "dW1", "db1",
    "dW2", "db2" (each averaged over the examples), and the mean binary
    cross-entropy cost.
    """
    # Array for storing gradients
    grads = {}
    # Get the length of examples
    m = Y.shape[1]

    # First layer
    Z1 = np.dot(parameters["W1"], X) + parameters["b1"]
    A1 = relu(Z1)
    # Second layer
    Z2 = np.dot(parameters["W2"], A1) + parameters["b2"]
    AL = sigmoid(Z2)

    # Compute cost
    # Clip only for the logarithm, so a saturated sigmoid (exactly 0 or 1)
    # produces a large finite cost instead of inf/NaN.
    AL_safe = np.clip(AL, 1e-12, 1 - 1e-12)
    cost = (-1 / m) * np.sum(np.multiply(Y, np.log(AL_safe)) + np.multiply(1 - Y, np.log(1 - AL_safe)))

    # Backpropagation
    # Second Layer
    # dAL * sigmoid'(Z2) = (-(Y/AL) + (1-Y)/(1-AL)) * AL * (1-AL) simplifies to
    # AL - Y; the simplified form avoids dividing by AL or 1 - AL.
    dZ2 = AL - Y
    grads["dW2"] = np.dot(dZ2, A1.T) / m
    grads["db2"] = np.sum(dZ2, axis=1, keepdims=True) / m
    # First layer
    dA1 = np.dot(parameters["W2"].T, dZ2)
    dZ1 = dA1 * relu(Z1, deriv=True)
    # Averaged over the m examples like every other gradient; without the
    # division dW1 is m times too large relative to the others.
    grads["dW1"] = np.dot(dZ1, X.T) / m
    grads["db1"] = np.sum(dZ1, axis=1, keepdims=True) / m
    return AL, grads, cost
# Hyperparameters
dims = [12288, 64, 1]
epoches = 2000
learning_rate = 0.1
# Initialize parameters
parameters = initialize_parameters(dims)
log_list = []
# Train the network
for i in range(epoches):
# Get X and Y
x = np.array(train[0:10],ndmin=2).T
y = np.array(labels[0:10], ndmin=2).T
# Perform forward and backward pass
AL, grads, cost = forward_backward(x, y, parameters)
# Compute cost and append to the log_list
# Update parameters with computed gradients
parameters = update_parameters(grads, parameters, learning_rate)
plt.title("Loss of the network")
I am struggling to find the place where you calculate the error gradients and the input training data sample would also help...
I don't know if this will help you, but I'll share my solution for Python neural network to learn XOR problem.
import numpy as np
def sigmoid_function(x, derivative=False):
    """
    Sigmoid function

    Squashes x into the interval (0, 1); the curve is steepest in the centre
    and flattens towards both extremes, which pushes a neuron towards a
    decisive output instead of staying in the middle.

    :param x: Float
    :param derivative: Boolean; when True, x is taken to be a sigmoid output
        and x * (1 - x) is returned
    :return: Float
    """
    if not derivative:
        return 1 / (1 + np.exp(-x))
    return x * (1 - x)  # Derivative using the chain rule.
# create dataset for XOR problem
input_data = np.array([[0.0, 0.0], [0.0, 1.0], [1.0, 0.0], [1.0, 1.0]])
ideal_output = np.array([[0.0], [1.0], [1.0], [0.0]])

# initialize variables
learning_rate = 0.1
epoch = 50000  # number of iterations: one forward plus one backward pass each

# get the second element from the numpy array shape field to detect the count of features for input layer
input_layer_neurons = input_data.shape[1]
hidden_layer_neurons = 3  # number of hidden layer neurons
output_layer_neurons = 1  # number of output layer neurons

# init weight & bias
# The biases need an explicit `size=`: np.random.uniform(1, n) would be read
# as low=1, high=n and return a single scalar instead of one bias per neuron.
weights_hidden = np.random.uniform(size=(input_layer_neurons, hidden_layer_neurons))
bias_hidden = np.random.uniform(size=(1, hidden_layer_neurons))
weights_output = np.random.uniform(size=(hidden_layer_neurons, output_layer_neurons))
bias_output = np.random.uniform(size=(1, output_layer_neurons))

for i in range(epoch):
    # forward propagation
    hidden_layer_input_temp = np.dot(input_data, weights_hidden)  # matrix dot product to adjust for weights in the layer
    hidden_layer_input = hidden_layer_input_temp + bias_hidden  # adjust for bias
    hidden_layer_activations = sigmoid_function(hidden_layer_input)  # use the activation function
    output_layer_input_temp = np.dot(hidden_layer_activations, weights_output)
    output_layer_input = output_layer_input_temp + bias_output
    output = sigmoid_function(output_layer_input)  # final output

    # backpropagation (where adjusting of the weights happens)
    # `error` is target minus output, so the updates below are *added*.
    error = ideal_output - output  # error gradient
    if (i % 1000 == 0):
        print("Error: {}".format(np.mean(abs(error))))

    # use derivatives to compute slope of output and hidden layers
    slope_output_layer = sigmoid_function(output, derivative=True)
    slope_hidden_layer = sigmoid_function(hidden_layer_activations, derivative=True)

    # calculate deltas
    delta_output = error * slope_output_layer
    error_hidden_layer = np.dot(delta_output, weights_output.T)  # calculates the error at hidden layer
    delta_hidden = error_hidden_layer * slope_hidden_layer

    # change the weights
    weights_output += np.dot(hidden_layer_activations.T, delta_output) * learning_rate
    bias_output += np.sum(delta_output, axis=0, keepdims=True) * learning_rate
    weights_hidden += np.dot(input_data.T, delta_hidden) * learning_rate
    bias_hidden += np.sum(delta_hidden, axis=0, keepdims=True) * learning_rate

