What I am supposed to do: I have a black and white image (100x100 px):
I am supposed to train a backpropagation neural network with this image. The inputs are the x, y coordinates of the image (from 0 to 99) and the output is either 1 (white) or 0 (black).
Once the network has learned, I would like it to reproduce the image based on its weights and get the closest possible image to the original.
Here is my backprop implementation:
import os
import math
import Image
import random
from random import sample

#------------------------------ class definitions

class Weight:
    def __init__(self, fromNeuron, toNeuron):
        self.value = random.uniform(-0.5, 0.5)
        self.fromNeuron = fromNeuron
        self.toNeuron = toNeuron
        fromNeuron.outputWeights.append(self)
        toNeuron.inputWeights.append(self)
        self.delta = 0.0  # delta value; accumulates over a training cycle and is then used to adjust the weight value

    def calculateDelta(self, network):
        self.delta += self.fromNeuron.value * self.toNeuron.error

class Neuron:
    def __init__(self):
        self.value = 0.0       # the output
        self.idealValue = 0.0  # the ideal output
        self.error = 0.0       # error between output and ideal output
        self.inputWeights = []
        self.outputWeights = []

    def activate(self, network):
        x = 0.0
        for weight in self.inputWeights:
            x += weight.value * weight.fromNeuron.value
        # sigmoid function (input clamped to avoid overflow in math.exp)
        if x < -320:
            self.value = 0
        elif x > 320:
            self.value = 1
        else:
            self.value = 1 / (1 + math.exp(-x))

class Layer:
    def __init__(self, neurons):
        self.neurons = neurons

    def activate(self, network):
        for neuron in self.neurons:
            neuron.activate(network)

class Network:
    def __init__(self, layers, learningRate):
        self.layers = layers
        self.learningRate = learningRate  # the rate at which the network learns
        self.weights = []
        for hiddenNeuron in self.layers[1].neurons:
            for inputNeuron in self.layers[0].neurons:
                self.weights.append(Weight(inputNeuron, hiddenNeuron))
            for outputNeuron in self.layers[2].neurons:
                self.weights.append(Weight(hiddenNeuron, outputNeuron))

    def setInputs(self, inputs):
        self.layers[0].neurons[0].value = float(inputs[0])
        self.layers[0].neurons[1].value = float(inputs[1])

    def setExpectedOutputs(self, expectedOutputs):
        self.layers[2].neurons[0].idealValue = expectedOutputs[0]

    def calculateOutputs(self, expectedOutputs):
        self.setExpectedOutputs(expectedOutputs)
        self.layers[1].activate(self)  # activate hidden layer
        self.layers[2].activate(self)  # activate output layer

    def calculateOutputErrors(self):
        for neuron in self.layers[2].neurons:
            neuron.error = (neuron.idealValue - neuron.value) * neuron.value * (1 - neuron.value)

    def calculateHiddenErrors(self):
        for neuron in self.layers[1].neurons:
            error = 0.0
            for weight in neuron.outputWeights:
                error += weight.toNeuron.error * weight.value
            neuron.error = error * neuron.value * (1 - neuron.value)

    def calculateDeltas(self):
        for weight in self.weights:
            weight.calculateDelta(self)

    def train(self, inputs, expectedOutputs):
        self.setInputs(inputs)
        self.calculateOutputs(expectedOutputs)
        self.calculateOutputErrors()
        self.calculateHiddenErrors()
        self.calculateDeltas()

    def learn(self):
        for weight in self.weights:
            weight.value += self.learningRate * weight.delta

    def calculateSingleOutput(self, inputs):
        self.setInputs(inputs)
        self.layers[1].activate(self)
        self.layers[2].activate(self)
        #return round(self.layers[2].neurons[0].value, 0)
        return self.layers[2].neurons[0].value

#------------------------------ initialize objects etc

inputLayer = Layer([Neuron() for n in range(2)])
hiddenLayer = Layer([Neuron() for n in range(10)])
outputLayer = Layer([Neuron() for n in range(1)])
learningRate = 0.4
network = Network([inputLayer, hiddenLayer, outputLayer], learningRate)

# let's get the training set
os.chdir("D:/stuff")
image = Image.open("backprop-input.gif")
pixels = image.load()
bbox = image.getbbox()
width = 5   # bbox[2] is the real image width; using 5 for testing
height = 5  # bbox[3] is the real image height; using 5 for testing
trainingInputs = []
trainingOutputs = []
b = w = 0
for x in range(0, width):
    for y in range(0, height):
        if (0, 0, 0, 255) == pixels[x, y]:
            color = 0
            b += 1
        elif (255, 255, 255, 255) == pixels[x, y]:
            color = 1
            w += 1
        trainingInputs.append([float(x), float(y)])
        trainingOutputs.append([float(color)])
print "\nOriginal image ... Black:" + str(b) + " White:" + str(w) + "\n"

#------------------------------ let's train

for i in range(500):
    for j in range(len(trainingOutputs)):
        network.train(trainingInputs[j], trainingOutputs[j])
    network.learn()
    for w in network.weights:
        w.delta = 0.0

#------------------------------ let's check

b = w = 0
for x in range(0, width):
    for y in range(0, height):
        out = network.calculateSingleOutput([float(x), float(y)])
        if 0.0 == round(out):
            color = (0, 0, 0, 255)
            b += 1
        elif 1.0 == round(out):
            color = (255, 255, 255, 255)
            w += 1
        pixels[x, y] = color
        #print out
print "\nAfter learning the network thinks ... Black:" + str(b) + " White:" + str(w) + "\n"
Obviously, there is some issue with my implementation. The above code returns:
Original image ... Black:21 White:4
After learning the network thinks ... Black:25 White:0
It does the same thing if I try a larger training set (for testing purposes I'm using just 25 pixels from the image above). It returns that all pixels should be black after learning.
Now, if I use a manual training set like this instead:
trainingInputs = [
    [0.0, 0.0],
    [1.0, 0.0],
    [2.0, 0.0],
    [0.0, 1.0],
    [1.0, 1.0],
    [2.0, 1.0],
    [0.0, 2.0],
    [1.0, 2.0],
    [2.0, 2.0]
]
trainingOutputs = [
    [0.0],
    [1.0],
    [1.0],
    [0.0],
    [1.0],
    [0.0],
    [0.0],
    [0.0],
    [1.0]
]

#------------------------------ let's train

for i in range(500):
    for j in range(len(trainingOutputs)):
        network.train(trainingInputs[j], trainingOutputs[j])
    network.learn()
    for w in network.weights:
        w.delta = 0.0

#------------------------------ let's check

for inputs in trainingInputs:
    print network.calculateSingleOutput(inputs)
The output is for example:
0.0330125791296 # this should be 0, OK
0.953539182136 # this should be 1, OK
0.971854575477 # this should be 1, OK
0.00046146137467 # this should be 0, OK
0.896699762781 # this should be 1, OK
0.112909223162 # this should be 0, OK
0.00034058462280 # this should be 0, OK
0.0929886299643 # this should be 0, OK
0.940489647869 # this should be 1, OK
In other words, the network guessed all pixels right (both black and white). Why does it say all pixels should be black when I use actual pixels from the image instead of a hard-coded training set like the one above?
I tried changing the number of neurons in the hidden layer (up to 100 neurons) with no success.
This is homework.
This is also a continuation of my previous question about backprop.
It's been a while, but I did get my degree in this stuff, so hopefully some of it has stuck.
From what I can tell, you're too deeply overloading your middle layer neurons with the input set. That is, your input set consists of 10,000 discrete input values (100 pix x 100 pix); you're attempting to encode those 10,000 values into 10 neurons. This level of encoding is hard (I suspect it's possible, but certainly hard); at the least, you'd need a LOT of training (more than 500 runs) to get it to reproduce reasonably. Even with 100 neurons for the middle layer, you're looking at a relatively dense compression level going on (100 pixels to 1 neuron).
As to what to do about these problems; well, that's tricky. You can increase your number of middle neurons dramatically, and you'll get a reasonable effect, but of course it'll take a long time to train. However, I think there might be a different solution: if possible, you might consider using polar coordinates instead of Cartesian coordinates for the input. Quick eyeballing of the input pattern indicates a high level of symmetry, and effectively you'd be looking at a linear pattern with a repeated, predictable deformation along the angular coordinate, which seems like it would encode nicely in a small number of middle-layer neurons.
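For illustration, here's a minimal sketch of that preprocessing idea. To be clear, the centre point (50, 50) and the use of math.hypot/math.atan2 are my assumptions, not part of your code:

import math

def to_polar(x, y, cx=50.0, cy=50.0):
    # Shift the origin to the assumed image centre, then convert to (radius, angle).
    dx, dy = x - cx, y - cy
    r = math.hypot(dx, dy)      # radius: 0 to ~70 for a 100x100 image
    theta = math.atan2(dy, dx)  # angle in radians: -pi to pi
    return r, theta

# The training inputs would then be built from (r, theta) instead of (x, y):
# r, theta = to_polar(x, y)
# trainingInputs.append([r, theta])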
This stuff is tricky; going for a general solution for pattern encoding (as your original solution does) is very complex, and usually (even with large numbers of middle-layer neurons) requires a lot of training passes. On the other hand, some advance heuristic task breakdown and a little bit of problem redefinition (i.e. converting from Cartesian to polar coordinates ahead of time) can give good solutions for well-defined problem sets. Therein, of course, is the perpetual rub: general solutions are hard to come by, but slightly more specialized solutions can be quite nice indeed.
Interesting stuff, in any event!
Related
I have been trying to develop the YOLO cost function, which I have shown below. This is the first time I have tried to develop my own cost function in TensorFlow, and I am unsure if I am approaching it correctly or not. For one, my model uses a number of intermediate steps; I'm not sure if this complicates the computational graph in some meaningfully destructive way. I am also using an absolute-value step and am unsure whether it would have some negative effect on my backprop. Any assistance would be helpful in regard to whether I am approaching this problem correctly.
I can answer any questions about my implementation.
Note: Z13 is the prediction, y are the true values. There are 49 cells in my model (7x7), with each cell being represented by a 7x1 vector: [prob of anything in cell, x midpoint, y midpoint, box width, box height, prob dog, prob cat]. Referenced paper: https://arxiv.org/pdf/1506.02640.pdf, which explains the cost function in depth.
I believe that there is either an issue with my forward prop or my cost function as my model is not learning meaningful representations.
import tensorflow as tf

def cost_function(Z13, y, coord=5, noobj=0.5):
    """
    Z13: shape (None,7,7,7)
    y: shape (None,7,7,7)
    """
    # Masks are used because the classification score for box coords only applies to the cell with the actual bounding box
    c_mask_true = y[:,:,:,0:1] > 0   # Mask which determines which cell has a bounding box
    c_mask_false = y[:,:,:,0:1] < 1  # Mask for cells w/o bounding boxes

    # Confidence scores
    ci_guess_t = tf.boolean_mask(Z13[:,:,:,0:1], c_mask_true)
    ci_guess_f = tf.boolean_mask(Z13[:,:,:,0:1], c_mask_false)
    ci_act_t = tf.boolean_mask(y[:,:,:,0:1], c_mask_true)
    ci_act_f = tf.boolean_mask(y[:,:,:,0:1], c_mask_false)

    # Bounding box coordinates for the ground-truth box prediction
    xi_guess = tf.boolean_mask(Z13[:,:,:,1:2], c_mask_true)  # Midpoint x position
    xi_act = tf.boolean_mask(y[:,:,:,1:2], c_mask_true)
    yi_guess = tf.boolean_mask(Z13[:,:,:,2:3], c_mask_true)  # Midpoint y position
    yi_act = tf.boolean_mask(y[:,:,:,2:3], c_mask_true)

    # Width:
    wi_guess = tf.boolean_mask(Z13[:,:,:,3:4], c_mask_true)  # Box width
    wi_guess = tf.minimum(tf.sqrt(tf.abs(wi_guess)), wi_guess)  # prevent sqrt(neg) and increase cost for neg prediction
    wi_act = tf.sqrt(tf.boolean_mask(y[:,:,:,3:4], c_mask_true))

    # Height:
    hi_guess = tf.boolean_mask(Z13[:,:,:,4:5], c_mask_true)  # Box height
    hi_guess = tf.minimum(tf.sqrt(tf.abs(hi_guess)), hi_guess)  # prevent sqrt(neg) and increase cost for neg prediction
    hi_act = tf.sqrt(tf.boolean_mask(y[:,:,:,4:5], c_mask_true))

    # Predicted classes:
    class_g_dog = tf.boolean_mask(Z13[:,:,:,5:6], c_mask_true)
    class_t_dog = tf.boolean_mask(y[:,:,:,5:6], c_mask_true)
    class_g_cat = tf.boolean_mask(Z13[:,:,:,6:7], c_mask_true)
    class_t_cat = tf.boolean_mask(y[:,:,:,6:7], c_mask_true)

    # Parts correspond to the cost function equations in the referenced paper
    part1 = coord * tf.reduce_sum(tf.square(xi_act - xi_guess) + tf.square(yi_act - yi_guess))
    part2 = coord * tf.reduce_sum(tf.square(wi_act - wi_guess) + tf.square(hi_act - hi_guess))
    part3 = tf.reduce_sum(tf.square(ci_act_t - ci_guess_t))
    part4 = noobj * tf.reduce_sum(tf.square(ci_act_f - ci_guess_f))
    part5 = tf.reduce_sum(tf.square(class_t_dog - class_g_dog) + tf.square(class_t_cat - class_g_cat))

    total_cost = part1 + part2 + part3 + part4 + part5
    return total_cost
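As a sanity check, here is a minimal smoke test of the function with random stand-in tensors (a sketch, assuming TF 2.x eager execution; under TF 1.x you would evaluate the result in a session; the shapes and values are stand-ins, not real data):

import tensorflow as tf

# Hypothetical stand-ins: a batch of 8 random predictions and binary labels.
Z13 = tf.random.uniform((8, 7, 7, 7))
y = tf.cast(tf.random.uniform((8, 7, 7, 7)) > 0.5, tf.float32)
print(cost_function(Z13, y))  # should print a single scalar cost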
I have created the following Python program which, as far as I understand CTC, should be a valid CTC-based model, along with the code that generates its training data. The best documentation I can find is the CNTK_208_Speech_CTC tutorial, which is what I've based this on. The program is as simple as I could make it; it relies only on numpy and CNTK, and it generates the data itself.
When I run this, I get the following error:
Validating --> ForwardBackward2850 = ForwardBackward (LabelsToGraph2847, StableSigmoid2703) : [5 x labelAxis1], [5 x inputAxis1] -> []
RuntimeError: The Matrix dimension in the ForwardBackwardNode operation does not match.
This seems to be the same issue from this ticket: https://github.com/Microsoft/CNTK/issues/2156
Here is the Python program:
# cntk_ctc_hello_world.py
#
# This is a "hello world" example of using CTC (Connectionist Temporal Classification) with CNTK.
#
# The input is a sequence of vectors of size 17. We use 17 because it's easy to spot that number in
# error messages. The output is a string of codes, each code being one of 4 possible characters from
# our alphabet that we'll refer to here as "ABCD", although they're actually just represented
# by the numbers 0..3, which is typical for classification systems. To make the setup of training data
# trivial, we assign the first four elements of our 17-dimension input vector to the four characters
# of our alphabet, so that the matching is:
# 10000000000000000 A
# 01000000000000000 B
# 00100000000000000 C
# 00010000000000000 D
# In our input sequences, we repeat each code three to five times, followed by three to five codes
# containing random noise. Whether it's repeated 3,4, or 5 times, is random for each code and each
# spacer. When we emit one of our codes, we fill the first 4 values with the code, and the remaining
# 13 values with random noise.
# For example:
# Input: AAA-----CCCC---DDDDD
# Output: ACD
import cntk as C
import numpy as np
import random
import sys
InputDim = 17
NumClasses = 4 # A,B,C,D
MinibatchSize = 100
MinibatchPerEpoch = 50
NumEpochs = 10
MaxOutputSeqLen = 10 # ABCDABCDAB
inputAxis = C.Axis.new_unique_dynamic_axis('inputAxis')
labelAxis = C.Axis.new_unique_dynamic_axis('labelAxis')
inputVar = C.sequence.input_variable((InputDim), sequence_axis=inputAxis, name="input")
labelVar = C.sequence.input_variable((NumClasses+1), sequence_axis=labelAxis, name="labels")
# Construct an LSTM-based model that will perform the classification
with C.default_options(activation=C.sigmoid):
    classifier = C.layers.Sequential([
        C.layers.For(range(3), lambda: C.layers.Recurrence(C.layers.LSTM(128))),
        C.layers.Dense(NumClasses + 1)
    ])(inputVar)

criteria = C.forward_backward(C.labels_to_graph(labelVar), classifier, blankTokenId=NumClasses, delayConstraint=3)
err = C.edit_distance_error(classifier, labelVar, squashInputs=True, tokensToIgnore=[NumClasses])
lr = C.learning_rate_schedule([(3, .01), (1, .001)], C.UnitType.sample)
mm = C.momentum_schedule([(1000, 0.9), (0, 0.99)], MinibatchSize)
learner = C.momentum_sgd(classifier.parameters, lr, mm)
trainer = C.Trainer(classifier, (criteria, err), learner)

# Return a numpy array of 17 elements for this code
def make_code(code):
    a = np.zeros(NumClasses)                   # 0,0,0,0
    v = np.random.rand(InputDim - NumClasses)  # 13x random
    a = np.concatenate((a, v))
    a[code] = 1
    return a

def make_noise_code():
    return np.random.rand(InputDim)

def make_onehot(code):
    v = np.zeros(NumClasses + 1)
    v[code] = 1
    return v

def gen_batch():
    x_batch = []
    y_batch = []
    for mb in range(MinibatchSize):
        yLen = random.randint(1, MaxOutputSeqLen)
        x = []
        y = []
        for i in range(yLen):
            code = random.randint(0, 3)
            y.append(make_onehot(code))
            xLen = random.randint(3, 5)       # Input is 3 to 5 repetitions of the code
            for j in range(xLen):
                x.append(make_code(code))
            spacerLen = random.randint(3, 5)  # Spacer is 3 to 5 repetitions of noise
            for j in range(spacerLen):
                x.append(make_noise_code())
        x_batch.append(np.array(x, dtype='float32'))
        y_batch.append(np.array(y, dtype='float32'))
    return x_batch, y_batch

#######################################################################################
# Dump first X/Y training pair from minibatch
#x, y = gen_batch()
#print("\nx sequence of first sample of minibatch:\n", x[0])
#print("\ny sequence of first sample of minibatch:\n", y[0])
#######################################################################################

progress_printer = C.logging.progress_print.ProgressPrinter(tag='Training', num_epochs=NumEpochs)
for epoch in range(NumEpochs):
    for mb in range(MinibatchPerEpoch):
        x_batch, y_batch = gen_batch()
        trainer.train_minibatch({inputVar: x_batch, labelVar: y_batch})
    progress_printer.epoch_summary(with_metric=True)
For those who are facing this error, there are two points to take note of:
(1) The label sequence tensor given to labels_to_graph must have the same sequence length as the data coming out of the network output at runtime.
(2) If during model building you change the dynamic sequence axis of the input sequence tensor (e.g. by striding over the sequence axis), then you must call reconcile_dynamic_axes on your label sequence tensor with the network output as the second argument to the function. This tells CNTK that the labels have the same dynamic axis as the network.
Adhering to these two points will allow forward_backward to run.
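Applied to the code in the question, point (2) might look like the following sketch (only needed if the dynamic axes actually diverge; the variable names match the program above):

# Reconcile the label graph's dynamic axis with the network output
# before handing both to forward_backward.
graph = C.labels_to_graph(labelVar)
graph = C.reconcile_dynamic_axes(graph, classifier)
criteria = C.forward_backward(graph, classifier, blankTokenId=NumClasses, delayConstraint=3)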
I'm new to the world of neural networks, which is very interesting. I wrote the basic backpropagation algorithm for a multi-layer NN to solve small problems.
I use the sigmoid activation function (like most of you, I think): x -> 1/(1+exp(-x)).
I tried my program on several problems:
The first one is the XOR problem. I took a 3-layer network of size [2,2,1] with one bias neuron in each of the first two layers (so the actual size is more like [3,3,1]).
I tried it with 1000 sets of data (i.e. a pair (0/1, 0/1) and its XOR as output), and the algorithm seemed to converge at an error of 0.5 :( I found it weird, so I raised the number to 10000, and as that didn't change anything, to 100000 (in despair :p), and it WORKS! The error fell to less than 0.02 on average. Does anybody have an idea why it needs so much data to work?
The second one is summing two numbers (like 4+8 = ?). I arbitrarily chose a [2, 5, 5, 1] network with one bias neuron in each of the first three layers (so the actual size is more like [3, 6, 6, 1]). I used a training data set of 100000 pairs of numbers below 100 and their sums. This time, the error does not converge at all; the output of the network always returns the number 1. Have you seen such situations before? Is it a bug in my code? (Code that I checked many times, but perhaps.)
import random
import math

class Network:
    def initdata(self):
        # weights initialization
        self.weights.append([])
        self.threshold.append([])
        for l in range(1, len(self.layers)):
            n = self.layers[l]
            thresholdl = []
            weightsl = []
            for i in range(n):
                thresholdl.append(-random.random())
                weightsli = []
                for j in range(self.layers[l-1]):
                    weightsli.append(random.random()*2-1)
                # adding bias neurons
                weightsli.append(thresholdl[-1])
                weightsl.append(weightsli)
            self.weights.append(weightsl)
            self.threshold.append(thresholdl)

    def __init__(self, layers):
        self.layers = layers
        self.weights = []
        self.threshold = []
        self.initdata()

    def activation_function(self, x):
        return 1/(1+math.exp(-x))

    def outputlayer(self, input, l):
        if l == 0:
            return [input]
        output = []
        prevoutput = self.outputlayer(input, l-1)
        for i in range(self.layers[l]):
            f = 0
            for k in range(len(prevoutput[-1])):
                f += self.weights[l][i][k]*prevoutput[-1][k]
            f += self.weights[l][i][-1]  # bias weight!
            output.append(self.activation_function(f))
        return prevoutput+[output]

    def layersoutput(self, input):
        return self.outputlayer(input, len(self.layers)-1)

    def finaloutput(self, input):
        return self.layersoutput(input)[-1]

    def train(self, data, nu):
        for (input, finaloutput) in data:
            output = self.layersoutput(input)
            err = self.errorvector(finaloutput, output[-1])
            self.changeweights(err, output, nu)

    def changeweights(self, err, output, nu):
        deltas = []
        for i in range(len(self.layers)):
            deltas.append([])
        tempweights = self.weights.copy()
        def changeweightslayer(layer):
            if layer != len(self.layers)-1:
                changeweightslayer(layer+1)
            for i in range(self.layers[layer]):
                delta = 0
                if layer != len(self.layers)-1:
                    delta = output[layer][i]*(1-output[layer][i])*sum([deltas[layer+1][l]*self.weights[layer+1][l][i] for l in range(self.layers[layer+1])])
                else:
                    delta = output[layer][i]*(1-output[layer][i])*err[i]
                deltas[layer].append(delta)
                for k in range(len(self.weights[layer][i])-1):
                    tempweights[layer][i][k] += nu*output[layer-1][k]*delta
                tempweights[layer][i][-1] += nu*delta
        changeweightslayer(1)
        self.weights = tempweights

    def quadraticerror(self, a, b):
        return sum([(a[i]-b[i])**2 for i in range(len(a))])

    def errorvector(self, a, b):
        return [a[i]-b[i] for i in range(len(a))]

network = Network([2, 5, 5, 1])
print(network.weights)
data = []
for i in range(1000000):
    bit1 = random.randrange(100)
    bit2 = random.randrange(100)
    data.append(([float(bit1), float(bit2)], [float(bit1+bit2)]))
network.train(data, 0.1)
print(network.weights)
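For reference, the XOR experiment described above can be reproduced with this class along the following lines (a sketch; the learning rate of 0.1 is my assumption, since the question doesn't state the value of nu it used):

# Hypothetical XOR run: a [2, 2, 1] network trained on random (a, b) -> a XOR b pairs.
xor_net = Network([2, 2, 1])
xor_data = []
for i in range(100000):
    a = random.randrange(2)
    b = random.randrange(2)
    xor_data.append(([float(a), float(b)], [float(a ^ b)]))
xor_net.train(xor_data, 0.1)
for a, b in [(0, 0), (0, 1), (1, 0), (1, 1)]:
    print(a, b, xor_net.finaloutput([float(a), float(b)]))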
pdb.gimp_paintbrush_default seems to be very slow (several seconds for 500 dots using a standard brush; lines are worse, obviously). Is this the way it is? Is there a way to speed things up when drawing straight lines with the user-selected brush?
Python-Fu console code:
from random import randint

img = gimp.image_list()[0]
drw = pdb.gimp_image_active_drawable(img)
width = pdb.gimp_image_width(img)
height = pdb.gimp_image_height(img)
point_number = 500
while point_number > 0:
    x = randint(0, width)
    y = randint(0, height)
    pdb.gimp_paintbrush_default(drw, 2, [x, y])
    point_number -= 1
I've been working on something very similar and ran into this problem also. Here's one technique that I found that made my function about 5 times faster:
1. Create a temporary image
2. Copy the layer you are working with to the temporary image
3. Do the drawing on the temporary layer
4. Copy the temporary layer on top of the original layer
I believe this speeds stuff up because GIMP doesn't have to draw the edits to the screen, but I'm not 100% sure. Here's my function:
def splotches(img, layer, size, variability, quantity):
    gimp.context_push()
    img.undo_group_start()
    width = layer.width
    height = layer.height
    temp_img = pdb.gimp_image_new(width, height, img.base_type)
    temp_img.disable_undo()
    temp_layer = pdb.gimp_layer_new_from_drawable(layer, temp_img)
    temp_img.insert_layer(temp_layer)
    brush = pdb.gimp_brush_new("Splotch")
    pdb.gimp_brush_set_hardness(brush, 1.0)
    pdb.gimp_brush_set_shape(brush, BRUSH_GENERATED_CIRCLE)
    pdb.gimp_brush_set_spacing(brush, 1000)
    pdb.gimp_context_set_brush(brush)
    for i in range(quantity):
        random_size = size + random.randrange(variability)
        x = random.randrange(width)
        y = random.randrange(height)
        pdb.gimp_context_set_brush_size(random_size)
        pdb.gimp_paintbrush(temp_layer, 0.0, 2, [x, y, x, y], PAINT_CONSTANT, 0.0)
        gimp.progress_update(float(i) / float(quantity))
    temp_layer.flush()
    temp_layer.merge_shadow(True)
    # Delete the original layer and copy the new layer in its place
    new_layer = pdb.gimp_layer_new_from_drawable(temp_layer, img)
    name = layer.name
    img.remove_layer(layer)
    pdb.gimp_item_set_name(new_layer, name)
    img.insert_layer(new_layer)
    gimp.delete(temp_img)
    img.undo_group_end()
    gimp.context_pop()
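Called from the Python-Fu console, the function might be invoked like this (a sketch; the sizes and count are arbitrary example values):

# Hypothetical invocation: 500 splotches of base size 20px, varying by up
# to 10px, painted onto the active layer of the first open image.
image = gimp.image_list()[0]
splotches(image, image.active_layer, 20, 10, 500)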
I am writing a program to do neural networks in Python and am trying to set up the backpropagation algorithm. The basic idea is that I look through 5,000 training examples, collect the errors, find out in which direction I need to move the thetas, and then move them in that direction. There are the training examples, then one hidden layer, and then an output layer. However, I am getting the gradient/derivative/error wrong here, because I am not moving the thetas correctly, as they need to be moved. I put 8 hours into this today and am not sure what I'm doing wrong. Thanks for your help!
import numpy as np

# Shapes of the data and parameters:
# x = 401x5000 matrix
# y = 10x5000 matrix  (10 possible output classes, so one column will look like [0, 0, 0, 1, 0, ..., 0] to indicate the output class was 4)
# theta_1 = 25x401
# theta_2 = 10x26

m = 5000  # number of training examples (assumed from the shapes above; m is used below)
alpha = .01
sigmoid = lambda theta, x: 1 / (1 + np.exp(-(theta*x)))

# move thetas in the right direction on each iteration
for iter in range(0, 1):
    all_delta_1, all_delta_2 = 0, 0
    # loop through each training example, 1...m
    for t in range(0, 5000):
        hidden_layer = np.matrix(np.concatenate((np.ones((1, 1)), sigmoid(theta_1, x[:, t]))))
        output_layer = sigmoid(theta_2, hidden_layer)
        delta_3 = output_layer - y[:, t]
        delta_2 = np.multiply((theta_2.T * delta_3), np.multiply(hidden_layer, (1 - hidden_layer)))
        #print type(delta_3), delta_3.shape, type(hidden_layer.T), hidden_layer.T.shape
        all_delta_2 += delta_3 * hidden_layer.T
        all_delta_1 += delta_2[1:] * x[:, t].T
    delta_gradient_2 = all_delta_2 / m
    delta_gradient_1 = all_delta_1 / m
    theta_1 = theta_1 - (alpha * delta_gradient_1)
    theta_2 = theta_2 - (alpha * delta_gradient_2)
It looks like your gradients are with respect to the unsquashed output layer.
Try changing output_layer = sigmoid(theta_2,hidden_layer) to output_layer = theta_2*hidden_layer.
Or recompute the gradients for squashed output.
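A sketch of both options, assuming a squared-error cost (the variable names match the code above):

# Option 1: make the output layer linear, so that
# delta_3 = output_layer - y[:, t] is the correct output-layer gradient.
output_layer = theta_2 * hidden_layer

# Option 2: keep the sigmoid on the output layer, but fold its
# derivative a*(1-a) into the output-layer delta.
output_layer = sigmoid(theta_2, hidden_layer)
delta_3 = np.multiply(output_layer - y[:, t],
                      np.multiply(output_layer, 1 - output_layer))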