I am currently going through the first chapter of Michael Nielsen's neural network book.
I am running into an anomaly where the exact same code produces wildly different results, and the only difference between the two versions is where they were copied from. One set of code is the one I wrote as I went through the chapter, and the other is Nielsen's source code copied from GitHub. Naturally, his works and mine doesn't. Nonetheless, if you plug both blocks into a text-comparison tool, you can see that they're identical, save for a few comments.
The code builds a basic neural network that identifies handwritten digits. It runs training epochs and, after each epoch, prints how many digits are correctly identified.
Given the current parameters, each epoch should produce roughly 8,000 to 9,500 correct results out of 10,000, and this result is printed to the terminal. That is what you consistently observe when Nielsen's code is run.
Conversely, when the same code is run, but copied from my file, the results are consistently ~50-450 out of 10,000.
Perhaps this has something to do with VS Code? I'm really at a loss as to why code that is the same to the letter produces different results. I was hoping someone smarter than me would know why it matters where you Ctrl-C Ctrl-V from. Thanks!
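In case it helps anyone reproduce the comparison, here is a minimal sketch (assuming the two files are saved as test.py and test2.py, the module names used in my main file below) that diffs them including whitespace, since an indentation-only difference would not show up in a casual side-by-side comparison:

# Minimal sketch: diff two source files, including whitespace, so that
# indentation-only differences (which Python cares about) become visible.
import difflib

with open("test.py") as f1, open("test2.py") as f2:
    mine = f1.readlines()
    theirs = f2.readlines()

for line in difflib.unified_diff(mine, theirs, fromfile="test.py", tofile="test2.py"):
    print(line, end="")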
Here is the code I wrote:
#mine
import random
import numpy as np
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
for b, w in zip(self.biases, self.weights):
a = sigmoid(np.dot(w, a)+b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta,
test_data=None):
if test_data: n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k:k+mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print("Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print("Epoch {0} complete".format(j))
def update_mini_batch(self, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
activation = x
activations = [x]
zs = []
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
delta = self.cost_derivative(activations[-1], y) * \
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
return (output_activations-y)
### Non Network functions
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
And here is the code that he wrote:
#his
import random
import numpy as np
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
def feedforward(self, a):
for b, w in zip(self.biases, self.weights):
a = sigmoid(np.dot(w, a)+b)
return a
def SGD(self, training_data, epochs, mini_batch_size, eta,
test_data=None):
if test_data: n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k:k+mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print("Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print("Epoch {0} complete".format(j))
def update_mini_batch(self, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb+dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw+dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w-(eta/len(mini_batch))*nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b-(eta/len(mini_batch))*nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
activation = x
activations = [x]
zs = []
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation)+b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
delta = self.cost_derivative(activations[-1], y) * \
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l+1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l-1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
return (output_activations-y)
#### Miscellaneous functions
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
def sigmoid_prime(z):
return sigmoid(z)*(1-sigmoid(z))
Below is the helper file and the main file. The MNIST image data used can be downloaded here.
helper:
"""
mnist_loader
~~~~~~~~~~~~~~~~
A library used to load mnist data.
"""
#### Libraries
# Standard library
import pickle
import gzip
# Third party libraries
import numpy as np
def load_data():
"""Returns the MNIST data as a tuple containing the training data,
the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries.
The first entry contains the actual training images. This is a
numpy ndarray with 50,000 entries. Each entry is, in turn, a
numpy ndarray with 784 values, representing the 28 * 28 = 784
pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray
containing 50,000 entries. Those entries are just the digit
values (0...9) for the corresponding images contained in the first
entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except
each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's
helpful to modify the format of the ``training_data`` a little.
That's done in the wrapper function ``load_data_wrapper()``, see
below.
"""
f = gzip.open('./data/mnist.pkl.gz', 'rb')
u = pickle._Unpickler( f )
u.encoding = 'latin1'
training_data, validation_data, test_data = u.load()
# training_data, validation_data, test_data = pickle.load(f)
f.close()
return(training_data, validation_data, test_data)
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
numpy.ndarry containing the input image, and ``y`` is the
corresponding classification, i.e., the digit values (integers)
corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
training_data = list(zip(training_inputs, training_results))
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
validation_data = list(zip(validation_inputs, va_d[1]))
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
test_data = list(zip(test_inputs, te_d[1]))
return (training_data, validation_data, test_data)
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth position
and zeroes elsewhere. This converts a digit (0...9) into a
corresponding desired output from the neural network.
"""
e = np.zeros((10, 1))
e[j] = 1.0
return e
main:
import mnist_loader
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()
import test # change this to test2 to check Nielsen's code
net = test.Network([784, 30, 10]) # change here too
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)
So they weren't "eXacTLy tHe SaME" after all. The difference was in the for loop of the feedforward function: my return statement was indented inside the loop, so it returned after the first iteration, which was incorrect.
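For anyone hitting the same symptom, this is roughly what the difference looks like once indentation is restored (the pasted code above has lost its indentation, so the broken version below is a reconstruction of what my file actually contained):

# Broken version: return is indented inside the loop, so the function
# returns after propagating through only the first layer.
def feedforward(self, a):
    for b, w in zip(self.biases, self.weights):
        a = sigmoid(np.dot(w, a) + b)
        return a

# Correct version (Nielsen's): return only after all layers are applied.
def feedforward(self, a):
    for b, w in zip(self.biases, self.weights):
        a = sigmoid(np.dot(w, a) + b)
    return a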
Related
I am currently following the introductory guide to neural networks and deep learning by Michael Nielsen (link) and am stuck in chapter 1.
I set everything up, made the small changes needed to go from Python 2 to Python 3, and ran the code; however, the value printed for each epoch remains constant. I assume this means that, for whatever reason, the weights and biases don't get updated.
As stated in the guide, I created two files; one is mnist_loader, which loads the data.
Any ideas on where to fix this?
I am currently running the newest Python, 3.10.2.
link to image of cmd
What I expect is the value in each epoch rising, as given in the chapter:
> Epoch 0: 9129 / 10000
> Epoch 1: 9295 / 10000
> Epoch 2: 9348 / 10000
> ...
> Epoch 27: 9528 / 10000
> Epoch 28: 9542 / 10000
> Epoch 29: 9534 / 10000
Edit: added code
"""
mnist_loader
~~~~~~~~~~~~
A library to load the MNIST image data. For details of the data
structures that are returned, see the doc strings for ``load_data``
and ``load_data_wrapper``. In practice, ``load_data_wrapper`` is the
function usually called by our neural network code.
"""
# Libraries
# Standard library
import pickle
import gzip
# Third-party libraries
import numpy as np
def load_data():
"""Return the MNIST data as a tuple containing the training data,
the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries.
The first entry contains the actual training images. This is a
numpy ndarray with 50,000 entries. Each entry is, in turn, a
numpy ndarray with 784 values, representing the 28 * 28 = 784
pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray
containing 50,000 entries. Those entries are just the digit
values (0...9) for the corresponding images contained in the first
entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except
each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's
helpful to modify the format of the ``training_data`` a little.
That's done in the wrapper function ``load_data_wrapper()``, see
below.
"""
f = gzip.open("C:/Users/Tai/Documents/mnist/neural-networks-and-deep-learning/data/mnist.pkl.gz")
training_data, validation_data, test_data = pickle.load(f, encoding="latin1")
print(training_data)
f.close()
return (training_data, validation_data, test_data)
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
numpy.ndarry containing the input image, and ``y`` is the
corresponding classification, i.e., the digit values (integers)
corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
training_data = list(zip(training_inputs, training_results))
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
validation_data = list(zip(validation_inputs, va_d[1]))
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
test_data = list(zip(test_inputs, te_d[1]))
return (training_data, validation_data, test_data)
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth
position and zeroes elsewhere. This is used to convert a digit
(0...9) into a corresponding desired output from the neural
network."""
e = np.zeros((10, 1))
e[j] = 1.0
return e
And the second file, for the network:
import numpy as np
import random
"""Network is generating layers of nodes, first layer is the input nodes,
last layer is the output nodes
each layer have random number (gaussian distribution with mean 0 and
standard deviation 1)
applied for weights and bias, first layer has no bias
a network of net = Network([2,3,1]) has 2 input, 3 middle and 1 output
nodes"""
class Network(object):
def __init__(self, sizes):
self.num_layers = len(sizes)
self.sizes = sizes
self.biases = [np.random.randn(y, 1) for y in sizes[1:]]
self.weights = [np.random.randn(y, x)
for x, y in zip(sizes[:-1], sizes[1:])]
# returns output of the network for node a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights):
a = sigmoid(np.dot(w, a) + b)
# stochastic gradient descent
# Train the neural network using mini-batch stochastic gradient descent.
# training data: list of tuples (x, y) with input and desired output
# epochs: number of epochs to train for
# mini_batch_size: size of each mini-batch used when sampling
# eta: learning rate
# (optional) test data: evaluation of the network after each epoch of
# training, printing partial progress; slows the program down considerably
# each epoch: random shuffle of the training data, partitioning into
# mini-batches of the specified size
# for each mini_batch, a single step of gradient descent as specified in
# self.update_mini_batch(mini_batch, eta)
# update weights and biases according to a single iteration of gradient descent
def SGD(self, training_data, epochs, mini_batch_size, eta, test_data=None):
if test_data: n_test = len(test_data)
n = len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [
training_data[k:k + mini_batch_size]
for k in range(0, n, mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch, eta)
if test_data:
print(
"Epoch {0}: {1} / {2}".format(
j, self.evaluate(test_data), n_test))
else:
print(
"Epoch {0} complete".format(j))
# update the network's weights and biases by applying gradient descent
# using backpropagation to a single mini batch.
# most important line
# delta_nabla_b, delta_nabla_w = self.backprop(x, y)
# self.backprop ensures the backpropagation of (x, y)
def update_mini_batch(self, mini_batch, eta):
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(mini_batch)) * nw
for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / len(mini_batch)) * nb
for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w)`` representing the
gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar
to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * \
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# Note that the variable l in the loop below is used a little
# differently to the notation in Chapter 2 of the book. Here,
# l = 1 means the last layer of neurons, l = 2 is the
# second-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact
# that Python can use negative indices in lists.
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
return nabla_b, nabla_w
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural
network outputs the correct result. Note that the neural
network's output is assumed to be the index of whichever
neuron in the final layer has the highest activation."""
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
@staticmethod
def cost_derivative(output_activations, y):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return output_activations - y
# z is a vector, numpy automatically applies sigmoid onto each element of z
def sigmoid(z):
return 1.0 / (1.0 + np.exp(-z))
# derivative of the sigmoid function
def sigmoid_prime(z):
return sigmoid(z) * (1 - sigmoid(z))
I found the error thanks to the other comments here; I missed a line in the code.
In network:
# returns output of the network for node a
def feedforward(self, a):
for b, w in zip(self.biases, self.weights):
a = sigmoid(np.dot(w, a) + b)
return a
Thanks everyone for pointing it out!
I have been trying to code up the neural network for recognizing MNIST that was given by Michael Nielsen here: http://neuralnetworksanddeeplearning.com/chap1.html
The original was written for Python 2.7, I believe; I'm using Python 3. The network does run through the epochs and update the weights and biases, but it doesn't learn, and it gets only around 10% of the test examples right (so no better than random guessing).
I have also tried simply copying the code from the site and running it in Python 2.7, and it works as it should (getting up to 95% accuracy). The only significant differences in the nets are the data set (I'm using the one downloaded directly from MNIST two days ago) and the two locations where I switched np.dot to np.outer, just to make it easier to keep track of the array shapes (I tried sticking to (N,) instead of (N,1)). But that part seems to be fine, since the layer sizes are different and the multiplications go through. I am also using the same learning rate and layer sizes as given in the example.
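To make the shape convention concrete, here is a small sketch (independent of the network code, just illustrating the (N,) versus (N,1) assumption) showing that np.outer on 1-D vectors matches np.dot on explicit column vectors:

import numpy as np

delta = np.random.randn(10)        # (10,) vector, as in the (N,) convention
activation = np.random.randn(30)   # (30,) vector

# With 1-D arrays, np.outer builds the (10, 30) weight-gradient matrix.
grad_outer = np.outer(delta, activation)

# With explicit column vectors, the book's np.dot(delta, activation.T) does the same.
grad_dot = np.dot(delta.reshape(10, 1), activation.reshape(30, 1).transpose())

print(grad_outer.shape, grad_dot.shape)   # (10, 30) (10, 30)
print(np.allclose(grad_outer, grad_dot))  # True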
I cannot see what could be throwing the net off. If anyone has tried doing the same, or has some insight into this I would greatly appreciate it.
Thanks !
The code:
import matplotlib.pyplot as plt
import numpy as np
import idx2numpy
import random
### LOAD DATASET ###
train = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")
train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")
test = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")
test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")
def vectorize(x):
e = np.zeros(10)
e[x] = 1.0
return e
training_images = [np.reshape(i, (784))/255 for i in train]
training_labels = [vectorize(i) for i in train_labels]
training_set = list(zip(training_images,training_labels))
test_images = [np.reshape(i, (784))/255 for i in test]
test_set = list(zip(training_images,test_labels))
### NETWORK CLASS ###
class myNet():
def __init__ (self , sizes ):
self.sizes = sizes
self.N = len(sizes)
self.w = [np.random.randn(y, x) for x, y in zip( sizes [: -1] , sizes [1:]) ]
self.b = [np.random.randn(i) for i in sizes[1:]]
def sigmoid (self,z):
return 1.0/(1.0+ np.exp(-z))
def sigmoid_prime (self,z):
return self.sigmoid (z)*(1 - self.sigmoid (z))
def cost_derivative (self,output_activations , y):
return ( output_activations - y)
def feedforward (self , a):
for bb, ww in zip(self.b , self.w ):
a = self.sigmoid (np.dot(ww, a)+bb)
return a
def backprop (self , x, y):
nabla_b = [np. zeros (bb. shape ) for bb in self.b ]
nabla_w = [np. zeros (ww. shape ) for ww in self.w ]
activation = x
activations = [x] # list to store all the activations , layer by layer
zs = [] # list to store all the z vectors , layer by layer
for bb, ww in zip(self.b , self. w ):
z = np.dot(ww, activation )+bb
zs. append (z)
activation = self.sigmoid (z)
activations . append ( activation )
# backward pass
delta = self. cost_derivative ( activations [-1], y) * self.sigmoid_prime (zs [ -1])
nabla_b [-1] = delta
nabla_w [-1] = np.outer(delta , activations [ -2])
# Note that the variable l in the loop below is used a little
for l in range (2, self.N ):
z = zs[-l]
sp = self.sigmoid_prime (z)
delta = np.dot(self.w [-l+1]. transpose () , delta ) * sp
nabla_b [-l] = delta
nabla_w [-l] = np.outer(delta , activations [-l -1])
return (nabla_b , nabla_w )
def update(self,mini_batch,eta):
nabla_b = [np.zeros (bb.shape ) for bb in self.b ]
nabla_w = [np.zeros (ww.shape ) for ww in self.w ]
for x, y in mini_batch :
delta_nabla_b , delta_nabla_w = self. backprop (x, y)
nabla_b = [nb+dnb for nb , dnb in zip(nabla_b , delta_nabla_b )]
nabla_w = [nw+dnw for nw , dnw in zip(nabla_w , delta_nabla_w )]
self.w = [ww -( eta/len( mini_batch ))*nw
for ww, nw in zip(self.w , nabla_w )]
self.b = [bb -( eta/len( mini_batch ))*nb
for bb, nb in zip(self.b, nabla_b )]
return
def gradient_descent(self,training_data,epochs,mini_batch_size,eta,test_data):
i = 0
n = len( training_data )
for j in range (epochs):
random.shuffle (training_data)
mini_batches = [
training_data [k:k+ mini_batch_size ]
for k in range (0, n, mini_batch_size )]
for mini_batch in mini_batches :
self.update( mini_batch , eta)
print("Epoch {0}: {1}". format (
j, self.evaluate(test_data)))
return
def evaluate (self, test_data):
test_results = [( np.argmax (self.feedforward (x)), y)
for (x, y) in test_data ]
return sum(int(x == y) for (x, y) in test_results )
sizes =[28*28, 30, 10]
net = myNet(sizes)
net.gradient_descent(training_set,30,10,3.0,test_set)
I found the mistake... By mistake I zipped the training images together with the test labels to form the test set, which is clearly not what it should be. Now that the test set is formed properly, everything works and gets up to around 95% accuracy. Here is the full corrected code for completeness (usable in Python 3):
import matplotlib.pyplot as plt
import numpy as np
import idx2numpy
import random
def vectorize(x):
e = np.zeros(10)
e[x] = 1.0
return e
### LOAD DATASET ###
train_images = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")/255
train_labels = idx2numpy.convert_from_file("mnist/train-labels.idx1-ubyte")
train_images = [np.reshape(x,(784)).astype('float32') for x in train_images]
train_labels = [vectorize(i) for i in train_labels]
test_images = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")/255
test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")
test_images = [np.reshape(x,(784)).astype('float32') for x in test_images]
training_set = list(zip(train_images,train_labels))
test_set = list(zip(test_images,test_labels)) ## THIS IS WHERE I MESSED UP
### NETWORK CLASS ###
class myNet():
def __init__ (self , sizes ):
self.sizes = sizes
self.N = len(sizes)
self.w = [np.random.randn(y, x) for x, y in zip( sizes [: -1] , sizes [1:]) ]
self.b = [np.random.randn(i) for i in sizes[1:]]
def sigmoid (self,z):
return 1.0/(1.0+ np.exp(-z))
def sigmoid_prime (self,z):
return self.sigmoid (z)*(1 - self.sigmoid (z))
def cost_derivative (self,output_activations , y):
return ( output_activations - y)
def feedforward (self , a):
for bb, ww in zip(self.b , self.w ):
a = self.sigmoid (np.dot(ww, a)+bb)
return a
def backprop (self , x, y):
nabla_b = [np. zeros (bb. shape ) for bb in self.b ]
nabla_w = [np. zeros (ww. shape ) for ww in self.w ]
activation = x
activations = [x] # list to store all the activations , layer by layer
zs = [] # list to store all the z vectors , layer by layer
for bb, ww in zip(self.b , self. w ):
z = np.dot(ww, activation )+bb
zs. append (z)
activation = self.sigmoid (z)
activations . append ( activation )
# backward pass
delta = self. cost_derivative ( activations [-1], y) * self.sigmoid_prime (zs [ -1])
nabla_b [-1] = delta
nabla_w [-1] = np.outer(delta , activations [ -2])
# Note that the variable l in the loop below is used a little
for l in range (2, self.N ):
z = zs[-l]
sp = self.sigmoid_prime (z)
delta = np.dot(self.w [-l+1]. transpose () , delta ) * sp
nabla_b [-l] = delta
nabla_w [-l] = np.outer(delta , activations [-l -1])
return (nabla_b , nabla_w )
def update(self,mini_batch,eta):
nabla_b = [np.zeros (bb.shape ) for bb in self.b ]
nabla_w = [np.zeros (ww.shape ) for ww in self.w ]
for x, y in mini_batch :
delta_nabla_b , delta_nabla_w = self. backprop (x, y)
nabla_b = [nb+dnb for nb , dnb in zip(nabla_b , delta_nabla_b )]
nabla_w = [nw+dnw for nw , dnw in zip(nabla_w , delta_nabla_w )]
self.w = [ww -( eta/len( mini_batch ))*nw
for ww, nw in zip(self.w , nabla_w )]
self.b = [bb -( eta/len( mini_batch ))*nb
for bb, nb in zip(self.b, nabla_b )]
return
def gradient_descent(self,training_data,epochs,mini_batch_size,eta,test_data):
i = 0
n = len( training_data )
for j in range (epochs):
random.shuffle (training_data)
mini_batches = [
training_data [k:k+ mini_batch_size ]
for k in range (0, n, mini_batch_size )]
for mini_batch in mini_batches :
self.update( mini_batch , eta)
print("Epoch {0}: {1}". format (
j, self.evaluate(test_data)))
return
def evaluate (self, test_data):
test_results = [( np.argmax (self.feedforward (x)), y)
for (x, y) in test_data ]
return sum(int(x == y) for (x, y) in test_results )
sizes =[28*28, 30, 10]
net = myNet(sizes)
net.gradient_descent(training_set,30,10,3.0,test_set)
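As an aside, here is a quick sanity check that would have caught the mix-up earlier (a sketch using the same idx2numpy files as above): the raw MNIST training set has 60,000 images while the test set has 10,000 labels, and zip stops at the shorter sequence, so the bad pairing silently produces 10,000 mismatched pairs instead of an error.

import idx2numpy

train = idx2numpy.convert_from_file("mnist/train-images.idx3-ubyte")
test = idx2numpy.convert_from_file("mnist/t10k-images.idx3-ubyte")
test_labels = idx2numpy.convert_from_file("mnist/t10k-labels.idx1-ubyte")

print(len(train), len(test), len(test_labels))  # 60000 10000 10000

# zip truncates to the shorter input, so pairing train images with test
# labels quietly yields 10,000 wrong (image, label) pairs instead of failing.
bad_pairs = list(zip(train, test_labels))
good_pairs = list(zip(test, test_labels))
print(len(bad_pairs), len(good_pairs))  # 10000 10000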
I have a class for linear regression and want to check how it works with the load_boston dataset. I calculated the mean absolute percentage error (MAPE) and the result is nan.
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
warnings.filterwarnings('ignore')
class LinearRegressionSGD(BaseEstimator):
def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
'''
epsilon: difference for the rate of change of weights
max_steps: maximum number of steps in gradient descent
w0: np.array (d,) - initial weights
alpha: learning step
'''
self.epsilon = epsilon
self.max_steps = max_steps
self.w0 = w0
self.alpha = alpha
self.w = None
self.w_history = []
def fit(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: self
"""
l, d = X.shape
if self.w0 is None:
self.w0 = np.zeros(d)
self.w = self.w0
for step in range(self.max_steps):
self.w_history.append(self.w)
w_new = self.w - self.alpha * self.calc_gradient(X, y)
if (np.linalg.norm(w_new - self.w) < self.epsilon):
break
self.w = w_new
return self
def predict(self, X):
"""
X: np.array (l, d)
---
output: np.array (l)
"""
if self.w is None:
raise Exception('Not trained yet')
l, d = X.shape
y_pred = []
for i in range(l):
y_pred.append(np.dot(X[i], self.w))
return np.array(y_pred)
def calc_gradient(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: np.array (d)
"""
l, d = X.shape
gradient = []
for j in range(d):
dQ = 0
for i in range(l):
dQ += (2 / l) * X[i][j] * (np.dot(X[i], self.w) - y[i])
gradient.append(dQ)
return np.array(gradient)
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.3, random_state=10)
def MAPE(y_true, y_pred):
"""
y_true: np.array (l)
y_pred: np.array (l)
---
output: float [0, +inf)
"""
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Task 2
sgd = LinearRegressionSGD()
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
print(MAPE(y_test, y_pred_sgd))
# Task 3
a, b = X_test.shape
w_0 = np.random.uniform(-2, 2, (b))
lr = LinearRegressionSGD(w0=w_0)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(MAPE(y_test, y_pred_lr))
But when I create X and y as below, the code works properly and MAPE gives a float value:
n_features = 2
n_objects = 300
num_steps = 100
np.random.seed(1)
w_true = np.random.normal(0, 0.1, size=(n_features, ))
w_0 = np.random.uniform(-2, 2, (n_features))
X = np.random.uniform(-5, 5, (n_objects, n_features))
y = np.dot(X, w_true) + np.random.normal(0, 1, (n_objects))
What is the problem with my code, and how do I fix it to get a float value?
(Sorry for my bad English, it's not my native language.)
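No accepted answer is recorded here, but a first debugging step (a sketch of my own; the diagnose helper is hypothetical, not part of the original assignment) is to check where non-finite values first appear, since a nan from this MAPE implementation almost always means either zeros in y_true or predictions that are already nan or inf:

import numpy as np

# Hypothetical helper: locate the source of the nan before blaming the metric.
def diagnose(y_true, y_pred, w):
    print("zeros in y_true:       ", np.any(y_true == 0))
    print("non-finite predictions:", np.any(~np.isfinite(y_pred)))
    print("non-finite weights:    ", np.any(~np.isfinite(w)))

# Usage with the objects defined above:
# diagnose(y_test, y_pred_sgd, sgd.w)

If the weights and predictions turn out to be non-finite, the gradient descent has diverged on the raw Boston features, which would point at the learning rate or at missing feature scaling rather than at the MAPE function itself.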
I have been studying neural networks for some time, and I decided to work through Michael Nielsen's book "Neural Networks and Deep Learning".
The book has been very good and gets the concepts of neural networks across perfectly.
But while practicing the material taught, I encountered an error: ValueError: too many values to unpack (expected 2)
I have tried debugging, converting tuples to lists, running the program in Python 3.9 and Python 2.7, rewriting the whole program, copying it wholesale from GitHub, and running it through different IDEs, and nothing changed the error.
The network file is as follows:
import numpy as np
import mnist_loader
import random
class Network(object):
def __init__(self,sizes):
# eg. Network([2,3,1]) makes an array of arrays, in this eg. a 2 input,3 hidden, 1 output neural network
self.num_layers = len(sizes)
self.sizes = sizes
#All weights and biases are initialised randomly (Gaussian distribution), from layer 1 onward (1st hidden layer to output)
self.biases = [np.random.randn(y,1)
for y in sizes[1:]] #[1:] means
self.weights =[np.random.randn(y,x)
for x,y in zip(sizes[:-1],sizes[1:])]
def feedforward(self,a):
"""Return the output of the network if "a" is input."""
for b,w in zip(self.biases,self.weights):
a = sigmoid(np.dot(w,a)+b)
return a
def SGD(self,training_data, epochs, mini_batch_size,eta,test_data=None): #eta - learning rate,test_data=None initialises empty var
"""Train the neural network using mini-batch stochastic
gradient descent. The "training_data" is a list of tuples
"(x, y)" representing the training inputs and the desired
outputs. The other non-optional parameters are
self-explanatory. If "test_data" is provided then the
network will be evaluated against the test data after each
epoch, and partial progress printed out. This is useful for
tracking progress, but slows things down substantially."""
if test_data: n_test = len(test_data) #If test_data carries value, make n_test in length of test_data
n= len(training_data)
for j in range(epochs):
random.shuffle(training_data)
mini_batches = [training_data[k:k+mini_batch_size]
for k in range(0,n,mini_batch_size)]
for mini_batch in mini_batches:
self.update_mini_batch(mini_batch,eta)
if test_data:
print("Epoch {0}: {1} /{2}".format(
j,self.evaluate(test_data),n_test))
else:
print ("Epoch {0} complete".format(j))
def update_mini_batch(self , mini_batch,eta):
"""Update the network's weights and biases by applying
gradient descent using backpropagation to a single mini batch.
The "mini_batch" is a list of tuples "(x, y)", and "eta"
is the learning rate."""
# nabla = gradient vector; np.zeros makes arrays of zeros in the shapes of self.biases
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
for x, y in mini_batch:
delta_nabla_b, delta_nabla_w = self.backprop(x, y)
nabla_b = [nb + dnb for nb, dnb in zip(nabla_b, delta_nabla_b)]
nabla_w = [nw + dnw for nw, dnw in zip(nabla_w, delta_nabla_w)]
self.weights = [w - (eta / len(mini_batch)) * nw for w, nw in zip(self.weights, nabla_w)]
self.biases = [b - (eta / len(mini_batch)) * nb for b, nb in zip(self.biases, nabla_b)]
def backprop(self, x, y):
"""Return a tuple ``(nabla_b, nabla_w)`` representing the
gradient for the cost function C_x. ``nabla_b`` and
``nabla_w`` are layer-by-layer lists of numpy arrays, similar
to ``self.biases`` and ``self.weights``."""
nabla_b = [np.zeros(b.shape) for b in self.biases]
nabla_w = [np.zeros(w.shape) for w in self.weights]
# feedforward
activation = x
activations = [x] # list to store all the activations, layer by layer
zs = [] # list to store all the z vectors, layer by layer
for b, w in zip(self.biases, self.weights):
z = np.dot(w, activation) + b
zs.append(z)
activation = sigmoid(z)
activations.append(activation)
# backward pass
delta = self.cost_derivative(activations[-1], y) * \
sigmoid_prime(zs[-1])
nabla_b[-1] = delta
nabla_w[-1] = np.dot(delta, activations[-2].transpose())
# Note that the variable l in the loop below is used a little
# differently to the notation in Chapter 2 of the book. Here,
# l = 1 means the last layer of neurons, l = 2 is the
# second-last layer, and so on. It's a renumbering of the
# scheme in the book, used here to take advantage of the fact
# that Python can use negative indices in lists.
for l in range(2, self.num_layers):
z = zs[-l]
sp = sigmoid_prime(z)
delta = np.dot(self.weights[-l + 1].transpose(), delta) * sp
nabla_b[-l] = delta
nabla_w[-l] = np.dot(delta, activations[-l - 1].transpose())
return (nabla_b, nabla_w)
def evaluate(self, test_data):
"""Return the number of test inputs for which the neural
network outputs the correct result. Note that the neural
network's output is assumed to be the index of whichever
neuron in the final layer has the highest activation."""
test_results = [(np.argmax(self.feedforward(x)), y)
for (x, y) in test_data]
return sum(int(x == y) for (x, y) in test_results)
def cost_derivative(self, output_activations, y):
"""Return the vector of partial derivatives \partial C_x /
\partial a for the output activations."""
return (output_activations - y)
def sigmoid(z):
"""The sigmoid function."""
return 1.0/(1-np.np.exp(-z))
def sigmoid_prime(z):
"""Derivative of the sigmoid function."""
return sigmoid(z)*(1-sigmoid(z))
The dataset loader is arranged as follows:
# Standard library
import _pickle as cPickle
import gzip
# Third-party libraries
import numpy as np
def load_data():
"""Return the MNIST data as a tuple containing the training data,
the validation data, and the test data.
The ``training_data`` is returned as a tuple with two entries.
The first entry contains the actual training images. This is a
numpy ndarray with 50,000 entries. Each entry is, in turn, a
numpy ndarray with 784 values, representing the 28 * 28 = 784
pixels in a single MNIST image.
The second entry in the ``training_data`` tuple is a numpy ndarray
containing 50,000 entries. Those entries are just the digit
values (0...9) for the corresponding images contained in the first
entry of the tuple.
The ``validation_data`` and ``test_data`` are similar, except
each contains only 10,000 images.
This is a nice data format, but for use in neural networks it's
helpful to modify the format of the ``training_data`` a little.
That's done in the wrapper function ``load_data_wrapper()``, see
below.
"""
f = gzip.open("G:\Datasets/beural-networks-and-deep-learning-master/beural-networks-and-deep-learning-master\data\mnist.pkl.gz", 'rb')
training_data, validation_data, test_data = cPickle.load(f, encoding='latin1')
f.close()
return (training_data, validation_data, test_data)
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
numpy.ndarry containing the input image, and ``y`` is the
corresponding classification, i.e., the digit values (integers)
corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
training_data = zip(training_inputs, training_results)
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
validation_data = zip(validation_inputs, va_d[1])
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
test_data = zip(test_inputs, te_d[1])
return (training_data, validation_data, test_data)
def vectorized_result(j):
"""Return a 10-dimensional unit vector with a 1.0 in the jth
position and zeroes elsewhere. This is used to convert a digit
(0...9) into a corresponding desired output from the neural
network."""
e = np.zeros((10, 1))
e[j] = 1.0
return e
And the training commands are as follows:
import network
import mnist_loader
training_data , validation_data, test_data = mnist_loader.load_data()
training_data = list(training_data)
net = network.Network([784,30,10])
net.SGD(training_data, 30, 10, 0.001, test_data=test_data)
So, the code had a few bugs I had to sort out.
There was a TypeError because I was using the data without wrapping it properly.
First of all, I was using the load_data function instead of load_data_wrapper, which just gave out a total mess. Second, in load_data_wrapper I needed to change the "zip"
calls to "list", as such:
'''
def load_data_wrapper():
"""Return a tuple containing ``(training_data, validation_data,
test_data)``. Based on ``load_data``, but the format is more
convenient for use in our implementation of neural networks.
In particular, ``training_data`` is a list containing 50,000
2-tuples ``(x, y)``. ``x`` is a 784-dimensional numpy.ndarray
containing the input image. ``y`` is a 10-dimensional
numpy.ndarray representing the unit vector corresponding to the
correct digit for ``x``.
``validation_data`` and ``test_data`` are lists containing 10,000
2-tuples ``(x, y)``. In each case, ``x`` is a 784-dimensional
numpy.ndarry containing the input image, and ``y`` is the
corresponding classification, i.e., the digit values (integers)
corresponding to ``x``.
Obviously, this means we're using slightly different formats for
the training data and the validation / test data. These formats
turn out to be the most convenient for use in our neural network
code."""
tr_d, va_d, te_d = load_data()
training_inputs = [np.reshape(x, (784, 1)) for x in tr_d[0]]
training_results = [vectorized_result(y) for y in tr_d[1]]
training_data = list(zip(training_inputs, training_results))
validation_inputs = [np.reshape(x, (784, 1)) for x in va_d[0]]
validation_data = list(zip(validation_inputs, va_d[1]))
test_inputs = [np.reshape(x, (784, 1)) for x in te_d[0]]
test_data = list(zip(test_inputs, te_d[1]))
return (training_data, validation_data, test_data)
'''
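For what it's worth, the reason the list() wrapping matters is that in Python 3 zip returns a one-shot iterator with no length, so calls like len(test_data) fail and random.shuffle(training_data) has nothing left to work with once the iterator is consumed. A small illustration:

pairs = zip([1, 2, 3], ["a", "b", "c"])
# len(pairs)        # TypeError: object of type 'zip' has no len()
print(list(pairs))  # [(1, 'a'), (2, 'b'), (3, 'c')]
print(list(pairs))  # [] -- the iterator is already exhausted

pairs = list(zip([1, 2, 3], ["a", "b", "c"]))
print(len(pairs))   # 3, and the list can be shuffled and reused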
And last but not least, the sigmoid function was wrong and caused an overflow because of a simple '-'/'+' mistake. It should look like this:
'''
def sigmoid(z):
return 1.0/(1.0+np.exp(-z))
'''
After applying all fixes, the code should work perfectly.
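For completeness, this is roughly what my training commands look like after those fixes (a sketch; it uses the same network and mnist_loader modules as above, and the book's learning rate of 3.0, since the 0.001 in my original commands is most likely far too small to show progress within 30 epochs):

import network
import mnist_loader

# Wrapped data: lists of (x, y) pairs, as produced by load_data_wrapper above.
training_data, validation_data, test_data = mnist_loader.load_data_wrapper()

net = network.Network([784, 30, 10])
net.SGD(training_data, 30, 10, 3.0, test_data=test_data)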
Special thanks to the contributors at https://github.com/nndl-solutions/NNDL-solutions
I am following a tutorial on RNNs in TensorFlow, but I have a question concerning the input formats.
They take raw_x (a one-hot vector) and first cut it up into pieces of length 200 (batch_size) to form data_x. That is fine.
Then they further cut up data_x into pieces of length 5 (num_steps, or graph width) with:
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
However, if I look at the data, the slices of x do not match data_x. The first one does, but then they diverge.
Am I misunderstanding the above code? I would like to understand how x is being created and what it is supposed to look like.
I had expected the second item to be 0 1 0 1 0.
Also, I thought an epoch is when you go through the data completely; from this it seems that they split the data into 1000 parts (epoch_size)?
If it helps, this is my full code. I am trying to figure out what is going on with x, at line 48:
import numpy as np
import tensorflow as tf
# %matplotlib inline
import matplotlib.pyplot as plt
# Global config variables
num_steps = 5 # number of truncated backprop steps ('n' in the discussion above)
batch_size = 200
num_classes = 2
state_size = 4
learning_rate = 0.1
def gen_data(size=1000000):
print('generating data');
X = np.array(np.random.choice(2, size=(size,)))
Y = []
for i in range(size):
threshold = 0.5
if X[i-3] == 1:
threshold += 0.5
if X[i-8] == 1:
threshold -= 0.25
if np.random.rand() > threshold:
Y.append(0)
else:
Y.append(1)
return X, np.array(Y)
# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/reader.py
def gen_batch(raw_data, batch_size, num_steps):
print('generating batches');
raw_x, raw_y = raw_data
data_length = len(raw_x)
# partition raw data into batches and stack them vertically in a data matrix
batch_partition_length = data_length // batch_size
data_x = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
data_y = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
for i in range(batch_size):
data_x[i] = raw_x[batch_partition_length * i:batch_partition_length * (i + 1)]
data_y[i] = raw_y[batch_partition_length * i:batch_partition_length * (i + 1)]
# further divide batch partitions into num_steps for truncated backprop
epoch_size = batch_partition_length // num_steps
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
def gen_epochs(n, num_steps):
for i in range(n):
yield gen_batch(gen_data(), batch_size, num_steps)
"""
Placeholders
"""
x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
init_state = tf.zeros([batch_size, state_size])
"""
RNN Inputs
"""
# Turn our x placeholder into a list of one-hot tensors:
# rnn_inputs is a list of num_steps tensors with shape [batch_size, num_classes]
x_one_hot = tf.one_hot(x, num_classes)
rnn_inputs = tf.unstack(x_one_hot, axis=1)
"""
Definition of rnn_cell
This is very similar to the __call__ method on Tensorflow's BasicRNNCell. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py
"""
with tf.variable_scope('rnn_cell'):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
def rnn_cell(rnn_input, state):
with tf.variable_scope('rnn_cell', reuse=True):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
return tf.tanh(tf.matmul(tf.concat(axis=1, values=[rnn_input, state]), W) + b)
"""
Adding rnn_cells to graph
This is a simplified version of the "rnn" function from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn.py
"""
state = init_state
rnn_outputs = []
for rnn_input in rnn_inputs:
state = rnn_cell(rnn_input, state)
rnn_outputs.append(state)
final_state = rnn_outputs[-1]
"""
Predictions, loss, training step
Losses and total_loss are simlar to the "sequence_loss_by_example" and "sequence_loss"
functions, respectively, from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/seq2seq.py
"""
#logits and predictions
with tf.variable_scope('softmax'):
W = tf.get_variable('W', [state_size, num_classes])
b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
predictions = [tf.nn.softmax(logit) for logit in logits]
# Turn our y placeholder into a list labels
y_as_list = [tf.squeeze(i, axis=[1]) for i in tf.split(axis=1, num_or_size_splits=num_steps, value=y)]
#losses and train_step
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logit,labels=label) for \
logit, label in zip(logits, y_as_list)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
"""
Function to train the network
"""
def train_network(num_epochs, num_steps, state_size=4, verbose=True):
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
training_losses = []
for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_loss = 0
training_state = np.zeros((batch_size, state_size))
if verbose:
print("\nEPOCH", idx)
for step, (X, Y) in enumerate(epoch):
tr_losses, training_loss_, training_state, _ = \
sess.run([losses,
total_loss,
final_state,
train_step],
feed_dict={x:X, y:Y, init_state:training_state})
training_loss += training_loss_
if step % 100 == 0 and step > 0:
if verbose:
print("Average loss at step", step,
"for last 250 steps:", training_loss/100)
training_losses.append(training_loss/100)
training_loss = 0
return training_losses
training_losses = train_network(1,num_steps)
plt.plot(training_losses)
It seems the batches are effectively transposed: each row of data_x is a contiguous chunk of raw_x of length batch_partition_length, and each yielded x takes the next num_steps columns of data_x.
So the first row of the first x matrix (200 x 5) does match the first 5 elements of raw_x, but only in the next iteration do elements 5-10 of raw_x appear in the first row of x (again).
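To see this concretely, here is a small sketch of the same partitioning with tiny numbers (batch_size 3, num_steps 2, and a raw sequence 0..11 standing in for the real data), showing that each yielded x takes the next num_steps columns of data_x:

import numpy as np

raw_x = np.arange(12)                                # stand-in for raw data
batch_size, num_steps = 3, 2
batch_partition_length = len(raw_x) // batch_size    # 4

data_x = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
for i in range(batch_size):
    data_x[i] = raw_x[batch_partition_length * i:batch_partition_length * (i + 1)]
print(data_x)
# [[ 0  1  2  3]
#  [ 4  5  6  7]
#  [ 8  9 10 11]]

epoch_size = batch_partition_length // num_steps     # 2
for i in range(epoch_size):
    x = data_x[:, i * num_steps:(i + 1) * num_steps]
    print(x)
# first slice:  [[0 1] [4 5] [8 9]]   -- rows start at elements 0, 4, 8
# second slice: [[ 2  3] [ 6  7] [10 11]] -- rows continue at 2, 6, 10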