simple example of mxnet model parallelism - python

The simple examples in the Gluon tutorial for MXNet are very helpful to those of us who are just getting started with MXNet. As yet, there is no simple example for model parallelism. I see the model parallelism example code for LSTM, but I am new to MXNet and it would help me (and perhaps others) to have a more streamlined example. So, I have created a model parallelism example by working off the regression example in the Gluon tutorial, and by mixing in some code from mxnet.gluon.Trainer.
However, I am clearly getting something wrong. The gradients do not seem to be updated. Can anyone assist by identifying the problem(s)? The goal here is to create a linear regression model that has three layers, each held on a different gpu. The model itself is not useful, except as an example to show how initialization and training can occur for model parallelism, when using a custom block and imperative programming.
As I understand it, Trainer() is written for data parallelism. It will not work for model parallelism in that it requires all parameters to be initialized on all GPUs.
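For contrast, here is a rough sketch of the usual data-parallel pattern with gluon.Trainer, which is what that replicated-parameters requirement refers to. This is not part of my code below; the net, loss and data are placeholders.
import mxnet as mx
from mxnet import nd, autograd, gluon

# data parallelism (sketch): the SAME parameters live on every device,
# each device processes a slice of the batch, and Trainer aggregates the gradients
ctx_list = [mx.gpu(0), mx.gpu(1)]
net = gluon.nn.Dense(1)                                    # any ordinary block
net.collect_params().initialize(ctx=ctx_list)              # replicate parameters on all devices
trainer = gluon.Trainer(net.collect_params(), 'adam', {'learning_rate': 0.001})
loss_fn = gluon.loss.L2Loss()

data = gluon.utils.split_and_load(nd.random.normal(shape=(100, 2)), ctx_list)
label = gluon.utils.split_and_load(nd.random.normal(shape=(100, 1)), ctx_list)
with autograd.record():
    losses = [loss_fn(net(d), l) for d, l in zip(data, label)]
for l in losses:
    l.backward()
trainer.step(100)                                           # sums gradients across devices, then updates
The model-parallel attempt below instead keeps each block's parameters on a single GPU and moves the activations between devices inside forward().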
import os
import numpy as np
import mxnet as mx
from mxnet import nd, autograd, gluon
from mxnet.gluon import Block
# make some data
num_inputs = 2
num_outputs = 1
num_examples = 10000
def real_fn(X):
    return 2 * X[:, 0] - 3.4 * X[:, 1] + 4.2
X = np.random.normal(0,1, (num_examples, num_inputs))
noise = 0.001 * np.random.normal(0,1, (num_examples))
y = real_fn(X) + noise
y = y.reshape(-1,1)
# configuration
hidden_layers = 2
num_gpus = hidden_layers + 1
ctxList = [mx.gpu(i) for i in range(num_gpus)]
#ctxList = [mx.gpu() for i in range(num_gpus)]
#os.environ["MXNET_ENGINE_TYPE"] = "NaiveEngine"
print("\n")
# ======================================================================
class myDenseBlock(Block):
    """
    A custom layer
    """
    def __init__(self, layer_number, size_input, size_output, **kwargs):
        super(myDenseBlock, self).__init__(**kwargs)
        self.layer_number = layer_number
        self.size_input = size_input
        self.size_output = size_output
        with self.name_scope():
            # add parameters to the Block's ParameterDict.
            self.w = self.params.get(
                'weight',
                init= mx.init.Xavier(magnitude=2.24),
                shape=(size_input, size_output),
                grad_req = 'write')
            self.b = self.params.get(
                'bias',
                init= mx.init.Constant(0.5),
                shape=(size_output,),
                grad_req = 'write')

    def forward(self, x):
        x = x.as_in_context(ctxList[self.layer_number])
        with x.context:
            linear = nd.dot(x, self.w.data()) + self.b.data()
        return linear
# ======================================================================
# create net
net = gluon.nn.Sequential()
with net.name_scope():
    # initial layer, with X as input
    net.add(myDenseBlock(0,
        size_input = 2,
        size_output = 2))
    for ii in range(hidden_layers-1):
        net.add(myDenseBlock(ii+1,
            size_input = 2,
            size_output = 2))
    # final block, Y is nx1
    net.add(myDenseBlock(ii+2,
        size_input = 2,
        size_output = 1))
# initialize parameters for different blocks (layers) on different GPUs.
params = net.collect_params()
"""
The parameters are:
sequential0_mydenseblock0_weight
sequential0_mydenseblock0_bias
sequential0_mydenseblock1_weight
sequential0_mydenseblock1_bias
sequential0_mydenseblock2_weight
sequential0_mydenseblock2_bias
"""
print("\ninitializing:")
for i, param in enumerate(params):
    if 'mydenseblock0' in param:
        params[param].initialize(ctx=ctxList[0])
    elif 'mydenseblock1' in param:
        params[param].initialize(ctx=ctxList[1])
    elif 'mydenseblock2' in param:
        params[param].initialize(ctx=ctxList[2])
    print(" ", i, param, " ", params[param].list_data()[0].context)
print("\n")
def square_loss(yhat, y):
    return nd.mean((yhat - y) ** 2)
def mytrainer(updaters, params, ignore_stale_grad=False):
    #print("\n")
    for i, param in enumerate(params):
        #print(i, param, " ", len(params[param].list_data()), params[param].list_data()[0].context)
        if params[param].grad_req == 'null':
            continue
        if not ignore_stale_grad:
            for data in params[param].list_data():
                if not data._fresh_grad:
                    print(
                        "`%s` on context %s has not been updated"%(params[param].name, str(data.context)))
                    assert False
        for upd, arr, grad in zip(updaters, params[param].list_data(), params[param].list_grad()):
            if not ignore_stale_grad or arr._fresh_grad:
                upd(i, grad, arr)
                arr._fresh_grad = False
                #print ("grad= ", grad)
batch_size = 100
epochs = 100000
iteration = -1
opt = mx.optimizer.create('adam', learning_rate=0.001, rescale_grad = 1 / batch_size)
updaters = [mx.optimizer.get_updater(opt)]
# the following definition for updaters does not work either
#updaters = [mx.optimizer.get_updater(opt) for _ in ctxList]
results = []
for e in range(epochs):
    train_groups = np.array_split(np.arange(X.shape[0]), X.shape[0]/batch_size)
    for ii, idx in enumerate(train_groups):
        iteration += 1
        xtrain, ytrain = X[idx,:], y[idx]
        xtrain = nd.array(xtrain)
        xtrain = xtrain.as_in_context(ctxList[0])
        ytrain = nd.array(ytrain).reshape((-1, 1))
        ytrain = ytrain.as_in_context(ctxList[0])
        with autograd.record():
            yhat = net(xtrain)
            error = square_loss(yhat, ytrain.as_in_context(ctxList[-1]))
            # Question: does the call to error.backward() go under the indent
            # for autograd.record() or outside the indent? The gluon examples have
            # it both ways
            error.backward()
        mytrainer(updaters, net.collect_params())
        if iteration%10 == 0:
            results.append([iteration, error.asnumpy().item()])
            print(("epoch= {:5,d}, iter= {:6,d}, error= {:6.3E}").format(
                e, iteration, error.asnumpy().item()))
The code fails at the "if not data._fresh_grad" test in mytrainer(). The output is:
initializing:
0 sequential0_mydenseblock0_weight gpu(0)
1 sequential0_mydenseblock0_bias gpu(0)
2 sequential0_mydenseblock1_weight gpu(1)
3 sequential0_mydenseblock1_bias gpu(1)
4 sequential0_mydenseblock2_weight gpu(2)
5 sequential0_mydenseblock2_bias gpu(2)
`sequential0_mydenseblock0_weight` on context gpu(0) has not been updated
I can verify using mx.autograd.get_symbol(error).tojson() that the computational graph only extends to the parameters on gpu(2), and does not reach the other GPUs.

Yes, per @sergei's comment, moving to MXNet v1.0.0 solves this.
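A quick sanity check (hypothetical, reusing the net defined in the question) is to look at each parameter's gradient on its own device right after error.backward(); after the upgrade every layer should show a non-zero gradient:
for name, param in net.collect_params().items():
    g = param.list_grad()[0]
    print(name, g.context, float(g.abs().sum().asscalar()))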

Related

ListWrapper not allowing multiplication of learning rate and thus no update of weights for neural network

I am new to TensorFlow and neural networks. I am trying to create a NN to estimate y = x^2.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
x_train = tf.constant(value = np.linspace(-10,10,50),dtype='float32')
x_train = tf.reshape(x_train,shape=[50,1])
y_train = x_train**2
layers = [1,3,4,1]
I created a neural network class to obtain my weights and biases and run forward propagation.
class NN(tf.Module):
    def __init__(self, layers, name=None):
        super().__init__(name=name)
        self.layers = layers
        self.weights, self.biases = self.initialze(layers)

    def initialze(self, layers):
        num_layers = len(layers)
        weights = []
        biases = []
        for i in range(num_layers-1):
            in_dim = layers[i]
            out_dim = layers[i+1]
            stddev = np.sqrt(2/(in_dim + out_dim))
            b = tf.Variable(tf.zeros([1, layers[i+1]], dtype='float32'), dtype='float32')
            W = tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=stddev), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def __call__(self, x):
        Z = x
        num_layers = len(self.layers)
        for i in range(num_layers-1):
            Z = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        return Z
My_NN = NN(layers)
Next I created a class updat to do backward propagation.
class updat:
    def __init__(self, y_train, x_train):
        self.y_train = y_train
        self.x_train = x_train
        self.l_r = 0.1

    def get_grad(self, My_NN):
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(My_NN.weights)
            tape.watch(My_NN.biases)
            loss = tf.reduce_mean(tf.square(self.y_train-My_NN(self.x_train)))
        dw, db = tape.gradient(loss, [My_NN.weights, My_NN.biases])
        print(dw, 'weight')
        print(db, 'biases')
        My_NN.weights -= (self.l_r * dw)
        My_NN.biases -= (self.l_r * db)
        del tape
        return loss

    def report(self, loss):
        return f"W = {My_NN.weights.numpy():1.2f}, b = {My_NN.biases.numpy():1.2f}, loss={loss:2.5f}"

    def prop(self, epochs, My_NN):
        for epoch in epochs:
            loss = self.get_grad(My_NN)
            current_loss = loss
            print(f"Epoch {epoch:2d}:")
            print("    ", report(current_loss, My_NN))
But when I run the code
model = updat(y_train,x_train)
epochs = range(10)
model.prop(epochs,My_NN)
I get an error saying
My_NN.weights -= (self.l_r * dw)
My_NN.biases -=(self.l_r * db)
TypeError: can't multiply sequence by non-int of type 'float'
I tried substituting My_NN.weights -= (lr*dw)
with My_NN.weights.assign_sub(lr*dw)
but it still shows that
'ListWrapper' object has no attribute 'assign_sub'
Is there any solution for this?
Changing

My_NN.weights -= (self.l_r * dw)
My_NN.biases -= (self.l_r * db)

to

for weight, d_weight in zip(My_NN.weights, dw):
    weight.assign_sub(self.l_r * d_weight)
for bias, d_bias in zip(My_NN.biases, db):
    bias.assign_sub(self.l_r * d_bias)

solves the problem.
This is because My_NN.weights is a list of references to tf.Variable objects and dw is the corresponding list of tf.constant gradients. We cannot update the variables by doing arithmetic on the list itself; we have to iterate over it. Additionally, if we want to update a tf.Variable, we should use its assign-style methods (assign, assign_sub, etc.); this is like modifying the content pointed to by a pointer variable in C.
More conveniently, we usually use a tf.keras.optimizers optimizer's apply_gradients(), or even minimize(), to update the variables directly.
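As a minimal sketch of that optimizer-based update (assuming the My_NN, x_train and y_train defined above; the SGD choice and learning rate are only examples):
# let an optimizer apply the per-variable updates instead of hand-written assign_sub calls
optimizer = tf.keras.optimizers.SGD(learning_rate=0.001)

with tf.GradientTape() as tape:
    loss = tf.reduce_mean(tf.square(y_train - My_NN(x_train)))
# tf.Module collects the tf.Variables held in self.weights / self.biases automatically
grads = tape.gradient(loss, My_NN.trainable_variables)
optimizer.apply_gradients(zip(grads, My_NN.trainable_variables))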
For this specific task and your more process-oriented coding approach, here are some suggestions for stable training:
Add activations so the model is nonlinear and can actually fit the target:
def __call__(self, x):
    Z = x
    num_layers = len(self.layers)
    for i in range(num_layers-2):
        y = tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
        Z = tf.nn.relu(y)
    i += 1
    return tf.math.add(tf.linalg.matmul(Z, self.weights[i]), self.biases[i])
Use a lower learning rate:
self.l_r = 0.001 # self.l_r = 0.1
Train for more epochs:
epochs = range(1000) # epochs = range(10)
Since the initial values of the trainable weights also influence training stability, you may need to re-train several times. In my tests, the above modifications work.

How to Implement Vectorized Backprop in Numpy

I'm working on a school project and am stuck on how to implement backpropagation in Numpy with the current forward prop structure I have. The aim of this script is to make a simple dynamic (meaning any number of layers and nodes) fully connected network using only numpy.
I think that I have to find the derivatives of the activation functions and multiply them by the original error, as well as the derivative of each activation function I encounter moving backward.
However, I'm having trouble figuring out how to implement this correctly in my script.
It'd be a great help if someone could explain in English what exactly I have to do given the complexities of the setup here, or even give a recommendation for a video/post that deals with dynamic-size backprop.
Right now all the weights and biases are being stored in lists for future backprop, and I'm able to get the error for each output with the small amount of code currently in the backprop function.
This code block
#initialize a test model w/ batch size of 128 and lr of 0.01
model = Model(128, 0.01)
#simple x data input
X = np.array([[1,1],[0,0],[12,5]])
Y = np.array([[1],[0],[-1]])
#adding 4 layers
z = model.add(X, 3, "sigmoid")
z = model.add(z, 1, "sigmoid", output=True)
#this is a full forward pass through the layers
z = model.predict(X)
print(z)
#this is the error of the predictions
print(model.backprop(z, Y))
Outputs the following vectors:
[[0.50006457]
[0.50006459]
[0.50006431]]
[[0.24993544]
[0.2500646 ]
[2.25019293]]
Like I said, not sure how to move forward ( or backward ;) ) from here.
Below is the full script needed to run the example:
import math
import numpy as np
#everything below is defining activation functions
#--------------------------------------------------------------------------------------------
def b_relu(input):
    return max((0, max(input)))
def bd_relu(input):
    if(input < 0 or input == 0):
        return 0
    else:
        return 1
def b_sigmoid(x):
    return 1 / (1 + math.exp(-x))
def bd_sigmoid(input):
    return sigmoid(input) * (1 - sigmoid(input))
def b_tanh(input):
    top = (math.exp(input) - math.exp(-input))
    bottom = (math.exp(input) + math.exp(-input))
    return (top/bottom)
#helper functions for tanh
def cosh(input):
    return ((math.exp(input) + math.exp(-input)) / 2)
def sinh(input):
    return ((math.exp(input) - math.exp(-input)) / 2)
def bd_tanh(input):
    # d/dx tanh(x) = (cosh^2 - sinh^2) / cosh^2 = 1 / cosh(x)^2
    top = (math.pow(cosh(input), 2) - math.pow(sinh(input), 2))
    bottom = math.pow(cosh(input), 2)
    return (top / bottom)
def b_softmax(z):
    # subtracting the max adds numerical stability
    shiftx = z - np.max(z,axis=1)[:,np.newaxis]
    exps = np.exp(shiftx)
    return exps / np.sum(exps,axis=1)[:,np.newaxis]
def bd_softmax(Y_hat, Y):
    return Y_hat - Y
def b_linear(input):
    return input
def bd_linear(input):
    return 1
#vectorizing the activation and deriv. activation functions
relu = np.vectorize(b_relu)
d_relu = np.vectorize(bd_relu)
sigmoid = np.vectorize(b_sigmoid)
d_sigmoid = np.vectorize(bd_sigmoid)
tanh = np.vectorize(b_tanh)
d_tanh = np.vectorize(bd_tanh)
softmax = np.vectorize(b_softmax)
d_softmax = np.vectorize(bd_softmax)
linear = np.vectorize(b_linear)
d_linear = np.vectorize(bd_linear)
class Model:
    def __init__(self, batch, lr):
        #initializing self lists to keep track of stuff for batches, forward prop & backprop
        self.batch = batch
        self.lr = lr
        self.W = []
        self.B = []
        self.A = []
        self.Z = []
        self.X = []
        self.layers = []
        self.tempW = []
        self.tempB = []
        #store error for backprop
        self.output_error = []

    #initialize the weights during 'model.add' so we can test our network shapes dynamically w/out model.compile
    #added an output bool here so we can make sure the shape of the output network is (1,n)
    def initial_weights(self, input_data, output_shape, output=False):
        B = np.zeros((1, output_shape))
        #assigning the shape
        W = np.random.uniform(-1e-3, 1e-3, size = (input_data.shape[len(input_data.shape) - 1], output_shape))
        self.B.append(B)
        self.W.append(W)

    def add(self, input_data, output_shape, activation, output=False):
        #append to layers so we have a correct index value
        self.layers.append(69)
        #making sure our data is in a numpy array
        if (type(input_data) == np.ndarray):
            X = input_data
        else:
            X = np.asarray(input_data)
        #adding data and activations to self lists
        self.X.append(X)
        self.A.append(activation)
        #keep track of our index & initializing random weights for dynamic compatibility testing
        index = len(self.layers)-1
        self.initial_weights(input_data, output_shape, output=False)
        X2 = self.forward(input_data, index)
        #printing layer info
        print("Layer:", index)
        print("Input Shape: ", X.shape)
        print("Weight Shape: ", self.W[index].shape)
        print("Output Shape: ", X2.shape)
        print(" ")
        return(X2)

    def forward(self, input_data, index):
        #pulling weights and biases from main lists for operations
        B = self.B[index]
        W = self.W[index]
        #matmul of data @ weights + bias
        Z = np.matmul(input_data, W) + B
        #summing each row of inputs to activation node
        for x in Z:
            x = sum(x)
        #pulling activation from index
        act = str(self.A[index])
        #activating
        Z = activate(Z, act)
        #keeping track of Z i guess
        self.Zappend = Z
        return(Z)

    def predict(self, input_data):
        for x in range(len(self.layers)):
            z = model.forward(input_data, x)
            input_data = z
        return z

    def backprop(self, model_output, ground_truth):
        #------------------------------
        #now begins the backprop portion
        #let's start with finding the error between predictions and actual values
        #gonna do MSE to keep it simple
        self.output_error = (ground_truth - model_output) ** 2
        #so now we have the error of the output layer, this tells us two things, how wrong we were, and in which direction we should update
        #the outputs of these nodes
        '''
        What to do if this was linear regression (for m & b)
        1. Take the error and multiply it by the transpose of the last layer weights
        (I think the error in this case is where the prime activation function should be if we had activations)
        2. The last layer bias is just the error
        3. The second to last layer inputs is the bias times the transpose of second layers weights
        3. Then I have no idea
        '''
        return self.output_error
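For what it's worth, below is a minimal, generic sketch of how a vectorized backward pass for a stack of dense layers is usually written. The helper names are hypothetical (not taken from the script above); it assumes each layer's input A_prev and pre-activation Z were cached during the forward pass, and it uses a mean-squared-error loss.
import numpy as np

def dense_backward(dA, cache, d_activation, lr):
    # one layer's backward step; cache = (A_prev, W, b, Z) saved during the forward pass
    A_prev, W, b, Z = cache
    dZ = dA * d_activation(Z)                 # chain rule through the activation
    dW = A_prev.T @ dZ                        # same shape as W: (in_dim, out_dim)
    db = dZ.sum(axis=0, keepdims=True)        # same shape as b: (1, out_dim)
    dA_prev = dZ @ W.T                        # error passed back to the previous layer
    W -= lr * dW                              # update the cached arrays in place
    b -= lr * db
    return dA_prev

def backprop(caches, y_hat, y, d_activations, lr=0.01):
    # walk the layers in reverse, starting from the gradient of the MSE loss
    dA = 2 * (y_hat - y) / y.shape[0]
    for cache, d_act in zip(reversed(caches), reversed(d_activations)):
        dA = dense_backward(dA, cache, d_act, lr)
The key idea is that each layer only needs the gradient of the loss with respect to its own output (dA) plus what it cached on the way forward.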

Why is everything disconnected in my TensorBoard graph?

I have implemented a CNN for detecting human activity using accelerometer data. My model is working really well, but when I visualize my graph on TensorBoard, everything seems to be disconnected. Right now I am not using name scopes, but even without them the graph should make some sense, right?
EDIT: After implementing the answer given by @user1735003, this is the output. What I still don't understand is why I'm getting all the nodes at the left.
What I have implemented is: I have two convolution layers and two max-pooling layers, and on top of that I have two hidden layers with 1024 and 512 neurons.
So here is my code:
#Weights
def init_weights(shape):
    init_random_dist = tf.truncated_normal(shape, stddev=0.1)
    return tf.Variable(init_random_dist)
#Bias
def init_bias(shape):
    init_bias = tf.constant(0.1,shape=shape)
    return tf.Variable(init_bias)
def conv1d(x,weights):
    #x is input acceleration data and W is corresponding weight
    return tf.nn.conv1d(value=x,filters = weights,stride=1,padding='VALID')
def convolution_layer(input_x,shape):
    w1 = init_weights(shape)
    b = init_bias([shape[2]])
    return tf.nn.relu(conv1d(input_x,weights=w1)+b)
def normal_full_layer(input_layer,size):
    input_size = int(input_layer.get_shape()[1])
    W = init_weights([input_size, size])
    b = init_bias([size])
    return tf.matmul(input_layer, W) +b
x = tf.placeholder(tf.float32,shape=[None ,window_size,3]) #input tensor with 3 input channels
y = tf.placeholder(tf.float32,shape=[None,6]) #Labels
con_layer_1 = convolution_layer(x,shape=[4,3,32])#filter of shape [filter_width, in_channels, out_channels]
max_pool_1=tf.layers.max_pooling1d(inputs=con_layer_1,pool_size=2,strides=2,padding='Valid')
con_layer_2 = convolution_layer(max_pool_1,shape=[4,32,64])
max_pool_2 = tf.layers.max_pooling1d(inputs=con_layer_2,pool_size=2,strides=2,padding='Valid')
flat = tf.reshape(max_pool_2,[-1,max_pool_2.get_shape()[1]*max_pool_2.get_shape()[2]])
fully_conected = tf.nn.relu(normal_full_layer(flat,1024))
second_hidden_layer = tf.nn.relu(normal_full_layer(fully_conected,512))
hold_prob = tf.placeholder(tf.float32)
full_one_dropout = tf.nn.dropout(second_hidden_layer,keep_prob=hold_prob)
y_pred = normal_full_layer(full_one_dropout,6)
pred_softmax = tf.nn.softmax(y_pred)
cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=y_pred))
optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
train = optimizer.minimize(cross_entropy)
init = tf.global_variables_initializer()
with tf.Session() as sess:
    sess.run(init)
    filename="./summary_log11/run"
    summary_writer = tf.summary.FileWriter(filename, graph_def=sess.graph_def)
    for i in range(5000):
        batch_x,batch_y = next_batch(100,X_train,y_train)
        sess.run(train, feed_dict={x: batch_x, y: batch_y, hold_prob: 0.5})
        # PRINT OUT A MESSAGE EVERY 100 STEPS
        if i%100 == 0:
            print('Currently on step {}'.format(i))
            print('Accuracy is:')
            # Test the Train Model
            matches = tf.equal(tf.argmax(y_pred,1),tf.argmax(y,1))
            acc = tf.reduce_mean(tf.cast(matches,tf.float32))
            print(sess.run(acc,feed_dict={x:X_test,y:y_test,hold_prob:1.0}))
            print('\n')
Try organizing your nodes into scopes. That will help Tensorboard to figure out your graph hierarchy. For example,
with tf.variable_scope('input'):
    x = tf.placeholder(tf.float32,shape=[None ,window_size,3]) #input tensor with 3 input channels
    y = tf.placeholder(tf.float32,shape=[None,6]) #Labels
with tf.variable_scope('net'):
    con_layer_1 = convolution_layer(x,shape=[4,3,32]) #filter of shape [filter_width, in_channels, out_channels]
    max_pool_1 = tf.layers.max_pooling1d(inputs=con_layer_1,pool_size=2,strides=2,padding='Valid')
    con_layer_2 = convolution_layer(max_pool_1,shape=[4,32,64])
    max_pool_2 = tf.layers.max_pooling1d(inputs=con_layer_2,pool_size=2,strides=2,padding='Valid')
    flat = tf.reshape(max_pool_2,[-1,max_pool_2.get_shape()[1]*max_pool_2.get_shape()[2]])
    fully_conected = tf.nn.relu(normal_full_layer(flat,1024))
    second_hidden_layer = tf.nn.relu(normal_full_layer(fully_conected,512))
    hold_prob = tf.placeholder(tf.float32)
    full_one_dropout = tf.nn.dropout(second_hidden_layer,keep_prob=hold_prob)
    y_pred = normal_full_layer(full_one_dropout,6)
    pred_softmax = tf.nn.softmax(y_pred)
with tf.variable_scope('loss'):
    cross_entropy = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits(labels=y,logits=y_pred))
with tf.variable_scope('optimizer'):
    optimizer = tf.train.AdamOptimizer(learning_rate=0.001)
    train = optimizer.minimize(cross_entropy)
Since you didn't explicitly name your tf operations, TensorFlow named them automatically, e.g. the ReLU operators were named Relu_1, Relu_2, and so on. According to the TensorBoard documentation:
One last structural simplification is series collapsing. Sequential motifs--that is, nodes whose names differ by a number at the end and have isomorphic structures--are collapsed into a single stack of nodes, as shown below. For networks with long sequences, this greatly simplifies the view.
As you can see at the right side of your graph, all add_[0-7], MatMul_[0-5] and Relu_[0-5] nodes were grouped together because they have similar names. This doesn't mean that nodes are disconnected in your graph; it's just TensorBoard's node-grouping policy.
If you want to avoid this, then give your operations names that differ by more than just a number at the end, or use tf.name_scope() as you mentioned, e.g.:
with tf.name_scope("conv1"):
con_layer_1 = convolution_layer(x,shape=[4,3,32])
max_pool_1=tf.layers.max_pooling1d(inputs=con_layer_1,pool_size=2,strides=2,padding='Valid')
with tf.name_scope("conv2"):
con_layer_2 = convolution_layer(max_pool_1,shape=[4,32,64])
max_pool_2 = tf.layers.max_pooling1d(inputs=con_layer_2,pool_size=2,strides=2,padding='Valid')
# etc.
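Another option (a hypothetical snippet, not from the original post) is to pass an explicit name to the ops, which also avoids the automatic _1, _2 suffixes that trigger series collapsing:
def convolution_layer(input_x, shape, name=None):
    w1 = init_weights(shape)
    b = init_bias([shape[2]])
    return tf.nn.relu(conv1d(input_x, weights=w1) + b, name=name)

con_layer_1 = convolution_layer(x, shape=[4, 3, 32], name="conv1_relu")
con_layer_2 = convolution_layer(max_pool_1, shape=[4, 32, 64], name="conv2_relu")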

Modify neural net to classify single example

This is my custom extension of one of Andrew Ng's neural networks from the deep learning course, where instead of producing 0 or 1 for binary classification I'm attempting to classify multiple examples.
Both the inputs and outputs are one hot encoded.
With not much training I receive an accuracy of 'train accuracy: 67.51658067499625 %'
How can I classify a single training example instead of classifying all training examples?
I think a bug exists in my implementation: an issue with this network is that the training examples (train_set_x) and output values (train_set_y) both need to have the same dimensions, or an error related to the dimensionality of matrices is raised.
For example using :
train_set_x = np.array([
[1,1,1,1],[0,1,1,1],[0,0,1,1]
])
train_set_y = np.array([
[1,1,1],[1,1,0],[1,1,1]
])
returns error :
ValueError Traceback (most recent call last)
<ipython-input-11-0d356e8d66f3> in <module>()
27 print(A)
28
---> 29 np.multiply(train_set_y,A)
30
31 def initialize_with_zeros(numberOfTrainingExamples):
ValueError: operands could not be broadcast together with shapes (3,3) (1,4)
network code :
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from scipy import ndimage
import pandas as pd
%matplotlib inline
train_set_x = np.array([
[1,1,1,1],[0,1,1,1],[0,0,1,1]
])
train_set_y = np.array([
[1,1,1,0],[1,1,0,0],[1,1,1,1]
])
numberOfFeatures = 4
numberOfTrainingExamples = 3
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

w = np.zeros((numberOfTrainingExamples , 1))
b = 0
A = sigmoid(np.dot(w.T , train_set_x))
print(A)
np.multiply(train_set_y,A)

def initialize_with_zeros(numberOfTrainingExamples):
    w = np.zeros((numberOfTrainingExamples , 1))
    b = 0
    return w, b

def propagate(w, b, X, Y):
    m = X.shape[1]
    A = sigmoid(np.dot(w.T , X) + b)
    cost = -(1/m)*np.sum(np.multiply(Y,np.log(A)) + np.multiply((1-Y),np.log(1-A)), axis=1)
    dw = ( 1 / m ) * np.dot( X, ( A - Y ).T )    # consumes ( A - Y )
    db = ( 1 / m ) * np.sum( A - Y )             # consumes ( A - Y ) again
    # cost = np.squeeze(cost)
    grads = {"dw": dw,
             "db": db}
    return grads, cost

def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = True):
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]
        w = w - (learning_rate * dw)
        b = b - (learning_rate * db)
        if i % 100 == 0:
            costs.append(cost)
        if print_cost and i % 10000 == 0:
            print(cost)
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs

def model(X_train, Y_train, num_iterations, learning_rate = 0.5, print_cost = False):
    w, b = initialize_with_zeros(numberOfTrainingExamples)
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost = True)
    w = parameters["w"]
    b = parameters["b"]
    Y_prediction_train = sigmoid(np.dot(w.T , X_train) + b)
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.0001, print_cost = True)
Update: A bug exists in this implementation in that the training example pairs (train_set_x, train_set_y) must contain the same dimensions. Can anyone point me in the direction of how the linear algebra should be modified?
Update 2:
I modified @Paul Panzer's answer so that the learning rate is 0.001 and the train_set_x, train_set_y pairs are unique:
train_set_x = np.array([
[1,1,1,1,1],[0,1,1,1,1],[0,0,1,1,0],[0,0,1,0,1]
])
train_set_y = np.array([
[1,0,0],[0,0,1],[0,1,0],[1,0,1]
])
grads = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True)
# To classify single training example :
print(sigmoid(dw @ [0,0,1,1,0] + db))
This update produces following output :
-2.09657359028
-3.94918577439
[[ 0.74043089 0.32851512 0.14776077 0.77970162]
[ 0.04810012 0.08033521 0.72846174 0.1063849 ]
[ 0.25956911 0.67148488 0.22029838 0.85223923]]
[[1 0 0 1]
[0 0 1 0]
[0 1 0 1]]
train accuracy: 79.84462279013312 %
[[ 0.51309252 0.48853845 0.50945862]
[ 0.5110232 0.48646923 0.50738869]
[ 0.51354109 0.48898712 0.50990734]]
Should print(sigmoid(dw @ [0,0,1,1,0] + db)) produce a vector that, once rounded, matches the corresponding train_set_y value, [0,1,0]?
Modifying it to produce a vector (by making [0,0,1,1,0] a numpy array and taking the transpose):
print(sigmoid(dw @ np.array([[0,0,1,1,0]]).T + db))
returns :
array([[ 0.51309252],
[ 0.48646923],
[ 0.50990734]])
Again, rounding these values to the nearest whole number produces the vector [1,0,1] when [0,1,0] is expected.
Are these incorrect operations for producing a prediction for a single training example?
Your difficulties come from mismatched dimensions, so let's walk through the problem and try and get them straight.
Your network has a number of inputs, the features, let's call their number N_in (numberOfFeatures in your code). And it has a number of outputs which correspond to different classes let's call their number N_out. Inputs and outputs are connected by the weights w.
Now here is the problem. Connections are all-to-all, so we need a weight for each of the N_out x N_in pairs of outputs and inputs. Therefore in your code the shape of w must be changed to (N_out, N_in). You probably also want an offset b for each output, so b should be a vector of size (N_out,) or rather (N_out, 1) so it plays well with the 2d terms.
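To make those shapes concrete, a tiny hypothetical example (arbitrary sizes):
import numpy as np

N_in, N_out, N_samples = 4, 3, 5                     # features, classes, training samples
w = np.zeros((N_out, N_in))                          # one weight per (output, input) pair
b = np.zeros((N_out, 1))                             # one offset per output
X = np.random.randint(0, 2, (N_in, N_samples))       # samples as columns
A = 1 / (1 + np.exp(-(w @ X + b)))                   # predictions: shape (N_out, N_samples)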
I've fixed that in the modified code below and I tried to make it very explicit. I've also thrown a mock data creator into the bargain.
Re the one-hot encoded categorical output, I'm not an expert on neural networks, but I think most people understand it so that classes are mutually exclusive, so each sample in your mock output should have a single one and the rest zeros.
Side note:
At one point a competing answer advised you to get rid of the 1-... terms in the cost function. While that looks like an interesting idea to me my gut feeling (Edit Now confirmed using gradient-free minimizer; use activation="hybrid" in code below. Solver will simply maximize all outputs which are active in at least one training example.) is it won't work just like that because the cost will then fail to penalise false positives (see below for detailed explanation). To make it work you'd have to add some kind of regularization. One method that appears to work is using the softmax instead of the sigmoid. The softmax is to one-hot what the sigmoid is to binary. It makes sure the output is "fuzzy one-hot".
Therefore my recommendation is:
If you want to stick with sigmoid and not explicitly enforce one-hot predictions, keep the 1-... term.
If you want to use the shorter cost function, enforce one-hot predictions, for example by using softmax instead of sigmoid.
I've added an activation="sigmoid"|"softmax"|"hybrid" parameter to the code that switches between models. I've also made the scipy general purpose minimizer available, which may be useful when the gradient of the cost is not at hand.
Recap on how the cost function works:
The cost is a sum over all classes and all training samples of the term
-y log (y') - (1-y) log (1-y')
where y is the expected response, i.e. the one given by the "y" training sample for the input (the "x" training sample). y' is the prediction, the response the network with its current weights and biases generates. Now, because the expected response is either 0 or 1 the cost for a single category and a single training sample can be written
-log (y') if y = 1
-log(1-y') if y = 0
because in the first case (1-y) is zero, so the second term vanishes, and in the second case y is zero, so the first term vanishes.
One can now convince oneself that the cost is high if
the expected response y is 1 and the network prediction y' is close to zero
the expected response y is 0 and the network prediction y' is close to one
In other words the cost does its job in punishing wrong predictions. Now, if we drop the second term (1-y) log (1-y') half of this mechanism is gone. If the expected response is 1, a low prediction will still incur a cost, but if the expected response is 0, the cost will be zero, regardless of the prediction, in particular, a high prediction (or false positive) will go unpunished.
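A quick numeric illustration of that last point (arbitrary values):
import numpy as np

y, y_hat = 0, 0.9                                                 # expected 0, network confidently predicts ~1
full_cost = -(y * np.log(y_hat) + (1 - y) * np.log(1 - y_hat))    # ~2.30: the false positive is punished
short_cost = -(y * np.log(y_hat))                                 # 0.0: the false positive goes unnoticed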
Now, because the total cost is a sum over all training samples, there are three possibilities.
all training samples prescribe that the class be zero:
then the cost will be completely independent of the predictions for this class and no learning can take place
some training samples put the class at zero, some at one:
then because "false negatives" or "misses" are still punished but false positives aren't the net will find the easiest way to minimize the cost which is to indiscriminately increase the prediction of the class for all samples
all training samples prescribe that the class be one:
essentially the same as in the second scenario will happen, only here it's no problem, because that is the correct behavior
And finally, why does it work if we use softmax instead of sigmoid? False positives will still be invisible. Now it is easy to see that the sum over all classes of the softmax is one. So I can only increase the prediction for one class if at least one other class is reduced to compensate. In particular, there can be no false positive without a false negative, and the cost will detect the false negative.
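A small demonstration of that coupling, using the softmax helper from the code further down (arbitrary numbers):
z = np.array([[2.0], [0.0], [0.0]])    # one sample (column), three classes
p = softmax(z)                         # columns sum to 1: roughly [0.79, 0.11, 0.11]
z[0] += 1.0                            # push class 0 even higher...
p2 = softmax(z)                        # ...and classes 1 and 2 necessarily drop: roughly [0.91, 0.05, 0.05]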
On how to get a binary prediction:
For binary expected responses rounding is indeed the appropriate procedure. For one-hot I'd rather find the largest value, set that to one and all others to zero. I've added a convenience function, predict, implementing that.
import numpy as np
from scipy import optimize as opt
from collections import namedtuple
# First, a few structures to keep ourselves organized
Problem_Size = namedtuple('Problem_Size', 'Out In Samples')
Data = namedtuple('Data', 'Out In')
Network = namedtuple('Network', 'w b activation cost gradient most_likely')
def get_dims(Out, In, transpose=False):
    """extract dimensions and ensure everything is 2d
    return Data, Dims"""
    # gracefully accept lists etc.
    Out, In = np.asanyarray(Out), np.asanyarray(In)
    if transpose:
        Out, In = Out.T, In.T
    # if it's a single sample make sure it's n x 1
    Out = Out[:, None] if len(Out.shape) == 1 else Out
    In = In[:, None] if len(In.shape) == 1 else In
    Dims = Problem_Size(Out.shape[0], *In.shape)
    if Dims.Samples != Out.shape[1]:
        raise ValueError("number of samples must be the same for Out and In")
    return Data(Out, In), Dims

def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

def sig_cost(Net, data):
    A = process(data.In, Net)
    logA = np.log(A)
    return -(data.Out * logA + (1-data.Out) * np.log(1-A)).sum(axis=0).mean()

def sig_grad(Net, Dims, data):
    A = process(data.In, Net)
    return dict(dw = (A - data.Out) @ data.In.T / Dims.Samples,
                db = (A - data.Out).mean(axis=1, keepdims=True))

def sig_ml(z):
    return np.round(z).astype(int)

def sof_ml(z):
    hot = np.argmax(z, axis=0)
    z = np.zeros(z.shape, dtype=int)
    z[hot, np.arange(len(hot))] = 1
    return z

def softmax(z):
    z = z - z.max(axis=0, keepdims=True)
    z = np.exp(z)
    return z / z.sum(axis=0, keepdims=True)

def sof_cost(Net, data):
    A = process(data.In, Net)
    logA = np.log(A)
    return -(data.Out * logA).sum(axis=0).mean()

sof_grad = sig_grad
def get_net(Dims, activation='softmax'):
    activation, cost, gradient, ml = {
        'sigmoid': (sigmoid, sig_cost, sig_grad, sig_ml),
        'softmax': (softmax, sof_cost, sof_grad, sof_ml),
        'hybrid': (sigmoid, sof_cost, None, sig_ml)}[activation]
    return Network(w=np.zeros((Dims.Out, Dims.In)),
                   b=np.zeros((Dims.Out, 1)),
                   activation=activation, cost=cost, gradient=gradient,
                   most_likely=ml)

def process(In, Net):
    return Net.activation(Net.w @ In + Net.b)

def propagate(data, Dims, Net):
    return Net.gradient(Net, Dims, data), Net.cost(Net, data)

def optimize_no_grad(Net, Dims, data):
    def f(x):
        Net.w[...] = x[:Net.w.size].reshape(Net.w.shape)
        Net.b[...] = x[Net.w.size:].reshape(Net.b.shape)
        return Net.cost(Net, data)
    x = np.r_[Net.w.ravel(), Net.b.ravel()]
    res = opt.minimize(f, x, options=dict(maxiter=10000)).x
    Net.w[...] = res[:Net.w.size].reshape(Net.w.shape)
    Net.b[...] = res[Net.w.size:].reshape(Net.b.shape)
def optimize(Net, Dims, data, num_iterations, learning_rate, print_cost = True):
    w, b = Net.w, Net.b
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(data, Dims, Net)
        dw = grads["dw"]
        db = grads["db"]
        w -= learning_rate * dw
        b -= learning_rate * db
        if i % 100 == 0:
            costs.append(cost)
        if print_cost and i % 10000 == 0:
            print(cost)
    return grads, costs

def model(X_train, Y_train, num_iterations, learning_rate = 0.5, print_cost = False, activation='sigmoid'):
    data, Dims = get_dims(Y_train, X_train, transpose=True)
    Net = get_net(Dims, activation)
    if Net.gradient is None:
        optimize_no_grad(Net, Dims, data)
    else:
        grads, costs = optimize(Net, Dims, data, num_iterations, learning_rate, print_cost = True)
    Y_prediction_train = process(data.In, Net)
    print(Y_prediction_train)
    print(data.Out)
    print(Y_prediction_train.sum(axis=0))
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - data.Out)) * 100))
    return Net

def predict(In, Net, probability=False):
    In = np.asanyarray(In)
    is1d = In.ndim == 1
    if is1d:
        In = In.reshape(-1, 1)
    Out = process(In, Net)
    if not probability:
        Out = Net.most_likely(Out)
    if is1d:
        Out = Out.reshape(-1)
    return Out

def create_data(Dims):
    Out = np.zeros((Dims.Out, Dims.Samples), dtype=int)
    Out[np.random.randint(0, Dims.Out, (Dims.Samples,)), np.arange(Dims.Samples)] = 1
    In = np.random.randint(0, 2, (Dims.In, Dims.Samples))
    return Data(Out, In)
train_set_x = np.array([
[1,1,1,1,1],[0,1,1,1,1],[0,0,1,1,0],[0,0,1,0,1]
])
train_set_y = np.array([
[1,0,0],[1,0,0],[0,0,1],[0,0,1]
])
Net1 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='sigmoid')
Net2 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='softmax')
Net3 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='hybrid')
Dims = Problem_Size(8, 100, 50)
data = create_data(Dims)
model(data.In.T, data.Out.T, num_iterations = 40000, learning_rate = 0.001, print_cost = True, activation='softmax')
model(data.In.T, data.Out.T, num_iterations = 40000, learning_rate = 0.001, print_cost = True, activation='sigmoid')
Both how to fix the bug and how to extend the implementation to classify between more classes can be worked out with some dimensionality analysis.
I am assuming that by classifying multiple examples you mean multiple classes and not multiple samples, as we need multiple samples to train even for 2 classes.
Let N = number of samples, D = number of features, K = number of categories (with K=2 being a special case where one can reduce this down to one dimension, i.e. K=1 with y=0 signifying one class and y=1 the other). The data should have the following dimensions:
X: N * D #input
y: N * K #output
W: D * K #weights, also dW has same dimensions
b: 1 * K #bias, also db has same dimensions
#A should have same dimensions as y
The order of the dimensions can be switched around, as long as the dot products are done correctly.
First, dealing with your bug: you are initializing W as N * K instead of D * K, i.e. in the binary case:
w = np.zeros((numberOfTrainingExamples , 1))
#instead of
w = np.zeros((numberOfFeatures , 1))
This means that the only time you are initializing W to the correct dimensions is when y and X (coincidentally) have the same dimensions.
This will mess with your dot products as well:
np.dot(X, w) # or np.dot(w.T,X.T) if you define y as [K * N] dimensions
#instead of
np.dot(w.T , X)
and
np.dot( X.T, ( A - Y ) ) #np.dot( X.T, ( A - Y ).T ) if y:[K * N]
#instead of
np.dot( X, ( A - Y ).T )
Also make sure that the cost function returns one number (i.e. not an array).
Secondly, going on to K>2 you need to make some changes: b is no longer a single number but a vector (1D array), and y and W go from being 1D arrays to 2D arrays. To avoid confusion and hard-to-find bugs it can be good to set K, N and D to different values; a rough sketch of the resulting shapes follows.
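Here is that sketch as a hypothetical helper (it follows the dimension table above, and is not a drop-in replacement for the code in the question):
import numpy as np

def propagate_multiclass(W, b, X, Y):
    # X: (N, D), Y: (N, K) one-hot, W: (D, K), b: (1, K)
    N = X.shape[0]
    A = 1 / (1 + np.exp(-(X @ W + b)))                                         # predictions, shape (N, K)
    cost = -np.mean(np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A), axis=1))   # a single number
    dW = X.T @ (A - Y) / N                                                     # shape (D, K), same as W
    db = np.sum(A - Y, axis=0, keepdims=True) / N                              # shape (1, K), same as b
    return {"dw": dW, "db": db}, cost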

TensorFlow simple XOR example not converging

I have the following code to learn a simple XOR network:
import tensorflow as tf
import numpy as np
def generate_xor(length=1000):
    x = np.random.randint(0,2, size=(length,2))
    y = []
    for pair in x:
        y.append(int(np.logical_xor(pair[0],pair[1])))
    return x, np.array(y)
n_inputs = 2
n_hidden = n_inputs*4
n_outputs = 1
x = tf.placeholder(tf.float32, shape=[1,n_inputs])
y = tf.placeholder(tf.float32, [1, n_outputs])
W = tf.Variable(tf.random_uniform([n_inputs, n_hidden],-1,1))
b = tf.Variable(tf.zeros([n_hidden]))
W2 = tf.Variable(tf.random_uniform([n_hidden,n_outputs],-1,1))
b2 = tf.Variable(tf.zeros([n_outputs]))
def xor_model(data):
    x = data
    hidden_layer = tf.nn.relu(tf.matmul(x,W)+b)
    output = tf.nn.relu(tf.matmul(hidden_layer, W2)+b2)
    return output
xor_nn = xor_model(x)
cost = tf.reduce_mean(tf.abs(xor_nn - y))
train_step = tf.train.AdagradOptimizer(0.05).minimize(cost)
init = tf.initialize_all_variables()
sess = tf.Session()
sess.run(init)
x_data,y_data = generate_xor(length=100000)
errors = []
count = 0
out_freq = 1000
for xor_in, xor_out in zip(x_data,y_data):
    _, err = sess.run([train_step, cost], feed_dict={x:xor_in.reshape(1,2), y:xor_out.reshape(1,n_outputs)})
    errors.append(err)
    count += 1
    if count == out_freq:
        tol = np.mean(errors[-out_freq:])
        print tol
        count = 0
        if tol < 0.005:
            break
n_tests = 100
correct = 0
count = 0
x_test, y_test = generate_xor(length=n_tests)
for xor_in, xor_out in zip(x_test, y_test):
    output = sess.run([xor_nn], feed_dict={x:xor_in.reshape(1,2)})[0]
    guess = int(output[0][0])
    truth = int(xor_out)
    if guess == truth:
        correct += 1
    count += 1
    print "Model %d : Truth %d - Pass Rate %.2f" % (int(guess), int(xor_out), float(correct*100.0)/float(count))
However, I can't get the code to reliably converge. I have tried varying the size of the hidden layer, using different optimizers / step sizes and different initializations of the weights and biases.
I'm clearly making an elementary error. If anyone could help I'd be grateful.
EDIT:
Thanks to Prem and Alexander Svetkin I managed to spot my errors. Firstly, I wasn't rounding the outputs when I cast them to ints, a schoolboy mistake. Secondly, I had a ReLU on the output layer, which wasn't needed - a copy-and-paste mistake. Thirdly, ReLU is indeed a bad choice of activation function for this task; using a sigmoid function works much better.
So this:
hidden_layer = tf.nn.relu(tf.matmul(x,W)+b)
output = tf.nn.relu(tf.matmul(hidden_layer, W2)+b2)
becomes this:
hidden_layer = tf.nn.sigmoid(tf.matmul(x,W)+b)
output = tf.matmul(hidden_layer, W2)+b2
and this:
guess = int(output[0][0])
becomes this:
guess = int(output[0][0]+0.5)
Shouldn't you return the output layer without the ReLU activation?
output = tf.matmul(hidden_layer, W2) + b2
ReLU just isn't the right activation function for a binary classification task; use something different, like the sigmoid function.
Pay attention to your float output values. Should 0.99 mean 1 or 0? Use rounding.
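Putting those pieces together, the forward pass and loss might look roughly like this (a sketch reusing the placeholders and variables from the question; swapping the mean-absolute-error cost for sigmoid cross-entropy is an extra suggestion, not something the answer above requires):
def xor_model(data):
    hidden_layer = tf.nn.sigmoid(tf.matmul(data, W) + b)
    return tf.matmul(hidden_layer, W2) + b2            # raw logits, no activation here

logits = xor_model(x)
prediction = tf.nn.sigmoid(logits)                      # round this to get the 0/1 guess
cost = tf.reduce_mean(tf.nn.sigmoid_cross_entropy_with_logits(labels=y, logits=logits))
train_step = tf.train.AdagradOptimizer(0.05).minimize(cost)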
