Minimize multivariate function in Tensorflow - python

Suppose I have the following simple example of a function of several variables
#tf.function
def f(A, Y, X):
AX = tf.matmul(A, X)
norm = tf.norm(Y - AX)
return norm
N = 2
A = tf.Variable(np.array([[1, 2], [3, 4]]))
Y = tf.Variable(np.identity(N))
X = tf.Variable(np.zeros((N, N)))
How do I find X that minimizes f with Tensorflow ?
I would be interested in a generic solution that works with a function declared as above and when there are more than one variable to optimize.

avanwyk is essentially right, although note that: 1) you can directly use the minimize method of the optimizer for simplicity 2) if you only want to optimize X you should make sure that is the only variable you are updating.
import tensorflow as tf
#tf.function
def f(A, Y, X):
AX = tf.matmul(A, X)
norm = tf.norm(Y - AX)
return norm
# Input data
N = 2
A = tf.Variable([[1., 2.], [3., 4.]], tf.float32)
Y = tf.Variable(tf.eye(N, dtype=tf.float32))
X = tf.Variable(tf.zeros((N, N), tf.float32))
# Initial function value
print(f(A, Y, X).numpy())
# 1.4142135
# Setup a stochastic gradient descent optimizer
opt = tf.keras.optimizers.SGD(learning_rate=0.01)
# Define loss function and variables to optimize
loss_fn = lambda: f(A, Y, X)
var_list = [X]
# Optimize for a fixed number of steps
for _ in range(1000):
opt.minimize(loss_fn, var_list)
# Optimized function value
print(f(A, Y, X).numpy())
# 0.14933111
# Optimized variable
print(X.numpy())
# [[-2.0012102 0.98504114]
# [ 1.4754106 -0.5111093 ]]

Assuming Tensorflow 2, you can use a Keras optimizer:
#tf.function
def f(A, Y, X):
AX = tf.matmul(A, X)
norm = tf.norm(Y - AX)
return norm
N = 2
A = tf.Variable(np.array([[1., 2.], [3., 4.]]))
Y = tf.Variable(np.identity(N))
X = tf.Variable(np.zeros((N, N)))
optimizer = tf.keras.optimizers.SGD()
for iteration in range(0, 100):
with tf.GradientTape() as tape:
loss = f(X, Y, X)
print(loss)
grads = tape.gradient(loss, [A, Y, X])
optimizer.apply_gradients(zip(grads, [A, Y, X]))
print(A, Y, X)
That will work for any differentiable function. For non-differentiable functions you could look at other optimization techniques (such as Genetic Algorithms, or Swarm Optimization. NEAT has implementations of these https://neat-python.readthedocs.io/en/latest/).

Related

How to correctly calculate gradients in neural network with numpy

I am trying to build a simple neural network class from scratch using numpy, and test it using the XOR problem. But the backpropagation function (backprop) does not seem to be working correctly.
In the class, I construct instances by passing in the size of each layer, and the activation functions to use at each layer. I assume that the final activation function is softmax, so that I can calculate the derivative of cross-entropy loss wrt to Z of the last layer. I also do not have a separate set of bias matrices in my class. I just include them in the weight matrices as an extra column at the end.
I know that my backprop function is not working correctly, because the neural network does not ever converge on a somewhat correct output. I also created a numerical gradient function, and when comparing the results of both. I get drastically different numbers.
My understanding from what I have read is that the delta values of each layer (with L being the last layer, and i representing any other layer) should be:
And the respective gradients/weight-update of those layers should be:
Where * is the hardamard product, a represents the activation of some layer, and z represents the nonactivated output of some layer.
The sample data that I am using to test this is at the bottom of the file.
This is my first time trying to implement the backpropagation algorithm from scratch. So I am a bit lost on where to go from here.
import numpy as np
def sigmoid(n, deriv=False):
if deriv:
return np.multiply(n, np.subtract(1, n))
return 1 / (1 + np.exp(-n))
def softmax(X, deriv=False):
if not deriv:
exps = np.exp(X - np.max(X))
return exps / np.sum(exps)
else:
raise Error('Unimplemented')
def cross_entropy(y, p, deriv=False):
"""
when deriv = True, returns deriv of cost wrt z
"""
if deriv:
ret = p - y
return ret
else:
p = np.clip(p, 1e-12, 1. - 1e-12)
N = p.shape[0]
return -np.sum(y*np.log(p))/(N)
class NN:
def __init__(self, layers, activations):
"""random initialization of weights/biases
NOTE - biases are built into the standard weight matrices by adding an extra column
and multiplying it by one in every layer"""
self.activate_fns = activations
self.weights = [np.random.rand(layers[1], layers[0]+1)]
for i in range(1, len(layers)):
if i != len(layers)-1:
self.weights.append(np.random.rand(layers[i+1], layers[i]+1))
for j in range(layers[i+1]):
for k in range(layers[i]+1):
if np.random.rand(1,1)[0,0] > .5:
self.weights[-1][j,k] = -self.weights[-1][j,k]
def ff(self, X, get_activations=False):
"""Feedforward"""
activations, zs = [], []
for activate, w in zip(self.activate_fns, self.weights):
X = np.vstack([X, np.ones((1, 1))]) # adding bias
z = w.dot(X)
X = activate(z)
if get_activations:
zs.append(z)
activations.append(X)
return (activations, zs) if get_activations else X
def grad_descent(self, data, epochs, learning_rate):
"""gradient descent
data - list of 2 item tuples, the first item being an input, and the second being its label"""
grad_w = [np.zeros_like(w) for w in self.weights]
for _ in range(epochs):
for x, y in data:
grad_w = [n+o for n, o in zip(self.backprop(x, y), grad_w)]
self.weights = [w-(learning_rate/len(data))*gw for w, gw in zip(self.weights, grad_w)]
def backprop(self, X, y):
"""perfoms backprop for one layer of a NN with softmax/cross_entropy output layer"""
(activations, zs) = self.ff(X, True)
activations.insert(0, X)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
grad_w[-1] = np.dot(deltas[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())
for i in range(len(self.weights)-2, -1, -1):
deltas[i] = np.dot(self.weights[i+1][:, :-1].transpose(), deltas[i+1]) * self.activate_fns[i](zs[i], True)
grad_w[i] = np.hstack((np.dot(deltas[i], activations[max(0, i-1)].transpose()), deltas[i]))
# check gradient
num_gw = self.gradient_check(X, y, i)
print('numerical:', num_gw, '\nanalytic:', grad_w)
return grad_w
def gradient_check(self, x, y, i, epsilon=1e-4):
"""Numerically calculate the gradient in order to check analytical correctness"""
grad_w = [np.zeros_like(w) for w in self.weights]
for w, gw in zip(self.weights, grad_w):
for j in range(w.shape[0]):
for k in range(w.shape[1]):
w[j,k] += epsilon
out1 = cross_entropy(self.ff(x), y)
w[j,k] -= 2*epsilon
out2 = cross_entropy(self.ff(x), y)
gw[j,k] = np.float64(out1 - out2) / (2*epsilon)
w[j,k] += epsilon # return weight to original value
return grad_w
##### TESTING #####
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
data = []
for x, t in zip(X, y):
data.append((x, t))
def nn_test():
c = NN([2, 2, 2], [sigmoid, sigmoid, softmax])
c.grad_descent(data, 100, .01)
for x in X:
print(c.ff(x))
nn_test()
UPDATE: I found one small bug in the code, but it still does not converge correctly. I calculated/derived the gradients for both matrices by hand and found no errors in my implementation, so I still do not know what is wrong with it.
UPDATE #2: I created a procedural version of what I was using above with the following code. Upon testing I discovered that the NN was able to learn the correct weights for classifying each of the 4 cases in XOR separately, but when I try to train using all the training examples at once (as shown), the resultant weights almost always output something around .5 for both output nodes. Could someone please tell me why this is occurring?
X = [np.array([[0],[0]]), np.array([[0],[1]]), np.array([[1],[0]]), np.array([[1],[1]])]
y = [np.array([[1], [0]]), np.array([[0], [1]]), np.array([[0], [1]]), np.array([[1], [0]])]
weights = [np.random.rand(2, 3) for _ in range(2)]
for _ in range(1000):
for i in range(4):
#Feedforward
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
# print('output:', a2, '\ncost:', cross_entropy(y[i], a2))
#backprop
del1 = cross_entropy(y[i], a2, True)
dcdw1 = del1.dot(np.vstack([a1, np.ones((1, 1))]).T)
del0 = weights[1][:, :-1].T.dot(del1)*sigmoid(z0, True)
dcdw0 = del0.dot(np.vstack([a0, np.ones((1, 1))]).T)
weights[0] -= .03*weights[0]*dcdw0
weights[1] -= .03*weights[1]*dcdw1
i = 0
a0 = X[i]
z0 = weights[0].dot(np.vstack([a0, np.ones((1, 1))]))
a1 = sigmoid(z0)
z1 = weights[1].dot(np.vstack([a1, np.ones((1, 1))]))
a2 = softmax(z1)
print(a2)
Softmax doesn't look right
Using cross entropy loss, the derivative for softmax is really nice (assuming you are using a 1 hot vector, where "1 hot" essentially means an array of all 0's except for a single 1, ie: [0,0,0,0,0,0,1,0,0])
For node y_n it ends up being y_n-t_n. So for a softmax with output:
[0.2,0.2,0.3,0.3]
And desired output:
[0,1,0,0]
The gradient at each of the softmax nodes is:
[0.2,-0.8,0.3,0.3]
It looks as if you are subtracting 1 from the entire array. The variable names aren't very clear, so if you could possibly rename them from L to what L represents, such as output_layer I'd be able to help more.
Also, for the other layers just to clear things up. When you say a^(L-1) as an example, do you mean "a to the power of (l-1)" or do you mean "a xor (l-1)"? Because in python ^ means xor.
EDIT:
I used this code and found the strange matrix dimensions (modified at line 69 in the function backprop)
deltas = [0 for _ in range(len(self.weights))]
grad_w = [0 for _ in range(len(self.weights))]
deltas[-1] = cross_entropy(y, activations[-1], True) # assumes output activation is softmax
print(deltas[-1].shape)
grad_w[-1] = np.dot(deltas[-1], np.vstack([activations[-2], np.ones((1, 1))]).transpose())
print(self.weights[-1].shape)
print(activations[-2].shape)
exit()

Using Backward Propagation in fmin_cg

I am trying to build an ANN in python, and I've been able to get so far as to to forward pass, but I get a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is define as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single value (the J value for my forward pass) and cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understand how to use it and get the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
'''
Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
'''
Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
m = X.shape[0]
n = X.shape[1]
#forward pass
y_eye = eye(num_labels)
y_new = np.zeros((y.shape[0],num_labels))
for z in range(y.shape[0]):
y_new[z,:] = y_eye[int(y[z])-1]
y = y_new
a_1 = c_[ones((m,1)),X]
z_2 = tr(Theta1.dot(tr(a_1)))
a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
a_2 = c_[ones((a_2.shape[0],1)), a_2]
a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))
J_reg = lam/(2.*m) * (sum(sum(Theta1[:,1:]**2)) + sum(sum(Theta2[:,1:]**2)))
J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg
#Backprop
d_3 = a_3 - y
d_2 = d_3.dot(Theta2[:,1:])*sigmoidGradient(z_2)
Theta1_grad = 1./m * tr(d_2).dot(a_1)
Theta2_grad = 1./m * tr(d_3).dot(a_2)
#Add regularization
Theta1_grad[:,1:] = Theta1_grad[:,1:] + lam*1.0/m*Theta1[:,1:]
Theta2_grad[:,1:] = Theta2_grad[:,1:] + lam*1.0/m*Theta2[:,1:]
#Unroll gradients
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
return J, grad
def nn_train(X,y,lam = 1.0, hidden_layer_size = 10):
'''
Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
Return parameters Theta1, Theta2.
'''
# NN input and output layer sizes
input_layer_size = X.shape[1]
num_labels = unique(y).shape[0] #output layer
# Initialize NN parameters
initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
# Unroll parameters
initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
initial_nn_params = reshape(initial_nn_params,(len(initial_nn_params),)) #flatten into 1-d array
# Find and print initial cost:
J_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[0]
grad_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[1]
print 'Initial J cost: ' + str(J_init)
print 'Initial grad cost: ' + str(grad_init)
# Implement backprop and train network, run fmin
print 'Training Neural Network...'
print 'fmin results:'
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
return Theta1, Theta2

Difference between GradientDescentOptimizer and AdamOptimizer in tensorflow?

When using GradientDescentOptimizer instead of Adam Optimizer the model doesn't seem to converge. On the otherhand, AdamOptimizer seems to work fine. Is the something wrong with the GradientDescentOptimizer from tensorflow?
import matplotlib.pyplot as plt
import tensorflow as tf
import numpy as np
def randomSample(size=100):
"""
y = 2 * x -3
"""
x = np.random.randint(500, size=size)
y = x * 2 - 3 - np.random.randint(-20, 20, size=size)
return x, y
def plotAll(_x, _y, w, b):
fig = plt.figure()
ax = fig.add_subplot(111)
ax.scatter(_x, _y)
x = np.random.randint(500, size=20)
y = w * x + b
ax.plot(x, y,'r')
plt.show()
def lr(_x, _y):
w = tf.Variable(2, dtype=tf.float32)
b = tf.Variable(3, dtype=tf.float32)
x = tf.placeholder(tf.float32)
y = tf.placeholder(tf.float32)
linear_model = w * x + b
loss = tf.reduce_sum(tf.square(linear_model - y))
optimizer = tf.train.AdamOptimizer(0.0003) #GradientDescentOptimizer
train = optimizer.minimize(loss)
init = tf.global_variables_initializer()
sess = tf.Session()
sess.run(init)
for i in range(10000):
sess.run(train, {x : _x, y: _y})
cw, cb, closs = sess.run([w, b, loss], {x:_x, y:_y})
print(closs)
print(cw,cb)
return cw, cb
x,y = randomSample()
w,b = lr(x,y)
plotAll(x,y, w, b)
I had a similar problem once and it took me a long time to find out the real problem. With gradient descent my loss function was actually growing instead of getting smaller.
It turned out that my learning rate was too high. If you take too big of a step with gradient descent you can end up jumping over the minimum. And if you are really unlucky, like I was you end up jumping so far ahead that your error increases.
Lowering the learning rate should make the model converge. But it could take a long time.
Adam optimizer has momentum, that is, it doesn't just follow the instantaneous gradient, but it keeps track of the direction it was going before with a sort of velocity. This way, if you start going back and forth because of the gradient than the momentum will force you to go slower in this direction. This helps a lot! Adam has a few more tweeks other than momentum that make it the prefered deep learning optimizer.
If you want to read more about optimizers this blog post is very informative.
http://ruder.io/optimizing-gradient-descent/

Tensorflow: Linear regression with non-negative constraints

I am trying to implement a linear regression model in Tensorflow, with additional constraints (coming from the domain) that the W and b terms must be non-negative.
I believe there are a couple of ways to do this.
We can modify the cost function to penalize negative weights [Lagrangian approach] [See:TensorFlow - best way to implement weight constraints
We can compute the gradients ourselves and project them on [0, infinity] [Projected gradient approach]
Approach 1: Lagrangian
When I tried the first approach, I would often end up with negative b.
I had modified the cost function from:
cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)
to:
cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)
nn_w = tf.reduce_sum(tf.abs(W) - W)
nn_b = tf.reduce_sum(tf.abs(b) - b)
constraint = 100.0*nn_w + 100*nn_b
cost_with_constraint = cost + constraint
Keeping the coefficient of nn_b and nn_w to be very high leads to instability and very high cost.
Here is the complete code.
import numpy as np
import tensorflow as tf
n_samples = 50
train_X = np.linspace(1, 50, n_samples)
train_Y = 10*train_X + 6 +40*np.random.randn(50)
X = tf.placeholder("float")
Y = tf.placeholder("float")
# Set model weights
W = tf.Variable(np.random.randn(), name="weight")
b = tf.Variable(np.random.randn(), name="bias")
# Construct a linear model
pred = tf.add(tf.multiply(X, W), b)
# Gradient descent
learning_rate=0.0001
# Initializing the variables
init = tf.global_variables_initializer()
# Mean squared error
cost = tf.reduce_sum(tf.pow(pred-Y, 2))/(2*n_samples)
nn_w = tf.reduce_sum(tf.abs(W) - W)
nn_b = tf.reduce_sum(tf.abs(b) - b)
constraint = 1.0*nn_w + 100*nn_b
cost_with_constraint = cost + constraint
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost_with_constraint)
training_epochs=200
with tf.Session() as sess:
sess.run(init)
# Fit all training data
cost_array = np.zeros(training_epochs)
W_array = np.zeros(training_epochs)
b_array = np.zeros(training_epochs)
for epoch in range(training_epochs):
for (x, y) in zip(train_X, train_Y):
sess.run(optimizer, feed_dict={X: x, Y: y})
W_array[epoch] = sess.run(W)
b_array[epoch] = sess.run(b)
cost_array[epoch] = sess.run(cost, feed_dict={X: train_X, Y: train_Y})
The following is the mean of b across 10 different runs.
0 -1.101268
1 0.169225
2 0.158363
3 0.706270
4 -0.371205
5 0.244424
6 1.312516
7 -0.069609
8 -1.032187
9 -1.711668
Clearly, the first approach is not optimal. Further, there is a lot of art involved in choosing the coefficient of penalty terms.
Approach 2: Projected gradient
I then thought to use the second approach, which is more guaranteed to work.
gr = tf.gradients(cost, [W, b])
We manually compute the gradients and update the W and b.
with tf.Session() as sess:
sess.run(init)
for epoch in range(training_epochs):
for (x, y) in zip(train_X, train_Y):
W_del, b_del = sess.run(gr, feed_dict={X: x, Y: y})
W = max(0, (W - W_del)*learning_rate) #Project the gradient on [0, infinity]
b = max(0, (b - b_del)*learning_rate) # Project the gradient on [0, infinity]
This approach seems to be very slow.
I am wondering if there is a better way to run the second approach, or guarantee the results with the first approach. Can we somehow allow the optimizer to ensure that the learnt weights are non-negative?
Edit: How to do this in Autograd
https://github.com/HIPS/autograd/issues/207
If you modify your linear model to:
pred = tf.add(tf.multiply(X, tf.abs(W)), tf.abs(b))
it will have the same effect as using only positive W and b values.
The reason your second approach is slow is that you clip the W and b values outside of the tensorflow graph. (Also it will not converge because (W - W_del)*learning_rate must instead be W - W_del*learning_rate)
edit:
You can implement the clipping using tensorflow graph like this:
train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
with tf.control_dependencies([train_step]):
clip_W = W.assign(tf.maximum(0., W))
clip_b = b.assign(tf.maximum(0., b))
train_step_with_clip = tf.group(clip_W, clip_b)
In this case W and b values will be clipped to 0 and not to small positive numbers.
Here is a small mnist example with clipping:
import tensorflow as tf
(x_train, y_train), (x_test, y_test) = tf.keras.datasets.mnist.load_data()
x = tf.placeholder(tf.uint8, [None, 28, 28])
x_vec = tf.cast(tf.reshape(x, [-1, 784]), tf.float32) / 255.
W = tf.Variable(tf.zeros([784, 10]))
b = tf.Variable(tf.zeros([10]))
y = tf.matmul(x_vec, W) + b
y_target = tf.placeholder(tf.uint8, [None])
y_target_one_hot = tf.one_hot(y_target, 10)
cross_entropy = tf.reduce_mean(
tf.nn.softmax_cross_entropy_with_logits(labels=y_target_one_hot, logits=y))
train_step = tf.train.GradientDescentOptimizer(0.5).minimize(cross_entropy)
with tf.control_dependencies([train_step]):
clip_W = W.assign(tf.maximum(0., W))
clip_b = b.assign(tf.maximum(0., b))
train_step_with_clip = tf.group(clip_W, clip_b)
correct_prediction = tf.equal(tf.argmax(y, 1), tf.argmax(y_target_one_hot, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))
with tf.Session() as sess:
tf.global_variables_initializer().run()
for i in range(1000):
sess.run(train_step_with_clip, feed_dict={
x: x_train[(i*100)%len(x_train):((i+1)*100)%len(x_train)],
y_target: y_train[(i*100)%len(x_train):((i+1)*100)%len(x_train)]})
if not i%100:
print("Min_W:", sess.run(tf.reduce_min(W)))
print("Min_b:", sess.run(tf.reduce_min(b)))
print("Accuracy:", sess.run(accuracy, feed_dict={
x: x_test,
y_target: y_test}))
I actually was not able to reproduce your problem of getting negative bs with your first approach.
But I do agree that this is not optimal for your use case and can result in negative values.
You should be able to constrain your parameters to non-negative values like so:
W *= tf.cast(W > 0., tf.float32)
b *= tf.cast(b > 0., tf.float32)
(exchange > with >= if necessary, the cast is necessary as the comparison operators will produce boolean values.
You then would optimize for the "standard cost" without the additional constraints.
However, this does not work in every case. For example, it should be avoided to initialize W or b with negative values in the beginning.
Your second (and probably better) approach can be accelerated by defining the update logic in the general computational graph, i.e. after the definition of cost
params = [W, b]
grads = tf.gradients(cost, params)
optimizer = [tf.assign(param, tf.maximum(0., param - grad*learning_rate))
for param, grad in zip(params, grads)]
I think your solution is slow because it creates new computation nodes every time which is probably very costly and repeated a lot inside the loops.
update using tensorflow optimizer
In my solution above not the gradients are clipped but rather the resulting update values.
Along the lines of this answer you could clip the gradients to be at most the value of the updated parameter like so:
params = [W, b]
opt = tf.train.GradientDescentOptimizer(learning_rate)
grads_and_vars = opt.compute_gradients(cost, params)
clipped_grads_vars = [(tf.clip_by_value(grad, -np.inf, var), var) for grad, var in grads_and_vars]
optimizer = opt.apply_gradients(clipped_grads_vars)
This way an update will never decrease a parameter to a value below 0.
However, I think this will not work in the case the updated variable is already negative.
Also, if the optimizing algorithm somehow multiplies the clipped gradient by a value greater than 1.
The latter might actually never happen, but I'm not 100% sure.

Linear Regression Lasagne / Theano

I'm trying to make a simple multivariate linear Regression with Lasagne.
This is my Input:
x_train = np.array([[37.93, 139.5, 329., 16.64,
16.81, 16.57, 1., 707.,
39.72, 149.25, 352.25, 16.61,
16.91, 16.60, 40.11, 151.5,
361.75, 16.95, 16.98, 16.79]]).astype(np.float32)
y_train = np.array([37.92, 138.25, 324.66, 16.28, 16.27, 16.28]).astype(np.float32)
For this two data points the network should be able to learn y perfectly.
Here is the model:
i1 = T.matrix()
y = T.vector()
lay1 = lasagne.layers.InputLayer(shape=(None,20),input_var=i1)
out1 = lasagne.layers.get_output(lay1)
lay2 = lasagne.layers.DenseLayer(lay1, 6, nonlinearity=lasagne.nonlinearities.linear)
out2 = lasagne.layers.get_output(lay2)
params = lasagne.layers.get_all_params(lay2, trainable=True)
cost = T.sum(lasagne.objectives.squared_error(out2, y))
grad = T.grad(cost, params)
updates = lasagne.updates.sgd(grad, params, learning_rate=0.1)
f_train = theano.function([i1, y], [out1, out2, cost], updates=updates)
After executing multiple times
f_train(x_train,y_train)
the cost explodes to infinity. Any idea what is going wrong here?
Thanks!
The network has too much capacity for a single training instance. You would need to apply some strong regularization to prevent the training diverging. Alternatively, and hopefully more realistically, give it more complex training data (many instances).
With a single instance the task can be solved using just one input, instead of 20, and with the DenseLayer's bias disabled:
import numpy as np
import theano
import lasagne
import theano.tensor as T
def compile():
x, z = T.matrices('x', 'z')
lh = lasagne.layers.InputLayer(shape=(None, 1), input_var=x)
ly = lasagne.layers.DenseLayer(lh, 6, nonlinearity=lasagne.nonlinearities.linear,
b=None)
y = lasagne.layers.get_output(ly)
params = lasagne.layers.get_all_params(ly, trainable=True)
cost = T.sum(lasagne.objectives.squared_error(y, z))
updates = lasagne.updates.sgd(cost, params, learning_rate=0.0001)
return theano.function([x, z], [y, cost], updates=updates)
def main():
f_train = compile()
x_train = np.array([[37.93]]).astype(theano.config.floatX)
y_train = np.array([[37.92, 138.25, 324.66, 16.28, 16.27, 16.28]])\
.astype(theano.config.floatX)
for _ in xrange(100):
print f_train(x_train, y_train)
main()
Note that the learning rate also needs to be reduced a lot to prevent divergence.

Categories

Resources