PyTorch: how to compute grad after cloning a tensor - python

My simple code:
import torch
x = torch.randn(4, requires_grad=True).cuda()
y = torch.randn(4, requires_grad=True).cuda()
z = torch.zeros(4)
z = torch.clone(x)
z.retain_grad()
h = (z + y) * z
l = torch.randn(4).cuda()
loss = (l - h).pow(2).sum()
loss.backward()
print('x.grad=', x.grad)
print('z.grad=', z.grad)
output:
x.grad= None
z.grad= tensor([-15.3401, -3.2623, -2.1670, 0.1410], device='cuda:0')
Why is x.grad None rather than the same as z.grad?
What should I do if I want them to be the same?

You need to call x.retain_grad() after declaring x if you want to keep the grad of tensor x. Because x is created with requires_grad=True and then moved with .cuda(), the name x refers to a non-leaf tensor (the result of the .cuda() op), and gradients of non-leaf tensors are not retained by default.
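A minimal sketch of that fix (same graph as the question, assuming a CUDA device is available as in the original code): since clone passes gradients through unchanged, x.grad then matches z.grad.
import torch
x = torch.randn(4, requires_grad=True).cuda()
x.retain_grad()  # x is a non-leaf tensor (result of .cuda()), so ask autograd to keep its grad
y = torch.randn(4, requires_grad=True).cuda()
z = torch.clone(x)
z.retain_grad()
h = (z + y) * z
l = torch.randn(4).cuda()
loss = (l - h).pow(2).sum()
loss.backward()
print('x.grad=', x.grad)  # now populated and equal to z.grad
print('z.grad=', z.grad)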

Related

How to split list of inputs in a tensorflow keras Model call method?

How are we supposed to split the inputs? I am getting various errors. tf.split seems like it should work, but I am not sure that is correct.
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

tf.keras.backend.set_floatx('float64')

m = 250  # samples
n_x = 1  # dim of x
n_tau = 11
x = (2 * np.random.rand(m, n_x).astype(np.float64) - 1) * 2
i = np.argsort(x[:, 0])
x = x[i]  # to make plotting nicer
A = np.random.randn(n_x, 1)
y = x ** 2 + 0.3 * x + 0.4 * np.random.randn(m, 1).astype(np.float64)
y = y.dot(A)  # y is 1d
# y = y.squeeze()
tau = np.linspace(1.0 / n_tau, 1 - 1.0 / n_tau, n_tau).astype(np.float64)
tau = tau[:, None]

def loss(tau_y, u):
    tau, y = tau_y
    tf.debugging.assert_rank(y, 2, f"y should be rank 2")
    u = y[:, None, :] - u[None, :, :]
    tf.debugging.assert_rank(tau, 2, f"tau should be rank 2")
    tau = tau[None, :, :]
    res = u ** 2 * (tau - tf.where(u <= np.float64(0.0), np.float64(1.0), np.float64(0.0)))
    return tf.reduce_sum(tf.reduce_mean(res, axis=[1, 2]), axis=0)

tf.keras.backend.set_floatx('float64')

class My(tf.keras.models.Model):
    def __init__(self):
        super().__init__()
        self._my_layer = tf.keras.layers.Dense(1, dtype=tf.float64)

    def call(self, inputs):
        tau, y = inputs
        tf.print(tau.shape, y.shape)
        return self._my_layer(tau)

model = My()
u = model((tau, y))
loss((tau, y), model((tau, y)))
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=loss)
model.fit((tau, y), (tau, y))  # fails
This works when calling the model and loss directly, but not from within a compiled model; unpacking the inputs in the loss fails with an error about iterating over a tf.Tensor.
~/j.py in loss(tau_y, u)
21
22 def loss(tau_y, u):
---> 23 tau, y = tau_y
24 tf.debugging.assert_rank(y, 2, f"y should be rank 2")
25 u = y[:, None, :] - u[None, :, :]
~/anaconda3/envs/37nightly/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in __iter__(self)
537 def __iter__(self):
538 if not context.executing_eagerly():
--> 539 self._disallow_iteration()
540
541 shape = self._shape_tuple()
~/anaconda3/envs/37nightly/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in _disallow_iteration(self)
533 else:
534 # Default: V1-style Graph execution.
--> 535 self._disallow_in_graph_mode("iterating over `tf.Tensor`")
536
537 def __iter__(self):
~/anaconda3/envs/37nightly/lib/python3.7/site-packages/tensorflow_core/python/framework/ops.py in _disallow_in_graph_mode(self, task)
513 raise errors.OperatorNotAllowedInGraphError(
514 "{} is not allowed in Graph execution. Use Eager execution or decorate"
--> 515 " this function with #tf.function.".format(task))
516
517 def _disallow_bool_casting(self):
OperatorNotAllowedInGraphError: iterating over `tf.Tensor` is not allowed in Graph execution. Use Eager execution or decorate this function with @tf.function.
UPDATE:
Should it be done like this instead? But then you get another error, this time from Keras:
import numpy as np
import pandas as pd
import tensorflow as tf
import tensorflow.keras as keras

tf.keras.backend.set_floatx('float64')

m = 250  # samples
n_x = 1  # dim of x
n_tau = 11
x = (2 * np.random.rand(m, n_x).astype(np.float64) - 1) * 2
i = np.argsort(x[:, 0])
x = x[i]  # to make plotting nicer
A = np.random.randn(n_x, 1)
y = x ** 2 + 0.3 * x + 0.4 * np.random.randn(m, 1).astype(np.float64)
y = y.dot(A)  # y is 1d
y = y[:, :, None]
tau = np.linspace(1.0 / n_tau, 1 - 1.0 / n_tau, n_tau).astype(np.float64)
tau = tau[None, :, None]

def loss(tau_y, u):
    tau = tau_y[0]
    y = tau_y[1]
    u = y - u
    res = u ** 2 * (tau - tf.where(u <= np.float64(0.0), np.float64(1.0), np.float64(0.0)))
    return tf.reduce_sum(tf.reduce_mean(res, axis=[1, 2]), axis=0)

tf.keras.backend.set_floatx('float64')

class My(tf.keras.models.Model):
    def __init__(self):
        super().__init__()
        self._my_layer = tf.keras.layers.Dense(1, dtype=tf.float64)

    def call(self, inputs):
        tau = inputs[0]
        y = inputs[1]
        tf.print(tau.shape, y.shape)
        return self._my_layer(tau)

model = My()
u = model((tau, y))  # calling model works
l = loss((tau, y), model((tau, y)))  # calling loss works
opt = tf.keras.optimizers.Adam(learning_rate=0.01)
model.compile(optimizer=opt, loss=loss)
# this fails with the error below
model.fit((tau, y), (tau, y))
# ValueError: Error when checking model target: the list of Numpy arrays that you are passing to your model is not the size the model expected. Expected to see 1 array(s), for inputs ['output_1'] but instead got the following list of 2 arrays: [array([[[0.09090909],
# [0.17272727],
# [0.25454545],
# [0.33636364],
# [0.41818182],
# [0.5 ],
# [0.58181818],
# [0.66363636],
# [0.74545455],
# ...

autograd differentiation example in PyTorch - should be 9/8?

In the example from the PyTorch tutorial, they use the following graph:
x = [[1, 1], [1, 1]]
y = x + 2
z = 3y^2
o = mean( z ) # 1/4 * z.sum()
Thus, the forward pass gets us this:
x_i = 1, y_i = 3, z_i = 27, o = 27
In code this looks like:
import torch
# define graph
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()
# if we don't do this, torch will only retain gradients for leaf nodes, ie: x
y.retain_grad()
z.retain_grad()
# does a forward pass
print(z, out)
However, I get confused by the gradients computed:
# now let's run our backward prop & get gradients
out.backward()
print(f'do/dx = {x.grad[0,0]}')
which outputs:
do/dx = 4.5
By chain rule, do/dx = do/dz * dz/dy * dy/dx, where:
dy/dx = 1
dz/dy = 9/2 given x_i=1
do/dz = 1/4 given x_i=1
which means:
do/dx = 1/4 * 9/2 * 1 = 9/8
However, this doesn't match the gradient returned by Torch (9/2 = 4.5). Perhaps I have a math error (something with the do/dz = 1/4 term?), or I don't understand autograd in Torch.
Any pointers?
The mistake is in the dz/dy term: since z = 3y^2 and y_i = 3,
do/dz = 1/4
dz/dy = 6y = 6 * 3 = 18
dy/dx = 1
therefore, do/dx = 1/4 * 18 * 1 = 9/2
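As a quick check, running the question's own graph with all three gradients retained (a small sketch, nothing beyond the code already shown above) prints do/dz = 0.25, do/dy = 4.5 and do/dx = 4.5 for every element:
import torch
x = torch.ones(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()
# retain gradients of the intermediate (non-leaf) tensors
y.retain_grad()
z.retain_grad()
out.backward()
print('do/dz =', z.grad[0, 0])  # 0.25 = 1/4
print('do/dy =', y.grad[0, 0])  # 4.5 = 1/4 * 6 * y_i = 1/4 * 18
print('do/dx =', x.grad[0, 0])  # 4.5, since dy/dx = 1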

Neural network from scratch - predict single example

Here is a neural network I've modified from the Coursera Deep Learning Specialization to train on a dataset containing a flattened array of training data:
%reset -s -f
import numpy as np
import math

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

def initialize_with_zeros(dim):
    w = np.zeros(shape=(dim, 1))
    b = 0
    return w, b

X = np.array([[1,1,1,1], [1,0,1,0], [1,1,1,0], [0,0,0,0], [0,1,0,0], [0,1,0,1]])
Y = np.array([[1,0,1,1,1,1]])
X = X.reshape(X.shape[0], -1).T
Y = Y.reshape(Y.shape[0], -1).T
print('X shape', X.shape)
print('Y shape', Y.shape)

b = 1
w, b = initialize_with_zeros(4)

def propagate(w, b, X, Y):
    m = X.shape[1]
    A = sigmoid(np.dot(w.T, X) + b)  # compute activation
    cost = (-1 / m) * np.sum(Y * np.log(A) + (1 - Y) * (np.log(1 - A)))  # compute cost
    dw = (1. / m) * np.dot(X, ((A - Y).T))
    db = (1. / m) * np.sum(A - Y, axis=1)
    cost = np.squeeze(cost)
    grads = {"dw": dw,
             "db": db}
    return grads, cost

propagate(w, b, X, Y)

learning_rate = .001
costs = []

def optimize(w, b, X, Y):
    for i in range(2):
        grads, cost = propagate(w=w, b=b, X=X, Y=Y)
        dw = grads["dw"]
        db = grads["db"]
        w = w - learning_rate * dw
        b = b - learning_rate * db
        if i % 100 == 0:
            costs.append(cost)
    return w, b

w, b = optimize(w, b, X, Y)

def predict(w, b, X):
    m = 6
    Y_prediction = np.zeros((1, m))
    # w = w.reshape(X.shape[0], 1)
    A = sigmoid(np.dot(w.T, X) + b)
    for i in range(A.shape[1]):
        if A[0, i] >= 0.5:
            Y_prediction[0, i] = 1
        else:
            Y_prediction[0, i] = 0
    return Y_prediction

predict(w, b, X)
This works as expected, but I'm struggling to predict a single example.
If I use:
predict(w, b, X[0])
it returns the error:
ValueError: shapes (6,4) and (6,) not aligned: 4 (dim 1) != 6 (dim 0)
How can I rearrange the matrix operations in order to predict a single instance?
Try
predict(w, b, X[:1])
It seems like your predict function expects X to be 2-d; when passing only one example, it should have a singleton second dimension (i.e., shape=(6,1)) rather than being a single dimension (i.e., shape=(6,)).

The error comes from the fact that predict expects to be called on a batch of data of shape ... * bs. In order to predict on a single element you can create a batch of size 1 using np.expand_dims:
predict(w, b, np.expand_dims(X[0], axis=1))
should work.
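A tiny sketch of what np.expand_dims does here (illustrative values only, not the question's data): it turns a 1-D slice back into a 2-D column, which is the shape the matrix products in predict expect.
import numpy as np
v = np.array([1., 0., 1., 0., 1., 1.])  # a single example as a 1-D array, shape (6,)
col = np.expand_dims(v, axis=1)         # the same values as a 2-D column, shape (6, 1)
print(v.shape, col.shape)               # (6,) (6, 1)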

Implement gradient descent in python

I am trying to implement gradient descent in Python. Though my code returns a result, I think the results I am getting are completely wrong.
Here is the code I have written:
import numpy as np
import pandas

dataset = pandas.read_csv('D:\ML Data\house-prices-advanced-regression-techniques\\train.csv')

X = np.empty((0, 1), int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
    X = np.append(X, dataset.at[i, 'LotArea'])
    Y = np.append(Y, dataset.at[i, 'SalePrice'])

X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)

def gradient_descent(X, Y, theta, iterations=100, learningRate=0.000001):
    m = len(X)
    for i in range(iterations):
        prediction = np.dot(X, theta)
        theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
    return theta

theta = np.random.randn(2, 1)
theta = gradient_descent(X, Y, theta)
print('theta', theta)
The result I get after running this program is:
theta [[-5.23237458e+228]
 [-1.04560188e+233]]
These are very large values. Can someone point out the mistake I have made in the implementation?
Also, a second problem is that I have to set the learning rate very low (in this case 0.000001) for it to work at all; otherwise the program throws an error.
Please help me diagnose the problem.
Try reducing the learning rate as the iterations progress, otherwise it won't be able to reach the minimum. Try this:
import numpy as np
import pandas

dataset = pandas.read_csv('start.csv')

X = np.empty((0, 1), int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
    X = np.append(X, dataset.at[i, 'R&D Spend'])
    Y = np.append(Y, dataset.at[i, 'Profit'])

X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)

def gradient_descent(X, Y, theta, iterations=50, learningRate=0.01):
    m = len(X)
    for i in range(iterations):
        prediction = np.dot(X, theta)
        theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
        learningRate /= 10
    return theta

theta = np.random.randn(2, 1)
theta = gradient_descent(X, Y, theta)
print('theta', theta)
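A side note on this schedule (not part of the original answer): dividing learningRate by 10 on every iteration shrinks the step size geometrically, to 0.01 / 10**k after k iterations, so nearly all of the movement happens in the first few steps.
# sketch of how quickly the schedule above shrinks the step size
lr = 0.01
for k in range(5):
    print('iteration', k, 'learning rate', lr)
    lr /= 10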

Optimizing parameter in odeint with the output of a neural network in TensorFlow

I would like to optimize the coefficients of an ODE using TensorFlow.
def odeModel(state, t):
    x, y, z = tf.unstack(state)
    dx = y
    # Here I want to define dy and dz as follows:
    # [dy, dz] = tf.nn.relu(tf.matmul([y, z], W) + b)
    return tf.stack([dx, dy, dz])
Basically, I aim to define [dy, dz] as a map from [y, z] that depends on appropriately sized TensorFlow variables 'W' and 'b'. Then, I would like to find 'W' and 'b' that minimize a loss function that depends on the trajectory starting from 'state0'. Is that possible?
I aim to write the rest of the code along the following lines.
t = np.linspace(0, 5, 100)
state0 = #Appropriate starting point, e.g., tf.constant([0, 1, 3], dtype=tf.float64)
states = tf.contrib.integrate.odeint(odeModel, state0, t)
loss = tf.reduce_mean(tf.pow(states[:, 2], 2))
optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)
Of course, I need to create a session and run the optimizer. The details are omitted for brevity. I am wondering if there is a way to achieve what I am shooting for.
This can be done in exactly the way you describe:
import tensorflow as tf
import numpy as np

RS = np.random.RandomState(42)

# Defining model parameters as TF variables
W1 = tf.Variable(RS.randn(2, 1))
b1 = tf.Variable(RS.randn(1,))
W2 = tf.Variable(RS.randn(2, 1))
b2 = tf.Variable(RS.randn(1,))

def odeModel(state, t):
    x, y, z = tf.unstack(state)
    dx = y
    # Model definition
    dy = tf.nn.relu(tf.matmul(tf.expand_dims([y, z], -1), W1, transpose_a=True) + b1)
    dz = tf.nn.relu(tf.matmul(tf.expand_dims([y, z], -1), W2, transpose_a=True) + b2)
    return tf.stack([dx, tf.squeeze(dy), tf.squeeze(dz)])

t = np.linspace(0, 5, 100)
state0 = tf.constant([0, 1, 3], dtype=tf.float64)
states, info = tf.contrib.integrate.odeint(odeModel, state0, t, full_output=True)
loss = tf.reduce_mean(tf.pow(states[:, 2], 2))
optimizer = tf.train.AdagradOptimizer(0.05).minimize(loss)

# ----
sess = tf.Session()
sess.run(tf.global_variables_initializer())

# Value before optimizing
sess.run(W1)
# array([[ 0.49671415],
#        [-0.1382643 ]])

# Optimize for 10 steps.
for i in range(10):
    sess.run(optimizer)

# Value after optimization
sess.run(W1)
# array([[ 0.38043613],
#        [-0.26166077]])
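A brief note on why this works (as I understand the TF 1.x contrib integrator used above): tf.contrib.integrate.odeint builds the integration out of ordinary TensorFlow ops, so the gradient of the loss with respect to W1, b1, W2 and b2 flows back through the solver steps, and AdagradOptimizer can update them like any other variables.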
