I have implemented and trained a neural network in PyTorch; however, I am interested in the derivative of the neural network's parameters with respect to the input.
I have searched extensively for a procedure that would allow evaluating the derivative of the weights with respect to a given input, but I did not find anything.
I know that I can compute the gradients of a function in the following way.
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)
But how would I do that with a trained neural network instead of a function Q?
Thanks in advance.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
# In[2]:
from torch import optim
# In[3]:
nSimul = 32768
T1 = 1.0
T2 = 2.0
K = 110.0
spot = 100.0
vol = 0.2
vol0 = 0.5 # vol is increased over the 1st period so we have more points in the wings
# simulate all Gaussian returns (N1, N2) first
# returns: matrix of shape [nSimul, TimeSteps=2]
returns = np.random.normal(size=[nSimul,2])
# generate paths, step by step, and not path by path as customary
# this is to avoid slow Python loops, using NumPy's optimized vector functions instead
# generate the vector of all scenarios for S1, of shape [nSimul]
S1 = spot * np.exp(-0.5*vol0*vol0*T1 + vol0*np.sqrt(T1)*returns[:,0])
# generate the vector of all scenarios for S2, of shape [nSimul]
S2 = S1 * np.exp(-0.5*vol*vol*(T2-T1) + vol*np.sqrt(T2-T1)*returns[:,1])
# training set, X and Y are both vectors of shape [nSimul]
X = S1
Y = np.maximum(0, S2 - K)
xAxis = np.linspace(20, 200, 100)
xAxis=xAxis.reshape(-1,1)
# In[4]:
#Normalization of the simulated data:
meanX = np.mean(X)
stdX = np.std(X)
meanY = np.mean(Y)
stdY = np.std(Y)
normX = (X - meanX) / stdX
normY = (Y - meanY) / stdY
normX=normX.reshape(-1,1)
normY=normY.reshape(-1,1)
# In[5]:
class NeuralNetwork(nn.Module):
    def __init__(self, inputsize, outputsize):
        super(NeuralNetwork, self).__init__()
        #self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(inputsize, 3),
            nn.ELU(),
            nn.Linear(3, 5),
            nn.ELU(),
            nn.Linear(5, 3),
            nn.ELU(),
            nn.Linear(3, outputsize),
        )
        w = torch.empty(0, 1)
        nn.init.normal_(w)

    def forward(self, x):
        #x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits
# In[6]:
inputDim = 1 # takes variable 'x'
outputDim = 1 # takes variable 'y'
learningRate = 0.05
epochs = 10000
#weight=torch.empty(3)
model = NeuralNetwork(inputDim, outputDim)
##### For GPU #######
if torch.cuda.is_available():
    model.cuda()
# In[7]:
#criterion = torch.nn.MSELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
# In[ ]:
def ridge_loss(outputs, labels):
    return torch.mean((outputs - labels)**2)
# In[ ]:
# In[9]:
#Adam optimization
criterion = torch.nn.MSELoss()
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.05)
# In[10]:
for epoch in range(epochs):
    # Converting inputs and labels to Variable
    if torch.cuda.is_available():
        inputs = Variable(torch.from_numpy(normX).cuda().float())
        labels = Variable(torch.from_numpy(normY).cuda().float())
    else:
        inputs = Variable(torch.from_numpy(normX).float())
        labels = Variable(torch.from_numpy(normY).float())
    # Clear gradient buffers so gradients from the previous epoch do not accumulate
    optimizer.zero_grad()
    # get output from the model, given the inputs
    outputs = model(inputs)
    # get loss for the predicted output
    loss = criterion(outputs, labels)
    # get gradients w.r.t. the parameters
    loss.backward()
    # update parameters
    optimizer.step()
    print('epoch {}, loss {}'.format(epoch, loss.item()))
# In[11]:
def predict(xs):
    # first, normalize
    nxs = (xs - meanX) / stdX
    # forward pass through the ANN; we don't need gradients in the testing phase
    with torch.no_grad():
        if torch.cuda.is_available():
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1)).cuda().float())).cpu().data.numpy()
        else:
            nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1))).float()).data.numpy()
    # de-normalize output
    ys = meanY + stdY * nys
    # we get a matrix of shape [size of xs, 1], which we reshape as a vector [size of xs]
    return np.reshape(ys, [-1])
# In[13]:
def BlackScholes(S0, r, sigma, T, K):
    d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r + sigma**2/2)*T)
    d2 = d1 - sigma * np.sqrt(T)
    return norm.cdf(d1) * S0 - norm.cdf(d2) * K * np.exp(-r*T)

def BlackScholesCallDelta(S0, r, sigma, T, K):
    d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r + sigma**2/2)*T)
    return norm.cdf(d1)

BlackScholes_vec = np.vectorize(BlackScholes)
BlackScholesCallDelta_vec = np.vectorize(BlackScholesCallDelta)
# In[14]:
BS_price = BlackScholes_vec(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis,predicted,label="Neural Regression")
line_BS = plt.plot(xAxis,BS_price, label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Option Price")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf")  # save before show(), which clears the figure
plt.show()
# In[15]:
Prices_rg_mc_diff = []
for i in range(len(xAxis) - 1):
    delta = (predicted[i+1] - predicted[i]) / (xAxis[i+1] - xAxis[i])
    Prices_rg_mc_diff.append(delta)
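# For reference, a minimal autograd-based sketch of the same delta computation, as an
# alternative to the finite differences above (a sketch; it assumes the trained model is
# on the CPU and uses meanX, stdX, meanY, stdY defined earlier):
xs = torch.from_numpy(xAxis).float()
xs.requires_grad_(True)                      # track gradients w.r.t. the inputs
nys = model((xs - meanX) / stdX)             # forward pass on normalized inputs
ys = meanY + stdY * nys                      # de-normalized prices
# d(price)/d(spot) at every grid point, i.e. the delta
delta_autograd = torch.autograd.grad(ys, xs, grad_outputs=torch.ones_like(ys))[0]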
# In[16]:
BS_delta=BlackScholesCallDelta(S0=xAxis,r=0,sigma=0.2,T=1.0,K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis[1:],Prices_rg_mc_diff,label="Neural Regression")
line_BS = plt.plot(xAxis[1:],BS_delta[1:], label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Option Price")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf")  # save before show(), which clears the figure
plt.show()
# In[17]:
model.backward(retain_graph=True)
# In[ ]:
print(NeuralNetwork.weight.grad)
# In[ ]:
# In[21]:
c3=torch.from_numpy((predicted.reshape(-1,1)), requires_grad=True)
c4=torch.from_numpy(xAxis, requires_grad=True)
#c5=torch.Tensor(c3)
#c6=torch.Tensor(c4)
loss = criterion(c3,c4) # calculating loss
loss.backward()
# In[28]:
torch.tensor(predicted.reshape(-1,1), requires_grad=True)
torch.tensor(xAxis, requires_grad=True)
criterion(torch.tensor(predicted.reshape(-1,1), requires_grad=True),torch.tensor(xAxis, requires_grad=True))
loss.backward()
You need to explicitly use requires_grad=True when you create a tensor, and to calculate the gradient you first need to apply some operation to the tensor.
Here is an example:
import torch
x = torch.rand(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()
out.backward()
print(x.grad)
Output:
tensor([[3.3720, 3.4302],
[3.4030, 3.3605]])
In this way you are using torch.autograd to calculate the gradient for tensor x. See autograd for more.
For a neural network, you can simply run a forward pass and then call backward() afterwards.
A neural network example:
import torch
import torch.nn as nn
import torch.nn.functional as f
x = torch.rand(2, 2)
# define a neural network
network = nn.Sequential(
    nn.Linear(2, 100),
    nn.Linear(100, 2)
)
pred = network(x)
loss = f.l1_loss(pred, x)  # calculating loss (mean absolute error)
loss.backward()
# Update weights with gradients (in-place, outside of autograd)
with torch.no_grad():
    network[0].weight -= 0.1 * network[0].weight.grad
    network[1].weight -= 0.1 * network[1].weight.grad
Note: I didn't put any activation function in the network for the sake of simplicity.
Example of backward() using torch.nn.MSELoss():
import torch
from torch.nn import MSELoss
criterion = MSELoss()
a = torch.tensor([1.,2.], requires_grad=True)
b = a**2
loss = criterion(b, a)
loss.backward()
print(a.grad)
Output:
tensor([0., 6.])
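The same machinery also gives the gradient of a network's output with respect to its input, which is what the question asks about. A minimal sketch (the small network here is just an illustration, not the trained model from the question):
import torch
import torch.nn as nn
net = nn.Sequential(nn.Linear(1, 8), nn.ELU(), nn.Linear(8, 1))
x = torch.linspace(0., 1., 5).reshape(-1, 1)
x.requires_grad_(True)   # differentiate w.r.t. the input, not the weights
y = net(x)
# dy/dx for every sample; grad_outputs is needed because y is not a scalar
dydx = torch.autograd.grad(y, x, grad_outputs=torch.ones_like(y))[0]
print(dydx)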
Related
I am trying to implement Ridge Regression in pytorch, defining the loss function and plotting said function over different iterations. The only issue is, I keep getting an error: mat1 and mat2 shapes cannot be multiplied (1000x10 and 1x1). I would like to convert the second matrix to a 1x10 in order to complete the code, but I can't seem to get it to work.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
n = 1000
p = 10
mean = np.zeros((p))
val = 0.8
cov = np.ones((p,p))*val
cov = cov + np.eye(p)*(1-val)
np.random.seed(10)
X = np.random.multivariate_normal(mean, cov, n)
theta_true = np.concatenate((np.ones((5,1)), np.zeros((5,1))),axis=0)
delta=0.5
Sigma = np.eye(n,n,k=-1)*0.4 + np.eye(n,n)*1 + np.eye(n,n,k=1)*0.4
mean = np.zeros(n)
e = np.random.multivariate_normal(mean, Sigma, 1)
y = X @ theta_true + delta*e.T
import torch
X_t = torch.from_numpy(X).float()
y_t = torch.from_numpy(y).float()
Sigma_t = torch.from_numpy(Sigma).float()
import torch.nn as nn
import torch.nn.functional as F
class MyLinear(nn.Module):
    def __init__(self):
        super(MyLinear, self).__init__()
        self.linear = nn.Linear(1, 1)

    def forward(self, x):
        out = self.linear(x)
        return out

def L2_norm(model):
    return torch.sum(list(model.parameters())[0]**2)

def L1_norm(model):
    return torch.sum(torch.abs(list(model.parameters())[0]))

def ridge_loss(y_pred, y_true, model, lambda_):
    mse = F.mse_loss(y_pred, y_true)
    regularization = lambda_ * L2_norm(model)
    return mse + regularization
import matplotlib.pyplot as plt
model = MyLinear()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
lambda_ = 0.1
num_epochs = 1000
loss_values = []
for epoch in range(num_epochs):
    optimizer.zero_grad()
    y_pred = model(X_t)
    loss = ridge_loss(y_pred, y_t, model, lambda_)
    loss_values.append(loss.item())
    loss.backward()
    optimizer.step()
plt.plot(loss_values)
plt.xlabel('Iteration')
plt.ylabel('Loss')
plt.title('Ridge Regression Loss over Iterations')
plt.show()
I tried changing the theta_true definition to transform the matrix but the same error occurred.
theta_true = np.concatenate((np.ones((5,1)), np.zeros((5,1)))).reshape(10, 1)
Your Linear layer in MyLinear is what is causing the issue.
self.linear = nn.Linear(1, 1)
means 1 input feature and 1 output feature, but x, as you have it here, has shape (1000, 10), meaning it has 10 features. So you will need to change that line to
self.linear = nn.Linear(10, 1)
that will do the trick, here is the image I get with that change:
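For completeness, a quick shape check (a minimal sketch with random data in place of yours):
import torch
import torch.nn as nn
x = torch.randn(1000, 10)   # 1000 samples with 10 features, like X_t
layer = nn.Linear(10, 1)    # in_features must match the 10 columns of x
print(layer(x).shape)       # torch.Size([1000, 1])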
I am trying to train a model to estimate a GMM. However, the means of the GMM are calculated each time based on a mean_placement parameter. I am following the solution provided here, I'll copy and paste the original code:
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
import torch
from torch import nn
from torch import optim
import torch.distributions as D
num_layers = 8
weights = torch.ones(8,requires_grad=True)
means = torch.tensor(np.random.randn(8,2),requires_grad=True)
stdevs = torch.tensor(np.abs(np.random.randn(8,2)),requires_grad=True)
parameters = [weights, means, stdevs]
optimizer1 = optim.SGD(parameters, lr=0.001, momentum=0.9)
num_iter = 10001
for i in range(num_iter):
    mix = D.Categorical(weights)
    comp = D.Independent(D.Normal(means, stdevs), 1)
    gmm = D.MixtureSameFamily(mix, comp)
    optimizer1.zero_grad()
    x = torch.randn(5000,2)  # this can be an arbitrary x sample
    loss2 = -gmm.log_prob(x).mean()  # -densityflow.log_prob(inputs=x).mean()
    loss2.backward()
    optimizer1.step()
    print(i, loss2)
What I would like to do is this:
num_layers = 8
weights = torch.ones(8,requires_grad=True)
means_coef = torch.tensor(10.,requires_grad=True)
means = torch.tensor(torch.dstack([torch.linspace(1,means_coef.detach().item(),8)]*2).squeeze(),requires_grad=True)
stdevs = torch.tensor(np.abs(np.random.randn(8,2)),requires_grad=True)
parameters = [means_coef]
optimizer1 = optim.SGD(parameters, lr=0.001, momentum=0.9)
num_iter = 10001
for i in range(num_iter):
    means = torch.tensor(torch.dstack([torch.linspace(1,means_coef.detach().item(),8)]*2).squeeze(),requires_grad=True)
    mix = D.Categorical(weights)
    comp = D.Independent(D.Normal(means,stdevs), 1)
    gmm = D.MixtureSameFamily(mix, comp)
    optimizer1.zero_grad()
    x = torch.randn(5000,2)  # this can be an arbitrary x sample
    loss2 = -gmm.log_prob(x).mean()  # -densityflow.log_prob(inputs=x).mean()
    loss2.backward()
    optimizer1.step()
    print(i, means_coef)
print(means_coef)
However, in this case the parameter is not updated and the grad value is always None. Any ideas how to fix this?
I have re-written your model according to your description.
If you run it you can see that all the parameters are changing after the model is optimized. I also have provided the graph of the model at the end. You can simply modify the GMM class as you need if you want to make a new one.
import numpy as np
import matplotlib.pyplot as plt
import sklearn.datasets as datasets
import torch
from torch import nn
from torch import optim
import torch.distributions as D
class GMM(nn.Module):
    def __init__(self, weights, base, scale, n_cell=8, shift=0, dim=2):
        super(GMM, self).__init__()
        self.weight = nn.Parameter(weights)
        self.base = nn.Parameter(base)
        self.scale = nn.Parameter(scale)
        self.grid = torch.arange(1, n_cell+1)
        self.shift = shift
        self.n_cell = n_cell
        self.dim = dim

    def trsf_grid(self):
        trsf = (
            torch.log(self.scale * self.grid + self.shift)
            / torch.log(self.base)
        ).reshape(-1, 1)
        return trsf.expand(self.n_cell, self.dim)

    def forward(self, x, std):
        means = self.trsf_grid()
        mix = D.Categorical(self.weight)
        comp = D.Independent(D.Normal(means, std), 1)
        gmm = D.MixtureSameFamily(mix, comp)
        return -gmm.log_prob(x).mean()

if __name__ == "__main__":
    weight = torch.ones(8)
    base = torch.tensor(3.)
    scale = torch.tensor(1.)
    stds = torch.tensor(np.abs(np.random.randn(8,2)), requires_grad=False)
    model = GMM(weight, base, scale)
    print(list(model.parameters()))
    optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
    for i in range(1000):
        optimizer.zero_grad()
        x = torch.randn(5000,2)
        loss = model(x, stds)
        loss.backward()
        optimizer.step()
    print(list(model.parameters()))
In my case it returned the following parameters:
[Parameter containing:
tensor([1., 1., 1., 1., 1., 1., 1., 1.], requires_grad=True), Parameter containing:
tensor(3., requires_grad=True), Parameter containing:
tensor(1., requires_grad=True)]
[Parameter containing:
tensor([0.7872, 1.1010, 1.3390, 1.3757, 0.5122, 0.2884, 1.2597, 0.7597],
requires_grad=True), Parameter containing:
tensor(3.3207, requires_grad=True), Parameter containing:
tensor(0.2814, requires_grad=True)]
which indeed shows that the parameters are updating.
Also you can see the computation graph below:
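As to why the original version never updates: wrapping a computed value in torch.tensor(...) after .detach().item() creates a brand-new leaf tensor with no connection to means_coef, so no gradient can flow back to it. A minimal sketch of the difference:
import torch
coef = torch.tensor(10., requires_grad=True)
# broken: .detach()/torch.tensor() cut the graph, so coef.grad stays None
means = torch.tensor(torch.linspace(1, coef.detach().item(), 8), requires_grad=True)
means.sum().backward()
print(coef.grad)   # None
# working: build means from differentiable operations on coef itself
means = 1 + torch.linspace(0, 1, 8) * (coef - 1)   # same values as linspace(1, coef, 8)
means.sum().backward()
print(coef.grad)   # tensor(4.)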
I have the following code that uses tensorflow to calculate a custom average loss when the image is consistently rotated:
import tensorflow as tf
import cv2
#initialize x_hat
img = cv2.imread("4.jpg")
x_hat = tf.Variable(img,name = 'x_hat') #img we want to attack
@tf.function
def cost2():
    image = x_hat
    # Now it will generate 100 samples rotated
    num_samples = 100
    average_loss = 0
    for j in range(num_samples):
        # ADD ROTATION (there may be a problem here)
        rotated = tf.keras.preprocessing.image.random_rotation(image,
            tf.random.uniform(shape=(), minval=40, maxval=90), channel_axis=2)
        # get logits
        rotated_logits, _ = resnet(rotated)
        # get average CUSTOM loss
        average_loss += -1 * tf.nn.softmax_cross_entropy_with_logits(logits=rotated_logits, labels=labels) / num_samples
    return average_loss
and here is how I call it
learning_rate = 1e-1
optim = tf.optimizers.SGD(learning_rate=learning_rate)
epsilon = 2.0/255.0 # a really small perturbation
below = x - epsilon
above = x + epsilon
demo_steps = 200
# projected gradient descent
for i in range(demo_steps):
    loss = optim.minimize(cost2, var_list=[x_hat])
    if (i+1) % 10 == 0:
        print('step %d, loss=%g' % (i+1, loss.numpy()))
    projected = tf.clip_by_value(tf.clip_by_value(x_hat, below, above), 0, 1)
    with tf.control_dependencies([projected]):
        x_hat.assign(projected)
adv_robust = x_hat.numpy()
However, the following error returns to me once I run the code:
TypeError: in user code:
<ipython-input-183-abde02909da7>:14 cost2 *
rotated = tf.keras.preprocessing.image.random_rotation(image,
tf.random.uniform(shape=(),minval=40, maxval=90),channel_axis=2)
/home/me/.local/lib/python3.8/site-
packages/keras_preprocessing/image/affine_transformations.py:55 random_rotation *
theta = np.random.uniform(-rg, rg)
mtrand.pyx:1111 numpy.random.mtrand.RandomState.uniform **
TypeError: __array__() takes 1 positional argument but 2 were given
I am on TensorFlow 2.4.0, and the random_rotation and random.uniform functions are correct according to the TF 2.4.0 documentation HERE and HERE. So, what am I missing here?
The error might be coming from using TF tensors. As stated in the docs you linked regarding random_rotation:
Performs a random rotation of a Numpy image tensor.
Meaning you cannot use TF tensors with this operation. If you are in eager execution mode you can use tensor.numpy():
import tensorflow as tf
image = tf.random.normal((180, 180, 3))
rotated = tf.keras.preprocessing.image.random_rotation(image.numpy(),
    tf.random.uniform(shape=(), minval=40, maxval=90).numpy(), channel_axis=2)
Otherwise, it is recommended to use the preprocessing layer tf.keras.layers.RandomRotation, since using numpy in graph mode (for example in a function decorated with @tf.function) is not recommended.
Here is an example using the tf.keras.layers.RandomRotation:
import tensorflow as tf
import os
import matplotlib.pyplot as plt
_URL = 'https://storage.googleapis.com/mledu-datasets/cats_and_dogs_filtered.zip'
path_to_zip = tf.keras.utils.get_file('cats_and_dogs.zip', origin=_URL, extract=True)
PATH = os.path.join(os.path.dirname(path_to_zip), 'cats_and_dogs_filtered')
train_dir = os.path.join(PATH, 'train')
validation_dir = os.path.join(PATH, 'validation')
BATCH_SIZE = 1
IMG_SIZE = (160, 160)
train_ds = tf.keras.utils.image_dataset_from_directory(train_dir,
                                                       shuffle=True,
                                                       batch_size=BATCH_SIZE,
                                                       image_size=IMG_SIZE)
data_augmentation = tf.keras.Sequential([
    tf.keras.layers.RandomRotation(tf.random.uniform(shape=(), minval=40, maxval=90)),
])
for image, _ in train_ds.take(1):
    plt.figure(figsize=(10, 10))
    first_image = image[0]
    for i in range(9):
        ax = plt.subplot(3, 3, i + 1)
        augmented_image = data_augmentation(tf.expand_dims(first_image, 0), training=True)
        plt.imshow(augmented_image[0] / 255)
        plt.axis('off')
I am doing custom training of a neural network in Colab, with and without GPU, and the training process is faster using the CPU, which makes me think that I am not parallelising the operations or that I am missing something. I do not think it is because the model is small, because I tried more complicated models and the problem persists:
## Import libraries
import matplotlib
# matplotlib.use('TkAgg') # Required to make it run on both Windows and Mac
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
from tqdm import trange
# Switch of unnecessary TF warning messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
###############################################################################
################################## Parameters #################################
###############################################################################
gamma = tf.constant(2.0) # Curvature of the utility function
rho = tf.constant(0.04) # Discount rate
A = tf.constant(0.5) # TFP
alpha = tf.constant(0.36) # Returns to scale
delta = tf.constant(0.05) # Depreciation Rate of Capital
batchSize = 100 # Batch Size
number_epochs = 100000 # Number of epochs
kMin = 0.1 # lower bound of sample interval
kMax = 10.0 # upper bound of sample interval
gridSize = 10000 # Plotting grid
# Set global seed
tf.random.set_seed(1234)
np.random.seed(1234)
# Value function initial guess
initGuess = -60
# Neural network optimizer
optimizer = keras.optimizers.Adam()
###############################################################################
######################## Value Function Neural Network ########################
###############################################################################
def valueFnNeuralNet(nHidden = 3, nNeurons = 8):
    model = keras.models.Sequential()
    # Input layer
    model.add(keras.layers.Dense(nNeurons, activation = "tanh", input_dim = 1))
    # Hidden layers
    for layer in range(nHidden - 1):
        model.add(keras.layers.Dense(nNeurons, activation = "tanh"))
    # Output layer
    model.add(keras.layers.Dense(1, bias_initializer = keras.initializers.Constant(value = initGuess)))
    return model
def HJB(input, V):
    VPrime = tf.gradients(V(input), input)[0]
    VPrimemax = tf.maximum(VPrime, 1E-7)  # dV/dk
    Y = A * tf.pow(input, alpha)  # Output
    C = tf.pow(VPrimemax, (-1/gamma))  # Consumption
    I = Y - C  # Investment
    muK = I - delta * input  # Capital drift
    U = tf.pow(C, 1-gamma) / (1-gamma)  # Utility
    HJB = U - rho * V(input) + tf.multiply(tf.stop_gradient(VPrimemax), muK)
    return HJB

def Objective(batchSize):
    input = tf.random.uniform(shape = (batchSize,1), minval = kMin, maxval = kMax)
    error = HJB(input, VF)
    return tf.reduce_mean(tf.square(error))
###############################################################################
################################ Training Step ################################
###############################################################################
# Need decorator to run in graph mode instead of eager execution
@tf.function
def training_step():
    with tf.GradientTape() as tape:
        loss = Objective(batchSize)
    grads = tape.gradient(loss, theta)
    optimizer.apply_gradients(zip(grads, theta))
    return loss
###############################################################################
################################ Training Loop ################################
###############################################################################
def train_model(epochs):
    losses = []
    for epoch in trange(epochs):
        loss = training_step()
        losses.append(loss.numpy())
    return losses
###############################################################################
################################### Running ###################################
###############################################################################
# Set up neural network
VF = valueFnNeuralNet()
# Define trainable network parameters
theta = VF.trainable_variables
# Run Model (and output loss evolution)
results = train_model(number_epochs)
The outputs that I get are the following:
without GPU: 100%|██████████| 100000/100000 [01:30<00:00, 1101.79it/s]
with GPU: 100%|██████████| 100000/100000 [03:36<00:00, 461.47it/s]
GPUs are more efficient for large matrix multiplications. Your input is of shape (100, 1), so the distributed advantage of the GPU is so small that it doesn't even offset the overhead of switching between CPU and GPU.
My guess is that you'll see the pattern reverse if you have input of shape (100, 100) instead.
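A quick way to see the crossover yourself (a minimal sketch; assumes a machine where TensorFlow can see both a CPU and a GPU):
import tensorflow as tf
import time
for n in (1, 1000):
    x = tf.random.uniform((100, n))
    w = tf.random.uniform((n, n))
    for device in ('/CPU:0', '/GPU:0'):
        with tf.device(device):
            _ = tf.matmul(x, w)            # warm-up (kernel launch, transfers)
            start = time.time()
            for _ in range(1000):
                _ = tf.matmul(x, w)
        print(device, 'n =', n, 'elapsed:', time.time() - start)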
This code is built up as follows: my robot takes a picture, a TF computer-vision model calculates where in the picture the target object starts, and this information (the x1 and x2 coordinates) is passed to a PyTorch model. The PyTorch model should learn to predict the correct motor activations in order to get closer to the target. After the movement is executed, the robot takes a picture again, and the TF CV model should calculate whether the motor activation brought the robot closer to the desired state (x1 coordinate at 10, x2 coordinate at 31).
However, every time I run the code, PyTorch is not able to calculate the gradients.
I'm wondering if this is some data-type problem or if it is a more general one: is it impossible to calculate the gradients if the loss is not calculated directly from the PyTorch network's output?
Any help and suggestions will be greatly appreciated.
#define policy model (model to learn a policy for my robot)
import torch
import torch.nn as nn
import torch.nn.functional as F
class policy_gradient_model(nn.Module):
    def __init__(self):
        super(policy_gradient_model, self).__init__()
        self.fc0 = nn.Linear(2, 2)
        self.fc1 = nn.Linear(2, 32)
        self.fc2 = nn.Linear(32, 64)
        self.fc3 = nn.Linear(64, 32)
        self.fc4 = nn.Linear(32, 32)
        self.fc5 = nn.Linear(32, 2)

    def forward(self, x):
        x = self.fc0(x)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        x = F.relu(self.fc3(x))
        x = F.relu(self.fc4(x))
        x = F.relu(self.fc5(x))
        return x
policy_model = policy_gradient_model().double()
print(policy_model)
optimizer = torch.optim.AdamW(policy_model.parameters(), lr=0.005, betas=(0.9,0.999), eps=1e-08, weight_decay=0.01, amsgrad=False)
#make robot move as predicted by pytorch network (not all code included)
def move(motor_controls):
    #define curvature
    # motor_controls[0] = sigmoid(motor_controls[0])
    activation_left = 1 + (motor_controls[0]) * 99
    activation_right = 1 + (1 - motor_controls[0]) * 99
    print("activation left:", activation_left, ". activation right:", activation_right, ". time:", motor_controls[1]*100)
    #start movement
#main
import cv2
import numpy as np
import time
from torch.autograd import Variable
print("start training")
losses=[]
losses_end_of_epoch=[]
number_of_steps_each_epoch=[]
loss_function = nn.MSELoss(reduction='mean')
#each epoch
for epoch in range(2):
    count = 0
    target_reached = False
    while target_reached == False:
        print("epoch: ", epoch, ". step:", count)
        ###process and take picture
        indices = process_picture()
        ###binary_network(sliced)=indices as input for policy model
        optimizer.zero_grad()
        ###output: 1 for curvature, 1 for duration of movement
        motor_controls = policy_model(Variable(torch.from_numpy(indices))).detach().numpy()
        print("NO TANH output for motor: 1)activation left, 2)time ", motor_controls)
        motor_controls[0] = np.tanh(motor_controls[0])
        motor_controls[1] = np.tanh(motor_controls[1])
        print("TANH output for motor: 1)activation left, 2)time ", motor_controls)
        ###execute suggested action
        move(motor_controls)
        ###take and process picture2 (after movement)
        indices = (process_picture())
        ###loss=(binary_network(picture2) - desired
        print("calculate loss")
        print("idx", indices, type(torch.tensor(indices)))
        # loss = 0
        # loss = (indices[0]-10)**2+(indices[1]-31)**2
        # loss = loss/2
        print("shape of indices", indices.shape)
        array = np.zeros((1,2))
        array[0] = indices
        print(array.shape, type(array))
        array2 = torch.ones([1,2])
        loss = loss_function(torch.tensor(array).double(), torch.tensor([[10.0,31.0]]).double()).float()
        print("loss: ", loss, type(loss), loss.shape)
        # array2[0] = loss_function(torch.tensor(array).double(), torch.tensor([[10.0,31.0]]).double()).float()
        losses.append(loss)
        #start line causing the error-message (still part of main)
        ###calculate gradients
        loss.backward()
        #end line causing the error-message (still part of main)
        ###apply gradients
        optimizer.step()
#Output (so far as intented) (not all included)
#calculate loss
idx [14. 15.] <class 'torch.Tensor'>
shape of indices (2,)
(1, 2) <class 'numpy.ndarray'>
loss: tensor(136.) <class 'torch.Tensor'> torch.Size([])
#Error Message:
Traceback (most recent call last):
File "/home/pi/Desktop/GradientPolicyLearning/PolicyModel.py", line 259, in <module>
array2.backward()
File "/home/pi/.local/lib/python3.7/site-packages/torch/tensor.py", line 134, in backward
torch.autograd.backward(self, gradient, retain_graph, create_graph)
File "/home/pi/.local/lib/python3.7/site-packages/torch/autograd/__init__.py", line 99, in
backward
allow_unreachable=True) # allow_unreachable flag
RuntimeError: element 0 of tensors does not require grad and does not have a grad_fn
If you call .detach() on the prediction, that cuts it off from the computation graph and discards the gradients. Since you are first getting indices from the model and then trying to backprop the error, I would suggest
prediction = policy_model(torch.from_numpy(indices))
motor_controls = prediction.clone().detach().numpy()
This keeps the prediction attached to the computation graph, so the error can be backpropagated through it.
Now you can do
loss = loss_function(prediction, torch.tensor([[10.0,31.0]]).double()).float()
Note: you might want to call .double() on the prediction if it throws a dtype error.
It is indeed impossible to calculate the gradients if the loss is not calculated directly from the PyTorch network's output, because then you would not be able to apply the chain rule, which autograd uses to compute the gradients.
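A minimal sketch of that point:
import torch
import torch.nn.functional as F
net = torch.nn.Linear(2, 2)
pred = net(torch.rand(1, 2))
F.mse_loss(pred, torch.zeros(1, 2)).backward()            # works: the loss comes from the graph
F.mse_loss(pred.detach(), torch.zeros(1, 2)).backward()   # RuntimeError: element 0 of tensors does not require grad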
A simple solution: turn on the context manager that enables gradient calculation, if it is off:
torch.set_grad_enabled(True) # Context-manager
Make sure that all your inputs into the NN, the output of the NN, and the ground truth/target values are of type torch.tensor and not list, numpy.array, or any other iterable.
Also, make sure that they are not converted to list or numpy.array at any point either.
In my case, I got this error because I performed a list comprehension on the tensor containing the predicted values from the NN in order to get the max value in each row, and then converted the list back to a torch.tensor before calculating the loss.
This back-and-forth conversion disables the gradient calculations.
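For the row-wise maximum specifically, torch.max keeps the result on the autograd graph, unlike a list comprehension (a minimal sketch):
import torch
pred = torch.randn(4, 3, requires_grad=True)   # stand-in for network predictions
row_max, _ = torch.max(pred, dim=1)            # differentiable, stays on the graph
loss = row_max.sum()
loss.backward()                                # pred.grad is now populated
print(pred.grad)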
In my case, I got past this error by specifying requires_grad=True when defining my input tensors
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('dark_background')
# define rosenbrock function and gradient
a = 1
b = 5
def f(x):
    return (a - x[0]) ** 2 + b * (x[1] - x[0] ** 2) ** 2

def jac(x):
    dx1 = -2 * a + 4 * b * x[0] ** 3 - 4 * b * x[0] * x[1] + 2 * x[0]
    dx2 = 2 * b * (x[1] - x[0] ** 2)
    return np.array([dx1, dx2])

# create stochastic rosenbrock function and gradient
def f_rand(x):
    return f(x) * np.random.uniform(0.5, 1.5)

def jac_rand(x):
    return jac(x) * np.random.uniform(0.5, 1.5)
# use hand coded adam
x = np.array([0.1, 0.1])
x0 = x.copy()
j = jac_rand(x)
beta1=0.9
beta2=0.999
eps=1e-8
m = x * 0
v = x * 0
learning_rate = .1
for ii in range(200):
    m = (1 - beta1) * j + beta1 * m  # first moment estimate.
    v = (1 - beta2) * (j ** 2) + beta2 * v  # second moment estimate.
    mhat = m / (1 - beta1 ** (ii + 1))  # bias correction.
    vhat = v / (1 - beta2 ** (ii + 1))
    x = x - learning_rate * mhat / (np.sqrt(vhat) + eps)
    j = jac_rand(x)
print('hand code finds optimal to be ', x, f(x))
# attempt to use pytorch
import torch
x_tensor = torch.tensor(x0, requires_grad=True)
optimizer = torch.optim.Adam([x_tensor], lr=learning_rate)
def closure():
    optimizer.zero_grad()
    loss = f_rand(x_tensor)
    loss.backward()
    return loss

for ii in range(200):
    optimizer.step(closure)
print('My PyTorch attempt found ', x_tensor, f(x_tensor))
The following worked for me:
loss.requires_grad = True
loss.backward()