I am training a neural network with a custom training loop in Colab, both with and without a GPU, and training is faster on the CPU, which makes me think I am not parallelising the operations or am missing something. I do not think it is because the model is small, because I tried more complicated models and the problem persists:
## Import libraries
import matplotlib
# matplotlib.use('TkAgg') # Required to make it run on both Windows and Mac
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow import keras
import numpy as np
import os
from tqdm import trange
# Switch off unnecessary TF warning messages
os.environ['TF_CPP_MIN_LOG_LEVEL'] = '2'
###############################################################################
################################## Parameters #################################
###############################################################################
gamma = tf.constant(2.0) # Curvature of the utility function
rho = tf.constant(0.04) # Discount rate
A = tf.constant(0.5) # TFP
alpha = tf.constant(0.36) # Returns to scale
delta = tf.constant(0.05) # Depreciation Rate of Capital
batchSize = 100 # Batch Size
number_epochs = 100000 # Number of epochs
kMin = 0.1 # lower bound of sample interval
kMax = 10.0 # upper bound of sample interval
gridSize = 10000 # Plotting grid
# Set global seed
tf.random.set_seed(1234)
np.random.seed(1234)
# Value function initial guess
initGuess = -60
# Neural network optimizer
optimizer = keras.optimizers.Adam()
###############################################################################
######################## Value Function Neural Network ########################
###############################################################################
def valueFnNeuralNet(nHidden = 3, nNeurons = 8):
model = keras.models.Sequential()
# Input layer
model.add(keras.layers.Dense(nNeurons, activation = "tanh", input_dim = 1))
# Hidden layers
for layer in range(nHidden - 1):
model.add(keras.layers.Dense(nNeurons, activation = "tanh"))
# Output layer
model.add(keras.layers.Dense(1,bias_initializer = keras.initializers.Constant(value = initGuess)))
return model
def HJB(input, V):
VPrime = tf.gradients(V(input), input)[0]
VPrimemax = tf.maximum(VPrime, 1E-7) # dV/dk
Y = A * tf.pow(input, alpha) # Output
C = tf.pow(VPrimemax, (-1/gamma)) # Consumption
I = Y - C # Investment
muK = I - delta * input # Capital drift
U = tf.pow(C, 1-gamma) / (1-gamma) # Utility
HJB = U - rho * V(input) + tf.multiply(tf.stop_gradient(VPrimemax), muK)
return HJB
def Objective(batchSize):
input = tf.random.uniform(shape = (batchSize,1), minval = kMin, maxval = kMax)
error = HJB(input, VF)
return tf.reduce_mean(tf.square(error))
###############################################################################
################################ Training Step ################################
###############################################################################
# Need decorator to run in graph mode instead of eager execution
@tf.function
def training_step():
with tf.GradientTape() as tape:
loss = Objective(batchSize)
grads = tape.gradient(loss, theta)
optimizer.apply_gradients(zip(grads, theta))
return loss
###############################################################################
################################ Training Loop ################################
###############################################################################
def train_model(epochs):
losses = []
for epoch in trange(epochs):
loss = training_step()
losses.append(loss.numpy())
return losses
###############################################################################
################################### Running ###################################
###############################################################################
# Set up neural network
VF = valueFnNeuralNet()
# Define trainable network parameters
theta = VF.trainable_variables
# Run Model (and output loss evolution)
results = train_model(number_epochs)
The outputs that I get are the following:
without GPU: 100%|██████████| 100000/100000 [01:30<00:00, 1101.79it/s]
with GPU: 100%|██████████| 100000/100000 [03:36<00:00, 461.47it/s]
GPUs are more efficient for large matrix multiplications. Your input has shape (100, 1), so the parallelisation advantage of the GPU is so small that it doesn't even offset the overhead of shuttling data between the CPU and GPU.
My guess is that you'll see the pattern reverse if your input has shape (100, 100) instead.
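A rough way to see the crossover is to time a matmul-heavy step at different sizes on each device. A hedged sketch (the sizes and repetition count are arbitrary, not taken from the question):
import time
import tensorflow as tf

def time_matmul(n, reps=100):
    # Multiply two n x n matrices `reps` times on whatever device is active.
    a = tf.random.uniform((n, n))
    b = tf.random.uniform((n, n))
    start = time.perf_counter()
    for _ in range(reps):
        c = tf.matmul(a, b)
    _ = c.numpy()  # block until the computation has actually finished
    return time.perf_counter() - start

for n in (10, 100, 1000, 4000):
    with tf.device("/CPU:0"):
        print(f"n={n:5d}  CPU: {time_matmul(n):.3f}s")
    if tf.config.list_physical_devices("GPU"):
        with tf.device("/GPU:0"):
            print(f"n={n:5d}  GPU: {time_matmul(n):.3f}s")
For tiny matrices the CPU usually wins; only at the larger sizes should the GPU pull ahead, which is consistent with the timings in the question.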
Related
I have implemented and trained a neural network in Pytorch, however, I am interested in the derivative of the neural network parameters with respect to the input.
I have extensively searched for any procedure that would allow evaluating the derivative of the weights with respect to a given input, but I did not find anything.
I know that I can compute the gradients of a function in the following way.
external_grad = torch.tensor([1., 1.])
Q.backward(gradient=external_grad)
But how would I do that with a trained neural network instead of a function Q?
Thanks in advance.
#!/usr/bin/env python
# coding: utf-8
# In[1]:
import numpy as np
from scipy.stats import norm
from numpy import linalg as la
import numpy.random as npr
from tabulate import tabulate
from matplotlib import pyplot as plt
import random
import os
import torch
from torch import nn
from torch.utils.data import DataLoader
#from torchvision import datasets, transforms
from torch.autograd import Variable
# In[2]:
from torch import optim
# In[3]:
nSimul = 32768
T1 = 1.0
T2 = 2.0
K = 110.0
spot = 100.0
vol = 0.2
vol0 = 0.5 # vol is increased over the 1st period so we have more points in the wings
# simulate all Gaussian returns (N1, N2) first
# returns: matrix of shape [nSimul, TimeSteps=2]
returns = np.random.normal(size=[nSimul,2])
# generate paths, step by step, and not path by path as customary
# this is to avoid slow Python loops, using NumPy's optimized vector functions instead
# generate the vector of all scenarios for S1, of shape [nSimul]
S1 = spot * np.exp(-0.5*vol0*vol0*T1 + vol0*np.sqrt(T1)*returns[:,0])
# generate the vector of all scenarios for S2, of shape [nSimul]
S2 = S1 * np.exp(-0.5*vol*vol*(T2-T1) + vol*np.sqrt(T2-T1)*returns[:,1])
# training set, X and Y are both vectors of shape [nSimul]
X = S1
Y = np.maximum(0, S2 - K)
xAxis = np.linspace(20, 200, 100)
xAxis=xAxis.reshape(-1,1)
# In[4]:
#Normalization of the simulated data:
meanX = np.mean(X)
stdX = np.std(X)
meanY = np.mean(Y)
stdY = np.std(Y)
normX = (X - meanX) / stdX
normY = (Y - meanY) / stdY
normX=normX.reshape(-1,1)
normY=normY.reshape(-1,1)
# In[5]:
class NeuralNetwork(nn.Module):
def __init__(self,inputsize,outputsize):
super(NeuralNetwork, self).__init__()
#self.flatten = nn.Flatten()
self.linear_relu_stack = nn.Sequential(
nn.Linear(inputsize,3),
nn.ELU(),
nn.Linear(3, 5),
nn.ELU(),
nn.Linear(5,3),
nn.ELU(),
nn.Linear(3,outputsize),
)
w = torch.empty(0,1)
nn.init.normal_(w)
def forward(self, x):
#x = self.flatten(x)
logits = self.linear_relu_stack(x)
return logits
# In[6]:
inputDim = 1 # takes variable 'x'
outputDim = 1 # takes variable 'y'
learningRate = 0.05
epochs = 10000
#weight=torch.empty(3)
model = NeuralNetwork(inputDim, outputDim)
##### For GPU #######
if torch.cuda.is_available():
model.cuda()
# In[7]:
#criterion = torch.nn.MSELoss()
#optimizer = torch.optim.SGD(model.parameters(), lr=learningRate)
# In[ ]:
def ridge_loss(outputs, labels):
    return torch.mean((outputs - labels)**2)
# In[ ]:
# In[9]:
#Adam optmization
criterion = torch.nn.MSELoss()
#optimizer = optim.SGD(model.parameters(), lr=0.01, momentum=0.9)
optimizer = optim.Adam(model.parameters(), lr=0.05)
# In[10]:
for epoch in range(epochs):
# Converting inputs and labels to Variable
if torch.cuda.is_available():
inputs = Variable(torch.from_numpy(normX).cuda().float())
labels = Variable(torch.from_numpy(normY).cuda().float())
else:
inputs = Variable(torch.from_numpy(normX).float())
labels = Variable(torch.from_numpy(normY).float())
# Clear gradient buffers because we don't want any gradient from previous epoch to carry forward, dont want to cummulate gradients
optimizer.zero_grad()
# get output from the model, given the inputs
outputs = model(inputs)
# get loss for the predicted output
loss = criterion(outputs, labels)
print(loss)
# get gradients w.r.t to parameters
loss.backward()
# update parameters
optimizer.step()
print('epoch {}, loss {}'.format(epoch, loss.item()))
# In[11]:
def predict(xs):
# first, normalize
nxs = (xs - meanX) / stdX
# forward feed through ANN
# we don't need gradients in the testing phase
with torch.no_grad():
if torch.cuda.is_available():
nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1)).cuda().float())).cpu().data.numpy()
else:
nys = model(Variable(torch.from_numpy(nxs.reshape(-1,1))).float()).data.numpy()
# de-normalize output
ys = meanY + stdY * nys
# we get a matrix of shape [size of xs][1], which we reshape as vector [size of xs]
return np.reshape(ys, [-1])
# In[13]:
def BlackScholes(S0,r,sigma,T,K):
d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r+sigma**2/2)*T)
d2 = d1 - sigma * np.sqrt(T)
return norm.cdf(d1) * S0 - norm.cdf(d2) * K * np.exp(-r*T)
def BlackScholesCallDelta(S0,r,sigma,T,K):
d1 = 1 / (sigma * np.sqrt(T)) * (np.log(S0/K) + (r+sigma**2/2)*T)
return norm.cdf(d1)
BlackScholes_vec=np.vectorize(BlackScholes)
BlackScholesCallDelta_vec=np.vectorize(BlackScholesCallDelta)
# In[14]:
BS_price = BlackScholes_vec(S0=xAxis, r=0, sigma=0.2, T=1.0, K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis,predicted,label="Neural Regression")
line_BS = plt.plot(xAxis,BS_price, label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Option Price")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf") # save before plt.show(), otherwise the saved figure is blank
plt.show()
# In[15]:
Prices_rg_mc_diff=[]
for i in range(len(xAxis)-1):
delta=(predicted[i+1]-predicted[i])/(xAxis[i+1]-xAxis[i])
Prices_rg_mc_diff.append(delta)
# In[16]:
BS_delta=BlackScholesCallDelta(S0=xAxis,r=0,sigma=0.2,T=1.0,K=110.0)
predicted=predict(xAxis)
S1=1
#line_learn = plt.plot(Sval,y,label="Deep Neural Net")
line_learn = plt.plot(xAxis[1:],Prices_rg_mc_diff,label="Neural Regression")
line_BS = plt.plot(xAxis[1:],BS_delta[1:], label="Black-Scholes")
plt.xlabel("Spot Price")
plt.ylabel("Delta")
#plt.title(r'Time: %1.1f' % time, loc='left', fontsize=11)
plt.title(r'Strike: %1.2f' % K, loc='right', fontsize=11)
plt.title(r'Initial price: %1.2f' % S1, loc='center', fontsize=11)
plt.legend()
#plt.savefig("deephedge.png", dpi=150)
plt.savefig("deephedge.pdf") # save before plt.show(), otherwise the saved figure is blank
plt.show()
# In[17]:
model.backward(retain_graph=True)
# In[ ]:
print(NeuralNetwork.weight.grad)
# In[21]:
c3=torch.from_numpy((predicted.reshape(-1,1)), requires_grad=True)
c4=torch.from_numpy(xAxis, requires_grad=True)
#c5=torch.Tensor(c3)
#c6=torch.Tensor(c4)
loss = criterion(c3,c4) # calculating loss
loss.backward()
# In[28]:
torch.tensor(predicted.reshape(-1,1), requires_grad=True)
torch.tensor(xAxis, requires_grad=True)
criterion(torch.tensor(predicted.reshape(-1,1), requires_grad=True),torch.tensor(xAxis, requires_grad=True))
loss.backward()
You need to explicitly pass requires_grad=True when creating a tensor. And to calculate a gradient, you first need to apply some operation to the tensor.
Here is an example:
import torch
x = torch.rand(2, 2, requires_grad=True)
y = x + 2
z = y * y * 3
out = z.mean()
out.backward()
print(x.grad)
Output:
tensor([[3.3720, 3.4302],
[3.4030, 3.3605]])
In this way you are using torch.autograd to calculate the gradient for tensor x. See autograd for more.
And for a neural network you can simply run the forward pass and call backward() on the result afterwards.
A neural network example:
import torch
import torch.nn as nn
import torch.nn.functional as F
x = torch.rand(2, 2)
# define a neural network
network = nn.Sequential(
nn.Linear(2,100),
nn.Linear(100,2)
)
pred = network(x)
loss = F.l1_loss(pred, x) # mean absolute error loss (torch.nn.functional has l1_loss, not mae_loss)
loss.backward()
# Update weights with gradients (a plain SGD step with learning rate 0.1)
with torch.no_grad():
    network[0].weight -= 0.1 * network[0].weight.grad
    network[1].weight -= 0.1 * network[1].weight.grad
Note: I didn't put any activation function in the network for the sake of simplicity.
Example of backward() using torch.nn.MSELoss():
import torch
from torch.nn import MSELoss
criterion = MSELoss()
a = torch.tensor([1.,2.], requires_grad=True)
b = a**2
loss = criterion(b, a)
loss.backward()
print(a.grad)
Output:
tensor([0., 6.])
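Since the question asks about a trained network's derivative with respect to a given input, here is a minimal sketch of that specific case; the tiny network and the input values are made up for illustration, so substitute your own trained model:
import torch
import torch.nn as nn

# Hypothetical stand-in for a trained model; use your own trained network here.
net = nn.Sequential(nn.Linear(1, 8), nn.ELU(), nn.Linear(8, 1))

x = torch.tensor([[0.5], [1.0], [2.0]], requires_grad=True)  # inputs to differentiate w.r.t.
y = net(x)           # forward pass, shape [3, 1]
y.sum().backward()   # backward() needs a scalar; each row's gradient stays independent
print(x.grad)        # d(output)/d(input) for each input row
torch.autograd.grad(y.sum(), x) gives the same result without touching the parameters' .grad fields.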
I am trying to train a simple neural network where the input data is taken from a MATLAB Simulink simulation and the output is then fed back into a different MATLAB Simulink simulation. My code is as follows:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import random
def get_pid_values():
# call simulink model that just produces PID values
return random.random()
def get_plant(intermediate_val):
# get plant output.
return random.random()
class CustomDataGen(tf.keras.utils.Sequence):
def __init__(self, df, X_col,
batch_size,
input_size=(1,),
shuffle=True):
self.df = df.copy()
self.X_col = X_col
self.batch_size = batch_size
self.input_size = input_size
self.shuffle = shuffle
self.n = len(self.df)
def __get_input(self, index):
# Need to adjust this to support retrieving ref voltage.
return self.df[self.X_col].iloc[index]
def on_epoch_end(self):
if self.shuffle:
self.df = self.df.sample(frac=1).reset_index(drop=True)
def __getitem__(self, index):
X = self.__get_input(index)
return X
def __len__(self):
return self.n // self.batch_size
def get_model(input_shape, hidden, output_shape):
inputs = keras.layers.Input(shape=input_shape)
x = layers.Dense(hidden, activation="relu")(inputs)
x = layers.Dense(hidden, activation='relu')(x)
outputs = layers.Dense(output_shape)(x)
model = keras.Model(inputs=inputs, outputs=outputs, name="pid-modifier")
return model
loss_object = tf.keras.losses.MeanSquaredError()
def loss(y_ref, y_plant):
y_ = y_plant
y = y_ref
return loss_object(y_true=y, y_pred=y_)
if __name__ == "__main__":
# Hyperparameters
lr = 0.01
num_epochs = 1
hidden_size = 4
net_input_size = 1
net_output_size = 1
batch_size = 1
reference_fpath = "Run2_rThrottleTarget.csv"
references = pd.read_csv(reference_fpath)
data_generator = CustomDataGen(df=references, X_col='Throttle', batch_size=1)
# Keep results for plotting
train_loss_results = []
# Initialize optimizer
optimizer = tf.keras.optimizers.Adam(learning_rate=lr)
# error intitial condition
err = 0
# instantiate model
model = get_model(input_shape=(2,), hidden=hidden_size, output_shape=net_output_size)
for epoch in range(num_epochs):
for ref in data_generator:
with tf.GradientTape() as tape:
tape.watch(model.trainable_variables)
# Get pid values
pid = get_pid_values()
# Group ref with pid voltage for input
input = tf.constant([[ref, pid]])
# Get the adjusted voltage from the network
intermediate_val = model(input)
# Get the plant output based on the adjusted value.
plant = get_plant(intermediate_val)
plant = tf.constant([plant], dtype=tf.float64)
ref = tf.constant([ref], dtype=tf.float64)
# Calculate loss
loss_value = loss(ref, plant)
grads = tape.gradient(loss_value, model.trainable_weights)
optimizer.apply_gradients(zip(grads, model.trainable_weights))
err = ref - plant
if epoch % 50 == 0:
print("Epoch {:03d}: Loss: {:.3f}".format(epoch, loss_value))
fig, axes = plt.subplots(1, figsize=(12, 8))
fig.suptitle('Training Metrics')
axes[0].set_ylabel("Loss", fontsize=14)
axes[0].plot(train_loss_results)
plt.show()
For the moment I am just mocking the calls to Simulink by returning a random number. My problem is that when I take the model output, call the function that mocks a call to Simulink, and calculate my loss:
# Get the adjusted voltage from the network
intermediate_val = model(input)
# Get the plant output based on the adjusted value.
plant = get_plant(intermediate_val)
plant = tf.constant([plant], dtype=tf.float64)
ref = tf.constant([ref], dtype=tf.float64)
# Calculate loss
loss_value = loss(ref, plant)
I get the error ValueError: No gradients provided for any variable. I've figured out that if I pass the model's output directly to the loss function everything works fine. My question is how can I have the intermediate step of passing my model's output to another function and using the returned value to calculate loss?
A gradient exists between intermediate_val and model.trainable_variables because it is computed by TensorFlow and can be obtained by back-propagation. The tape, however, cannot back-propagate through plant, because plant was not calculated by TensorFlow; to the tape it is just a constant with no gradient.
Since the model knows nothing about the relation between the loss and how it is generated, this becomes a case of reinforcement learning, which can be done using the tensorflow-agents module.
There is a tutorial about it on YouTube, Everything You Need To Master Actor Critic Methods | Tensorflow 2 Tutorial; it is about a particular network architecture, but its gradient calculation method is exactly the same as in your case, and the code is easily adaptable.
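If pulling in tf-agents is more than you need, a minimal score-function (REINFORCE-style) sketch along the lines described above could look like this. It treats the plant output purely as a reward signal, so no gradient has to flow through the non-differentiable Simulink call; the Gaussian policy, the fixed standard deviation and the reward definition are my assumptions, and get_plant is the same random stand-in as in the question:
import numpy as np
import tensorflow as tf

def get_plant(voltage):
    # Stand-in for the Simulink call; TensorFlow cannot differentiate through it.
    return np.random.random()

# Hypothetical small policy network: input is [ref, pid], output is the mean of the voltage.
policy = tf.keras.Sequential([
    tf.keras.layers.Dense(4, activation="relu", input_shape=(2,)),
    tf.keras.layers.Dense(1),
])
optimizer = tf.keras.optimizers.Adam(learning_rate=0.01)
sigma = 0.1  # fixed exploration noise (assumption)

def reinforce_step(ref, pid):
    with tf.GradientTape() as tape:
        mean = policy(tf.constant([[ref, pid]], dtype=tf.float32))
        action = mean + sigma * tf.random.normal(tf.shape(mean))   # sample around the mean
        plant = get_plant(action.numpy().item())                   # external, non-differentiable
        reward = -(ref - plant) ** 2                               # closer to ref = higher reward
        # log-density of the sampled action under N(mean, sigma^2); gradient flows through mean only
        logp = -0.5 * tf.reduce_sum(tf.square((tf.stop_gradient(action) - mean) / sigma))
        loss = -reward * logp                                      # score-function estimator
    grads = tape.gradient(loss, policy.trainable_variables)
    optimizer.apply_gradients(zip(grads, policy.trainable_variables))
    return float(reward)

for _ in range(5):
    print(reinforce_step(ref=0.5, pid=0.3))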
I'm wondering why every new observation of my Pong gym environment takes so long to load onto my GPU.
I am attempting to train a policy gradient model to play Pong. I have included my entire code below for this model, but my question is about why it takes so long to load and process the matrices on CUDA. The Atari Pong gym environment cannot run on CUDA as far as I know, so I am converting NumPy arrays to PyTorch tensors at each reset observation and each step observation.
The observation of the Pong screen is 6400 pixels, or an 80 x 80 tensor.
The code below is my adaptation of this code, which is purely NumPy and Python. That NumPy version runs faster on my CPU than my code, which I attempted to rewrite in PyTorch to run on my GPU. I'm not posting to ask whether I adapted every part of the original code well; I am only asking why the tensors and model are so slow to move to and run on CUDA.
import torch
import torch.nn as nn
from torch.distributions.categorical import Categorical
from torch.optim import Adam
import numpy as np
import random
import time
import gym
from gym.spaces import Discrete, Box
device = torch.device("cuda:3" if torch.cuda.is_available() else "cpu")
# hyperparameters
H = 200 # number of hidden layer neurons
batch_size = 10 # every how many episodes to do a param update?
learning_rate = 1e-4
gamma = 0.99 # discount factor for reward
decay_rate = 0.99 # decay factor for RMSProp leaky sum of grad^2
resume = False # resume from previous checkpoint?
render = False
def mlp(sizes, activation=nn.Tanh, output_activation=nn.Identity):
# Build a feedforward neural network.
layers = []
for j in range(len(sizes)-1):
act = activation if j < len(sizes)-2 else output_activation
layers += [nn.Linear(sizes[j], sizes[j+1]), act()]
return nn.Sequential(*layers)
def reward_to_go(rews):
n = len(rews)
rtgs = np.zeros_like(rews)
for i in reversed(range(n)):
rtgs[i] = rews[i] + (rtgs[i+1] if i+1 < n else 0)
discounted_r = np.zeros_like(rtgs)
running_add = 0
for t in reversed(range(0, rtgs.size)):
if rtgs[t] != 0: running_add = 0 # reset the sum, since this was a game boundary (pong specific!)
running_add = running_add * gamma + rtgs[t]
discounted_r[t] = running_add
discounted_epr = discounted_r.copy()
discounted_epr -= np.mean(discounted_epr)
discounted_epr /= np.std(discounted_epr)
return discounted_epr
from ale_py import ALEInterface
ale = ALEInterface()
from ale_py.roms import Pong
ale.loadROM(Pong)
env = gym.make("ALE/Pong-v5")
lr=1e-2
epochs=50
batch_size=5000
render=False
# make environment, check spaces, get obs / act dims
assert isinstance(env.observation_space, Box), \
"This example only works for envs with continuous state spaces."
assert isinstance(env.action_space, Discrete), \
"This example only works for envs with discrete action spaces."
obs_dim = env.observation_space.shape[0]
n_acts = env.action_space.n
# make core of policy network
logits_net = mlp([6400, 3200, 1600, 2]).cuda(device)
def prepro(I):
I = I[35:195] # crop
I = I[::2,::2,0] # downsample by factor of 2
I[I == 144] = 0 # erase background (background type 1)
I[I == 109] = 0 # erase background (background type 2)
I[I != 0] = 1 # everything else (paddles, ball) just set to 1
return I.ravel()
# make function to compute action distribution
def get_policy(obs):
logits = logits_net(obs)
return Categorical(logits=logits)
# make action selection function (outputs int actions, sampled from policy)
def get_action(obs):
return get_policy(obs).sample().item()
# make loss function whose gradient, for the right data, is policy gradient
def compute_loss(obs, act, weights):
logp = get_policy(obs).log_prob(act)
return -(logp * weights).mean()
# make optimizer
optimizer = Adam(logits_net.parameters(), lr=lr)
# for training policy
def train_one_epoch():
prev_x = None
# make some empty lists for logging.
batch_obs = [] # for observations
batch_acts = [] # for actions
batch_weights = [] # for reward-to-go weighting in policy gradient
batch_rets = [] # for measuring episode returns
batch_lens = [] # for measuring episode lengths
# reset episode-specific variables
obs = env.reset() # first obs comes from starting distribution
done = False # signal from environment that episode is over
ep_rews = [] # list for rewards accrued throughout ep
# render first episode of each epoch
finished_rendering_this_epoch = False
# collect experience by acting in the environment with current policy
while True:
# preprocess the observation, set input to network to be difference image
cur_x = prepro(torch.as_tensor(obs, dtype=torch.float32, device=device))
x = cur_x - prev_x if prev_x is not None else np.zeros(6400)
prev_x = cur_x
act = get_action(torch.as_tensor(x, dtype=torch.float32, device=device))
# rendering
if (not finished_rendering_this_epoch) and render:
env.render()
# save obs
batch_obs.append(obs.copy())
# act in the environment
obs, rew, done, _ = env.step(act)
cur_x = prepro(torch.as_tensor(obs, dtype=torch.float32, device=device))
x = cur_x - prev_x
prev_x = cur_x
act = get_action(torch.as_tensor(x, dtype=torch.float32, device=device))
# save action, reward
batch_acts.append(act)
ep_rews.append(rew)
if done:
print("done one")
# if episode is over, record info about episode
ep_ret, ep_len = sum(ep_rews), len(ep_rews)
batch_rets.append(ep_ret)
batch_lens.append(ep_len)
# the weight for each logprob(a_t|s_t) is reward-to-go from t
batch_weights += list(reward_to_go(ep_rews))
# reset episode-specific variables
obs, done, ep_rews = env.reset(), False, []
# won't render again this epoch
finished_rendering_this_epoch = True
# end experience loop if we have enough of it
if len(batch_obs) > batch_size:
break
# take a single policy gradient update step
optimizer.zero_grad()
batch_loss = compute_loss(obs=torch.as_tensor(batch_obs, dtype=torch.float32),
act=torch.as_tensor(batch_acts, dtype=torch.int32),
weights=torch.as_tensor(batch_weights, dtype=torch.float32)
)
batch_loss.backward()
optimizer.step()
return batch_loss, batch_rets, batch_lens
# training loop
for i in range(1):
batch_loss, batch_rets, batch_lens = train_one_epoch()
print('epoch: %3d \t loss: %.3f \t return: %.3f \t ep_len: %.3f'%
(i, batch_loss, np.mean(batch_rets), np.mean(batch_lens)))
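One way to check whether the per-step host-to-device copies are really the bottleneck is a small timing sketch like the one below; the shapes and repetition counts are illustrative, not taken from the code above:
import time
import torch

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")

def time_transfer(shape, reps=1000):
    # Time copying a CPU tensor of the given shape to the device `reps` times.
    x = torch.zeros(shape)
    if device.type == "cuda":
        torch.cuda.synchronize()
    start = time.perf_counter()
    for _ in range(reps):
        _ = x.to(device)
    if device.type == "cuda":
        torch.cuda.synchronize()
    return time.perf_counter() - start

print("1000 single 80x80 frames:", time_transfer((80, 80)))                # many tiny copies
print("one batch of 1000 frames:", time_transfer((1000, 80, 80), reps=1))  # one large copy
Many tiny copies are dominated by per-call launch and synchronization overhead, which is why batching transfers usually helps far more than the raw data size suggests.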
I'm trying to implement and train a neural network using the JAX library and its little neural network submodule, "Stax". Since this library doesn't come with an implementation of binary cross entropy, I wrote my own:
def binary_cross_entropy(y_hat, y):
bce = y * jnp.log(y_hat) + (1 - y) * jnp.log(1 - y_hat)
return jnp.mean(-bce)
I implemented a simple neural network and trained it on MNIST, and started to get suspicious of some of the results I was getting. So I implemented the same setup in Keras, and I immediately got wildly different results! The same model, trained in the same way on the same data, was getting 90% training accuracy in Keras instead of around 50% in JAX. Eventually I tracked down part of the issue to my naive implementation of cross-entropy, which is supposedly numerically unstable. Following this post and this code I found, I wrote the following new version:
def binary_cross_entropy_stable(y_hat, y):
y_hat = jnp.clip(y_hat, 0.000001, 0.9999999)
logits = jnp.log(y_hat/(1 - y_hat))
max_logit = jnp.clip(logits, 0, None)
bces = logits - logits * y + max_logit + jnp.log(jnp.exp(-max_logit) + jnp.exp(-logits - max_logit))
return jnp.mean(bces)
This works a little better. Now my JAX implementation gets up to 80% train accuracy, but that's still a lot less than the 90% Keras gets. What I want to know is what is going on? Why are my two implementations not behaving the same way?
Below, I condensed my two implementations down to a single script. In this script, I implement the same model in JAX and in Keras. I initialize both with the same weights, and train them using full-batch gradient descent for 10 steps on 1000 datapoints from MNIST, the same data for each model. JAX finishes with 80% training accuracy, while Keras finishes with 90%. Specifically, I get this output:
Initial Keras accuracy: 0.4350000023841858
Initial JAX accuracy: 0.435
Final JAX accuracy: 0.792
Final Keras accuracy: 0.9089999794960022
JAX accuracy (Keras weights): 0.909
Keras accuracy (JAX weights): 0.7919999957084656
And actually, when I vary the conditions a little (using different random initial weights or a different training set), sometimes I get back the 50% JAX accuracy and 90% Keras accuracy.
I swap the weights at the end to verify that the weights obtained from training are indeed the issue, not something to do with the actual computation of the network predictions, or the way I calculate accuracy.
The code:
import numpy as np
import jax
from jax import jit, grad
from jax.experimental import stax, optimizers
import jax.numpy as jnp
import keras
import keras.datasets.mnist
def binary_cross_entropy(y_hat, y):
bce = y * jnp.log(y_hat) + (1 - y) * jnp.log(1 - y_hat)
return jnp.mean(-bce)
def binary_cross_entropy_stable(y_hat, y):
y_hat = jnp.clip(y_hat, 0.000001, 0.9999999)
logits = jnp.log(y_hat/(1 - y_hat))
max_logit = jnp.clip(logits, 0, None)
bces = logits - logits * y + max_logit + jnp.log(jnp.exp(-max_logit) + jnp.exp(-logits - max_logit))
return jnp.mean(bces)
def binary_accuracy(y_hat, y):
return jnp.mean((y_hat >= 1/2) == (y >= 1/2))
########################################
# #
# Create dataset #
# #
########################################
input_dimension = 784
(x_train, y_train), (x_test, y_test) = keras.datasets.mnist.load_data(path="mnist.npz")
xs = np.concatenate([x_train, x_test])
xs = xs.reshape((70000, 784))
ys = np.concatenate([y_train, y_test])
ys = (ys >= 5).astype(np.float32)
ys = ys.reshape((70000, 1))
train_xs = xs[:1000]
train_ys = ys[:1000]
########################################
# #
# Create JAX model #
# #
########################################
jax_initializer, jax_model = stax.serial(
stax.Dense(1000),
stax.Relu,
stax.Dense(1),
stax.Sigmoid
)
rng_key = jax.random.PRNGKey(0)
_, initial_jax_weights = jax_initializer(rng_key, (1, input_dimension))
########################################
# #
# Create Keras model #
# #
########################################
initial_keras_weights = [*initial_jax_weights[0], *initial_jax_weights[2]]
keras_model = keras.Sequential([
keras.layers.Dense(1000, activation="relu"),
keras.layers.Dense(1, activation="sigmoid")
])
keras_model.compile(
optimizer=keras.optimizers.SGD(learning_rate=0.01),
loss=keras.losses.binary_crossentropy,
metrics=["accuracy"]
)
keras_model.build(input_shape=(1, input_dimension))
keras_model.set_weights(initial_keras_weights)
if __name__ == "__main__":
########################################
# #
# Compare untrained models #
# #
########################################
initial_keras_predictions = keras_model.predict(train_xs, verbose=0)
initial_jax_predictions = jax_model(initial_jax_weights, train_xs)
_, keras_initial_accuracy = keras_model.evaluate(train_xs, train_ys, verbose=0)
jax_initial_accuracy = binary_accuracy(jax_model(initial_jax_weights, train_xs), train_ys)
print("Initial Keras accuracy:", keras_initial_accuracy)
print("Initial JAX accuracy:", jax_initial_accuracy)
########################################
# #
# Train JAX model #
# #
########################################
L = jit(binary_cross_entropy_stable)
gradL = jit(grad(lambda w, x, y: L(jax_model(w, x), y)))
opt_init, opt_apply, get_params = optimizers.sgd(0.01)
network_state = opt_init(initial_jax_weights)
for _ in range(10):
wT = get_params(network_state)
gradient = gradL(wT, train_xs, train_ys)
network_state = opt_apply(
0,
gradient,
network_state
)
final_jax_weights = get_params(network_state)
final_jax_training_predictions = jax_model(final_jax_weights, train_xs)
final_jax_accuracy = binary_accuracy(final_jax_training_predictions, train_ys)
print("Final JAX accuracy:", final_jax_accuracy)
########################################
# #
# Train Keras model #
# #
########################################
for _ in range(10):
keras_model.fit(
train_xs,
train_ys,
epochs=1,
batch_size=1000,
verbose=0
)
final_keras_loss, final_keras_accuracy = keras_model.evaluate(train_xs, train_ys, verbose=0)
print("Final Keras accuracy:", final_keras_accuracy)
########################################
# #
# Swap weights #
# #
########################################
final_keras_weights = keras_model.get_weights()
final_keras_weights_in_jax_format = [
(final_keras_weights[0], final_keras_weights[1]),
tuple(),
(final_keras_weights[2], final_keras_weights[3]),
tuple()
]
jax_accuracy_with_keras_weights = binary_accuracy(
jax_model(final_keras_weights_in_jax_format, train_xs),
train_ys
)
print("JAX accuracy (Keras weights):", jax_accuracy_with_keras_weights)
final_jax_weights_in_keras_format = [*final_jax_weights[0], *final_jax_weights[2]]
keras_model.set_weights(final_jax_weights_in_keras_format)
_, keras_accuracy_with_jax_weights = keras_model.evaluate(train_xs, train_ys, verbose=0)
print("Keras accuracy (JAX weights):", keras_accuracy_with_jax_weights)
Try changing the PRNG seed at line 57 to a value other than 0 to run the experiment using different initial weights.
Your binary_cross_entropy_stable function does not match the output of keras.losses.binary_crossentropy; for example:
x = np.random.rand(10)
y = np.random.rand(10)
print(keras.losses.binary_crossentropy(x, y))
# tf.Tensor(0.8134677734043875, shape=(), dtype=float64)
print(binary_cross_entropy_stable(x, y))
# 0.9781515
That is where I would start if you're trying to exactly duplicate the model.
You can view the source of the keras loss function here: keras/losses.py#L1765-L1810, with the main part of the implementation here: keras/backend.py#L4972-L5017
One detail: it appears that with a sigmoid activation function, Keras re-uses some cached logits to compute the binary cross entropy while avoiding problematic values: keras/backend.py#L4988-L4997. I'm not sure how to easily replicate that behavior using JAX & stax.
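One way around that last point, though it is my own workaround rather than a stax feature, is to drop the final stax.Sigmoid from the model so that it outputs raw logits, and compute the loss directly from those logits. A sketch of that loss (apply the sigmoid separately only when you need probabilities or accuracy):
import jax.numpy as jnp

def bce_from_logits(logits, y):
    # Numerically stable binary cross entropy from raw logits z:
    # max(z, 0) - z*y + log(1 + exp(-|z|)) equals -[y*log(sigmoid(z)) + (1-y)*log(1-sigmoid(z))].
    return jnp.mean(jnp.maximum(logits, 0) - logits * y
                    + jnp.log1p(jnp.exp(-jnp.abs(logits))))

def sigmoid(logits):
    # Only needed for predictions/accuracy, not for the loss itself.
    return 1.0 / (1.0 + jnp.exp(-logits))
This avoids ever clipping probabilities, which is essentially what the cached-logits path in the Keras backend is doing.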
I am currently trying to create a simple ANN learning environment for reinforcement learning. I have already done function fitting with a neural network, substituting a neural network for a physical model. Now I would like to create a simple reinforcement learning model out of curiosity.
To create this model, I thought it would be a good option to manipulate the loss function so that it does not compute the difference between expectation and model output, but instead runs a simple simulation for a few rounds and computes how many points the model earns for a specific target. In the example code below the model controls a simple mass damper system that starts with a random displacement and speed, and the model can exert a force on it. The points are based on the distance from equilibrium, and at the end I invert the points by dividing one by the amount of points earned. I am not sure if this is the right approach, but I wanted to try it anyway for the sake of learning. Now I get the error message No gradients provided for any variable, and I am not sure how to solve it.
Here is my code:
import time
import numpy as np
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras.models import Model
from tensorflow.keras.optimizers import SGD
from tensorflow.keras.layers import Input, Dense, Conv2D, Reshape,concatenate, Flatten, UpSampling2D, AveragePooling2D,LayerNormalization
import random
#Physical Parameters
m = 1 #kg
k = 1 #N/m
c = 0.01
dt = 0.01
opt = keras.optimizers.Adam(learning_rate=0.01)
def getnewstate(u,v,f):
#Calculate new state of mass spring damper system
a = (f-v*c-k*u)/m
v = v+a*dt
u = u+v*dt
return (u,v)
def generatemodel():
#Generate simple keras model
kernel_initializer=tf.keras.initializers.RandomNormal(stddev=0.01)
bias_initializer=tf.keras.initializers.Zeros()
InputLayer = Input(shape=(2,))
Outputlayer = Dense(1,activation='linear')(InputLayer)
model = Model(inputs=InputLayer, outputs=Outputlayer)
return model
def lossfunction(u,v,model):
#Custom loss function
loss = 0;
t = 0;
t_last = 0;
#do for 100 timesteps (to see if it runs at all)
for j in range(100):
x = [];
x.append(np.array([u,v]))
x = np.array(x)
f=model(x)
f=f.numpy()[0][0]
(u,v) = getnewstate(u,v,f)
points = 1000/(abs(u)+1)
loss=loss+1/points
t += dt;
return(loss)
def dotraining(model):
#training loop
for epoch in range(100):
print("\nStart of epoch %d" % (epoch,))
start_time = time.time()
loss_value = 0;
# Iterate over the batches of the dataset.
for step in range(100):
with tf.GradientTape() as tape:
loss_value=[]
for i in range(10):
#Randomize Starting Condition
u = random.random()-0.5;
v = random.random()-0.5;
x = [];
x.append(np.array([u,v]))
x = np.array(x)
#feed model
logits = model(x, training=True)
#calculate loss
loss_value.append(lossfunction(u,v,model))
print(step)
print(loss_value)
loss = loss_value
loss = tf.convert_to_tensor(loss)
grads = tape.gradient(loss, model.trainable_weights)
opt.apply_gradients(zip(grads, model.trainable_weights))
# Log every 200 batches.
if step % 200 == 0:
print(
"Training loss (for one batch) at step %d: %.4f"
% (step, float(loss_value))
)
print("Seen so far: %d samples" % ((step + 1) * 64))
print("Time taken: %.2fs" % (time.time() - start_time))
model=generatemodel()
x = []
x.append(np.array([1.0,2.0]))
print(np.shape(x))
f=model(np.array(x))
dotraining(model)
The problem is that when you cast f to NumPy here:
f = f.numpy()[0][0]
it stops being a tensor, and TensorFlow no longer tracks its gradient.
For TensorFlow to compute a gradient, you must get from the inputs to the loss using only tensor operations.
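A rough sketch of what that could look like for the mass damper loop above; the physical update and the 1000/(|u|+1) scoring mirror the question's code, but the single-step training structure is my own simplification:
import tensorflow as tf

m, k, c, dt = 1.0, 1.0, 0.01, 0.01

def simulate_loss(u, v, model, steps=100):
    # u and v are scalar tensors; every operation below stays inside TensorFlow,
    # so GradientTape can differentiate the loss with respect to the model weights.
    loss = tf.constant(0.0)
    for _ in range(steps):
        f = model(tf.reshape(tf.stack([u, v]), (1, 2)))[0, 0]  # keep f as a tensor, no .numpy()
        a = (f - v * c - k * u) / m
        v = v + a * dt
        u = u + v * dt
        points = 1000.0 / (tf.abs(u) + 1.0)
        loss = loss + 1.0 / points
    return loss

model = tf.keras.Sequential([tf.keras.layers.Dense(1, activation="linear", input_shape=(2,))])
opt = tf.keras.optimizers.Adam(learning_rate=0.01)

with tf.GradientTape() as tape:
    loss = simulate_loss(tf.constant(0.3), tf.constant(-0.2), model)
grads = tape.gradient(loss, model.trainable_weights)
opt.apply_gradients(zip(grads, model.trainable_weights))
print(float(loss))
Because the whole simulation is expressed in TF ops, grads is no longer None and the optimizer step works; the trade-off is that the 100-step loop is unrolled into the graph, so long simulations become expensive to differentiate.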