I'm training a NN and would like to save the model weights every N epochs for a prediction phase. I propose this draft code, which is inspired by @grovina's response here. Could you please make suggestions?
Thanks in advance.
from keras.callbacks import Callback
class WeightsSaver(Callback):
    """Keras callback that periodically writes the model's weights to disk.

    An internal counter is advanced on every hook invocation; whenever the
    counter is a multiple of ``N`` the weights are saved to
    ``weights<counter>.h5`` (counter zero-padded to 8 digits).
    """

    def __init__(self, model, N):
        self.model = model
        self.N = N
        self.epoch = 0

    def on_batch_end(self, epoch, logs={}):
        # This hook fires at the end of each batch, so the counter below
        # advances once per batch (the ``epoch`` argument is not used).
        is_save_tick = self.epoch % self.N == 0
        if is_save_tick:
            self.model.save_weights('weights%08d.h5' % self.epoch)
        self.epoch += 1
Then add it to the fit call, e.g. to save weights every 5 epochs:
# Attach the saver (interval of 5) to the training run.
weights_saver = WeightsSaver(model, 5)
model.fit(X_train, Y_train, callbacks=[weights_saver])
You shouldn't need to pass the model to the callback. It already has access to the model through its base class (Callback). So remove the model argument from __init__(..., model, ...) and the self.model = model assignment. You should still be able to access the current model via self.model. You are also saving on every batch end, which is probably not what you want; you most likely want on_epoch_end instead.
In any case, what you are doing can be done with the native ModelCheckpoint callback, so you don't need to write a custom one. You can use it as follows:
# Built-in checkpoint callback: stores weights only, with the epoch number
# zero-padded to 8 digits in the file name.
checkpoint_options = dict(save_weights_only=True, period=5)
mc = keras.callbacks.ModelCheckpoint('weights{epoch:08d}.h5', **checkpoint_options)
model.fit(X_train, Y_train, callbacks=[mc])
You should implement on_epoch_end rather than on_batch_end. Also, passing the model as an argument to __init__ is redundant.
from keras.callbacks import Callback
class WeightsSaver(Callback):
    """Keras callback that saves the model's weights every ``N`` epochs.

    The callback keeps its own epoch counter, starting at 0. At the end of
    an epoch, if the counter is a multiple of ``N``, the weights are written
    to ``weights<counter>.h5`` (counter zero-padded to 8 digits); the counter
    is then advanced. The first save therefore happens at the end of the
    first epoch (counter 0).

    Args:
        N: Number of epochs between two consecutive saves.
    """

    def __init__(self, N):
        # Let the base class set up its own state before adding ours.
        super().__init__()
        self.N = N
        self.epoch = 0

    def on_epoch_end(self, epoch, logs=None):
        """Save the weights if this epoch falls on the save interval.

        Args:
            epoch: Epoch index supplied by the caller (not used; the
                callback relies on its own counter).
            logs: Optional metrics dictionary supplied by the caller
                (not used).
        """
        if self.epoch % self.N == 0:
            name = 'weights%08d.h5' % self.epoch
            # ``self.model`` is not assigned in this class; it comes from
            # the Callback base class.
            self.model.save_weights(name)
        self.epoch += 1
Related
I am training a stable_baselines3 PPO agent and want to perform some task on every step. To do this, I'm using a callback CustomCallback with _on_step method defined.
But it appears that _on_step is called only on every PPO.n_steps, so if n_steps param is 1024, then CustomCallback._on_step appears to be called only on every 1024 steps.
How can you do something on every single step, instead of only once every PPO.n_steps steps?
from stable_baselines3 import PPO
from stable_baselines3.common.env_util import make_vec_env
from stable_baselines3.common.callbacks import BaseCallback
class CustomCallback(BaseCallback):
    """Callback that acts once every ``freq`` calls of ``_on_step``."""

    def __init__(self, freq, verbose=0):
        super().__init__(verbose)
        self.freq = freq

    def _on_step(self):
        # ``n_calls`` is not defined in this class — presumably maintained
        # by BaseCallback; verify against the library.
        is_due = self.n_calls % self.freq == 0
        if is_due:
            print('do something')
        return True
# Single CartPole environment, PPO with a 1024-step rollout setting.
env = make_vec_env("CartPole-v1", n_envs=1)
model = PPO("MlpPolicy", env, n_steps=1024)

# Train with the custom callback configured to act every 123 calls.
step_callback = CustomCallback(freq=123)
model.learn(total_timesteps=25000, callback=step_callback)
class BinaryTruePositives(tf.keras.metrics.Metric):
    """Metric that accumulates the per-update fraction of matching labels.

    On each update the predictions are passed through ``tf.sign`` and
    flattened, compared element-wise with the squeezed labels, and the mean
    of that comparison is added to a running scalar.
    """

    def __init__(self, name='binary_true_positives', **kwargs):
        super().__init__(name=name, **kwargs)
        self.true_positives = self.add_weight(name='tp', initializer='zeros')

    def update_state(self, y_true, y_pred, sample_weight=None):
        # ``sample_weight`` is accepted but not used.
        labels = tf.squeeze(y_true)
        signs = tf.reshape(tf.sign(y_pred), [-1])
        matches = tf.keras.backend.equal(labels, signs)
        self.true_positives.assign_add(tf.keras.backend.mean(matches))

    def result(self):
        # The accumulated value is returned as-is (no normalisation here).
        return self.true_positives

    def reset_states(self):
        self.true_positives.assign(0)
def model_fn():
    """Create a fresh Keras model and wrap it with ``tff.learning.from_keras_model``."""
    keras_model = create_keras_model()
    wrapper_options = dict(
        input_spec=preprocessed_example_dataset.element_spec,
        # ``tf.keras.losses.MSE`` is a plain loss function, not an instance
        # of ``tf.keras.losses.Loss``.
        loss=tf.keras.losses.MSE,
        metrics=[BinaryTruePositives()],
    )
    return tff.learning.from_keras_model(keras_model, **wrapper_options)
TypeError: Expected tensorflow.python.keras.losses.Loss or collections.abc.Sequence, found function.
Some more of the stacktrace might be useful here, but I believe the issue in the code above is the fact that tf.keras.losses.MSE is a function defining the loss logic, rather than an instance of tf.keras.losses.Loss itself.
Looking at an old version of TFF, it seems you are hitting this line, though note that you'd get a different error with a newer version of TFF (I believe you'd hit this line instead).
You can fix this by passing
loss=tf.keras.losses.MeanSquaredError()
instead of the existing loss argument in your model_fn above.
I'm working on an early stopping class that I want to monitor the progress of validation loss, save the best model so far, and stop training when the validation loss stops decreasing.
I implemented two classes: a monitor class that does the monitoring, and a checkpoint class that handles saving/loading.
The thing is that when I want to save a model I have this function in the checkpoint class:
@staticmethod
def save_cp(epoch, model, optimizer, val_loss, cp_path):
    """Write a training checkpoint to ``cp_path`` using ``torch.save``.

    Args:
        epoch: Epoch value to record in the checkpoint.
        model: Object whose ``state_dict()`` is stored under
            ``'model_state_dict'``.
        optimizer: Object whose ``state_dict()`` is stored under
            ``'optimizer_state_dict'``.
        val_loss: Validation loss to record in the checkpoint.
        cp_path: Destination handed straight to ``torch.save``.
    """
    checkpoint = {
        'epoch': epoch,
        'model_state_dict': model.state_dict(),
        'optimizer_state_dict': optimizer.state_dict(),
        'val_loss': val_loss,
    }
    torch.save(checkpoint, cp_path)
But when I want to monitor I do it with this function in the monitor class:
def __call__(self, validation_loss):
    """Update the monitor's state with a new validation loss.

    A loss strictly below the best seen so far is recorded as the new best
    and flags that a checkpoint should be saved. Any other loss increments
    the counter, and once the counter reaches ``tolerance`` the stop flag
    is raised.
    """
    improved = validation_loss < self.min_val_loss
    if improved:
        # New best loss: signal that a checkpoint should be written.
        self.should_save_cp = True
        self.min_val_loss = validation_loss
        return

    self.count += 1
    if self.count >= self.tolerance:
        self.should_stop = True
I used a boolean member to indicate whether the current model needs saving, and I thought about setting it back to False after calling the save method during training, but that seems rather inelegant to me. What am I missing?
Thanks!
I would like to define a network that comprises many templates. Below under Network Definitions is a simplified example where the first network definition is used as a template in the second one. This doesn't work - when I initialise my optimiser it says that the network parameters are empty!
How should I do this properly? The network that I ultimately want is very complicated.
Main Function
if __name__ == "__main__":
    # NOTE(review): Network.__init__ is declared with an ``nNets`` parameter
    # but none is passed here — confirm the intended value.
    myNet = Network()
    myNet = myNet.cuda().train()
    optimizer = optim.SGD(myNet.parameters(), lr=0.01, momentum=0.9)
Network definitions:
class NetworkTemplate(nn.Module):
    """Small building block: a bias-free 1x1 convolution (1 -> 3 channels)
    followed by batch normalisation over the 3 output channels."""

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 3, kernel_size=1, bias=False)
        self.bn1 = nn.BatchNorm2d(3)

    def forward(self, x):
        # Convolution first, then normalisation of its output.
        return self.bn1(self.conv1(x))
class Network(nn.Module):
    """Chain of ``nNets`` ``NetworkTemplate`` instances applied one after another.

    The templates are kept in a plain Python list on ``self.nets``.
    """

    def __init__(self, nNets):
        super().__init__()
        self.nets = [NetworkTemplate() for _ in range(nNets)]

    def forward(self, x):
        # Feed the output of each template into the next one.
        out = x
        for template in self.nets:
            out = template(out)
        return out
Just use torch.nn.Sequential? Like self.nets=torch.nn.Sequential(*self.nets) after you populated self.nets and then call return self.nets(x) in your forward function?
If you want to do something more complicated, you can put all networks into torch.nn.ModuleList, however you'll need to manually take care of calling them in your forward method in that case (but it can be more complicated than just sequential).
I have subclassed RNNCell as the building block of my RNN. I put an instance of this object into tf.dynamic_rnn and then I define a prediction function in my Agent class:
class Agent():
    """Agent whose predictions come from running a custom RNN cell."""

    def __init__(self):
        ...

    def predictions(self):
        """Build the RNN from a fresh ``RNNCell`` and return its outputs."""
        rnn_cell = RNNCell()
        rnn_output, _final_state = tf.dynamic_rnn(rnn_cell, inputs = ...)
        return rnn_output
Everything works fine, but how do I add a histogram for the layers now? I've tried to do it in the RNNCell but it doesn't work:
class RNNCell(tf.nn.rnn_cell.RNNCell):
    """Custom RNN cell that records a histogram summary of its first layer.

    ``histogram1`` is ``None`` until ``__call__`` has run; after that it
    holds the summary op created for the first convolution's output.
    """

    def __init__(self):
        super(RNNCell, self).__init__()
        self._output_size = 15
        self._state_size = 15
        # Filled in by __call__ once the layer has been built.
        self._histogram1 = None

    def __call__(self, X, state):
        network = tflearn.layers.conv_2d(X, 5, [1, 3], activation='relu', weights_init=tflearn.initializations.variance_scaling(), padding="valid")
        self._histogram1 = tf.summary.histogram("layer1_hist_summary", network)
        ...

    @property
    def histogram1(self):
        """Histogram summary stored by the most recent ``__call__``, or ``None``."""
        return self._histogram1
and then
class Agent():
    """Agent that exposes the cell's histogram summary next to its predictions."""

    def __init__(self):
        ...

    def predictions(self):
        """Build the RNN from a fresh ``RNNCell`` and return its outputs."""
        rnn_cell = RNNCell()
        # The cell's histogram1 is read here, before the cell is handed to
        # tf.dynamic_rnn.
        self.histogram1 = rnn_cell.histogram1
        rnn_output, _final_state = tf.dynamic_rnn(rnn_cell, inputs = ...)
        return rnn_output
Later when I run sess.run(agent.histogram1, feed_dict=...) I get the error TypeError: Fetch argument None has invalid type <class 'NoneType'>
I think the problem is that the value of Agent's self.histogram1 never got updated to reflect that summary assigned in RNNCell.
Your code for the Agent predictions() method initializes Agent's histogram1 value to None here:
cell = RNNCell()  # invokes __init__(), so RNNCell's histogram1 is now None
self.histogram1 = cell.histogram1
When RNNCell's __call__() method is invoked, it updates the RNNCell's value of histogram1
self._histogram1 = tf.summary.histogram("layer1_hist_summary", network)
But the Agent's copy of histogram1 was apparently not updated, so when the call is made:
sess.run(agent.histogram1, feed_dict=...)
agent.histogram1 is still None.
I don't see in the posted code where the summaries were merged before training, so the missing step is likely in unposted code somewhere.