class Seq2Seq(keras.Model):
    def __init__(self, enc_v_dim, dec_v_dim, emb_dim, units, attention_layer_size, max_pred_len, start_token,
                 end_token):
        super().__init__()
        self.units = units
        ...

    def encode(self, x):
        o = self.enc_embeddings(x)
        init_s = [tf.zeros((x.shape[0], self.units)), tf.zeros((x.shape[0], self.units))]
        ...
        return s

    def inference(self, x, return_align=False):
        ...
        return pred_id

    def train_logits(self, x, y, seq_len):
        ...
        return logits

    def step(self, x, y, seq_len):
        with tf.GradientTape() as tape:
            logits = self.train_logits(x, y, seq_len)
            dec_out = y[:, 1:]  # ignore <GO>
            loss = self.cross_entropy(dec_out, logits)
        grads = tape.gradient(loss, self.trainable_variables)
        self.opt.apply_gradients(zip(grads, self.trainable_variables))
        return loss
I found a model for natural language processing that works well and trains well, but I don't know how to save it. Its structure is different from the typical models I've seen, and I can't call functions like build or compile. How can I save such a model?
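A minimal sketch of one common approach, assuming the constructor arguments and a sample training batch sample_x are still available (subclassed models create their variables lazily, so run one forward pass before saving or restoring):

import tensorflow as tf

# Run one forward pass so all variables exist, then save only the weights;
# the architecture itself is re-created from the class.
_ = model.inference(sample_x)
model.save_weights("seq2seq_ckpt")

# To restore: rebuild with the same constructor arguments, build the
# variables with another forward pass, then load.
new_model = Seq2Seq(enc_v_dim, dec_v_dim, emb_dim, units,
                    attention_layer_size, max_pred_len,
                    start_token, end_token)
_ = new_model.inference(sample_x)
new_model.load_weights("seq2seq_ckpt")

tf.train.Checkpoint is an alternative if you also want to track the optimizer state alongside the weights.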
Related
I want to customize a TensorFlow model. I need a custom training algorithm like the one below, but I don't want my model to live inside the custom model class, just the training algorithm.
class CustomModel(keras.Model):
    def __init__(self, inputs, outputs, echo=False):
        super().__init__()
        self.echo = echo

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self(x, training=True)
            loss = self.compiled_loss(y, y_pred, regularization_losses=self.losses)
            print(loss)
            if self.echo:
                print('*')
        trainable_vars = self.trainable_variables
        gradients = tape.gradient(loss, trainable_vars)
        self.optimizer.apply_gradients(zip(gradients, trainable_vars))
        self.compiled_metrics.update_state(y, y_pred)
        return {m.name: m.result() for m in self.metrics}
inputs = keras.Input(shape=(224, 224, 3))
x = keras.layers.Conv2D(32, (3, 3))(inputs)
x = keras.layers.Conv2D(64, 3)(x)
x = keras.layers.Conv2D(64, 3)(x)
x = keras.layers.AveragePooling2D()(x)
x = keras.layers.Flatten()(x)
x = keras.layers.Dense(64, activation='relu')(x)
x = keras.layers.Dense(3, activation='softmax')(x)

model = CustomModel(inputs, x, echo=True)
opt = Adam(learning_rate=0.0001)
model.compile(optimizer=opt, loss='categorical_crossentropy', metrics=['accuracy'])

epochs = 5
history = model.fit_generator(train_generator,
                              validation_data=valid_generator, verbose=1, epochs=epochs)
error:
NotImplementedError: When subclassing the `Model` class, you should implement a `call` method.
You don't need to provide the (inputs, outputs) arguments in the __init__ of your subclassed model. Instead, implement a call method in your subclass as follows:
class CustomModel(keras.Model):
    ...

    # A `call` method needs to be implemented
    def call(self, inputs, *args, **kwargs):
        return self(inputs)
Update

Based on the comments, here's a possible workaround: build the model from the provided inputs/outputs inside __init__.
class CustomModel(keras.Model):
    def __init__(self, inputs, x, echo=False, **kwargs):
        super().__init__(**kwargs)
        self.model = keras.Model(inputs, x)
        self.echo = echo

    def call(self, inputs, *args, **kwargs):
        return self.model(inputs)

    def train_step(self, data):
        x, y = data
        with tf.GradientTape() as tape:
            y_pred = self.model(x, training=True)  # Forward pass
            ...
        # Compute gradients
        trainable_vars = self.model.trainable_variables
        gradients = ...
        # Update weights
        ...
        return {m.name: m.result() for m in self.metrics}
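Assuming the elided parts of train_step are filled in from the full version above, a quick smoke test on random data (shapes taken from the example) confirms the wrapper trains like a normal Keras model:

import numpy as np

model = CustomModel(inputs, x, echo=True)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])

# Tiny random batch, just to verify the custom train_step runs end to end.
dummy_x = np.random.rand(4, 224, 224, 3).astype("float32")
dummy_y = np.random.rand(4, 3).astype("float32")
model.fit(dummy_x, dummy_y, epochs=1, verbose=0)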
I am running AlexNet on the CIFAR-10 dataset using PyTorch Lightning. Here is my model:
class SelfSupervisedModel(pl.LightningModule):
    def __init__(self, hparams=None, num_classes=10, batch_size=128):
        super(SelfSupervisedModel, self).__init__()
        self.batch_size = batch_size
        self.loss_fn = nn.CrossEntropyLoss()
        self.hparams["lr"] = ModelHelper.Hyperparam.Learning_rate
        self.model = torchvision.models.alexnet(pretrained=False)

    def forward(self, x):
        return self.model(x)

    def training_step(self, train_batch, batch_idx):
        inputs, targets = train_batch
        predictions = self(inputs)
        loss = self.loss_fn(predictions, targets)
        return {'loss': loss}

    def validation_step(self, test_batch, batch_idx):
        inputs, targets = test_batch
        predictions = self(inputs)
        val_loss = self.loss_fn(predictions, targets)
        _, preds = torch.max(predictions, 1)
        acc = torch.sum(preds == targets.data) / (targets.shape[0] * 1.0)
        return {'val_loss': val_loss, 'val_acc': acc, 'target': targets, 'preds': predictions}

    def validation_epoch_end(self, outputs):
        avg_loss = torch.stack([x['val_loss'] for x in outputs]).mean()
        avg_acc = torch.stack([x['val_acc'].float() for x in outputs]).mean()
        logs = {'val_loss': avg_loss, 'val_acc': avg_acc}
        print(f'validation_epoch_end logs => {logs}')
        OutputMatrix.predictions = torch.cat([tmp['preds'] for tmp in outputs])
        OutputMatrix.targets = torch.cat([tmp['target'] for tmp in outputs])
        return {'progress_bar': logs}

    def configure_optimizers(self):
        return torch.optim.SGD(self.parameters(), lr=self.hparams["lr"], momentum=0.9)
I am storing the predicted and true values in OutputMatrix.predictions and OutputMatrix.targets, which I then use to generate a confusion matrix. I'm fairly sure this should not be the output, but I can't find where the mistake is. Any help would be appreciated.
I would suggest using TorchMetrics and the internal log method, so the code could look like:
class MyModule(LightningModule):
    def __init__(self):
        ...
        self.train_acc = torchmetrics.Accuracy()
        self.valid_acc = torchmetrics.Accuracy()

    def training_step(self, batch, batch_idx):
        x, y = batch
        preds = self(x)
        ...
        self.train_acc(preds, y)
        self.log('train_acc', self.train_acc, on_step=True, on_epoch=False)

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        ...
        self.valid_acc(logits, y)
        self.log('valid_acc', self.valid_acc, on_step=True, on_epoch=True)
as you can also find in the TorchMetrics docs on PyTorch Lightning integration.
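Since the end goal here is a confusion matrix, TorchMetrics can also accumulate that directly. A minimal standalone sketch (newer TorchMetrics versions require the task argument; note the argmax, since the matrix expects class ids rather than raw logits):

import torch
import torchmetrics

confmat = torchmetrics.ConfusionMatrix(task="multiclass", num_classes=10)

logits = torch.randn(8, 10)           # stand-in for model outputs
targets = torch.randint(0, 10, (8,))  # stand-in for labels

preds = torch.argmax(logits, dim=1)   # class ids, not raw logits
confmat.update(preds, targets)
print(confmat.compute())              # 10x10 matrix of counts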
I am using TensorFlow 2.0 and trying to evaluate gradients for backpropagation through a simple feedforward neural network. Here is what my model looks like:
def __init__(self, input_size, output_size):
    inputs = tf.keras.Input(shape=(input_size,))
    hidden_layer1 = tf.keras.layers.Dense(30, activation='relu')(inputs)
    outputs = tf.keras.layers.Dense(output_size)(hidden_layer1)

    self.model = tf.keras.Model(inputs=inputs, outputs=outputs)
    self.optimizer = tf.keras.optimizers.Adam(learning_rate=0.001)
    self.loss_function = tf.keras.losses.Huber()
The forward pass through this network is fine, but when I use a gradient tape to train the model, it is at least 10x slower than PyTorch.
Training function:
def learn_modified_x(self, inputs, targets, actions):
    with tf.GradientTape() as tape:
        predictions = self.model(inputs)
        predictions_for_action = gather_single_along_axis(predictions, actions)
        loss = self.loss_function(targets, predictions_for_action)
    grads = tape.gradient(loss, self.model.trainable_weights)
    self.optimizer.apply_gradients(zip(grads, self.model.trainable_weights))
I tried commenting out lines to find what was actually causing the problem, and discovered that tape.gradient is a significant contributor. Any ideas?
PyTorch implementation
def __init__(self, input_size, nb_action):
    super(Network, self).__init__()
    self.input_size = input_size
    self.nb_action = nb_action
    self.fc1 = nn.Linear(input_size, 30)
    self.fc2 = nn.Linear(30, nb_action)

def forward(self, state):
    x = F.relu(self.fc1(state))
    q_values = self.fc2(x)
    return q_values

def learn(self, batch_state, batch_next_state, batch_reward, batch_action):
    outputs = self.model(batch_state).gather(1, batch_action.unsqueeze(1)).squeeze(1)
    next_outputs = self.model(batch_next_state).detach().max(1)[0]
    target = self.gamma * next_outputs + batch_reward
    td_loss = F.smooth_l1_loss(outputs, target)
    self.optimizer.zero_grad()
    td_loss.backward(retain_variables=True)
    self.optimizer.step()
You need to use tf.function to wrap your model's call function:

def __init__(self, ...):
    ...
    self.model.call = tf.function(self.model.call)
    ...
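A sketch of the same idea applied more broadly (hypothetical names, not the poster's exact code): compiling the entire training step usually helps most, since the forward pass, tape.gradient, and the optimizer update then all run as a single graph instead of op-by-op in eager mode.

import tensorflow as tf

@tf.function
def train_step(model, optimizer, loss_function, inputs, targets):
    with tf.GradientTape() as tape:
        predictions = model(inputs, training=True)
        loss = loss_function(targets, predictions)
    grads = tape.gradient(loss, model.trainable_weights)
    optimizer.apply_gradients(zip(grads, model.trainable_weights))
    return loss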
I have been trying to implement the paper SeER: An Explainable Deep Learning MIDI-based Hybrid Song Recommender System. Here is what I have been doing:

Model code:
class HybridFactorization(tf.keras.layers.Layer):
    # embedding_size is also the number of LSTM units
    # num_users, num_tracks = input_shape
    # required_users: (batch_size, embedding_size)
    # songs_output: (batch_size, embedding_size)
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridFactorization, self).__init__()
        self.embedding_size = embedding_size
        self.num_users = num_users
        self.num_tracks = num_tracks
        self.required_users = None
        self.U = self.add_weight("U",
                                 shape=[self.num_users, self.embedding_size],
                                 dtype=tf.float32,
                                 initializer=tf.initializers.GlorotUniform())
        self.lstm = tf.keras.layers.LSTM(self.embedding_size)

    def call(self, user_index, songs_batch):
        output_lstm = self.lstm(songs_batch)
        self.required_users = self.U.numpy()
        self.required_users = tf.convert_to_tensor(self.required_users[np.array(user_index)],
                                                   dtype=tf.float32)
        return tf.matmul(self.required_users, output_lstm, transpose_b=True)


class HybridRecommender(tf.keras.Model):
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridRecommender, self).__init__()
        self.HybridFactorization = HybridFactorization(embedding_size,
                                                       num_users, num_tracks)

    def call(self, user_index, songs_batch):
        output = self.HybridFactorization(user_index, songs_batch)
        return output
Utility functions and running the model:

def loss_fn(source, target):
    mse = tf.keras.losses.MeanSquaredError()
    return mse(source, target)

model = HybridRecommender(EMBEDDING_SIZE, num_users, num_tracks)
Xhat = model(user_index, songs_batch)
tf.keras.backend.clear_session()
optimizer = tf.keras.optimizers.Adam()
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch, (input_batch, target_batch)) in enumerate(train_dataset):
        songs_batch = create_songs_batch(input_batch)
        user_index = input_batch[:, 0].numpy()
        X = create_pivot_batch(input_batch, target_batch)
        with tf.GradientTape() as tape:
            Xhat = model(user_index, songs_batch)
            batch_loss = loss_fn(X, Xhat)
        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        total_loss += batch_loss
The helper functions create_songs_batch(input_batch) and create_pivot_batch(input_batch, target_batch) just provide data in the required format.

The model runs, but I get the warning:

WARNING:tensorflow:Gradients do not exist for variables ['U:0'] when minimizing the loss.

I can see why the variable U is not being updated: there is no direct path to it. I want to update the specific rows of U referenced by user_index on every batch call. Is there a way to do it?
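A minimal standalone illustration of the broken gradient path (a sketch, not the model code): routing a variable through numpy produces a constant as far as the tape is concerned.

import tensorflow as tf

v = tf.Variable([[1.0, 2.0], [3.0, 4.0]])
with tf.GradientTape() as tape:
    rows = tf.convert_to_tensor(v.numpy()[[0]])  # numpy round-trip
    loss = tf.reduce_sum(rows)

# Prints None: the tape never saw a differentiable op applied to `v`.
print(tape.gradient(loss, v))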
I was able to solve the problem. Rather than copying some rows of U out of the graph, I used a temporary matrix that is the one-hot encoded form of user_index and multiplied it with U to get the desired rows; this keeps the gradient path to U intact and removes the warning.

The part of the code that needs to be modified:
def call(self, user_index, songs_batch):
    # output_lstm: (batch_size, emb_sz)
    # batch_encoding: (batch_size, num_users)
    # required_users: (batch_size, emb_sz)
    output_lstm = self.lstm(songs_batch)
    user_idx = np.array(user_index)
    batch_encoding = np.zeros((user_idx.size, self.num_users))
    batch_encoding[np.arange(user_idx.size), user_idx] = 1
    batch_encoding = tf.convert_to_tensor(batch_encoding, dtype=tf.float32)
    self.required_users = tf.matmul(batch_encoding, self.U)
    return tf.matmul(self.required_users, output_lstm, transpose_b=True)
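An alternative sketch (not the poster's code): tf.gather is differentiable with respect to its params argument, so selecting the rows directly also preserves the gradient path to U, and the resulting gradient is sparse (IndexedSlices), touching only the selected rows.

def call(self, user_index, songs_batch):
    output_lstm = self.lstm(songs_batch)
    # Differentiable row selection: gradients flow back into U.
    required_users = tf.gather(self.U, user_index)  # (batch_size, emb_sz)
    return tf.matmul(required_users, output_lstm, transpose_b=True)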
I am trying to modify a non-trainable model variable from a callback at the beginning of each epoch. Essentially I would like a mechanism similar to the learning-rate scheduler (which has built-in infrastructure in TF) but applicable to an arbitrary model variable. The code below is a minimal example to show the concept. I am trying to modify the decay variable, but it does not work: apparently the initial value of the variable (1.0) is treated as a constant, folded into the graph, and never looked at again as training progresses, even though the variable itself appears to be properly modified (to 0.5) by the callback.
dense1 = tf.keras.layers.Dense(10)
decay = tf.Variable(1.0, trainable=False)
dense2 = tf.keras.layers.Dense(10)

def epoch_callback(epoch):
    nonlocal decay
    tf.keras.backend.set_value(decay, 0.5)
    # decay.assign(0.5)
    print(tf.keras.backend.get_value(decay))

input = tf.keras.layers.Input((MAX_LENGTH,))
x = dense1(input)
with tf.control_dependencies([decay]):
    x = x * decay
prediction = dense2(x)

model = tf.keras.Model(inputs=[input], outputs=[prediction])
model.compile(optimizer=tf.keras.optimizers.Adam(),
              loss=tf.keras.losses.SparseCategoricalCrossentropy(from_logits=True))

callbacks = [tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: epoch_callback(epoch))]
model.fit(train_ds, epochs=EPOCHS, verbose=1, callbacks=callbacks, validation_data=eval_ds)
@nbro: Here you go. The code below is what worked for me. I use a teacher-forcing protocol, and the per-epoch decay variable is used to "lower the teacher's voice" as training progresses.
class Teacher(tf.keras.layers.Layer):
    def __init__(self, embedding, name='teacher', **kwargs):
        super().__init__(name=name, **kwargs)
        ...

    def build(self, input_shape):
        ...

    def call(self, inputs, training=None):
        x, y, decay = inputs
        ...
        if training:
            y = tf.multiply(y, decay)
        else:
            y = tf.multiply(y, tf.constant(0.0))
        ...
        return x

    def get_config(self):
        return {}


class MyNet(tf.keras.Model):
    def __init__(self, name='mynet', **kwargs):
        super().__init__(name=name, **kwargs)

    def build(self, input_shape):
        ...
        self.teacher = Teacher()
        self.decay = tf.Variable(1.0, trainable=False)
        ...

    def set_decay(self, decay):
        self.decay.assign(decay)

    @tf.function
    def call(self, example, training=None):
        x, y = example
        ...
        x = self.teacher((x, y, self.decay))
        ...
        return x

    def get_config(self):
        return {}
def main():
    train_ds = ...
    eval_ds = ...
    train_ds = train_ds.map(lambda data, label: ((data, label), label),
                            num_parallel_calls=tf.data.experimental.AUTOTUNE)
    eval_ds = eval_ds.map(lambda data, label: ((data, label), label),
                          num_parallel_calls=tf.data.experimental.AUTOTUNE)

    strategy = tf.distribute.MirroredStrategy()
    with strategy.scope():
        the_net = MyNet()
        inputs = tf.keras.layers.Input((MAX_LENGTH,), dtype='int64', name='inputs')
        targets = tf.keras.layers.Input((MAX_LENGTH,), dtype='int64', name='targets')
        prediction = the_net((inputs, targets))
        model = tf.keras.Model(inputs=[inputs, targets], outputs=[prediction])
        model.compile(optimizer=tf.keras.optimizers.Adam(),
                      loss=CosineSimilarity(name='val_loss'))

    def _callback_fun(epoch, start=0, steps=8):
        the_net.set_decay(tf.clip_by_value((start + steps - epoch) / steps,
                                           clip_value_min=tf.constant(0.0),
                                           clip_value_max=tf.constant(1.0)))

    callbacks = [tf.keras.callbacks.LambdaCallback(on_epoch_begin=lambda epoch, logs: _callback_fun(epoch))]
    model.fit(train_ds, epochs=EPOCHS, verbose=2, callbacks=callbacks, validation_data=eval_ds)

if __name__ == '__main__':
    main()
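For reference, with start=0 and steps=8 the schedule above clips (start + steps - epoch) / steps into [0, 1], so decay walks from 1.0 down to 0.0 over the first eight epochs and then stays at zero:

# Sketch of the decay schedule produced by _callback_fun(epoch).
for epoch in range(10):
    decay = min(max((0 + 8 - epoch) / 8, 0.0), 1.0)
    print(epoch, decay)  # 1.0, 0.875, ..., 0.125, 0.0, 0.0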