I have a relatively large TF retrieval model using the TFRS library. It uses a ScaNN layer for indexing the recommendations. I am having a system host memory issue when I try to save this model via the tf.saved_model.save() method. I am running the official TF 2.9.1 Docker Container with TFRS on a VM in the cloud. I have 28 GB of memory to try to save the model.
Here is the quickstart example:
Basically we create the first embedding
user_model = tf.keras.Sequential([
tf.keras.layers.StringLookup(
vocabulary=unique_user_ids, mask_token=None),
# We add an additional embedding to account for unknown tokens.
tf.keras.layers.Embedding(len(unique_user_ids) + 1, embedding_dimension)
])
Then create the model
class MovielensModel(tfrs.Model):
def __init__(self, user_model, movie_model):
super().__init__()
self.movie_model: tf.keras.Model = movie_model
self.user_model: tf.keras.Model = user_model
self.task: tf.keras.layers.Layer = task
def compute_loss(self, features: Dict[Text, tf.Tensor], training=False) -> tf.Tensor:
# We pick out the user features and pass them into the user model.
user_embeddings = self.user_model(features["user_id"])
# And pick out the movie features and pass them into the movie model,
# getting embeddings back.
positive_movie_embeddings = self.movie_model(features["movie_title"])
# The task computes the loss and the metrics.
return self.task(user_embeddings, positive_movie_embeddings)
Next we create the ScaNN indexing layer
scann_index = tfrs.layers.factorized_top_k.ScaNN(model.user_model)
scann_index.index_from_dataset(
tf.data.Dataset.zip((movies.batch(100), movies.batch(100).map(model.movie_model)))
)
# Get recommendations.
_, titles = scann_index(tf.constant(["42"]))
print(f"Recommendations for user 42: {titles[0, :3]}")
Finally the model is sent out to be saved
# Export the query model.
with tempfile.TemporaryDirectory() as tmp:
path = os.path.join(tmp, "model")
# Save the index.
tf.saved_model.save(
index,
path,
options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
)
# Load it back; can also be done in TensorFlow Serving.
loaded = tf.saved_model.load(path)
# Pass a user id in, get top predicted movie titles back.
scores, titles = loaded(["42"])
print(f"Recommendations: {titles[0][:3]}")
This is the problem line:
# Save the index.
tf.saved_model.save(
index,
path,
options=tf.saved_model.SaveOptions(namespace_whitelist=["Scann"])
)
I'm not sure if there is a memory leak or what, but when I train my model on 5M+ records... I can watch the host system memory spike to 100% and the process is killed. If I train on a smaller dataset... there is no problem, so I know the code is okay.
Can anyone suggest how to get around the memory bottleneck when saving a large ScaNN retrieval model, so I can eventually load the model back in for inference?
I think you are saving the TF model after training was completed. You just need the saved model to get trained weights from the model.
You can try the following code:
sku_ids = df['SKU_ID']
sku_ids_list = sku_ids.to_list()
q = embedding(sku_ids, output_mode='distance_matrix')
dist_mat = tf.cast(q, tf.float32)
tree = scann.Scann(n_tables=scann_tables_file_name,
n_clusters_per_table=scann_clusters_file_name,
dimension=embedding_dimensions,
space_type=dist_mat.dtype,
metric_type=tf.float32,
random_seed=seed,
transport_dtype=tf.float32,
symmetrize_query_and_dataset=True,
num_neighbors_per_table=scann_tables_number_of_neighbors)
q = tree.build_index(dist_mat)
p = tree.run(dist_mat)
model = keras.models.Sequential([
scann.Dense(1, use_bias=False, activation='linear', dtype=tf.float32),
keras.layers.Activation('sigmoid')
])
model.compile(
keras.optimizers.Adam(1e-3),
'binary_crossentropy', metrics=[metrics.BinaryAccuracy()])
idx = -1
number_of_epochs = 10
optimizer = keras.optimizers.Adam(1e-3)
optimizer_state = None
random_seed = seed
callbacks = [
keras.callbacks.EarlyStopping(
monitor='binary_accuracy', mode='max',
patience=10, restore_best_weights=True)]
batch_size = 1000
total_records = len(sku_ids)
epochs = number_of_epochs
epochs_completed = 0
while epochs_completed < epochs:
idx += 1
if idx * batch_size >= total_records:
idx = 0
epochs_completed += 1
optimizer_state = None
print("training epoch: {}".format(idx))
q_ = tree.transform(dist_mat[idx * batch_size : (idx + 1) * batch_size])
p_ = tree.transform(dist_mat)
y = p_[:, :, 0]
print("callbacks: {}".format(callbacks))
print("model compile: {}".format(model.compile))
model.fit(q_, y, epochs=1, batch_size=batch_size,
callbacks=callbacks,
validation_split=0.2,
verbose=0,
shuffle=True,
initial_epoch=0,
steps_per_epoch=None,
validation_steps=None,
validation_batch_size=None,
validation_freq=1,
class_weight=None,
max_queue_size=10,
workers=1,
use_multiprocessing=False,
shuffle=False, initial_epoch=0)
sku_ids_tensor = tf.constant(sku_ids_list,
shape=[len(sku_ids_list), 1],
dtype=tf.int64)
print("sku_ids_tensor shape: {}".format(sku_ids_tensor.shape))
tree_tensor = tree.transform(dist_mat)
print("tree_tensor shape: {}".format(tree_tensor.shape))
predictions = tf.constant(tf.sigmoid(model.predict(tree_tensor)),
dtype=tf.float32)
print("predictions shape: {}".format(predictions.shape))
recommendations = tf.concat([sku_ids_tensor, predictions], axis=1)
print("recommendations shape: {}".format(recommendations.shape))
retrieval_user_sku_recommendations = []
for u in unique_sku_list:
print("u: {}".format(u))
user_skus = sku_ids[sku_ids.isin([u])]
print("user_skus: {}".format(user_skus))
user_sku_id = user_skus.index[0]
print("user_sku_id: {}".format(user_sku_id))
user_sku_recommendations = recommendations[sku_ids.isin([u])]
print("user_sku_recommendations: {}".format(user_sku_recommendations))
retrieval_user_sku_recommendations.append(user_sku_recommendations)
retrieval_skus_df = pd.DataFrame(sku_ids_list, columns=['SKU_ID'])
retrieval_skus_df['SKU_ID'] = retrieval_skus_df['SKU_ID'].astype(int)
retrieval_skus_df.head()
user_sku_recommendations_list = []
for sku in retrieval_skus_df['SKU_ID']:
for u in unique_sku_list:
print("sku: {}".format(sku))
print("u: {}".format(u))
if sku == u:
user_skus = sku_ids[sku_ids.isin([sku])]
user_sku_id = user_skus.index[0]
user_sku_recommendations = recommendations[sku_ids.isin([sku])]
user_sku_recommendations_list.append(user_sku_recommendations)
tf.saved_model.save(model, ss_model_dir)
Related
I'm trying to implement the Laplace Posteriori Approximation on the last layer for the classification results obtained by BERT model. I get an error regarding input size, and after I fix it by extracting just embeddings and class labels from BERT to feed them into Laplace, I get another bunch of errors regarding input dimensions that I don't know how to debug.
As this is something I didn't find on the internet, and includes relatively new libraries, I will post here just the first error I got, code that might help in debugging and useful links.
I will update post if needed.
Of course, if someone knows how to implement Laplace Posteriori Approximation with BERT in some other library like Scikit or Trax, it would be helpful. Also, some other Transformer classification model with some other confidence approximation will be useful for me. Any help is appreciated!
Code:
# Import
import pandas as pd
import torch
from torch.utils.data import DataLoader
from torch.utils.data import TensorDataset
from torch import nn
from transformers import BertTokenizer
from transformers import BertModel
from transformers import BertForSequenceClassification
from sklearn.model_selection import train_test_split
import time
import os
#Toy Data
data_a_b_c = ["""category a. This is category a. In category a we talk about animals.
This category includes lions, fish, tigers, birds, elephants, mouses, dogs, cats, and all other animals."""] * 60 \
+ ["""category b. This is category b. In category b we talk about people. This category members are
Abraham Maslow, John Lennon, Drazen Petrovic, Nikola Tesla, Slavoljub Penkala, Nenad Bakic and Larry Page."""] * 60 \
+ ["""category c. This is category c. Category c is dedicated to car brands like Lamborgini, Rimac-Buggati, BMW, Mercedes,
Honda, Opel, Wolkswagen, and etc."""] * 60
label_0_1_2 = [0] * 60 + [1] * 60 + [2] * 60
d = {'text': data_a_b_c, 'labels': label_0_1_2}
df = pd.DataFrame(data=d)
print(df.head(3))
print(df.tail(3))
print(df.info())
# Parameters
tokenizer = BertTokenizer.from_pretrained('bert-base-multilingual-cased')
batch_size = 2
learning_rate = 3e-4
epochs = 3
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
labels = pd.Series(df.labels.values).to_dict()
num_classes = 3
print(f'Tokenizer: {tokenizer}, Batch size:{batch_size}, Learning rate:{learning_rate}, Epochs:{epochs}')
print('Device: ', device)
print('Number of possible classes: ', num_classes)
# Model Architecture
class TransformerModel(nn.Module):
def __init__(self, num_classes, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
def forward(self, input_id, mask):
_, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
# Prepare Data Function
def prepare_data(data, labels):
texts = tokenizer(data, padding='max_length', max_length=512, truncation=True, return_tensors="pt")
input_ids = texts['input_ids']
attention_mask = texts['attention_mask']
train_dataset = TensorDataset(input_ids, attention_mask, torch.LongTensor(labels))
dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
return dataloader
#Run Training Function
def run_training(train_dataloader, val_dataloader, epochs=epochs, lr=learning_rate):
def train(dataloader):
model.train()
total_acc, total_count = 0, 0
log_interval = 128
start_time = time.time()
for idx, (input_id, mask, label) in enumerate(train_dataloader):
# print(idx)
mask = mask.to(device)
input_id = input_id.to(device)
label = label.type(torch.LongTensor).to(device)
output = model(input_id, mask)
optimizer.zero_grad()
loss = criterion(output, label)
loss.backward()
# torch.nn.utils.clip_grad_norm_(model.parameters(), 0.1)
optimizer.step()
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
if idx % log_interval == 0 and idx > 0:
elapsed = time.time() - start_time
print('| epoch {:3d} | {:5d}/{:5d} batches '
'| accuracy {:8.3f}'.format(epoch, idx, len(dataloader),
total_acc / total_count))
total_acc, total_count = 0, 0
start_time = time.time()
def evaluate(dataloader):
model.eval()
total_acc, total_count = 0, 0
with torch.no_grad():
for idx, (input_id, mask, label) in enumerate(dataloader):
mask = mask.to(device)
input_id = input_id.to(device)
label = label.to(device)
output = model(input_id, mask)
total_acc += (output.argmax(1) == label).sum().item()
total_count += label.size(0)
return total_acc / total_count
criterion = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, 1.0, gamma=0.1)
cuda = torch.cuda.is_available()
device = torch.device("cuda" if cuda else "cpu")
device = 'cuda'
model.to(device)
total_accu = None
for epoch in range(1, epochs + 1):
epoch_start_time = time.time()
train(train_dataloader)
accu_val = evaluate(val_dataloader)
if total_accu is not None and total_accu > accu_val:
scheduler.step()
else:
total_accu = accu_val
print('-' * 59)
print('| end of epoch {:3d} | time: {:5.2f}s | '
'valid accuracy {:8.3f} '.format(epoch,
time.time() - epoch_start_time,
accu_val))
print('-' * 59)
# Data Split And Preparation
X_train, X_test, y_train, y_test = train_test_split(df.text.values.tolist(), df.labels.values.tolist(), test_size=0.2, random_state=2)
train_dataloader = prepare_data(X_train, y_train)
val_dataloader = prepare_data(X_test, y_test)
# Run The Model
model = TransformerModel(num_classes)
run_training(train_dataloader, val_dataloader)
print('finished')
# Save And Load The Model (if needed)
PATH = ".../Torch_BERT_model"
torch.save(model, os.path.join(PATH, "Toy_Data_BERT.pth"))
model = torch.load(os.path.join(PATH, "Toy_Data_BERT.pth"))
print(model)
# Laplace
from laplace import Laplace
la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
la.fit(train_dataloader)
Error I get:
--------------------------------------------------------------------------- ValueError Traceback (most recent call
last) ~\AppData\Local\Temp\ipykernel_7144\3779742208.py in <cell line:
2>()
1 la = Laplace(model, 'classification', subset_of_weights='last_layer', hessian_structure='full')
----> 2 la.fit(train_dataloader)
~\anaconda3\lib\site-packages\laplace\lllaplace.py in fit(self,
train_loader, override)
98
99 if self.model.last_layer is None:
--> 100 X, _ = next(iter(train_loader))
101 with torch.no_grad():
102 try:
ValueError: too many values to unpack (expected 2)
Useful link for Laplace implementation with examples:
https://aleximmer.github.io/Laplace/#full-example-optimization-of-the-marginal-likelihood-and-prediction
Code that might help in debugging:
for x in train_dataloader:
print("The length of batch is:", len(x))
print()
print("The batch looks like:", x)
print()
print("The length of the first element in the batch is:") #embedding
print(len(x[0]))
print("The length of the second element in the batch is:") #1 if place is filled with word, 0 if it's empty?
print(len(x[1]))
print("The length of the third element in the batch is:") #category
print(len(x[2]))
print()
print("The lengths of the first tensor and second tensor in the first element in the batch is:")
print(len(x[0][0]), len(x[0][1])) # = max_length (512)
print("The lengths of the first tensor and second tensor in the second element in the batch is:")
print(len(x[1][0]), len(x[1][1])) # = max_length (512)
print()
print()
The laplace library expects that the dataloader returns two parameters (X,y) and that the model requires exactly one argument to make its prediction (code). But your model forward pass requires two arguments, namely input_id and mask, and your dataloader returns three arguments input_id, mask, and labels.
There are several ways to work around this limitation (e.g. return a dict with input_ids and attention_mask). The way that requires the least understanding of the internals of the laplace library is to generate the attention mask at runtime in the forward pass (not great for the performance):
class TransformerModel(nn.Module):
def __init__(self, num_classes, pad_id, dropout=0.5):
super(TransformerModel, self).__init__()
self.bert = BertModel.from_pretrained('bert-base-multilingual-cased')
self.dropout = nn.Dropout(dropout)
self.linear = nn.Linear(768, num_classes)
self.relu = nn.ReLU()
self.pad_id = pad_id
def forward(self, input_id):
mask = (input_ids!=self.pad_id).type(input_ids.dtype)
_, pooled_output = self.bert(input_ids=input_id, attention_mask=mask, return_dict=False)
dropout_output = self.dropout(pooled_output)
linear_output = self.linear(dropout_output)
final_layer = self.relu(linear_output)
return final_layer
model = TransformerModel(num_classes, tokenizer.pad_token_id)
I am having a hard time with google colab and a Keras model that I train. I am using all kind of magic tricks like Tensorboard, HParams, callbacks, etc.
Initially, excuse me for posting code and screenshots only and not data. Since the code posted runs successfully on my local machine and not on colab I am guessing that this is not due to data error but rather code error.
My local machine TF version: 2.1.0
Colab's TF version: 2.2.0
Below there is my code.
libraries
import tensorflow as tf
import tensorflow_addons as tfa
import tensorflow_docs as tfdocs #!pip install git+https://github.com/tensorflow/docs
import tensorflow_docs.plots as tfplots
import tensorflow_docs.modeling as tfmodel
from tensorflow.keras import layers, regularizers, models
from tensorflow.keras import models
from tensorflow.keras.callbacks import ModelCheckpoint, EarlyStopping
from tensorflow.keras.utils import model_to_dot, plot_model
from tensorflow.keras.models import load_model, model_from_json
%load_ext tensorboard
from tensorboard.plugins.hparams import api as hp
import keras.backend as K
from tensorflow import keras
Callbacks
def callback(folder_path, saved_model_name, logdir, hparams):
# Initialize parameters
monitor_metric = 'val_loss'
minimum_delta = 0.1
patience_limit = 1
verbose_value = 1
mode_value = 'min'
weights_fname = os.path.join(os.getcwd(), '{0}/{1}.h5'.format(folder_path, saved_model_name))
print(weights_fname)
# Initialize callbacks
callbacks = [
EarlyStopping(monitor=monitor_metric,
min_delta=minimum_delta,
patience=patience_limit,
verbose=verbose_value,
mode=mode_value,
restore_best_weights=True),
ModelCheckpoint(filepath=weights_fname,
monitor=monitor_metric,
verbose=verbose_value,
save_best_only=True,
save_weights_only=True),
tf.keras.callbacks.TensorBoard(logdir),
hp.KerasCallback(logdir, hparams)
]
return callbacks
Hyper parameters
HP_HIDDEN_UNITS = hp.HParam('batch_size', hp.Discrete([32, 64, 128]))
HP_EMBEDDING_DIM = hp.HParam('embedding_dim', hp.Discrete([100, 200, 300]))
HP_LEARNING_RATE = hp.HParam('learning_rate', hp.Discrete([0.001, 0.01, 0.1])) # Adam default: 0.001, SGD default: 0.01, RMSprop default: 0.001
HP_DECAY_STEPS_MULTIPLIER = hp.HParam('decay_steps_multiplier', hp.Discrete([100, 1000]))
METRIC_ACCURACY = 'accuracy'
create and fit the model function
def create_fit_keras_model(hparams,
version_data_control,
optimizer_name,
validation_method,
callbacks,
folder_path,
optimizer_version = None):
sentenceLength_actors = X_train_seq_actors.shape[1]
vocab_size_frequent_words_actors = len(actors_tokenizer.word_index)
sentenceLength_plot = X_train_seq_plot.shape[1]
vocab_size_frequent_words_plot = len(plot_tokenizer.word_index)
sentenceLength_features = X_train_seq_features.shape[1]
vocab_size_frequent_words_features = len(features_tokenizer.word_index)
sentenceLength_reviews = X_train_seq_reviews.shape[1]
vocab_size_frequent_words_reviews = len(reviews_tokenizer.word_index)
model = keras.Sequential(name='MultyInput_Keras_Classification_model_{0}dim_{1}batchsize_{2}lr_{3}decaymultiplier_{4}'.format(hparams[HP_EMBEDDING_DIM], hparams[HP_HIDDEN_UNITS],
hparams[HP_LEARNING_RATE], hparams[HP_DECAY_STEPS_MULTIPLIER],
version_data_control))
actors = keras.Input(shape=(sentenceLength_actors,), name='actors_input')
plot = keras.Input(shape=(sentenceLength_plot,), name='plot_input')
features = keras.Input(shape=(sentenceLength_features,), name='features_input')
reviews = keras.Input(shape=(sentenceLength_reviews,), name='reviews_input')
emb1 = layers.Embedding(input_dim = vocab_size_frequent_words_actors + 2,
output_dim = hparams[HP_EMBEDDING_DIM],
embeddings_initializer = 'uniform',
mask_zero = True,
input_length = sentenceLength_actors,
name="actors_embedding_layer")(actors)
encoded_layer1 = layers.GlobalMaxPooling1D(name="globalmaxpooling_actors_layer")(emb1)
emb2 = layers.Embedding(input_dim = vocab_size_frequent_words_plot + 2,
output_dim = hparams[HP_EMBEDDING_DIM],
embeddings_initializer = 'uniform',
mask_zero = True,
input_length = sentenceLength_plot,
name="plot_embedding_layer")(plot)
encoded_layer2 = layers.GlobalMaxPooling1D(name="globalmaxpooling_plot_summary_Layer")(emb2)
emb3 = layers.Embedding(input_dim = vocab_size_frequent_words_features + 2,
output_dim = hparams[HP_EMBEDDING_DIM],
embeddings_initializer = 'uniform',
mask_zero = True,
input_length = sentenceLength_features,
name="features_embedding_layer")(features)
encoded_layer3 = layers.GlobalMaxPooling1D(name="globalmaxpooling_movie_features_layer")(emb3)
emb4 = layers.Embedding(input_dim = vocab_size_frequent_words_reviews + 2,
output_dim = hparams[HP_EMBEDDING_DIM],
embeddings_initializer = 'uniform',
mask_zero = True,
input_length = sentenceLength_reviews,
name="reviews_embedding_layer")(reviews)
encoded_layer4 = layers.GlobalMaxPooling1D(name="globalmaxpooling_user_reviews_layer")(emb4)
merged = layers.concatenate([encoded_layer1, encoded_layer2, encoded_layer3, encoded_layer4], axis=-1)
dense_layer_1 = layers.Dense(hparams[HP_HIDDEN_UNITS],
kernel_regularizer=regularizers.l2(neural_network_parameters['l2_regularization']),
activation=neural_network_parameters['dense_activation'],
name="1st_dense_hidden_layer_concatenated_inputs")(merged)
layers.Dropout(neural_network_parameters['dropout_rate'])(dense_layer_1)
output_layer = layers.Dense(y_train.shape[1],
activation=neural_network_parameters['output_activation'],
name='output_layer')(dense_layer_1)
model = keras.Model(inputs=[actors, plot, features, reviews], outputs=output_layer, name='MultyInput_Keras_Classification_model_{0}dim_{1}batchsize_{2}lr_{3}decaymultiplier_{4}'.format(hparams[HP_EMBEDDING_DIM],
hparams[HP_HIDDEN_UNITS],
hparams[HP_LEARNING_RATE],
hparams[HP_DECAY_STEPS_MULTIPLIER],
version_data_control))
print(model.summary())
if optimizer_name=="adam" and optimizer_version is None:
optimizer = optimizer_adam_v2(hparams[HP_LEARNING_RATE], hparams[HP_DECAY_STEPS_MULTIPLIER], X_train_seq_actors.shape[0], optimizer_parameters['validation_split_ratio'], hparams[HP_HIDDEN_UNITS])
elif optimizer_name=="sgd" and optimizer_version is None:
optimizer = optimizer_sgd_v1(hparams[HP_LEARNING_RATE])
elif optimizer_name=="rmsprop" and optimizer_version is None:
optimizer = optimizer_rmsprop_v1(hparams[HP_LEARNING_RATE])
model.compile(optimizer=optimizer,
loss=neural_network_parameters['model_loss'],
metrics=[neural_network_parameters['model_metric']])
plot_model(model, to_file=os.path.join(os.getcwd(), 'model_one/network_structure_multy_input_keras_model_{0}.png'.format(version_data_control)))
start_time = time.time()
if validation_method=="validation_split":
model.fit([X_train_seq_actors, X_train_seq_plot, X_train_seq_features, X_train_seq_reviews],
y_train,
steps_per_epoch=int(np.ceil((X_train_seq_actors.shape[0]*optimizer_parameters['validation_split_ratio'])//hparams[HP_HIDDEN_UNITS])),
epochs=fit_parameters["epoch"],
verbose=fit_parameters["verbose_fit"],
batch_size=hparams[HP_HIDDEN_UNITS],
validation_split=fit_parameters['validation_data_ratio'],
callbacks=callbacks)
elif validation_method=="validation_data":
model.fit([X_train_seq_actors, X_train_seq_plot, X_train_seq_features, X_train_seq_reviews],
y_train,
steps_per_epoch=int(np.ceil((X_train_seq_actors.shape[0]*optimizer_parameters['validation_split_ratio'])//hparams[HP_HIDDEN_UNITS])),
epochs=fit_parameters["epoch"],
verbose=fit_parameters["verbose_fit"],
batch_size=hparams[HP_HIDDEN_UNITS],
validation_data=([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews], y_test),
callbacks=callbacks)
#save the model
save_model(model, folder_path, "multi_input_keras_model_{0}dim_{1}batchsize_{2}lr_{3}decaymultiplier_{4}".format(str(hparams[HP_EMBEDDING_DIM]), str(hparams[HP_HIDDEN_UNITS]), str(hparams[HP_LEARNING_RATE]), str(hparams[HP_DECAY_STEPS_MULTIPLIER]), version_data_control))
elapsed_time = time.time() - start_time
print("\nTraining time of the multi-input keras model has finished. Duration {} secs".format(format_timespan(elapsed_time)))
_, accuracy = model.evaluate([X_test_seq_actors, X_test_seq_plot, X_test_seq_features, X_test_seq_reviews], y_test, batch_size=hparams[HP_HIDDEN_UNITS], verbose=2)
return accuracy, model
Run everything - function -> Basically this is the function that I call to train my model.
def run(run_dir, hparams, version_data_control, optimizer_name, validation_method, callbacks, folder_path):
with tf.summary.create_file_writer(run_dir).as_default():
hp.hparams(hparams) # record the values used in this trial
accuracy, model = create_fit_keras_model(hparams, version_data_control, optimizer_name, validation_method, callbacks, folder_path)
print(model.history.history)
tf.summary.scalar(METRIC_ACCURACY, accuracy, step=1)
return model
Training process -> Runs successfully on my local machine but not in Google Colab.
session_num = 1
for batch_size in HP_HIDDEN_UNITS.domain.values:
for embedding_dim in HP_EMBEDDING_DIM.domain.values:
for learning_rate in HP_LEARNING_RATE.domain.values:
for decay_steps_multiplier in HP_DECAY_STEPS_MULTIPLIER.domain.values:
hparams = {
HP_HIDDEN_UNITS: batch_size,
HP_EMBEDDING_DIM: embedding_dim,
HP_LEARNING_RATE: learning_rate,
HP_DECAY_STEPS_MULTIPLIER: decay_steps_multiplier
}
run_name = "run-id {0}/{1}".format(session_num, (len(HP_HIDDEN_UNITS.domain.values)*len(HP_EMBEDDING_DIM.domain.values)*len(HP_LEARNING_RATE.domain.values)*len(HP_DECAY_STEPS_MULTIPLIER.domain.values)))
print('--- Starting trial: %s/n' % run_name)
print({h.name: hparams[h] for h in hparams}, "/n")
model_history=run('./logs/hparam_tuning/' + run_name, hparams, version_data_control, "adam", "validation_split",
callback("model_one/adam_v2_07072020",
"multi_input_keras_model_{0}dim_{1}batchsize_{2}lr_{3}decaymultiplier_{4}".format(str(embedding_dim), str(batch_size), str(learning_rate), str(decay_steps_multiplier), version_data_control),
"./logs/hparam_tuning/"+datetime.now().strftime("%Y%m%d-%H%M%S"),
hparams),
"model_one/adam_v2_07072020")
hist = pd.DataFrame(model_history.history.history)
hist['epoch'] = model_history.history.epoch
Everything is running fine based on the screenshot below. Although the history model that is returned from the run() function is totally empty. Also, nothing is printed on Tensorboard even though the logs are in the logdir directory. Also, I want to add that the code posted above is perfectly running on my local machine, both Tensorboard and callbacks, and model history. But due to lack of resources, I want to run this on colab for 100 epochs and not 1 as in this example (demo).
The error I get
My files
Thank you in advance for any comments and help. If you may want to see the full code with data I can share with you my colab notebook. Just post in the comments your Gmail account and I will share it with you!
I solved the issue by assigning the model.fit() method to a different object like below:
fitted_model = model.fit([X_train_seq_actors, X_train_seq_plot, X_train_seq_features, X_train_seq_reviews],
y_train,
steps_per_epoch=int(np.ceil((X_train_seq_actors.shape[0]*optimizer_parameters['validation_split_ratio'])//hparams[HP_HIDDEN_UNITS])),
epochs=fit_parameters["epoch"],
verbose=fit_parameters["verbose_fit"],
batch_size=hparams[HP_HIDDEN_UNITS],
validation_split=fit_parameters['validation_data_ratio'],
callbacks=callbacks)
then by returning the fitted_model object I can successfully run this hist = pd.DataFrame(model_history.history)
Output:
+----+----------+------------+------------+----------------+---------+
| | loss | accuracy | val_loss | val_accuracy | epoch |
|----+----------+------------+------------+----------------+---------|
| 1 | 0.295619 | 0.452375 | 0.186601 | 0.64396 | 1 |
+----+----------+------------+------------+----------------+---------+
Hope this will also help other users. Please feel free to assign this as a [Duplicate] question if a similar answer was given on the same issue.
I'm working on a project (my first AI project) and I've hit a bit of a wall. When performing testing on my trained classifier, it's predicting that everything is of class 1. Now the data set is heavily biased to class 1; however, I've implemented weights to compensate for this. Just concerned that I've coded this wrong or missed something. Please let me know if you see anything.
This is the setup and training
batchSize = 50
trainingLoad = DataLoader(trainingData, shuffle = True, batch_size = batchSize, drop_last=True)
validationLoad = DataLoader(validationData, shuffle = True, batch_size = batchSize, drop_last=True)
testingLoad = DataLoader(testingData, shuffle = True, batch_size = batchSize, drop_last=True)
vocabularySize = len(wordToNoDict)
output = 3
embedding = 400
hiddenDimension = 524
layers = 4
classifierModel = Classifier.HateSpeechDetector(device, vocabularySize, output, embedding, hiddenDimension, layers)
classifierModel.to(device)
path = 'Program\data\state_dict2.pt'
weights = torch.tensor([1203/1203, 1203/15389, 1203/3407])
criterion = nn.CrossEntropyLoss(weight = weights)
trainClassifier(classifierModel, trainingLoad, validationLoad, device, batchSize, criterion, path)
test(classifierModel, path, testingLoad, batchSize, device, criterion)
def trainClassifier(model, trainingData, validationData, device, batchSize, criterion, path):
epochs = 5
counter = 0
testWithValiEvery = 10
clip = 5
valid_loss_min = np.Inf
lr=0.0001
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
model.train()
for i in range(epochs):
h = model.init_hidden(batchSize, device)
for inputs, labels in trainingData:
h = tuple([e.data for e in h])
inputs, labels = inputs.to(device), labels.to(device)
model.zero_grad()
output, h = model(inputs, h)
loss = criterion(output.squeeze(), labels.long())
loss.backward()
nn.utils.clip_grad_norm_(model.parameters(), clip)
optimizer.step()
counter += 1
print(counter)
if counter%testWithValiEvery == 0:
print("validating")
val_h = model.init_hidden(batchSize, device)
val_losses = []
model.eval()
for inp, lab in validationData:
val_h = tuple([each.data for each in val_h])
inp, lab = inp.to(device), lab.to(device)
out, val_h = model(inp, val_h)#
val_loss = criterion(out.squeeze(), lab.long())
val_losses.append(val_loss.item())
model.train()
print("Epoch: {}/{}...".format(i+1, epochs),
"Step: {}...".format(counter),
"Loss: {:.6f}...".format(loss.item()),
"Val Loss: {:.6f}".format(np.mean(val_losses)))
if np.mean(val_losses) <= valid_loss_min:
torch.save(model.state_dict(), path)
print('Validation loss decreased ({:.6f} --> {:.6f}). Saving model ...'.format(valid_loss_min,np.mean(val_losses)))
print('model saved')
valid_loss_min = np.mean(val_losses)
This is the classifier - Fair amount of random commenting here where i've meddled with bits
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as op
import torchvision
from torch.utils.data import TensorDataset, DataLoader
from torchvision import transforms, datasets
class HateSpeechDetector(nn.Module):
def __init__(self, device, vocabularySize, output, embedding, hidden, layers, dropProb=0.5):
super(HateSpeechDetector, self).__init__()
#Number of outputs (Classes/Categories)
self.output = output
#Number of layers in the LSTM
self.numLayers = layers
#Number of hidden neurons in each LSTM layer
self.hiddenDimensions = hidden
#Device being used for by model (CPU or GPU)
self.device = device
#Embedding layer finds correlations in words by converting word integers into vectors
self.embedding = nn.Embedding(vocabularySize, embedding)
#LSTM stores important data in memory, using it to help with future predictions
self.lstm = nn.LSTM(embedding,hidden,layers,dropout=dropProb,batch_first=True)
#Dropout is used to randomly drop nodes. This helps to prevent overfitting of the model during training
self.dropout = nn.Dropout(dropProb)
#Establishing 4 simple layers and a sigmoid output
self.fc = nn.Linear(hidden, hidden)
self.fc2 = nn.Linear(hidden, hidden)
self.fc3 = nn.Linear(hidden, hidden)
self.fc4 = nn.Linear(hidden, hidden)
self.fc5 = nn.Linear(hidden, hidden)
self.fc6 = nn.Linear(hidden, output)
self.softmax = nn.Softmax(dim=2)
def forward(self, x, hidden):
batchSize = x.size(0)
x = x.long()
embeds = self.embedding(x)
lstm_out, hidden = self.lstm(embeds, hidden)
#Tensor changes here from 250,33,524 to 8250,524
# lstm_out = lstm_out.contiguous().view(-1,self.hiddenDimensions)
out = self.dropout(lstm_out)
out = self.fc(out)
out = self.fc2(out)
out = self.fc3(out)
out = self.fc4(out)
out = self.fc5(out)
out = self.fc6(out)
out = self.softmax(out)
out = out[:,-1,:]
# myTensor = torch.Tensor([0,0,0])
# newOut = torch.zeros(batchSize, self.output)
# count = 0
# row = 0
# for tensor in out:
# if(count == 33):
# newOut[row] = myTensor/33
# myTensor = torch.Tensor([0,0,0])
# row += 1
# count = 0
# myTensor += tensor
# count += 1
return out, hidden
def init_hidden(self, batchSize, device):
weight = next(self.parameters()).data
hidden = (weight.new(self.numLayers, batchSize, self.hiddenDimensions).zero_().to(device), weight.new(self.numLayers, batchSize, self.hiddenDimensions).zero_().to(device))
return hidden
You've added weights to the cross-entropy loss, and the weights bias towards the first class already ([1.0, 0.08, 0.35]).
Having a higher weight for a certain class means that the model will be more heavily penalized for getting that class wrong, and it's possible for the model to learn to just predict everything as the class with highest weight. Usually you don't need to manually assign weights.
Also, check your data to see if there's label imbalance, i.e., whether you have more training examples that are of the first class. An imbalanced training set has similar effects as setting different weights on the loss.
I am literally a beginner of PyTorch.
I trained an autoencoder network so that I can plot the distribution of the latent vectors (the result of encoders).
This is the code that I used for network training.
import torch
import torchvision
from torch import nn
from torch.utils.data import DataLoader
from torchvision import transforms
from torchvision.utils import save_image
from torch.utils.data import Dataset
from PIL import Image
import os
import glob
dir_img_decoded = '/media/dohyeong/HDD/mouth_autoencoder/dc_img_2'
if not os.path.exists(dir_img_decoded):
os.mkdir(dir_img_decoded)
dir_check_point = '/media/dohyeong/HDD/mouth_autoencoder/ckpt_2'
if not os.path.exists(dir_check_point):
os.mkdir(dir_check_point)
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
num_epochs = 200
batch_size = 150 # up -> GPU memory increase
learning_rate = 1e-3
dir_dataset = '/media/dohyeong/HDD/mouth_autoencoder/mouth_crop/dir_normalized_mouth_cropped_images'
images = glob.glob(os.path.join(dir_dataset, '*.png'))
train_images = images[:-113]
test_images = images[-113:]
train_images.sort()
test_images.sort()
class TrumpMouthDataset(Dataset):
def __init__(self, images):
super(TrumpMouthDataset, self).__init__()
self.images = images
self.transform = transforms.Compose([
# transforms.Resize((28, 28)),
transforms.ToTensor(),
transforms.Normalize((0.5, 0.5, 0.5), (0.5, 0.5, 0.5))
])
def __getitem__(self, index):
image = Image.open(self.images[index])
return self.transform(image)
def __len__(self):
return len(self.images)
train_dataset = TrumpMouthDataset(train_images)
test_dataset = TrumpMouthDataset(test_images)
train_dataloader = DataLoader(train_dataset, batch_size=batch_size, shuffle=True)
test_dataloader = DataLoader(test_dataset, batch_size=batch_size, shuffle=True)
class Autoencoder(nn.Module):
def __init__(self):
super(Autoencoder, self).__init__()
self.encoder = nn.Sequential(
nn.Linear(60000, 60),
nn.ReLU(True),
nn.Linear(60, 3),
nn.ReLU(True),
)
self.decoder = nn.Sequential(
nn.Linear(3, 60),
nn.ReLU(True),
nn.Linear(60, 60000),
nn.Tanh()
)
def forward(self, x):
x = x.view(x.size(0), -1)
encoded = self.encoder(x)
decoded = self.decoder(encoded)
return encoded, decoded
model = Autoencoder().cuda()
if torch.cuda.device_count() > 1:
model = nn.DataParallel(model)
model.to(device)
criterion = nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(),
lr=learning_rate,
weight_decay=1e-5)
for epoch in range(num_epochs):
total_loss = 0
for index, imgs in enumerate(train_dataloader):
imgs = imgs.to(device)
# ===================forward=====================
outputs = model(imgs)
imgs_flatten = imgs.view(imgs.size(0), -1)
loss = criterion(outputs, imgs_flatten)
# ===================backward====================
optimizer.zero_grad()
loss.backward()
optimizer.step()
total_loss += loss.item()
print('{} Epoch, [{}/{}] batch, loss: {:.4f}'.format(epoch, index + 1, len(train_dataloader), loss.item()))
avg_loss = total_loss / len(train_dataset)
print('{} Epoch, avg_loss: {:.4f}'.format(epoch, avg_loss))
if epoch % 10 == 0:
check_point_file = os.path.join(dir_check_point, str(epoch) + ".pth")
torch.save(model.state_dict(), check_point_file)
After training, I tried to get encoded values using this code.
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
check_point = '/media/dohyeong/HDD/mouth_autoencoder/290.pth'
model = torch.load(check_point)
for index, imgs in enumerate(train_dataloader):
imgs = imgs.to(device)
# ===================evaluate=====================
encoded, _ = model(imgs)
It finished with this error message.
"TypeError: 'collections.OrderedDict' object is not callable"
May I get some help?
Hi and welcome to the PyTorch community :D
TL;DR
Change model = torch.load(check_point) to model.load_state_dict(torch.load(check_point)).
The only problem is with the line:
model = torch.load(check_point)
The way you saved the checkpoint was:
torch.save(model.state_dict(), check_point_file)
That is, you saved the model's state_dict (which is just a dictionary of the various parameters that together describe the current instance of the model) in check_point_file.
Now, in order to load it back, just reverse the process.
check_point_file contains just the state_dict.
It knows nothing about the internals of the model - what it's architecture is, how it's supposed to work etc.
So, load it back:
state_dict = torch.load(check_point)
This state_dict can now be copied onto your Model instance as follows:
model.load_state_dict(state_dict)
Or, more succinctly,
model.load_state_dict(torch.load(check_point))
You got the error because the torch.load(check_point) returned the state_dict which you assigned to model.
When you subsequently called model(imgs), model was an OrderedDict object (not callable).
Hence the error.
See the Serialization Semantics Notes for more details.
Apart from that, your code sure is thorough for a beginner. Great going!
P.S. Your device agnosticity is brilliant! Perhaps you'd want to take a look at:
the line model = Autoencoder().cuda()
The map_location argument of torch.load()
I'm working on word2vec via distributed tensorflow. For a compatible reason, just slightly edit official word2vec to a Model kinda coding arch.
Code Snippet as follows:
def build():
self.global_step = tf.train.get_or_create_global_step()
with tf.variable_scope("weights", partitioner=partitioner):
self.embeddings = tf.get_variable(name="embeddings", shape=(self.vocab_size, self.embedding_size), initializer=tf.random_uniform_initializer(minval=-1.0, maxval=1.0))
self.nce_weights = tf.get_variable(name="nce_weights", shape=(self.vocab_size, self.embedding_size), initializer=tf.truncated_normal_initializer(stddev=1.0/math.sqrt(self.embedding_size)))
self.bias = tf.get_variable(name="bias", shape=(self.vocab_size), initializer=tf.zeros_initializer())
self.embeded = tf.nn.embedding_lookup(self.embeddings, inputs, partition_strategy='div')
print("lables: ", self.labels)
self.loss = tf.reduce_mean(
tf.nn.nce_loss(
weights = self.nce_weights,
biases = self.bias,
labels = self.labels,
inputs = self.embeded,
num_sampled = self.num_sampled,
num_classes = self.vocab_size,
partition_strategy="div"
)
)
self.optimizer = tf.train.GradientDescentOptimizer(self.learning_rate).minimize(self.loss, global_step=self.global_step)
# evaluate
normized = tf.sqrt(tf.reduce_sum(tf.square(self.embeddings), 1, keepdims=True))
normallized_embeddings = self.embeddings / normized
valid_data = np.r_[1:5]
self.valid_size = len(valid_data)
evaluate_examples = tf.constant(valid_data)
valid_embeddings = tf.nn.embedding_lookup(normallized_embeddings, evaluate_examples)
self.similarity = tf.matmul(valid_embeddings, normallized_embeddings, transpose_b=True)
The train method:
def train(args):
loss, _, global_step, embs = session.run([self.loss, self.optimizer, self.global_step, self.embeddings])
print(embs)
Training:
def main():
model = Word2vec(args)
model.build() # call the method above to build the graph
tf.global_variables_initializer()
with tf.Session() as sess:
while num_step < upperboud:
model.train(sess)
I print out the evaluation results during training, and found no changes all the time, but nce_weights are changing. And global_step and local_step are increasing. Not sure where is wrong, anyone can help point out ? Thanks