I want to implement a model with a custom loss function using keras.
I will simulate by sampling a dataset according to the following function:
def sampler(N1, N2, N3):
np.random.seed(42)
# Sampler #1: PDE domain
t1 = np.random.uniform(low=T0,
high=T,
size=[N1,1])
s1 = np.random.uniform(low=S1,
high=S2,
size=[N1,1])
# Sampler #2: boundary condition
t2 = np.random.uniform(low=T0,
high=T,
size=[N2,1])
s2 = np.zeros(shape=(N2, 1))
# Sampler #3: initial/terminal condition
t3 = T * np.ones((N3,1)) #Terminal condition
s3 = np.random.uniform(low=S1,
high=S2,
size=[N3,1])
return (t1, s1, t2, s2, t3, s3)
Each subset (t1, s1), (t2, s2) and (t3, s3) should be evaluated by a unique term in the loss function:
def loss(model, t1, x1, t2, x2, t3, x3):
# Loss term #1: PDE
V = model(t1, x1)
V_t = tf.gradients(V, t1)[0]
V_x = tf.gradients(V, x1)[0]
V_xx = tf.gradients(V_x, x1)[0]
f = V_t + r*x1*V_x + 0.5*sigma**2*x1**2*V_xx - r*V
L1 = tf.reduce_mean(tf.square(f))
#Loss term #2: boundary condition
L2 = tf.reduce_mean(tf.square(model(t2, x2) - 0))
# Loss term #3: initial/terminal condition
L3 = tf.reduce_mean(tf.square(model(t3, x3) - tf.math.maximum(x3-K,0)))
I have established the following parameters:
# Set random seeds
np.random.seed(123)
tf.random.set_seed(123)
# Strike price
K = 0.5
# PDE parameters
r = 0.02 # Interest rate
sigma = 0.18 # Volatility
# Time limits
T0 = 0.0 + 1e-10 # Initial time
T = 1.0 # Terminal time
# Space limits
S1 = 0.0 + 1e-10 # Low boundary
S2 = 1.0 # High boundary
# Number of samples
NS_1 = 1000
NS_2 = 100
NS_3 = 100
The Model
class DGM(tf.keras.Model):
def __init__(self, n_layers, n_nodes, dimensions=1):
"""
Parameters:
- n_layers: number of layers
- n_nodes: number of nodes in (inner) layers
- dimensions: number of spacial dimensions
"""
super().__init__()
self.n_layers = n_layers
self.initial_layer = DenseLayer(dimensions + 1, n_nodes, activation="relu")
self.lstmlikelist = []
for _ in range(self.n_layers):
self.lstmlikelist.append(LSTMLikeLayer(dimensions + 1, n_nodes, activation="relu"))
self.final_layer = DenseLayer(n_nodes, 1, activation=None)
def call(self, t, x):
X = tf.concat([t,x], 1)
S = self.initial_layer.call(X)
for i in range(self.n_layers):
S = self.lstmlikelist[i].call({'S': S, 'X': X})
result = self.final_layer.call(S)
return result
# Neural network layers
class DenseLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(DenseLayer, self).__init__()
self.n_inputs = n_inputs
self.n_outputs = n_outputs
self.W = self.add_weight(shape=(self.n_inputs, self.n_outputs),
initializer='random_normal',
trainable=True)
self.b = self.add_weight(shape=(1, self.n_outputs),
initializer='random_normal',
trainable=True)
self.activation = _get_function(activation)
def call(self, inputs):
S = tf.add(tf.matmul(inputs, self.W), self.b)
S = self.activation(S)
return S
class LSTMLikeLayer(tf.keras.layers.Layer):
def __init__(self, n_inputs, n_outputs, activation):
"""
Parameters:
- n_inputs: number of inputs
- n_outputs: number of outputs
- activation: activation function
"""
super(LSTMLikeLayer, self).__init__()
self.n_outputs = n_outputs
self.n_inputs = n_inputs
self.Uz = self.add_weight("Uz", shape=[self.n_inputs, self.n_outputs])
self.Ug = self.add_weight("Ug", shape=[self.n_inputs, self.n_outputs])
self.Ur = self.add_weight("Ur", shape=[self.n_inputs, self.n_outputs])
self.Uh = self.add_weight("Uh", shape=[self.n_inputs, self.n_outputs])
self.Wz = self.add_weight("Wz", shape=[self.n_outputs, self.n_outputs])
self.Wg = self.add_weight("Wg", shape=[self.n_outputs, self.n_outputs])
self.Wr = self.add_weight("Wr", shape=[self.n_outputs, self.n_outputs])
self.Wh = self.add_weight("Wh", shape=[self.n_outputs, self.n_outputs])
self.bz = self.add_weight("bz", shape=[1, self.n_outputs])
self.bg = self.add_weight("bg", shape=[1, self.n_outputs])
self.br = self.add_weight("br", shape=[1, self.n_outputs])
self.bh = self.add_weight("bh", shape=[1, self.n_outputs])
self.activation = _get_function(activation)
def call(self, inputs):
S = inputs['S']
X = inputs['X']
Z = self.activation(tf.add(tf.add(tf.matmul(X, self.Uz), tf.matmul(S, self.Wz)), self.bz))
G = self.activation(tf.add(tf.add(tf.matmul(X, self.Ug), tf.matmul(S, self.Wg)), self.bg))
R = self.activation(tf.add(tf.add(tf.matmul(X, self.Ur), tf.matmul(S, self.Wr)), self.br))
H = self.activation(tf.add(tf.add(tf.matmul(X, self.Uh), tf.matmul(tf.multiply(S, R), self.Wh)), self.bh))
Snew = tf.add(tf.multiply(tf.subtract(tf.ones_like(G), G), H), tf.multiply(Z, S))
return Snew
def _get_function(name):
f = None
if name == "tanh":
f = tf.nn.tanh
elif name == "sigmoid":
f = tf.nn.sigmoid
elif name == "relu":
f = tf.nn.relu
elif not name:
f = tf.identity
assert f is not None
return f
This would be my approach for an exemplary model:
model = DGM(n_layers=2, n_nodes = 3)
model.compile(
optimizer = "Adam", loss = loss
)
model.fit()
Error trace back:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-f781fe65eb00> in <module>()
----> 1 model.fit()
2 frames
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _method_wrapper(self, *args, **kwargs)
106 def _method_wrapper(self, *args, **kwargs):
107 if not self._in_multi_worker_mode(): # pylint: disable=protected-access
--> 108 return method(self, *args, **kwargs)
109
110 # Running inside `run_distribute_coordinator` already.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1030 version_utils.disallow_legacy_graph('Model', 'fit')
1031 self._assert_compile_was_called()
-> 1032 self._check_call_args('fit')
1033 _disallow_inside_tf_function('fit')
1034
/usr/local/lib/python3.6/dist-packages/tensorflow/python/keras/engine/training.py in _check_call_args(self, method_name)
2468 'Models passed to `' + method_name + '` can only have `training` '
2469 'and the first argument in `call` as positional arguments, '
-> 2470 'found: ' + str(extra_args) + '.')
2471
2472 def _validate_compile(self, optimizer, metrics, **kwargs):
ValueError: Models passed to `fit` can only have `training` and the first argument in `call` as positional arguments, found: ['x'].
Now how can I fit the model in accordance with the customary loss function I have set up?
Related
Neural network optimization algorithm : ValueError:optimizer got an empty parameter list
I tried to fix the code. However, It got another error called RecursionError.
I don't know how to solve this error.
RecursionError: maximum recursion depth exceeded while
calling a Python object
RecursionErrorTraceback (most recent call last)
--------------------------------------------------------------------------
<ipython-input-97-77e92a8e7336> in <module>()
2 sampler = optuna.samplers.TPESampler(seed=0)
3 study = optuna.create_study(sampler=sampler, direction='maximize')
----> 4 study.optimize(objective, n_trials=100)
489 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in __
setattr__(self, name, value)
1188
1189 params = self.__dict__.get('_parameters')
-> 1190 if isinstance(value, Parameter):
1191 if params is None:
1192 raise AttributeError(
RecursionError: maximum recursion
depth exceeded while
calling a Python object
class Net(pl.LightningModule):
def __init__(self, optimizer_name='SGD', n_layers=1, n_mid=3,lr=0.01):
super().__init__()
self.net = Net()
self.optimizer_name = optimizer_name
self.n_layers = n_layers
self.n_mid = n_mid
self.lr = lr
self.layers = nn.Sequential()
def forward(self, x):
h = self.layers(x)
return h
def training_step(self, batch, batch_idx):
x, t = batch
y = self(x)
loss = F.cross_entropy(y, t)
self.log('train_loss',loss,on_step=True,on_epoch=
True,prog_bar=True)
self.log('train_acc', accuracy(y, t), on_step=True, on_epoch=True,
prog_bar=True)
return loss
def validation_step(self, batch, batch_idx):
x, t = batch
y = self(x)
loss = F.cross_entropy(y, t)
self.log('val_loss', loss, on_step=False, on_epoch=True)
self.log('val_acc', accuracy(y, t), on_step=False, on_epoch=True)
return loss
def test_step(self, batch, batch_idx):
x, t = batch
y = self(x)
loss = F.cross_entropy(y, t)
self.log('test_loss', loss, on_step=False, on_epoch=True)
self.log('test_acc', accuracy(y, t), on_step=False, on_epoch=True)
return loss
def configure_optimizers(self) -> optim.Optimizer:
optimizer = getattr(torch.optim, self.optimizer_name)
(self.net.parameters(), lr=self.lr)
return optimizer
def objective(trial: optuna.trial.Trial) -> float:
optimizer_name = trial.suggest_categorical('optimizer', ['SGD',
'Adagrad', 'RMSprop', 'Adadelta', 'Adam', 'AdamW'])
lr = trial.suggest_loguniform('lr', 1e-5, 1e-1)
pl.seed_everything(0)
net = Net(optimizer_name, lr)
trainer = pl.Trainer(max_epochs=10, gpus=1, deterministic=True,
callbacks=[EarlyStopping(monitor='val_acc')])
trainer.fit(net, train_loader, val_loader)
result = trainer.callback_metrics['val_acc']
return result
sampler = optuna.samplers.TPESampler(seed=0)
study = optuna.create_study(sampler=sampler, direction='maximize')
study.optimize(objective, n_trials=100)
What could be causing this?
While trying to train my model with 2 targets, I get the error…“ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 2]))”. I have a data set with 2 targets. I tried a lot even by resizing the tensors but no use. Also if I make the output_dim = 1, it always predicts the same class out of two.
** Loading Training data
class SwelltrainDataset(T.utils.data.Dataset):
def __init__(self, Swelltrain):
sc = StandardScaler()
X_tr = sc.fit_transform(X_train)
Y_tr = y_train
self.X_tr = torch.tensor(X_tr, dtype = torch.float32)
self.Y_tr = torch.tensor(Y_tr, dtype = torch.float32)
def __len__(self):
return len(self.Y_tr)
def __getitem__(self, idx):
return self.X_tr[idx], self.Y_tr[idx]
train_ds = SwelltrainDataset(Swelltrain)
bat_size = 1
idx = np.append(np.where(train_ds.Y_tr == 0)[0],
np.where(train_ds.Y_tr == 1)[0],
)
train_ds.X_tr = train_ds.X_tr[idx]
train_ds.Y_tr = train_ds.Y_tr[idx]
train_ldr = T.utils.data.DataLoader(train_ds,
batch_size=bat_size, shuffle=True)
batch = next(iter(train_ldr))
I am using LSTM Model with dimensions: input_dim = 16, hidden_dim = 100, layer_dim = 1, output_dim = 2
class LSTMModel(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super(LSTMModel, self).__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm = nn.LSTM(input_dim, hidden_dim, layer_dim, dropout=1, batch_first=True, )
self.fc = nn.Linear(hidden_dim, output_dim)
def forward(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim).requires_grad_()
x, (hn, cn) = self.lstm(x, (h0.detach(), c0.detach()))
x = self.fc(x[:, -1, :])
return (x)
**Model Training
optimizer = optim.Adam(model.parameters(), lr=0.001)
loss_func = nn.BCEWithLogitsLoss()
epochs = 2
loss_list = []
model.train()
for epoch in range(epochs):
total_loss = []
for X_tr, Y_tr in train_ldr:
X_tr = X_tr.unsqueeze(1)
Y_tr = Y_tr.type(torch.LongTensor)
Y_tr = Y_tr.unsqueeze(1)
optimizer.zero_grad()
output = model(X_tr.float())
pred = output.argmax(dim=1, keepdim=True)
loss = loss_func(output, Y_tr.float())
loss.backward()
optimizer.step()
total_loss.append(loss.item())
loss_list.append(sum(total_loss)/len(total_loss))
print('Training [{:.0f}%]\tLoss: {:.4f}'.format(
100. * (epoch + 1) / epochs, loss_list[-1]))
ValueError Traceback (most recent call last)
<ipython-input-30-1ab26e6f45d7> in <module>
31
32 # print(Y_tr.size())
---> 33 loss = loss_func(output, Y_tr.float())
34
35 # Backward pass
~\anaconda3\lib\site-packages\torch\nn\modules\module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
~\anaconda3\lib\site-packages\torch\nn\modules\loss.py in forward(self, input, target)
712 assert self.weight is None or isinstance(self.weight, Tensor)
713 assert self.pos_weight is None or isinstance(self.pos_weight, Tensor)
--> 714 return F.binary_cross_entropy_with_logits(input, target,
715 self.weight,
716 pos_weight=self.pos_weight,
~\anaconda3\lib\site-packages\torch\nn\functional.py in binary_cross_entropy_with_logits(input, target, weight, size_average, reduce, reduction, pos_weight)
2825
2826 if not (target.size() == input.size()):
-> 2827 raise ValueError("Target size ({}) must be the same as input size ({})".format(target.size(), input.size()))
2828
2829 return torch.binary_cross_entropy_with_logits(input, target, weight, pos_weight, reduction_enum)
ValueError: Target size (torch.Size([1, 1])) must be the same as input size (torch.Size([1, 2]))
Try converting your Y_tr variable into a 2 class one hot label:
one_hot_label = torch.nn.functional.one_hot(Y_tr.to(torch.int64), 2)
one_hot_label = one_hot_label.float()
This should have the shape of target torch.Size([1, 2])
I'm trying to make a variational autoencoder with Keras using a custom model.
Note that if I don't have a validation dataset it doesn't error! I believe this error is due to the validation loss logs but I can't manage to work it out.
This is my model:
class VAE(keras.Model):
def __init__(self, encoder, decoder, **kwargs):
super(VAE, self).__init__(**kwargs)
self.encoder = encoder
self.decoder = decoder
# Train
self.total_train_loss_tracker = keras.metrics.Mean(name="total_train_loss")
self.recon_train_loss_tracker = keras.metrics.Mean(name="recon_train_loss")
self.kl_train_loss_tracker = keras.metrics.Mean(name="kl_train_loss")
# Val
self.total_val_loss_tracker = keras.metrics.Mean(name="total_val_loss")
self.recon_val_loss_tracker = keras.metrics.Mean(name="recon_val_loss")
self.kl_val_loss_tracker = keras.metrics.Mean(name="kl_val_loss")
#property
def metrics(self):
return [
self.total_train_loss_tracker,
self.recon_train_loss_tracker,
self.kl_train_loss_tracker,
self.total_val_loss_tracker,
self.recon_val_loss_tracker,
self.kl_val_loss_tracker
]
def train_step(self, data):
with tf.GradientTape() as tape:
x, y, weights = data
z_mean, z_log_var, z = self.encoder(x)
reconstruction = self.decoder(z)
## Loss
# reconstruction
recon_loss = binary_crossentropy(x, reconstruction) # Shape = BATCH_SIZE
# Weights on recon loss
recon_train_loss = (weights * recon_loss) / K.sum(weights)
recon_loss = K.mean(recon_loss, axis = 0)
# KL
kl_loss = -0.5 * (1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
kl_loss = K.mean(K.sum(kl_loss, axis=1), axis=0)
# Weights on KL Loss
kl_loss = (weights * kl_loss) / K.sum(weights)
kl_loss = K.mean(kl_loss, axis = 0)
# Total
total_loss = recon_loss + kl_loss
# Step
grads = tape.gradient(total_loss, self.trainable_weights)
self.optimizer.apply_gradients(zip(grads, self.trainable_weights))
# Log
self.total_train_loss_tracker.update_state(total_loss)
self.recon_train_loss_tracker.update_state(recon_loss)
self.kl_train_loss_tracker.update_state(kl_loss)
return {
"total_train_loss": self.total_train_loss_tracker.result(),
"recon_train_loss": self.recon_train_loss_tracker.result(),
"kl_train_loss": self.kl_train_loss_tracker.result(),
}
def validation_step(self, data):
# No tape, we don't need gradients
x, y = data
print("1", type(x), type(y))
z_mean, z_log_var, z = self.encoder(x)
reconstruction = self.decoder(z)
## Loss
# reconstruction
recon_loss = binary_crossentropy(x, reconstruction) # Shape = BATCH_SIZE
# KL
kl_loss = -0.5 * (1 + z_log_var - K.square(z_mean) - K.exp(z_log_var))
kl_loss = K.mean(K.sum(kl_loss, axis=1), axis=0)
# Total
total_loss = recon_loss + kl_loss
loss = recon_loss*self.alpha + kl_loss
# Log
self.total_val_loss_tracker.update_state(total_loss)
self.recon_val_loss_tracker.update_state(recon_loss)
self.kl_val_loss_tracker.update_state(kl_loss)
return {
"total_val_loss": self.total_val_loss_tracker.result(),
"recon_val_loss": self.recon_val_loss_tracker.result(),
"kl_val_loss": self.kl_val_loss_tracker.result(),
}
def test_step(self, data):
# No tape, we don't need gradients
x, y = data
z_mean, z_log_var, z = self.encoder(x)
reconstruction = self.decoder(z)
return reconstruction, z_mean, z_log_var, z # z is the latent vector
# Compile
vae = VAE(encoder, decoder)
vae.compile(optimizer=keras.optimizers.Adam())
And when I fit the model with:
## Callbacks
# Model name
name = str(datetime.now().strftime("%d_%m_%Y__%H_%M_%S"))
# Tensorboard
TB = keras.callbacks.TensorBoard(log_dir=join("logs", name), write_images=True)
# Early Stopping
ES = keras.callbacks.EarlyStopping(monitor="total_val_loss", patience=30, verbose=2, mode="min")
# Model Checkpoint
MC = keras.callbacks.ModelCheckpoint(filepath=join("models_tf", name), save_best_only=True, monitor="total_val_loss", mode="min")
# Fit
history = vae.fit(
# Train
x=x_train,
y=x_train,
sample_weight=x_train_weights,
# Validation
validation_data=(x_val,x_val),
# Hyper-parameters
epochs=30,
batch_size=4048,
callbacks=[TB, ES, MC])
It outputs:
Epoch 1/30
90/92 [============================>.] - ETA: 0s - total_train_loss: -1.3369 - recon_train_loss: -1.3404 - kl_train_loss: 0.0035
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-17-b3ea8a6b1b64> in <module>
1 # Fit
----> 2 history = vae.fit(
3 # Train
4 x=x_train,
5 y=x_train,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\python\keras\engine\training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1212 model=self,
1213 steps_per_execution=self._steps_per_execution)
-> 1214 val_logs = self.evaluate(
1215 x=val_x,
1216 y=val_y,
~\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\python\keras\engine\training.py in evaluate(self, x, y, batch_size, verbose, sample_weight, steps, callbacks, max_queue_size, workers, use_multiprocessing, return_dict, **kwargs)
1494 callbacks.on_test_batch_end(end_step, logs)
1495 logs = tf_utils.sync_to_numpy_or_python_type(logs)
-> 1496 callbacks.on_test_end(logs=logs)
1497
1498 if return_dict:
~\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\python\keras\callbacks.py in on_test_end(self, logs)
543 logs = self._process_logs(logs)
544 for callback in self.callbacks:
--> 545 callback.on_test_end(logs)
546
547 def on_predict_begin(self, logs=None):
~\AppData\Local\Programs\Python\Python39\lib\site-packages\tensorflow\python\keras\callbacks.py in on_test_end(self, logs)
2391 if self.model.optimizer and hasattr(self.model.optimizer, 'iterations'):
2392 with summary_ops_v2.record_if(True), self._val_writer.as_default():
-> 2393 for name, value in logs.items():
2394 summary_ops_v2.scalar(
2395 'evaluation_' + name + '_vs_iterations',
AttributeError: 'tuple' object has no attribute 'items'
I'm currently using Keras and tensorflow 2.5.0
I had the same AttributeError: 'tuple' object has no attribute 'items' and thanks to you, I solved it.
The problem was at the end of train_step(), I returned return d_loss, g_loss, generated_images, which resolved to a touple with then threw the Error.
Instead, I returned a dict:
return {'d_loss': d_loss, 'g_loss': g_loss, 'generated_images': generated_images} and it worked.
So looking at your code, in test_step() you just return a touple instead of a dict.
Hope this is helpful
I am working on clinical EHR. I am currently referring to this blog and github link here. I have generated the dataset and processed it as per the instructions in the notebooks present in the repository. I am facing an issue trying to train the model.
build_EHRNN class.
torch.manual_seed(1)
class build_EHRNN(nn.Module):
def __init__(self, inputDimSize=4894, hiddenDimSize=[200,200], batchSize=100, embSize=200, numClass=4894, dropout=0.5, logEps=1e-8):
super(build_EHRNN, self).__init__()
self.inputDimSize = inputDimSize
self.hiddenDimSize = hiddenDimSize
self.numClass = numClass
self.embSize = embSize
self.batchSize = batchSize
self.dropout = nn.Dropout(p=0.5)
self.logEps = logEps
# Embedding inputs
self.W_emb = nn.Parameter(torch.randn(self.inputDimSize, self.embSize).cuda())
self.b_emb = nn.Parameter(torch.zeros(self.embSize).cuda())
self.W_out = nn.Parameter(torch.randn(self.hiddenDimSize, self.numClass).cuda())
self.b_out = nn.Parameter(torch.zeros(self.numClass).cuda())
self.params = [self.W_emb, self.W_out,
self.b_emb, self.b_out]
def forward(self,x, y, h, lengths, mask):
self.emb = torch.tanh(torch.matmul(x, self.W_emb) + self.b_emb)
input_values = self.emb
self.outputs = [input_values]
for i, hiddenSize in enumerate([self.hiddenDimSize, self.hiddenDimSize]): # iterate over layers
rnn = EHRNN(self.inputDimSize,hiddenSize,self.embSize,self.batchSize,self.numClass) # calculate hidden states
hidden_state = []
h = self.init_hidden().cuda()
for i,seq in enumerate(input_values): # loop over sequences in each batch
h = rnn(seq, h)
hidden_state.append(h)
hidden_state = self.dropout(torch.stack(hidden_state)) # apply dropout between layers
input_values = hidden_state
y_linear = torch.matmul(hidden_state, self.W_out) + self.b_out # fully connected layer
yhat = F.softmax(y_linear, dim=1) # yhat
yhat = yhat*mask[:,:,None] # apply mask
# Loss calculation
cross_entropy = -(y * torch.log(yhat + self.logEps) + (1. - y) * torch.log(1. - yhat + self.logEps))
last_step = -torch.mean(y[-1] * torch.log(yhat[-1] + self.logEps) + (1. - y[-1]) * torch.log(1. - yhat[-1] + self.logEps))
prediction_loss = torch.sum(torch.sum(cross_entropy, dim=0),dim=1)/ torch.cuda.FloatTensor(lengths)
cost = torch.mean(prediction_loss) + 0.000001 * (self.W_out ** 2).sum() # regularize
return (yhat, hidden_state, cost)
def init_hidden(self):
return torch.zeros(self.batchSize, self.hiddenDimSize) # initial state
Creating instance and training model
model = build_EHRNN(inputDimSize=4894, hiddenDimSize=200, batchSize=100, embSize=200, numClass=4894, dropout=0.5, logEps=1e-8)
model = model.to(device)
optimizer = torch.optim.Adadelta(model.parameters(), lr = 0.01, rho=0.90)
max_epochs = 10
loss_all = []
iteration = 0
for e in range(max_epochs):
for index in random.sample(range(n_batches), n_batches):
batchX = train[0][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
batchY = train[1][:n_batches*batchSize][index*batchSize:(index+1)*batchSize]
optimizer.zero_grad()
x, y, lengths, mask = padding(batchX, batchY, 4894, 4894)
if torch.cuda.is_available():
x, y, lenghts, mask = x.cuda(), y.cuda(), lengths, mask.cuda()
outputs, hidden, cost = model(x,y, h, lengths, mask)
if torch.cuda.is_available():
cost.cuda()
cost.backward()
nn.utils.clip_grad_norm_(model.parameters(), 5)
optimizer.step()
Error:
---------------------------------------------------------------------------
NameError Traceback (most recent call last)
<ipython-input-14-cff1f002dced> in <module>()
17 x, y, lenghts, mask = x.cuda(), y.cuda(), lengths, mask.cuda()
18
---> 19 outputs, hidden, cost = model(x,y, h, lengths, mask)
20
21 if torch.cuda.is_available():
NameError: name 'h' is not defined
Update:
Removing 'h' param produces the following error
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-14-6495250d91c9> in <module>()
18
19 # outputs, hidden, cost = model(x,y, h, lengths, mask)
---> 20 outputs, hidden, cost = model(x, y, lengths, mask)
21
22 if torch.cuda.is_available():
1 frames
/usr/local/lib/python3.6/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
720 result = self._slow_forward(*input, **kwargs)
721 else:
--> 722 result = self.forward(*input, **kwargs)
723 for hook in itertools.chain(
724 _global_forward_hooks.values(),
<ipython-input-7-3c831fe3ca8d> in forward(self, x, y, lengths, mask)
36 h = rnn(seq, h)
37 hidden_state.append(h)
---> 38 hidden_state = self.dropout(torch.stack(hidden_state)) # apply dropout between layers
39 input_values = hidden_state
40
RuntimeError: stack expects a non-empty TensorList
I think I fixed your error:
replace your forward method to:
def forward(self,x, y, lengths, mask):
self.emb = torch.tanh(torch.matmul(x, self.W_emb) + self.b_emb)
input_values = self.emb
self.outputs = [input_values]
for i, hiddenSize in enumerate([self.hiddenDimSize, self.hiddenDimSize]): # iterate over layers
rnn = EHRNN(self.inputDimSize,hiddenSize,self.embSize,self.batchSize,self.numClass) # calculate hidden states
hidden_state = []
h = self.init_hidden().cuda()
for i,seq in enumerate(input_values): # loop over sequences in each batch
h = rnn(seq, h)
hidden_state.append(h)
hidden_state = self.dropout(torch.stack(hidden_state)) # apply dropout between layers
input_values = hidden_state
y_linear = torch.matmul(hidden_state, self.W_out) + self.b_out # fully connected layer
yhat = F.softmax(y_linear, dim=1) # yhat
yhat = yhat*mask[:,:,None] # apply mask
and replace the line where the error happens to:
outputs, hidden, cost = model(x, y, lengths, mask)
Following the tutorial from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
There is a USE_CUDA flag that is used to control the variable and tensor types between CPU (when False) to GPU (when True) types.
Using the data from en-fr.tsv and converting the sentences to variables:
import unicodedata
import string
import re
import random
import time
import math
from gensim.corpora.dictionary import Dictionary
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F
import numpy as np
MAX_LENGTH = 10
USE_CUDA = False
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
s = unicode_to_ascii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
return s
SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'
lines = open('en-fr.tsv').read().strip().split('\n')
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)
src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)
trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)
def variablize_sentences(sentence, dictionary):
indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]]
var = Variable(LongTensor(indices).view(-1, 1))
return var.cuda() if USE_CUDA else var
input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents]
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents]
And using a Encoder-Attn-Decoder network:
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
def forward(self, word_inputs, hidden):
seq_len = len(word_inputs)
embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
embedded = embedded.cuda() if USE_CUDA else embedded
output, hidden = self.gru(embedded, hidden)
output = output.cuda() if USE_CUDA else output
hiddne = hidden.cuda() if USE_CUDA else hidden
return output, hidden
def init_hidden(self):
hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
return hidden.cuda() if USE_CUDA else hidden
class Attn(nn.Module):
def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.other = nn.Parameter(FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
seq_len = len(encoder_outputs)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
# Calculate energies for each encoder output
for i in range(seq_len):
attn_energies[i] = self.score(hidden, encoder_outputs[i])
# Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy =torch.dot(hidden.view(-1), encoder_output.view(-1))
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = torch.dot(hidden.view(-1), energy.view(-1))
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = torch.dot(self.v.view(-1), energy.view(-1))
return energy
class AttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
# Keep parameters for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout_p = dropout_p
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
self.out = nn.Linear(hidden_size * 2, output_size)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
self.out = self.out.cuda() if USE_CUDA else self.out
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
self.attn = self.attn.cuda() if USE_CUDA else self.attn
def forward(self, word_input, last_context, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
# Combine embedded input word and last context, run through RNN
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
rnn_output, hidden = self.gru(rnn_input, last_hidden)
# Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
# Final output layer (next word prediction) using the RNN hidden state and context vector
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
if USE_CUDA:
return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
else:
return output, context, hidden, attn_weights
And testing the network:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
The code works fine on CPU,
[out]:
EncoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(20, 10, num_layers=2, dropout=0.1)
(out): Linear (20 -> 10)
(attn): Attn (
(attn): Linear (10 -> 10)
)
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.2325 0.0775 0.5415 0.4876 -0.5771 -0.0687 0.1832 -0.5285 0.2508
Columns 9 to 9
-0.1837
(1 ,.,.) =
Columns 0 to 8
-0.1389 -0.2605 -0.0518 0.3405 0.0774 0.1815 0.0297 -0.1304 -0.1015
Columns 9 to 9
0.2602
[torch.FloatTensor of size 2x1x10]
Variable containing:
(0 ,.,.) =
0.3334 0.3291 0.3374
[torch.FloatTensor of size 1x1x3]
but when changing the flag to USE_GPU=True, it throws the error when initializing the decoder_test object, it throws a TypeError:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
13
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
15 print(decoder_output)
16 print(decoder_hidden)
~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
222 for hook in self._forward_pre_hooks.values():
223 hook(self, input)
--> 224 result = self.forward(*input, **kwargs)
225 for hook in self._forward_hooks.values():
226 hook_result = hook(self, input, result)
<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
32
33 # Combine embedded input word and last context, run through RNN
---> 34 rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
35 rnn_output, hidden = self.gru(rnn_input, last_hidden)
36
~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
895 #staticmethod
896 def cat(iterable, dim=0):
--> 897 return Concat.apply(dim, *iterable)
898
899 #staticmethod
~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
315 ctx.dim = dim
316 ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317 return torch.cat(inputs, dim)
318
319 #staticmethod
TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
* (sequence[torch.cuda.FloatTensor] seq)
* (sequence[torch.cuda.FloatTensor] seq, int dim)
didn't match because some of the arguments have invalid types: (tuple, int)
The question is why are that types not matching in CUDA but it works on CPU and how to resolve this?
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
You can also try:
net = YouNetworkClass()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
After that, you have to send the word_inputs, encoder_hidden and decoder_context to the GPU too:
word_inputs, encoder_hidden, decoder_context = word_inputs.to(device), encoder_hidden.to(device), decoder_context.to(device)
Look here: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#training-on-gpu
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
Nope.
(Source: https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604)
Specific to the example:
The input variables to the decoder_test object needs to be in .cuda() type. More specifically:
encoder_hidden = encoder_test.init_hidden()
---> encoder_hidden = encoder_test.init_hidden().cuda()
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
---> decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()
So the code to test the network should be:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden().cuda()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
[out]:
Variable containing:
-2.1412 -2.4589 -2.4042 -2.1591 -2.5080 -2.0839 -2.5058 -2.3831 -2.4468 -2.0804
[torch.cuda.FloatTensor of size 1x10 (GPU 0)]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.0264 -0.0689 0.1049 0.0760 0.1017 -0.4585 -0.1273 0.0449 -0.3271
Columns 9 to 9
-0.0104
(1 ,.,.) =
Columns 0 to 8
-0.0308 -0.0690 -0.0258 -0.2759 0.1403 -0.0468 -0.0205 0.0126 -0.1729
Columns 9 to 9
0.0599
[torch.cuda.FloatTensor of size 2x1x10 (GPU 0)]
Variable containing:
(0 ,.,.) =
0.3328 0.3328 0.3344
[torch.cuda.FloatTensor of size 1x1x3 (GPU 0)]
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
Yes. You can set the default tensor type to cuda with:
torch.set_default_tensor_type('torch.cuda.FloatTensor')