I have a network which outputs a vector of length two. My targets are 1s or 0s, referring to two possible categories. What is the best way to compute the loss: should I transform the targets, for example into a two-dimensional vector, or should I transform the output of the network, e.g. take the location of the max number (the argmax) as the output?
My network looks like:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
self.fc2 = nn.Linear(32, 1)
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
self.batch_normalisation2 = nn.BatchNorm1d(2)
self.activation = nn.Softmax(dim=2)
def forward(self, x):
h0, c0 = self.init_hidden(x)
out, (hn1, cn1) = self.lstm1(x, (h0, c0))
        out = self.dropout(out)
out = self.batch_normalisation1(out)
h1, c1 = self.init_hidden(out)
out, (hn2, cn2) = self.lstm2(out, (h1, c1))
out = self.dropout(out)
out = self.batch_normalisation1(out)
h2, c2 = self.init_hidden(out)
out, (hn3, cn3) = self.lstm2(out, (h2, c2))
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
out = self.fc2(out)
return out
def init_hidden(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        return h0, c0
def pred(self, x):
out = self(x)
return out > 0
An example of input to this network is:
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
[2.3597e-04, 1.1507e-02, 8.7719e-02, 6.1093e-02, 9.5556e-01],
[2.1474e-03, 5.3805e-03, 9.6491e-02, 2.2508e-01, 8.2222e-01]]])
which has shape torch.Size([1, 3, 5]). The target is currently 1 or 0. However, the network outputs a vector such as:
tensor([[0.5293, 0.4707]], grad_fn=<SoftmaxBackward>)
What would be the best way to set up the loss between this target and the network output?
Update:
I can now train the model as suggested in the answers as:
model = LSTMClassifier(5, 128, 3, 1)
Epochs = 10
batch_size = 32
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)
for epoch in range(Epochs):
if epoch == 0:
accurate = 0
        for X_instance, y_instance in zip(val_x, val_y):
            # count a prediction as correct when it matches the label
            if int(y_instance) == int(model.pred(X_instance.view(-1, 3, 5)).item()):
                accurate += 1
print(f"Untrained accuracy test set: {accurate/len(val_x)}")
print(f"Epoch {epoch + 1}")
for n, (X, y) in enumerate(train_batches):
model.train()
optimizer.zero_grad()
y_pred = model(X)
        loss = criterion(y_pred, y.float().view_as(y_pred))  # BCEWithLogitsLoss expects float targets shaped like the logits
loss.backward()
optimizer.step()
model.eval()
accurate = 0
    for X_instance, y_instance in zip(val_x, val_y):
        # count a prediction as correct when it matches the label
        if int(y_instance) == int(model.pred(X_instance.view(-1, 3, 5)).item()):
            accurate += 1
print(f"Accuracy test set: {accurate/len(val_x)}")
You shouldn't use any activation at the end of your network; output a single neuron instead of two and train with BCEWithLogitsLoss.
Below is your neural network code with commentary and removal of unnecessary parts:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
# Output 1 neuron instead of two
self.fc2 = nn.Linear(32, 1)
# Model should not depend on batch size
# self.batch_size = None
# You are not using this variable
# self.hidden = None
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
        self.batch_normalisation2 = nn.BatchNorm1d(2)  # note: unused in forward
def forward(self, x):
# Hidden are initialized with 0 explicitly
# h0, c0 = self.init_hidden(x)
out, _ = self.lstm1(x)
# No need for initial values
# out, (hn1, cn1) = self.lstm1(x, (h0, c0))
out = self.dropout(out)
out = self.batch_normalisation1(out)
        # The same goes for the other cells you re-initialize with zeros; it's implicit
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
# No need for activation
# out = F.softmax(self.fc2(out))
out = self.fc2(out)
return out
# Return True (1) or False (0)
def pred(self, x):
return self(x) > 0
I have also added a pred method which transforms logits into targets (e.g. to use with some metrics).
Basically, if your logit is lower than 0 it is False; otherwise it is True. No activation is needed in this case.
Related
I want to train on 100 values for 3000 days and predict the 100 values for the next day, but I don't know why this error happens or how to resolve the problem. Please help me.
batchsize = 100, hidden_dim = 10, seq_length = 60, data_dim = 100, output_dim = 100
class Net(nn.Module):
def __init__(self, input_dim, hidden_dim, batch_size, output_dim, layers):
super(Net, self).__init__()
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.output_dim = output_dim
self.layers = layers
self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=layers,
batch_first=True)
self.fc = nn.Linear(hidden_dim, output_dim, bias = True)
self.hidden = self.reset_hidden_state()
def reset_hidden_state(self):
return (
torch.zeros(self.layers, self.batch_size, self.hidden_dim),
torch.zeros(self.layers, self.batch_size, self.hidden_dim))
def forward(self, x):
x, self.hidden = self.lstm(x, self.hidden)
x = self.fc(x[:, -1,:]) # [batch_size, seq_len, hidden_dim]
return x
# Train part
def train_model(model, train_df, num_epochs = None, lr = None, verbose = 10, patience = 10):
criterion = nn.MSELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr = lr)  # use the lr argument rather than a global learning_rate
nb_epochs = num_epochs
train_hist = np.zeros(nb_epochs)
for epoch in range(nb_epochs):
avg_cost = 0
total_batch = len(train_df)
for batch_idx, samples in enumerate(train_df):
x_train, y_train = samples
            # reset the hidden state for each sequence
            model.hidden = tuple(hidden.to(device) for hidden in model.reset_hidden_state())
outputs = model(x_train)
loss = criterion(outputs, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
avg_cost += loss/total_batch
train_hist[epoch] = avg_cost
if epoch % verbose == 0:
print('Epoch:', '%04d' % (epoch), 'train loss :', '{:.4f}'.format(avg_cost))
if (epoch % patience == 0) & (epoch != 0):
if train_hist[epoch-patience] < train_hist[epoch]:
print('\n Early Stopping')
break
return model.eval(), train_hist
Prediction part:
net = Net(data_dim, hidden_dim, batch, output_dim, 1).to(device)
model, train_hist = train_model(net, dataloader, num_epochs = nb_epochs, lr = learning_rate, verbose = 20, patience = 10)
with torch.no_grad():
pred = np.zeros((2940,1,100))
for pr in range(len(trainX_tensor)):
model.reset_hidden_state()
predicted = model(torch.unsqueeze(trainX_tensor[pr], 0)).cpu()
print(predicted.shape)
#predicted = predicted.item()
pred[pr,:,:] = predicted
#print(pr)
# INVERSE
pred_inverse = scaler.inverse_transform(pred)
Expected hidden[0] size (1, 1, 10), got [1, 100, 10]
How can I resolve this hidden[0] size mismatch?
This is my attention layer code (an implementation of an attention layer):
class Attention(nn.Module):
def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
super(Attention, self).__init__(**kwargs)
self.supports_masking = True
self.bias = bias
self.feature_dim = feature_dim
self.step_dim = step_dim
self.features_dim = 0
weight = torch.zeros(feature_dim, 1)
nn.init.kaiming_uniform_(weight)
self.weight = nn.Parameter(weight)
if bias:
self.b = nn.Parameter(torch.zeros(step_dim))
def forward(self, x, mask=None):
feature_dim = self.feature_dim
step_dim = self.step_dim
eij = torch.mm(
x.contiguous().view(-1, feature_dim),
self.weight
).view(-1, step_dim)
if self.bias:
eij = eij + self.b
eij = torch.tanh(eij)
a = torch.exp(eij)
if mask is not None:
a = a * mask
a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
This is the RNN code:
# Instantiate the model w/ hyperparams
weights_matrix = weights_matrix
output_size = 13 # number of classes to predict
hidden_dim = 64
drop_prob = 0.5
# The RNN model that will be used to perform classification
class AttentionLSTM(nn.Module):
def __init__(self, weights_matrix, output_size, hidden_dim, drop_prob):
super(AttentionLSTM, self).__init__()
# embedding layers
self.embedding, self.num_embeddings, self.embeddings_size = create_emb_layer(weights_matrix, True)
# embedding dropout
self.dropout = nn.Dropout2d(drop_prob)
# First lstm and GRU layers
self.lstm1 = nn.LSTM(self.embeddings_size, hidden_dim, batch_first=True, bidirectional=True)
self.gru1 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
# attention layer
self.attention = Attention(hidden_dim*2, seq_length)
# Second lstm and GRU layers
self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
self.gru2 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
# linear
self.fc = nn.Linear(hidden_dim * 2, hidden_dim * 2)
self.out = nn.Linear(hidden_dim * 2, output_size)
# activation functions
self.sigmoid = nn.Sigmoid() # for hidden layers
self.softmax = nn.Softmax(dim=1) # for output layer
def forward(self, x):
batch_size = x.size(0)
# embedding output
x = x.long()
embeds = self.embedding(x)
embeds = torch.squeeze(torch.unsqueeze(embeds, 0))
# lstm, and gru outputs
lstm_out1, _ = self.lstm1(embeds)
gru_out1, _ = self.gru1(lstm_out1)
gru_out1 = gru_out1.view(batch_size, -1, hidden_dim * 2)
        attention_out = self.attention(gru_out1, seq_length)  # note: forward's second positional argument is a mask, not a sequence length
attention_out = attention_out.view(batch_size, -1, hidden_dim * 2)
attention_out = self.sigmoid(attention_out)
lstm_out2, _ = self.lstm2(attention_out)
# slice lstm_out to just get output of last element of the input sequence
lstm_out2 = lstm_out2[:, -1]
gru_out2, _ = self.gru2(lstm_out2)
# linear outputs
fc_out = self.softmax(self.fc(gru_out2))
final_out = self.out(fc_out)
        return final_out
I am sure that my dataset is balanced after the pre-processing step, but my model always predicts the same output. Precision and F-score change for each input; however, this problem makes my recall score 1.0, since the output is always the same whatever the input is.
I would appreciate any help.
It took some time to build a network from your requirements, but I have provided a few samples for creating a custom layer or model. You start from an embedding layer, and randomly varying data creates a different input every time. GRU and LSTM learning layers may provide good results when they have:
- Matching input and target layers and parameters.
- Learning scope that can differentiate the input; gated recurrence and LSTM are specifically useful when patterns in the data are significant, such as pictures or continuous data.
- Linear and Sigmoid provide contrasting differentiation, and softmax is sometimes required when comparing based on distribution values; it is expected to create contrasting output when applied to the weighted values.
- A loss function matched to the output dimension/expectation.
[ Sample ]:
class create_emb_layer( tf.keras.layers.Embedding ):
    def __init__( self, weights_matrix, bidirectional=True ):
        self.num_embeddings = weights_matrix[0]
        self.embeddings_size = weights_matrix[1]
        self.bidirectional = bidirectional
        # Embedding takes (input_dim=vocabulary size, output_dim=embedding size)
        super(create_emb_layer, self).__init__( self.num_embeddings, self.embeddings_size )

    def build(self, input_shape):
        self.kernel = self.add_weight("kernel",
            shape=[int(input_shape[-1]), self.input_dim])

    def call(self, inputs):
        return tf.matmul(inputs, self.kernel)
[ My model ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
tf.keras.layers.InputLayer(input_shape=( 32, 32, 4 )),
tf.keras.layers.Normalization(mean=3., variance=2.),
tf.keras.layers.Normalization(mean=4., variance=6.),
tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
tf.keras.layers.MaxPooling2D((2, 2)),
tf.keras.layers.Dense(128, activation='relu'),
tf.keras.layers.Reshape((128, 225)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True, return_state=False)),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96)),
tf.keras.layers.Flatten(),
tf.keras.layers.Dense(192, activation='relu'),
tf.keras.layers.Dense(10),
])
This is my CNN class:
class CNN(nn.Module):
def __init__(
self,
vocab_size,
emb_dim,
out_channels,
kernel_sizes,
dropout,
):
super().__init__()
self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[0], emb_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[1], emb_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[2], emb_dim))
self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
self.dropout = nn.Dropout(dropout)
def forward(self, text):
embedded = self.embedding(text)
print('embedded', embedded.shape)
embedded = embedded.unsqueeze(1) # may be reshape here
print('embedded', embedded.shape)
conved_0 = F.relu(self.conv_0(embedded)).squeeze(3) # may be reshape here
print('conved_0', conved_0.shape)
conved_1 = F.relu(self.conv_1(embedded)).squeeze(3) # may be reshape here
print('conved_1', conved_1.shape)
conved_2 = F.relu(self.conv_2(embedded)).squeeze(3) # may be reshape here
print('conved_2', conved_2.shape)
pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
print('pooled_0', pooled_0.shape)
pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
print('pooled_1', pooled_1.shape)
pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
print('pooled_2', pooled_2.shape)
cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
print('cat', cat.shape)
return self.fc(cat)
Variables:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels = 64
dropout = 0.2
dim = 300
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
kernel_sizes=kernel_sizes, dropout=dropout)
And training:
import numpy as np
min_loss = np.inf
cur_patience = 0
for epoch in range(1, max_epochs + 1):
train_loss = 0.0
model.train()
pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, batch in pbar:
#YOUR CODE GOES HERE
opt.zero_grad()
input = batch.text[0].to(device)
output = model(input)
train_loss = loss_func(output, batch.label)
train_loss.backward()
opt.step()
train_loss /= len(train_iter)
val_loss = 0.0
model.eval()
pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
pbar.set_description(f"Epoch {epoch}")
for it, batch in pbar:
# YOUR CODE GOES HERE
input = batch.text[0].to(device)
output = model(input)
        val_loss = loss_func(output, batch.label)
val_loss /= len(valid_iter)
if val_loss < min_loss:
min_loss = val_loss
best_model = model.state_dict()
else:
cur_patience += 1
if cur_patience == patience:
cur_patience = 0
break
print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))
model.load_state_dict(best_model)
I get this error:
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 1, 3, 300], but got 3-dimensional input of size [894, 1, 300] instead
in this line:
---> 32 conved_0 = F.relu(self.conv_0(embedded)).squeeze(3)
I've tried using Conv1d, but still had problems with dimensions. Could somebody please explain what I should fix here for the network to train?
EDIT:
This is my class but with Conv1d:
class CNN(nn.Module):
def __init__(
self,
vocab_size,
emb_dim,
out_channels,
kernel_sizes,
dropout,
):
super().__init__()
self.embedding = nn.Embedding(vocab_size, emb_dim)
self.conv_0 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[0])
self.conv_1 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[1])
self.conv_2 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[2])
self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
self.dropout = nn.Dropout(dropout)
def forward(self, text):
embedded = self.embedding(text)
print('embedded', embedded.shape)
embedded = embedded.unsqueeze(1) # may be reshape here
print('embedded', embedded.shape)
conved_0 = F.relu(self.conv_0(embedded)) # may be reshape here
print('conved_0', conved_0.shape)
conved_1 = F.relu(self.conv_1(embedded)) # may be reshape here
print('conved_1', conved_1.shape)
conved_2 = F.relu(self.conv_2(embedded)) # may be reshape here
print('conved_2', conved_2.shape)
pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
print('pooled_0', pooled_0.shape)
pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
print('pooled_1', pooled_1.shape)
pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
print('pooled_2', pooled_2.shape)
cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
print('cat', cat.shape)
return self.fc(cat)
Dimensions output:
embedded torch.Size([1115, 300])
embedded torch.Size([1115, 1, 300])
conved_0 torch.Size([1115, 64, 298])
conved_1 torch.Size([1115, 64, 297])
conved_2 torch.Size([1115, 64, 296])
pooled_0 torch.Size([1115, 64])
pooled_1 torch.Size([1115, 64])
pooled_2 torch.Size([1115, 64])
cat torch.Size([1115, 192])
Error:
ValueError: Target size (torch.Size([128])) must be the same as input size (torch.Size([1115, 1]))
What I was missing is that I had the batch_first parameter set to True, which swapped batch_size and seq_len. Once I set it to False, everything worked perfectly.
Right now I am having trouble getting my network to understand my data, because I have created an error with the embedding layer: it is expecting a Long tensor, but I am passing it a torch.cuda.IntTensor.
My Model
class LSTMClassification(torch.nn.Module):
def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
super().__init__()
# params: "n_" means dimension
self.n_vocab = n_vocab # number of unique words in vocabulary
self.n_layers = n_layers # number of LSTM layers
self.n_hidden = n_hidden # number of hidden nodes in LSTM
self.embedding = torch.nn.Embedding(n_vocab, n_embed)
self.lstm = torch.nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
self.dropout = torch.nn.Dropout(drop_p)
self.fc = torch.nn.Linear(n_hidden, n_output)
self.sigmoid = torch.nn.Sigmoid()
def forward (self, input_words):
# INPUT : (batch_size, seq_length)
embedded_words = self.embedding(input_words) # (batch_size, seq_length, n_embed)
lstm_out, h = self.lstm(embedded_words) # (batch_size, seq_length, n_hidden)
lstm_out = self.dropout(lstm_out)
lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
fc_out = self.fc(lstm_out) # (batch_size*seq_length, n_output)
sigmoid_out = self.sigmoid(fc_out) # (batch_size*seq_length, n_output)
sigmoid_out = sigmoid_out.view(batch_size, -1) # (batch_size, seq_length*n_output)
# extract the output of ONLY the LAST output of the LAST element of the sequence
sigmoid_last = sigmoid_out[:, -1] # (batch_size, 1)
return sigmoid_last, h
def init_hidden (self, batch_size): # initialize hidden weights (h,c) to 0
device = "cuda" if torch.cuda.is_available() else "cpu"
weights = next(self.parameters()).data
h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
return h
Model Initialization & Training
num_epochs_first = 1
for epoch in range(n_epochs):
h = model.init_hidden(batch_size)
for i, data in enumerate(train_loader, 0):
step += 1
inputs, labels = data[0].to(device), data[1].to(device)
# making requires_grad = False for the latest set of h
h = tuple([each.data for each in h])
model.zero_grad()
output, h = model(inputs)
#loss = criterion(output, labels)
#loss.backward()
#optimizer.step()
My Error
RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)
The error occurs on the line:
output, h = model(inputs)
I made some code that modifies the seq2seq tutorial script provided by PyTorch. Here's the model:
class Seq2Seq(nn.Module):
def __init__(self, encoder, batch_size, vocab_size, input_size, output_size, hidden_dim, embedding_dim, n_layers=2, dropout_p=0.5):
super(Seq2Seq, self).__init__()
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.input_length = input_size
self.output_length = output_size
self.vocab_size = vocab_size
self.encoder = encoder
self.dropout = nn.Dropout(dropout_p)
self.selu = nn.SELU()
self.decoder_embeddings = nn.Embedding(vocab_size, hidden_dim)
self.decoder_gru = nn.GRU(hidden_dim, hidden_dim)
self.out = nn.Linear(hidden_dim, vocab_size)
self.softmax = nn.LogSoftmax()
def decode(self, SOS_token, encoder_hidden, target_output, teacher_forcing_ratio=0.8):
decoder_output_full = autograd.Variable(torch.zeros(self.output_length, self.batch_size, self.vocab_size))
decoder_output_full = decoder_output_full.cuda() if use_cuda else decoder_output_full
target = target_output.permute(1,0)
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
for idx in range(self.output_length):
if idx == 0:
decoder_input = SOS_token
decoder_hidden = encoder_hidden.unsqueeze(0)
output = self.decoder_embeddings(decoder_input).view(1, self.batch_size, -1)
output = self.dropout(output)
output = self.selu(output)
if use_teacher_forcing:
decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
temp = 1
out = self.out(decoder_output[0])
out = out + sample_gumbel(out.shape)
decoder_output = F.softmax(out / temp, dim=1)
# decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
decoder_output_full[idx, :, :] = decoder_output
decoder_input = target[idx-1] # Teacher forcing
else:
decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
temp = 1
out = self.out(decoder_output[0])
out = out + sample_gumbel(out.shape)
decoder_output = F.softmax(out / temp, dim=1)
# decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
topv, topi = decoder_output.data.topk(1)
# print topi
ni = topi
# decoder_input_v = autograd.Variable(torch.LongTensor([[ni]]))
decoder_input = autograd.Variable(ni)
# decoder_input = decoder_input.cuda() if use_cuda else decoder_input
# print decoder_input
decoder_output_full[idx, :, :] = decoder_output
decoder_output_full = decoder_output_full.permute(1,0,2)
# gen_output = self.softmax(self.out(decoder_output_full))
return decoder_output_full
def forward(self, input, target_output, teacher_forcing_ratio=0.8):
encoder_feat, _ = self.encoder(input)
SOS_token = np.zeros((self.batch_size,1), dtype=np.int32)
SOS_token = torch.LongTensor(SOS_token.tolist())
SOS_token = autograd.Variable(SOS_token)
if use_cuda:
SOS_token = SOS_token.cuda(gpu)
gen_output = self.decode(SOS_token, encoder_feat, target_output, teacher_forcing_ratio)
return gen_output
def initHidden(self):
result = autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
if use_cuda:
return result.cuda()
else:
return result
The way I calculate the NLL loss is by creating one whole output sequence first and comparing it with the target output. Here's the loss function:
class batchNLLLoss(nn.Module):
def __init__(self):
super(batchNLLLoss, self).__init__()
def forward(self, synt, target, claim_length=20):
loss_fn = nn.NLLLoss()
loss = 0
for i in range(synt.shape[0]):
for j in range(claim_length):
loss += loss_fn(synt[i][j].unsqueeze(0), target[i][j])
return loss
The current problem is that the loss value is really small and it seems like the network learns nothing (the output is the same word repeated again and again). Any thoughts about this? Thanks in advance!