I'm working on the signal compression and reconstruction with VAE. I've trained 1600 fragments but the values of 1600 reconstructed signals are very similar. Moreover, results from same batch are almost consistent. As using the VAE, loss function of the model contains binary cross entropy (BCE) and the output of the train model should be located between 0 to 1 (The input data also normalized to 0~1).
VAE model(LSTM) :
class LSTM_VAE(nn.Module):
def __init__(self,
input_size=3000,
hidden=[1024, 512, 256, 128, 64],
latent_size=64,
num_layers=8,
bidirectional=True):
super().__init__()
self.input_size = input_size
self.hidden = hidden
self.latent_size = latent_size
self.num_layers = num_layers
self.bidirectional = bidirectional
self.actv = nn.LeakyReLU()
self.encode = nn.LSTM(input_size=self.input_size,
hidden_size=self.hidden[0],
num_layers=self.num_layers,
batch_first=True,
bidirectional=True)
self.bn_encode = nn.BatchNorm1d(1)
self.decode = nn.LSTM(input_size=self.latent_size,
hidden_size=self.hidden[2],
num_layers=self.num_layers,
batch_first=True,
bidirectional=True)
self.bn_decode = nn.BatchNorm1d(1)
self.fc1 = nn.Linear(self.hidden[0]*2, self.hidden[1])
self.fc2 = nn.Linear(self.hidden[1], self.hidden[2])
self.fc31 = nn.Linear(self.hidden[2], self.latent_size)
self.fc32 = nn.Linear(self.hidden[2], self.latent_size)
self.bn1 = nn.BatchNorm1d(1)
self.bn2 = nn.BatchNorm1d(1)
self.bn3 = nn.BatchNorm1d(1)
self.fc4 = nn.Linear(self.hidden[2]*2, self.hidden[1])
self.fc5 = nn.Linear(self.hidden[1], self.hidden[0])
self.fc6 = nn.Linear(self.hidden[0], self.input_size)
self.bn4 = nn.BatchNorm1d(1)
self.bn5 = nn.BatchNorm1d(1)
self.bn6 = nn.BatchNorm1d(1)
def encoder(self, x):
x = torch.unsqueeze(x, 1)
x, _ = self.encode(x)
x = self.actv(x)
x = self.fc1(x)
x = self.actv(x)
x = self.fc2(x)
x = self.actv(x)
mu = self.fc31(x)
log_var = self.fc32(x)
return mu, log_var
def decoder(self, z):
z, _ = self.decode(z)
z = self.bn_decode(z)
z = self.actv(z)
z = self.fc4(z)
z = self.bn4(z)
z = self.fc5(z)
z = self.bn5(z)
z = self.fc6(z)
z = self.bn6(z)
z = torch.sigmoid(z)
return torch.squeeze(z)
def sampling(self, mu, log_var):
std = torch.exp(0.5 * log_var)
eps = torch.randn_like(std)
return mu + eps * std
def forward(self, x):
mu, log_var = self.encoder(x.view(-1, self.input_size))
z = self.sampling(mu, log_var)
z = self.decoder(z)
return z, mu, log_var
Loss function and Train code :
def lossF(recon_x, x, mu, logvar, input_size):
BCE = F.binary_cross_entropy(recon_x, x.view(-1, input_size), reduction='sum')
KLD = -0.5 * torch.sum(1 + logvar - mu.pow(2) - logvar.exp())
return BCE + KLD
optim = torch.optim.Adam(model.parameters(), lr=opt.lr)
for epoch in range(opt.epoch):
for batch_idx, data in enumerate(train_set):
data = data.to(device)
optim.zero_grad()
recon_x, mu, logvar = model(data)
loss = lossF(recon_x, data, mu, logvar, opt.input_size)
loss.backward()
train_loss += loss.item()
optim.step()
I built the code by refer the example codes of others and only changed very few parameters. I rebuilt the code, change the dataset, update parameters but nothing worked. If you have any suggestion to solve this problem, PLEASE let me know.
I've find out the reason of the issue. It turns out that the decoder model derives output value in the range of 0.4 to 0.6 to stabilize the BCE loss. BCE loss can't be 0 even if the prediction is correct to answer. Also the loss value is non-linear to the range of the output. The easiest way to lower the loss is give 0.5 for the output, and my model did.
To avoid this error, I standardize my data and added some outlier data to avoid BCE issue. VAE is such complicated network for sure.
Related
I'm currently switching from tensorflow to pytorch and facing the warning UserWarning: Using a target size (torch.Size([400])) that is different to the input size (torch.Size([400, 1])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size
I came across that unsqueeze(1) on my target could help to resolve my problem, however, I do so obtain problems in regard of the multitarget which results from the shape my loss function (crossentropy) expects.
Here is a minimal example to my code:
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
X1 = torch.randn(400, 1, 9999)
X2 = torch.randn((400,1, 9999))
aux1 = torch.randn(400,1)
aux2 = torch.randn(400,1)
aux3 = torch.randn(400,1)
y1 = torch.rand(400,)
y2 = torch.rand(400,)
y3 = torch.rand(400,)
import torch
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader, TensorDataset
import torch.nn.functional as F
# In[18]:
class MultiTaskDataset:
def __init__(self,
amplitude,
phase,
weight,
temperature,
humidity,
shelf_life_clf,
shelf_life_pred,
thickness_pred
):
self.amplitude = amplitude
self.phase = phase
self.weight = weight
self.temperature = temperature
self.humidity = humidity
self.shelf_life_clf = shelf_life_clf
self.shelf_life_pred = shelf_life_pred
self.thickness_pred = thickness_pred
def __len__(self):
return self.amplitude.shape[0]
def __getitem__(self, idx):
#inputs
amplitude = self.amplitude[idx]
phase = self.phase[idx]
weight = self.weight[idx]
temperature = self.temperature[idx]
humidity = self.humidity[idx]
#outputs
shelf_life_clf = self.shelf_life_clf[idx]
shelf_life_reg = self.shelf_life_pred[idx]
thickness_pred = self.thickness_pred[idx]
return ([torch.tensor(amplitude, dtype=torch.float32),
torch.tensor(phase, dtype=torch.float32),
torch.tensor(weight, dtype=torch.float32),
torch.tensor(temperature, dtype=torch.float32),
torch.tensor(humidity, dtype=torch.float32)],
[torch.tensor(shelf_life_clf, dtype=torch.long),
torch.tensor(shelf_life_reg, dtype=torch.float32),
torch.tensor(thickness_pred, dtype=torch.float32)])
# In[19]:
# train loader
dataset = MultiTaskDataset(X1, X2, aux1, aux2, aux3,
y1,y2,y3)
train_loader = DataLoader(dataset, batch_size=512, shuffle=True, num_workers=0)
# test loader
# In[20]:
class MyModel(nn.Module):
def __init__(self):
super(MyModel, self).__init__()
self.features_amp = nn.Sequential(
nn.LazyConv1d(1, 3, 1),
)
self.features_phase = nn.Sequential(
nn.LazyConv1d(1, 3, 1),
)
self.backbone1 = nn.Sequential(
nn.LazyConv1d(64,3,1),
nn.LazyConv1d(64,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.backbone2 = nn.Sequential(
nn.Conv1d(64, 32,3,1),
nn.Conv1d(32, 32,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.backbone3 = nn.Sequential(
nn.Conv1d(32, 16,3,1),
nn.Conv1d(16, 16,3,1),
nn.AvgPool1d(3),
nn.Dropout(0.25),
)
self.classifier = nn.LazyLinear(2)
self.shelf_life_reg = nn.LazyLinear(1)
self.thickness_reg = nn.LazyLinear(1)
def forward(self, x1, x2, aux1, aux2, aux3):
x1 = self.features_amp(x1)
x2 = self.features_phase(x2)
x1 = x1.view(x1.size(0),-1)
x2 = x2.view(x2.size(0),-1)
x = torch.cat((x1, x2), dim=-1)
print(x.size())
x = x.unsqueeze(1)
print(x.size())
x = self.backbone1(x)
print(x.size())
x = torch.flatten(x, start_dim=1, end_dim=-1)
x = torch.cat([x, aux1, aux2, aux3], dim=-1)
shelf_life_clf = self.classifier(x)
shelf_life_reg = self.shelf_life_reg(x)
thickness_reg = self.thickness_reg(x)
return (shelf_life_clf,
shelf_life_reg,
thickness_reg)
model = MyModel()
optimizer = optim.Adam(model.parameters(), lr=0.003)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.MSELoss()
criterion3 = nn.MSELoss()
# In[21]:
def train(epoch):
model.train()
#exp_lr_scheduler.step()
arr_loss = []
#first_batch = next(iter(train_loader))
for batch_idx, (data, target) in enumerate(train_loader):
#amp, phase = data
clf, reg1, reg2 = target
#print(amp.shape, phase.shape)
#print(target[2].shape)
if torch.cuda.is_available():
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
data = [data[i].cuda() for i in range(len(data))]
target = [target[i].cuda() for i in range(len(target))]
model.to(device)
optimizer.zero_grad()
output1, output2, output3 = model(*data)
#losses
loss = criterion1(output1, target[0].long())
loss1 = criterion2(output2, target[1].float())
loss2 = criterion3(output3, target[2].float())
loss = loss + loss1 + loss2
#metrices
loss.backward()
optimizer.step()
print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
epoch, (batch_idx + 1) * len(data), len(train_loader.dataset),
100. * (batch_idx + 1) / len(train_loader), loss.data))
arr_loss.append(loss.data)
return arr_loss
def averaged_accuracy(outputs, targets):
assert len(outputs) != len(targets), "number of outputs should equal the number of targets"
accuracy = []
for i in range(len(outputs)):
_, predicted = torch.max(output1.data, 1)
total += target[0].size(0)
correct += (predicted == target[0]).sum()
acc = correct / total *100
accuracy.append(acc)
return torch.mean(accuracy)
# In[22]:
optimizer = optim.Adam(model.parameters(), lr=0.00003)
criterion1 = nn.CrossEntropyLoss()
criterion2 = nn.MSELoss()
criterion3 = nn.MSELoss()
n_epochs = 10
for epoch in range(n_epochs):
train(epoch)
Can anybody provide guidance to resolve this problem?
I'm new to Tensorflow and I'm practicing using custom models by trying to write a BidirectionalLSTM. However, when training, the command line returns a warning that the Gradients do not exist for, from what I can tell, every variable in the model. The code for the class is the following:
class LSTMCell(tf.keras.layers.Layer):
def __init__(self, hidden_layers):
super(LSTMCell, self).__init__()
self.hidden_layers = hidden_layers
self.f = tf.keras.layers.Dense(hidden_layers, activation='sigmoid')
self.i = tf.keras.layers.Dense(hidden_layers, activation='sigmoid')
self.o = tf.keras.layers.Dense(hidden_layers, activation='sigmoid')
self.Cprime = tf.keras.layers.Dense(hidden_layers, activation='tanh')
def call(self, x, h, c, mask):
mask = tf.reshape(mask, (BATCH_SIZE, 1))
input_x = np.concatenate((h, x), axis = 1)
# print(input_x.shape)
tf_mask = tf.matmul(mask, tf.ones([1, hidden_layers]))
tf_inverse_mask = tf.ones([BATCH_SIZE, 1]) - mask
tf_inverse_mask = tf.matmul(tf_inverse_mask, tf.ones([1, hidden_layers]))
c = (self.f(input_x)*c + self.i(input_x)*self.Cprime(input_x))*tf_mask + c*tf_inverse_mask
h = (self.o(input_x)*tf.math.tanh(c))*tf_mask + h*tf_inverse_mask
return h, c
class BidirectionalLSTM(tf.keras.Model):
def __init__(self, hidden_layers, outputLayer, encoder):
super(BidirectionalLSTM, self).__init__()
self.embedding = tf.keras.layers.Embedding(len(encoder.get_vocabulary()), hidden_layers, mask_zero=True)
self.lstm1 = LSTMCell(hidden_layers)
self.lstm2 = LSTMCell(hidden_layers)
self.outputLayer = outputLayer
self.outputDense = tf.keras.layers.Dense(outputLayer)
self.outputFinal = tf.keras.layers.Dense(1)
def call(self, input, length, mask):
input = self.embedding(input)
h1 = np.zeros([BATCH_SIZE, hidden_layers])
h2 = np.zeros([BATCH_SIZE, hidden_layers])
c1 = np.zeros([BATCH_SIZE, hidden_layers])
c2 = np.zeros([BATCH_SIZE, hidden_layers])
# mask.shape is [batch_size, length]
for i in range(length):
h1, c1 = self.lstm1(input[:, i, :], h1, c1, mask[:, i])
h2, c2 = self.lstm2(input[:, length - i-1,:], h2, c2, mask[:, length-i-1])
finalH = np.concatenate((h1, h2), axis = 1)
lastNode = self.outputFinal(self.outputDense(finalH))
return lastNode
bidLSTM = BidirectionalLSTM(hidden_layers, hidden_layers, encoder)
optimizer = tf.keras.optimizers.Adam()
# loss_function = tf.keras.losses.BinaryCrossentropy(from_logits=True)
for example, label in train_dataset:
encoded_example = encoder(example).numpy()
mask = create_mask(encoded_example)
with tf.GradientTape() as tape:
predictions = bidLSTM(encoded_example, encoded_example.shape[1], mask)
loss = loss_function(label, predictions)
gradients = tape.gradient(loss, bidLSTM.trainable_variables)
optimizer.apply_gradients(zip(gradients, bidLSTM.trainable_variables)
I suspect this might have something to do with the fact that the model is recurrent, and Tensorflow needs to get the gradients of loss with respect to the weights in each cell and then sum it up (bptt). Is this the cause of the error, or is there perhaps something larger I'm missing? If it is the cause, what would be the workaround?
In the line of code where it says "for i in range(length)", I set length equal to 1, so there's no "recurrent" aspect of the model, and I still get the same error, so I suspect I'm missing something else here.
Thanks
I'm a beginner with pytorch framework and I'm trying to add a multiheaded self attention on top of another architecture (BERT) (this is a simple question but I'm not familiar with PyTorch):
UPDATE 1
import math
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
self.d_model = d_model
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x, seq_len = 768, mask = None):
pos_emb = self.pe[:, :seq_len]
x = x * mask[:, :, None].float()
x = x + pos_emb
return x
The problem in how to add the transformer is in the following class:
class CamemBERTQA(nn.Module):
def __init__(self,bert_type, hidden_size, num_labels, num_inter_layers=1, heads = 12, do_lower_case = True):
super(CamemBERTQA, self).__init__()
self.do_lower_case = do_lower_case
self.bert_type = bert_type
self.hidden_size = hidden_size
self.num_labels = num_labels
self.num_inter_layers = num_inter_layers
self.camembert = CamembertModel.from_pretrained(self.bert_type)
# ---------------- Transformer ------------------------------------------
self.d_model = self.hidden_size # 768
dropout = 0.1
self.pos_emb = PositionalEncoding(d_model = self.d_model, dropout = dropout)
self.transformer_inter = nn.ModuleList(
[nn.TransformerEncoderLayer(d_model = self.d_model, nhead = heads, dim_feedforward = 2048, dropout = dropout)
for _ in range(num_inter_layers)])
# ---------------- Transformer ------------------------------------------
self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)
def forward(self, input_ids, mask=None):
bert_output = self.camembert(input_ids = input_ids) # input_ids is a tensor
# ---------------- Transformer ------------------------------------------
seq_len = self.hidden_size
x = self.pos_emb(x = bert_output, seq_len = seq_len, mask = None)
for i in range(self.num_inter_layers):
x = self.transformer_inter[i](i, x, x, 1 - mask) # all_tokens * max_tokens * dim
output = self.layer_norm(x)
# ---------------- Transformer ------------------------------------------
sequence_output = output[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,)
return x
Thank you so much.
So it seems that you're trying to add a Transformer network on top of the BERT component. It has to be mentioned that the self-attention network is only a part of the Transformer network, meaning that Transformers have other components besides self-attention as well. I would recommend using the Transformer (which has the self-attention component included) as an encoder that receives BERT vectors and transforms them into another representation (in another space).
Try this instead of self.attention = MultiHeadAttention():
self.transformer_inter = nn.ModuleList(
[TransformerEncoderLayer(d_model, heads, d_ff, dropout)
for _ in range(num_inter_layers)])
and then in forward(), call self.transformer_inter through a loop which will give you the representations produced by Transformer architecture. Like this:
def forward(self, bert_output, mask):
batch_size, seq_len = bert_output.size(0), bert_output.size(1)
# Transformer Encoder
pos_emb = self.pos_emb.pe[:, :seq_len]
x = bert_output * mask[:, :, None].float()
x = x + pos_emb
for i in range(self.num_inter_layers):
x = self.transformer_inter[i](i, x, x, 1 - mask) # all_tokens * max_tokens * dim
x = self.layer_norm(x) # Transformer also normalizes the outputs from each layer.
# x is the encoded vectors by Transformer encoder
return x
Then using a nn.Linear(.) layer, do another transformation to map the hidden_size to the number of labels for your task, which will give you the logits for each label. These all should be done within BERT class that you have posted.
Note that the TransformerEncoderLayer is a placeholder class that I used above. So you have to either implement it or use open source packages. As Transformers are quite well-known, I think you won't have trouble finding an implementation of it.
I'm just learn pytorch recently.
And I try to write a same model like the paper that I have read for practice.
This is the PDF of the paper I refer.
https://dl.acm.org/doi/pdf/10.1145/3178876.3186066?download=true
Here is the code what I wrote.
class Tem(torch.nn.Module):
def __init__(self, embedding_size, hidden_size):
super(Tem, self).__init()
self.embedding_size = embedding_size
self.hidden_size = hidden_size
self.leaf_size = 0
self.xgb_model = None
self.vec_embedding = None
self.multi_hot_Q = None
self.user_embedding = torch.nn.Linear(1, embedding_size)
self.item_embedding = torch.nn.Linear(1, embedding_size)
def pretrain(self, ui_attributes, labels):
print("Start XGBoost Training...")
self.xgb_model = XGBoost(ui_attributes, labels)
self.leaf_size = self.xgb_model.leaf_size
self.vec_embedding = Variable(torch.rand(self.embedding_size, self.leaf_size, requires_grad=True))
self.h = Variable(torch.rand(self.hidden_size, 1, requires_grad=True))
self.att_w = Variable(torch.rand(2 * self.embedding_size, self.hidden_size, requires_grad=True))
self.att_b = Variable(torch.rand(self.leaf_size, self.hidden_size, requires_grad=True))
self.r_1 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
self.r_2 = Variable(torch.rand(self.embedding_size, 1, requires_grad=True))
self.bias = Variable(torch.rand(1, 1, requires_grad=True))
def forward(self, ui_ids, ui_attributes):
if self.xgb_model == None:
raise Exception("Please run Tem.pretrain() to pre-train XGBoost model first.")
n_data = len(ui_ids)
att_input = torch.FloatTensor(ui_attributes)
self.multi_hot_Q = torch.FloatTensor(self.xgb_model.multi_hot(att_input)).permute(0,2,1)
vq = self.vec_embedding * self.multi_hot_Q
id_input = torch.FloatTensor(ui_ids)
user_embedded = self.user_embedding(id_input[:,0].reshape(n_data, 1))
item_embedded = self.item_embedding(id_input[:,1].reshape(n_data, 1))
ui = (user_embedded * item_embedded).reshape(n_data, self.embedding_size, 1)
ui_repeat = ui.repeat(1, 1, self.leaf_size)
cross = torch.cat([ui_repeat, vq], dim=1).permute(0,2,1)
re_cross = corss.reshape(cross.shape[0] * cross.shape[1], cross.shape[2])
attention = torch.mm(re_cross, self.att_w)
attention = F.leaky_relu(attention + self.att_b.repeat(n_data, 1))
attention = torch.mm(attention, self.h).reshape(n_data, self.leaf_size)
attention = F.softmax(attention).reshape(n_data, self.leaf_size, 1)
attention = self.vec_embedding.permute(1,0) * attention.repeat(1,1,20)
pool = torch.max(attention, 1).values
y_hat = self.bias.repeat(n_data, 1) + torch.mm(ui.reshape(n_data, self.embedding_size), self.r_1) + torch.mm(pool, self.r_2)
y_hat = F.softmax(torch.nn.Linear(1, 2)(y_hat))
return y_hat
My question is...It seems torch didn't know what tensor should be calculate gradient in backward propagation.
print(tem)
Tem(
(user_embedding): Linear(in_features=1, out_features=20, bias=True)
(item_embedding): Linear(in_features=1, out_features=20, bias=True)
)
I googled this problem, someone says those tensors should use torch.autograd.Variable(), but it didn't solve my problem. And someone says autograd directly supports tensors now. torch.autograd.Variable() is not necessary.
loss_func = torch.nn.CrossEntropyLoss()
optimizer = torch.Adagrad(tem.parameters(), lr=0.02)
for t in range(20):
prediction = tem(ids_train, att_train)
loss = loss_func(prediction, y_train)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if t % 5 == 0:
print("loss: ", loss)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
loss: tensor(0.8133, grad_fn=<NllLossBackward>)
Your problem is not related to Variable. As you said, it's not necessary anymore. To compute the gradients of a tensor declared in a model (that extends nn.Module) you need to include them into the model's parameters using the method nn.Parameter(). For example, to include self.h, you can do:
self.h = nn.Parameter(torch.zeros(10,10)
Now, when you call loss.backward() it'll collect the gradient for this variable (of course, loss must be dependent on self.h).
I am kind of a beginner in RNNs, so I coded a LSTM architecture using Pytorch, but I always run out of RAM whenever I am in the 3rd epoch. I am already using a DataLoader and I tried to detach the gradient from the input tensor but it doesn't solve the problem out.
This is my training loop
writer = SummaryWriter()
criterion = nn.CrossEntropyLoss(reduction='mean', ignore_index = 0)
optimizer = optim.Adam(lstm.parameters(), lr = 1e-5)
gradient_clip = clip_grad_norm_(lstm.parameters(), max_norm = 5)
num_epochs = 20
epoch_loss = -1.0
loss = - 1
t = trange(num_epochs, desc= "Epoch loss", leave=True)
for epoch in t:
trainLoader = iter(DataLoader(dataset, batch_size = batch_size))
tt = trange(len(trainLoader)-1, desc= "Batch loss", leave=True)
for i in tt:
text, embedding = next(trainLoader)
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
y = lstm.forward(embedding.transpose(1,0))
labels = text.transpose(0,1)[1:].transpose(0,1).flatten()
loss = criterion(y.reshape(-1, y.shape[-1]), labels)
tt.set_description("Batch loss : %.4f" % loss)
tt.refresh()
loss.backward(retain_graph=True)
optimizer.step()
epoch_loss += loss
epoch_loss = epoch_loss / (len(trainLoader) - 1)
# Saving model
save_date = datetime.now().strftime("%d%m%Y-%H:%M:%S")
PATH = './save/lstm_model_'+save_date
torch.save(lstm, PATH)
# Updating progression bar
t.set_description("Epoch loss : %.4f" % epoch_loss)
t.refresh()
# Plotting gradients histograms in Tensorboard
writer.add_scalar('Text_generation_Loss/train', epoch_loss, epoch)
for tag, parm in lstm.named_parameters():
with torch.no_grad():
writer.add_histogram(tag, parm.grad.data.cpu().numpy(), epoch)
writer.flush()
print('Finished Training')
writer.close()
And this is the LSTM class that I built:
class LSTM(nn.Module):
def __init__(self, in_size : int, hidden_size : int):
super().__init__()
self.in_size = in_size
self.hidden_size = hidden_size
self.W_fi = nn.Linear(in_size,hidden_size)
self.W_fh = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_ii = nn.Linear(in_size,hidden_size)
self.W_ih = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_Ci = nn.Linear(in_size,hidden_size)
self.W_Ch = nn.Linear(hidden_size,hidden_size, bias=False)
self.W_oi = nn.Linear(in_size,hidden_size)
self.W_oh = nn.Linear(hidden_size,hidden_size, bias=False)
self.sigmoid = nn.Sigmoid()
self.tanh = nn.Tanh()
def one_step(self, x, h, C):
f_t = self.sigmoid(self.W_fi(x) + self.W_fh(h))
i_t = self.sigmoid(self.W_ii(x) + self.W_ih(h))
g_t = self.tanh(self.W_Ci(x) + self.W_Ch(h))
C_t = torch.mul(f_t, C) + torch.mul(i_t, g_t)
o_t = self.sigmoid(self.W_oi(x) + self.W_oh(h))
h_t = torch.mul(o_t, self.tanh(C_t))
return h_t, C_t
def forward(self, X):
h_out = []
h = - torch.ones(X.shape[1], self.hidden_size)
C = - torch.ones(X.shape[1], self.hidden_size)
h_t, C_t = self.one_step(X[0], h, C)
h_out.append(h_t)
for i in range(1, X.shape[0] - 1):
h_t, C_t = self.one_step(X[i], h_t, C_t)
h_out.append(h_t)
h_out = torch.cat(h_out)
return h_out #h_out.reshape(-1,batch_size,num_embeddings)
I already searched for a similar case but I wasn't able to find a solution
I don't know if it may help somebody, but I solved the problem. I wasn't perhaps clear about the task, but the goal was to make text generation. The first thing I was doing is embed the sentences using torch.nn.embedding that was defined outside my LSTM. The solution was to include it as a layer of my network, since the embedding is not a pretrained one and should be learned too.