I want to add a linear layer after the encoder in a VAE to get a smaller latent space for a group of data, but the loss returns NaN.
This is the simple linear layer I want to add between the encoder and decoder of the VAE.
class FC_en(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(2429 * 32, 64)
        self.BN1 = nn.BatchNorm1d(64)

    def forward(self, x):
        z_loc = self.BN1(self.fc1(x))
        return z_loc

class FC_de(nn.Module):
    def __init__(self):
        super().__init__()
        self.fc1 = nn.Linear(64, 2429 * 32)
        self.BN1 = nn.BatchNorm1d(2429 * 32)

    def forward(self, z):
        x = self.BN1(self.fc1(z))
        return x
The following code is the VAE model:
class VAE(nn.Module):
    def __init__(self, z_dim=16, hidden_dim=1000, use_cuda=True):
        super().__init__()
        # create the encoder and decoder networks
        self.encoder = Encoder(z_dim, hidden_dim)
        self.decoder = Decoder(z_dim, hidden_dim)
        self.fc3 = FC_en()
        self.fc4 = FC_de()
        if use_cuda:
            # calling cuda() here will put all the parameters of
            # the encoder and decoder networks into gpu memory
            self.cuda()
        self.use_cuda = use_cuda
        self.z_dim = z_dim

    # define the model p(x|z)p(z)
    def model(self, x):
        # register PyTorch module `decoder` with Pyro
        pyro.module("decoder", self.decoder)
        with pyro.plate("data", x.shape[0]):
            # setup hyperparameters for prior p(z)
            z_loc = x.new_zeros(torch.Size((x.shape[0], self.z_dim)))
            z_scale = x.new_ones(torch.Size((x.shape[0], self.z_dim)))
            # sample from prior (value will be sampled by guide when computing the ELBO)
            z = pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
            # decode the latent code z
            loc_img = self.decoder(z)
            loc_img = loc_img.reshape(-1, 200 * 200)
            pyro.sample("obs", dist.Bernoulli(loc_img).to_event(1), obs=x.reshape(-1, 200 * 200))

    # define the guide (i.e. variational distribution) q(z|x)
    def guide(self, x):
        # register PyTorch module `encoder` with Pyro
        pyro.module("encoder", self.encoder)
        with pyro.plate("data", x.shape[0]):
            # use the encoder to get the parameters used to define q(z|x)
            z_loc, z_scale = self.encoder(x)
            z_sum = torch.cat((z_loc, z_scale), 1)
            z_sum = z_sum.view(2, 2429 * 32)
            z_sum_z = self.fc3(z_sum)
            loc_img = self.fc4(z_sum_z)
            loc_img = loc_img.reshape(2429 * 2, 32)
            z_loc = loc_img[:, 0:16]
            z_scale = loc_img[:, 16:32]
            # sample the latent code z
            pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))
Does my idea have any problems? How can I avoid the NaN loss in the VAE model?
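One thing worth checking (a hedged guess, not a confirmed diagnosis): dist.Normal requires a strictly positive scale, but the slice of loc_img used as z_scale comes out of a linear layer plus batch norm and can be zero or negative, which produces NaNs in the ELBO. A minimal sketch of a common fix, assuming the rest of guide() stays as above, is to constrain the scale with softplus:

import torch.nn.functional as F

# Sketch only: force the scale to be strictly positive before sampling.
z_loc = loc_img[:, 0:16]
z_scale = F.softplus(loc_img[:, 16:32]) + 1e-6  # > 0, so Normal stays finite
pyro.sample("latent", dist.Normal(z_loc, z_scale).to_event(1))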
[Image: TensorBoard graph of the merged encoder/decoder model]
I ran into a problem when checking my model.
I built an encoder/decoder-structured model and drew its graph to check the tensors.
I defined the encoder and decoder separately, then merged them in a new module.
I chose this architecture for flexibility.
As you can see from the above image, 2 tensors flow backwards from the output to the encoder (specifically to its output layer).
These tensors disappear when either the encoder or the decoder is removed from the model.
I want to know the following:
Why did they appear and flow backwards?
How can I remove them?
If they cannot be removed, is there any effect on the model?
Here is the source code for reproduction.
I get the same issue even with a simplified model.
import os, torch
from torch.utils.tensorboard import SummaryWriter

class Encoder(torch.nn.Module):
    def __init__(self):
        super(Encoder, self).__init__()
        self.embed = torch.nn.Sequential(
            torch.nn.Linear(5, 10), torch.nn.ELU())
        self.network = torch.nn.Sequential(
            torch.nn.Linear(10, 20), torch.nn.ELU(), torch.nn.Dropout(0.2),
            torch.nn.Linear(20, 10), torch.nn.ELU(), torch.nn.Dropout(0.2),
        )
        self.output = torch.nn.Linear(10, 2)

    def forward(self, x):
        h = self.embed(x)
        h = self.network(h)
        h = self.output(h)
        return h

class Decoder(torch.nn.Module):
    def __init__(self):
        super(Decoder, self).__init__()
        self.network = torch.nn.Sequential(
            torch.nn.Linear(2, 5), torch.nn.ELU(), torch.nn.Dropout(0.2),
            torch.nn.Linear(5, 10), torch.nn.ELU(), torch.nn.Dropout(0.2),
        )
        self.output = torch.nn.Linear(10, 5)

    def forward(self, x):
        h = self.network(x)
        h = self.output(h)
        return h

class EncoderDecoder(torch.nn.Module):
    def __init__(self, encoder, decoder):
        super(EncoderDecoder, self).__init__()
        self.encoder = encoder
        self.decoder = decoder

    def forward(self, x):
        h = self.encoder(x)
        h = self.decoder(h)
        return h

root = './test'
os.makedirs(root, exist_ok=True)
writer = SummaryWriter(root)
model = EncoderDecoder(Encoder(), Decoder())
model.eval()
with torch.no_grad():
    writer.add_graph(model, torch.rand(20).view(-1, 5))
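Since the extra tensors reportedly disappear when either submodule is removed, one hedged way to narrow this down is to trace each submodule on its own and compare the graphs; if the back-flowing edges only appear in the merged trace, that points to the combined tracing rather than the submodules themselves (the subdirectory names below are arbitrary):

# Sketch: trace encoder and decoder separately for comparison.
for name, module, n_in in [('encoder', Encoder(), 5), ('decoder', Decoder(), 2)]:
    w = SummaryWriter(os.path.join(root, name))
    module.eval()
    with torch.no_grad():
        w.add_graph(module, torch.rand(4, n_in))
    w.close()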
I have a simple autoencoder architecture in PyTorch, which I train to do feature compression and reconstruction. My goal is to use the latent space of the autoencoder to reduce the initial dimensionality of my data and compress it in the test phase.
To perform this, I would need to pass my test data only to my encoder, not the whole autoencoder. Would you have any idea how to do this? Something like model = Autoencoder.encoder(), or something else?
My complete architecture is below:
class Autoencoder(nn.Module):
    def __init__(self, n_features):
        super(Autoencoder, self).__init__()
        self.n_features = n_features
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, 1),
            nn.ReLU(True))
        self.decoder = nn.Sequential(
            nn.Linear(1, self.n_features),
            nn.ReLU(True))

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
Based on your model definition, you can call forward on the encoder submodule directly:
class Autoencoder(nn.Module):
    def __init__(self, n_features):
        super(Autoencoder, self).__init__()
        self.n_features = n_features
        self.encoder = nn.Sequential(
            nn.Linear(self.n_features, 1),
            nn.ReLU(True))
        self.decoder = nn.Sequential(
            nn.Linear(1, self.n_features),
            nn.ReLU(True))

    def forward(self, x):
        x = self.encoder(x)
        x = self.decoder(x)
        return x
Keep in mind you first need to initialize the model:
>>> ae = Autoencoder(n_features=10)
>>> x = torch.empty(16, 10)
>>> ae.encoder(x).shape
torch.Size([16, 1])
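As a follow-up sketch for the compression use case: at test time you would typically switch to eval mode and disable gradient tracking before calling the encoder alone (test_x below is a stand-in for your test tensor of shape (batch, 10)):

# Sketch: compress test data using only the trained encoder.
ae.eval()
with torch.no_grad():
    compressed = ae.encoder(test_x)  # (batch, 10) -> (batch, 1)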
I'm a beginner with the PyTorch framework and I'm trying to add multiheaded self-attention on top of another architecture (BERT) (this is a simple question, but I'm not familiar with PyTorch):
UPDATE 1
import math

class PositionalEncoding(nn.Module):
    def __init__(self, d_model, dropout=0.1, max_len=5000):
        super(PositionalEncoding, self).__init__()
        self.dropout = nn.Dropout(p=dropout)
        self.d_model = d_model
        pe = torch.zeros(max_len, d_model)
        position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
        div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
        pe[:, 0::2] = torch.sin(position * div_term)
        pe[:, 1::2] = torch.cos(position * div_term)
        pe = pe.unsqueeze(0).transpose(0, 1)
        self.register_buffer('pe', pe)

    def forward(self, x, seq_len=768, mask=None):
        pos_emb = self.pe[:, :seq_len]
        x = x * mask[:, :, None].float()
        x = x + pos_emb
        return x
The problem of how to add the transformer lies in the following class:
class CamemBERTQA(nn.Module):
    def __init__(self, bert_type, hidden_size, num_labels, num_inter_layers=1, heads=12, do_lower_case=True):
        super(CamemBERTQA, self).__init__()
        self.do_lower_case = do_lower_case
        self.bert_type = bert_type
        self.hidden_size = hidden_size
        self.num_labels = num_labels
        self.num_inter_layers = num_inter_layers
        self.camembert = CamembertModel.from_pretrained(self.bert_type)

        # ---------------- Transformer ------------------------------------------
        self.d_model = self.hidden_size  # 768
        dropout = 0.1
        self.pos_emb = PositionalEncoding(d_model=self.d_model, dropout=dropout)
        self.transformer_inter = nn.ModuleList(
            [nn.TransformerEncoderLayer(d_model=self.d_model, nhead=heads, dim_feedforward=2048, dropout=dropout)
             for _ in range(num_inter_layers)])
        # ---------------- Transformer ------------------------------------------

        self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)

    def forward(self, input_ids, mask=None):
        bert_output = self.camembert(input_ids=input_ids)  # input_ids is a tensor

        # ---------------- Transformer ------------------------------------------
        seq_len = self.hidden_size
        x = self.pos_emb(x=bert_output, seq_len=seq_len, mask=None)
        for i in range(self.num_inter_layers):
            x = self.transformer_inter[i](i, x, x, 1 - mask)  # all_tokens * max_tokens * dim
        output = self.layer_norm(x)
        # ---------------- Transformer ------------------------------------------

        sequence_output = output[0]
        logits = self.qa_outputs(sequence_output)
        start_logits, end_logits = logits.split(1, dim=-1)
        start_logits = start_logits.squeeze(-1)
        end_logits = end_logits.squeeze(-1)
        outputs = (start_logits, end_logits,)
        return x
Thank you so much.
So it seems that you're trying to add a Transformer network on top of the BERT component. It has to be mentioned that self-attention is only one part of a Transformer network, meaning that Transformers have other components besides self-attention as well. I would recommend using the Transformer (which has the self-attention component included) as an encoder that receives BERT vectors and transforms them into another representation (in another space).
Try this instead of self.attention = MultiHeadAttention():
self.transformer_inter = nn.ModuleList(
    [TransformerEncoderLayer(d_model, heads, d_ff, dropout)
     for _ in range(num_inter_layers)])
and then in forward(), call self.transformer_inter in a loop, which will give you the representations produced by the Transformer architecture, like this:
def forward(self, bert_output, mask):
    batch_size, seq_len = bert_output.size(0), bert_output.size(1)

    # Transformer Encoder
    pos_emb = self.pos_emb.pe[:, :seq_len]
    x = bert_output * mask[:, :, None].float()
    x = x + pos_emb
    for i in range(self.num_inter_layers):
        x = self.transformer_inter[i](i, x, x, 1 - mask)  # all_tokens * max_tokens * dim
    x = self.layer_norm(x)  # Transformer also normalizes the outputs from each layer.
    # x is the encoded vectors by Transformer encoder
    return x
Then, using an nn.Linear(.) layer, do another transformation to map the hidden_size to the number of labels for your task, which will give you the logits for each label. This should all be done within the BERT class that you have posted.
Note that TransformerEncoderLayer is a placeholder class that I used above, so you have to either implement it or use an open-source package. As Transformers are quite well known, I think you won't have trouble finding an implementation.
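For instance, PyTorch's built-in nn.TransformerEncoderLayer could fill that role; note that its call signature differs from the placeholder above: it takes the input tensor, by default shaped (seq_len, batch, d_model), plus an optional src_key_padding_mask, rather than the positional loop arguments used earlier. A minimal sketch, assuming mask is a boolean tensor that is True for real tokens and False for padding:

import torch
import torch.nn as nn

d_model, heads, num_inter_layers = 768, 12, 1
transformer_inter = nn.ModuleList(
    [nn.TransformerEncoderLayer(d_model=d_model, nhead=heads,
                                dim_feedforward=2048, dropout=0.1)
     for _ in range(num_inter_layers)])

x = torch.randn(2, 16, d_model)              # (batch, seq_len, d_model)
mask = torch.ones(2, 16, dtype=torch.bool)   # True where tokens are real

h = x.transpose(0, 1)                        # -> (seq_len, batch, d_model)
for layer in transformer_inter:
    h = layer(h, src_key_padding_mask=~mask)  # True marks padding positions
h = h.transpose(0, 1)                        # back to (batch, seq_len, d_model)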
I have been trying to implement the paper: SeER: An Explainable Deep Learning MIDI-based Hybrid Song Recommender System.
So, what I have been doing is this:
Model Code:
class HybridFactorization(tf.keras.layers.Layer):
    # embedding_size is also the number of lstm units
    # num_users, num_movies = input_shape
    # required_users: (batch_size, embedding_size)
    # songs_output: (batch_size, embedding_size)
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridFactorization, self).__init__()
        self.embedding_size = embedding_size
        self.num_users = num_users
        self.num_tracks = num_tracks
        self.required_users = None
        self.U = self.add_weight("U",
                                 shape=[self.num_users, self.embedding_size],
                                 dtype=tf.float32,
                                 initializer=tf.initializers.GlorotUniform())
        self.lstm = tf.keras.layers.LSTM(self.embedding_size)

    def call(self, user_index, songs_batch):
        output_lstm = self.lstm(songs_batch)
        self.required_users = self.U.numpy()
        self.required_users = tf.convert_to_tensor(self.required_users[np.array(user_index)],
                                                   dtype=tf.float32)
        return tf.matmul(self.required_users, output_lstm, transpose_b=True)

class HybridRecommender(tf.keras.Model):
    def __init__(self, embedding_size, num_users, num_tracks):
        super(HybridRecommender, self).__init__()
        self.HybridFactorization = HybridFactorization(embedding_size,
                                                       num_users, num_tracks)

    def call(self, user_index, songs_batch):
        output = self.HybridFactorization(user_index, songs_batch)
        return output
Utility Functions and running the model:
def loss_fn(source, target):
    mse = tf.keras.losses.MeanSquaredError()
    return mse(source, target)

model = HybridRecommender(EMBEDDING_SIZE, num_users, num_tracks)
Xhat = model(user_index, songs_batch)
tf.keras.backend.clear_session()
optimizer = tf.keras.optimizers.Adam()
EPOCHS = 1

for epoch in range(EPOCHS):
    start = time.time()
    total_loss = 0
    for (batch, (input_batch, target_batch)) in enumerate(train_dataset):
        songs_batch = create_songs_batch(input_batch)
        user_index = input_batch[:, 0].numpy()
        X = create_pivot_batch(input_batch, target_batch)
        with tf.GradientTape() as tape:
            Xhat = model(user_index, songs_batch)
            batch_loss = loss_fn(X, Xhat)
        variables = model.trainable_variables
        gradients = tape.gradient(batch_loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        total_loss += batch_loss
Now, various functions like create_songs_batch(input_batch) and create_pivot_batch(input_batch, target_batch) just provide data in the required format.
My model runs, but I get the warning:
WARNING:tensorflow:Gradients do not exist for variables ['U:0'] when minimizing the loss.
Now, I can see why the variable U is not being updated, as there is no direct path to it.
I want to update the specific rows of U that are indexed by user_index on every batch call.
Is there a way to do it?
So, I was able to solve the problem. Rather than copying some rows of U and operating on the copy, I used a temporary matrix that is the one-hot encoded form of user_index and multiplied it with U to get the desired rows; this also removed the warning.
The part of the code that needs to be modified:
def call(self, user_index, songs_batch):
    # output_lstm: (batch_size, emb_sz)
    # batch_encoding: (batch_size, num_users)
    # required_users: (batch_size, emb_sz)
    output_lstm = self.lstm(songs_batch)
    user_idx = np.array(user_index)
    batch_encoding = np.zeros((user_idx.size, self.num_users))
    batch_encoding[np.arange(user_idx.size), user_idx] = 1
    batch_encoding = tf.convert_to_tensor(batch_encoding, dtype=tf.float32)
    self.required_users = tf.matmul(batch_encoding, self.U)
    return tf.matmul(self.required_users, output_lstm, transpose_b=True)
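As a side note, a hedged alternative sketch: tf.gather is also differentiable with respect to the gathered tensor, so it selects the same rows without materializing the dense one-hot matrix:

# Sketch: gradients flow back to self.U as sparse IndexedSlices.
self.required_users = tf.gather(self.U, user_index)  # (batch_size, emb_sz)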
Input: a set of ten vowels, a set of ten consonants, and an image dataset in which both a vowel and a consonant are written in every image.
Task: to identify the vowel and the consonant in a given image.
Approach: first apply CNN hidden layers to the image, then apply two parallel fully connected/dense layers, where one classifies the vowel in the image and the other classifies the consonant.
Problem: I am using a pretrained model like VGG or GoogLeNet. How do I modify that pretrained model to apply two parallel dense layers and return two outputs?
I have tried two different models, but my query is: can we modify a pretrained model for this task?
Right now my model has only one "fc" layer. I have modified the number of neurons in the final "fc" layer, like this:
final_in_features = googlenet.fc.in_features
googlenet.fc = nn.Linear(final_in_features, 10)
But I need to add one more fc layer so that both "fc" layers connect to the hidden layers in parallel.
Right now the model returns only one output.
outputs1 = googlenet(inputs)
The task is to return two outputs from both "fc" layers, so that it looks like this:
outputs1, outputs2 = googlenet(inputs)
Here is the source for a Linear layer in PyTorch:
class Linear(Module):
    r"""Applies a linear transformation to the incoming data: :math:`y = xA^T + b`

    Args:
        in_features: size of each input sample
        out_features: size of each output sample
        bias: If set to ``False``, the layer will not learn an additive bias.
            Default: ``True``

    Shape:
        - Input: :math:`(N, *, H_{in})` where :math:`*` means any number of
          additional dimensions and :math:`H_{in} = \text{in\_features}`
        - Output: :math:`(N, *, H_{out})` where all but the last dimension
          are the same shape as the input and :math:`H_{out} = \text{out\_features}`.

    Attributes:
        weight: the learnable weights of the module of shape
            :math:`(\text{out\_features}, \text{in\_features})`. The values are
            initialized from :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})`, where
            :math:`k = \frac{1}{\text{in\_features}}`
        bias: the learnable bias of the module of shape :math:`(\text{out\_features})`.
            If :attr:`bias` is ``True``, the values are initialized from
            :math:`\mathcal{U}(-\sqrt{k}, \sqrt{k})` where
            :math:`k = \frac{1}{\text{in\_features}}`

    Examples::

        >>> m = nn.Linear(20, 30)
        >>> input = torch.randn(128, 20)
        >>> output = m(input)
        >>> print(output.size())
        torch.Size([128, 30])
    """
    __constants__ = ['bias']

    def __init__(self, in_features, out_features, bias=True):
        super(Linear, self).__init__()
        self.in_features = in_features
        self.out_features = out_features
        self.weight = Parameter(torch.Tensor(out_features, in_features))
        if bias:
            self.bias = Parameter(torch.Tensor(out_features))
        else:
            self.register_parameter('bias', None)
        self.reset_parameters()

    def reset_parameters(self):
        init.kaiming_uniform_(self.weight, a=math.sqrt(5))
        if self.bias is not None:
            fan_in, _ = init._calculate_fan_in_and_fan_out(self.weight)
            bound = 1 / math.sqrt(fan_in)
            init.uniform_(self.bias, -bound, bound)

    @weak_script_method
    def forward(self, input):
        return F.linear(input, self.weight, self.bias)

    def extra_repr(self):
        return 'in_features={}, out_features={}, bias={}'.format(
            self.in_features, self.out_features, self.bias is not None
        )
You can create a class DoubleLinear like this:
class DoubleLinear(Module):
    def __init__(self, Linear1, Linear2):
        super(DoubleLinear, self).__init__()
        self.Linear1 = Linear1
        self.Linear2 = Linear2

    @weak_script_method
    def forward(self, input):
        return self.Linear1(input), self.Linear2(input)
Then, create your two Linear layers:
Linear_vow = nn.Linear(final_in_features, 10)
Linear_con = nn.Linear(final_in_features, 10)
final_layer = DoubleLinear(Linear_vow, Linear_con)
Now outputs1, outputs2 = final_layer(inputs) will work as expected.
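To wire this into the question's setup, a small sketch (assuming googlenet and inputs are as defined in the question):

# Sketch: swap GoogLeNet's single fc head for the double head.
final_in_features = googlenet.fc.in_features
googlenet.fc = DoubleLinear(nn.Linear(final_in_features, 10),
                            nn.Linear(final_in_features, 10))
outputs1, outputs2 = googlenet(inputs)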
class DoubleLinear(torch.nn.Module):
    def __init__(self, Linear1, Linear2):
        super(DoubleLinear, self).__init__()
        self.Linear1 = Linear1
        self.Linear2 = Linear2

    def forward(self, input):
        return self.Linear1(input), self.Linear2(input)

in_features = model._fc.in_features
Linear_first = nn.Linear(in_features, 10)
Linear_second = nn.Linear(in_features, 5)
model._fc = DoubleLinear(Linear_first, Linear_second)
I have used ResNet as my pretrained model from torchvision.models, and I am nullifying the fc layer using nn.Identity():
class MyModel(nn.Module):
    def __init__(self, num_classes1, num_classes2):
        super(MyModel, self).__init__()
        self.model_resnet = models.resnet18(pretrained=True)
        num_ftrs = self.model_resnet.fc.in_features
        self.model_resnet.fc = nn.Identity()
        self.fc1 = nn.Linear(num_ftrs, num_classes1)
        self.fc2 = nn.Linear(num_ftrs, num_classes2)

    def forward(self, x):
        x = self.model_resnet(x)
        out1 = self.fc1(x)
        out2 = self.fc2(x)
        return out1, out2
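A short usage sketch, assuming ten vowel and ten consonant classes and standard 224x224 inputs:

import torch
model = MyModel(num_classes1=10, num_classes2=10)
model.eval()
with torch.no_grad():
    out_vowel, out_consonant = model(torch.randn(4, 3, 224, 224))
print(out_vowel.shape, out_consonant.shape)  # torch.Size([4, 10]) torch.Size([4, 10])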