Right now I am having trouble getting my network to accept my data because of an error at the embedding layer: it expects a Long tensor, but I am passing it a torch.cuda.IntTensor.
My Model
class LSTMClassification(torch.nn.Module):
def __init__(self, n_vocab, n_embed, n_hidden, n_output, n_layers, drop_p = 0.5):
super().__init__()
# params: "n_" means dimension
self.n_vocab = n_vocab # number of unique words in vocabulary
self.n_layers = n_layers # number of LSTM layers
self.n_hidden = n_hidden # number of hidden nodes in LSTM
self.embedding = torch.nn.Embedding(n_vocab, n_embed)
self.lstm = torch.nn.LSTM(n_embed, n_hidden, n_layers, batch_first = True, dropout = drop_p)
self.dropout = torch.nn.Dropout(drop_p)
self.fc = torch.nn.Linear(n_hidden, n_output)
self.sigmoid = torch.nn.Sigmoid()
def forward (self, input_words):
# INPUT : (batch_size, seq_length)
embedded_words = self.embedding(input_words) # (batch_size, seq_length, n_embed)
lstm_out, h = self.lstm(embedded_words) # (batch_size, seq_length, n_hidden)
lstm_out = self.dropout(lstm_out)
lstm_out = lstm_out.contiguous().view(-1, self.n_hidden) # (batch_size*seq_length, n_hidden)
fc_out = self.fc(lstm_out) # (batch_size*seq_length, n_output)
sigmoid_out = self.sigmoid(fc_out) # (batch_size*seq_length, n_output)
sigmoid_out = sigmoid_out.view(input_words.size(0), -1) # (batch_size, seq_length*n_output)
# extract the output of ONLY the LAST output of the LAST element of the sequence
sigmoid_last = sigmoid_out[:, -1] # (batch_size, 1)
return sigmoid_last, h
def init_hidden (self, batch_size): # initialize hidden weights (h,c) to 0
device = "cuda" if torch.cuda.is_available() else "cpu"
weights = next(self.parameters()).data
h = (weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device),
weights.new(self.n_layers, batch_size, self.n_hidden).zero_().to(device))
return h
Model Initialization & Training
num_epochs_first = 1
for epoch in range(n_epochs):
h = model.init_hidden(batch_size)
for i, data in enumerate(train_loader, 0):
step += 1
inputs, labels = data[0].to(device), data[1].to(device)
# making requires_grad = False for the latest set of h
h = tuple([each.data for each in h])
model.zero_grad()
output, h = model(inputs)
#loss = criterion(output, labels)
#loss.backward()
#optimizer.step()
My Error
RuntimeError: Expected tensor for argument #1 'indices' to have scalar type Long; but got torch.cuda.IntTensor instead (while checking arguments for embedding)
The error occurs on the line:
output, h = model(inputs)
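For context, nn.Embedding performs an index lookup, and PyTorch (at least in the versions producing this error) requires the index tensor to have dtype torch.long (int64). Below is a minimal sketch with made-up sizes, not the data above, showing the cast:
import torch

embedding = torch.nn.Embedding(num_embeddings=100, embedding_dim=8)   # hypothetical vocab/embedding sizes
indices = torch.randint(0, 100, (4, 10), dtype=torch.int32)           # int32 indices, like a cuda.IntTensor
# embedding(indices) would raise the "Expected ... Long" RuntimeError on such versions
embedded = embedding(indices.long())                                  # casting the indices to int64 works
print(embedded.shape)                                                 # torch.Size([4, 10, 8])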
Related
I'm trying to run this code for the attention model in NLP.
class DecoderAttn(nn.Module):
def __init__(self, output_dim, emb_dim, hid_dim, n_layers, attn_dim):
super().__init__()
self.hid_dim = hid_dim
self.n_layers = n_layers
self.output_dim = output_dim
self.embedding = nn.Embedding(output_dim, emb_dim)
self.rnn = nn.GRU(emb_dim, hid_dim, n_layers, batch_first=True)
# Implement this
# BEGIN
# self.fc_out =
# add attention layer and linear transform layers
# attention layer's elements defined (key, query, value).
self.key = nn.Linear(hid_dim, attn_dim)
self.query = nn.Linear(hid_dim, attn_dim)
self.value = nn.Linear(hid_dim, attn_dim)
self.attention = nn.MultiheadAttention(attn_dim, 128)
self.fc_out = nn.Linear(hid_dim, output_dim)
# END
def forward(self, input, hidden, encoder_outputs):
#input: [batch size]
#hidden: [batch size, hid_dim]
#encoder_outputs: [batch size, src_len, hid_dim]
input = input.unsqueeze(1)
#input: [batch size, 1]
embedded = self.embedding(input)
#embedded: [batch size, 1, emb dim]
output, hidden = self.rnn(embedded, hidden)
# implement this
# BEGIN
# compute v* (attention output)
# compute prediction, using a fully connected layer that takes as input
# both attention output and output from GRU
attention_out, attention_out_w = self.attention(self.query(output), self.key(encoder_outputs), self.value(encoder_outputs))
concat_out = torch.cat((output, attention_out), 2)
prediction = self.fc_out(output.squeeze(1))
# END
#prediction : [batch size, output dim]
return prediction, hidden
and after this part trying to run this:
INPUT_DIM = len(CHARS.vocab)
OUTPUT_DIM = len(PHONEMES.vocab)
ENC_EMB_DIM = 500
DEC_EMB_DIM = 50
HID_DIM = 256
ATTN_DIM = 128
N_LAYERS = 4
ENC_DROPOUT = 0.1
DEC_DROPOUT = 0.1
enc = Encoder(INPUT_DIM, ENC_EMB_DIM, HID_DIM, N_LAYERS)
dec = DecoderAttn(OUTPUT_DIM, DEC_EMB_DIM, HID_DIM, N_LAYERS, ATTN_DIM)
model_attn = Seq2Seq(enc, dec, device).to(device)
N_EPOCHS = 10
CLIP = 1
train(model_attn, N_EPOCHS, CLIP)
but it gives the following error:
RuntimeError: shape '[32, 128, 1]' is invalid for input of size 61440
I am also adding the Google Colab link; it may be easier to investigate that way:
https://colab.research.google.com/drive/1MEemocW8nvebjq17CNnRzvUHdd_wQumP?usp=sharing
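For context on the shapes involved: nn.MultiheadAttention expects (seq_len, batch, embed_dim) inputs by default, while a GRU built with batch_first=True produces (batch, seq_len, hid_dim); since PyTorch 1.9 the attention layer can also be constructed with batch_first=True. A minimal sketch with toy shapes (attn_dim from the question, a smaller head count as an assumption):
import torch
import torch.nn as nn

attn_dim, n_heads, batch, src_len = 128, 8, 32, 15    # toy values; n_heads=8 is an assumption

# Default layout: (seq_len, batch, embed_dim)
mha = nn.MultiheadAttention(attn_dim, n_heads)
q = torch.randn(1, batch, attn_dim)                   # one decoder step
kv = torch.randn(src_len, batch, attn_dim)            # encoder outputs
out, weights = mha(q, kv, kv)
print(out.shape)                                      # torch.Size([1, 32, 128])

# batch-first layout, matching a batch_first GRU (PyTorch >= 1.9)
mha_bf = nn.MultiheadAttention(attn_dim, n_heads, batch_first=True)
q = torch.randn(batch, 1, attn_dim)
kv = torch.randn(batch, src_len, attn_dim)
out, weights = mha_bf(q, kv, kv)
print(out.shape)                                      # torch.Size([32, 1, 128])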
Hey, I have some problems with my LSTM. I have 6 features and I'm sending all my data (290,002 rows) through the LSTM at once (is this a good idea?).
My Input is of size:
Training Shape torch.Size([290002, 1, 6]) torch.Size([290002, 1])
Testing Shape torch.Size([74998, 1, 6]) torch.Size([74998, 1])
my model:
class LSTM(nn.Module):
def __init__(self, hidden_dim_LSTM, num_layers_LSTM, hidden1, drop):
super(LSTM, self).__init__()
self.hidden_dim_LSTM = hidden_dim_LSTM
self.num_layers_LSTM = num_layers_LSTM
self.hidden1=hidden1
self.drop=drop
final_output_dim = 1
#self.lstm = nn.LSTM(self.input_dim, self.hidden_dim, self.num_layers, batch_first=True)
self.lstm = nn.LSTM(6, hidden_size=hidden_dim_LSTM, num_layers=num_layers_LSTM, batch_first=True)
self.fc1 = nn.Linear(in_features=hidden_dim_LSTM, out_features=hidden1)
self.drop = nn.Dropout(drop)
self.fc2 = nn.Linear(in_features=hidden1, out_features=final_output_dim)
def forward(self, x):
h_0 = Variable(torch.zeros(self.num_layers_LSTM, x.size(0), self.hidden_dim_LSTM)).requires_grad_().to(device) #hidden state
c_0 = Variable(torch.zeros(self.num_layers_LSTM, x.size(0), self.hidden_dim_LSTM)).requires_grad_().to(device) #internal state
# Propagate input through LSTM
output, (hn, cn) = self.lstm(x, (h_0, c_0)) #lstm with input, hidden, and internal state
hn = hn.view(-1, self.hidden_dim_LSTM) #reshaping the data for Dense layer next
out = F.relu(hn)
out = self.fc1(out)
out = self.drop(out)
out = torch.relu(out)
#out = self.drop(out)
out = self.fc2(out)
return out
When I start training I get this error:
RuntimeError: For unbatched 2-D input, hx and cx should also be 2-D but got (3-D, 3-D) tensors
at the line: output, (hn, cn) = self.lstm(x, (h_0, c_0))
I'm grateful for any help!
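For reference on what the message means: nn.LSTM pairs a batched 3-D input with 3-D (h_0, c_0) tensors, and an unbatched 2-D input with 2-D ones. A minimal sketch with toy sizes (only the 6 input features are taken from the question; unbatched inputs need PyTorch >= 1.11, the same versions that emit this error):
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=6, hidden_size=16, num_layers=2, batch_first=True)

# Batched: 3-D input with 3-D hidden states
x = torch.randn(8, 1, 6)              # (batch, seq_len, features)
h0 = torch.zeros(2, 8, 16)            # (num_layers, batch, hidden_size)
c0 = torch.zeros(2, 8, 16)
out, (hn, cn) = lstm(x, (h0, c0))
print(out.shape)                      # torch.Size([8, 1, 16])

# Unbatched: a 2-D input must come with 2-D hidden states
x1 = torch.randn(1, 6)                # (seq_len, features), no batch dimension
h0_1 = torch.zeros(2, 16)
c0_1 = torch.zeros(2, 16)
out1, _ = lstm(x1, (h0_1, c0_1))
print(out1.shape)                     # torch.Size([1, 16])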
I am trying to build a wake-word model for my AI assistant. I have three audio clips, each 1 second long, and I created the data from them; as an example, I have the MFCC features extracted from those 3 audio clips.
def test():
#look at the values of the tensors after printing
wwd = WakeWordData(data_json = '../../data_json_files/test.json');
print(wwd[0])
arr =[]
arr.append(wwd[87]) #shape(1,19,40)
arr.append(wwd[0]) #shape(1,78,40)
arr.append(wwd[4]) #shape(1,28,40)
mfccs, labels = collate_fn(arr) #torch.Size([78, 3, 40])
model_params = {
"size_of_output": 1, "input_size": 40, "hidden_size": 1,
"num_layers": 2, "dropout": 0.1, "bidirectional": True,
"device":'cpu'
}
lst_w = LSTM_WakeWord(**model_params)
o = lst_w(mfccs)
#print(str(o))
Here is my collate_fn:
def collate_fn(data):
mfccs = []
labels = []
for d in data:
mfcc_tensor, label = d
#mfcc_tensor -> (channel, time, n_mfcc)
mfccs.append(mfcc_tensor.squeeze(0).transpose(0, 1))
labels.append(label)
mfccs = nn.utils.rnn.pad_sequence(mfccs, batch_first=True) # batch, feature(n_mfcc), seq_len(time)
print("collate_fn MFCCs->" + str(mfccs.shape)) #torch.Size([3, 78, 40])
mfccs = mfccs.transpose(0, 1) #torch.Size([78, 3, 40]) (feature(n_mfcc), batch, seq_len(time))
labels = torch.Tensor(labels)
return mfccs, labels
When I run this code with 3 MFCCs, after pad_sequence I get the data as (3, 78, 40), which I think is (batch, features(n_mfcc), seq_len(time)). Is that correct? Then I transpose it and get ([78, 3, 40]).
Then I try to give it to my LSTM. The LSTM takes its input as (seq_len, batch, feature). I can make the model work even though my (78, 3, 40) is (features(n_mfcc), batch, seq_len(time)). Should I set the shape exactly as the model wants, or is it fine as long as it's working?
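For background on the layout question: an nn.LSTM built without batch_first interprets whatever it receives as (seq_len, batch, input_size); it only checks that the last dimension equals input_size, so a tensor whose axes are really in another order will still run, but the recurrence will step over the wrong axis. A minimal sketch with the sizes from the question (40 MFCC features, batch of 3, 78 frames) and the model's hidden settings:
import torch
import torch.nn as nn

lstm = nn.LSTM(input_size=40, hidden_size=1, num_layers=2, bidirectional=True)

x = torch.randn(78, 3, 40)     # interpreted as (seq_len=78, batch=3, input_size=40)
out, (hn, cn) = lstm(x)
print(out.shape)               # torch.Size([78, 3, 2])  -> (seq_len, batch, hidden_size * 2)
print(hn.shape)                # torch.Size([4, 3, 1])   -> (num_layers * 2, batch, hidden_size)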
My model is below.
class LSTM_WakeWord(nn.Module):
def __init__(self,input_size,hidden_size,num_layers,dropout,bidirectional,size_of_output, device):
super(LSTM_WakeWord, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.device = device
self.bidirectional = bidirectional
self.directions = 2 if bidirectional else 1
self.lstm = nn.LSTM(input_size=input_size,
hidden_size = hidden_size,
num_layers = num_layers,
dropout=dropout,
bidirectional=bidirectional)
self.layernorm = nn.LayerNorm(input_size)
self.classifier = nn.Linear(hidden_size * self.directions, size_of_output)
def _init_hidden(self,batch_size):
n, d, hs = self.num_layers, self.directions, self.hidden_size
return (torch.zeros(n * d, batch_size, hs).to(self.device),
torch.zeros(n * d, batch_size, hs).to(self.device))
def forward(self,x):
# LayerNorm normalizes the values (the large e+xx magnitudes are gone)
x = self.layernorm(x)
# x shape -> feature(n_mfcc),batch,seq_len(time)
hidden = self._init_hidden(x.size()[1])
out, (hn, cn) = self.lstm(x, hidden)
print("hn "+str(hn.shape))
print("out " + str(out.shape))
out = self.classifier(hn)
return out
But then I get an error when I try to pass the hidden-state output to my Linear dense layer (classifier). It is a shape error: mat1 and mat2 shapes cannot be multiplied (12x1 and 2x1)
Why is this happening?
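For context on the numbers in the message: with num_layers=2, bidirectional=True and hidden_size=1, hn has shape (num_layers * 2, batch, hidden_size) = (4, 3, 1), which the Linear flattens to (12, 1) feature rows, while Linear(hidden_size * 2, 1) = Linear(2, 1) expects 2 input features, hence "12x1 and 2x1". A common pattern, sketched here under those sizes, is to concatenate the last layer's forward and backward hidden states before the classifier:
import torch
import torch.nn as nn

num_layers, hidden_size, batch = 2, 1, 3
lstm = nn.LSTM(input_size=40, hidden_size=hidden_size, num_layers=num_layers, bidirectional=True)
classifier = nn.Linear(hidden_size * 2, 1)

x = torch.randn(78, batch, 40)                     # (seq_len, batch, feature)
out, (hn, cn) = lstm(x)
print(hn.shape)                                    # torch.Size([4, 3, 1])

# hn[-2] is the last layer's forward state, hn[-1] the backward state
last_hidden = torch.cat((hn[-2], hn[-1]), dim=1)   # (batch, hidden_size * 2) -> (3, 2)
logits = classifier(last_hidden)
print(logits.shape)                                # torch.Size([3, 1])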
I want to implement the hierarchical attention mechanism for document classification presented by Yang et al., but I want to replace the LSTM with a Transformer.
I used Apoorv Nandan's "Text classification with Transformer" example:
https://keras.io/examples/nlp/text_classification_with_transformer/
I have implemented the Transformer hierarchically for classification: one Transformer produces a representation of each sentence and another one a representation of each document. The code is as follows:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
def __init__(self, embed_dim, num_heads=8):
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = layers.Dense(embed_dim)
self.key_dense = layers.Dense(embed_dim)
self.value_dense = layers.Dense(embed_dim)
self.combine_heads = layers.Dense(embed_dim)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
# x.shape = [batch_size, seq_len, embedding_dim]
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)
value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)
query = self.separate_heads(
query, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
key = self.separate_heads(
key, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
value = self.separate_heads(
value, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) # (batch_size, seq_len, num_heads, projection_dim)
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) # (batch_size, seq_len, embed_dim)
output = self.combine_heads(
concat_attention
) # (batch_size, seq_len, embed_dim)
return output
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
super(TransformerBlock, self).__init__(name=name)
self.att = MultiHeadSelfAttention(embed_dim, num_heads)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(dropout_rate)
self.dropout2 = layers.Dropout(dropout_rate)
def call(self, inputs, training):
attn_output = self.att(inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim, name=None):
super(TokenAndPositionEmbedding, self).__init__(name=name)
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
def compute_output_shape(self, input_shape):
# it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
return input_shape + (self.pos_emb.output_dim,)
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(int(L2_dense_units)),name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(class_number , activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
#==========================================================================
Everything is OK (for testing, you can copy and paste it into Google Colab). But when I compile and fit the model with the following code, it throws an error:
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs, ), minval=0, maxval=class_number , dtype=tf.dtypes.int32, seed=1)
y = to_categorical(y)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
X, y, batch_size=32, epochs=25,
)
The error is:
ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible
When I had a similar error, I found that a Flatten() layer helped; I had incompatible shapes of (None, x, y) and (None, y).
If you add a Flatten layer to the part that gives you the (None, 15, 5), it should output something like (None, 75).
The Flatten layer merely collapses dimensions: when I did this I got an output of (None, x*y), and TensorFlow was then able to match the two shapes.
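A minimal sketch of that shape change with toy tensors (the idea being a Flatten before the final Dense(class_number) in the document-level head; the sizes below are toy values, not taken from the model above):
import tensorflow as tf
from tensorflow.keras import layers

x = tf.random.uniform((2, 15, 100))            # (batch, sentences, features), like the sentence-level output
flat = layers.Flatten()(x)                     # (2, 1500)
preds = layers.Dense(5, activation="softmax")(flat)
print(flat.shape, preds.shape)                 # (2, 1500) (2, 5) - now compatible with (None, 5) targets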
I have a network which outputs a vector of length two. My targets are 1s or 0s, referring to two possible categories. What is the best way to compute the loss? Should I transform the targets, for example into a two-dimensional vector, or should I transform the output of the network, e.g. take the index of the maximum value as the output?
My network looks like:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
self.fc2 = nn.Linear(32, 1)
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
self.batch_normalisation2 = nn.BatchNorm1d(2)
self.activation = nn.Softmax(dim=2)
def forward(self, x):
h0, c0 = self.init_hidden(x)
out, (hn1, cn1) = self.lstm1(x, (h0, c0))
out = self.dropout(out,)
out = self.batch_normalisation1(out)
h1, c1 = self.init_hidden(out)
out, (hn2, cn2) = self.lstm2(out, (h1, c1))
out = self.dropout(out)
out = self.batch_normalisation1(out)
h2, c2 = self.init_hidden(out)
out, (hn3, cn3) = self.lstm2(out, (h2, c2))
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
out = self.fc2(out)
return out
def init_hidden(self, x):
h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
return [t for t in (h0, c0)]
def pred(self, x):
out = self(x)
return out > 0
An example of input to this network is:
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
[2.3597e-04, 1.1507e-02, 8.7719e-02, 6.1093e-02, 9.5556e-01],
[2.1474e-03, 5.3805e-03, 9.6491e-02, 2.2508e-01, 8.2222e-01]]])
which has shape torch.Size([1, 3, 5]). The target is currently 1 or 0. However, the network outputs a vector such as:
tensor([[0.5293, 0.4707]], grad_fn=<SoftmaxBackward>)
What would be the best way to set up the loss between this target and the network output?
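(For the two-output formulation mentioned above, the usual pairing would be nn.CrossEntropyLoss on raw two-unit logits with integer class targets; a toy sketch of that option, independent of the model above:)
import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()        # applies log-softmax internally, so feed raw logits
logits = torch.randn(4, 2)               # (batch, num_classes) - a two-unit output
targets = torch.tensor([1, 0, 1, 1])     # integer class indices of shape (batch,)
print(criterion(logits, targets).item())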
Update:
I can now train the model as suggested in the answers as:
model = LSTMClassifier(5, 128, 3, 1)
Epochs = 10
batch_size = 32
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)
for epoch in range(Epochs):
if epoch == 0:
accurate = 0
for X_instance, y_instance in zip(val_x, val_y):
if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
accurate += 1
print(f"Untrained accuracy test set: {accurate/len(val_x)}")
print(f"Epoch {epoch + 1}")
for n, (X, y) in enumerate(train_batches):
model.train()
optimizer.zero_grad()
y_pred = model(X)
loss = criterion(y_pred, y)
loss.backward()
optimizer.step()
model.eval()
accurate = 0
for X_instance, y_instance in zip(val_x, val_y):
if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
accurate += 1
print(f"Accuracy test set: {accurate/len(val_x)}")
You shouldn't use any activation at the end of your network; output only a single neuron instead of two and train it with BCEWithLogitsLoss.
Below is your neural network code with commentary and removal of unnecessary parts:
class LSTMClassifier(nn.Module):
def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
super().__init__()
self.hidden_dim = hidden_dim
self.layer_dim = layer_dim
self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
self.fc1 = nn.Linear(hidden_dim, 32)
# Output 1 neuron instead of two
self.fc2 = nn.Linear(32, 1)
# Model should not depend on batch size
# self.batch_size = None
# You are not using this variable
# self.hidden = None
self.dropout = nn.Dropout(p=0.2)
self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
self.batch_normalisation2 = nn.BatchNorm1d(2)
def forward(self, x):
# Hidden are initialized with 0 explicitly
# h0, c0 = self.init_hidden(x)
out, _ = self.lstm1(x)
# No need for initial values
# out, (hn1, cn1) = self.lstm1(x, (h0, c0))
out = self.dropout(out)
out = self.batch_normalisation1(out)
# Same for all other cells you re-init with zeros, it's implicit
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out, _ = self.lstm2(out)
out = self.dropout(out)
out = self.batch_normalisation1(out)
out = self.fc1(out[:, -1, :])
out = self.dropout(out)
# No need for activation
# out = F.softmax(self.fc2(out))
out = self.fc2(out)
return out
# Return True (1) or False (0)
def pred(self, x):
return self(x) > 0
I have also added a pred method which transforms logits into targets (e.g. to use with some metrics).
Basically, if the logit is lower than 0 the prediction is False, otherwise it is True. There is no need for an activation in this case.
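A minimal sketch (toy tensors, not the poster's data) of how the single-logit output pairs with BCEWithLogitsLoss and the pred-style threshold:
import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()

logits = torch.randn(4, 1)                          # raw outputs of the single output neuron
targets = torch.tensor([[1.], [0.], [1.], [0.]])    # float targets with the same shape as the logits

loss = criterion(logits, targets)                   # sigmoid + binary cross-entropy in one call
preds = logits > 0                                  # logit > 0  <=>  sigmoid(logit) > 0.5
print(loss.item(), preds.squeeze(1))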