Following the tutorial from https://github.com/spro/practical-pytorch/blob/master/seq2seq-translation/seq2seq-translation.ipynb
There is a USE_CUDA flag that controls whether the variables and tensors are CPU (when False) or CUDA (when True) types.
Using the data from en-fr.tsv and converting the sentences to variables:
import unicodedata
import string
import re
import random
import time
import math
from gensim.corpora.dictionary import Dictionary
import torch
import torch.nn as nn
from torch.autograd import Variable
from torch import LongTensor, FloatTensor
from torch import optim
import torch.nn.functional as F
import numpy as np
MAX_LENGTH = 10
USE_CUDA = False
# Turn a Unicode string to plain ASCII, thanks to http://stackoverflow.com/a/518232/2809427
def unicode_to_ascii(s):
return ''.join(
c for c in unicodedata.normalize('NFD', s)
if unicodedata.category(c) != 'Mn'
)
# Lowercase, trim, and remove non-letter characters
def normalize_string(s):
s = unicode_to_ascii(s.lower().strip())
s = re.sub(r"([.!?])", r" \1", s)
s = re.sub(r"[^a-zA-Z.!?]+", r" ", s)
return s
SOS_IDX, SOS_TOKEN = 0, '<s>'
EOS_IDX, EOS_TOKEN = 1, '</s>'
UNK_IDX, UNK_TOKEN = 2, '<unk>'
PAD_IDX, PAD_TOKEN = 3, '<blank>'
lines = open('en-fr.tsv').read().strip().split('\n')
pairs = [[normalize_string(s).split() for s in l.split('\t')] for l in lines]
src_sents, trg_sents = zip(*pairs)
src_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
src_dict.add_documents(src_sents)
trg_dict = Dictionary([[SOS_TOKEN, EOS_TOKEN, UNK_TOKEN, PAD_TOKEN]])
trg_dict.add_documents(trg_sents)
def variablize_sentences(sentence, dictionary):
indices = [dictionary.token2id[tok] for tok in sentence] + [dictionary.token2id[EOS_TOKEN]]
var = Variable(LongTensor(indices).view(-1, 1))
return var.cuda() if USE_CUDA else var
input_variables = [variablize_sentences(sent, src_dict) for sent in src_sents]
output_variables = [variablize_sentences(sent, trg_dict) for sent in trg_sents]
And using an Encoder-Attn-Decoder network:
class EncoderRNN(nn.Module):
def __init__(self, input_size, hidden_size, n_layers=1):
super(EncoderRNN, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.n_layers = n_layers
self.embedding = nn.Embedding(input_size, hidden_size)
self.gru = nn.GRU(hidden_size, hidden_size, n_layers)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
def forward(self, word_inputs, hidden):
seq_len = len(word_inputs)
embedded = self.embedding(word_inputs).view(seq_len, 1, -1)
embedded = embedded.cuda() if USE_CUDA else embedded
output, hidden = self.gru(embedded, hidden)
output = output.cuda() if USE_CUDA else output
hidden = hidden.cuda() if USE_CUDA else hidden
return output, hidden
def init_hidden(self):
hidden = Variable(torch.zeros(self.n_layers, 1, self.hidden_size))
return hidden.cuda() if USE_CUDA else hidden
class Attn(nn.Module):
def __init__(self, method, hidden_size, max_length=MAX_LENGTH):
super(Attn, self).__init__()
self.method = method
self.hidden_size = hidden_size
if self.method == 'general':
self.attn = nn.Linear(self.hidden_size, hidden_size)
elif self.method == 'concat':
self.attn = nn.Linear(self.hidden_size * 2, hidden_size)
self.other = nn.Parameter(FloatTensor(1, hidden_size))
def forward(self, hidden, encoder_outputs):
seq_len = len(encoder_outputs)
# Create variable to store attention energies
attn_energies = Variable(torch.zeros(seq_len)) # B x 1 x S
attn_energies = attn_energies.cuda() if USE_CUDA else attn_energies
# Calculate energies for each encoder output
for i in range(seq_len):
attn_energies[i] = self.score(hidden, encoder_outputs[i])
# Normalize energies to weights in range 0 to 1, resize to 1 x 1 x seq_len
return F.softmax(attn_energies).unsqueeze(0).unsqueeze(0)
def score(self, hidden, encoder_output):
if self.method == 'dot':
energy = torch.dot(hidden.view(-1), encoder_output.view(-1))
elif self.method == 'general':
energy = self.attn(encoder_output)
energy = torch.dot(hidden.view(-1), energy.view(-1))
elif self.method == 'concat':
energy = self.attn(torch.cat((hidden, encoder_output), 1))
energy = torch.dot(self.other.view(-1), energy.view(-1))
return energy
class AttnDecoderRNN(nn.Module):
def __init__(self, attn_model, hidden_size, output_size, n_layers=1, dropout_p=0.1):
super(AttnDecoderRNN, self).__init__()
# Keep parameters for reference
self.attn_model = attn_model
self.hidden_size = hidden_size
self.output_size = output_size
self.n_layers = n_layers
self.dropout_p = dropout_p
# Define layers
self.embedding = nn.Embedding(output_size, hidden_size)
self.gru = nn.GRU(hidden_size * 2, hidden_size, n_layers, dropout=dropout_p)
self.out = nn.Linear(hidden_size * 2, output_size)
self.embedding = self.embedding.cuda() if USE_CUDA else self.embedding
self.gru = self.gru.cuda() if USE_CUDA else self.gru
self.out = self.out.cuda() if USE_CUDA else self.out
# Choose attention model
if attn_model != 'none':
self.attn = Attn(attn_model, hidden_size)
self.attn = self.attn.cuda() if USE_CUDA else self.attn
def forward(self, word_input, last_context, last_hidden, encoder_outputs):
# Note: we run this one step at a time
# Get the embedding of the current input word (last output word)
word_embedded = self.embedding(word_input).view(1, 1, -1) # S=1 x B x N
# Combine embedded input word and last context, run through RNN
rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
rnn_output, hidden = self.gru(rnn_input, last_hidden)
# Calculate attention from current RNN state and all encoder outputs; apply to encoder outputs
attn_weights = self.attn(rnn_output.squeeze(0), encoder_outputs)
context = attn_weights.bmm(encoder_outputs.transpose(0, 1)) # B x 1 x N
# Final output layer (next word prediction) using the RNN hidden state and context vector
rnn_output = rnn_output.squeeze(0) # S=1 x B x N -> B x N
context = context.squeeze(1) # B x S=1 x N -> B x N
output = F.log_softmax(self.out(torch.cat((rnn_output, context), 1)))
if USE_CUDA:
return output.cuda(), context.cuda(), hidden.cuda(), attn_weights.cuda()
else:
return output, context, hidden, attn_weights
And testing the network:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
The code works fine on CPU,
[out]:
EncoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(10, 10, num_layers=2)
)
AttnDecoderRNN (
(embedding): Embedding(10, 10)
(gru): GRU(20, 10, num_layers=2, dropout=0.1)
(out): Linear (20 -> 10)
(attn): Attn (
(attn): Linear (10 -> 10)
)
)
Variable containing:
-2.4378 -2.3556 -2.3391 -2.5070 -2.3439 -2.3415 -2.3976 -2.1832 -1.9976 -2.2213
[torch.FloatTensor of size 1x10]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.2325 0.0775 0.5415 0.4876 -0.5771 -0.0687 0.1832 -0.5285 0.2508
Columns 9 to 9
-0.1837
(1 ,.,.) =
Columns 0 to 8
-0.1389 -0.2605 -0.0518 0.3405 0.0774 0.1815 0.0297 -0.1304 -0.1015
Columns 9 to 9
0.2602
[torch.FloatTensor of size 2x1x10]
Variable containing:
(0 ,.,.) =
0.3334 0.3291 0.3374
[torch.FloatTensor of size 1x1x3]
but when changing the flag to USE_CUDA = True, it throws a TypeError when calling the decoder_test object:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-76-b3c660013934> in <module>()
12 decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
13
---> 14 decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
15 print(decoder_output)
16 print(decoder_hidden)
~/.local/lib/python3.5/site-packages/torch/nn/modules/module.py in __call__(self, *input, **kwargs)
222 for hook in self._forward_pre_hooks.values():
223 hook(self, input)
--> 224 result = self.forward(*input, **kwargs)
225 for hook in self._forward_hooks.values():
226 hook_result = hook(self, input, result)
<ipython-input-75-34ecfe9b3112> in forward(self, word_input, last_context, last_hidden, encoder_outputs)
32
33 # Combine embedded input word and last context, run through RNN
---> 34 rnn_input = torch.cat((word_embedded, last_context.unsqueeze(0)), 2)
35 rnn_output, hidden = self.gru(rnn_input, last_hidden)
36
~/.local/lib/python3.5/site-packages/torch/autograd/variable.py in cat(iterable, dim)
895 #staticmethod
896 def cat(iterable, dim=0):
--> 897 return Concat.apply(dim, *iterable)
898
899 #staticmethod
~/.local/lib/python3.5/site-packages/torch/autograd/_functions/tensor.py in forward(ctx, dim, *inputs)
315 ctx.dim = dim
316 ctx.input_sizes = [i.size(dim) for i in inputs]
--> 317 return torch.cat(inputs, dim)
318
319 #staticmethod
TypeError: cat received an invalid combination of arguments - got (tuple, int), but expected one of:
* (sequence[torch.cuda.FloatTensor] seq)
* (sequence[torch.cuda.FloatTensor] seq, int dim)
didn't match because some of the arguments have invalid types: (tuple, int)
The question is: why do the types not match on CUDA when the same code works on CPU, and how can this be resolved?
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
You can also try:
net = YourNetworkClass()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
net.to(device)
After that, you have to send the word_inputs, encoder_hidden and decoder_context to the GPU too:
word_inputs, encoder_hidden, decoder_context = word_inputs.to(device), encoder_hidden.to(device), decoder_context.to(device)
Look here: https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html#training-on-gpu
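Applied to this example, a minimal sketch of that pattern might look like this (assuming PyTorch 0.4+ where .to(device) is available and plain tensors can replace Variable):
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
encoder_test = EncoderRNN(10, 10, 2).to(device)
decoder_test = AttnDecoderRNN('general', 10, 10, 2).to(device)
word_inputs = torch.LongTensor([1, 2, 3]).to(device)          # input word indices
encoder_hidden = encoder_test.init_hidden().to(device)        # initial GRU hidden state
decoder_context = torch.zeros(1, decoder_test.hidden_size).to(device)
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, encoder_hidden, encoder_outputs)
This way every tensor the decoder concatenates lives on the same device, which is exactly what the cat error above is complaining about.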
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
Nope.
(Source: https://discuss.pytorch.org/t/porting-seq2seq-tutorial-from-spro-practical-pytorh-from-cpu-to-gpu/8604)
Specific to the example:
The input variables to the decoder_test object need to be .cuda() types. More specifically:
encoder_hidden = encoder_test.init_hidden()
---> encoder_hidden = encoder_test.init_hidden().cuda()
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size))
---> decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()
So the code to test the network should be:
encoder_test = EncoderRNN(10, 10, 2) # I, H , L
decoder_test = AttnDecoderRNN('general', 10, 10, 2) # A, H, O, L
encoder_hidden = encoder_test.init_hidden().cuda()
if USE_CUDA:
word_inputs = Variable(torch.LongTensor([1, 2, 3]).cuda())
else:
word_inputs = Variable(torch.LongTensor([1, 2, 3]))
encoder_outputs, encoder_hidden = encoder_test(word_inputs, encoder_hidden)
decoder_attns = torch.zeros(1, 3, 3)
decoder_hidden = encoder_hidden
decoder_context = Variable(torch.zeros(1, decoder_test.hidden_size)).cuda()
decoder_output, decoder_context, decoder_hidden, decoder_attn = decoder_test(word_inputs[0], decoder_context, decoder_hidden, encoder_outputs)
print(decoder_output)
print(decoder_hidden)
print(decoder_attn)
[out]:
Variable containing:
-2.1412 -2.4589 -2.4042 -2.1591 -2.5080 -2.0839 -2.5058 -2.3831 -2.4468 -2.0804
[torch.cuda.FloatTensor of size 1x10 (GPU 0)]
Variable containing:
(0 ,.,.) =
Columns 0 to 8
-0.0264 -0.0689 0.1049 0.0760 0.1017 -0.4585 -0.1273 0.0449 -0.3271
Columns 9 to 9
-0.0104
(1 ,.,.) =
Columns 0 to 8
-0.0308 -0.0690 -0.0258 -0.2759 0.1403 -0.0468 -0.0205 0.0126 -0.1729
Columns 9 to 9
0.0599
[torch.cuda.FloatTensor of size 2x1x10 (GPU 0)]
Variable containing:
(0 ,.,.) =
0.3328 0.3328 0.3344
[torch.cuda.FloatTensor of size 1x1x3 (GPU 0)]
Does PyTorch have a global flag to just change all types to CUDA types and not mess around with CPU/GPU types?
Yes. You can set the default tensor type to cuda with:
torch.set_default_tensor_type('torch.cuda.FloatTensor')
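A minimal sketch of what that default changes (it requires a CUDA-capable machine; note that explicitly constructed torch.LongTensor(...) index tensors are not affected and still need .cuda() or .to(device)):
import torch
torch.set_default_tensor_type('torch.cuda.FloatTensor')
x = torch.zeros(2, 3)
print(x.type())                      # torch.cuda.FloatTensor
idx = torch.LongTensor([1, 2, 3])    # explicitly typed: still a CPU torch.LongTensor
idx = idx.cuda()                     # so integer index tensors still have to be moved by hand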
Related
I am trying to build a wake-word model for my AI assistant. I have three 1-second-long audio clips. I created the data; as an example, I have the MFCC data extracted from 3 audio clips.
def test():
#look at the values of the tensors after printing
wwd = WakeWordData(data_json = '../../data_json_files/test.json');
print(wwd[0])
arr =[]
arr.append(wwd[87]) #shape(1,19,40)
arr.append(wwd[0]) #shape(1,78,40)
arr.append(wwd[4]) #shape(1,28,40)
mfccs, labels = collate_fn(arr) #torch.Size([78, 3, 40])
model_params = {
"size_of_output": 1, "input_size": 40, "hidden_size": 1,
"num_layers": 2, "dropout": 0.1, "bidirectional": True,
"device":'cpu'
}
lst_w = LSTM_WakeWord(**model_params)
o = lst_w(mfccs)
#print(str(o))
Here is my collate_fn below.
def collate_fn(data):
mfccs = []
labels = []
for d in data:
mfcc_tensor, label = d
#mfcc_tensor -> (channel, time, n_mfcc)
mfccs.append(mfcc_tensor.squeeze(0).transpose(0, 1))
labels.append(label)
mfccs = nn.utils.rnn.pad_sequence(mfccs, batch_first=True)  # batch, feature(n_mfcc), seq_len(time)
print("collate_fn MFCCs->" + str(mfccs.shape))  # torch.Size([3, 78, 40])
mfccs = mfccs.transpose(0, 1)  # torch.Size([78, 3, 40]) (feature(n_mfcc), batch, seq_len(time))
labels = torch.Tensor(labels)
return mfccs, labels
When I run this code with 3 MFCCs, after pad_sequence I get the data as (3, 78, 40), which I think is (batch, features(n_mfcc), seq_len(time)). Is that correct? Then I transpose it and get ([78, 3, 40]).
Then I try to give it to my LSTM. The LSTM takes its input as (seq_len, batch, feature). I can make the model work even though my (78, 3, 40) is (features(n_mfcc), batch, seq_len(time)). Should I set the shape exactly as the model expects, or is it fine as long as it's working?
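For reference, a minimal sketch of what pad_sequence does with toy tensors of the sizes mentioned above, assuming the per-clip tensors are shaped (time, n_mfcc):
import torch
import torch.nn as nn
a = torch.randn(19, 40)   # (time, n_mfcc)
b = torch.randn(78, 40)
c = torch.randn(28, 40)
padded = nn.utils.rnn.pad_sequence([a, b, c], batch_first=True)
print(padded.shape)                  # torch.Size([3, 78, 40]) -> (batch, time, n_mfcc)
print(padded.transpose(0, 1).shape)  # torch.Size([78, 3, 40]) -> (time, batch, n_mfcc), i.e. the (seq_len, batch, feature) layout a default nn.LSTM expects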
My model is below.
class LSTM_WakeWord(nn.Module):
def __init__(self,input_size,hidden_size,num_layers,dropout,bidirectional,size_of_output, device):
super(LSTM_WakeWord, self).__init__()
self.input_size = input_size
self.hidden_size = hidden_size
self.num_layers = num_layers
self.device = device
self.bidirectional = bidirectional
self.directions = 2 if bidirectional else 1
self.lstm = nn.LSTM(input_size=input_size,
hidden_size = hidden_size,
num_layers = num_layers,
dropout=dropout,
bidirectional=bidirectional)
self.layernorm = nn.LayerNorm(input_size)
self.classifier = nn.Linear(hidden_size * self.directions, size_of_output)
def _init_hidden(self,batch_size):
n, d, hs = self.num_layers, self.directions, self.hidden_size
return (torch.zeros(n * d, batch_size, hs).to(self.device),
torch.zeros(n * d, batch_size, hs).to(self.device))
def forward(self,x):
# the values with e+xxx are gone. so it normalizes the values
x = self.layernorm(x)
# x shape -> feature(n_mfcc),batch,seq_len(time)
hidden = self._init_hidden(x.size()[1])
out, (hn, cn) = self.lstm(x, hidden)
print("hn "+str(hn.shape))
print("out " + str(out.shape))
out = self.classifier(hn)
return out
But then I get an error when I try to feed the hidden state output to my Linear dense layer (classifier). It is a shape error: mat1 and mat2 shapes cannot be multiplied (12x1 and 2x1)
Why is this happening?
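For illustration, a minimal sketch that reproduces the mismatch with the sizes from the post (hidden_size=1, num_layers=2, bidirectional=True, a batch of 3), plus one common way of combining the two directions before the classifier:
import torch
import torch.nn as nn
lstm = nn.LSTM(input_size=40, hidden_size=1, num_layers=2, bidirectional=True)
classifier = nn.Linear(1 * 2, 1)           # expects hidden_size * directions = 2 input features
x = torch.randn(78, 3, 40)                 # (seq_len, batch, feature)
out, (hn, cn) = lstm(x)
print(hn.shape)                            # torch.Size([4, 3, 1]) = (num_layers * directions, batch, hidden_size)
# classifier(hn) fails: the last dimension of hn is 1 but the Linear expects 2,
# which is where "mat1 and mat2 shapes cannot be multiplied (12x1 and 2x1)" comes from.
last = torch.cat([hn[-2], hn[-1]], dim=1)  # last layer's forward/backward states -> (batch, 2)
print(classifier(last).shape)              # torch.Size([3, 1])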
I am new to PyTorch and I am trying to build a reinforcement learning system that uses OpenAI to try to predict whether or not a stock should be bought and at what time.
class NeuronalNetwork(nn.Module):
def __init__(self, stock_env: StockEnv):
super(NeuronalNetwork, self).__init__()
self.stock_env = stock_env
input_size = len(self.stock_env.normalized_dataframe.columns)
self.hidden_size = 128
self.num_layers = 4
self.kernel = 2
output_size = self.stock_env.action_space.n
self.lstm = nn.LSTM(input_size=input_size, hidden_size=self.hidden_size, num_layers=self.num_layers, batch_first=True)
self.output_layer = nn.Linear(self.hidden_size, output_size)
self.softmax = nn.LogSoftmax(dim=output_size)
self.tanh = nn.Tanh()
def forward(self, x, hidden=None):
# N x T x D
# N - the number of windows sizes
# T - the window size
# D - the number of indicators and OHLCV in total
if len(x.shape) > 2:
batch_size = x.shape[0]
else:
batch_size = 1
if hidden is None:
hidden = (
torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device),
torch.zeros(self.num_layers, batch_size, self.hidden_size).to(device),
)
D = len(self.stock_env.normalized_dataframe.columns)
T = self.stock_env.window_size
N = batch_size
x = x.view(N, T, D).type(torch.FloatTensor).to(device)
out, (ht, ct) = self.lstm(x, hidden)
out = self.tanh(out)
out = self.output_layer(out)
return out
My x in forward represents my data in the form [Number_of_batches x Window_size x Features].
For the moment my out will have the shape Number_of_batches x Window_size x Action, but what I want my model to learn is to predict the best action ONLY for the 250th element. So does anyone know what I can do in order to obtain an out with shape (batch_size x action), where the action is the one for the last element of window_size?
Example:
out.shape => (batch_size, window_size, features)
new_out = []
FOR b in all batch_size:
    new_out.append(out[b][250])  # keep only the action scores of the 250th element
And in the end, I will have an out of shape batch_size x action, where the action is only the action of the 250th element of the window_size.
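For illustration, a minimal self-contained sketch of that slicing (the sizes are made up; it only assumes out is batch-first, as produced by the model above):
import torch
out = torch.randn(8, 250, 3)   # (batch_size, window_size, n_actions), illustrative sizes
last_step = out[:, -1, :]      # the 250th (last) element; equivalently out[:, 249, :]
print(last_step.shape)         # torch.Size([8, 3]) -> (batch_size, n_actions)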
I'm not sure if my question makes sense to you; if it doesn't, just let me know and I will try to explain it differently.
I'm a beginner with the PyTorch framework and I'm trying to add multi-headed self-attention on top of another architecture (BERT) (this is a simple question but I'm not familiar with PyTorch):
UPDATE 1
import math
class PositionalEncoding(nn.Module):
def __init__(self, d_model, dropout=0.1, max_len=5000):
super(PositionalEncoding, self).__init__()
self.dropout = nn.Dropout(p=dropout)
self.d_model = d_model
pe = torch.zeros(max_len, d_model)
position = torch.arange(0, max_len, dtype=torch.float).unsqueeze(1)
div_term = torch.exp(torch.arange(0, d_model, 2).float() * (-math.log(10000.0) / d_model))
pe[:, 0::2] = torch.sin(position * div_term)
pe[:, 1::2] = torch.cos(position * div_term)
pe = pe.unsqueeze(0).transpose(0, 1)
self.register_buffer('pe', pe)
def forward(self, x, seq_len = 768, mask = None):
pos_emb = self.pe[:, :seq_len]
x = x * mask[:, :, None].float()
x = x + pos_emb
return x
The problem of how to add the Transformer is in the following class:
class CamemBERTQA(nn.Module):
def __init__(self,bert_type, hidden_size, num_labels, num_inter_layers=1, heads = 12, do_lower_case = True):
super(CamemBERTQA, self).__init__()
self.do_lower_case = do_lower_case
self.bert_type = bert_type
self.hidden_size = hidden_size
self.num_labels = num_labels
self.num_inter_layers = num_inter_layers
self.camembert = CamembertModel.from_pretrained(self.bert_type)
# ---------------- Transformer ------------------------------------------
self.d_model = self.hidden_size # 768
dropout = 0.1
self.pos_emb = PositionalEncoding(d_model = self.d_model, dropout = dropout)
self.transformer_inter = nn.ModuleList(
[nn.TransformerEncoderLayer(d_model = self.d_model, nhead = heads, dim_feedforward = 2048, dropout = dropout)
for _ in range(num_inter_layers)])
# ---------------- Transformer ------------------------------------------
self.qa_outputs = nn.Linear(self.hidden_size, self.num_labels)
def forward(self, input_ids, mask=None):
bert_output = self.camembert(input_ids = input_ids) # input_ids is a tensor
# ---------------- Transformer ------------------------------------------
seq_len = self.hidden_size
x = self.pos_emb(x = bert_output, seq_len = seq_len, mask = None)
for i in range(self.num_inter_layers):
x = self.transformer_inter[i](i, x, x, 1 - mask) # all_tokens * max_tokens * dim
output = self.layer_norm(x)
# ---------------- Transformer ------------------------------------------
sequence_output = output[0]
logits = self.qa_outputs(sequence_output)
start_logits, end_logits = logits.split(1, dim=-1)
start_logits = start_logits.squeeze(-1)
end_logits = end_logits.squeeze(-1)
outputs = (start_logits, end_logits,)
return x
Thank you so much.
So it seems that you're trying to add a Transformer network on top of the BERT component. Note that self-attention is only one part of a Transformer network; Transformers have other components besides self-attention as well. I would recommend using the Transformer (which includes the self-attention component) as an encoder that receives the BERT vectors and transforms them into another representation (in another space).
Try this instead of self.attention = MultiHeadAttention():
self.transformer_inter = nn.ModuleList(
[TransformerEncoderLayer(d_model, heads, d_ff, dropout)
for _ in range(num_inter_layers)])
and then, in forward(), call self.transformer_inter in a loop, which will give you the representations produced by the Transformer architecture. Like this:
def forward(self, bert_output, mask):
batch_size, seq_len = bert_output.size(0), bert_output.size(1)
# Transformer Encoder
pos_emb = self.pos_emb.pe[:, :seq_len]
x = bert_output * mask[:, :, None].float()
x = x + pos_emb
for i in range(self.num_inter_layers):
x = self.transformer_inter[i](i, x, x, 1 - mask) # all_tokens * max_tokens * dim
x = self.layer_norm(x) # Transformer also normalizes the outputs from each layer.
# x is the encoded vectors by Transformer encoder
return x
Then, using an nn.Linear(.) layer, do another transformation to map hidden_size to the number of labels for your task, which will give you the logits for each label. This should all be done within the BERT class that you have posted.
Note that the TransformerEncoderLayer used above is a placeholder class, so you have to either implement it yourself or use an open-source package. As Transformers are quite well known, I think you won't have trouble finding an implementation.
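For instance, a minimal sketch using PyTorch's built-in nn.TransformerEncoderLayer / nn.TransformerEncoder as that encoder (the batch and sequence sizes below are illustrative, not taken from the post):
import torch
import torch.nn as nn
d_model, heads, num_inter_layers, num_labels = 768, 12, 1, 2
encoder_layer = nn.TransformerEncoderLayer(d_model=d_model, nhead=heads, dim_feedforward=2048, dropout=0.1)
encoder = nn.TransformerEncoder(encoder_layer, num_layers=num_inter_layers)
qa_outputs = nn.Linear(d_model, num_labels)
bert_output = torch.randn(4, 128, d_model)  # (batch, seq_len, hidden) coming out of BERT
x = encoder(bert_output.transpose(0, 1))    # the built-in encoder expects (seq_len, batch, d_model)
logits = qa_outputs(x.transpose(0, 1))      # back to (batch, seq_len, num_labels)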
summary
I'm adding alphabetic characters to captcha recognition, but PyTorch's CTC loss does not seem to work properly when they are added.
What I've tried
At first, I modified BLANK_LABEL to 62 since there are 62 labels (0-9, a-z, A-Z), but it gives me the runtime error blank must be in label range. I also tried BLANK_LABEL=0 and then assigned 1~63 as non-blank labels, but it outputs NaN as the loss.
The code
This is the colab link for the current version of my code: here
Below are just the core parts of the code.
Constants:
DATASET_PATH = "/home/ik1ne/Downloads/numbers"
MODEL_PATH = "/home/ik1ne/Downloads"
BATCH_SIZE = 50
TRAIN_BATCHES = 180
TEST_BATCHES = 20
TOTAL_BATCHES = TRAIN_BATCHES+TEST_BATCHES
TOTAL_DATASET = BATCH_SIZE*TOTAL_BATCHES
BLANK_LABEL = 63
dataset generation:
!pip install captcha
from captcha.image import ImageCaptcha
import itertools
import os
import random
import string
from pathlib import Path
if not os.path.exists(DATASET_PATH):
os.makedirs(DATASET_PATH)
characters = "0123456789"+string.ascii_lowercase + string.ascii_uppercase
while(len(list(Path(DATASET_PATH).glob('*'))) < TOTAL_BATCHES):
captcha_str = "".join(random.choice(characters) for x in range(6))
if captcha_str in list(Path(DATASET_PATH).glob('*')):
continue
ImageCaptcha().write(captcha_str, f"{DATASET_PATH}/{captcha_str}.png")
dataset:
def convert_strseq_to_numseq(s):
for c in s:
if c >= '0' and c <= '9':
return int(c)
elif c>='a' and c <='z':
return ord(c)-ord('a')+10
else:
return ord(c)-ord('A')+36
class CaptchaDataset(Dataset):
"""CAPTCHA dataset."""
def __init__(self, root_dir, transform=None):
self.root_dir = root_dir
self.image_paths = list(Path(root_dir).glob('*'))
self.transform = transform
def __getitem__(self, index):
image = Image.open(self.image_paths[index])
if self.transform:
image = self.transform(image)
label_sequence = [convert_strseq_to_numseq(c) for c in self.image_paths[index].stem]
return (image, torch.tensor(label_sequence))
def __len__(self):
return len(self.image_paths)
model:
class StackedLSTM(nn.Module):
def __init__(self, input_size=60, output_size=11, hidden_size=512, num_layers=2):
super(StackedLSTM, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.dropout = nn.Dropout()
self.fc = nn.Linear(hidden_size, output_size)
self.lstm = nn.LSTM(input_size, hidden_size, num_layers)
def forward(self, inputs, hidden):
seq_len, batch_size, input_size = inputs.shape  # inputs arrive as (seq_len/width, batch, features)
outputs, hidden = self.lstm(inputs, hidden)
outputs = self.dropout(outputs)
outputs = torch.stack([self.fc(outputs[i]) for i in range(seq_len)])
outputs = F.log_softmax(outputs, dim=2)
return outputs, hidden
def init_hidden(self, batch_size):
weight = next(self.parameters()).data
return (weight.new(self.num_layers, batch_size, self.hidden_size).zero_(),
weight.new(self.num_layers, batch_size, self.hidden_size).zero_())
net = StackedLSTM().to(device)
training:
net.train() # set network to training phase
epochs = 30
# for each pass of the training dataset
for epoch in range(epochs):
train_loss, train_correct, train_total = 0, 0, 0
h = net.init_hidden(BATCH_SIZE)
# for each batch of training examples
for batch_index, (inputs, targets) in enumerate(train_dataloader):
inputs, targets = inputs.to(device), targets.to(device)
h = tuple([each.data for each in h])
BATCH_SIZE, channels, height, width = inputs.shape
# reshape inputs: NxCxHxW -> WxNx(HxC)
inputs = (inputs
.permute(3, 0, 2, 1)
.contiguous()
.view((width, BATCH_SIZE, -1)))
optimizer.zero_grad() # zero the parameter gradients
outputs, h = net(inputs, h) # forward pass
# compare output with ground truth
input_lengths = torch.IntTensor(BATCH_SIZE).fill_(width)
target_lengths = torch.IntTensor([len(t) for t in targets])
loss = criterion(outputs, targets, input_lengths, target_lengths)
loss.backward() # backpropagation
nn.utils.clip_grad_norm_(net.parameters(), 10) # clip gradients
optimizer.step() # update network weights
# record statistics
prob, max_index = torch.max(outputs, dim=2)
train_loss += loss.item()
train_total += len(targets)
for i in range(BATCH_SIZE):
raw_pred = list(max_index[:, i].cpu().numpy())
pred = [c for c, _ in groupby(raw_pred) if c != BLANK_LABEL]
target = list(targets[i].cpu().numpy())
if pred == target:
train_correct += 1
# print statistics every 10 batches
if (batch_index + 1) % 10 == 0:
print(f'Epoch {epoch + 1}/{epochs}, ' +
f'Batch {batch_index + 1}/{len(train_dataloader)}, ' +
f'Train Loss: {(train_loss/1):.5f}, ' +
f'Train Accuracy: {(train_correct/train_total):.5f}')
train_loss, train_correct, train_total = 0, 0, 0
This error occurs when the index of the blank is not smaller than the total number of classes, which equals the number of characters + blank. What's more, the indices start from 0, not 1, so if you have 62 characters in total, their indices should be 0-61 and the index of the blank should be 62 instead of 63. (Or you can set the blank to 0 and the other characters to 1-62.)
You should also check the shape of the output tensor. It should have shape [T, B, C], where T is the number of time steps, B is the batch size, and C is the number of classes; remember to include the blank in the class count or you will run into this problem.
Most probably there is some problem with the shape of the network output when it is sent to the CTC loss, but you should have provided the dataset so we could see the shapes. The output should be (T, N, C), where T = input length, N = batch size, and C = number of classes. As I understand it, the blank symbol id should be in the 0..C range. Also, you should add a blank symbol, for example '-', to the alphabet.
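For reference, a minimal sketch of those conventions with nn.CTCLoss (all sizes here are illustrative):
import torch
import torch.nn as nn
T, N, C = 160, 50, 63                                      # time steps, batch size, 62 characters + 1 blank
BLANK = 62                                                 # the blank must be a valid class index, i.e. < C
criterion = nn.CTCLoss(blank=BLANK)
log_probs = torch.randn(T, N, C).log_softmax(2)            # network output with shape (T, N, C)
targets = torch.randint(0, 62, (N, 6), dtype=torch.long)   # labels in 0..61, blank excluded
input_lengths = torch.full((N,), T, dtype=torch.long)
target_lengths = torch.full((N,), 6, dtype=torch.long)
loss = criterion(log_probs, targets, input_lengths, target_lengths)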
Here is my current code implementing an encoder LSTM using raw_rnn. This question is also related to another question I asked before (Tensorflow raw_rnn retrieve tensor of shape BATCH x DIM from embedding matrix).
When I run the following code I get the following error:
ValueError: The two structures don't have the same number of elements.
First structure (1 elements): None
Second structure (2 elements): LSTMStateTuple(c=64, h=64)
The error occurs on the line: encoder_outputs_ta, encoder_final_state, _ = tf.nn.raw_rnn(cell, loop_fn=reader_loop)
import tensorflow as tf
import numpy as np
batch_size, max_time, input_embedding_size = 5, 10, 16
vocab_size, num_units = 50, 64
encoder_inputs = tf.placeholder(shape=(None, None), dtype=tf.int32, name='encoder_inputs')
encoder_inputs_length = tf.placeholder(shape=(None,), dtype=tf.int32, name='encoder_inputs_length')
embeddings = tf.Variable(tf.random_uniform([vocab_size + 2, input_embedding_size], -1.0, 1.0),
dtype=tf.float32, name='embeddings')
encoder_inputs_embedded = tf.nn.embedding_lookup(embeddings, encoder_inputs)
cell = tf.contrib.rnn.LSTMCell(num_units)
W = tf.Variable(tf.random_uniform([num_units, vocab_size], -1, 1), dtype=tf.float32, name='W_reader')
b = tf.Variable(tf.zeros([vocab_size]), dtype=tf.float32, name='b_reader')
with tf.variable_scope('ReaderNetwork'):
def loop_fn_initial():
init_elements_finished = (0 >= encoder_inputs_length)
init_input = cell.zero_state(batch_size, dtype=tf.float32)
init_cell_state = None
init_cell_output = None
init_loop_state = None
return (init_elements_finished, init_input,
init_cell_state, init_cell_output, init_loop_state)
def loop_fn_transition(time, previous_output, previous_state, previous_loop_state):
def get_next_input():
return tf.ones([batch_size, input_embedding_size], dtype=tf.float32) # TODO replace with value from embeddings
elements_finished = (time >= encoder_inputs_length)
finished = tf.reduce_all(elements_finished) # boolean scalar
next_input = tf.cond(finished,
true_fn=lambda: tf.zeros([batch_size, input_embedding_size], dtype=tf.float32),
false_fn=get_next_input)
state = previous_state
output = previous_output
loop_state = None
return elements_finished, next_input, state, output, loop_state
def loop_fn(time, previous_output, previous_state, previous_loop_state):
if previous_state is None: # time = 0
return loop_fn_initial()
return loop_fn_transition(time, previous_output, previous_state, previous_loop_state)
reader_loop = loop_fn
encoder_outputs_ta, encoder_final_state, _ = tf.nn.raw_rnn(cell, loop_fn=reader_loop)
outputs = encoder_outputs_ta.stack()
def next_batch():
return {
encoder_inputs: np.random.random((batch_size, max_time)),
encoder_inputs_length: [max_time] * batch_size
}
init = tf.global_variables_initializer()
with tf.Session() as s:
s.run(init)
outs = s.run([outputs], feed_dict=next_batch())
print(len(outs), outs[0].shape)
Resolved the problem by changing the initial state and input: at time 0, loop_fn must return the first input (a batch_size x input_embedding_size tensor) as next_input and cell.zero_state(...) as the initial cell state; returning None for the state is what caused the mismatch with LSTMStateTuple(c=64, h=64):
init_input = tf.zeros([batch_size, input_embedding_size], dtype=tf.float32)
init_cell_state = cell.zero_state(batch_size, tf.float32)
def loop_fn_initial():
init_elements_finished = (0 >= encoder_inputs_length)
init_input = tf.zeros([batch_size, input_embedding_size], dtype=tf.float32)
init_cell_state = cell.zero_state(batch_size, tf.float32)
init_cell_output = None
init_loop_state = None
return (init_elements_finished, init_input,
init_cell_state, init_cell_output, init_loop_state)