I am building a sequence-to-label learning model in PyTorch. I have two sentences and I am classifying whether they are entailed or not (SNLI dataset). I concatenate two 50-word sentences (sometimes padded) into a vector of length 100, then send minibatches through word embeddings -> LSTM -> Linear layer. I am using cross-entropy loss, but the CrossEntropyLoss function requires an input of shape [mini_batch, C]; instead my output still carries all 100 words, as [mini_batch, 100, C].
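For reference, here is a minimal standalone sketch (mine, not part of the model below) of the shapes nn.CrossEntropyLoss expects: unnormalized class scores shaped [mini_batch, C] and class-index targets shaped [mini_batch]:

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()

logits = torch.randn(3, 5)            # (mini_batch, C) unnormalized class scores
target = torch.tensor([0, 2, 1])      # (mini_batch,) class indices, dtype long
print(criterion(logits, target))      # works

bad = torch.randn(3, 100, 5)          # (mini_batch, 100, C): one score row per word
# criterion(bad, target)              # raises a dimension error like the one below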
Here is my model:
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(mlp_d, 1024)

        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

        #self.sm = nn.CrossEntropyLoss()

    def display(self):
        for param in self.parameters():
            print(param.data.size())

    def filter_params(self):
        # Might not be compatible with python 3
        #self.parameters = filter(lambda p: p.requires_grad, self.parameters())
        pass

    def init_hidden(self):
        # Need to init hidden weights in LSTM
        pass

    def forward(self, sentence):
        print(sentence.size())
        embeds = self.embedding(sentence)
        print(embeds.size())
        out, _ = self.lstm(embeds)
        print(out.size())
        out = self.mlp(out)
        return out
My training code with output:
from torch import optim
from torch.autograd import Variable

batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)

criterion = nn.CrossEntropyLoss()
num_epochs = 50

for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])

        # Convert numpy to Tensor
        res = np.concatenate((s1, s2), axis=1)  # Attach two sentences into 1 input vector
        input_tensor = torch.from_numpy(res).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
        data, target = Variable(input_tensor), Variable(target_tensor)

        # Zero gradients
        SGD_optimizer.zero_grad()

        # Forward Pass
        output = model.forward(data)
        print("Output size: ")
        print(output.size())
        print("Target size: ")
        print(target.size())

        # Calculate loss with respect to training labels
        loss = criterion(output, target)

        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()
        #ADM_optimizer.step()
output:
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
error:
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED -------------------------------------------------------------------
I have now got my model training, but I am getting low accuracy. Is there an issue with the way my LSTM outputs are concatenated and then condensed to a smaller tensor before going through my linear layer?
New Model:
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
        super(myLSTM, self).__init__()
        self.num_layers = lstm_layers
        self.hidden_size = h_size
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2 * h_size * 2, num_classes)

        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

    def forward(self, s1, s2):
        # Set initial states
        #h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection
        #c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()
        batch_size = s1.size()[0]

        embeds_1 = self.embedding(s1)
        embeds_2 = self.embedding(s2)

        _, (h_1_last, _) = self.lstm(embeds_1)  #, (h0, c0))  # note the change here: the last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)  #, (h0, c0))

        concat = torch.cat((h_1_last, h_2_last), dim=2)  # double check the dimension
        concat = concat.view(batch_size, -1)
        scores = self.mlp(concat)
        return scores
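For clarity, here is a standalone shape check (my own sketch, with sizes matching the printout below): with num_layers=1 and bidirectional=True, each h_n has shape (2, batch, h_size), one slice per direction, so concatenating the two sentences' hidden states along dim=2 and flattening gives (batch, 2 * h_size * 2), which matches the linear layer:

import torch
import torch.nn as nn

lstm = nn.LSTM(300, 128, num_layers=1, bidirectional=True, batch_first=True)
x1, x2 = torch.randn(64, 50, 300), torch.randn(64, 50, 300)

_, (h1, _) = lstm(x1)                  # h1: (2, 64, 128) = (directions, batch, h_size)
_, (h2, _) = lstm(x2)

concat = torch.cat((h1, h2), dim=2)    # (2, 64, 256)
print(concat.view(64, -1).shape)       # torch.Size([64, 512]) = 2 * h_size * 2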
New Training:
from torch import optim
from torch.autograd import Variable

batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10

model.train()

if cuda:
    model = model.cuda()
    criterion = criterion.cuda()

epoch_losses = []

for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))

    # Batch loss aggregator
    losses = []

    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])

        # Convert numpy to Tensor
        s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
        s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.LongTensor)

        s1 = Variable(s1_tensor)
        s2 = Variable(s2_tensor)
        target = Variable(target_tensor)

        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()

        # Zero gradients
        SGD_optimizer.zero_grad()

        # Forward Pass
        output = model.forward(s1, s2)

        # Calculate loss with respect to training labels
        loss = criterion(output, target)
        losses.append(loss.data[0])

        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()

    # concat losses to epoch losses
    epoch_losses += losses
Training output with tensor sizes printed:
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propagation
torch.Size([64, 3])
Evaluation
def eval_model(model, mode='dev'):
    file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
    dev_data, _ = obtain_data(file_name)
    dev_n_data = vocab.process_data(dev_data)

    print("Length of data: {}".format(len(dev_n_data)))

    eval_batch_size = 1024
    model.eval()

    total = len(dev_n_data)
    hit = 0
    correct = 0

    # Batch dev eval
    for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
        s1, s2, y = convert_to_numpy(dev_n_data[start:end])

        s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
        s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.LongTensor)

        s1 = Variable(s1_tensor, volatile=True)
        s2 = Variable(s2_tensor, volatile=True)
        target = Variable(target_tensor, volatile=True)

        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()

        output = model.forward(s1, s2)
        loss = criterion(output, target)

        #print("output size: {}".format(output.size()))
        #print("target size: {}".format(target.size()))
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        #print(pred[:5])
        #print(output[:])
        correct += pred.eq(target.data).cpu().sum()

    return correct / float(total)

eval_model(model)
I think there is an issue with the way you are trying to solve the entailment problem.
Maybe you can do it this way:
design your module to accept two sentences as input
embed both of them with your embeddings
encode them using the LSTM module
now you have two fixed-length vector representations of the two sentences; the simplest thing to do is to just concatenate them together
add a linear layer on top to evaluate scores for each entailment class (3, I suppose)
apply softmax to get a proper probability distribution
So your model can look like this (double check the dimensions):
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes=3):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2*h_size*2, num_classes)  # <- change here

    def forward(self, sentence1, sentence2):
        embeds_1 = self.embedding(sentence1)
        embeds_2 = self.embedding(sentence2)

        _, (h_1_last, _) = self.lstm(embeds_1)  # note the change here: the last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)

        concat = torch.cat([h_1_last, h_2_last], dim=1)  # double check the dimension
        scores = self.mlp(concat)
        probas = F.softmax(scores)  # from torch.nn.functional
        return probas
Then you can play around with adding more hidden layers, or think about how combining the two sentences can be done in a more intelligent way (attention, etc).
Double check what CrossEntropyLoss accepts as input and target and adjust accordingly (does it take unnormalized class scores or a probability distribution?). Check http://pytorch.org/docs/master/nn.html#lstm for the LSTM module documentation to clarify what LSTM returns (do you need the hidden states for every word, or just the representation after the last one?).
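On that last point, one detail worth spelling out (my own sketch, based on the documented behavior): nn.CrossEntropyLoss applies log-softmax internally, so when training with this criterion the model should return the raw scores and skip the explicit softmax:

import torch
import torch.nn as nn
import torch.nn.functional as F

scores = torch.randn(4, 3)                        # unnormalized class scores
target = torch.tensor([0, 2, 1, 2])

# CrossEntropyLoss is equivalent to log_softmax followed by NLLLoss:
a = nn.CrossEntropyLoss()(scores, target)
b = F.nll_loss(F.log_softmax(scores, dim=1), target)
print(torch.allclose(a, b))                       # True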
Related
I built a bidirectional LSTM for sentiment analysis on news headlines, but when training the model the loss value doesn't improve; it stays around 0.6 and 0.7.
I'm certainly doing something wrong, and I wonder if it is related to the Embedding layer.
I'm iteratively passing each batch into the network with a batch size of 10 and a sentence length of 30 words; my vocabulary size is 5745, so the shape of this tensor after one-hot encoding is (10, 30, 5745).
My Embedding layer has num_embeddings = 5745 and embedding_dim = 100, so when I call self.embedding(input) the output shape is (10, 30, 5745, 100).
I wanted an output shape of (10, 30, 100).
Therefore I used this line of code:
embeddings = torch.max(embeddings, dim=2)
But I'm not sure it does what I expect for each word/one-hot vector, which is: if I have a one-hot vector representing a word with shape (5745, 1) and an embedding matrix with shape (100, 5745), I get an embedding vector of shape (100, 1), so would the code above give me an output of (10, 30, 100)?
Maybe I'm not thinking about this correctly and it's affecting my end result.
RNN:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        # calling the init function of the RNN parent
        super(RNN, self).__init__()

        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim,
                               hidden_dim,
                               n_layers,
                               dropout=dropout,
                               bidirectional=True)

        # Linear transformation
        self.decoder = nn.Linear(hidden_dim*2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        # (batch_size, timesteps, embed_dim)
        embeddings = self.dropout(self.embedding(inputs))
        embeddings = torch.max(embeddings, dim=2)
        embeddings = embeddings[0].type(torch.cuda.FloatTensor)

        # output of each timestep
        output, (hidden, cell) = self.encoder(embeddings)
        merge = self.dropout(torch.cat((hidden[-2,:,:], hidden[-1,:,:]), dim=1))
        output = self.decoder(merge)
        return output
train:
def train(model, text, label, epochs, lr=0.001):
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=lr)

    # The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps
    criterion = nn.BCEWithLogitsLoss()

    counter = 0
    for e in range(epochs):
        for x, y in batches(text, label):
            x = one_hot_encode(x, vocab)
            x = torch.from_numpy(x).to(device)
            output = model(x)
            #print(torch.cuda.memory_summary(device=None, abbreviated=False))

            y = torch.from_numpy(y)
            y = torch.unsqueeze(y, 1).to(device)
            #print(output.shape)
            #print(y.shape)
            loss = criterion(output, y.float())
            acc = binaryAccuracy(output, y)

            # In order to avoid gradient accumulation before backpropagation
            opt.zero_grad()
            loss.backward()
            opt.step()
            counter += 1

        print("Epoch {}/{}".format(e+1, epochs),
              "Loss: {}".format(loss.item()),
              "accuracy: {}".format(acc))
one-hot:
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.int64)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    return one_hot
batches:
def batches(text, label, num_seqs=10):
    counter = 0

    # create empty arrays with the specified number of columns
    x = np.array([], dtype=int).reshape(0, findMaxLen())
    y = np.array([], dtype=int).reshape(0, 1)

    for sent, l in zip(text, label):
        # create a np array of zeros with length 30
        tmp1 = np.zeros((findMaxLen()), dtype=int)
        #tmp1 = np.randint(0, high=vocab, size=findMaxLen(), dtype=int)

        # create a 1d array
        tmp2 = np.atleast_1d(np.array(l))

        for ind, wrd in enumerate(sent):
            if wrd in uniqueWrds and ind < 30:
                tmp1[ind] = word_to_index[wrd]

        # append the arrays to the empty arrays
        x = np.vstack([x, tmp1])
        y = np.vstack([y, tmp2])

        counter += 1
        if counter == num_seqs:
            yield x, np.squeeze(y, 1)
            counter = 0
            x = np.array([], dtype=int).reshape(0, findMaxLen())
            y = np.array([], dtype=int).reshape(0, 1)
If you want your output to have dimension (10, 30, 100), I think you need to input the indexes of the words of the sentences as an array of dimension (10, 30). You can get the indexes by performing torch.argmax on the one-hot encoded input tensor.
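Concretely, a small sketch of that suggestion (the tensor sizes are the ones from the question; the rest is illustrative): nn.Embedding already performs the one-hot-times-matrix lookup internally, so it wants integer indices of shape (batch, seq_len), which you can recover from a one-hot tensor with torch.argmax:

import torch
import torch.nn as nn
import torch.nn.functional as F

indices = torch.randint(0, 5745, (10, 30))        # (batch, seq_len) word indexes
one_hot = F.one_hot(indices, num_classes=5745)    # (10, 30, 5745), as in the question

recovered = torch.argmax(one_hot, dim=2)          # back to (10, 30)
embedding = nn.Embedding(num_embeddings=5745, embedding_dim=100)
print(embedding(recovered).shape)                 # torch.Size([10, 30, 100])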
I'm trying to get similar results on the same dataset with Keras and PyTorch.
Data
from numpy import array
from numpy import hstack
from sklearn.model_selection import train_test_split

# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
    X, y = list(), list()
    for i in range(len(sequences)):
        # find the end of this pattern
        end_ix = i + n_steps
        # check if we are beyond the dataset
        if end_ix > len(sequences):
            break
        # gather input and output parts of the pattern
        seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
        X.append(seq_x)
        y.append(seq_y)
    return array(X), array(y)

def get_data():
    # define input sequence
    in_seq1 = array([x for x in range(0, 500, 10)])/1
    in_seq2 = array([x for x in range(5, 505, 10)])/1
    out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])

    # convert to [rows, columns] structure
    in_seq1 = in_seq1.reshape((len(in_seq1), 1))
    in_seq2 = in_seq2.reshape((len(in_seq2), 1))
    out_seq = out_seq.reshape((len(out_seq), 1))

    # horizontally stack columns
    dataset = hstack((in_seq1, in_seq2, out_seq))

    n_features = 2   # this is number of parallel inputs
    n_timesteps = 3  # this is number of timesteps

    # convert into input/output
    X, y = split_sequences(dataset, n_timesteps)
    print(X.shape, y.shape)

    X_train, x_test, Y_train, y_test = train_test_split(X, y, test_size=0.2, shuffle=False)
    return X_train, x_test, Y_train, y_test
Keras
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc  # !!!!change this!!!!

X_train, x_test, Y_train, y_test = dc.get_data()

n_features = 2   # this is number of parallel inputs
n_timesteps = 3  # this is number of timesteps

# define model
model = Sequential()
model.add(LSTM(1024, activation='relu',
               input_shape=(n_timesteps, n_features),
               kernel_initializer='uniform',
               recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))

opt = keras.optimizers.Adam(lr=0.001,
                            beta_1=0.9,
                            beta_2=0.999,
                            epsilon=keras.optimizers.K.epsilon(),
                            decay=0.0,
                            amsgrad=False)
model.compile(optimizer=opt, loss='mse')

# fit model
model.fit(X_train, Y_train, epochs=200, verbose=1, validation_data=(x_test, y_test))

yhat = model.predict(x_test, verbose=0)
mean_squared_error(y_test, yhat)
PyTorch - module class
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc  # !!!! change this !!!!

X_train, x_test, Y_train, y_test = dc.get_data()

n_features = 2   # this is number of parallel inputs
n_timesteps = 3  # this is number of timesteps

class MV_LSTM(torch.nn.Module):
    def __init__(self, n_features, seq_length):
        super(MV_LSTM, self).__init__()
        self.n_features = n_features  # number of parallel inputs
        self.seq_len = seq_length     # number of timesteps
        self.n_hidden = 1024          # number of hidden states
        self.n_layers = 1             # number of LSTM layers (stacked)

        self.l_lstm = torch.nn.LSTM(input_size=n_features,
                                    hidden_size=self.n_hidden,
                                    num_layers=self.n_layers,
                                    batch_first=True)
        # according to pytorch docs LSTM output is
        # (batch_size, seq_len, num_directions * hidden_size)
        # when considering batch_first = True
        self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
        # self.l_linear1 = torch.nn.Linear(512, 512)
        self.l_linear2 = torch.nn.Linear(512, 1)

    def init_hidden(self, batch_size):
        # even with batch_first = True this remains same as docs
        hidden_state = torch.zeros(self.n_layers, batch_size, self.n_hidden).to(next(self.parameters()).device)
        cell_state = torch.zeros(self.n_layers, batch_size, self.n_hidden).to(next(self.parameters()).device)
        self.hidden = (hidden_state, cell_state)

    def forward(self, x):
        batch_size, seq_len, _ = x.size()
        lstm_out, self.hidden = self.l_lstm(x, self.hidden)
        # lstm_out (with batch_first = True) is
        # (batch_size, seq_len, num_directions * hidden_size)
        # for following linear layer we want to keep batch_size dimension and merge rest
        # .contiguous() -> solves tensor compatibility error
        x = lstm_out.contiguous().view(batch_size, -1)
        x = F.relu(x)
        x = F.relu(self.l_linear(x))
        # x = F.relu(self.l_linear1(x))
        x = self.l_linear2(x)
        return x
PyTorch - init and train
# create NN
mv_net = MV_LSTM(n_features, n_timesteps)
criterion = torch.nn.MSELoss()

import keras  # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(),
                             lr=1e-3,
                             betas=[0.9, 0.999],
                             eps=keras.optimizers.K.epsilon(),
                             weight_decay=0,
                             amsgrad=False)

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)

train_episodes = 200
batch_size = 32
eval_batch_size = 32

for t in range(train_episodes):
    # TRAIN
    mv_net.train()
    for b in range(0, len(X_train), batch_size):
        inpt = X_train[b:b+batch_size, :, :]
        target = Y_train[b:b+batch_size]

        x_batch = torch.tensor(inpt, dtype=torch.float32).to(device)
        y_batch = torch.tensor(target, dtype=torch.float32).to(device)

        mv_net.init_hidden(x_batch.size(0))
        output = mv_net(x_batch)
        loss = criterion(output.view(-1), y_batch)

        loss.backward()
        optimizer.step()
        optimizer.zero_grad()

    # EVAL
    mv_net.eval()
    mv_net.init_hidden(eval_batch_size)
    acc = 0
    for b in range(0, len(x_test), eval_batch_size):
        inpt = x_test[b:b+eval_batch_size, :, :]
        target = y_test[b:b+eval_batch_size]

        x_batch = torch.tensor(inpt, dtype=torch.float32).to(device)
        y_batch = torch.tensor(target, dtype=torch.float32).to(device)

        mv_net.init_hidden(x_batch.size(0))
        output = mv_net(x_batch)
        acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy())

    print('step:', t, 'train loss:', round(loss.item(), 3), 'eval acc:', round(acc/len(x_test), 3))

mv_net.init_hidden(len(x_test))
val = torch.tensor(x_test, dtype=torch.float32).to(device)
otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
Results
Keras produces a test MSE of almost 0, but PyTorch produces about 6000, which is far too different.
I have tried a couple of tweaks in the PyTorch code, but none got me anywhere close to the Keras results, even with identical optimizer parameters.
I can't see what is wrong with this (fairly tutorial-like) PyTorch code.
I know it is almost one year too late, but I came across the same problem and I think the problem is the following. From the Keras documentation:
return_sequences: Boolean. Whether to return the last output in the output sequence, or the full sequence.
This basically means that the input size of your self.l_linear needs to be torch.nn.Linear(1024, 512) (i.e. self.n_hidden) instead of torch.nn.Linear(self.n_hidden*self.seq_len, 512).
Now you also need to do the same as Keras does and only use the last output in your forward pass:
def forward(self, x):
    batch_size, seq_len, _ = x.size()
    lstm_out, self.hidden = self.l_lstm(x, self.hidden)
    x = lstm_out[:, -1]
    x = torch.nn.functional.relu(x)
    x = torch.nn.functional.relu(self.l_linear(x))
    x = self.l_linear2(x)
    return x
When I run your example (which I needed to tweak a bit to get it to run), I get very similar training losses.
Keras:
38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259
PyTorch:
step: 199 train loss: 41.043 eval acc: 1142.688
I hope this helps others having a similar problem.
PS: also note that Keras resets the hidden state (stateful=False) by default.
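To see the correspondence concretely, here is a small standalone check (my own sketch, single-layer unidirectional case): taking the last timestep of lstm_out is the same as taking the final hidden state h_n, which is what Keras returns with return_sequences=False:

import torch

lstm = torch.nn.LSTM(input_size=2, hidden_size=8, num_layers=1, batch_first=True)
x = torch.randn(4, 3, 2)                          # (batch, seq_len, features)

lstm_out, (h_n, c_n) = lstm(x)
print(torch.allclose(lstm_out[:, -1], h_n[-1]))   # True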
I am testing a two-step architecture that is composed of a conventional first section that can be implemented with any standard deep learning architecture and a second section that must be coded manually outside the declaration of the Pytorch graph (while still utilizing numpy-like torch functions).
My problem can be simplified to coding a feed-forward neural network with two hidden layers, where the first is implemented within the Pytorch graph and the second is implemented manually outside the Pytorch graph.
Architecture:
Input
-> Linear(28 * 28, 120) w/ Pytorch
-> ReLU w/ Pytorch
-> Linear(120, 84) w/ Pytorch
-> ReLU w/ Pytorch
-> Linear(84, 10) w/o Pytorch
-> Output
Problem: My implementation below achieves a very low ~74% accuracy, while a standard, fully PyTorch implementation achieves ~95%. What is causing this disparity?
I believe my problem lies in manually passing back the deltas, although the math looks right, so I am stuck in finding a solution to this.
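As a sanity check (my own sketch, not part of the code below): the manual deltas for a final linear layer under a plain sum-of-squares error can be verified against autograd, which is one way to locate where the math and the implementation diverge:

import torch

# Hypothetical shapes: batch of 4, hidden size 84, 10 classes.
h = torch.randn(4, 84, requires_grad=True)
W = torch.randn(84, 10, requires_grad=True)
y = torch.randn(4, 10)

out = h @ W
loss = 0.5 * ((out - y) ** 2).sum()
loss.backward()

# Manual deltas for the linear layer:
delta_out = out.detach() - y               # dE/d(out)
dEdW = h.detach().t() @ delta_out          # dE/dW, should match W.grad
delta_h = delta_out @ W.detach().t()       # dE/dh, should match h.grad

print(torch.allclose(dEdW, W.grad), torch.allclose(delta_h, h.grad))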
Implementation of architecture and training on MNIST:
class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 120)
        self.fc2 = nn.Linear(120, 84)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

net = Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Initialize weights just as Pytorch does by default:
m = torch.distributions.uniform.Uniform(torch.tensor([-np.sqrt(1.0/84)]),
                                        torch.tensor([np.sqrt(1.0/84)]))
W = m.sample((84, 10)).reshape((84, 10))

# based on https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(2):  # loop over the dataset multiple times
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # make one-hot encoding of labels
        targets = oneHot(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        pytorch_outputs = net(inputs)
        pytorch_outputs = torch.autograd.Variable(pytorch_outputs,
                                                  requires_grad=True)
        manual_outputs = torch.mm(pytorch_outputs, W)

        delta_out = manual_outputs - targets.view(-1, 10)  # = error_out
        dEdW3 = torch.mm(torch.t(pytorch_outputs), delta_out)
        W -= 0.01 * dEdW3  # gradient descent

        delta_h = torch.autograd.Variable(
            torch.t(torch.mm(W, torch.t(delta_out))))
        loss = criterion(pytorch_outputs, delta_h)
        loss.backward()
        optimizer.step()
Full code:
import torch
import torchvision
import torchvision.transforms as transforms
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import numpy as np

transform = transforms.Compose(
    [transforms.ToTensor(),
     transforms.Normalize((0.0, 0.0, 0.0), (1.0, 1.0, 1.0))])

trainset = torchvision.datasets.MNIST(root='./data', train=True,
                                      download=True, transform=transform)
trainloader = torch.utils.data.DataLoader(trainset, batch_size=1,
                                          shuffle=True, num_workers=1)
testset = torchvision.datasets.MNIST(root='./data', train=False,
                                     download=True, transform=transform)
testloader = torch.utils.data.DataLoader(testset, batch_size=1,
                                         shuffle=False, num_workers=1)

classes = ('0', '1', '2', '3',
           '4', '5', '6', '7', '8', '9')

def oneHot(a):
    b = torch.zeros(10)
    b[a] = 1
    return b

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.fc1 = nn.Linear(28 * 28, 120)
        self.fc2 = nn.Linear(120, 84)

    def forward(self, x):
        x = x.view(-1, 28 * 28)
        x = F.relu(self.fc1(x))
        x = F.relu(self.fc2(x))
        return x

net = Net()
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.01)

# Initialize weights just as Pytorch does by default:
m = torch.distributions.uniform.Uniform(torch.tensor([-np.sqrt(1.0/84)]),
                                        torch.tensor([np.sqrt(1.0/84)]))
W = m.sample((84, 10)).reshape((84, 10))

# based on https://pytorch.org/tutorials/beginner/blitz/cifar10_tutorial.html
for epoch in range(2):  # loop over the dataset multiple times
    for i, data in enumerate(trainloader, 0):
        # get the inputs
        inputs, labels = data

        # make one-hot encoding of labels
        targets = oneHot(labels)

        # zero the parameter gradients
        optimizer.zero_grad()

        # forward + backward + optimize
        pytorch_outputs = net(inputs)
        pytorch_outputs = torch.autograd.Variable(pytorch_outputs,
                                                  requires_grad=True)
        manual_outputs = torch.mm(pytorch_outputs, W)

        delta_out = manual_outputs - targets.view(-1, 10)  # = error_out
        dEdW3 = torch.mm(torch.t(pytorch_outputs), delta_out)
        W -= 0.001 * dEdW3  # gradient descent

        delta_h = torch.autograd.Variable(
            torch.t(torch.mm(W, torch.t(delta_out))))
        loss = criterion(pytorch_outputs, delta_h)
        loss.backward()
        optimizer.step()

correct = 0
total = 0
with torch.no_grad():
    for data in testloader:
        inputs, labels = data
        pytorch_outputs = torch.autograd.Variable(net(inputs),
                                                  requires_grad=True)
        manual_outputs = torch.mm(pytorch_outputs, W)
        _, predicted = torch.max(manual_outputs.data, 1)
        total += labels.size(0)
        correct += (predicted == labels).sum().item()

print('Accuracy of the network on the 10000 test images: %d %%' % (
    100 * correct / total))
I'm trying a basic averaging example, but the validation results and the loss don't match, and the network fails to converge if I increase the training time. I'm training a network with 2 hidden layers, each 500 units wide, on three integers from the range [0, 9], with a learning rate of 1e-1, Adam, a batch size of 1, and dropout, for 3000 iterations; I validate every 100 iterations. If the absolute difference between the label and the hypothesis is less than a threshold (here set to 1), I consider the prediction correct. Could someone let me know if this is an issue with the choice of loss function, something wrong with PyTorch, or something I'm doing wrong? Below are some plots:
val_diff = 1
acc_diff = torch.FloatTensor([val_diff]).expand(self.batch_size)
Loop 100 times during validation:
num_correct += torch.sum(torch.abs(val_h - val_y) < acc_diff)
Append after each validation phase:
validate.append(num_correct / total_val)
Here are some example (hypothesis, label) pairs:
[...(-0.7043088674545288, 6.0), (-0.15691305696964264, 2.6666667461395264),
(0.2827358841896057, 3.3333332538604736)]
I tried six of the loss functions in the API that are typically used for regression:
torch.nn.L1Loss(size_average=False)
torch.nn.L1Loss()
torch.nn.MSELoss(size_average=False)
torch.nn.MSELoss()
torch.nn.SmoothL1Loss(size_average=False)
torch.nn.SmoothL1Loss()
Thanks.
Network code:
class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        self.input_dim = topology['features']
        self.num_hidden = topology['hidden_layers']
        self.hidden_dim = topology['hidden_dim']
        self.output_dim = topology['output_dim']
        self.input_layer = nn.Linear(self.input_dim, self.hidden_dim)
        self.hidden_layer = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)

    def forward(self, x):
        batch_size = x.size()[0]
        feat_size = x.size()[1]
        input_size = batch_size * feat_size

        self.input_layer = nn.Linear(input_size, self.hidden_dim).cuda()
        hidden = self.input_layer(x.view(1, input_size)).clamp(min=0)

        for _ in range(self.num_hidden):
            hidden = self.dropout_layer(F.relu(self.hidden_layer(hidden)))

        output_size = batch_size * self.output_dim
        self.output_layer = nn.Linear(self.hidden_dim, output_size).cuda()
        return self.output_layer(hidden).view(output_size)
Training code:
def train(self):
    if self.cuda:
        self.network.cuda()

    dh = DataHandler(self.data)
    # loss_fn = nn.L1Loss(size_average=False)
    # loss_fn = nn.L1Loss()
    # loss_fn = nn.SmoothL1Loss(size_average=False)
    # loss_fn = nn.SmoothL1Loss()
    # loss_fn = nn.MSELoss(size_average=False)
    loss_fn = torch.nn.MSELoss()

    losses = []
    validate = []
    hypos = []
    labels = []
    val_size = 100
    val_diff = 1
    total_val = float(val_size * self.batch_size)

    for i in range(self.iterations):
        x, y = dh.get_batch(self.batch_size)
        x = self.tensor_to_Variable(x)
        y = self.tensor_to_Variable(y)

        self.optimizer.zero_grad()
        loss = loss_fn(self.network(x), y)
        loss.backward()
        self.optimizer.step()
It looks like you've misunderstood how layers in PyTorch work; here are a few tips:
In your forward, when you do nn.Linear(...) you are defining new layers instead of using the ones you pre-defined in your network's __init__. Therefore, it cannot learn anything, as the weights are constantly reinitialized.
You shouldn't need to call .cuda() inside net.forward(...), since you've already copied the network to the GPU in train by calling self.network.cuda().
Ideally, the input to net.forward(...) should directly have the shape of the first layer, so you won't have to modify it. Here x.size() should match what the first Linear layer expects: (batch_size, features).
Your forward should look close to this:
def forward(self, x):
    x = F.relu(self.input_layer(x))
    x = F.dropout(F.relu(self.hidden_layer(x)), training=self.training)
    x = self.output_layer(x)
    return x
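To illustrate the first point, a small standalone sketch: layers constructed inside forward are never registered on the module, so the optimizer never sees their weights and they are re-randomized on every call:

import torch.nn as nn

class Good(nn.Module):
    def __init__(self):
        super(Good, self).__init__()
        self.fc = nn.Linear(3, 2)       # registered once, trained across steps

    def forward(self, x):
        return self.fc(x)

class Bad(nn.Module):
    def forward(self, x):
        return nn.Linear(3, 2)(x)       # fresh random weights on every call

print(len(list(Good().parameters())))   # 2 (weight and bias)
print(len(list(Bad().parameters())))    # 0: nothing for the optimizer to update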
I am trying to model sentence similarity with TensorFlow. The idea is to first feed the concatenated sentences to an RNN, then feed the output of the RNN to a softmax for binary classification: similar or not. I made some minor changes to the PTB language model example, but the cost won't decrease as expected.
cost: 0.694479
cost: 0.695012
cost: 0.6955
...
The code is as follows. Any kind of help would be appreciated.
class PTBModel(object):
    """The PTB model."""

    def __init__(self, is_training, config):
        self.batch_size = batch_size = config.batch_size
        self.num_steps = num_steps = config.num_steps
        size = config.hidden_size
        vocab_size = config.vocab_size
        label_size = 2

        self._input_data = tf.placeholder(tf.int32, [batch_size, num_steps], name="inputs")
        # for each sentence pair we only have one output
        self._targets = tf.placeholder(tf.int64, [batch_size], name="labels")

        # Slightly better results can be obtained with forget gate biases
        # initialized to 1 but the hyperparameters of the model would need to be
        # different than reported in the paper.
        lstm_cell = rnn_cell.BasicLSTMCell(size, forget_bias=0.0)
        if is_training and config.keep_prob < 1:
            lstm_cell = rnn_cell.DropoutWrapper(
                lstm_cell, output_keep_prob=config.keep_prob)
        cell = rnn_cell.MultiRNNCell([lstm_cell] * config.num_layers)

        self._initial_state = cell.zero_state(batch_size, tf.float32)

        with tf.device("/cpu:0"):
            embedding = tf.get_variable("embedding", [vocab_size, size])
            inputs = tf.nn.embedding_lookup(embedding, self._input_data)

        if is_training and config.keep_prob < 1:
            inputs = tf.nn.dropout(inputs, config.keep_prob)

        # Simplified version of tensorflow.models.rnn.rnn.py's rnn().
        # This builds an unrolled LSTM for tutorial purposes only.
        # In general, use the rnn() or state_saving_rnn() from rnn.py.
        #
        # The alternative version of the code below is:
        #
        # from tensorflow.models.rnn import rnn
        # inputs = [tf.squeeze(input_, [1])
        #           for input_ in tf.split(1, num_steps, inputs)]
        # outputs, states = rnn.rnn(cell, inputs, initial_state=self._initial_state)
        outputs = []
        states = []
        state = self._initial_state
        with tf.variable_scope("RNN"):
            for time_step in range(num_steps):
                if time_step > 0: tf.get_variable_scope().reuse_variables()
                (cell_output, state) = cell(inputs[:, time_step, :], state)
                outputs.append(cell_output)
                states.append(state)

        # use the output of the last word as the input feature to softmax
        output = tf.reshape(tf.concat(1, outputs[-1]), [-1, size])
        softmax_w = tf.get_variable("softmax_w", [size, label_size])
        softmax_b = tf.get_variable("softmax_b", [label_size])
        self.logits = logits = tf.matmul(output, softmax_w) + softmax_b

        loss = tf.nn.sparse_softmax_cross_entropy_with_logits(logits, self.targets)
        self._cost = cost = tf.reduce_sum(loss) / batch_size
        # self._cost = cost = -tf.reduce_sum(tf.reshape(self.targets, [-1, 1])
        #     * tf.log(tf.clip_by_value(tf.log(tf.nn.softmax(logits)), 1e-10, 1.0)))
        self._final_state = states[-1]

        if not is_training:
            return

        self._lr = tf.Variable(0.0, trainable=False)
        tvars = tf.trainable_variables()
        grads, _ = tf.clip_by_global_norm(tf.gradients(cost, tvars),
                                          config.max_grad_norm)
        optimizer = tf.train.GradientDescentOptimizer(self.lr)
        self._train_op = optimizer.apply_gradients(zip(grads, tvars))

    def step(self, session, inputs, labels, is_training):
        if is_training:
            output_feeds = [self.cost, self._final_state, self._train_op]
        else:
            output_feeds = [self.cost]
        input_feeds = {
            self.input_data: inputs,
            self.targets: labels
        }

        cost, state, logits = session.run(output_feeds, input_feeds)
        return cost, state, logits
A small recommendation for you.
You can use two encoders (either RNN or CNN), one for each sentence, and encode the two sentences into two sentence embeddings. Once you have the two sentence vectors, you just calculate the cosine similarity as the output, training against 1 if the two sentences have the same meaning and 0 if not. At inference time you can input any two sentences and get the cosine similarity as the sentence semantic similarity.
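A minimal PyTorch sketch of this twin-encoder idea (names and sizes are illustrative, not from the question's TensorFlow code):

import torch
import torch.nn as nn
import torch.nn.functional as F

class SiameseEncoder(nn.Module):
    def __init__(self, v_size=10000, embed_d=300, h_size=128):
        super(SiameseEncoder, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, batch_first=True)

    def encode(self, sent):
        # Encode a batch of index sequences into fixed-length vectors.
        _, (h_last, _) = self.lstm(self.embedding(sent))
        return h_last[-1]                                  # (batch, h_size)

    def forward(self, s1, s2):
        # Cosine similarity in [-1, 1]; train it towards 1 for similar pairs, 0 otherwise.
        return F.cosine_similarity(self.encode(s1), self.encode(s2), dim=1)

model = SiameseEncoder()
s1 = torch.randint(0, 10000, (4, 50))                      # two batches of word indices
s2 = torch.randint(0, 10000, (4, 50))
print(model(s1, s2).shape)                                 # torch.Size([4])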
With an RNN, the assumption is that there are sequential relationships between the inputs. Concatenating two sentences implies to the network that they compose a single sequence, but this isn't the case. To compare two sentences, each should be represented individually.