I have a dataset that looks like this:
The labels correspond to a list of items (let's say, cars in a parking lot), 10 in total, labeled from 0 to 9. There are 14 classes (let's say, 14 different car brands). Each floating-point value is the probability that the item belongs to that class. For example, Item 2 is likely class 2 with a probability of 0.995275:
print(set(list(df['label'])))
> {0, 1, 2, 3, 4, 5, 6, 7, 9}
My goal is to train a classifier that outputs an integer from 0 to 13 (one of the 14 classes) to predict which class label x belongs to.
I am trying to build a feedforward NN with 3 hidden layers (plus input and output layers) that takes 15 inputs and outputs one of the 14 class predictions. This is what I've designed so far:
class NNO(nn.Module):
    def __init__(self):
        super(NNO, self).__init__()
        h = [2, 1]
        self.hidden = nn.Linear(h[0], h[1])
        self.hidden = nn.Linear(2, 20)
        self.hidden = nn.Linear(20, 20)
        self.output = nn.Linear(20, 15)
        self.sigmoid = nn.Sigmoid()
        self.softmax = nn.Softmax(dim=1)

    def forward(self, x):
        x = self.hidden(x)
        x = self.sigmoid(x)
        x = self.output(x)
        x = self.softmax(x)
        return x
My question is this: how do I feed this dataset to my NN to start training? I couldn't find any resource that pertains to a dataset like this.
Here is the answer:
# First I create some dummy data
import numpy as np
import pandas as pd
import torch
import torch.nn as nn

label = np.random.randint(0, 14, 1000)
random = np.random.random((1000, 14))
total = pd.DataFrame(data=random, columns=[f'{i}_col' for i in range(14)])
total['label'] = label
'''
From what I understood, you need the single class with the highest probability as output, so this is a multi-class classification problem. In my dummy case, I will just use the column with the highest value in `random` as the target class.
'''
class TDataset(torch.utils.data.Dataset):
    def __init__(self, df):
        # 14 probability columns + the label column = 15 input features
        self.inputs = df[[f'{i}_col' for i in range(14)] + ['label']].values
        self.outputs = df[[f'{i}_col' for i in range(14)]].values

    def __len__(self):
        return len(self.inputs)

    def __getitem__(self, idx):
        x = torch.tensor(self.inputs[idx], dtype=torch.float)
        y = torch.tensor(np.argmax(self.outputs[idx]))  # target = most probable class
        return x, y
ds = TDataset(total)
dl = torch.utils.data.DataLoader(ds, batch_size=64)
# After doing this I create a model which takes 15 inputs and
# gives 14 outputs in my case, which represent the logits
class NNO(nn.Module):
    def __init__(self):
        super(NNO, self).__init__()
        self.hidden = nn.Linear(15, 20)
        self.relu = nn.ReLU()
        self.output = nn.Linear(20, 14)

    def forward(self, x):
        x = self.hidden(x)
        x = self.relu(x)
        x = self.output(x)
        return x
# Now we create the model object
m = NNO()
sample = None
for i in dl:
    sample = i
    break
print(m(sample[0]).shape)  # shape = [64, 14] as desired.
# Now we define the loss function and then the optimizer
loss_fn = torch.nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(m.parameters())

# Now we define the training loop
for i in range(500):  # for 500 epochs
    epoch_loss = 0
    for idx, data in enumerate(dl):
        inputs = data[0]
        targets = data[1]  # change accordingly for your data
        preds = m(inputs)
        optimizer.zero_grad()
        loss = loss_fn(preds, targets)
        epoch_loss += loss.item()  # .item() so we don't keep the graph around
        loss.backward()
        optimizer.step()
    if i % 50 == 0:
        print('loss: ', epoch_loss / len(dl))
'''
Now at inference time, you just need to apply softmax to the model outputs and select the most probable class.
'''
preds = m(sample[0])
predicted_classes = torch.argmax(torch.nn.functional.softmax(preds, dim=1), dim=1)
# Here the predicted classes are the desired final output.
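One more note: the real labels in the question are per-class probabilities rather than hard classes. If you are on PyTorch 1.10 or newer (an assumption), CrossEntropyLoss can also take those probabilities directly as soft targets; a minimal sketch using the batch from above:

# Soft-label variant: train on the probability columns directly.
# Assumes PyTorch >= 1.10, where CrossEntropyLoss accepts class
# probabilities as targets (they should sum to 1 per row in real data).
soft_loss_fn = torch.nn.CrossEntropyLoss()
inputs = sample[0]                 # [64, 15] batch from the DataLoader above
soft_targets = inputs[:, :14]      # the 14 probability columns
logits = m(inputs)                 # [64, 14] raw scores
loss = soft_loss_fn(logits, soft_targets)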
Related
I built a bidirectional LSTM for sentiment analysis on news headlines, but when training the model the loss doesn't improve; it stays around 0.6 to 0.7.
I'm certainly doing something wrong, and I wonder if it is related to the Embedding layer.
I iteratively pass each batch into the network with a batch size of 10 and a sentence length of 30 words; my vocabulary size is 5745, so the shape of this tensor is (10, 30, 5745) after one-hot encoding.
My Embedding layer has num_embeddings = 5745 and embed_dim = 100, so when I call self.embedding(input), the output has shape (10, 30, 5745, 100).
I wanted an output shape of (10, 30, 100), so I used this line of code:
embeddings = torch.max(embeddings, dim=2)
But I'm not sure it does what I expect for each word/one-hot vector, which is: if I have a one-hot vector representing a word with shape (5745, 1) and an embedding matrix with shape (100, 5745), I get an embedding vector of shape (100, 1), and therefore an output of (10, 30, 100) from the code above?
Maybe I'm not thinking about this correctly and it's affecting my end result.
RNN:
class RNN(nn.Module):
    def __init__(self, vocab_size, embed_dim, hidden_dim, output_dim, n_layers, dropout):
        # calling the init function of the RNN parent
        super(RNN, self).__init__()
        self.embedding = nn.Embedding(vocab_size, embed_dim)
        self.encoder = nn.LSTM(embed_dim,
                               hidden_dim,
                               n_layers,
                               dropout=dropout,
                               bidirectional=True)
        # Linear transformation
        self.decoder = nn.Linear(hidden_dim * 2, output_dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, inputs):
        # (batch_size, timesteps, embed_dim)
        embeddings = self.dropout(self.embedding(inputs))
        embeddings = torch.max(embeddings, dim=2)
        embeddings = embeddings[0].type(torch.cuda.FloatTensor)
        # output of each timestep
        output, (hidden, cell) = self.encoder(embeddings)
        merge = self.dropout(torch.cat((hidden[-2, :, :], hidden[-1, :, :]), dim=1))
        output = self.decoder(merge)
        return output
train:
def train(model, text, label, epochs, lr=0.001):
    model.train()
    opt = torch.optim.Adam(model.parameters(), lr=lr)
    # The BCEWithLogitsLoss criterion carries out both the sigmoid and the binary cross entropy steps
    criterion = nn.BCEWithLogitsLoss()
    counter = 0
    for e in range(epochs):
        for x, y in batches(text, label):
            x = one_hot_encode(x, vocab)
            x = torch.from_numpy(x).to(device)
            output = model(x)
            y = torch.from_numpy(y)
            y = torch.unsqueeze(y, 1).to(device)
            loss = criterion(output, y.float())
            acc = binaryAccuracy(output, y)
            # In order to avoid gradient accumulation before backpropagation
            opt.zero_grad()
            loss.backward()
            opt.step()
            counter += 1
        print("Epoch {}/{}".format(e + 1, epochs),
              "Loss: {}".format(loss.item()),
              "accuracy: {}".format(acc))
one-hot:
def one_hot_encode(arr, n_labels):
    one_hot = np.zeros((np.multiply(*arr.shape), n_labels), dtype=np.int64)
    one_hot[np.arange(one_hot.shape[0]), arr.flatten()] = 1
    one_hot = one_hot.reshape((*arr.shape, n_labels))
    return one_hot
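As an aside, if you are on a recent PyTorch version (an assumption), torch.nn.functional.one_hot can replace the NumPy helper above; a quick sketch with the question's shapes:

import torch
import torch.nn.functional as F

# arr: tensor of word indices, shape (batch, seq_len)
arr = torch.randint(0, 5745, (10, 30))
one_hot = F.one_hot(arr, num_classes=5745)  # shape (10, 30, 5745), dtype long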
batches:
def batches(text, label, num_seqs=10):
    counter = 0
    # create empty arrays with the specified number of columns
    x = np.array([], dtype=int).reshape(0, findMaxLen())
    y = np.array([], dtype=int).reshape(0, 1)
    for sent, l in zip(text, label):
        # create a np array of zeros with length 30
        tmp1 = np.zeros((findMaxLen()), dtype=int)
        # create a 1d array
        tmp2 = np.atleast_1d(np.array(l))
        for ind, wrd in enumerate(sent):
            if wrd in uniqueWrds and ind < 30:
                tmp1[ind] = word_to_index[wrd]
        # append the arrays to the accumulator arrays
        x = np.vstack([x, tmp1])
        y = np.vstack([y, tmp2])
        counter += 1
        if counter == num_seqs:
            yield x, np.squeeze(y, 1)
            counter = 0
            x = np.array([], dtype=int).reshape(0, findMaxLen())
            y = np.array([], dtype=int).reshape(0, 1)
If you want your output to have dimension (10, 30, 100), you need to feed the indexes of the words of the sentences as an array of dimension (10, 30), not one-hot vectors. You can get the indexes by applying torch.argmax to the one-hot encoded input tensor, as sketched below.
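A minimal sketch of that suggestion, with the shapes from the question (batch of 10 sentences, 30 words each, vocabulary of 5745, embedding dim 100):

import torch
import torch.nn as nn

one_hot = torch.nn.functional.one_hot(
    torch.randint(0, 5745, (10, 30)), num_classes=5745)  # (10, 30, 5745)

# Recover the word indices from the one-hot vectors ...
indices = torch.argmax(one_hot, dim=2)  # (10, 30)

# ... and feed the indices straight to the embedding layer,
# which looks up one 100-dim vector per word.
embedding = nn.Embedding(num_embeddings=5745, embedding_dim=100)
embeddings = embedding(indices)  # (10, 30, 100)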
I am trying to predict the center of my palm.
The structure of my neural network consists of two convolutional layers, each followed by max pooling, and a linear layer with 2 outputs, one for x and one for y. The input is a 720x720 image.
class MyNeuralNetwork(torch.nn.Module):
    def __init__(self):
        super(MyNeuralNetwork, self).__init__()
        self.conv1 = torch.nn.Conv2d(4, 5, 5)
        self.conv2 = torch.nn.Conv2d(5, 5, 5)
        self.pool = torch.nn.MaxPool2d(3, 3)
        self.linear = torch.nn.Linear(5 * 78 * 78, 2)

    def forward(self, x):
        x = self.conv1(x)
        x = self.pool(x)
        x = self.conv2(x)
        x = self.pool(x)
        x = x.view(x.size(0), -1)
        x = self.linear(x)
        return x
I have the pathnames of the images saved in one csv file and the x and y coordinates in another. Here is the code for my Dataset:
class MyHand(Dataset):
    """Creating the proper dataset to feed my neural network"""
    def __init__(self, name_path, root_dir, results_path, transform=None):
        self.names = pd.read_csv(name_path)
        self.rootdir = root_dir
        self.transform = transform
        self.results = pd.read_csv(results_path)

    def __len__(self):
        return len(self.names.columns)

    def __getitem__(self, index):
        img_path = os.path.join(self.rootdir, self.names.columns[index])
        image = pl.imread(img_path)
        x_top_left_corner = torch.tensor(self.results.iloc[index, 0])
        y_top_left_corner = torch.tensor(self.results.iloc[index, 1])
        width = torch.tensor(self.results.iloc[index, 2])
        height = torch.tensor(self.results.iloc[index, 3])
        # calculating the x and y center of my palm
        x_center = x_top_left_corner + width / 2
        y_center = y_top_left_corner - height / 2
        if self.transform:
            image = self.transform(image)
        return image, x_center, y_center
and the code for training the network is:
dataset = MyHand(name_path='path to the names of the images csv',
                 results_path='path to the results csv',
                 transform=torchvision.transforms.ToTensor())
loader = DataLoader(dataset=dataset, batch_size=4)
model = MyNeuralNetwork()
criterion = torch.nn.MSELoss()
EPOCHS = 5
LEARNING_RATE = 0.001
optimizer = optim.SGD(model.parameters(), LEARNING_RATE)

for epoch in range(EPOCHS):
    print("epoch:", epoch)
    for data in dataset:
        pic, x, y = data
        model.zero_grad()
        output = model(pic[None, :, :, :])
        loss1 = criterion(output[0, 0], x)
        loss2 = criterion(output[0, 1], y)
        loss = loss1 + loss2
        loss.backward()
        print(loss)
but my loss takes exactly the same values at each epoch and doesn't decrease at all. What can I do about that? I tried different learning rates, but it's still the same.
Your loss values are extremely high. I would propose that you normalize your outputs with a sigmoid activation, so the predicted coordinates are in the range 0-1; they can later be translated back to the image by multiplying by 720. To calculate the loss, divide your target coordinates by 720 as well. Then you should get a nice, stable loss in the range 0-1 (see the sketch after this list).
Also:
either decay your learning rate or try a smaller one
scale your image down (I don't know what the images look like, but 720x720 is quite big)
use three convolutions with smaller kernels
add a second linear layer
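A rough sketch of the normalization idea, reusing MyNeuralNetwork and the MyHand dataset from the question (the factor 720 assumes the stated image size):

import torch

model = MyNeuralNetwork()
pic, x, y = dataset[0]  # one sample from the question's MyHand dataset

out = torch.sigmoid(model(pic[None]))  # predictions squashed into [0, 1]
target = torch.stack([x, y]).float() / 720.0  # targets scaled into the same range
loss = torch.nn.functional.mse_loss(out[0], target)

# At inference time, map predictions back to pixel coordinates:
pred_pixels = out[0] * 720.0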
I am trying to solve a very simple non-linear problem: the XOR gate.
From my school knowledge, XOR can be solved using 2 input nodes and 2 hidden nodes, with 1 output. It is a binary classification problem.
I generate 1000 random binary (0 or 1) samples and then do backpropagation. But for some unknown reason my network has not learned anything: the training accuracy is constant at 50%.
# coding: utf-8
import matplotlib
import torch
import torch.nn as nn
from torch.autograd import Variable
matplotlib.use('TkAgg')  # My buggy OSX 10.13.6 requires this
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from tqdm import tqdm
import random

N = 1000
batch_size = 10
epochs = 40
hidden_size = 2
output_size = 1
lr = 0.1

def return_xor(N):
    tmp_x = []
    tmp_y = []
    for i in range(N):
        a = (random.randint(0, 1) == 1)
        b = (random.randint(0, 1) == 1)
        if (a and not b) or (not a and b):
            q = True
        else:
            q = False
        input_features = (a, b)
        output_class = q
        tmp_x.append(input_features)
        tmp_y.append(output_class)
    return tmp_x, tmp_y

# Training set
x, y = return_xor(N)
x = torch.tensor(x, dtype=torch.float, requires_grad=True)
y = torch.tensor(y, dtype=torch.float, requires_grad=True)
# Test dataset
x_test, y_test = return_xor(100)
x_test = torch.tensor(x_test)
y_test = torch.tensor(y_test)

class MyDataset(Dataset):
    """Define my own `Dataset` in order to use `Variable` with `autograd`"""
    def __init__(self, x, y):
        self.x = x
        self.y = y

    def __getitem__(self, index):
        return self.x[index], self.y[index]

    def __len__(self):
        return len(self.x)

dataset = MyDataset(x, y)
test_dataset = MyDataset(x_test, y_test)
print(dataset.x.shape)
print(dataset.y.shape)

# Make data iterable by loading it into a loader; shuffle and batch_size kwargs are spelled out as a reminder to myself
train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
print(f"There are {len(train_loader)} batches in the dataset")
shown = 0
for (x, y) in train_loader:
    if shown == 1:
        break
    print(f"{x.shape} {x.dtype}")
    print(f"{y.shape} {y.dtype}")
    shown += 1
class MyModel(nn.Module):
    """
    Binary classification
    2 input nodes
    2 hidden nodes
    1 output node
    """
    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.fc1 = torch.nn.Linear(input_size, hidden_size)
        self.fc2 = torch.nn.Linear(hidden_size, output_size)
        self.sigmoid = torch.nn.Sigmoid()

    def forward(self, out):
        out = self.fc1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

# Create my network
net = MyModel(dataset.x.shape[1], hidden_size, output_size)
CUDA = torch.cuda.is_available()
if CUDA:
    net = net.cuda()
criterion = torch.nn.BCELoss(reduction='elementwise_mean')
optimizer = torch.optim.SGD(net.parameters(), lr=lr)

# Train the network
correct_train = 0
total_train = 0
for epoch in range(epochs):
    for i, (batches, labels) in enumerate(train_loader):
        batcesh = Variable(batches.float())
        labels = Variable(labels.float())
        output = net(batches)  # Forward pass
        optimizer.zero_grad()
        loss = criterion(output, labels.view(10, 1))
        loss.backward()
        optimizer.step()
        total_train += labels.size(0)
        correct_train += (predicted == labels.long()).sum()
        if (i + 1) % 10 == 0:
            print(f"""
            Epoch {epoch+1}/{epochs},
            Iteration {i+1}/{len(dataset)//batch_size},
            Training Loss: {loss.item()},
            Training Accuracy: {100*correct_train/total_train}
            """)
Solution:
I initialized the weights and used an adaptive learning rate:
https://github.com/elcolie/nnbootcamp/blob/master/Study-XOR.ipynb
I am not sure what results you are getting, as the code you posted in the question doesn't work (with pytorch 0.4.1 it gives errors like `predicted` not being defined). But syntax issues apart, there are other problems.
Your model is not actually two-layer, since it does not apply a non-linearity after the first layer. Effectively it is a one-layer network; to fix that, you can modify your model's forward as follows:
def forward(self, out):
    out = torch.nn.functional.relu(self.fc1(out))
    out = self.fc2(out)
    out = self.sigmoid(out)
    return out
You can try a sigmoid or tanh non-linearity as well, but the non-linearity is a must. This should fix the problem.
I also see that you are using only 2 hidden units. This might be restrictive, and you might want to increase that to something like 5 or 10. A minimal working sketch follows.
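For reference, here is a minimal self-contained sketch of the fixed setup (the XorModel name is hypothetical; same architecture with the ReLU added and a few more hidden units, and exact convergence still depends on initialization):

import torch
import torch.nn as nn

class XorModel(nn.Module):
    def __init__(self, input_size=2, hidden_size=5, output_size=1):
        super().__init__()
        self.fc1 = nn.Linear(input_size, hidden_size)
        self.fc2 = nn.Linear(hidden_size, output_size)

    def forward(self, x):
        x = torch.relu(self.fc1(x))  # the missing non-linearity
        return torch.sigmoid(self.fc2(x))

net = XorModel()
x = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = torch.tensor([[0.], [1.], [1.], [0.]])
optimizer = torch.optim.Adam(net.parameters(), lr=0.05)
criterion = nn.BCELoss()

for _ in range(2000):
    optimizer.zero_grad()
    loss = criterion(net(x), y)
    loss.backward()
    optimizer.step()

print(net(x).detach().round())  # should approach [[0.], [1.], [1.], [0.]]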
I am building a sequence-to-label model in PyTorch. I have two sentences and I am classifying whether they are entailed or not (SNLI dataset). I concatenate two 50-word sentences (sometimes padded) into a vector of length 100. I then send minibatches through word embeddings -> LSTM -> linear layer. I am using cross-entropy loss, but I need a tensor of shape [mini_batch, C] to go into the CrossEntropyLoss function; instead I still have the 100 words in my tensor, as [mini_batch, 100, C].
Here is my model:
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(mlp_d, 1024)
        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

    def display(self):
        for param in self.parameters():
            print(param.data.size())

    def filter_params(self):
        # Might not be compatible with python 3
        pass

    def init_hidden(self):
        # Need to init hidden weights in LSTM
        pass

    def forward(self, sentence):
        print(sentence.size())
        embeds = self.embedding(sentence)
        print(embeds.size())
        out, _ = self.lstm(embeds)
        print(out.size())
        out = self.mlp(out)
        return out
My training sequences with output:
from torch import optim
from torch.autograd import Variable

batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.CrossEntropyLoss()
num_epochs = 50

for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])
        # Convert numpy to Tensor; attach the two sentences into one input vector
        res = np.concatenate((s1, s2), axis=1)
        input_tensor = torch.from_numpy(res).type(torch.LongTensor)
        target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
        data, target = Variable(input_tensor), Variable(target_tensor)
        # Zero gradients
        SGD_optimizer.zero_grad()
        # Forward pass
        output = model.forward(data)
        print("Output size: ")
        print(output.size())
        print("Target size: ")
        print(target.size())
        # Calculate loss with respect to training labels
        loss = criterion(output, target)
        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()
output:
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
error:
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED -------------------------------------------------------------------
I have now got my model training, but I am getting low accuracy. Is there an issue with the way my LSTM hidden states are concatenated and then condensed to a smaller tensor before going through the linear layer?
New Model:
class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
        super(myLSTM, self).__init__()
        self.num_layers = lstm_layers
        self.hidden_size = h_size
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2 * h_size * 2, num_classes)
        # Set static embedding vectors
        self.embedding.weight.requires_grad = False

    def forward(self, s1, s2):
        batch_size = s1.size()[0]
        embeds_1 = self.embedding(s1)
        embeds_2 = self.embedding(s2)
        _, (h_1_last, _) = self.lstm(embeds_1)  # note the change here: the last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)
        concat = torch.cat((h_1_last, h_2_last), dim=2)  # double check the dimension
        concat = concat.view(batch_size, -1)
        scores = self.mlp(concat)
        return scores
New Training
from torch import optim
from torch.autograd import Variable

batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10

model.train()
if cuda:
    model = model.cuda()
    criterion = criterion.cuda()

epoch_losses = []
for epoch in range(num_epochs):
    print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))
    # Batch loss aggregator
    losses = []
    for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
        # Convert minibatch to numpy
        s1, s2, y = convert_to_numpy(n_data[start:end])
        # Convert numpy to Tensors
        s1 = Variable(torch.from_numpy(s1).type(torch.LongTensor))
        s2 = Variable(torch.from_numpy(s2).type(torch.LongTensor))
        target = Variable(torch.from_numpy(y).type(torch.LongTensor))
        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()
        # Zero gradients
        SGD_optimizer.zero_grad()
        # Forward pass
        output = model.forward(s1, s2)
        # Calculate loss with respect to training labels
        loss = criterion(output, target)
        losses.append(loss.data[0])
        # Backpropagate and update optimizer
        loss.backward()
        SGD_optimizer.step()
    # append batch losses to epoch losses
    epoch_losses += losses
training with tensor sizes printed:
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propogation
torch.Size([64, 3])
Evaluation
def eval_model(model, mode='dev'):
    file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
    dev_data, _ = obtain_data(file_name)
    dev_n_data = vocab.process_data(dev_data)
    print("Length of data: {}".format(len(dev_n_data)))
    eval_batch_size = 1024
    model.eval()
    total = len(dev_n_data)
    correct = 0
    # Batch dev eval
    for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
        s1, s2, y = convert_to_numpy(dev_n_data[start:end])
        s1 = Variable(torch.from_numpy(s1).type(torch.LongTensor), volatile=True)
        s2 = Variable(torch.from_numpy(s2).type(torch.LongTensor), volatile=True)
        target = Variable(torch.from_numpy(y).type(torch.LongTensor), volatile=True)
        if cuda:
            s1 = s1.cuda()
            s2 = s2.cuda()
            target = target.cuda()
        output = model.forward(s1, s2)
        loss = criterion(output, target)
        pred = output.data.max(1)[1]  # get the index of the max log-probability
        correct += pred.eq(target.data).cpu().sum()
    return correct / float(total)
eval_model(model)
I think there is an issue with the way you are trying to solve the entailment problem.
Maybe you can do it this way:
design your module to accept two sentences as input
embed both of them with your embeddings
encode them using the LSTM module
now you have two fixed-length vector representations of the two sentences; the simplest thing to do is to just concatenate them together
add a linear layer on top to produce scores for each entailment class (3, I suppose)
apply softmax to get a proper probability distribution
So your model can look like this (double check the dimensions):
import torch.nn.functional as F

class myLSTM(nn.Module):
    def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes=3):
        super(myLSTM, self).__init__()
        self.embedding = nn.Embedding(v_size, embed_d)
        self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
        self.mlp = nn.Linear(2 * h_size * 2, num_classes)  # <- change here

    def forward(self, sentence1, sentence2):
        embeds_1 = self.embedding(sentence1)
        embeds_2 = self.embedding(sentence2)
        _, (h_1_last, _) = self.lstm(embeds_1)  # note the change here: the last hidden state is taken
        _, (h_2_last, _) = self.lstm(embeds_2)
        concat = torch.cat([h_1_last, h_2_last], dim=1)  # double check the dimension
        scores = self.mlp(concat)
        probas = F.softmax(scores, dim=1)
        return probas
Then you can play around with adding more hidden layers or thinking about how combining the two sentences can be done in a more intelligent way (attention, etc.).
Double check what CrossEntropyLoss accepts as input and target and adjust accordingly (it takes unnormalized class scores, not a probability distribution); a quick shape check is sketched below. Check http://pytorch.org/docs/master/nn.html#lstm for the LSTM module documentation to clarify what LSTM returns (do you need the hidden states for every word, or just the representation after the last one?).
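For reference, the shape check (CrossEntropyLoss takes raw, unnormalized scores plus integer class labels, so no softmax before the loss):

import torch
import torch.nn as nn

criterion = nn.CrossEntropyLoss()
logits = torch.randn(64, 3)          # [batch, num_classes], e.g. 3 entailment classes
target = torch.randint(0, 3, (64,))  # [batch] integer labels in [0, 3)
loss = criterion(logits, target)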
I'm trying a basic averaging example, but the validation and loss don't match, and the network fails to converge if I increase the training time. I'm training a network with 2 hidden layers, each 500 units wide, on three integers from the range [0, 9], with a learning rate of 1e-1, Adam, a batch size of 1, and dropout, for 3000 iterations, validating every 100 iterations. If the absolute difference between the label and the hypothesis is less than a threshold (here set to 1), I consider the prediction correct. Could someone let me know if this is an issue with the choice of loss function, something wrong with PyTorch, or something I'm doing wrong? Below are some plots:
val_diff = 1
acc_diff = torch.FloatTensor([val_diff]).expand(self.batch_size)
Loop 100 times during validation:
num_correct += torch.sum(torch.abs(val_h - val_y) < acc_diff)
Append after each validation phase:
validate.append(num_correct / total_val)
Here are some examples of the (hypothesis, and labels):
[...(-0.7043088674545288, 6.0), (-0.15691305696964264, 2.6666667461395264),
(0.2827358841896057, 3.3333332538604736)]
I tried six of the loss functions in the API that are typically used for regression:
torch.nn.L1Loss(size_average=False)
torch.nn.L1Loss()
torch.nn.MSELoss(size_average=False)
torch.nn.MSELoss()
torch.nn.SmoothL1Loss(size_average=False)
torch.nn.SmoothL1Loss()
Thanks.
Network code:
class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        self.input_dim = topology['features']
        self.num_hidden = topology['hidden_layers']
        self.hidden_dim = topology['hidden_dim']
        self.output_dim = topology['output_dim']
        self.input_layer = nn.Linear(self.input_dim, self.hidden_dim)
        self.hidden_layer = nn.Linear(self.hidden_dim, self.hidden_dim)
        self.output_layer = nn.Linear(self.hidden_dim, self.output_dim)
        self.dropout_layer = nn.Dropout(p=0.2)

    def forward(self, x):
        batch_size = x.size()[0]
        feat_size = x.size()[1]
        input_size = batch_size * feat_size
        self.input_layer = nn.Linear(input_size, self.hidden_dim).cuda()
        hidden = self.input_layer(x.view(1, input_size)).clamp(min=0)
        for _ in range(self.num_hidden):
            hidden = self.dropout_layer(F.relu(self.hidden_layer(hidden)))
        output_size = batch_size * self.output_dim
        self.output_layer = nn.Linear(self.hidden_dim, output_size).cuda()
        return self.output_layer(hidden).view(output_size)
Training code:
def train(self):
    if self.cuda:
        self.network.cuda()
    dh = DataHandler(self.data)
    # loss_fn = nn.L1Loss(size_average=False)
    # loss_fn = nn.L1Loss()
    # loss_fn = nn.SmoothL1Loss(size_average=False)
    # loss_fn = nn.SmoothL1Loss()
    # loss_fn = nn.MSELoss(size_average=False)
    loss_fn = torch.nn.MSELoss()
    losses = []
    validate = []
    hypos = []
    labels = []
    val_size = 100
    val_diff = 1
    total_val = float(val_size * self.batch_size)

    for i in range(self.iterations):
        x, y = dh.get_batch(self.batch_size)
        x = self.tensor_to_Variable(x)
        y = self.tensor_to_Variable(y)
        self.optimizer.zero_grad()
        loss = loss_fn(self.network(x), y)
        loss.backward()
        self.optimizer.step()
It looks like you've misunderstood how layers in pytorch work. Here are a few tips:
In your forward, when you do nn.Linear(...) you are defining new layers instead of using the ones you pre-defined in your network's __init__. Therefore, the network cannot learn anything, as its weights are constantly reinitialized.
You shouldn't need to call .cuda() inside net.forward(...), since you've already copied the network to the GPU in your train method by calling self.network.cuda().
Ideally, the input to net.forward(...) should directly have the shape expected by the first layer, so you won't have to modify it. Here x should be of shape (batch_size, features) to match the first Linear layer.
Your forward should look close to this:
def forward(self, x):
    x = F.relu(self.input_layer(x))
    x = F.dropout(F.relu(self.hidden_layer(x)), training=self.training)
    x = self.output_layer(x)
    return x
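Putting those tips together, the whole module might look roughly like this (a sketch; it keeps the topology dict from the question and assumes import torch.nn as nn and import torch.nn.functional as F):

class Feedforward(nn.Module):
    def __init__(self, topology):
        super(Feedforward, self).__init__()
        self.num_hidden = topology['hidden_layers']
        # Layers are defined once here, so their weights persist across
        # forward passes and can actually be trained.
        self.input_layer = nn.Linear(topology['features'], topology['hidden_dim'])
        self.hidden_layer = nn.Linear(topology['hidden_dim'], topology['hidden_dim'])
        self.output_layer = nn.Linear(topology['hidden_dim'], topology['output_dim'])

    def forward(self, x):
        # x is expected as (batch_size, features), matching input_layer
        x = F.relu(self.input_layer(x))
        for _ in range(self.num_hidden):
            x = F.dropout(F.relu(self.hidden_layer(x)), training=self.training)
        return self.output_layer(x)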