Related
I'm using PyTorch with a training set of movie reviews each labeled positive or negative. Every review is truncated or padded to be 60 words and I have a batch size of 32. This 60x32 Tensor is fed to an embedding layer with an embedding dim of 100 resulting in a 60x32x100 Tensor. Then I use the unpadded lengths of each review to pack the embedding output, and feed that to a BiLSTM layer with hidden dim = 256.
I then pad it back, apply a transformation (to try to get the last hidden state for the forward and backward directions) and feed the transformation to a Linear layer which is 512x1. Here is my module, I pass the final output through a sigmoid not shown here
class RNN(nn.Module):
def __init__(self, vocab_size, embedding_dim, hidden_dim, output_dim, n_layers,
bidirectional, dropout, pad_idx):
super().__init__()
self.el = nn.Embedding(vocab_size, embedding_dim)
print('vocab size is ', vocab_size)
print('embedding dim is ', embedding_dim)
self.hidden_dim = hidden_dim
self.num_layers = n_layers # 2
self.lstm = nn.LSTM(input_size=embedding_dim, hidden_size=hidden_dim, num_layers=n_layers, dropout=dropout, bidirectional=bidirectional)
# Have an output layer for outputting a single output value
self.linear = nn.Linear(2*hidden_dim, output_dim)
def init_hidden(self):
return (torch.zeros(self.n_layers*2, 32, self.hidden_dim).to(device),
torch.zeros(self.n_layers*2, 32, self.hidden_dim).to(device))
def forward(self, text, text_lengths):
print('input text size ', text.size())
embedded = self.el(text)
print('embedded size ', embedded.size())
packed_seq = torch.nn.utils.rnn.pack_padded_sequence(embedded, lengths=text_lengths, enforce_sorted=False)
packed_out, (ht, ct) = self.lstm(packed_seq, None)
out_rnn, out_lengths = torch.nn.utils.rnn.pad_packed_sequence(packed_out)
print('padded lstm out ', out_rnn.size())
#out_rnn = out_rnn[-1] #this works
#out_rnn = torch.cat((out_rnn[-1, :, :self.hidden_dim], out_rnn[0, :, self.hidden_dim:]), dim=1) # this works
out_rnn = torch.cat((ht[-1], ht[0]), dim=1) #this works
#out_rnn = out_rnn[:, -1, :] #doesn't work maybe should
print('attempt to get last hidden ', out_rnn.size())
linear_out = self.linear(out_rnn)
print('after linear ', linear_out.size())
return linear_out
I've tried 3 different transformations to get the dimensions correct for the linear layer
out_rnn = out_rnn[-1] #this works
out_rnn = torch.cat((out_rnn[-1, :, :self.hidden_dim], out_rnn[0, :, self.hidden_dim:]), dim=1) # this works
out_rnn = torch.cat((ht[-1], ht[0]), dim=1) #this works
These all produce an output like this
input text size torch.Size([60, 32])
embedded size torch.Size([60,32, 100])
padded lstm out torch.Size([36, 32, 512])
attempt to get last hidden torch.Size([32, 512])
after linear torch.Size([32, 1])
I would expect the padded lstm out to be [60, 32, 512] but it is always less than 60 in the first dimension.
I'm training for 10 epochs with optim.SGD and nn.BCEWithLogitsLoss(). My training accuracy is always around 52% and test accuracy is always at like 50%, so the model is doing no better than randomly guessing. I'm sure that my data is being handled correctly in my tochtext.data.Dataset. Am I forwarding my tensors along incorrectly?
I have tried using batch_first=True in my lstm, packed_seq function, and pad_packed_seq function and that breaks my transformations before feeding to the linear layer.
Update
I added the init_hidden method and have tried without the pack/pad sequence methods and still get the same results
I changed my optimizer from SGD to Adam and changed layers from 2 to 1 and my model started to learn, getting accuracies > 75%
I'm trying to create a 2 layer lstm (incl. dropout) but get an error message that 'inputs must be a sequence'.
I use embeddings as the input and not sure how to change these to be a sequence? Any explanations are greatly appreciated.
This is my graph definition:
with tf.name_scope('Placeholders'):
input_x = tf.placeholder(tf.int32, [None, n_steps], name='input_x')
input_y = tf.placeholder(tf.float32, [None, n_classes], name='input_y')
dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
with tf.name_scope('Embedding_layer'):
embeddings_var = tf.Variable(tf.random_uniform([vocab_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
embedded_chars = tf.nn.embedding_lookup(embeddings_var, input_x)
print(embedded_chars, 'embed')
def get_a_cell(lstm_size, keep_prob):
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
drop = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=dropout_keep_prob)
return drop
with tf.name_scope('lstm'):
cell = tf.nn.rnn_cell.MultiRNNCell(
[get_a_cell(num_hidden, dropout_keep_prob) for _ in range(num_layers)]
)
lstm_outputs, state = tf.nn.static_rnn(cell=cell,inputs=embedded_chars, dtype=tf.float32)
with tf.name_scope('Fully_connected'):
W = tf.Variable(tf.truncated_normal([num_hidden, n_classes], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=n_classes))
output = tf.nn.xw_plus_b(lstm_outputs,W,b)
predictions = tf.argmax(output, 1, name='predictions')
with tf.name_scope('Loss'):
# Cross-entropy loss and optimizer initialization
loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=input_y))
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss1, global_step=global_step)
with tf.name_scope('Accuracy'):
# Accuracy metrics
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.nn.softmax(output)), input_y), tf.float32))
with tf.name_scope('num_correct'):
correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
num_correct = tf.reduce_sum(tf.cast(correct_predictions, 'float'), name='num_correct')
EDIT:
when changing static_rnn to dynamic_rnn the error message changes to the following, failing on the bias (b) variable:
TypeError: 'int' object is not iterable
After I changed the bias term to this:
b = tf.Variable(tf.random_normal([n_classes]))
and get a new error message:
ValueError: Shape must be rank 2 but is rank 3 for 'Fully_connected/xw_plus_b/MatMul' (op: 'MatMul') with input shapes: [?,27,128], [128,6].
If we assume you use tf.dynamic_rnn (for the case of tf.static_rnn, the first problem is because you don't give the input in the right format, tf.static_rnn except a sequence of tensor such as list of tensors [batch_size x seq_len] and not a single tensor with shape [batch_size x seq_len x dim] whereas tf.dynamic_rnn deals with such tensors as input)
I invite you to read the documentation of tf.nn_dynamic_rnn to see that for your classification problem you might not want to use lstm_outputs but state which basically contain the last output of your RNN, because lstm_output contains all the outputs , whereas here you are interested on only in the last_output (except if you want to do something like attention for classification , here you'll need all the outputs).
To get the last output you'll basically need to do that:
lstm_outputs, state = tf.nn.dynamic_rnn(cell=cell,inputs=embedded_chars, dtype=tf.float32)
last_output = state[-1].h
state[-1] to take the state of the last cell, then h contains the last output and pass last_output to your feed forward network.
Full code
(working, but compute wrong accuracy see comments)
n_classes = 6
n_steps = 27
num_hidden=128
dropout_keep_prob =0.5
vocab_size=10000
EMBEDDING_DIM=300
num_layers = 2
with tf.name_scope('Placeholders'):
input_x = tf.placeholder(tf.int32, [None, n_steps], name='input_x')
input_y = tf.placeholder(tf.float32, [None, n_classes], name='input_y')
dropout_keep_prob = tf.placeholder(tf.float32, name='dropout_keep_prob')
with tf.name_scope('Embedding_layer'):
embeddings_var = tf.Variable(tf.random_uniform([vocab_size, EMBEDDING_DIM], -1.0, 1.0), trainable=True)
embedded_chars = tf.nn.embedding_lookup(embeddings_var, input_x)
print(embedded_chars, 'embed')
def get_a_cell(lstm_size, keep_prob):
lstm = tf.nn.rnn_cell.BasicLSTMCell(lstm_size)
drop = tf.nn.rnn_cell.DropoutWrapper(lstm, output_keep_prob=dropout_keep_prob)
return drop
with tf.name_scope('lstm'):
cell = tf.nn.rnn_cell.MultiRNNCell(
[get_a_cell(num_hidden, dropout_keep_prob) for _ in range(num_layers)]
)
lstm_outputs, state = tf.nn.dynamic_rnn(cell=cell,inputs=embedded_chars, dtype=tf.float32)
last_output = state[-1].h
with tf.name_scope('Fully_connected'):
W = tf.Variable(tf.truncated_normal([num_hidden, n_classes], stddev=0.1))
b = tf.Variable(tf.constant(0.1, shape=[n_classes]))
output = tf.nn.xw_plus_b(last_output,W,b)
predictions = tf.argmax(output, 1, name='predictions')
with tf.name_scope('Loss'):
# Cross-entropy loss and optimizer initialization
loss1 = tf.reduce_mean(tf.nn.softmax_cross_entropy_with_logits_v2(logits=output, labels=input_y))
global_step = tf.Variable(0, name="global_step", trainable=False)
optimizer = tf.train.AdamOptimizer(learning_rate=1e-3).minimize(loss1, global_step=global_step)
with tf.name_scope('Accuracy'):
# Accuracy metrics
accuracy = tf.reduce_mean(tf.cast(tf.equal(tf.round(tf.nn.softmax(output)), input_y), tf.float32))
with tf.name_scope('num_correct'):
correct_predictions = tf.equal(predictions, tf.argmax(input_y, 1))
num_correct = tf.reduce_sum(tf.cast(correct_predictions, 'float'), name='num_correct')
I am trying to implement a Fully Convolutional network (5 layers) in TensorFlow.
But after few time of training, all my logits fall to 0.
Did anyone have the same problem before ?
Here is how I implemented my CONV-ReLU-maxPOOL layer :
def conv_relu_layer (in_data, nb_filters, filter_shape) :
nb_in_channels = int (in_data_reshaped.shape[3])
conv_shape = [filter_shape[0], filter_shape[1],
nb_in_channels, nb_filters]
weights = tf.Variable (
tf.truncated_normal (conv_shape, mean=0., stddev=.05))
bias = tf.Variable (
tf.truncated_normal ([nb_filters], mean=0., stddev=1.))
output = tf.nn.conv2d (in_data_reshaped, weights,
[1,1,1,1], padding="SAME")
output += bias
output = tf.nn.relu (output)
return output
def conv_relu_pool_layer (in_data, nb_filters, filter_shape, pool_shape,
pooling=tf.nn.max_pool) :
conv_out = conv_relu_layer (in_data, nb_filters, filter_shape)
ksize = [1, pool_shape[0], pool_shape[1], 1]
strides = [1, pool_shape[0], pool_shape[1], 1]
return pooling (conv_out, ksize=ksize, strides=strides, padding="SAME")
Here is my network :
def create_network_5C (in_data, name="5C") :
c1 = conv_relu_pool_layer (in_data, 64, [5,5], [2,2])
c2 = conv_relu_pool_layer (c1, 128, [5,5], [2,2])
c3 = conv_relu_pool_layer (c2, 256, [5,5], [2,2])
c4 = conv_relu_pool_layer (c3, 64, [5,5], [2,2])
return conv_relu_layer (c4, 2, [5,5])
The loss function :
def loss (logits, labels, num_classes) :
with tf.name_scope('loss'):
logits = tf.reshape(logits, (-1, num_classes))
epsilon = tf.constant(value=1e-4)
labels = tf.to_float(tf.reshape(labels, (-1, num_classes)))
softmax = tf.nn.softmax(logits) + epsilon
cross_entropy = - tf.reduce_sum (
tf.multiply (labels * tf.log (softmax), head),
reduction_indices=[1])
cross_entropy_mean = tf.reduce_mean (cross_entropy)
tf.add_to_collection('losses', cross_entropy_mean)
loss = tf.add_n(tf.get_collection('losses'))
return loss
My main routine :
batch_size = 5
# Load data
x = tf.placeholder (tf.float32, [None, 416, 416, 3], name="x")
y = tf.placeholder (tf.float32, [None, 416, 416, 1], name="y")
# Contrast normalization and computation
x_gcn = tf.map_fn (lambda img : tf.image.per_image_standardization (img), x)
logits = create_network_5C (x_gcn)
# Having label at the same dimension as the output
y_p = tf.nn.avg_pool (tf.sign (y),
ksize=[1,16,16,1], strides=[1,16,16,1], padding="SAME")
y_rshp = tf.reshape (y_p, [batch_size, 416//16, 416//16])
y_bin = tf.cast (y_rshp > .5, tf.int32)
y_1hot = tf.one_hot (y_bin, 2)
# Compute error
error = loss (logits, y_1hot, 2)
optimizer = tf.train.AdamOptimizer (learning_rate=args.eta).minimize (error)
# Run the session
init_op = tf.global_variables_initializer ()
with tf.Session () as session :
session.run (init_op)
err, _ = session.run ([error, optimizer],
feed_dict={ x: image_batch,
y: label_batch })
I note that, if I reduce my network to 2 layers only, it won't drop the logits to 0, but it won't learn anything either. If I reduce it to 3 layers, it will drop to 0, but after a many iterations (while 5 layers drop to 0 in few batches).
Can this be linked to what is called "gradient vanish" ?
If it's relevant, my spec are : Ubuntu 16.04 - Python 3.6.4 - tensorflow 1.6.0
[EDIT] My problem really look like dead-ReLU, as mentioned here : StackOverflow : FCN training error, but my data is normalized (between something like -2 and +2, and I already tried to change the mean and stddev initial value of my weights and biases
[EDIT 2] I tried to replace the ReLUs with Leaky ReLU, or a softplus, in both cases, logits get stucked under 0.1 and loss stay between 0.6 and 0.7
Using some leaky relu was actually enough, then I just needed to let him train for a hudge amount of time.
I am doing a sequence to label learning model in PyTorch. I have two sentences and I am classifying whether they are entailed or not (SNLI dataset). I concatenate two 50 word sentences together (sometimes padded) into a vector of length 100. I then send in minibatches into word embeddings -> LSTM -> Linear layer. I am doing cross entropy loss but I need a vector of [mini_batch, C] to go into the CrossEntropyLoss function. Instead I still have the 100 words in my vector as [mini_batch, 100, C]
Here is my model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(mlp_d, 1024)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
#self.sm = nn.CrossEntropyLoss()
def display(self):
for param in self.parameters():
print(param.data.size())
def filter_params(self):
# Might not be compatible with python 3
#self.parameters = filter(lambda p: p.requires_grad, self.parameters())
pass
def init_hidden(self):
# Need to init hidden weights in LSTM
pass
def forward(self, sentence):
print(sentence.size())
embeds = self.embedding(sentence)
print(embeds.size())
out, _ = self.lstm(embeds)
print(out.size())
out = self.mlp(out)
return out
My training sequences with output:
batch_size = 3
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01, weight_decay=1e-4)
ADM_optimizer = optim.Adam(filter(lambda p: p.requires_grad, model.parameters()), lr=0.01)
criterion = nn.CrossEntropyLoss()
num_epochs = 50
from torch.autograd import Variable
from torch import optim
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, float(epoch)/num_epochs))
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
res = np.concatenate((s1,s2), axis=1) # Attach two sentences into 1 input vector
input_tensor = torch.from_numpy(res).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.FloatTensor)
data, target = Variable(input_tensor), Variable(target_tensor)
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(data)
print("Output size: ")
print(output.size())
print("Target size: ")
print(target.size())
# Calculate loss with respect to training labels
loss = criterion(output, target)
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
#ADAM_optimizer.step()
output:
Epoch 0/50: 0.0%
torch.Size([3, 100])
torch.Size([3, 100, 300])
torch.Size([3, 100, 256])
Output size:
torch.Size([3, 100, 1024])
Target size:
torch.Size([3])
error:
ValueError: Expected 2 or 4 dimensions (got 3)
EDITED -------------------------------------------------------------------
I have now got my model training but I am getting low accuracy. Is there an issue with my LSTM outputs being concatenated and then condensed to a smaller tensor to go through my linear layer?
New Model:
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, mlp_d=256, num_classes=3, lstm_layers=1):
super(myLSTM, self).__init__()
self.num_layers = lstm_layers
self.hidden_size = h_size
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=lstm_layers, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2 * h_size * 2, num_classes)
# Set static embedding vectors
self.embedding.weight.requires_grad = False
def forward(self, s1, s2):
# Set initial states
#h0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda() # 2 for bidirection
#c0 = Variable(torch.zeros(self.num_layers*2, s1.size(0), self.hidden_size)).cuda()
batch_size = s1.size()[0]
embeds_1 = self.embedding(s1)
embeds_2 = self.embedding(s2)
_, (h_1_last, _) = self.lstm(embeds_1)#, (h0, c0)) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)#, (h0, c0))
concat = torch.cat( (h_1_last, h_2_last), dim=2) #double check the dimension
concat = concat.view(batch_size, -1)
scores = self.mlp(concat)
return scores
New Training
batch_size = 64
SGD_optimizer = optim.SGD(filter(lambda p: p.requires_grad, model.parameters()), lr=0.001, weight_decay=1e-4)
criterion = nn.CrossEntropyLoss()
num_epochs = 10
model.train()
if cuda:
model = model.cuda()
criterion = criterion.cuda()
from torch.autograd import Variable
from torch import optim
epoch_losses = []
for epoch in range(num_epochs):
print("Epoch {0}/{1}: {2}%".format(epoch, num_epochs, 100*float(epoch)/num_epochs))
# Batch loss aggregator
losses = []
for start, end in tqdm(batch_index_gen(batch_size, len(n_data))):
# Convert minibatch to numpy
s1, s2, y = convert_to_numpy(n_data[start:end])
# Convert numpy to Tensor
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor)
s2 = Variable(s2_tensor)
target = Variable(target_tensor)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
# Zero gradients
SGD_optimizer.zero_grad()
# Forward Pass
output = model.forward(s1,s2)
# Calculate loss with respect to training labels
loss = criterion(output, target)
losses.append(loss.data[0])
# Backprogogate and update optimizer
loss.backward()
SGD_optimizer.step()
# concat losses to epoch losses
epoch_losses += losses
training with tensor sizes printed:
Epoch 0/10: 0.0%
Batch size: 64
Sentences
torch.Size([64, 50])
torch.Size([64, 50])
torch.Size([64, 50, 300])
torch.Size([64, 50, 300])
Hidden states
torch.Size([2, 64, 128])
torch.Size([2, 64, 128])
Concatenated hidden states
torch.Size([2, 64, 256])
Reshaped tensors for linear layer
torch.Size([64, 512])
Linear propogation
torch.Size([64, 3])
Evaluation
def eval_model(model, mode='dev'):
file_name = 'snli_1.0/snli_1.0_dev.jsonl' if mode == 'dev' else 'snli_1.0/snli_1.0_test.jsonl'
dev_data, _ = obtain_data(file_name)
dev_n_data = vocab.process_data(dev_data)
print("Length of data: {}".format(len(dev_n_data)))
eval_batch_size = 1024
model.eval()
total = len(dev_n_data)
hit = 0
correct = 0
# Batch dev eval
for start, end in batch_index_gen(eval_batch_size, len(dev_n_data)):
s1, s2, y = convert_to_numpy(dev_n_data[start:end])
s1_tensor = torch.from_numpy(s1).type(torch.LongTensor)
s2_tensor = torch.from_numpy(s2).type(torch.LongTensor)
target_tensor = torch.from_numpy(y).type(torch.LongTensor)
s1 = Variable(s1_tensor, volatile=True)
s2 = Variable(s2_tensor, volatile=True)
target = Variable(target_tensor, volatile=True)
if cuda:
s1 = s1.cuda()
s2 = s2.cuda()
target = target.cuda()
output = model.forward(s1,s2)
loss = criterion(output, target)
#print("output size: {}".format(output.size()))
#print("target size: {}".format(target.size()))
pred = output.data.max(1)[1] # get the index of the max log-probability
#print(pred[:5])
#print(output[:])
correct += pred.eq(target.data).cpu().sum()
return correct / float(total)
eval_model(model)
I think there is an issue in a way you are trying to solve an entailment problem.
Maybe you can do it this way:
design your module to accept two sentences as input
embed both of them with your embeddings
encode them using the LSTM module.
now you have two fixed length vector representations of two sentences. Simpliest thing to do is to just concatenate them
together.
Add liner layer on top to evaluate scores for each entailment class (3 I suppose)
apply softmax to get a proper probability distribution
So your model can look like this (double check the dimensions):
class myLSTM(nn.Module):
def __init__(self, h_size=128, v_size=10, embed_d=300, num_classes = 3):
super(myLSTM, self).__init__()
self.embedding = nn.Embedding(v_size, embed_d)
self.lstm = nn.LSTM(embed_d, h_size, num_layers=1, bidirectional=True, batch_first=True)
self.mlp = nn.Linear(2*h_size*2, num_classes) #<- change here
def forward(self, sentence1, sentence2):
embeds_1 = self.embedding(sentence1)
embeds_2 = self.embedding(sentence2)
_, (h_1_last, _) = self.lstm(embeds_1) #note the change here. Last hidden state is taken
_, (h_2_last, _) = self.lstm(embeds_2)
concat = torch.concat([h_1_last, h_2_last], dim=1) #double check the dimension
scores = self.mlp(concat)
probas = F.softmax(scores) #from torch.functional ...
Then you can play around with adding more hidden layers or thinking how combining two sentences can be done in more intelligent way (attention, etc).
Double check what CrossEntropyLoss accepts as input and target and adjust (is it unnormalized class scores or probability distribution). Check http://pytorch.org/docs/master/nn.html#lstm for LSTM module documentation to clarify what LSTM returns (do you need hidden states for every word or just the representation after the last one).
In a classification problem with many classes, tensorflow docs suggests using sampled_softmax_loss over a simple softmax to reduce training runtime.
According to the docs and source (line 1180), the call pattern for sampled_softmax_loss is:
tf.nn.sampled_softmax_loss(weights, # Shape (num_classes, dim) - floatXX
biases, # Shape (num_classes) - floatXX
labels, # Shape (batch_size, num_true) - int64
inputs, # Shape (batch_size, dim) - floatXX
num_sampled, # - int
num_classes, # - int
num_true=1,
sampled_values=None,
remove_accidental_hits=True,
partition_strategy="mod",
name="sampled_softmax_loss")
It's unclear (at least to me) how to convert a real world problem into the shapes that this loss function requires. I think the 'inputs' field is the problem.
Here is a copy-paste-ready minimum working example that throws a matrix multiplication shape error when calling the loss function.
import tensorflow as tf
# Network Parameters
n_hidden_1 = 256 # 1st layer number of features
n_input = 784 # MNIST data input (img shape: 28*28)
n_classes = 10 # MNIST total classes (0-9 digits)
# Dependent & Independent Variable Placeholders
x = tf.placeholder("float", [None, n_input])
y = tf.placeholder("float", [None, n_classes]) #
# Weights and Biases
weights = {
'h1': tf.Variable(tf.random_normal([n_input, n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_hidden_1, n_classes]))
}
biases = {
'b1': tf.Variable(tf.random_normal([n_hidden_1])),
'out': tf.Variable(tf.random_normal([n_classes]))
}
# Super simple model builder
def tiny_perceptron(x, weights, biases):
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
out_layer = tf.matmul(layer_1, weights['out']) + biases['out']
return out_layer
# Create the model
pred = tiny_perceptron(x, weights, biases)
# Set up loss function inputs and inspect their shapes
w = tf.transpose(weights['out'])
b = biases['out']
labels = tf.reshape(tf.argmax(y, 1), [-1,1])
inputs = pred
num_sampled = 3
num_true = 1
num_classes = n_classes
print('Shapes\n------\nw:\t%s\nb:\t%s\nlabels:\t%s\ninputs:\t%s' % (w.shape, b.shape, labels.shape, inputs.shape))
# Shapes
# ------
# w: (10, 256) # Requires (num_classes, dim) - CORRECT
# b: (10,) # Requires (num_classes) - CORRECT
# labels: (?, 1) # Requires (batch_size, num_true) - CORRECT
# inputs: (?, 10) # Requires (batch_size, dim) - Not sure
loss_function = tf.reduce_mean(tf.nn.sampled_softmax_loss(
weights=w,
biases=b,
labels=labels,
inputs=inputs,
num_sampled=num_sampled,
num_true=num_true,
num_classes=num_classes))
The final line triggers and ValueError, stating that you cant multiply tensors with shape (?,10) and (?,256). As a general rule, I'd agree with that statement. Full error shown below:
ValueError: Dimensions must be equal, but are 10 and 256 for 'sampled_softmax_loss_2/MatMul_1' (op: 'MatMul') with input shapes: [?,10], [?,256].
If the 'dim' value from tensorflow docs is intended to be constant, either the 'weights' or 'inputs' variables going into the loss function are incorrect.
Any thoughts would be awesome, I'm totally stumped on how to use this loss function correctly & it would have a huge impact on training time for the model we're using it for (500k classes). Thanks!
---EDIT---
It is possible to get the sample shown above to run without errors by playing with parameters and ignoring the sampled_softmax_loss call pattern's expected inputs. If you do that, it results in a trainable model that has 0 impact on prediction accuracy (as you would expect).
In your softmax layer you are multiplying your network predictions, which have dimension (num_classes,) by your w matrix which has dimension (num_classes, num_hidden_1), so you end up trying to compare your target labels of size (num_classes,) to something that is now size (num_hidden_1,). Change your tiny perceptron to output layer_1 instead, then change the definition of your cost. The code below might do the trick.
def tiny_perceptron(x, weights, biases):
layer_1 = tf.add(tf.matmul(x, weights['h1']), biases['b1'])
layer_1 = tf.nn.relu(layer_1)
return layer_1
layer_1 = tiny_perceptron(x, weights, biases)
loss_function = tf.reduce_mean(tf.nn.sampled_softmax_loss(
weights=weights['h1'],
biases=biases['b1'],
labels=labels,
inputs=layer_1,
num_sampled=num_sampled,
num_true=num_true,
num_classes=num_classes))
When you train your network with some optimizer, you will tell it to minimize loss_function, which should mean that it will adjust both sets of weights and biases.
The key point is to pass right shape of weight, bias, input and label. The shape of weight passed to sampled_softmax is not the the same with the general situation.
For example, logits = xw + b, call sampled_softmax like this:
sampled_softmax(weight=tf.transpose(w), bias=b, inputs=x), NOT sampled_softmax(weight=w, bias=b, inputs=logits)!!
Besides, label is not one-hot representation. if your labels are one-hot represented, pass labels=tf.reshape(tf.argmax(labels_one_hot, 1), [-1,1])