Inference vs Training causes different outputs

Inference vs Training causes different outputs - python

I'm using a custom dense layer that scales an output (size 1) between zero and one.
class BoundingBoxNumber(tf.keras.layers.Layer):
def __init__(self, self_input_shape):
super(BoundingBoxNumber,self).__init__()
self.input_shape_custom = self_input_shape
self.input_shape_accu = 1
for item in self_input_shape:
self.input_shape_accu *= item
self.internal_dense_layer = tf.keras.layers.Dense(1, activation = 'tanh')
#tf.function
def call(self, inputs):
inputs = tf.keras.layers.Flatten()(inputs)
inputs.set_shape((1, self.input_shape_accu))
output = self.internal_dense_layer(inputs)
output = tf.divide(output, 2)
output = tf.math.add(output, 0.5)
return(output)
I'm also using a custom training loop
for epoch in range(50):
print("Epoch:", epoch, "of 50")
average_loss = 0
for iter, item in enumerate(image):
num_bounding_boxes = tf.shape(bb[iter])[0]
float_target = tf.cast(1/num_bounding_boxes, tf.float32)
with tf.GradientTape() as tape:
logits = dense_bb_num_layer(item)
loss = NumLoss(logits, float_target)
print("Logits:", logits, "Target:", float_target, "Loss:", loss)
average_loss += loss
call_gradients = tape.gradient(loss, dense_bb_num_layer.trainable_weights)
call_optimizer.apply_gradients(zip(call_gradients, dense_bb_num_layer.trainable_weights))
average_loss /= len(image)
print(average_loss)
In addition, I'm using a custom loss function:
def NumLoss(logits, expected):
return((logits - expected)**2)
When I run the layer inside the training loop, the result is always a float with a value of exactly 0.0 or 1.0, however, when I just call it, it's as expected, a float value between zero and 1.
I'm not sure why this would be the case, and any help would be appreciated. I'm trying to train the layer to output values such as 0.333333, 0.25, 0.125, etc, so it's difficult if it only outputs whole numbers.

Related

Word2Vec with negative sampling python implementation

I'm trying to implement word2vec with negative sampling in python almost from scratch and quite new in neural networks and faced some issues. Would be very appreciate for any help.
So, I wrote simple nn with a forward pass. I didn't get which element have to have grad_fn, I'd been getting error like 'tensor have no grad_fn' until I add requires_grad_() on the returning value. Is that correct?
dataset = Word2VecNegativeSampling(data, num_negative_samples, 30000)
dataset.generate_dataset()
wordvec_dim = 10
class Word2VecNegativeSamples(nn.Module):
def __init__(self, num_tokens):
super(Word2VecNegativeSamples, self).__init__()
self.input = nn.Linear(num_tokens, 10, bias=False)
self.output = nn.Linear(10, num_tokens, bias=False)
self.num_tokens = num_tokens
def forward(self, input_index_batch, output_indices_batch):
'''
Implements forward pass with negative sampling
Arguments:
input_index_batch - Tensor of ints, shape: (batch_size, ), indices of input words in the batch
output_indices_batch - Tensor if ints, shape: (batch_size, num_negative_samples+1),
indices of the target words for every sample
Returns:
predictions - Tensor of floats, shape: (batch_size, num_negative_samples+1)
'''
results = []
batch_size = len(input_index_batch)
for i in range(batch_size):
input_one_hot = torch.zeros(self.num_tokens)
input_one_hot[input_index_batch[i]] = 1
forward_result = self.output(self.input(input_one_hot))
results.append(torch.tensor([forward_result[out_index] for out_index in output_indices_batch[i]]))
return torch.stack(results).requires_grad_()
nn_model = Word2VecNegativeSamples(data.num_tokens())
nn_model.type(torch.FloatTensor)
After all i'm trying to train the model, but neither loss nor accuracy changing. Is the code for model prediction correct as well?
Here is training code:
def train_neg_sample(model, dataset, train_loader, optimizer, scheduler, num_epochs):
loss = nn.BCEWithLogitsLoss().type(torch.FloatTensor)
loss_history = []
train_history = []
for epoch in range(num_epochs):
model.train() # Enter train mode
dataset.generate_dataset()
loss_accum = 0
correct_samples = 0
total_samples = 0
for i_step, (inp, out, lab) in enumerate(train_loader):
prediction = model(inp, out)
loss_value = loss(prediction, lab)
optimizer.zero_grad()
loss_value.backward()
optimizer.step()
_, indices = torch.max(prediction, 1)
correct_samples += torch.sum(indices == 0)
total_samples += lab.shape[0]
loss_accum += loss_value
scheduler.step()
ave_loss = loss_accum / i_step
train_accuracy = float(correct_samples) / total_samples
loss_history.append(float(ave_loss))
train_history.append(train_accuracy)
print("Epoch#: %i, Average loss: %f, Train accuracy: %f" % (epoch, ave_loss, train_accuracy))
return loss_history, train_history

If your loss function is not changing, it's highly probable that you register the wrong set of parameters to the optimizer. Can you post the code snippet where you initialize your model and optimizer? It is supposed to look like this:
nn_model = Word2VecNegativeSamples(data.num_tokens())
optimizer = optim.SGD(nn_model.parameters(), lr=0.001, momentum=0.9)

Why loss function always return zero after first epoch?

Why the loss function is always printing zero after the first epoch?
I suspect it's because of loss = loss_fn(outputs, torch.max(labels, 1)[1]).
But if I use loss = loss_fn(outputs, labels), I will get the error
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
.
nepochs = 5
losses = np.zeros(nepochs)
loss_fn = nn.CrossEntropyLoss()
optimizer = optim.Adam(modell.parameters(), lr = 0.001)
for epoch in range(nepochs):
running_loss = 0.0
n = 0
for data in train_loader:
#single batch
if(n == 1):
break;
inputs, labels = data
optimizer.zero_grad()
outputs = modell(inputs)
#loss = loss_fn(outputs, labels)
loss = loss_fn(outputs, torch.max(labels, 1)[1])
loss.backward()
optimizer.step()
running_loss += loss.item()
n += 1
losses[epoch] = running_loss / n
print(f"epoch: {epoch+1} loss: {losses[epoch] : .3f}")
The model is:
def __init__(self, labels=10):
super(Classifier, self).__init__()
self.fc = nn.Linear(3 * 64 * 64, labels)
def forward(self, x):
out = x.reshape(x.size(0), -1)
out = self.fc (out)
return out
Any idea?
The labels are a 64 elements tensor like this:
tensor([[7],[1],[ 2],[3],[ 2],[9],[9],[8],[9],[8],[ 1],[7],[9],[2],[ 5],[1],[3],[3],[8],[3],[7],[1],[7],[9],[8],[ 8],[3],[7],[ 5],[ 1],[7],[3],[2],[1],[ 3],[3],[2],[0],[3],[4],[0],[7],[1],[ 8],[4],[1],[ 5],[ 3],[4],[3],[ 4],[8],[4],[1],[ 9],[7],[3],[ 2],[ 6],[4],[ 8],[3],[ 7],[3]])

Usually loss calculation is loss = loss_fn(outputs, labels) and here outputs is as following:
_ , outputs = torch.max(model(input), 1)
or
outputs = torch.max(predictions, 1)[0]
Common practice is modifying outputs instead of labels:
torch.max() returns a namedtuple (values, indices) where values is
the maximum value of each row of the input tensor in the given
dimension dim. And indices is the index location of each maximum value found (argmax).
In your code snippet the labels is not indices of the labels, so when you calculate the loss, the function should look like this:
loss = loss_fn(torch.max(outputs, 1)[0], labels)

Predicting point sequence in image

My training set is a set of images (either 3 channel or 1 ofc i use only one type of channel). And the labels are a sequence of points in a specific order that i want to predict from the images.
I am using a model inspired by the image captioning example on the tensorflow website. This is the also the approach that this paper takes https://arxiv.org/pdf/1901.03781.pdf
class CNN_Encoder(tf.keras.Model):
# Since you have already extracted the features and dumped it using pickle
# This encoder passes those features through a Fully connected layer
def __init__(self, embedding_dim):
super(CNN_Encoder, self).__init__()
self.fc = tf.keras.layers.Dense(embedding_dim)
def call(self, x):
x = self.fc(x)
x = tf.nn.relu(x)
return x
class RNN_Decoder(tf.keras.Model):
def __init__(self, embedding_dim, units, output_dim):
super(RNN_Decoder, self).__init__()
self.units = units
self.gru = tf.keras.layers.GRU(self.units,
return_sequences=True,
return_state=True,
recurrent_initializer='glorot_uniform')
self.fc1 = tf.keras.layers.Dense(self.units)
self.fc2 = tf.keras.layers.Dense(output_dim)
def call(self, x, features, hidden):
x = tf.concat((features, x), axis=-1)
output, state = self.gru(x)
x = self.fc1(state)
x = self.fc2(x)
return x
def reset_state(self, batch_size):
return tf.zeros((batch_size, self.units))
#tf.function
def train_step(img_tensor, target):
loss = 0
hidden = decoder.reset_state(batch_size=target.shape[0])
dec_input = tf.expand_dims([[0., 0.]] * target.shape[0], 1)
with tf.GradientTape() as tape:
features = encoder(img_tensor)
for i in (range(1, target.shape[1])):
predictions = decoder(dec_input, features, hidden)
loss += loss_function(target[:, i], predictions)
# using teacher forcing
dec_input = tf.expand_dims(target[:, i], 1)
total_loss = (loss / int(target.shape[1]))
trainable_variables = encoder.trainable_variables + decoder.trainable_variables
gradients = tape.gradient(loss, trainable_variables)
optimizer.apply_gradients(zip(gradients, trainable_variables))
return loss, total_loss
EPOCHS = 20
batch_size = 8
for epoch in tqdm(range(start_epoch, EPOCHS)):
start = time.time()
total_loss = 0
for (batch, (img_tensor, target)) in enumerate((data_generator(preds_t, labels_t))):
img_tensor = img_tensor.reshape((-1, 1, 128*128))
batch_loss, t_loss = train_step(img_tensor, target)
total_loss += t_loss
if batch % 100 == 0:
print ('Epoch {} Batch {} Loss {:.4f}'.format(
epoch + 1, batch, batch_loss.numpy() / int(target.shape[1])))
if batch == 10000:
break
# storing the epoch end loss value to plot later
#loss_plot.append(total_loss / num_steps)
if epoch % 5 == 0:
ckpt_manager.save()
print ('Epoch {} Loss {:.6f}'.format(epoch + 1,
total_loss/num_steps))
print ('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
For the features vector. I am extracting the last layer of a unet. So each image has a size 1x128x128. I reshape it to be 1x1x128*128. Which i then pass through a fully connected layer. The shape then becomes 1x1x256
My labels i want to predict are image coordinates so (x, y). The input to the gru layer is the
concatenated 1x1x256 , 1x1x2 (t-1 coordinates). Which i then further pass through a 2 layer fc layer with output dimension 2 for the 2 coordinates. I have removed attention for now to get a simpler model. I normalize my images. I pad the coordinate sequences with 0,0 for the start -1, -1 for the end and -2,-2 for the regular padding to get uniform sequence length of 350x2.
The network doesnt seem to learn much. I just get a few points scattered diagonally across the image. The biggest difference i see with the image captioning model is that the words can be converted to embeddings and then you have a 128 image features 128 word features being concatenated and fed into the lstm. In my case the sequence information is just 1 entry. Could that be the reason that the network is not learning much.
If someone has any insights into what i should change that would be great

Your question requires certain experience and a deep investigation. I'd only suggest general advices for under-fitting issue. Here's a list of things to try.
Personally, I'd start by trying to overfit on a single batch.

My custom loss function in Pytorch does not train

My custom loss function in Pytorch does not update during training. The loss stays exactly the same. I am trying to write this custom loss function based on the false positive and negative rates. I am giving you a simplified version of the code. Any idea what could be happening? Does the backpropagation turns to 0? Is this not the correct way of defining a custom loss function?
I have already checked that during backpropagation the Gradient always stays TRUE (assert requires_grad). I have also tried to make a class (torch.nn.module) of the function false_pos_neg_rate, but that did not work. The Assert Requires_grad turned out to be negative and I left it out afterwards.
There is no error, the training does continue.
def false_pos_neg_rate(outputs, truths):
y = truths
y_predicted = outputs
cut_off= torch.tensor(0.5, requires_grad=True)
y_predicted =torch.where(y_predicted <= cut_off, zeros, ones)
tp, fp, tn, fn = confusion_matrix(y_predicted, y)
fp_rate = fp / (fp+tn).float()
fn_rate = fn / (fn+tp).float()
loss = fn_rate + fp_rate
return loss
for i, (samples, truths) in enumerate(train_loader):
samples = Variable(samples)
truths = Variable(truths)
outputs = model(samples)
loss = false_pos_neg_rate_torch(outputs, truths)
loss.backward()
optimizer.step()
I expect the loss function to update the model and be smaller every training step. Instead the loss stays exactly the same and nothing happens.
Please help me, what happens? Why does the model not train during training steps?

As pointed out by Umang Gupta your loss function is not differentiable. If you write, mathematically, what you are trying to do you'll see that your loss has zero gradient almost everywhere and it behaves like a "step function".
In order to train models using gradient-descent methods you must have meaningful gradients for the loss function.

Based on your tips, I updated my Loss Function. I made a dummy so you can check the first 2 functions as well. I added the rest, so you can see how it is implemented. However, still somewhere the gradient turns out to be zero. What is now the step where the gradient turns zero, or how can I check this? Please I would like to know how I can fix this :).
I tried providing you with more information so you can play around as well, but if you miss anything please do let me know!
y = Variable(torch.tensor((0, 0, 0, 1, 1,1), dtype=torch.float), requires_grad = True)
y_pred = Variable(torch.tensor((0.333, 0.2, 0.01, 0.99, 0.49, 0.51), dtype=torch.float), requires_grad = True)
def binary_y_pred(y_pred):
y_pred.register_hook(lambda grad: print(grad))
y_pred = y_pred+torch.tensor(0.5, requires_grad=True, dtype=torch.float)
y_pred = y_pred.pow(5) # this is my way working around using torch.where()
y_pred = y_pred.pow(10)
y_pred = y_pred.pow(15)
m = nn.Sigmoid()
y_pred = m(y_pred)
y_pred = y_pred-torch.tensor(0.5, requires_grad=True, dtype=torch.float)
y_pred = y_pred*2
y_pred.register_hook(lambda grad: print(grad))
return y_pred
def confusion_matrix(y_pred, y):
TP = torch.sum(y*y_pred)
TN = torch.sum((1-y)*(1-y_pred))
FP = torch.sum((1-y)*y_pred)
FN = torch.sum(y*(1-y_pred))
k_eps = torch.tensor(1e-12, requires_grad=True, dtype=torch.float)
FN_rate = FN/(TP + FN + k_eps)
FP_rate = FP/(TN + FP + k_eps)
cost = FN_rate + FP_rate
return cost
class FeedforwardNeuralNetModel(nn.Module):
def __init__(self, input_dim, hidden_dim, output_dim):
super(FeedforwardNeuralNetModel, self).__init__()
self.fc1 = nn.Linear(input_dim, hidden_dim)
self.relu1 = nn.ReLU()
self.fc2 = nn.Linear(hidden_dim, output_dim)
self.sigmoid = nn.Sigmoid()
def forward(self, x):
out = self.fc1(x)
out = self.relu1(out)
out = self.fc2(out)
out = self.sigmoid(out)
return out
model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=[0.9, 0.99], amsgrad=True)
criterion = torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')
samples= Variable(samples)
truths = Variable(truths)
outputs = model(samples)
loss = confusion_matrix(outputs, truths)
loss.backward()
optimizer.step()

AND-gate with Pytorch

I'm new to PyTorch and deep learning generally.
The code I wrote can be seen longer down.
I'm trying to learn the simple 'And' problem, which is linearby separable.
The problem is, that I'm getting poor results. Only around 2/10 times it gets to the correct answer.
Sometimes the loss.item() values is stuck at 0.250.
Just to clear up
Why does it only work 2/10 times?
.
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autog
data_x = np.array([[0, 0], [0, 1], [1, 0], [1, 1]])
data_y = np.array([[0, 0, 0, 1]]).T
data_x = autog.Variable(torch.FloatTensor(data_x))
data_y = autog.Variable(torch.FloatTensor(data_y), requires_grad=False)
in_dim = 2
out_dim = 1
epochs = 15000
epoch_print = epochs / 5
l_rate = 0.001
class NeuralNet(nn.Module):
def __init__(self, input_size, output_size):
super(NeuralNet, self).__init__()
self.lin1 = nn.Linear(input_size, output_size)
self.relu = nn.ReLU()
def forward(self, x):
out = x
out = self.lin1(out)
out = self.relu(out)
return out
model = NeuralNet(in_dim, out_dim)
criterion = nn.L1Loss()
optimizer = optim.Adam(model.parameters(), lr=l_rate)
for epoch in range(epochs):
pred = model(data_x)
loss = criterion(pred, data_y)
loss.backward()
optimizer.step()
if (epoch + 1) % epoch_print == 0:
print("Epoch %d Loss %.3f" %(epoch + 1, loss.item()))
for x, y in zip(data_x, data_y):
pred = model(x)
print("Input", list(map(int, x)), "Pred", int(pred), "Output", int(y))

1. Using zero_grad with optimizer
You are not using optimizer.zero_grad() to clear the gradient. Your learning loop should look like this:
for epoch in range(epochs):
optimizer.zero_grad()
pred = model(data_x)
loss = criterion(pred, data_y)
loss.backward()
optimizer.step()
if (epoch + 1) % epoch_print == 0:
print("Epoch %d Loss %.3f" %(epoch + 1, loss.item()))
In this particular case it will not have any detrimental effect, the gradient is accumulating, but as you have the same dataset looped over and over it makes barely any difference (you should get into this habit though, as you will use it throughout your deep learning journey).
2. Cost Function
You are using Mean Absolute Error which is regression loss function, not a classification one (what you do is binary classification).
Accordingly, you should use BCELoss and sigmoid activation or (I prefer it that way), return logits from the network and use BCEWithLogitsLoss, both of them calculate binary cross entropy (simplified version of cross-entropy).
See below:
class NeuralNet(nn.Module):
def __init__(self, input_size, output_size):
super(NeuralNet, self).__init__()
self.lin1 = nn.Linear(input_size, output_size)
def forward(self, x):
# You may want to use torch.nn.functional.sigmoid activation
return self.lin1(x)
...
# Change your criterion to nn.BCELoss() if using sigmoid
criterion = nn.BCEWithLogitsLoss()
...
3. Predictions
If you used the logits version, classifier learns to assign negative values to 0 label and positive to indicate 1. Your display function has to be modified to incorporate this knowledge:
for x, y in zip(data_x, data_y):
pred = model(x)
# See int(pred > 0), that's the only change
print("Input", list(map(int, x)), "Pred", int(pred > 0), "Output", int(y))
This step does not apply if your forward applies sigmoid to the output. Oh, and it's better to use torch.round instead of casting to int.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Inference vs Training causes different outputs - python

Related

Word2Vec with negative sampling python implementation

Why loss function always return zero after first epoch?

Predicting point sequence in image

My custom loss function in Pytorch does not train

AND-gate with Pytorch

Categories

Resources