I have tried to use the fit_generator function, but the loss only decreases very slowly: after about 5 days of training it has only gone from 129 to 119. From what I have read online, training a model like this should take around 8 hours. I suspect the problem is on my side, but I can't figure out what it is, so I hope somebody can help me. My dataset contains only 100 images.
Below is my training code:

# Assumed imports (tf.keras; adjust if you are using standalone Keras)
from tensorflow.keras.callbacks import ModelCheckpoint
from tensorflow.keras.optimizers import Adam
from tensorflow.keras import backend as K

LEARNING_RATE = 0.0005
LOSS_FACTOR = 10000

# Reconstruction loss: per-sample mean squared error over height, width and channels
def r_loss(y_true, y_pred):
    return K.mean(K.square(y_true - y_pred), axis=[1, 2, 3])

# KL divergence between the encoder distribution (mean_mu, log_var) and a standard normal
def kl_loss(y_true, y_pred):
    return -0.5 * K.sum(1 + log_var - K.square(mean_mu) - K.exp(log_var), axis=1)

def total_loss(y_true, y_pred):
    return LOSS_FACTOR * r_loss(y_true, y_pred) + kl_loss(y_true, y_pred)

adam_optimizer = Adam(lr=LEARNING_RATE, beta_1=0.5, beta_2=0.9)

vae_model.compile(optimizer=adam_optimizer,
                  loss=total_loss,
                  metrics=[r_loss, kl_loss],
                  experimental_run_tf_function=False)

N_EPOCHS = 3000
zz = '/weights.h5'
vae_model.load_weights(zz)

checkpoint_vae = ModelCheckpoint(zz, save_weights_only=True, verbose=1)

vae_model.fit_generator(data_flow,
                        shuffle=False,
                        epochs=N_EPOCHS,
                        initial_epoch=0,
                        steps_per_epoch=tr_num // BATCH_SIZE,  # integer number of batches per epoch
                        callbacks=[checkpoint_vae])
I was trying to train an image-to-image translation model using TransUNet. I split my data 70%/15%/15% into training, validation, and test sets. But when I monitor the loss curves, I find that the validation loss is much lower than the training loss.
loss curve:
The code is here:
criterion = nn.L1Loss()
net = net.cuda()
net = torch.nn.DataParallel(net, device_ids=range(torch.cuda.device_count()))
optimizer = torch.optim.Adam(net.parameters(), lr=lr, betas=(0.9, 0.999))
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(optimizer, mode='min', patience=3)

for epoch in range(1, total_epoch + 1):
    print('---------- Epoch:' + str(epoch) + ' ----------')
    # data_loader_iter = iter(data_loader)
    data_loader_iter = data_loader
    train_epoch_loss = 0.

    print('Train:')
    for img, mask in tqdm(data_loader_iter, ncols=20, total=len(data_loader_iter)):
        net.train()
        img, mask = img.to(device), mask.to(device)
        optimizer.zero_grad()
        pred = net(img)
        train_loss = criterion(pred, mask)
        train_epoch_loss += train_loss
        train_loss.backward()
        optimizer.step()
    train_epoch_loss /= len(data_loader_iter)

    val_data_loader_num = val_data_loader
    test_data_loader_num = test_data_loader
    val_epoch_loss = 0
    test_epoch_loss = 0

    # Validation
    print('Validation:')
    with torch.no_grad():
        for val_img, val_mask in tqdm(val_data_loader_num, ncols=20, total=len(val_data_loader_num)):
            val_img, val_mask = val_img.to(device), val_mask.to(device)
            net.eval()
            predict = net(val_img)
            val_loss = criterion(predict, val_mask)
            val_epoch_loss += val_loss
        val_epoch_loss = val_epoch_loss / len(val_data_loader_num)
Another problem is that when I test the model, one class cannot be predicted properly; that class is always clustered at the edge of the image (see the green class in the figure below):
while the ground truth looks like this:
I know there seem to be too many problems at once, but has anyone faced similar issues? Thanks in advance!
After each epoch y_pred simply keeps increasing
The input at each batch is a 64x10 tensor, and I am trying to predict the max of the vector in each row.
I thought the gradients might not be getting zeroed between batches, but that wasn't the case.
I tried changing the LR, the number of epochs, the LSTM layers (LSTM to RNN), the hidden size, etc.; nothing helped.
BTW, using a simple sequential network of dense and ReLU layers instead of the LSTM worked perfectly.
Following is the code:
LR = 0.0001

class LSTM(nn.Module):
    def __init__(self, input_size=1, hidden_layer_size=100, output_size=1):
        super().__init__()
        self.hidden_layer_size = hidden_layer_size
        self.lstm = nn.LSTM(input_size, hidden_layer_size)
        self.linear = nn.Linear(hidden_layer_size, output_size)
        # self.hidden_cell = (torch.zeros(1, max_array_len, self.hidden_layer_size),
        #                     torch.zeros(1, max_array_len, self.hidden_layer_size))

    def forward(self, input_seq):
        # lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq), max_array_len, 1), self.hidden_cell)
        lstm_out, self.hidden_cell = self.lstm(input_seq.view(len(input_seq), max_array_len, 1))
        predictions = self.linear(lstm_out[:, -1, :])
        return predictions

model = LSTM()
optimizer = torch.optim.Adam(model.parameters(), lr=LR, weight_decay=0.8)  # optimize all cnn parameters
loss_func = nn.MSELoss()  # the target label is not one-hotted
print(model)

EPOCHS = 2000
for i in range(EPOCHS):
    # model.train()
    for step, (seq, labels) in enumerate(train_data):
        model.zero_grad()
        labels = labels.view(labels.shape[0], 1)
        y_pred = model(seq)
        loss = loss_func(y_pred.float(), labels.float())
        loss.backward(retain_graph=True)
        optimizer.step()
    if i % 10 == 0:
        # print(y_pred.shape, labels.shape)
        print(y_pred)
        print(f'epoch: {i:3} train_loss: {loss.item():10.8f}')

print('Finished Training')
The y_pred I am getting is:
tensor([[0.2661],
[0.7536],
[1.4659],
[2.4905],
[3.8662],
[5.4478],
[6.8958],
[7.9347],
[8.5493],
[8.8773],
[9.0486],
[9.1409],
[9.1931],
[9.2244],
[9.2441],
[9.2570],
[9.2657],
[9.2718],
[9.2761],
[9.2792],
[9.2815],
[9.2831],
[9.2843],
[9.2853],
[9.2860],
[9.2865],
[9.2869],
[9.2872],
[9.2874],
[9.2876],
[9.2877],
[9.2878]], grad_fn=<AddmmBackward>)
I am new to PyTorch. I was trying to build a binary classifier on the Kepler dataset. The following is my dataset class:
class KeplerDataset(Dataset):
    def __init__(self, test=False):
        self.dataframe_orig = pd.read_csv(koi_cumm_path)

        if (test == False):
            self.data = df_numeric[(df_numeric.koi_disposition == 1) | (df_numeric.koi_disposition == 0)].values
        else:
            self.data = df_numeric[~((df_numeric.koi_disposition == 1) | (df_numeric.koi_disposition == 0))].values

        self.X_data = torch.FloatTensor(self.data[:, 1:])
        self.y_data = torch.FloatTensor(self.data[:, 0])

    def __len__(self):
        return len(self.data)

    def __getitem__(self, index):
        return self.X_data[index], self.y_data[index]
Here, I created a custom classifier class with two hidden layers and a single output unit that produces the sigmoid probability of being in class 1 (planet).
class KOIClassifier(nn.Module):
    def __init__(self, input_dim, out_dim):
        super(KOIClassifier, self).__init__()
        self.linear1 = nn.Linear(input_dim, 32)
        self.linear2 = nn.Linear(32, 32)
        self.linear3 = nn.Linear(32, out_dim)

    def forward(self, xb):
        out = self.linear1(xb)
        out = F.relu(out)
        out = self.linear2(out)
        out = F.relu(out)
        out = self.linear3(out)
        out = torch.sigmoid(out)
        return out
I then created a train_model function to optimize the loss using SGD.
def train_model(X, y):
    criterion = nn.BCELoss()
    optim = torch.optim.SGD(model.parameters(), lr=0.001)
    n_epochs = 100

    losses = []
    for epoch in range(n_epochs):
        y_pred = model.forward(X)
        loss = criterion(y_pred, y)
        losses.append(loss.item())
        optim.zero_grad()
        loss.backward()
        optim.step()

losses = []
for X, y in train_loader:
    losses.append(train_model(X, y))
But after performing the optimization over the train_loader, when I try predicting on the train_loader itself, the prediction values are much worse.
for features, y in train_loader:
    y_pred = model.predict(features)
    break

y_pred
> tensor([[4.5436e-02],
[1.5024e-02],
[2.2579e-01],
[4.2279e-01],
[6.0811e-02],
.....
Why is my model not working properly? Is it a problem with the dataset, or am I doing something wrong in implementing the neural net? I will link my Kaggle notebook, since more context might be helpful. Please help.
You are optimizing many times (100 steps) on the first batch (the first samples) before moving on to the next samples. This means your model will overfit those few samples before it ever sees the next batch. As a result, your training will be very non-smooth, may diverge, and can end up far from the global optimum.
Usually, in a training loop you should:

1. go over all samples (this is one epoch)
2. shuffle your dataset in order to visit your samples in a different order (set your PyTorch training loader accordingly)
3. go back to 1. until you reach the max number of epochs
Also you should not define your optimizer each time (nor your criterion).
Your training loop should look like this:
criterion = nn.BCELoss()
optim = torch.optim.SGD(model.parameters(), lr=0.001)
n_epochs = 100

def train_model():
    for X, y in train_loader:
        optim.zero_grad()
        y_pred = model.forward(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optim.step()

for epoch in range(n_epochs):
    train_model()
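If you also want to track the mean loss per epoch, as the question's original code attempted with its losses list, one possible variant is sketched below (the helper name train_one_epoch is mine, not part of the answer):

def train_one_epoch():
    batch_losses = []
    for X, y in train_loader:
        optim.zero_grad()
        y_pred = model.forward(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optim.step()
        batch_losses.append(loss.item())  # .item() gives a plain float, detached from the graph
    return sum(batch_losses) / len(batch_losses)

epoch_losses = [train_one_epoch() for _ in range(n_epochs)]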
My custom loss function in PyTorch does not update during training; the loss stays exactly the same. I am trying to write a custom loss function based on the false positive and false negative rates. I am giving you a simplified version of the code. Any idea what could be happening? Does the backpropagated gradient turn to 0? Is this not the correct way of defining a custom loss function?
I have already checked that during backpropagation requires_grad always stays True (assert requires_grad). I have also tried to make the function false_pos_neg_rate into a class (torch.nn.Module), but that did not work; the requires_grad assertion turned out to be negative, and I left it out afterwards.
There is no error; the training does continue.
def false_pos_neg_rate(outputs, truths):
    y = truths
    y_predicted = outputs
    cut_off = torch.tensor(0.5, requires_grad=True)
    y_predicted = torch.where(y_predicted <= cut_off, zeros, ones)
    tp, fp, tn, fn = confusion_matrix(y_predicted, y)
    fp_rate = fp / (fp + tn).float()
    fn_rate = fn / (fn + tp).float()
    loss = fn_rate + fp_rate
    return loss

for i, (samples, truths) in enumerate(train_loader):
    samples = Variable(samples)
    truths = Variable(truths)
    outputs = model(samples)
    loss = false_pos_neg_rate_torch(outputs, truths)
    loss.backward()
    optimizer.step()
I expect the loss function to update the model and get smaller at every training step. Instead, the loss stays exactly the same and nothing happens.
Please help me: what is happening? Why does the model not train during the training steps?
As pointed out by Umang Gupta, your loss function is not differentiable. If you write out, mathematically, what you are trying to do, you'll see that your loss has zero gradient almost everywhere and behaves like a "step function".
In order to train models using gradient-descent methods, you must have meaningful gradients for the loss function.
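As an illustration only (not from the original answer), here is a minimal sketch of a differentiable surrogate: instead of thresholding the predictions, the soft counts below use the predicted probabilities directly, so gradients can flow. The function name soft_fp_fn_rate and the epsilon value are my own choices.

import torch

def soft_fp_fn_rate(y_prob, y_true, eps=1e-12):
    # y_prob: predicted probabilities in [0, 1]; y_true: 0/1 labels.
    # Soft confusion-matrix counts: no thresholding, so the result stays differentiable.
    tp = torch.sum(y_true * y_prob)
    tn = torch.sum((1 - y_true) * (1 - y_prob))
    fp = torch.sum((1 - y_true) * y_prob)
    fn = torch.sum(y_true * (1 - y_prob))
    fp_rate = fp / (fp + tn + eps)
    fn_rate = fn / (fn + tp + eps)
    return fp_rate + fn_rate

# Quick check: the gradient is non-zero, unlike the thresholded version.
y_true = torch.tensor([0., 0., 1., 1.])
y_prob = torch.tensor([0.2, 0.6, 0.7, 0.4], requires_grad=True)
loss = soft_fp_fn_rate(y_prob, y_true)
loss.backward()
print(loss.item(), y_prob.grad)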
Based on your tips, I updated my loss function. I made a dummy so you can check the first two functions as well, and I added the rest so you can see how it is implemented. However, somewhere the gradient still turns out to be zero. At which step does the gradient become zero, and how can I check this? I would really like to know how I can fix this :).
I tried to provide you with enough information so you can play around as well, but if anything is missing please do let me know!
y = Variable(torch.tensor((0, 0, 0, 1, 1, 1), dtype=torch.float), requires_grad=True)
y_pred = Variable(torch.tensor((0.333, 0.2, 0.01, 0.99, 0.49, 0.51), dtype=torch.float), requires_grad=True)

def binary_y_pred(y_pred):
    y_pred.register_hook(lambda grad: print(grad))
    y_pred = y_pred + torch.tensor(0.5, requires_grad=True, dtype=torch.float)
    y_pred = y_pred.pow(5)  # this is my way working around using torch.where()
    y_pred = y_pred.pow(10)
    y_pred = y_pred.pow(15)
    m = nn.Sigmoid()
    y_pred = m(y_pred)
    y_pred = y_pred - torch.tensor(0.5, requires_grad=True, dtype=torch.float)
    y_pred = y_pred * 2
    y_pred.register_hook(lambda grad: print(grad))
    return y_pred

def confusion_matrix(y_pred, y):
    TP = torch.sum(y * y_pred)
    TN = torch.sum((1 - y) * (1 - y_pred))
    FP = torch.sum((1 - y) * y_pred)
    FN = torch.sum(y * (1 - y_pred))

    k_eps = torch.tensor(1e-12, requires_grad=True, dtype=torch.float)
    FN_rate = FN / (TP + FN + k_eps)
    FP_rate = FP / (TN + FP + k_eps)
    cost = FN_rate + FP_rate
    return cost
class FeedforwardNeuralNetModel(nn.Module):
    def __init__(self, input_dim, hidden_dim, output_dim):
        super(FeedforwardNeuralNetModel, self).__init__()
        self.fc1 = nn.Linear(input_dim, hidden_dim)
        self.relu1 = nn.ReLU()
        self.fc2 = nn.Linear(hidden_dim, output_dim)
        self.sigmoid = nn.Sigmoid()

    def forward(self, x):
        out = self.fc1(x)
        out = self.relu1(out)
        out = self.fc2(out)
        out = self.sigmoid(out)
        return out

model = FeedforwardNeuralNetModel(input_dim, hidden_dim, output_dim)
optimizer = torch.optim.Adam(model.parameters(), lr=0.0001, betas=[0.9, 0.99], amsgrad=True)
criterion = torch.nn.BCELoss(weight=None, size_average=None, reduce=None, reduction='mean')

samples = Variable(samples)
truths = Variable(truths)
outputs = model(samples)
loss = confusion_matrix(outputs, truths)
loss.backward()
optimizer.step()
I have just begun to learn TensorFlow and wrote a model as an exercise on MNIST. I am following a book, but there is still a problem; could you please help me with this?
Following is my code, with the problem description in it. Thank you very much!
x = tf.placeholder(tf.float32, [None, INPUT_NODE], name='input')
y_ = tf.placeholder(tf.float32, [None, OUTPUT_NODE], name='output')

weights1 = tf.Variable(tf.truncated_normal([INPUT_NODE, LAYER1_NODE], stddev=0.1))
biases1 = tf.Variable(tf.constant(0.1, shape=[LAYER1_NODE]))
weights2 = tf.Variable(tf.truncated_normal([LAYER1_NODE, OUTPUT_NODE], stddev=0.1))
biases2 = tf.Variable(tf.constant(0.1, shape=[OUTPUT_NODE]))

# The next y = ... defines forward propagation without using the moving-average model.
y = inference(x, None, weights1, biases1, weights2, biases2)

global_step = tf.Variable(0, trainable=False)
variable_averages = tf.train.ExponentialMovingAverage(MOVING_AVERAGE_DECAY, global_step)
variables_averages_op = variable_averages.apply(tf.trainable_variables())

# The next average_y = ... defines forward propagation using the moving-average model.
average_y = inference(x, variable_averages, weights1, biases1, weights2, biases2)

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y, labels=tf.arg_max(y_, 1))
cross_entropy_mean = tf.reduce_mean(cross_entropy)

regularizer = tf.contrib.layers.l2_regularizer(REGULARIZATION_RATE)
regularization = regularizer(variable_averages.average(weights1)) + \
                 regularizer(variable_averages.average(weights2))
loss = cross_entropy_mean + regularization

learning_rate = tf.train.exponential_decay(
    LEARNING_RATE_BASE,
    global_step,
    mnist.train.num_examples / BATCH_SIZE,
    LEARNING_RATE_DECAY
)

train_step = tf.train.GradientDescentOptimizer(learning_rate).minimize(loss, global_step=global_step)
train_op = tf.group(train_step, variables_averages_op)
The problem is that when I use average_y to calculate the accuracy, it seems like training doesn't improve it at all:
After 0 training steps, acc in validation is 0.0742
After 1000 training steps, acc in validation is 0.0924
After 2000 training steps, acc in validation is 0.0924
When I use y instead of average_y, everything is good. This really confuses me:
After 0 training steps, acc in validation is 0.0686
After 1000 training steps, acc in validation is 0.9716
After 2000 training steps, acc in validation is 0.9768
# correct_prediction = tf.equal(tf.arg_max(y, 1), tf.arg_max(y_, 1))
correct_prediction = tf.equal(tf.arg_max(average_y, 1), tf.arg_max(y_, 1))
accuracy = tf.reduce_mean(tf.cast(correct_prediction, tf.float32))

with tf.Session() as sess:
    tf.initialize_all_variables().run()
    validate_feed = {
        x: mnist.validation.images,
        y_: mnist.validation.labels
    }
    test_feed = {
        x: mnist.test.images,
        y_: mnist.test.labels
    }

    for i in range(TRAINING_STEPS):
        if i % 1000 == 0:
            validate_acc = sess.run(accuracy, feed_dict=validate_feed)
            print("After %d training steps, acc in validation is %g" % (i, validate_acc))
        xs, ys = mnist.train.next_batch(BATCH_SIZE)
        sess.run([train_op, global_step], feed_dict={x: xs, y_: ys})

    test_acc = sess.run(accuracy, feed_dict=test_feed)
    print("After %d training steps, acc in test is %g" % (TRAINING_STEPS, test_acc))
From your code snippet, you are training the classification loss with respect to the y logits instead of average_y, so the inference graph with the exponential moving average is not actually the one being trained:
cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=y,labels=tf.arg_max(y_,1))
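For comparison only, here is the alternative this answer is pointing at, building the loss from the averaged inference graph instead; treat it as an illustration rather than a confirmed fix, since whether it is appropriate depends on how inference applies the averages.

cross_entropy = tf.nn.sparse_softmax_cross_entropy_with_logits(logits=average_y, labels=tf.arg_max(y_, 1))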