PyTorch adapt binary classification model to output probabilities of both classes - python

My dataset has 14 features and a target containing {0,1}.
I have trained this binary classifier:
class SimpleBinaryClassifier(nn.Module):
    """Small fully connected network that emits one raw score per sample.

    The forward pass applies no sigmoid; the returned value is the direct
    output of the last linear layer.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 32)
        self.dropout = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(32, 1)

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        # Dropout is applied only once, right before the output layer.
        return self.fc3(self.dropout(hidden))
with the following criterion and training loop:
criterion = nn.BCEWithLogitsLoss()


def binary_acc(y_pred, y_test):
    """Return the batch accuracy as a whole-number percentage (0-d tensor).

    ``y_pred`` is passed through a sigmoid and rounded to 0/1 before it is
    compared element-wise with ``y_test``.
    """
    predicted = torch.sigmoid(y_pred).round()
    n_correct = (predicted == y_test).sum().float()
    return torch.round(n_correct / y_test.shape[0] * 100)
# Put the model in training mode before iterating over the epochs.
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)

        # Loss and accuracy both receive the targets with a trailing
        # dimension added via unsqueeze(1).
        loss = criterion(y_pred, y_batch.unsqueeze(1))
        acc = binary_acc(y_pred, y_batch.unsqueeze(1))

        loss.backward()
        optimizer.step()

        # Accumulate per-batch values; they are averaged over batches below.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
This model, when called like sigmoid(model(input_tensor)), outputs a single number in [0,1]. The pipeline I'm working with expects the model to output the probabilities of both classes, [p_class1, p_class2].
How can I adapt the model and the training loop?
If I set the output of the last layer to 2, I have problems with the criterion inside the training loop.
class SimpleBinaryClassifier2(nn.Module):
    """Same network as the single-output classifier, but with two output units.

    The forward pass returns the raw output of the last linear layer, one
    value per class; no softmax is applied here.
    """

    def __init__(self, input_shape):
        super().__init__()
        self.fc1 = nn.Linear(input_shape, 64)
        self.fc2 = nn.Linear(64, 32)
        self.dropout = nn.Dropout(p=0.1)
        self.fc3 = nn.Linear(32, 2)  # two outputs, one per class

    def forward(self, x):
        hidden = torch.relu(self.fc1(x))
        hidden = torch.relu(self.fc2(hidden))
        return self.fc3(self.dropout(hidden))
I use the CrossEntropy
# Build the two-output model and move it to the target device.
model = SimpleBinaryClassifier2(input_shape=14)
model.to(device)
print(model)

# Multi-class loss over the two outputs, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
and replace y_pred_tag = torch.round(torch.sigmoid(y_pred)) with argmax(softmax)
def binary_acc2(y_pred, y_test):
    """Return the batch accuracy (whole-number percentage) for two-class logits.

    Args:
        y_pred: tensor of per-class scores; the class axis is ``dim=1``.
        y_test: tensor of class indices compared element-wise with the
            predicted indices.

    Returns:
        A 0-d tensor holding the rounded percentage of correct predictions.
    """
    # torch.softmax requires an explicit ``dim``; omitting it raises a
    # TypeError. Softmax is monotonic, so the argmax equals that of the logits.
    y_pred_tag = torch.argmax(torch.softmax(y_pred, dim=1), dim=1)
    correct_results_sum = (y_pred_tag == y_test).sum().float()
    acc = correct_results_sum / y_test.shape[0]
    return torch.round(acc * 100)
Then the training loop raises an error:
# Put the model in training mode before iterating over the epochs.
model.train()
for e in range(1, EPOCHS+1):
    epoch_loss, epoch_acc = 0, 0

    for X_batch, y_batch in train_loader:
        X_batch = X_batch.to(device)
        y_batch = y_batch.to(device)

        optimizer.zero_grad()
        y_pred = model(X_batch)

        # NOTE(review): with nn.CrossEntropyLoss the targets are passed as-is
        # (no unsqueeze); they are expected to be class indices of dtype long
        # - confirm the dtype produced by the dataset.
        loss = criterion(y_pred, y_batch)
        # NOTE(review): this still calls binary_acc, not binary_acc2 - confirm
        # which accuracy helper is intended for the two-output model.
        acc = binary_acc(y_pred, y_batch)

        loss.backward()
        optimizer.step()

        # Accumulate per-batch values; they are averaged over batches below.
        epoch_loss += loss.item()
        epoch_acc += acc.item()

    print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
The error is the following:
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
I already looked up on this other post where the cause of that error was that the element was a Float and not a tensor, but in my case the datasets are tensors:
# Both the features and the targets are wrapped as torch.FloatTensor before
# being handed to GenericDataset.
# NOTE(review): nn.CrossEntropyLoss takes class-index targets of dtype long -
# confirm whether the targets should be float here.
train_dataset = GenericDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_dataset = GenericDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))

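According to the nn.CrossEntropyLoss documentation, the target must contain class indices of dtype long, not float, whereas in your train_dataset you explicitly convert it to float with torch.FloatTensor(y_train). Converting the targets with torch.LongTensor(y_train) (or calling .long() on them) resolves the error.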

Related

For categorical class RuntimeError: 0D or 1D target tensor expected, multi-target not supported

I have 28 features and the target variable is categorical (0-8), i.e. there are 9 target classes.
Data sample:
X_train.shape,y_train.shape
output --((640, 28), (640, 1))
X_train[0]
output --array([0.4546875 , 0.63958333, 0.46875 , 0.62916667, 0.4859375 ,
0.62916667, 0.5015625 , 0.64166667, 0.4859375 , 0.65 ,
0.4671875 , 0.65 , 0.478125 , 0.6375 , 0.5625 ,
0.64166667, 0.5765625 , 0.62708333, 0.5921875 , 0.62708333,
0.60625 , 0.63541667, 0.59375 , 0.64583333, 0.5765625 ,
0.64791667, 0.58125 , 0.63541667])
y_train[0]
output --array([1])
defined data generator and model like below
class ClassifierDataset(Dataset):
    """Pairs a feature container with a label container, index by index."""

    def __init__(self, X_data, y_data):
        self.X_data, self.y_data = X_data, y_data

    def __len__(self):
        # The dataset length follows the feature container.
        return len(self.X_data)

    def __getitem__(self, index):
        features = self.X_data[index]
        label = self.y_data[index]
        return features, label
# Wrap each split: features are converted to float tensors, labels to long tensors.
# NOTE(review): y_train is reported with shape (640, 1), so label batches keep
# a trailing dimension - confirm this is what the loss expects.
train_dataset = ClassifierDataset(
    torch.from_numpy(X_train).float(),
    torch.from_numpy(y_train).long(),
)
val_dataset = ClassifierDataset(
    torch.from_numpy(X_val).float(),
    torch.from_numpy(y_val).long(),
)
test_dataset = ClassifierDataset(
    torch.from_numpy(X_test).float(),
    torch.from_numpy(y_test).long(),
)

# Hyperparameters.
EPOCHS = 150
BATCH_SIZE = 32
LEARNING_RATE = 0.0007
NUM_FEATURES = len(X[0])
NUM_CLASSES = 9

# Only the training loader shuffles; validation and test use batches of one.
train_loader = DataLoader(dataset=train_dataset, batch_size=BATCH_SIZE, shuffle=True)
val_loader = DataLoader(dataset=val_dataset, batch_size=1)
test_loader = DataLoader(dataset=test_dataset, batch_size=1)
class MulticlassClassification(nn.Module):
    """MLP with three hidden layers (512, 128, 64) and one output per class.

    Every hidden layer is followed by batch normalisation and ReLU; dropout is
    applied after the second and third hidden layers. The forward pass returns
    the raw output of the final linear layer.
    """

    def __init__(self, num_feature, num_class):
        super(MulticlassClassification, self).__init__()

        self.layer_1 = nn.Linear(num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, num_class)

        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)

    def forward(self, x):
        # First hidden layer: no dropout.
        h = self.relu(self.batchnorm1(self.layer_1(x)))
        # Second and third hidden layers: dropout after the activation.
        h = self.dropout(self.relu(self.batchnorm2(self.layer_2(h))))
        h = self.dropout(self.relu(self.batchnorm3(self.layer_3(h))))
        return self.layer_out(h)
Defined loss and batch size as:
# Instantiate the network and move it to the target device.
model = MulticlassClassification(num_feature=NUM_FEATURES, num_class=NUM_CLASSES)
model.to(device)

# Cross-entropy objective, optimised with Adam.
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=LEARNING_RATE)

print(model)
Defined function for multi accuracy class
def multi_acc(y_pred, y_test):
    """Return the batch accuracy as a whole-number percentage (0-d tensor).

    The predicted class is the index of the largest log-softmax value along
    ``dim=1``; predictions are compared element-wise with ``y_test``.
    """
    log_probs = torch.log_softmax(y_pred, dim=1)
    predicted = torch.max(log_probs, dim=1)[1]
    hits = (predicted == y_test).float()
    return torch.round(hits.sum() / len(hits) * 100)
Started training like this
# Per-epoch history of the averaged metrics.
accuracy_stats = {'train': [], "val": []}
loss_stats = {'train': [], "val": []}

print("Begin training.")
for e in tqdm(range(1, EPOCHS+1)):
    # ---- training pass ----
    train_epoch_loss = 0
    train_epoch_acc = 0
    model.train()
    for X_train_batch, y_train_batch in train_loader:
        print(X_train_batch.shape, y_train_batch.shape)
        X_train_batch = X_train_batch.to(device)
        y_train_batch = y_train_batch.to(device)

        optimizer.zero_grad()
        y_train_pred = model(X_train_batch)
        # Earlier attempt, kept for reference:
        # y_train_pred = y_train_pred.unsqueeze(1)

        # Debug output: shapes and raw values of predictions and targets.
        print(y_train_pred.shape,y_train_batch.shape)
        print(y_train_batch)
        print(y_train_pred)

        # Earlier attempt, kept for reference:
        # train_loss = criterion(y_train_pred, torch.max(y_train_batch,1)[1])
        train_loss = criterion(y_train_pred, y_train_batch)
        train_acc = multi_acc(y_train_pred, y_train_batch)

        train_loss.backward()
        optimizer.step()

        train_epoch_loss += train_loss.item()
        train_epoch_acc += train_acc.item()

    # ---- validation pass (no gradients) ----
    val_epoch_loss = 0
    val_epoch_acc = 0
    with torch.no_grad():
        model.eval()
        for X_val_batch, y_val_batch in val_loader:
            X_val_batch = X_val_batch.to(device)
            y_val_batch = y_val_batch.to(device)

            y_val_pred = model(X_val_batch)
            # Earlier attempt, kept for reference:
            # val_loss = criterion(y_val_pred, torch.max(y_val_batch,1)[1])
            val_loss = criterion(y_val_pred, y_val_batch)
            val_acc = multi_acc(y_val_pred, y_val_batch)

            val_epoch_loss += val_loss.item()
            val_epoch_acc += val_acc.item()

    # Record the batch-averaged metrics for this epoch.
    loss_stats['train'].append(train_epoch_loss/len(train_loader))
    loss_stats['val'].append(val_epoch_loss/len(val_loader))
    accuracy_stats['train'].append(train_epoch_acc/len(train_loader))
    accuracy_stats['val'].append(val_epoch_acc/len(val_loader))
    print(f'Epoch {e+0:03}: | Train Loss: {train_epoch_loss/len(train_loader):.5f} | Val Loss: {val_epoch_loss/len(val_loader):.5f} | Train Acc: {train_epoch_acc/len(train_loader):.3f}| Val Acc: {val_epoch_acc/len(val_loader):.3f}')
This is the error I am getting:
RuntimeError Traceback (most recent call last)
<ipython-input-529-1d57dbd350e4> in <module>
17 print(y_train_pred)
18 # train_loss = criterion(y_train_pred, torch.max(y_train_batch,1)[1])
---> 19 train_loss = criterion(y_train_pred, y_train_batch)
20 train_acc = multi_acc(y_train_pred, y_train_batch)
21
2 frames
/usr/local/lib/python3.8/dist-packages/torch/nn/functional.py in nll_loss(input, target, weight, size_average, ignore_index, reduce, reduction)
2699 if size_average is not None or reduce is not None:
2700 reduction = _Reduction.legacy_get_string(size_average, reduce)
-> 2701 return torch._C._nn.nll_loss_nd(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
2702
2703
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
Any idea how to correct this? I've been stuck for a long time
The y_train_batch passed to criterion(y_train_pred, y_train_batch), where criterion is nn.CrossEntropyLoss (which calls nll_loss internally, as the traceback shows), should have the shape [batch_size] and contain class indices in the range [0, nb_classes-1]. However, according to your explanation, y_train_batch has the shape [batch_size, 1]. Therefore, to solve your problem, you should replace the line train_loss = criterion(y_train_pred, y_train_batch) in your code with:
# Drop the trailing singleton dimension so the target becomes 1-D.
train_loss = criterion(y_train_pred, y_train_batch.squeeze(-1))
or with:
# View the target as a 1-D tensor of length batch_size.
train_loss = criterion(y_train_pred, y_train_batch.view(y_train_batch.size(0)))
or with:
# Reshape the target to a 1-D tensor of length batch_size.
train_loss = criterion(y_train_pred, y_train_batch.reshape(y_train_batch.size(0)))

How predict next word using LSTM model?

I am currently building an LSTM model in Pytorch to predict the next word of a given input.
My model:
class LSTM(nn.Module):
    """Language model: embedding -> stacked LSTM -> linear layer over the vocabulary.

    Args:
        vocab_size: number of entries in the embedding table and of output scores.
        embedding_dim: size of each embedding vector.
        hidden_dim: size of the LSTM hidden state.
        num_layers: number of stacked LSTM layers.
        dropout_rate: dropout probability passed to ``nn.LSTM`` and ``nn.Dropout``.
        tie_weights: if true, the output layer shares its weight with the
            embedding (requires ``embedding_dim == hidden_dim``).

    Raises:
        AssertionError: if ``tie_weights`` is set and the two dimensions differ.
    """

    def __init__(self, vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights):
        super().__init__()
        self.num_layers = num_layers
        self.hidden_dim = hidden_dim
        self.embedding_dim = embedding_dim
        self.embedding = nn.Embedding(vocab_size, embedding_dim)
        self.lstm = nn.LSTM(embedding_dim, hidden_dim, num_layers=num_layers,
                            dropout=dropout_rate, batch_first=True)
        # NOTE: this dropout module is created but not applied in forward().
        self.dropout = nn.Dropout(dropout_rate)
        self.linear = nn.Linear(hidden_dim, vocab_size)
        if tie_weights:
            # Embedding and hidden layer need to be the same size for weight tying.
            assert embedding_dim == hidden_dim, 'cannot tie, check dims'
            self.linear.weight = self.embedding.weight
        self.init_weights()

    def forward(self, x):
        # x is a batch of input sequences (token indices); the LSTM is
        # configured with batch_first=True.
        x = self.embedding(x)
        x, _ = self.lstm(x)
        x = self.linear(x)
        return x

    def init_weights(self):
        """Re-initialise embedding, output layer and LSTM weights in place."""
        init_range_emb = 0.1
        init_range_other = 1 / math.sqrt(self.hidden_dim)
        with torch.no_grad():
            self.embedding.weight.uniform_(-init_range_emb, init_range_emb)
            self.linear.weight.uniform_(-init_range_other, init_range_other)
            self.linear.bias.zero_()
            # Assigning into ``self.lstm.all_weights[i][k]`` only mutates a
            # temporary list and leaves the parameters untouched, so the
            # per-layer parameters are filled in place instead; this also
            # keeps their correct (4 * hidden_dim, ...) shapes.
            for i in range(self.num_layers):
                for prefix in ("weight_ih_l", "weight_hh_l"):
                    getattr(self.lstm, f"{prefix}{i}").uniform_(
                        -init_range_other, init_range_other
                    )
# Model hyperparameters; embedding and hidden sizes match because the
# weights are tied.
vocab_size = len(vocab)
embedding_dim, hidden_dim = 100, 100
num_layers = 2
dropout_rate = 0.4
tie_weights = True

model = LSTM(vocab_size, embedding_dim, hidden_dim, num_layers, dropout_rate, tie_weights)
model.to(device)
Training and evaluation functions:
import copy
import time
criterion = nn.CrossEntropyLoss()

# Plain SGD with a step-wise learning-rate decay (factor 0.95 per scheduler step).
lr = 20.0  # learning rate
optimizer = torch.optim.SGD(model.parameters(), lr=lr)
scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=1.0, gamma=0.95)
def train(model: nn.Module) -> None:
    """Run one pass over ``train_data`` in chunks of ``bptt``.

    Gradients are clipped to a norm of 0.5 before each optimizer step, and a
    progress line is printed every ``log_interval`` batches.
    """
    model.train()  # turn on train mode
    log_interval = 200
    num_batches = len(train_data) // bptt
    running_loss = 0.
    tick = time.time()

    for batch, i in enumerate(range(0, train_data.size(0) - 1, bptt)):
        data, targets = get_batch(train_data, i)
        output = model(data)
        loss = criterion(output.view(-1, vocab_size), targets)

        optimizer.zero_grad()
        loss.backward()
        torch.nn.utils.clip_grad_norm_(model.parameters(), 0.5)
        optimizer.step()

        running_loss += loss.item()
        # Only report on non-zero multiples of the logging interval.
        if batch == 0 or batch % log_interval != 0:
            continue

        lr = scheduler.get_last_lr()[0]
        ms_per_batch = (time.time() - tick) * 1000 / log_interval
        cur_loss = running_loss / log_interval
        ppl = math.exp(cur_loss)
        print(f'| epoch {epoch:3d} | {batch:5d}/{num_batches:5d} batches | '
              f'lr {lr:02.2f} | ms/batch {ms_per_batch:5.2f} | '
              f'loss {cur_loss:5.2f} | ppl {ppl:8.2f}')
        # Start a fresh logging window.
        running_loss = 0
        tick = time.time()
def evaluate(model: nn.Module, eval_data: Tensor) -> float:
    """Return the loss over ``eval_data``, weighted by chunk length.

    Each chunk's loss is multiplied by its first-dimension size, and the sum
    is divided by ``len(eval_data) - 1``.
    """
    model.eval()  # turn on evaluation mode
    weighted_loss = 0.
    with torch.no_grad():
        for offset in range(0, eval_data.size(0) - 1, bptt):
            data, targets = get_batch(eval_data, offset)
            chunk_len = data.size(0)
            logits = model(data).view(-1, vocab_size)
            weighted_loss += chunk_len * criterion(logits, targets).item()
    return weighted_loss / (len(eval_data) - 1)
Training loop
epochs = 50
best_model = None
best_val_loss = float('inf')

for epoch in range(1, epochs + 1):
    epoch_start_time = time.time()
    train(model)
    val_loss = evaluate(model, val_data)
    val_ppl = math.exp(val_loss)
    elapsed = time.time() - epoch_start_time

    separator = '-' * 89
    print(separator)
    print(f'| end of epoch {epoch:3d} | time: {elapsed:5.2f}s | '
          f'valid loss {val_loss:5.2f} | valid ppl {val_ppl:8.2f}')
    print(separator)

    # Keep a snapshot of the model with the lowest validation loss so far.
    if val_loss < best_val_loss:
        best_val_loss, best_model = val_loss, copy.deepcopy(model)

    scheduler.step()
My problem is that I have no idea how to go about this. I've seen some implementations of character-based LSTM text generators, but I'm looking for a word-based one. For example, I want to pass an input like "How are you" and have the output include the next predicted word, for example "How are you today".
Any help appreciated.
I would suggest to try the example in the attached link(https://www.kaggle.com/code/ysthehurricane/next-word-prediction-bi-lstm-tutorial-easy-way).
You can download the dataset from the attached link below.
(https://www.kaggle.com/datasets/dorianlazar/medium-articles-dataset)
It tries to predict the next word using Bi-directional LSTM architecture. I think that this example mostly suits to your needs, which will give you an idea to proceed further.
You can follow the instruction provided in the first link.

Using ray tune `tune.run` with pytorch returns different optimal hyperparameters combination

I've initialized two identical ANNs with PyTorch (identical in both structure and initial parameters), and I've noticed that hyperparameter tuning with Ray Tune returns different results for the two ANNs, even though I didn't use any random initialization.
Someone could explain what I'm doing wrong? I'll attach the code:
ANN Initialization:
class Featrues_model(nn.Module):
    """Two stacked linear layers with no activation in between."""

    def __init__(self, n_inputs, dim_hidden, n_outputs):
        super().__init__()
        self.fc1 = nn.Linear(n_inputs, dim_hidden)
        self.fc2 = nn.Linear(dim_hidden, n_outputs)

    def forward(self, X):
        return self.fc2(self.fc1(X))
# Build two models with the same layer sizes, then copy the parameters of
# v1 into v2 so both start from identical weights.
features_model_v1 = Featrues_model(len(list_input_variables),5,6)
features_model_v2 = Featrues_model(len(list_input_variables),5,6)
features_model_v2.load_state_dict(features_model_v1.state_dict())
Hyperparameter settings
# Search space: every hyperparameter is drawn from two candidate values.
config = dict(
    lr=tune.choice([1e-2, 1e-5]),
    weight_decay=tune.choice([1e-2, 1e-5]),
    batch_size=tune.choice([16, 64]),
    epochs=tune.choice([10, 50]),
)
Train & Validation Dataframe
# Random 80/20 split of the row positions; the resulting index lists are then
# used to slice the original dataframe.
trainset = df_final.copy()
test_abs = int(len(trainset) * 0.8)
split_sizes = [test_abs, len(trainset) - test_abs]
train_subset, val_subset = random_split(trainset, split_sizes)

df_train = df_final.iloc[train_subset.indices]
df_val = df_final.iloc[val_subset.indices]
Train function design
def setting_model(config, df_train, df_val, model):
    """Train ``model`` with the hyperparameters in ``config`` and report to Ray Tune.

    Args:
        config: mapping with the keys ``"lr"``, ``"weight_decay"``,
            ``"batch_size"`` and ``"epochs"``.
        df_train: dataframe with the training rows; inputs are read from the
            ``list_input_variables`` columns and labels from the
            ``list_output_variables`` columns.
        df_val: dataframe with the validation rows (same columns).
        model: the network to optimise; it is updated in place.

    After every epoch the validation loss and accuracy are computed on the
    whole of ``df_val`` and passed to ``tune.report``.
    """
    criterion = nn.CrossEntropyLoss()
    optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
    BATCH_SIZE = config["batch_size"]

    def _to_tensors(frame):
        # Convert whole arrays at once instead of building tensors from
        # Python lists of rows; labels are flattened to 1-D class indices.
        inputs = torch.Tensor(np.array(frame[list_input_variables]))
        labels = torch.Tensor(np.array(frame[list_output_variables]))
        return inputs, labels.reshape(-1).type(torch.int64)

    for epoch in range(config["epochs"]):
        train_epoch_loss = 0
        train_epoch_acc = 0
        step = 0
        for i in tqdm(range(0, df_train.shape[0], BATCH_SIZE)):
            batch_X, batch_Y = _to_tensors(df_train.iloc[i:i+BATCH_SIZE])

            optimizer.zero_grad()
            outputs = model(batch_X)
            train_loss = criterion(outputs, batch_Y)
            train_acc = multi_acc(outputs, batch_Y)
            train_loss.backward()
            optimizer.step()

            train_epoch_loss += train_loss.item()
            train_epoch_acc += train_acc.item()
            step += 1

        # print statistics
        # The criterion already averages within a batch, so the epoch figure
        # is the mean over batches (not over rows).
        print(f"Epochs: {epoch}")
        print(f"Train Loss: {train_epoch_loss/step}")
        print(f"Train Acc: {train_epoch_acc/step}")
        print("\n")

        # Validation loss
        with torch.no_grad():
            X_val, Y_val = _to_tensors(df_val)
            outputs = model(X_val)
            _, predicted = torch.max(outputs, 1)
            total = Y_val.size(0)
            correct = (predicted == Y_val).sum().item()
            # Mean-reduced loss over the validation set; no further division needed.
            val_loss = criterion(outputs, Y_val).item()

        tune.report(loss=val_loss, accuracy=correct / total)
        print(f"Validation Loss: {val_loss}")
        print(f"Validation Acc: {correct / total:.3f}")

    print("Finished Training")
Hyperparameters Tune
# Launch one Ray Tune experiment per model; both share the same search space
# and data, and differ only in the model bound through functools.partial.
# NOTE(review): ``config`` is built from tune.choice entries and no
# ``num_samples`` is passed here, so each call presumably draws its own
# configuration(s) independently - confirm against the Ray Tune documentation.
result_v1 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v1),
config=config,
fail_fast="raise",
)
result_v2 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v2),
config=config,
fail_fast="raise"
)
Output
result_v1.get_best_config()
{'lr': 1e-05, 'weight_decay': 1e-05, 'epochs': 1}
result_v2.get_best_config()
{'lr': 0.01, 'weight_decay': 1e-05, 'epochs': 1}
The issue is the use of torch.random under the hood. Since you are not directly providing a weight matrix for your layers, pytorch initializes it for you. Luckily, you can have a reproducible experiment by setting
torch.manual_seed(x) # where x is an integer
One should use only a few random seeds, otherwise you might overfit on the random seed. (See the lottery ticket hypothesis at https://arxiv.org/abs/1803.03635.)

Input contains NaN, infinity or a value too large for dtype('float32'). PyTorch

I try to train model but in vain. I see the error
Input contains NaN, infinity or a value too large for dtype('float32').
I think it may be connected with the MSE loss function, because with MAE it works somehow, and with RMSE it also works somehow (on the second epoch I get RMSE = 10). I can't figure out what I am doing wrong.
# Count Nan
# Load the raw table: column 0 is the target, the remaining columns are features.
df = pd.read_csv('data.txt.zip', header=None)
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# The first `train_size` rows form the training portion, the rest the test portion.
train_size = 463715
X_train, X_test = X[:train_size, :], X[train_size:, :]
y_train, y_test = y[:train_size], y[train_size:]

# Convert every array to a float tensor.
X_train = torch.FloatTensor(X_train)
y_train = torch.FloatTensor(y_train)
X_test = torch.FloatTensor(X_test)
y_test = torch.FloatTensor(y_test)

# Pair features with targets.
train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)

# Randomly split the training portion into training and validation subsets.
train_num = 370972
val_num = 92743
train_ds, val_ds = random_split(train_ds, [train_num, val_num])
# Evaluate accuracy
def accuracy(y_true, y_pred):
    """Score the predictions against the true values with ``r2_score``."""
    score = r2_score(y_true, y_pred)
    return score
# create Class
class BaselineModel(nn.Module):
    """Regression network: linear -> ELU -> linear.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        output_size: number of regression outputs.

    The layer sizes follow the constructor arguments instead of being
    hard-coded to 90/45/1. Several extra modules are created but not used by
    ``forward``; they are kept so the module's attributes and state dict stay
    unchanged for the original 90/45/1 configuration.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super(BaselineModel, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        # Not used by forward().
        self.linear3 = nn.Linear(hidden_size, 15)
        self.linear4 = nn.Linear(15, 1)
        self.batch = nn.BatchNorm2d(hidden_size)
        self.relu = nn.ReLU()
        self.lreku = nn.LeakyReLU()
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.elu(self.linear1(x))
        return self.linear2(x)

    def training_step(self, criterion, batch):
        """Return the loss of one training batch."""
        x_train, y_train = batch
        y_pred = self(x_train)
        # The targets get a trailing dimension before being compared with
        # the predictions.
        return criterion(y_pred, y_train.unsqueeze(1))

    def validation_step(self, criterion, batch):
        """Return the loss and the ``accuracy`` score of one validation batch."""
        x_val, y_val = batch
        y_pred = self(x_val)
        loss = criterion(y_pred, y_val.unsqueeze(1))
        acc = accuracy(y_val, y_pred)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, y_pred):
        """Average the per-batch results produced by ``validation_step``."""
        batch_losses = [x['val_loss'] for x in y_pred]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in y_pred]
        epoch_acc = np.mean(batch_accs)
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the validation metrics of one epoch."""
        print(f"Epoch {epoch}, val_loss: {result['val_loss']}, val_acc: {result['val_acc']} ")
# Instantiate the model with 90 inputs, a hidden size of 45 and a single output.
model = BaselineModel(input_size = 90, hidden_size = 45, output_size = 1)
# Evaluate
def evaluate(model, criterion, val_loader):
    """Run ``validation_step`` on every batch without gradients and aggregate."""
    step_results = []
    with torch.no_grad():
        for batch in val_loader:
            step_results.append(model.validation_step(criterion, batch))
    return model.validation_epoch_end(step_results)
# Train
def train(model, criterion, optimizer, train_loader, val_loader, lr, epochs):
    """Train ``model`` for ``epochs`` epochs and validate after each one.

    Args:
        model: object providing ``training_step``, ``validation_step``,
            ``validation_epoch_end`` and ``epoch_end``.
        criterion: loss function handed to the model's step methods.
        optimizer: optimizer updating the model's parameters.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches.
        lr: unused here (the optimizer already carries its learning rate);
            kept for interface compatibility.
        epochs: number of passes over ``train_loader``.

    Returns:
        The list of per-epoch validation results. The list used to be built
        and then discarded; returning it is backward-compatible for callers
        that ignore the return value.
    """
    history = []
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(criterion, batch)
            loss.backward()
            optimizer.step()
        result = evaluate(model, criterion, val_loader)
        model.epoch_end(epoch, result)
        history.append(result)
    return history
# Batch the training and validation subsets (both shuffled), 128 samples at a time.
batch_size = 128
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

# Optimisation settings: MSE objective, SGD with momentum.
epochs = 10
lr = 0.05
criterion = F.mse_loss
optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9)

train(model, criterion, optimizer, train_loader, val_loader, lr, epochs)
Yes, it is because of your loss function. If the value of the loss function becomes very small or very large after some epochs, you will face this error when that value is used in backpropagation to train the model. To handle that, you should use early stopping to halt the training. To do so, implement a callback: callbacks provide a way to execute code and interact with the training process automatically.

Function 'CudnnConvolutionBackward' returned nan values in its 1th output

I'm trying to train this simple convolutional model:
class Modello1(nn.Module):
    """Three Conv1d + max-pool stages followed by three linear layers."""

    # model structure
    def __init__(self, in_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, hidden_size5, out_size):
        super().__init__()
        kernel_size = 3  # 3
        stride = 3  # 2
        padding = 3  # 3
        self.conv1 = nn.Conv1d(in_size, hidden_size1, kernel_size, stride, padding)
        self.pool = nn.MaxPool1d(kernel_size, padding = 1, stride = 3)
        self.conv2 = nn.Conv1d(hidden_size1, hidden_size2, kernel_size, stride, padding)
        self.conv3 = nn.Conv1d(hidden_size2, hidden_size3, kernel_size, stride, padding)
        # IMPORTANT: should a flattening layer be added here?
        self.linear1 = nn.Linear(hidden_size3, hidden_size4)
        self.linear2 = nn.Linear(hidden_size4, hidden_size5)
        self.linear3 = nn.Linear(hidden_size5, out_size)

    # the data flows through the model
    def forward(self, input_data):
        # Treat every feature as a channel of length 1, then cast to float.
        out = input_data.reshape(input_data.shape[0], input_data.shape[1], 1).float()
        # Each stage is conv -> ReLU -> pool, reshaped back to length 1.
        # (Original notes on the stages: 70, 33 and 33 neurons - unverified.)
        for conv in (self.conv1, self.conv2, self.conv3):
            out = self.pool(F.relu(conv(out)))
            out = out.reshape(out.shape[0], out.shape[1], 1)
        # Drop the length-1 axis before the fully connected layers.
        out = out.reshape(out.shape[0], out.shape[1])
        out = F.relu(self.linear1(out))
        out = F.relu(self.linear2(out))
        return self.linear3(out)

    # compute the loss and accuracy of one batch
    def validation_step(self, batch):
        input_data, targets = batch
        out = self(input_data)
        # Flatten the targets to one class index per sample.
        targets = targets.view(targets.shape[0]).long()
        return {'val_loss': criterion(out, targets), 'val_acc': accuracy(out, targets)}

    # loss and accuracy of each epoch
    def validation_epoch_end(self, outputs):
        losses = torch.stack([item['val_loss'] for item in outputs])
        accs = torch.tensor([item['val_acc'] for item in outputs])
        # Both metrics are the mean over the batches.
        return {'val_loss': losses.mean().item(), 'val_acc': torch.mean(accs)}
I use the torch.autograd.set_detect_anomaly(True) function to check for anomalies in loss.backward(), and as soon as I start the training process I obtain this error: Function 'CudnnConvolutionBackward' returned nan values in its 1th output. Does anyone have any idea why this happens?
Here is the rest of the code:
accuracy function
def accuracy(outputs, targets):
    """Return the percentage of samples whose arg-max prediction matches the target.

    Args:
        outputs: tensor of per-class scores; the class axis is ``dim=1``.
        targets: tensor of class indices, one per sample.

    Returns:
        A Python float in [0, 100]; ``0.0`` for an empty batch.
    """
    total = targets.shape[0]
    if total == 0:
        # Avoid a division by zero on an empty batch.
        return 0.0
    # No CUDA-only scratch tensor is needed, so this also runs on CPU.
    preds = torch.argmax(outputs, dim=1)
    correct = (preds == targets.long()).sum().item()
    return (correct / total) * 100
Criterion and optimizer
# Cross-entropy objective, optimised by SGD with momentum over the model's parameters.
criterion = nn.CrossEntropyLoss()
optimizer = optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
Evaluation and fit function
def evaluate(model, val_loader):
    """Run ``model.validation_step`` on every batch and aggregate the results.

    Args:
        model: object providing ``validation_step`` and ``validation_epoch_end``.
        val_loader: iterable of validation batches.

    Returns:
        Whatever ``model.validation_epoch_end`` returns for the batch results.
    """
    # Validation needs no gradients; disabling them avoids building an
    # autograd graph for every validation batch.
    with torch.no_grad():
        outputs = [model.validation_step(batch) for batch in val_loader]
    return model.validation_epoch_end(outputs)
def fit(epochs, model, train_loader, val_loader):
    """Train for ``epochs`` epochs, validating and logging after each one.

    Returns the list of per-epoch validation results.
    """
    history = []  # stores loss and accuracy for each epoch
    for epoch in range(epochs):
        since = time.time()
        running_loss = 0.0

        for inputs, targets in train_loader:
            optimizer.zero_grad()
            # forward + backward + optimize
            outputs = model(inputs)
            # Flatten the targets to one class index per sample.
            targets = targets.view(targets.shape[0]).long()
            loss = criterion(outputs, targets)
            loss.backward()
            optimizer.step()

        # validation phase
        result = evaluate(model, val_loader)
        history.append(result)
        running_loss += loss.item()

        time_elapsed = time.time() - since
        print("Epoch [{}], val_loss: {:.4f}, val_acc: {:.4f}%".format(epoch, result['val_loss'], result['val_acc']))
        print('Training complete in {:.0f}m {:.0f}s'.format(time_elapsed // 60, time_elapsed % 60))
        print('-' * 10)
        running_loss = 0.0
    return history
Training
# Build the model from the configured layer sizes and train it for 10 epochs,
# keeping the per-epoch validation results in `history`.
model = Modello1(input_size, hidden_size1, hidden_size2, hidden_size3, hidden_size4, hidden_size5, out_size)
history = fit(10, model, train_loader, val_loader)

Categories

Resources