I want to train a 1D CNN on time series. I get the following error message: "1D target tensor expected, multi-target not supported".
Here is the code with simulated data corresponding to the structures of my data as well as the error message
import torch
from torch.utils.data import DataLoader
import torch.utils.data as data
import torch.nn as nn
import numpy as np
import random
from tqdm.notebook import tqdm
# Use the GPU when torch can see one, otherwise fall back to the CPU.
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

# Simulated training set: 20 random series of 500 samples with binary labels.
n_item = 20
train_dataset = np.asarray(
    [np.random.uniform(-10, 10, 500) for _ in range(n_item)]
)
ecg_train = torch.from_numpy(train_dataset).float()
labels_train = torch.from_numpy(np.random.randint(2, size=n_item)).long()

# Simulated validation set: 10 random series built the same way.
n_item = 10
val_dataset = np.asarray(
    [np.random.uniform(-10, 10, 500) for _ in range(n_item)]
)
ecg_validation = torch.from_numpy(val_dataset).float()
labels_validation = torch.from_numpy(np.random.randint(2, size=n_item)).long()
class ECGNet(data.Dataset):
    """Map-style dataset pairing each signal with its label.

    Every label is returned reshaped to shape ``(1,)``.
    """

    def __init__(self, ecgs, labls, transform=None):
        self.ecg, self.target, self.transform = ecgs, labls, transform

    def __len__(self):
        return len(self.ecg)

    def __getitem__(self, idx):
        signal = self.ecg[idx]
        label = self.target[idx].reshape(1)
        return signal, label
# Wrap the simulated tensors in datasets and report their sizes.
train_data = ECGNet(ecg_train, labels_train)
print("size of Training dataset: {}".format(len(train_data)))

validation_data = ECGNet(ecg_validation, labels_validation)
print("size of Training dataset: {}".format(len(validation_data)))

# Both loaders share the same batching options.
batch_size = 1
loader_options = dict(batch_size=batch_size, shuffle=True, num_workers=0)
train_dataloader = DataLoader(dataset=train_data, **loader_options)
val_dataloader = DataLoader(dataset=validation_data, **loader_options)
def train_epoch(model, train_dataloader, optimizer, loss_fn):
    """Run one optimisation pass over ``train_dataloader``.

    Returns:
        (mean_loss, accuracy): the mean of the per-batch losses and the
        percentage of correct predictions over ``train_dataloader.dataset``.
    """
    batch_losses = []
    n_correct = 0

    for images, labels in tqdm(train_dataloader):
        images, labels = images.to(device), labels.to(device)

        output = model(images)

        # Standard step: clear old gradients, backpropagate, update.
        optimizer.zero_grad()
        loss = loss_fn(output, labels)
        loss.backward()
        optimizer.step()

        batch_losses.append(loss.item())
        n_correct += (output.argmax(dim=1) == labels).sum().item()

    accuracy = 100.0 * n_correct / len(train_dataloader.dataset)
    mean_loss = np.array(batch_losses).mean()
    return mean_loss, accuracy
def evaluate(model, dataloader, loss_fn):
    """Score ``model`` on ``dataloader`` without tracking gradients.

    Returns:
        (mean_loss, accuracy): the mean of the per-batch losses and the
        percentage of correct predictions over ``dataloader.dataset``.
    """
    batch_losses = []
    n_correct = 0

    with torch.no_grad():
        for images, labels in dataloader:
            images = images.to(device)
            labels = labels.to(device=device, dtype=torch.int64)

            output = model(images)
            loss = loss_fn(output, labels)

            n_correct += (output.argmax(dim=1) == labels).sum().item()
            batch_losses.append(loss.item())

    mean_loss = np.array(batch_losses).mean()
    accuracy = 100.0 * n_correct / len(dataloader.dataset)
    return mean_loss, accuracy
def train(model, train_dataloader, val_dataloader, optimizer, n_epochs, loss_function):
    """Train ``model`` for ``n_epochs`` epochs, validating after each one.

    Args:
        model: network to optimise.
        train_dataloader: loader passed to ``train_epoch``.
        val_dataloader: loader passed to ``evaluate``.
        optimizer: optimiser used for the parameter updates.
        n_epochs: number of passes over the training data.
        loss_function: criterion used for both training and validation.

    Returns:
        Four lists with one entry per epoch: train losses, validation
        losses, train accuracies, validation accuracies.
    """
    train_losses = []
    val_losses = []
    train_accuracies = []
    val_accuracies = []

    for epoch in range(n_epochs):
        # Use the criterion passed in by the caller; the previous version
        # silently read a module-level ``loss_fn`` instead.
        model.train()
        train_loss, train_accuracy = train_epoch(model, train_dataloader, optimizer, loss_function)
        model.eval()
        val_loss, val_accuracy = evaluate(model, val_dataloader, loss_function)

        train_losses.append(train_loss)
        val_losses.append(val_loss)
        train_accuracies.append(train_accuracy)
        val_accuracies.append(val_accuracy)

        print('Epoch {}/{}: train_loss: {:.4f}, train_accuracy: {:.4f}, val_loss: {:.4f}, val_accuracy: {:.4f}'.format(
            epoch + 1, n_epochs,
            train_losses[-1],
            train_accuracies[-1],
            val_losses[-1],
            val_accuracies[-1]))

    return train_losses, val_losses, train_accuracies, val_accuracies
class Simple1DCNN(torch.nn.Module):
    """Two Conv1d layers followed by a linear classifier with two outputs.

    ``forward`` reshapes with a leading dimension of 1 and prints the
    tensor shape after every stage.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = torch.nn.Conv1d(in_channels=50, out_channels=20,
                                      kernel_size=5, stride=2)
        self.act1 = torch.nn.ReLU()
        self.layer2 = torch.nn.Conv1d(in_channels=20, out_channels=10,
                                      kernel_size=1)
        self.fc1 = nn.Linear(10 * 3, 2)

    def forward(self, x):
        print(x.shape)
        # Each stage is applied in turn and its output shape is printed.
        stages = (
            lambda t: t.view(1, 50, -1),
            self.layer1,
            self.act1,
            self.layer2,
            lambda t: t.view(1, -1),
            self.fc1,
        )
        for stage in stages:
            x = stage(x)
            print(x.shape)
        print(x)
        return x
# Build the model on the selected device.
model_a = Simple1DCNN().to(device)

criterion = nn.CrossEntropyLoss()
loss_fn = torch.nn.CrossEntropyLoss()

# Hyper-parameters for SGD with Nesterov momentum and weight decay.
n_epochs_a = 50
learning_rate_a = 0.01
alpha_a = 1e-5
momentum_a = 0.9
optimizer = torch.optim.SGD(
    model_a.parameters(),
    lr=learning_rate_a,
    momentum=momentum_a,
    nesterov=True,
    weight_decay=alpha_a,
)

train_losses_a, val_losses_a, train_acc_a, val_acc_a = train(
    model_a, train_dataloader, val_dataloader, optimizer, n_epochs_a, loss_fn
)
Error message:
cpu
size of Training dataset: 20
size of Training dataset: 10
0%| | 0/20 [00:00<?, ?it/s]
torch.Size([1, 500])
torch.Size([1, 50, 10])
torch.Size([1, 20, 3])
torch.Size([1, 20, 3])
torch.Size([1, 10, 3])
torch.Size([1, 30])
torch.Size([1, 2])
tensor([[ 0.5785, -1.0169]], grad_fn=<AddmmBackward>)
Traceback (most recent call last):
File "SO_question.py", line 219, in <module>
train_losses_a, val_losses_a, train_acc_a, val_acc_a = train(model_a,
File "SO_question.py", line 137, in train
train_loss, train_accuracy = train_epoch(model, train_dataloader, optimizer, loss_fn)
File "SO_question.py", line 93, in train_epoch
loss = loss_fn(output, labels)
File "/Users/mymac/Documents/programming/python/mainenv/lib/python3.8/site-packages/torch/nn/modules/module.py", line 727, in _call_impl
result = self.forward(*input, **kwargs)
File "/Users/mymac/Documents/programming/python/mainenv/lib/python3.8/site-packages/torch/nn/modules/loss.py", line 961, in forward
return F.cross_entropy(input, target, weight=self.weight,
File "/Users/mymac/Documents/programming/python/mainenv/lib/python3.8/site-packages/torch/nn/functional.py", line 2468, in cross_entropy
return nll_loss(log_softmax(input, 1), target, weight, None, ignore_index, None, reduction)
File "/Users/mymac/Documents/programming/python/mainenv/lib/python3.8/site-packages/torch/nn/functional.py", line 2264, in nll_loss
ret = torch._C._nn.nll_loss(input, target, weight, _Reduction.get_enum(reduction), ignore_index)
RuntimeError: 1D target tensor expected, multi-target not supported
What am I doing wrong?
You are using nn.CrossEntropyLoss as the criterion for your training. You correctly passed the labels as indices of the ground truth class: 0s and 1s. However, as the error message suggests, it needs to be a 1D tensor!
Simply remove the reshape in ECGNet's __getitem__:
def __getitem__(self, idx):
ecgVec = self.ecg[idx]
labelID = self.target[idx]
return ecgVec,labelID
Edit
I want to increase the batch_size to 8. But now I get the error [...]
You are doing a lot of reshaping (flattening), which will certainly interfere with the batch dimension. As a general rule of thumb, never fiddle with axis=0. For instance, if you have an input of shape (8, 500), you immediately have a problem when doing x.view(1, 50, -1), since the resulting tensor will be (1, 50, 80) (the desired shape would have been (8, 50, 10)). Instead, you could reshape with x.view(x.size(0), 50, -1).
Same with x.view(1, -1) later down forward. You are looking to flatten the tensor, but you should not flatten it along with the batches, they need to stay separated! It's safer to use torch.flatten, yet I prefer nn.Flatten which flattens from axis=1 to axis=-1 by default.
My personal advice is to start with a simple setup (without train loops etc...) to verify the architecture and intermediate output shapes. Then, add the necessary logic to handle the training.
class ECGNet(data.Dataset):
    """Map-style dataset pairing each signal with its label.

    ``transform`` is stored on the instance but not applied by ``__getitem__``.
    """

    def __init__(self, ecgs, labls, transform=None):
        self.ecg, self.target, self.transform = ecgs, labls, transform

    def __len__(self):
        return len(self.ecg)

    def __getitem__(self, idx):
        # Labels are returned as indexed, with no reshape.
        return self.ecg[idx], self.target[idx]
class Simple1DCNN(nn.Module):
    """Two Conv1d layers followed by a linear classifier with two outputs.

    The batch dimension is preserved: inputs are viewed as
    ``(batch, 50, -1)`` and flattened per sample before the linear layer.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Conv1d(in_channels=50, out_channels=20,
                                kernel_size=5, stride=2)
        self.act1 = nn.ReLU()
        self.layer2 = nn.Conv1d(in_channels=20, out_channels=10,
                                kernel_size=1)
        self.fc1 = nn.Linear(10 * 3, 2)
        self.flatten = nn.Flatten()

    def forward(self, x):
        batch = x.size(0)
        hidden = self.layer1(x.view(batch, 50, -1))
        hidden = self.layer2(self.act1(hidden))
        return self.fc1(self.flatten(hidden))
# Minimal setup for checking shapes with a batch of 8, outside any training loop.
batch_size = 8
train_data = ECGNet(ecg_train, labels_train)
train_dl = DataLoader(
    dataset=train_data,
    batch_size=batch_size,
    shuffle=True,
    num_workers=0,
)
model = Simple1DCNN()
criterion = nn.CrossEntropyLoss()
Then
>>> x, y = next(iter(train_dl))
>>> y_hat = model(x)
>>> y_hat.shape
torch.Size([8, 2])
Also, make sure your loss works:
>>> criterion(y_hat, y)
tensor(..., grad_fn=<NllLossBackward>)
Related
I am currently trying to setup a neural network identifying three categorical variables which is severely imbalanced. Therefore I use WeightedRandomSampler so all classes have equal probability. Using this on a small sample of the data, it does exactly what it is supposed to. However, when running the model with the full dataset I keep getting the error: “number of categories cannot exceed 2^24”. The training data consist of 27,000,000 observations of 36 ‘x’ variables and one ‘y’ which is either 0,1 or 2.
I can’t figure out why I get this error and I have tried to implement everything I can find on this forum regarding this error without any luck. Any help would be greatly appreciated.
My code is set up as the following:
### getting data ###
def load_dataset_as_numpy(path):
    """Read the CSV at ``path`` and wrap it in a ``TensorDataset``.

    Only the columns listed in the module-level ``selected_columns`` are
    read. The last of those columns becomes the ``long`` label tensor and
    the remaining columns become the ``float32`` feature tensor.

    Args:
        path: location of the CSV file.

    Returns:
        TensorDataset of ``(features, labels)``.
    """
    # The original line had an unbalanced closing parenthesis here.
    frame = pd.read_csv(path, usecols=selected_columns)
    values = frame.to_numpy()  # convert once and slice, instead of converting twice
    x_np = values[:, :-1]
    y_np = values[:, -1]
    return TensorDataset(torch.tensor(x_np, dtype=torch.float32),
                         torch.tensor(y_np, dtype=torch.long))
train_dataset = load_dataset_as_numpy(r'E:\filepath\crsp_train.csv')

### setting up WeightedRandomSample and DataLoader###
# The labels are the second tensor stored in the TensorDataset; read them
# directly instead of iterating the dataset item by item in Python.
target_list = train_dataset.tensors[1]

# Weight every sample by the inverse frequency of its class.
class_count = torch.bincount(target_list)
class_weights = 1/class_count
class_weights_all = class_weights[target_list]
print(class_weights_all)

# NOTE: the traceback for this script shows the sampler drawing indices with
# torch.multinomial over these weights, which raises
# "number of categories cannot exceed 2^24" for the full dataset.
weighted_sampler = WeightedRandomSampler(
    weights=class_weights_all,
    num_samples=len(class_weights_all),
    replacement=True
)
# NOTE(review): `batc_size` looks like a typo for `batch_size`; left unchanged
# because its definition is not visible here.
loader_train = DataLoader(train_dataset, batch_size=batc_size, shuffle=False, sampler=weighted_sampler)

### NN model ###
device = 'cuda' if torch.cuda.is_available() else 'cpu'
print(f'using {device} device')
class Model(nn.Module):
    """Three-hidden-layer MLP with batch norm, ReLU and dropout; 3 outputs.

    Layer widths and the dropout probability come from the module-level
    names ``input_dim``, ``hidden1``, ``hidden2``, ``hidden3`` and ``p2``.
    """

    def __init__(self):
        super().__init__()
        self.layer1 = nn.Linear(input_dim, hidden1)
        self.layer2 = nn.Linear(hidden1, hidden2)
        self.layer3 = nn.Linear(hidden2, hidden3)
        self.out = nn.Linear(hidden3, 3)
        self.relu = nn.ReLU()
        self.drop = nn.Dropout(p=p2)
        self.batchnorm1 = nn.BatchNorm1d(hidden1)
        self.batchnorm2 = nn.BatchNorm1d(hidden2)
        self.batchnorm3 = nn.BatchNorm1d(hidden3)

    def forward(self, x):
        # Dropout is applied to the raw input and after every hidden block.
        x = self.drop(x)
        hidden_blocks = (
            (self.layer1, self.batchnorm1),
            (self.layer2, self.batchnorm2),
            (self.layer3, self.batchnorm3),
        )
        for linear, norm in hidden_blocks:
            x = self.drop(self.relu(norm(linear(x))))
        return self.out(x)
# Instantiate the network, move it to the selected device and print its layers.
model = Model().to(device)
print(model)
def multi_acc(y_hat, y):
    """Return the batch accuracy as a percentage rounded to a whole number.

    Args:
        y_hat: per-class scores with the classes along dimension 1.
        y: true class indices.

    Returns:
        0-d tensor holding the rounded percentage of correct predictions.
    """
    log_probs = torch.log_softmax(y_hat, dim=1)
    predicted = torch.max(log_probs, dim=1)[1]
    hits = (predicted == y).float()
    return torch.round(hits.sum() / len(hits) * 100)
# Per-epoch history containers; `train` appends to train_stats.
test_stats = {'loss': [], "acc": []}
train_stats = {'loss': [], "acc": []}

loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
def train(dataloader, model, loss_fn, optimizer, multi_acc):
    """Train ``model`` for one pass over ``dataloader`` and log the averages.

    The mean batch loss and mean batch accuracy are appended to the
    module-level ``train_stats`` and printed together with the module-level
    ``epoch`` counter. Nothing is returned.
    """
    model.train()
    loss_total = 0
    acc_total = 0

    for x, y in dataloader:
        x, y = x.to(device), y.to(device)
        logits = model(x)

        batch_loss = loss_fn(logits, y)
        loss_total += batch_loss.item()
        acc_total += multi_acc(logits, y).item()

        optimizer.zero_grad()
        batch_loss.backward()
        optimizer.step()

    num_batches = len(dataloader)
    train_loss = loss_total / num_batches
    train_acc = acc_total / num_batches
    train_stats['loss'].append(train_loss)
    train_stats['acc'].append(train_acc)
    print(
        f'Epoch {epoch + 1:03}: | Train Loss: {train_loss:.5f} | Train Acc: {train_acc:.3f}| ')
# Run every epoch and report how long each one took.
for epoch in range(epochs):
    start_time = time.time()
    train(loader_train, model, loss_fn, optimizer, multi_acc)
    elapsed = time.time() - start_time
    print("--- %s seconds ---" % (elapsed))
The full error message I get is:
Traceback (most recent call last):
File "C:\Users\swlli\NeuralNetworkIndicator.py", line 251, in <module>
train(loader_train, model, loss_fn, optimizer, multi_acc)
File "C:\Users\swlli\NeuralNetworkIndicator.py", line 200, in train
for i,(x,y) in enumerate(dataloader):
File "E:\Venv\lib\site-packages\torch\utils\data\dataloader.py", line 628, in __next__
data = self._next_data()
File "E:\Venv\lib\site-packages\torch\utils\data\dataloader.py", line 670, in _next_data
index = self._next_index() # may raise StopIteration
File "E:\Venv\lib\site-packages\torch\utils\data\dataloader.py", line 618, in _next_index
return next(self._sampler_iter) # may raise StopIteration
File "E:\Venv\lib\site-packages\torch\utils\data\sampler.py", line 254, in __iter__
for idx in self.sampler:
File "E:\Venv\lib\site-packages\torch\utils\data\sampler.py", line 203, in __iter__
rand_tensor = torch.multinomial(self.weights, self.num_samples, self.replacement, generator=self.generator)
RuntimeError: number of categories cannot exceed 2^24
import torch
import torchvision
# Training configuration.
n_epochs = 3
batch_size_train = 64
batch_size_test = 1000
learning_rate = 0.01
momentum = 0.5
log_interval = 10

# Fix the seed and disable cuDNN.
random_seed = 1
torch.backends.cudnn.enabled = False
torch.manual_seed(random_seed)


def _mnist_loader(train, batch_size):
    """Build a shuffled DataLoader over the normalised MNIST split."""
    transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ])
    dataset = torchvision.datasets.MNIST('./files', train=train,
                                         download=True, transform=transform)
    return torch.utils.data.DataLoader(dataset, batch_size=batch_size,
                                       shuffle=True)


train_loader = _mnist_loader(True, batch_size_train)
test_loader = _mnist_loader(False, batch_size_test)

# Pull one test batch for inspection.
examples = enumerate(test_loader)
batch_idx, (example_data, example_targets) = next(examples)
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """Small CNN: two conv/max-pool stages followed by two linear layers.

    ``forward`` returns log-probabilities over 10 classes, which is the
    input ``F.nll_loss`` expects.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Flatten to (batch, 320), the input width of fc1.
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Name the class axis explicitly; relying on the implicit dimension
        # is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)
network = Net()
optimizer = optim.SGD(network.parameters(), lr=learning_rate, momentum=momentum)

# Histories filled in by train() and test().
train_losses, train_counter, test_losses = [], [], []
# One entry per evaluation: before training and after each epoch.
test_counter = [i * len(train_loader.dataset) for i in range(n_epochs + 1)]
def train(epoch):
    """Train the module-level ``network`` for one epoch on ``train_loader``.

    Every ``log_interval`` batches the current loss is printed and appended
    to ``train_losses``, and the number of examples seen so far is appended
    to ``train_counter``.

    Args:
        epoch: 1-based epoch number, used for logging and the example count.
    """
    network.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = network(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            train_losses.append(loss.item())
            # Take the batch size from the loader rather than a hard-coded 64,
            # so the counter stays right if the configured size changes.
            train_counter.append(
                (batch_idx * train_loader.batch_size)
                + ((epoch - 1) * len(train_loader.dataset)))
def test():
    """Evaluate the module-level ``network`` on ``test_loader``.

    Appends the per-example average loss to ``test_losses`` and prints the
    average loss together with the accuracy.
    """
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = network(data)
            # reduction='sum' replaces the deprecated size_average=False.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            # .item() keeps `correct` a plain Python int.
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
# Baseline evaluation with the untrained network.
test()

# Alternate one epoch of training with one evaluation.
for epoch in range(1, n_epochs + 1):
    train(epoch)
    test()

# Persist only the learned parameters, not the whole module.
torch.save(network.state_dict(), './results/model.pth')
Other file:
# NOTE: the training script wrote this file with
# torch.save(network.state_dict(), ...), so torch.load returns the saved
# state dict here rather than a Net instance.
PATH = "results/model.pth"
model = torch.load(PATH)
When this is called, instead of loading the model parameters, Pytorch retrains the entire model. The model is just retrained the same way (ie. they take the exact same steps to get to the same local minimum).
PATH = "results/model.pth"
# Build a fresh network, then copy the saved parameters into it.
model = Net()
model.load_state_dict(torch.load(PATH))
has the same result.
Is there any way I can load the model without retraining the whole thing?
I just tried executing the code, and it works perfect. load_state_dict did not retrain the model:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """Small CNN: two conv/max-pool stages followed by two linear layers.

    ``forward`` returns log-probabilities over 10 classes, which is the
    input ``F.nll_loss`` expects.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Flatten to (batch, 320), the input width of fc1.
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Name the class axis explicitly; relying on the implicit dimension
        # is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)
# Build a fresh network, then copy the saved parameters into it.
network = Net()
PATH = "results/model.pth"
network.load_state_dict(torch.load(PATH))
# works perfect
By the way, state_dict only contains the model weights and not the dataset, so load_state_dict can never re-train the model.
I think the problem is how the original code is organized. The training procedure starts running immediately after the class Net is defined, so you cannot import Net from this file without re-running everything.
Ideally, the training and the testing procedure should be wrapped inside an if __name__=='__main__' statement (at the end of the file), so that you can safely import Net without re-running any calculations:
# source_file.py
import torch
import torchvision
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
class Net(nn.Module):
    """Small CNN: two conv/max-pool stages followed by two linear layers.

    ``forward`` returns log-probabilities over 10 classes, which is the
    input ``F.nll_loss`` expects.
    """

    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(1, 10, kernel_size=5)
        self.conv2 = nn.Conv2d(10, 20, kernel_size=5)
        self.conv2_drop = nn.Dropout2d()
        self.fc1 = nn.Linear(320, 50)
        self.fc2 = nn.Linear(50, 10)

    def forward(self, x):
        x = F.relu(F.max_pool2d(self.conv1(x), 2))
        x = F.relu(F.max_pool2d(self.conv2_drop(self.conv2(x)), 2))
        # Flatten to (batch, 320), the input width of fc1.
        x = x.view(-1, 320)
        x = F.relu(self.fc1(x))
        x = F.dropout(x, training=self.training)
        x = self.fc2(x)
        # Name the class axis explicitly; relying on the implicit dimension
        # is deprecated and emits a warning.
        return F.log_softmax(x, dim=1)
def train(epoch):
    """Train the module-level ``network`` for one epoch on ``train_loader``.

    Every ``log_interval`` batches the current loss is printed and appended
    to ``train_losses``, and the number of examples seen so far is appended
    to ``train_counter``.

    Args:
        epoch: 1-based epoch number, used for logging and the example count.
    """
    network.train()
    for batch_idx, (data, target) in enumerate(train_loader):
        optimizer.zero_grad()
        output = network(data)
        loss = F.nll_loss(output, target)
        loss.backward()
        optimizer.step()
        if batch_idx % log_interval == 0:
            print('Train Epoch: {} [{}/{} ({:.0f}%)]\tLoss: {:.6f}'.format(
                epoch, batch_idx * len(data), len(train_loader.dataset),
                100. * batch_idx / len(train_loader), loss.item()))
            train_losses.append(loss.item())
            # Take the batch size from the loader rather than a hard-coded 64,
            # so the counter stays right if the configured size changes.
            train_counter.append(
                (batch_idx * train_loader.batch_size)
                + ((epoch - 1) * len(train_loader.dataset)))
def test():
    """Evaluate the module-level ``network`` on ``test_loader``.

    Appends the per-example average loss to ``test_losses`` and prints the
    average loss together with the accuracy.
    """
    network.eval()
    test_loss = 0
    correct = 0
    with torch.no_grad():
        for data, target in test_loader:
            output = network(data)
            # reduction='sum' replaces the deprecated size_average=False.
            test_loss += F.nll_loss(output, target, reduction='sum').item()
            pred = output.argmax(dim=1, keepdim=True)
            # .item() keeps `correct` a plain Python int.
            correct += pred.eq(target.view_as(pred)).sum().item()
    test_loss /= len(test_loader.dataset)
    test_losses.append(test_loss)
    print('\nTest set: Avg. loss: {:.4f}, Accuracy: {}/{} ({:.0f}%)\n'.format(
        test_loss, correct, len(test_loader.dataset),
        100. * correct / len(test_loader.dataset)))
if __name__ == '__main__':
    # Everything below runs only when the file is executed as a script, so
    # importing Net from this module does not start a training run.
    n_epochs = 3
    batch_size_train = 64
    batch_size_test = 1000
    learning_rate = 0.01
    momentum = 0.5
    log_interval = 10

    random_seed = 1
    torch.backends.cudnn.enabled = False
    torch.manual_seed(random_seed)

    # Both splits use the same tensor conversion and normalisation.
    mnist_transform = torchvision.transforms.Compose([
        torchvision.transforms.ToTensor(),
        torchvision.transforms.Normalize((0.1307,), (0.3081,)),
    ])
    train_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST('./files', train=True, download=True,
                                   transform=mnist_transform),
        batch_size=batch_size_train, shuffle=True)
    test_loader = torch.utils.data.DataLoader(
        torchvision.datasets.MNIST('./files', train=False, download=True,
                                   transform=mnist_transform),
        batch_size=batch_size_test, shuffle=True)

    examples = enumerate(test_loader)
    batch_idx, (example_data, example_targets) = next(examples)

    network = Net()
    optimizer = optim.SGD(network.parameters(), lr=learning_rate,
                          momentum=momentum)

    # Histories filled in by train() and test().
    train_losses, train_counter, test_losses = [], [], []
    test_counter = [i * len(train_loader.dataset) for i in range(n_epochs + 1)]

    test()
    for epoch in range(1, n_epochs + 1):
        train(epoch)
        test()

    # Persist only the learned parameters.
    PATH = './results/model.pth'
    torch.save(network.state_dict(), PATH)
Then, when you reload the model in a second file, you can write:
import torch

# The module name must match the file that defines Net, shown above as
# source_file.py.
from source_file import Net

# Same location the training script saved the parameters to.
PATH = './results/model.pth'

network = Net()
network.load_state_dict(torch.load(PATH))
Here is a website with more information about if __name__ == '__main__':
https://realpython.com/if-name-main-python/
PS. Another option, that I personally use, is to define the neural network in a separate file than the training procedure. This is useful to make big projects look more organized, or even to experiment with different neural network designs.
Previous answer:
We should use load_state_dict to restore models:
# Recreate the architecture, then copy the saved parameters into it.
model = TheModelClass(*args, **kwargs)
model.load_state_dict(torch.load(PATH))
# Switch to evaluation mode.
model.eval()
https://pytorch.org/tutorials/beginner/saving_loading_models.html
I try to train model but in vain. I see the error
Input contains NaN, infinity or a value too large for dtype('float32').
I think it may be connected to the MSE loss function, because training works to some extent with MAE and also with RMSE (by the second epoch I get RMSE = 10). I can't figure out what I am doing wrong.
# Load the table: column 0 is the target, the remaining columns are features.
df = pd.read_csv('data.txt.zip', header=None)
X = df.iloc[:, 1:].values
y = df.iloc[:, 0].values

# Fixed train/test split by row position.
train_size = 463715
X_train, X_test = X[:train_size, :], X[train_size:, :]
y_train, y_test = y[:train_size], y[train_size:]

# Convert to float tensors.
X_train, y_train = torch.FloatTensor(X_train), torch.FloatTensor(y_train)
X_test, y_test = torch.FloatTensor(X_test), torch.FloatTensor(y_test)

train_ds = TensorDataset(X_train, y_train)
test_ds = TensorDataset(X_test, y_test)

# Carve a validation subset out of the training rows.
val_num = 92743
train_num = 370972
train_ds, val_ds = random_split(train_ds, [train_num, val_num])
# Evaluate accuracy
def accuracy(y_true, y_pred):
    """Return ``r2_score(y_true, y_pred)``."""
    score = r2_score(y_true, y_pred)
    return score
# create Class
class BaselineModel(nn.Module):
    """Single-hidden-layer network with training/validation helper methods.

    ``forward`` applies ``linear1`` -> ELU -> ``linear2``. The remaining
    layers are created but not used by ``forward``; they are kept so that
    existing attribute names stay available.

    Args:
        input_size: number of input features.
        hidden_size: width of the hidden layer.
        output_size: number of outputs.
    """

    def __init__(self, input_size, hidden_size, output_size):
        super().__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.output_size = output_size
        # Layer widths follow the constructor arguments; they were previously
        # hard-coded to 90 -> 45 -> 1 regardless of what was passed in.
        self.linear1 = nn.Linear(input_size, hidden_size)
        self.linear2 = nn.Linear(hidden_size, output_size)
        self.linear3 = nn.Linear(hidden_size, 15)
        self.linear4 = nn.Linear(15, output_size)
        self.batch = nn.BatchNorm2d(hidden_size)
        self.relu = nn.ReLU()
        self.lreku = nn.LeakyReLU()
        self.elu = nn.ELU()
        self.dropout = nn.Dropout(0.5)

    def forward(self, x):
        x = self.elu(self.linear1(x))
        return self.linear2(x)

    def training_step(self, criterion, batch):
        """Return the loss of ``criterion`` on one ``(x, y)`` batch."""
        x_train, y_train = batch
        y_pred = self(x_train)
        # Targets get a trailing axis before being compared with y_pred.
        loss = criterion(y_pred, y_train.unsqueeze(1))
        return loss

    def validation_step(self, criterion, batch):
        """Return ``{'val_loss', 'val_acc'}`` for one validation batch."""
        x_val, y_val = batch
        y_pred = self(x_val)
        loss = criterion(y_pred, y_val.unsqueeze(1))
        acc = accuracy(y_val, y_pred)
        return {'val_loss': loss, 'val_acc': acc}

    def validation_epoch_end(self, y_pred):
        """Average the per-batch validation results into plain numbers."""
        batch_losses = [x['val_loss'] for x in y_pred]
        epoch_loss = torch.stack(batch_losses).mean()
        batch_accs = [x['val_acc'] for x in y_pred]
        epoch_acc = np.mean(batch_accs)
        return {'val_loss': epoch_loss.item(), 'val_acc': epoch_acc.item()}

    def epoch_end(self, epoch, result):
        """Print the validation summary for ``epoch``."""
        print(f"Epoch {epoch}, val_loss: {result['val_loss']}, val_acc: {result['val_acc']} ")
# Instantiate the model with the sizes used throughout this script.
model = BaselineModel(input_size = 90, hidden_size = 45, output_size = 1)
# Evaluate
def evaluate(model, criterion, val_loader):
    """Run ``model.validation_step`` on every batch and aggregate the results.

    Gradient tracking is disabled for the whole pass. Returns whatever
    ``model.validation_epoch_end`` returns for the collected step outputs.
    """
    step_outputs = []
    with torch.no_grad():
        for batch in val_loader:
            step_outputs.append(model.validation_step(criterion, batch))
        return model.validation_epoch_end(step_outputs)
# Train
def train(model, criterion, optimizer, train_loader, val_loader, lr, epochs):
    """Train ``model`` for ``epochs`` epochs, validating after each one.

    Args:
        model: object providing ``training_step`` and ``epoch_end``.
        criterion: loss function forwarded to the model's step methods.
        optimizer: optimiser used for the parameter updates.
        train_loader: iterable of training batches.
        val_loader: iterable of validation batches.
        lr: accepted for interface compatibility; not used in this function.
        epochs: number of passes over ``train_loader``.

    Returns:
        List with one validation result per epoch. The list was previously
        built and then discarded.
    """
    history = []
    for epoch in range(epochs):
        for batch in train_loader:
            optimizer.zero_grad()
            loss = model.training_step(criterion, batch)
            loss.backward()
            optimizer.step()
        result = evaluate(model, criterion, val_loader)
        model.epoch_end(epoch, result)
        history.append(result)
    return history
# Shuffled loaders over the train and validation subsets.
batch_size = 128
train_loader = DataLoader(train_ds, batch_size=batch_size, shuffle=True)
val_loader = DataLoader(val_ds, batch_size=batch_size, shuffle=True)

# SGD with momentum and an MSE objective.
lr = 0.05
epochs = 10
optimizer = torch.optim.SGD(model.parameters(), lr, momentum=0.9)
criterion = F.mse_loss

train(model, criterion, optimizer, train_loader, val_loader, lr, epochs)
Yes, it is because of your loss function. If the value of the loss function becomes very small or very large after some epochs, you will hit this error when that value is used in backpropagation to train the model. To handle that, you should use early stopping to halt the training, which means implementing a callback: callbacks provide a way to execute code and interact with the training process automatically.
For some reason while training my VAE my RAM usage is steadily increasing, and I cannot seem to pin point why.
I have narrowed down the problem to my save_plots function by using psutil.virtual_memory() checking my virtual memory between function calls.
Here is the code for the VAE model and initialization of model and training params:
import torch
import torch.nn as nn
import torch.nn.functional as F
import torchvision
from torch.utils.data import DataLoader
from utils import modelSummary, train_evaluate, plot_training_results
class Encoder(nn.Module):
    """Convolutional encoder that samples a latent code and records its KL term.

    ``forward`` returns ``z = mu + sigma * eps`` with ``eps`` drawn from a
    standard normal, and stores the summed KL divergence between
    ``N(mu, sigma^2)`` and ``N(0, 1)`` in ``self.kl``.

    Args:
        latent_dims: size of the latent code.
    """

    def __init__(self, latent_dims) -> None:
        super().__init__()
        self.conv1 = nn.Conv2d(1, 64, 3, stride=2, bias=False)
        self.batchnorm1 = nn.BatchNorm2d(64)
        self.conv2 = nn.Conv2d(64, 128, 3, stride=2, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(128)
        # For 28x28 inputs this yields (N, 128, 2, 2), i.e. the 512 features
        # that linear1 expects after flattening.
        self.conv3 = nn.Conv2d(128, 128, 3, stride=2)
        self.flatten = nn.Flatten(start_dim=1)  # (N, 512)
        self.linear1 = nn.Linear(512, 1024)
        self.mu = nn.Linear(1024, latent_dims)
        self.sigma = nn.Linear(1024, latent_dims)

        # Kept as an attribute for compatibility. It is no longer moved to
        # CUDA unconditionally, which failed on machines without a GPU.
        self.N = torch.distributions.Normal(0, 1)
        self.kl = 0

    def forward(self, x):
        x = F.relu(self.conv1(x))
        x = self.batchnorm1(x)
        x = F.relu(self.conv2(x))
        x = self.batchnorm2(x)
        x = self.conv3(x)
        x = self.flatten(x)
        x = F.relu(self.linear1(x))

        mu = self.mu(x)
        # The `sigma` layer outputs log(sigma); exponentiate to get sigma.
        log_sigma = self.sigma(x)
        sigma = torch.exp(log_sigma)

        # Draw the noise on the same device and dtype as mu.
        z = mu + sigma * torch.randn_like(mu)

        # Closed-form KL(N(mu, sigma^2) || N(0, 1)), summed over all elements:
        # 0.5 * (sigma^2 + mu^2) - log(sigma) - 0.5. It is zero when
        # mu = 0 and sigma = 1; the earlier expression omitted the 0.5 factor.
        self.kl = (0.5 * (sigma**2 + mu**2) - log_sigma - 0.5).sum()
        return z
class Decoder(nn.Module):
    """Transposed-convolution decoder mapping a latent code to an image.

    The latent vector is projected to 512 features, viewed as
    ``(-1, 32, 4, 4)`` and upsampled by three transposed convolutions; the
    output passes through a sigmoid.
    """

    def __init__(self, latent_dims) -> None:
        super().__init__()
        self.linear1 = nn.Linear(latent_dims, 512)
        self.deconv1 = nn.ConvTranspose2d(32, 128, 3, stride=3, padding=1,
                                          output_padding=2, bias=False)
        self.batchnorm1 = nn.BatchNorm2d(128)
        self.deconv2 = nn.ConvTranspose2d(128, 64, 3, stride=2,
                                          output_padding=1, bias=False)
        self.batchnorm2 = nn.BatchNorm2d(64)
        self.deconv3 = nn.ConvTranspose2d(64, 1, 3)

    def forward(self, x):
        feature_map = F.relu(self.linear1(x)).view(-1, 32, 4, 4)
        feature_map = self.batchnorm1(F.relu(self.deconv1(feature_map)))
        feature_map = self.batchnorm2(F.relu(self.deconv2(feature_map)))
        return torch.sigmoid(self.deconv3(feature_map))
class VariationalAutoEncoder(nn.Module):
    """Encoder/decoder pair: ``forward`` decodes the encoder's latent sample."""

    def __init__(self, latent_dims) -> None:
        super().__init__()
        self.encoder = Encoder(latent_dims)
        self.decoder = Decoder(latent_dims)

    def forward(self, x):
        return self.decoder(self.encoder(x))
if __name__ == '__main__':
    # Initialize Model
    latent_dims = 256
    model = VariationalAutoEncoder(latent_dims)
    modelSummary(model)

    device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
    print(f"Using device: {device}\n")

    training_params = {
        'num_epochs': 200,
        'batch_size': 512,
        'loss_function': F.mse_loss,
        'optimizer': torch.optim.Adam(model.parameters(), lr=1e-4),
        'save_path': 'training_256',
        'sample_size': 10,
        'plot_every': 1,
        'latent_dims': latent_dims,
    }

    # Load Data: these two names hold DataLoader objects over MNIST.
    train_dataset = DataLoader(
        torchvision.datasets.MNIST(root='./data', train=True, download=True,
                                   transform=torchvision.transforms.ToTensor()),
        batch_size=training_params['batch_size'])
    validation_dataset = DataLoader(
        torchvision.datasets.MNIST(root='./data', train=False, download=True,
                                   transform=torchvision.transforms.ToTensor()),
        batch_size=training_params['batch_size'])

    # Extra metric reported next to the loss: summed absolute error.
    metrics = {
        'l1': lambda output, target: (torch.abs(output - target).sum())
    }

    train_results, evaluation_results = train_evaluate(
        model, device, train_dataset, validation_dataset, training_params, metrics)
    plot_training_results(train_results=train_results,
                          validation_results=evaluation_results,
                          training_params=training_params,
                          metrics=metrics)
Here is my utils.py file containing the training loop and other utility functions
import torch
import torch.nn as nn
from torch.utils.data import DataLoader
import time
import gc
import numpy as np
import matplotlib.pyplot as plt
def modelSummary(model, verbose=False):
    """Print the total number of parameters of ``model``.

    Args:
        model: module whose ``named_parameters()`` are counted.
        verbose: when true, also print the model and, for each parameter,
            its name, element count and shape.
    """
    if verbose:
        print(model)

    total_parameters = 0
    for name, param in model.named_parameters():
        # numel() counts every element; size()[0] only gave the length of
        # the first dimension and under-counted any multi-dimensional weight.
        num_params = param.numel()
        total_parameters += num_params
        if verbose:
            print(f"Layer: {name}")
            print(f"\tNumber of parameters: {num_params}")
            print(f"\tShape: {param.shape}")

    if total_parameters > 1e5:
        print(f"Total number of parameters: {total_parameters/1e6:.2f}M")
    else:
        print(f"Total number of parameters: {total_parameters/1e3:.2f}K")
def train_epoch(model: nn.Module, device: torch.device, train_dataloader: DataLoader, training_params: dict, metrics: dict):
    """Train ``model`` for one epoch and return the averaged metrics.

    The loss is the summed squared reconstruction error between the model
    output and its input, plus ``model.encoder.kl``.

    Args:
        model (nn.Module): Model to train; must expose ``encoder.kl``.
        device (torch.device): Device to train on.
        train_dataloader (DataLoader): Iterable of ``(x, target)`` batches;
            the target is not used.
        training_params (dict): Training parameters; only ``"optimizer"``
            is read here.
        metrics (dict): Maps a metric name to a callable
            ``func(output, input)`` returning a scalar tensor.

    Returns:
        run_results (dict): Per-batch average of ``"loss"`` and every metric.
        All values are 0.0 when the dataloader yields no batches.
    """
    optimizer = training_params["optimizer"]
    model = model.to(device)
    model.train()

    # Running sums for this epoch.
    run_results = {name: 0.0 for name in metrics}
    run_results["loss"] = 0.0

    num_batches = 0
    for x, _ in train_dataloader:
        num_batches += 1

        inputs = x.to(device)
        output = model(inputs)

        loss = ((output - inputs)**2).sum() + model.encoder.kl

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        run_results["loss"] += loss.detach().item()
        for key, func in metrics.items():
            run_results[key] += func(output, inputs).detach().item()

        # Drop the references before the next batch is loaded.
        del loss
        del inputs
        del output

    # An empty loader would otherwise divide by zero below.
    if num_batches == 0:
        return run_results

    for key in run_results:
        run_results[key] /= num_batches
    return run_results
def evaluate_epoch(model: nn.Module, device: torch.device, validation_dataloader: DataLoader, training_params: dict, metrics: dict):
    """Evaluate the model for one pass over the dataloader without gradients.

    The loss is the summed squared reconstruction error between the model
    output and its input, plus ``model.encoder.kl``. The model is left in
    eval mode when the function returns.

    Args:
        model (nn.Module): Model to evaluate; must expose ``encoder.kl``.
        device (torch.device): Device to evaluate on.
        validation_dataloader (DataLoader): Iterable yielding ``(x, target)``
            batches; the target is not used.
        training_params (dict): Training parameters; not read by this function.
        metrics (dict): Maps a metric name to a callable ``func(output, input)``
            returning a tensor.

    Returns:
        run_results (dict): Per-batch average of "loss" and of every metric.
            All values are 0.0 when the dataloader yields no batches.
    """
    model = model.to(device)

    # Running sums for this epoch; turned into per-batch averages at the end.
    run_results = {metric: 0.0 for metric in metrics}
    run_results["loss"] = 0.0

    num_batches = 0
    with torch.no_grad():
        model.eval()
        for x, _ in validation_dataloader:
            num_batches += 1
            # The target is never used, so it is not transferred to the device.
            inputs = x.to(device)

            output = model(inputs)
            loss = ((output - inputs) ** 2).sum() + model.encoder.kl

            run_results["loss"] += loss.item()
            for key, func in metrics.items():
                run_results[key] += func(output, inputs).item()

    # An empty dataloader would otherwise divide by zero.
    if num_batches == 0:
        return run_results

    for key in run_results:
        run_results[key] /= num_batches
    return run_results
def train_evaluate(model: nn.Module, device: torch.device, train_dataloader: DataLoader, validation_dataloader: DataLoader, training_params: dict, metrics: dict):
    """Train a model, evaluate it after every epoch and record the statistics.

    After every epoch the model weights are saved to
    ``f"{save_path}_epoch{epoch + 1}.pt"``; every ``plot_every`` epochs
    ``save_plots`` is called on a fixed set of samples and noise vectors.

    Args:
        model (nn.Module): Model to be trained.
        device (torch.device): Device to be trained on.
        train_dataloader (DataLoader): Batches to train on.
        validation_dataloader (DataLoader): Batches to evaluate on. Its first
            batch must contain at least one sample for each label in
            ``range(sample_size)``.
        training_params (dict): Training parameters. This function reads
            "num_epochs", "save_path", "sample_size", "plot_every" and
            "latent_dims"; the dict is also passed on to ``train_epoch``,
            ``evaluate_epoch`` and ``save_plots``.
        metrics (dict): Maps a metric name to a callable computing that metric.

    Returns:
        tuple: ``(train_results, evaluation_results)``. Each is a dict mapping
        "loss" and every metric name to a NumPy array with one entry per epoch.

    Raises:
        IndexError: If the first validation batch has no sample for one of the
            labels in ``range(sample_size)``.
    """
    NUM_EPOCHS = training_params["num_epochs"]
    SAVE_PATH = training_params["save_path"]
    SAMPLE_SIZE = training_params["sample_size"]
    PLOT_EVERY = training_params["plot_every"]
    LATENT_DIMS = training_params["latent_dims"]

    # Per-epoch history. Plain lists are used because np.append returns a new
    # array instead of growing its argument in place.
    train_history = {"loss": []}
    evaluation_history = {"loss": []}
    for metric in metrics:
        train_history[metric] = []
        evaluation_history[metric] = []

    # Pick one fixed sample per label 0..SAMPLE_SIZE-1 from the first validation
    # batch so that the plots of different epochs are comparable.
    batch = next(iter(validation_dataloader))
    idxs = []
    for i in range(SAMPLE_SIZE):
        matches = torch.where(batch[1] == i)[0]
        if matches.numel() == 0:
            raise IndexError(f"No sample with label {i} in the first validation batch")
        idxs.append(matches[0].item())
    FIXED_SAMPLES = batch[0][idxs].to(device).detach()
    FIXED_NOISE = torch.normal(0, 1, size=(100, LATENT_DIMS)).to(device).detach()
    del idxs
    del batch

    for epoch in range(NUM_EPOCHS):
        start = time.time()
        print(f"======== Epoch {epoch+1}/{NUM_EPOCHS} ========")

        # Train Model
        print("Training ... ")
        epoch_train_results = train_epoch(model, device, train_dataloader, training_params, metrics)

        # Evaluate Model
        print("Evaluating ... ")
        epoch_evaluation_results = evaluate_epoch(model, device, validation_dataloader, training_params, metrics)

        # Record the loss and every metric of this epoch.
        for key in train_history:
            train_history[key].append(epoch_train_results[key])
            evaluation_history[key].append(epoch_evaluation_results[key])

        # Print results of epoch
        print(f"Completed Epoch {epoch+1}/{NUM_EPOCHS} in {(time.time() - start):.2f}s")
        print(f"Train Loss: {epoch_train_results['loss']:.2f} \t Validation Loss: {epoch_evaluation_results['loss']:.2f}")

        # Plot results
        if epoch % PLOT_EVERY == 0:
            save_plots(FIXED_SAMPLES, FIXED_NOISE, model, device, epoch, training_params)
        print(f"Items cleaned up: {gc.collect()}")

        # Save model
        SAVE = f"{SAVE_PATH}_epoch{epoch + 1}.pt"
        torch.save(model.state_dict(), SAVE)

    train_results = {key: np.asarray(values) for key, values in train_history.items()}
    evaluation_results = {key: np.asarray(values) for key, values in evaluation_history.items()}
    return train_results, evaluation_results
def save_plots(fixed_samples, fixed_noise, model, device, epoch, training_params):
    """Save reconstruction and generation plots for the current epoch.

    Two figures are written:
    ``{save_path}/training_images/epoch{epoch + 1}.png`` shows the fixed
    samples (top row) and the model's outputs for them (bottom row);
    ``{save_path}/generated_images/epoch{epoch + 1}.png`` shows the decoder
    outputs for the fixed noise on a 10x10 grid. Every image is reshaped to
    28x28 before plotting. The target directories are not created here.

    Args:
        fixed_samples (torch.Tensor): Samples to reconstruct and plot.
        fixed_noise (torch.Tensor): Latent vectors passed to ``model.decoder``.
        model (nn.Module): Model to be tested; must expose ``decoder``.
        device (torch.device): Device the tensors are moved to.
        epoch (int): Zero-based epoch number; file names use ``epoch + 1``.
        training_params (dict): Must contain "sample_size" and "save_path".
    """
    SAMPLE_SIZE = training_params["sample_size"]
    SAVE_PATH = training_params["save_path"]

    with torch.no_grad():
        model.eval()
        fixed_samples = fixed_samples.to(device)
        fixed_noise = fixed_noise.to(device)
        outputs = model(fixed_samples)
        generated_images = model.decoder(fixed_noise)

    # squeeze=False keeps the axes array two-dimensional even when
    # SAMPLE_SIZE == 1, so ax[row][col] indexing always works.
    fig, ax = plt.subplots(2, SAMPLE_SIZE, figsize=(SAMPLE_SIZE * 5, 15), squeeze=False)
    for i in range(SAMPLE_SIZE):
        image = fixed_samples[i].detach().cpu().numpy()
        output = outputs[i].detach().cpu().numpy()
        ax[0][i].imshow(image.reshape(28, 28))
        ax[1][i].imshow(output.reshape(28, 28))
    fig.savefig(f"{SAVE_PATH}/training_images/epoch{epoch + 1}.png")
    # Close this figure explicitly; pyplot keeps a figure alive until it is closed.
    plt.close(fig)

    fig, axs = plt.subplots(10, 10, figsize=(30, 20))
    for image, axis in zip(generated_images, axs.flatten()):
        axis.imshow(image.cpu().numpy().reshape(28, 28))
        axis.axis('off')
    fig.savefig(f"{SAVE_PATH}/generated_images/epoch{epoch + 1}.png")
    plt.close(fig)
def plot_training_results(train_results, validation_results, training_params, metrics):
    """Plot training and validation curves, save the figure and show it.

    The loss curves are drawn first, followed by one training and one
    validation curve per metric. The figure is written to
    ``f"{training_params['save_path']}_training_results.png"``.

    Args:
        train_results (dict): Training results keyed by 'loss' and metric name.
        validation_results (dict): Validation results with the same keys.
        training_params (dict): Must contain "save_path".
        metrics (dict): Iterated for the metric names to plot.
    """
    # (key, training label, validation label) for every pair of curves.
    curves = [('loss', 'Training Loss', 'Validation Loss')]
    curves.extend((name, f"Train {name}", f"Validation {name}") for name in metrics)

    for key, train_label, validation_label in curves:
        plt.plot(train_results[key], label=train_label)
        plt.plot(validation_results[key], label=validation_label)

    plt.legend()
    plt.xlabel("Epoch")
    plt.ylabel("Loss")
    output_file = f"{training_params['save_path']}_training_results.png"
    plt.savefig(output_file)
    plt.show()


# Nothing is executed when the module is run directly.
if __name__ == '__main__':
    pass
Am I doing something wrong while detaching? Or is it a problem with the number of figures I am saving?
On a side note: when I train by running python VAE.py in a terminal, I run out of memory because of the steady increase described above. However, when I run it from VS Code, memory seems to be cleaned up as it nears the maximum. Is this behavior documented anywhere, or am I mistaken?
I am trying to train a neural network with PyTorch, but I get the error in the title. I followed this tutorial, and I just applied some small changes to meet my needs. Here's the network:
class ChordClassificationNetwork(nn.Module):
    """CNN that maps a 3-channel image to a probability distribution over 7 classes.

    Architecture: two (3x3 convolution -> ReLU -> 2x2 max-pool) stages, dropout,
    flatten, two hidden linear layers of 256 units (each followed by ReLU and
    dropout), and a 7-way output layer followed by softmax.

    The first linear layer expects 33856 = 64 * 23 * 23 flattened features,
    which is what the convolution/pooling stages produce for 100x100 inputs.
    """

    def __init__(self, train_model=False):
        """Build the layers; ``train_model`` is only stored on the instance."""
        super().__init__()
        self.train_model = train_model

        self.flatten = nn.Flatten()

        # Convolutional feature extractor
        self.firstConv = nn.Conv2d(in_channels=3, out_channels=64, kernel_size=(3, 3))
        self.secondConv = nn.Conv2d(in_channels=64, out_channels=64, kernel_size=(3, 3))
        self.pool = nn.MaxPool2d(kernel_size=2)
        self.drop = nn.Dropout(p=0.25)

        # Classifier head
        self.fc1 = nn.Linear(in_features=33856, out_features=256)
        self.fc2 = nn.Linear(in_features=256, out_features=256)
        self.outLayer = nn.Linear(in_features=256, out_features=7)

    def forward(self, x):
        """Return softmax class probabilities (along dim 1) for the batch ``x``."""
        features = self.pool(F.relu(self.firstConv(x)))
        features = self.drop(self.pool(F.relu(self.secondConv(features))))

        hidden = self.flatten(features)
        for layer in (self.fc1, self.fc2):
            hidden = self.drop(F.relu(layer(hidden)))

        return F.softmax(self.outLayer(hidden), dim=1)
And here is the accuracy check part, which is the one causing the error:
# Use the GPU when one is available, otherwise fall back to the CPU.
device = "cuda" if torch.cuda.is_available() else "cpu"

# Every image is resized to 100x100.
transformations = transforms.Compose([transforms.Resize((100, 100))])

# Hyperparameters and DataLoader settings
num_epochs = 10
learning_rate = 0.001
train_CNN = False
batch_size = 32
shuffle = True
pin_memory = True
num_workers = 1

dataset = GuitarDataset("../chords_data/cropped_images/train", transform=transformations)

# 80/20 split; the validation set receives whatever the rounding leaves over.
train_size = int(0.8 * len(dataset))
validation_size = len(dataset) - train_size
train_set, validation_set = torch.utils.data.random_split(dataset, [train_size, validation_size])

# Both loaders share the same settings.
loader_options = dict(
    shuffle=shuffle,
    batch_size=batch_size,
    num_workers=num_workers,
    pin_memory=pin_memory,
)
train_loader = DataLoader(dataset=train_set, **loader_options)
validation_loader = DataLoader(dataset=validation_set, **loader_options)

model = ChordClassificationNetwork().to(device)
# NOTE(review): BCELoss needs targets with the same shape as the model output.
# If GuitarDataset yields class indices, a multi-class loss is the usual
# choice -- confirm against the dataset.
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
def check_accuracy(loader, model):
    """Compute the classification accuracy of ``model`` over ``loader``.

    The predicted class of each sample is the argmax of its row of scores.
    Labels may be class indices, or one-hot / per-class vectors with the same
    shape as the scores (then their argmax is used). The model is evaluated
    in eval mode without gradients, and its previous train/eval mode is
    restored before returning.

    Args:
        loader: Iterable yielding ``(x, y)`` batches.
        model (nn.Module): Model returning one row of class scores per sample.

    Returns:
        str: Accuracy in percent, formatted with two decimals ("0.00" when the
        loader yields no samples).
    """
    if loader is train_loader:
        print("Checking accuracy on training data")
    else:
        print("Checking accuracy on validation data")

    num_correct = 0
    num_samples = 0

    # Remember the current mode so the caller's training loop is not left
    # running in eval mode (which would disable dropout).
    was_training = model.training
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x)
            # Comparing a whole row of scores with 0.5 in an `if` is ambiguous
            # for more than one class; the prediction is the best-scoring class.
            predictions = scores.argmax(dim=1)
            if y.shape == scores.shape:
                y = y.argmax(dim=1)
            else:
                y = y.reshape(predictions.shape)
            num_correct += (predictions == y).sum().item()
            num_samples += predictions.size(0)
    model.train(was_training)

    # Guard against an empty loader.
    accuracy = 100.0 * num_correct / num_samples if num_samples else 0.0
    print(
        f"Got {num_correct} / {num_samples} with accuracy {accuracy:.2f}"
    )
    return f"{accuracy:.2f}"
def train():
    """Train the global ``model`` for ``num_epochs`` epochs.

    Validation accuracy is computed and shown on the progress bar before every
    even-numbered epoch; the running loss is shown after every batch.
    """
    for epoch in range(num_epochs):
        loop = tqdm(train_loader, total=len(train_loader), leave=True)
        if epoch % 2 == 0:
            loop.set_postfix(val_acc=check_accuracy(validation_loader, model))

        # check_accuracy() calls model.eval(), so training mode has to be
        # (re-)enabled at the start of every epoch, not just once up front;
        # otherwise dropout stays disabled while training.
        model.train()

        for imgs, labels in loop:
            imgs = imgs.to(device)
            labels = labels.to(device)

            outputs = model(imgs)
            loss = criterion(outputs, labels)

            optimizer.zero_grad()
            loss.backward()
            optimizer.step()

            loop.set_description(f"Epoch [{epoch}/{num_epochs}]")
            loop.set_postfix(loss=loss.item())


if __name__ == "__main__":
    train()
The error is caused on this line: predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device) but I don't understand why. I saw some other answers but those could not fix my problem.
Complete stack trace:
0%| | 0/13 [00:00<?, ?it/s]Checking accuracy on validation data
Traceback (most recent call last):
File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_CCN.py", line 80, in <module>
train()
File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_CCN.py", line 66, in train
loop.set_postfix(val_acc=check_accuracy(validation_loader, model))
File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_CCN.py", line 52, in check_accuracy
predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device)
File "/home/deffo/Documents/Unimore/Magistrale/Computer Vision and Cognitive Systems/Guitar_Fingering_&_Chords_Recognition/ChordsClassification/train_CCN.py", line 52, in <listcomp>
predictions = torch.tensor([1.0 if i >= 0.5 else 0.0 for i in scores]).to(device)
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
0%| | 0/13 [00:02<?, ?it/s]
The output of the model is a discrete distribution over your 7 classes. To retrieve the predicted class, you can directly apply an argmax over it:
# Forward pass: the model returns the class scores for the batch.
scores = model(x)
# Index of the highest score along dimension 1, i.e. the predicted class.
predictions = scores.argmax(1)