Pytorch RuntimeError - python

I am getting a RuntimeError while training the network, at this line:

loss.backward()

The whole error I'm getting:

RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [4, 2]], which is output 0 of SigmoidBackward0, is at version 4; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
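
As the hint in the message suggests, anomaly detection can be enabled before training so that autograd reports which forward operation produced the tensor that was later modified in place. A minimal sketch (the call goes anywhere before the forward/backward pass):

import torch

# Record forward-pass tracebacks so that the next failing
# loss.backward() points at the forward op that created the
# tensor that was modified in place.
torch.autograd.set_detect_anomaly(True)

With that enabled, the failing backward pass prints the traceback of the offending forward operation. The model and training loop are: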
class ANNModel(nn.Module):
    def __init__(self, input_shape, output_shape, device="cpu"):
        super(ANNModel, self).__init__()
        self.device = torch.device(device)
        self.loss = nn.CrossEntropyLoss()
        self.model = nn.Sequential(
            nn.Linear(input_shape, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, output_shape),
            nn.Sigmoid(),
        ).to(device)
        self.optim = RMSprop(params=self.model.parameters(), alpha=0.95, eps=0.01)

    def forward(self, x):
        return self.model(x)

    def train_model(self, X, Y, batch_size, epochs=1):
        for epoch in range(0, epochs):
            print("[INFO] epoch:{} ...".format(epoch + 1))
            trainloss = 0
            trainAcc = 0
            samples = 0
            self.model.train()
            # print(X)
            # print(Y)
            for (batchX, batchY) in self.next_batch(X, Y, batch_size):
                (batchX, batchY) = (batchX.to(self.device), batchY.to(self.device))
                predictions = self.model(batchX)
                loss = self.loss(predictions, batchY)
                self.optim.zero_grad()
                loss.backward()
                self.optim.step()
                trainloss += loss.item() * batchY.size(0)
                trainAcc += (predictions.max(1)[1] == batchY).sum().item()
                samples += batchY.size(0)
            traintemplate = "epoch:{} train loss:{:.3f} train accuracy:{:.3f}"
            print(traintemplate.format(epoch + 1, trainloss / samples, trainAcc / samples))

    def next_batch(self, inputs, targets, batchSize):
        for i in range(0, inputs.shape[0], batchSize):
            yield (inputs[i:i + batchSize], targets[i:i + batchSize])
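
The loop shown above is internally consistent, so the in-place modification likely happens elsewhere (e.g. an in-place op such as x += ..., indexing assignment, or inplace=True applied to the sigmoid output between forward and backward). For reference, a minimal reproducer of the same class of error, independent of the asker's code:

import torch

x = torch.randn(4, 2, requires_grad=True)
y = torch.sigmoid(x)  # SigmoidBackward0 saves its own output for the backward pass
y += 1                # in-place op bumps y's version counter
loss = y.sum()
loss.backward()       # RuntimeError: ... output 0 of SigmoidBackward0 ...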

Related

RuntimeError: Mat1 and mat2 shapes cannot be multiplied (256x65536 and 1024x4096) - Python

I am trying to use AlexNet to classify spectrogram images generated for 3s audio segments. I am aware that the input image to AlexNet must be 224x224, and I have transformed the train and test datasets accordingly. I am encountering the following error: mat1 and mat2 shapes cannot be multiplied (256x65536 and 1024x4096; see the link below for the full error message), and I am not entirely sure why. Can anyone help me figure out where I am going wrong?
(Full error screenshot: https://i.stack.imgur.com/cnVKp.png)
Transform data

data_transform_train = transforms.Compose([
    transforms.Resize(256),
    transforms.CenterCrop(256),
    transforms.ToTensor(),
    transforms.Normalize(norm_mean_train, norm_std_train),
])
Create dataloaders

train_size = int(len(train_data_df))
test_size = int(len(test_data_df))

ins_dataset_train = Audio(
    df=train_data_df[:train_size],
    transform=data_transform_train,
)
ins_dataset_test = Audio(
    df=test_data_df[:test_size],
    transform=data_transform_test,
)

train_loader = torch.utils.data.DataLoader(
    ins_dataset_train,
    batch_size=256,
    shuffle=True
)
test_loader = torch.utils.data.DataLoader(
    ins_dataset_test,
    batch_size=256,
    shuffle=True
)
AlexNet Model

class AlexNet(nn.Module):
    def __init__(self, output_dim):
        super().__init__()
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, 2, 1),  # in_channels, out_channels, kernel_size, stride, padding
            nn.MaxPool2d(2),            # kernel_size
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 384, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 256, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True)
        )
        self.classifier = nn.Sequential(
            nn.Dropout(0.5),
            nn.Linear(256 * 2 * 2, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(0.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, output_dim),
        )

    def forward(self, x):
        x = self.features(x)
        h = x.view(x.shape[0], -1)
        x = self.classifier(h)
        return x, h
output_dim = 2
model = AlexNet(output_dim)

def count_parameters(model):
    return sum(p.numel() for p in model.parameters() if p.requires_grad)

print(f'The model has {count_parameters(model):,} trainable parameters')

def initialize_parameters(m):
    if isinstance(m, nn.Conv2d):
        nn.init.kaiming_normal_(m.weight.data, nonlinearity='relu')
        nn.init.constant_(m.bias.data, 0)
    elif isinstance(m, nn.Linear):
        nn.init.xavier_normal_(m.weight.data, gain=nn.init.calculate_gain('relu'))
        nn.init.constant_(m.bias.data, 0)

model.apply(initialize_parameters)
Learning Rate Finder

class LRFinder:
    def __init__(self, model, optimizer, criterion, device):
        self.optimizer = optimizer
        self.model = model
        self.criterion = criterion
        self.device = device
        torch.save(model.state_dict(), 'init_params.pt')

    def range_test(self, iterator, end_lr=10, num_iter=100, smooth_f=0.05, diverge_th=5):
        lrs = []
        losses = []
        best_loss = float('inf')
        lr_scheduler = ExponentialLR(self.optimizer, end_lr, num_iter)
        iterator = IteratorWrapper(iterator)
        for iteration in range(num_iter):
            loss = self._train_batch(iterator)
            lrs.append(lr_scheduler.get_last_lr()[0])
            lr_scheduler.step()
            if iteration > 0:
                loss = smooth_f * loss + (1 - smooth_f) * losses[-1]
            if loss < best_loss:
                best_loss = loss
            losses.append(loss)
            if loss > diverge_th * best_loss:
                print("Stopping early, the loss has diverged")
                break
        # restore the initial weights (self.model, not a global model)
        self.model.load_state_dict(torch.load('init_params.pt'))
        return lrs, losses

    def _train_batch(self, iterator):
        self.model.train()
        self.optimizer.zero_grad()
        x, y = iterator.get_batch()
        x = x.to(self.device)
        y = y.to(self.device)
        y_pred, _ = self.model(x)
        loss = self.criterion(y_pred, y)
        loss.backward()
        self.optimizer.step()
        return loss.item()
from torch.optim.lr_scheduler import _LRScheduler

class ExponentialLR(_LRScheduler):
    def __init__(self, optimizer, end_lr, num_iter, last_epoch=-1):
        self.end_lr = end_lr
        self.num_iter = num_iter
        super(ExponentialLR, self).__init__(optimizer, last_epoch)

    def get_lr(self):
        curr_iter = self.last_epoch
        r = curr_iter / self.num_iter
        return [base_lr * (self.end_lr / base_lr) ** r
                for base_lr in self.base_lrs]

class IteratorWrapper:
    def __init__(self, iterator):
        self.iterator = iterator
        self._iterator = iter(iterator)

    def __next__(self):
        try:
            inputs, labels = next(self._iterator)
        except StopIteration:
            self._iterator = iter(self.iterator)
            inputs, labels, *_ = next(self._iterator)
        return inputs, labels

    def get_batch(self):
        return next(self)
start_learning_rate = 1e-7
optimizer = optim.Adam(model.parameters(), lr=start_learning_rate)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
criterion = nn.CrossEntropyLoss()
model = model.to(device)
criterion = criterion.to(device)

END_LR = 10
NUM_ITER = 100

lr_finder = LRFinder(model, optimizer, criterion, device)
lrs, losses = lr_finder.range_test(train_loader, END_LR, NUM_ITER)
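
The shape mismatch is consistent with the transforms above: with Resize(256)/CenterCrop(256) the inputs are 256x256, and after the strided convolution and three 2x2 max-pools the feature map is 256 channels of 16x16, i.e. 256 * 16 * 16 = 65536 values per image (the 256 rows of mat1 are simply the batch size), while the first Linear layer expects 256 * 2 * 2 = 1024. A hedged way to verify this, rather than hard-coding the classifier's in_features, is to probe the feature extractor with a dummy batch; this is a sketch, not the asker's code:

import torch

# Run one dummy image, the same size as the transformed inputs,
# through the conv stack to find the flattened feature size.
features = AlexNet(output_dim=2).features
with torch.no_grad():
    out = features(torch.zeros(1, 3, 256, 256))
print(out.shape)                # torch.Size([1, 256, 16, 16])
print(out.flatten(1).shape[1])  # 65536 -> what the first nn.Linear's in_features must be

Either the first Linear layer's in_features should be set to this probed value, or the inputs cropped/resized to the size the 256 * 2 * 2 classifier assumes.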

Given groups=1, weight of size [8, 32, 3], expected input[1, 9999, 5024] to have 32 channels, but got 9999 channels instead

I'm trying to train a CNN with PyTorch. The error message I'm getting is:

Given groups=1, weight of size [8, 32, 3], expected input[1, 9999, 5024] to have 32 channels, but got 9999 channels instead

Before starting to train my architecture, I hand my data and labels to the DataLoader:

images_batch = torch.from_numpy(np.array(X))
labels_batch = torch.from_numpy(np.array(y))

dataset_train = TensorDataset(images_batch, labels_batch)
train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True)
The dimension of X is (5024, 9999, 1), with 5024 being the number of instances and 9999 the sequence length. The dimension of y is (5024, 1).
My current code for the model is the following:
class Model(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.conv1 = nn.Conv1d(32, 8, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=0)
        #self.fc1 = nn.Linear(32, 2)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2)
        X = self.conv3(X)
        X = F.max_pool2d(X, 2)
        #X = self.fc1(X)
        return F.softmax(X, dim=1)
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
model = Model().to('cpu')

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)

def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        print(
            f"Got {num_correct} / {num_samples} with accuracy"
            f" {float(num_correct) / float(num_samples) * 100:.2f}"
        )
    model.train()

check_accuracy(train_loader, model)
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device=device)      # .to() on a tensor is not in-place
        targets = targets.to(device=device)
        data = data.reshape(data[0], 1)
        scores = model(data)
        loss = criterion(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

I'm aware that the tensor dimension ordering in TensorFlow is different from PyTorch's.
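
That ordering difference is the likely cause: PyTorch's Conv1d expects input of shape (batch, channels, length), so data shaped (instances, sequence length, channels) has to be permuted before the forward pass, and the first convolution should take the actual number of input channels (here 1, not 32). A sketch under those assumptions, not a confirmed fix:

import torch
import torch.nn as nn
import torch.nn.functional as F

# Hypothetical batch in the original (batch, length, channels) layout:
x = torch.randn(32, 9999, 1)
x = x.permute(0, 2, 1)  # -> (batch, channels, length) = (32, 1, 9999)

# The first conv then takes 1 input channel (Conv1d args: in, out, kernel):
conv1 = nn.Conv1d(1, 8, kernel_size=3, stride=1, padding=0)
out = F.relu(conv1(x))  # -> (32, 8, 9997)
print(out.shape)

The later pooling calls would likewise need F.max_pool1d rather than F.max_pool2d, since the activations are 3-D, not 4-D.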

Pytorch: RuntimeError: result type Float can't be cast to the desired output type Long

I have a model which looks as follows:
IMG_WIDTH = IMG_HEIGHT = 224

class AlexNet(nn.Module):
    def __init__(self, output_dim):
        super(AlexNet, self).__init__()
        self._to_linear = None
        self.x = torch.randn(3, IMG_WIDTH, IMG_HEIGHT).view(-1, 3, IMG_WIDTH, IMG_HEIGHT)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, 2, 1),  # in_channels, out_channels, kernel_size, stride, padding
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 384, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True)
        )
        self.conv(self.x)
        self.classifier = nn.Sequential(
            nn.Dropout(.5),
            nn.Linear(self._to_linear, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, output_dim),
        )

    def conv(self, x):
        x = self.features(x)
        if self._to_linear is None:
            self._to_linear = x.shape[1] * x.shape[2] * x.shape[3]
        return x

    def forward(self, x):
        x = self.conv(x)
        h = x.view(x.shape[0], -1)
        x = self.classifier(h)
        return x, h
Here are my optimizer and loss function:

optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
Here are my train and evaluate functions:

def train(model, iterator, optimizer, criterion, device):
    epoch_loss, epoch_acc = 0, 0
    model.train()
    for (x, y) in iterator:
        # features and labels to the device
        x = x.to(device)
        y = y.to(device).long()
        # zero the gradients
        optimizer.zero_grad()
        y_pred, _ = model(x)
        # calculate the loss and accuracy
        loss = criterion(y_pred.squeeze(), y)
        acc = binary_accuracy(y_pred, y)
        # backward propagate
        loss.backward()
        # update the weights
        optimizer.step()
        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):
    epoch_loss, epoch_acc = 0, 0
    model.eval()
    with torch.no_grad():
        for (x, y) in iterator:
            x = x.to(device)
            y = y.to(device).long()
            y_pred, _ = model(x)
            loss = criterion(y_pred, y)
            acc = binary_accuracy(y_pred, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
This is the error that I'm getting:

RuntimeError: result type Float can't be cast to the desired output type Long

What could be the problem? I have tried converting my labels to long tensors as follows:

y = y.to(device).long()

but that does not seem to work.
I was getting the same error doing this:

loss_fn(output, target)

where output was a torch.float32 tensor and target was a torch.int64 tensor. What solved the problem was calling the loss function like this:

loss_fn(output, target.float())

I encountered this error while using a library (Hugging Face). In that case you do not have access to the code that computes the loss, so you cannot cast inside it; instead, convert the data type of the labels you pass to the library. What worked for me was:

labels = labels.astype(np.float32).tolist()
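
The common thread: nn.BCEWithLogitsLoss expects float targets with the same shape as the logits, while nn.CrossEntropyLoss expects long class indices, so converting y with .long() is exactly backwards for the BCEWithLogitsLoss used above. A minimal sketch illustrating both dtype requirements:

import torch
import torch.nn as nn

logits = torch.randn(4)                # raw model outputs for 4 samples
labels = torch.tensor([0, 1, 1, 0])    # int64 by default

bce = nn.BCEWithLogitsLoss()
# bce(logits, labels)                  # RuntimeError: Float can't be cast to Long
loss = bce(logits, labels.float())     # BCEWithLogitsLoss needs float targets
print(loss.item())

ce = nn.CrossEntropyLoss()             # CrossEntropyLoss needs long class indices
loss2 = ce(torch.randn(4, 2), labels)  # (N, C) logits, (N,) int64 labels
print(loss2.item())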

Why are PyTorch gradients arrays and not vectors?

I am trying to calculate the dot product of the gradients of the same layer from two different epochs, but when I use print(model.layer1[0].weight.grad) it returns:
tensor([[[[-1.1855e-03, -3.7884e-03, -2.8973e-03, -2.8847e-03, -9.6510e-04],
[-2.0213e-03, -4.4927e-03, -5.4852e-03, -6.6060e-03, -3.5726e-03],
[ 7.4499e-04, -1.8440e-03, -5.0472e-03, -5.6322e-03, -1.9532e-03],
[-4.5696e-04, 9.6445e-04, -1.4923e-03, -2.9467e-03, -1.4610e-03],
[ 2.4987e-04, 2.2086e-03, -7.6576e-04, -2.7009e-03, -2.8571e-03]]],
[[[ 2.1447e-03, 3.1090e-03, 6.8175e-03, 6.4778e-03, 3.0501e-03],
[ 2.0214e-03, 3.9936e-03, 7.9528e-03, 6.0224e-03, 1.7545e-03],
[ 3.8781e-03, 5.6659e-03, 6.6901e-03, 5.4041e-03, 7.8014e-04],
[ 4.4273e-03, 3.4548e-03, 5.7185e-03, 4.1650e-03, 9.9067e-04],
[ 4.6075e-03, 4.1176e-03, 6.8392e-03, 3.4005e-03, 1.0009e-03]]],
[[[-3.8654e-04, -2.9567e-03, -6.1341e-03, -8.3991e-03, -8.2343e-03],
[-2.9113e-03, -5.4605e-03, -6.3008e-03, -8.2075e-03, -9.6702e-03],
[-1.5218e-03, -4.4105e-03, -5.5651e-03, -6.8926e-03, -6.6076e-03],
[-6.0357e-04, -3.1118e-03, -4.4441e-03, -4.0519e-03, -3.9733e-03],
[-2.8683e-04, -1.6281e-03, -4.2213e-03, -5.5304e-03, -5.0142e-03]]],
[[[-3.7607e-04, -1.7234e-04, -1.4569e-03, -3.5825e-04, 1.4530e-03],
[ 2.6226e-04, 8.5076e-04, 1.2195e-03, 2.7885e-03, 2.5953e-03],
[-7.7404e-04, 1.0984e-03, 7.8208e-04, 5.1286e-03, 4.6842e-03],
[-1.8183e-03, 8.9730e-04, 1.0955e-03, 4.9259e-03, 6.4677e-03],
[ 1.1674e-03, 4.0651e-03, 4.5886e-03, 8.3678e-03, 8.9893e-03]]],
Are those the gradients? If so, why are they not a vector? Below is my neural network:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(7 * 7 * 64, 1000)
        self.fc2 = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out
Below is the code I use to train the model and collect the gradients:
model = ConvNet()
klisi = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Run the forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the accuracy
        total = labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()
        acc_list.append(correct / total)

        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(),
                          (correct / total) * 100))

    print(model.layer1[0].weight.grad)
    klisi.append(model.layer1[0].weight.grad)
    print(optimizer.param_groups[0]['lr'])
    optimizer.param_groups[0]['lr'] *= 0.9999
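
The printed tensor is indeed the gradient: layer1[0] is Conv2d(1, 32, kernel_size=5), so its weight (and therefore its gradient) has shape [32, 1, 5, 5], one 5x5 kernel per output channel, not a flat vector. To take a dot product between epochs, the gradient can be flattened; it should also be cloned before storing, because the .grad tensor is reused in place across iterations, so plain references all end up pointing at the latest values. A minimal sketch along those lines:

import torch

# Store a detached copy; klisi.append(grad) without clone() keeps a
# live reference that later backward passes overwrite.
g = model.layer1[0].weight.grad.detach().clone()
klisi.append(g)

# Dot product of the flattened gradients from the last two epochs:
if len(klisi) >= 2:
    dot = torch.dot(klisi[-1].flatten(), klisi[-2].flatten())
    print(dot.item())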

Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same

I'm trying to implement ResNet18 in PyTorch but I'm having some trouble with it. My code is this:
device = torch.device("cuda:0")

class ResnetBlock(nn.Module):
    def __init__(self, strides, nf, nf0, reps, bn):
        super(ResnetBlock, self).__init__()
        self.adapt = strides == 2
        self.layers = []
        self.relus = []
        self.adapt_layer = nn.Conv2d(nf0, nf, kernel_size=1, stride=strides, padding=0) if self.adapt else None
        for i in range(reps):
            self.layers.append(nn.Sequential(
                nn.Conv2d(nf0, nf, kernel_size=3, stride=strides, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99),
                nn.ReLU(),
                nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99)))
            self.relus.append(nn.ReLU())
            strides = 1
            nf0 = nf

    def forward(self, x):
        for i, (layer, relu) in enumerate(zip(self.layers, self.relus)):
            rama = layer(x)
            if self.adapt and i == 0:
                x = self.adapt_layer(x)
            x = x + rama
            x = relu(x)
        return x
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.blocks = nn.Sequential(
            ResnetBlock(1, 64, 64, 2, bn),
            ResnetBlock(2, 128, 64, 2, bn),
            ResnetBlock(2, 256, 128, 2, bn),
            ResnetBlock(2, 512, 256, 2, bn))
        self.fcout = nn.Linear(512, 10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.blocks(out)
        out = out.reshape(out.size(0), -1)
        out = self.fcout(out)
        return out
num_epochs = 50
num_classes = 10
batch_size = 50
learning_rate = 0.00001

trans = transforms.ToTensor()
train_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=True, download=True, transform=trans)
test_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=False, download=True, transform=trans)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)

def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight.data)
        nn.init.zeros_(m.bias.data)

model = ConvNet()
model.apply(weights_init)
model.to(device)
summary(model, (3, 32, 32))
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-6)

# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
acc_list_test = []
for epoch in range(num_epochs):
    total = 0
    correct = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        # Run the forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        loss.backward()
        optimizer.step()

        # Track the accuracy
        total += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        acc_list.append(correct / total)

    print("Train")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct / total) * 100))

    total_test = 0
    correct_test = 0
    for i, (images, labels) in enumerate(test_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Run the forward pass
        outputs = model(images)

        # Track the accuracy
        total_test += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct_test += (predicted == labels).sum().item()
        acc_list_test.append(correct_test / total_test)

    print("Test")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct_test / total_test) * 100))
It's weird because it's throwing the error Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same even though I've moved both the model and the data to CUDA.
I guess it's related to how I defined or used ResnetBlock, because if I remove those blocks from ConvNet (removing the line out = self.blocks(out)), the code works. But I don't know what I'm doing wrong.
The problem is that ResnetBlock stores its sublayers in vanilla Python lists, which PyTorch cannot track: modules inside a plain list are not registered as submodules, so model.to(device) never moves their parameters to the GPU, and their weights stay on the CPU. (For an nn.Module, unlike a tensor, .to() is applied in-place and returns the model itself, so model.to(device) alone is fine for everything that is registered.) You need to use nn.ModuleList.
From

self.layers = []
self.relus = []

To

self.layers = nn.ModuleList()
self.relus = nn.ModuleList()
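
For completeness, a sketch of the corrected constructor (only the container types change; the rest of ResnetBlock stays as above):

class ResnetBlock(nn.Module):
    def __init__(self, strides, nf, nf0, reps, bn):
        super(ResnetBlock, self).__init__()
        self.adapt = strides == 2
        # ModuleList registers each appended module, so .to(device),
        # .parameters() and state_dict() all see them.
        self.layers = nn.ModuleList()
        self.relus = nn.ModuleList()
        self.adapt_layer = (nn.Conv2d(nf0, nf, kernel_size=1, stride=strides, padding=0)
                            if self.adapt else None)
        for i in range(reps):
            self.layers.append(nn.Sequential(
                nn.Conv2d(nf0, nf, kernel_size=3, stride=strides, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99),
                nn.ReLU(),
                nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99)))
            self.relus.append(nn.ReLU())
            strides = 1
            nf0 = nf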
