I am trying to calculate the dot product of the gradients of the same layer from two different epochs, but when I use print(model.layer1[0].weight.grad) it returns
tensor([[[[-1.1855e-03, -3.7884e-03, -2.8973e-03, -2.8847e-03, -9.6510e-04],
[-2.0213e-03, -4.4927e-03, -5.4852e-03, -6.6060e-03, -3.5726e-03],
[ 7.4499e-04, -1.8440e-03, -5.0472e-03, -5.6322e-03, -1.9532e-03],
[-4.5696e-04, 9.6445e-04, -1.4923e-03, -2.9467e-03, -1.4610e-03],
[ 2.4987e-04, 2.2086e-03, -7.6576e-04, -2.7009e-03, -2.8571e-03]]],
[[[ 2.1447e-03, 3.1090e-03, 6.8175e-03, 6.4778e-03, 3.0501e-03],
[ 2.0214e-03, 3.9936e-03, 7.9528e-03, 6.0224e-03, 1.7545e-03],
[ 3.8781e-03, 5.6659e-03, 6.6901e-03, 5.4041e-03, 7.8014e-04],
[ 4.4273e-03, 3.4548e-03, 5.7185e-03, 4.1650e-03, 9.9067e-04],
[ 4.6075e-03, 4.1176e-03, 6.8392e-03, 3.4005e-03, 1.0009e-03]]],
[[[-3.8654e-04, -2.9567e-03, -6.1341e-03, -8.3991e-03, -8.2343e-03],
[-2.9113e-03, -5.4605e-03, -6.3008e-03, -8.2075e-03, -9.6702e-03],
[-1.5218e-03, -4.4105e-03, -5.5651e-03, -6.8926e-03, -6.6076e-03],
[-6.0357e-04, -3.1118e-03, -4.4441e-03, -4.0519e-03, -3.9733e-03],
[-2.8683e-04, -1.6281e-03, -4.2213e-03, -5.5304e-03, -5.0142e-03]]],
[[[-3.7607e-04, -1.7234e-04, -1.4569e-03, -3.5825e-04, 1.4530e-03],
[ 2.6226e-04, 8.5076e-04, 1.2195e-03, 2.7885e-03, 2.5953e-03],
[-7.7404e-04, 1.0984e-03, 7.8208e-04, 5.1286e-03, 4.6842e-03],
[-1.8183e-03, 8.9730e-04, 1.0955e-03, 4.9259e-03, 6.4677e-03],
[ 1.1674e-03, 4.0651e-03, 4.5886e-03, 8.3678e-03, 8.9893e-03]]],
Are those the gradients? If so, why are they not a vector? Below is my neural network:
class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(1, 32, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(
            nn.Conv2d(32, 64, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(7 * 7 * 64, 1000)
        self.fc2 = nn.Linear(1000, 10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        out = self.fc1(out)
        out = self.fc2(out)
        return out
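(For reference, an inference from the layer sizes, assuming 28×28 inputs such as MNIST: each 5×5 convolution with padding=2 preserves the spatial size, and each stride-2 max-pool halves it, 28 → 14 → 7, which is why fc1 takes 7 * 7 * 64 = 3136 input features.)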
Below is the code showing how I train the model and collect the gradients:
model = ConvNet()
klisi = []

# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
    for i, (images, labels) in enumerate(train_loader):
        # Run the forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

        # Track the accuracy
        total = labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct = (predicted == labels).sum().item()
        acc_list.append(correct / total)

        if (i + 1) % 100 == 0:
            print('Epoch [{}/{}], Step [{}/{}], Loss: {:.4f}, Accuracy: {:.2f}%'
                  .format(epoch + 1, num_epochs, i + 1, total_step, loss.item(),
                          (correct / total) * 100))

        print(model.layer1[0].weight.grad)
        klisi.append(model.layer1[0].weight.grad)
        print(optimizer.param_groups[0]['lr'])
        optimizer.param_groups[0]['lr'] *= 0.9999
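For reference, the gradient of a Conv2d weight has the same shape as the weight itself, [out_channels, in_channels, kernel_h, kernel_w] (here [32, 1, 5, 5]), which is why the printout is a 4-D tensor rather than a vector. A minimal sketch of the dot product across epochs, assuming the gradients are cloned before being stored (appending .grad directly keeps a live reference that later backward passes may overwrite):

klisi.append(model.layer1[0].weight.grad.detach().clone())  # store a frozen copy

# later: dot product of the layer's gradients from two different epochs
g0 = klisi[0].flatten()   # 4-D gradient -> 1-D vector
g1 = klisi[1].flatten()
print(torch.dot(g0, g1).item())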
Related
I'm trying to train a CNN with PyTorch. The error message I'm getting is:
Given groups=1, weight of size [8, 32, 3], expected input[1, 9999, 5024] to have 32 channels, but got 9999 channels instead
Before starting to train my architecture, I hand my data and labels to a TensorDataset and DataLoader:
images_batch = torch.from_numpy(np.array(X))
labels_batch = torch.from_numpy(np.array(y))
dataset_train = TensorDataset(images_batch, labels_batch)
train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True)
The dimension of X is (5024, 9999, 1), with 5024 being the number of instances and 9999 the sequence length. The dimension of y is (5024, 1).
My current code for the model is the following:
class Model(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.conv1 = nn.Conv1d(32, 8, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=0)
        #self.fc1 = nn.Linear(32, 2)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2)
        X = self.conv3(X)
        X = F.max_pool2d(X, 2)
        #X = self.fc1(X)
        return F.softmax(X, dim=1)
device = torch.device('cuda') if torch.cuda.is_available() else torch.device('cpu')
model = Model().to(device)

# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        print(
            f"Got {num_correct} / {num_samples} with accuracy"
            f" {float(num_correct) / float(num_samples) * 100:.2f}"
        )
    model.train()

check_accuracy(train_loader, model)
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data = data.to(device=device)
        targets = targets.to(device=device)
        data = data.reshape(data[0], 1)

        scores = model(data)
        loss = criterion(scores, targets)

        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
I'm aware that in TensorFlow the ordering of the tensor dimensions is different than in PyTorch.
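For what it's worth, a minimal sketch of the channels-first layout that Conv1d expects, (batch, channels, length); the permute call and the in_channels=1 change are illustrative assumptions, not code from the original post:

X_t = torch.from_numpy(np.array(X)).float()  # (5024, 9999, 1): batch, length, channels
X_t = X_t.permute(0, 2, 1)                   # (5024, 1, 9999): batch, channels, length
# the first conv would then need a single input channel, e.g.
# self.conv1 = nn.Conv1d(1, 8, kernel_size=3, stride=1, padding=0)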
I am getting a RuntimeError while training the network at this line:
loss.backward()
The full error I'm getting:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [4, 2]], which is output 0 of SigmoidBackward0, is at version 4; expected version 0 instead. Hint: enable anomaly detection to find the operation that failed to compute its gradient, with torch.autograd.set_detect_anomaly(True).
class ANNModel(nn.Module):
    def __init__(self, input_shape, output_shape, device="cpu"):
        super(ANNModel, self).__init__()
        self.device = torch.device(device)
        self.loss = nn.CrossEntropyLoss()
        self.model = nn.Sequential(
            nn.Linear(input_shape, 512),
            nn.ReLU(),
            nn.Linear(512, 256),
            nn.ReLU(),
            nn.Linear(256, 64),
            nn.ReLU(),
            nn.Linear(64, 16),
            nn.ReLU(),
            nn.Linear(16, output_shape),
            nn.Sigmoid(),
        ).to(device)
        self.optim = RMSprop(params=self.model.parameters(), alpha=0.95, eps=0.01)

    def forward(self, x):
        return self.model(x)

    def train_model(self, X, Y, batch_size, epochs=1):
        for epoch in range(0, epochs):
            print("[INFO] epoch:{} ...".format(epoch + 1))
            trainloss = 0
            trainAcc = 0
            samples = 0
            self.model.train()
            # print(X)
            # print(Y)
            for (batchX, batchY) in self.next_batch(X, Y, batch_size):
                (batchX, batchY) = (batchX.to(self.device), batchY.to(self.device))
                predictions = self.model(batchX)
                loss = self.loss(predictions, batchY)

                self.optim.zero_grad()
                loss.backward()
                self.optim.step()

                trainloss += loss.item() * batchY.size(0)
                trainAcc += (predictions.max(1)[1] == batchY).sum().item()
                samples += batchY.size(0)

            traintemplate = "epoch:{} train loss:{:.3f} train accuracy:{:.3f}"
            print(traintemplate.format(epoch + 1, trainloss / samples, trainAcc / samples))

    def next_batch(self, inputs, targets, batchSize):
        for i in range(0, inputs.shape[0], batchSize):
            yield (inputs[i:i + batchSize], targets[i:i + batchSize])
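As the error's own hint suggests, autograd anomaly detection can locate the offending in-place operation; a minimal diagnostic sketch (it identifies the failing op, it does not fix it — the train_model call shown is a hypothetical example):

torch.autograd.set_detect_anomaly(True)  # the failing backward will now also print
                                         # the forward-pass op that was modified in place
model.train_model(X, Y, batch_size=4)    # re-run whatever raises the error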
I've built a CNN using Pytorch and am attempting to train it to classify dog and cat images from this Kaggle dataset.
The training loss starts at ~9 after the first epoch and then gets stuck at ~0.69 from the second epoch onwards. The testing loss and the accuracy are stuck at ~0.69 and ~50% throughout training.
At the moment my parameters are as follows:
batch_size = 128
num_epochs = 10
learning_rate = 0.1
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adamax(model.parameters(), lr=learning_rate)
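For context, an observation not in the original post: 0.69 ≈ ln 2, which is exactly the cross-entropy of a two-class classifier that assigns equal probability to both classes, so a loss of ~0.69 with ~50% accuracy means the model is predicting at chance level:

import math
print(math.log(2))  # 0.6931... = CrossEntropyLoss of a uniform 2-class prediction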
I've tried changing the batch size, the optimizer and the learning rate. I've attached my code for creating the dataset and the CNN, and the training loop below.
Definition of dataset class and transforms for augmentations
class CatDogDataset(Dataset):
    def __init__(self, images_list, mode="train", transform=None):
        self.images_list = images_list
        self.mode = mode
        self.transform = transform

    # dataset length
    def __len__(self):
        self.dataset_len = len(self.images_list)
        return self.dataset_len

    # load an image
    def __getitem__(self, idx):
        image_name = self.images_list[idx]
        image = Image.open(image_name)
        image = image.resize((224, 224))  # this is important when feeding into a pretrained model
        transformed_image = self.transform(image)
        image_category = image_name.split("/")[-1].split(".")[0]
        if self.mode == "train" or self.mode == "val":
            if image_category == "cat":
                label = 0
            else:
                label = 1
            return transformed_image, label
        else:
            return transformed_image
train_transforms = transforms.Compose([
    transforms.RandomHorizontalFlip(p=0.5),
    transforms.RandomRotation(15),
    transforms.RandomResizedCrop(224, scale=(0.8, 1.0), ratio=(1.0, 1.0)),
    transforms.ToTensor(),
    transforms.Normalize((0, 0, 0), (1, 1, 1))
])

val_transforms = transforms.Compose([
    transforms.ToTensor(),
    transforms.Normalize((0, 0, 0), (1, 1, 1))
])
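A side note on the transforms (an observation, not from the original post): Normalize((0, 0, 0), (1, 1, 1)) subtracts 0 and divides by 1, so it leaves the [0, 1] output of ToTensor unchanged. If real normalization is intended, per-channel dataset statistics are the usual choice, e.g. the ImageNet values:

transforms.Normalize((0.485, 0.456, 0.406), (0.229, 0.224, 0.225))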
CNN class definition
class CNN(nn.Module):
    def __init__(self):
        super().__init__()
        self.cnn_layers = nn.Sequential(
            # convolutional layer 1
            nn.Conv2d(in_channels=3, out_channels=16, kernel_size=5, padding=0, stride=1),
            nn.BatchNorm2d(16),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            # convolutional layer 2
            nn.Conv2d(in_channels=16, out_channels=32, kernel_size=5, padding=0, stride=1),
            nn.BatchNorm2d(32),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2),
            # convolutional layer 3
            nn.Conv2d(in_channels=32, out_channels=64, kernel_size=5, padding=0, stride=1),
            nn.BatchNorm2d(64),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2)
        )
        self.linear_layers = nn.Sequential(
            nn.Linear(in_features=64 * 24 * 24, out_features=10),
            nn.ReLU(),
            nn.Dropout(0.3),
            nn.Linear(in_features=10, out_features=2)
        )

    def forward(self, x):
        out = self.cnn_layers(x)
        #print(out.shape)
        out = out.view(-1, 64 * 24 * 24)  # flatten
        out = self.linear_layers(out)
        return out
Model training and validation
from tqdm import tqdm

train_losses = []
val_losses = []
accuracy_list = []
for epoch in range(num_epochs):
    # perform training on train set
    model.train()
    running_loss = 0
    for images, labels in tqdm(train_dataloader):
        # load to gpu
        images = images.to(device)
        labels = labels.to(device)

        # forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        running_loss += loss.item()

        # backprop and update model params
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()

    # calculate training loss for the epoch
    train_losses.append(running_loss / len(train_dataloader))

    # calculate loss and accuracy on validation set
    model.eval()
    running_loss = 0
    num_correct = 0
    num_predictions = 0
    with torch.no_grad():
        for images, labels in tqdm(val_dataloader):
            # load to gpu
            images = images.to(device)
            labels = labels.to(device)

            # forward pass
            outputs = model(images)
            loss = criterion(outputs, labels)
            running_loss += loss.item()

            # calculate accuracy for batch
            _, predicted = torch.max(outputs.data, 1)
            num_correct += (predicted == labels).sum().item()
            num_predictions += labels.size(0)

    # calculate val loss for epoch
    val_losses.append(running_loss / len(val_dataloader))

    # calculate accuracy for epoch
    accuracy = num_correct / num_predictions * 100
    accuracy_list.append(accuracy)

    print("[Epoch: %d / %d], [Train loss: %.4f], [Test loss: %.4f], [Acc: %.2f]" \
          % (epoch + 1, num_epochs, train_losses[-1], val_losses[-1], accuracy))
I have a model which looks as follows:
IMG_WIDTH = IMG_HEIGHT = 224
class AlexNet(nn.Module):
    def __init__(self, output_dim):
        super(AlexNet, self).__init__()
        self._to_linear = None
        self.x = torch.randn(3, IMG_WIDTH, IMG_HEIGHT).view(-1, 3, IMG_WIDTH, IMG_HEIGHT)
        self.features = nn.Sequential(
            nn.Conv2d(3, 64, 3, 2, 1),  # in_channels, out_channels, kernel_size, stride, padding
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(64, 192, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(192, 384, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(384, 256, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True),
            nn.Conv2d(256, 512, 3, padding=1),
            nn.ReLU(inplace=True),
            nn.Conv2d(512, 256, 3, padding=1),
            nn.MaxPool2d(2),
            nn.ReLU(inplace=True)
        )
        self.conv(self.x)
        self.classifier = nn.Sequential(
            nn.Dropout(.5),
            nn.Linear(self._to_linear, 4096),
            nn.ReLU(inplace=True),
            nn.Dropout(.5),
            nn.Linear(4096, 4096),
            nn.ReLU(inplace=True),
            nn.Linear(4096, output_dim),
        )

    def conv(self, x):
        x = self.features(x)
        if self._to_linear is None:
            self._to_linear = x.shape[1] * x.shape[2] * x.shape[3]
        return x

    def forward(self, x):
        x = self.conv(x)
        h = x.view(x.shape[0], -1)
        x = self.classifier(h)
        return x, h
Here is my optimizer and loss functions:
optimizer = torch.optim.Adam(model.parameters())
criterion = nn.BCEWithLogitsLoss().to(device)
Here is my train and evaluate functions:
def train(model, iterator, optimizer, criterion, device):
    epoch_loss, epoch_acc = 0, 0
    model.train()
    for (x, y) in iterator:
        # features and labels to the device
        x = x.to(device)
        y = y.to(device).long()

        # Zero the gradients
        optimizer.zero_grad()
        y_pred, _ = model(x)

        # Calculate the loss and accuracy
        loss = criterion(y_pred.squeeze(), y)
        acc = binary_accuracy(y_pred, y)

        # Backward propagate
        loss.backward()

        # Update the weights
        optimizer.step()

        epoch_loss += loss.item()
        epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)

def evaluate(model, iterator, criterion, device):
    epoch_loss, epoch_acc = 0, 0
    model.eval()
    with torch.no_grad():
        for (x, y) in iterator:
            x = x.to(device)
            y = y.to(device).long()
            y_pred, _ = model(x)
            loss = criterion(y_pred, y)
            acc = binary_accuracy(y_pred, y)
            epoch_loss += loss.item()
            epoch_acc += acc.item()
    return epoch_loss / len(iterator), epoch_acc / len(iterator)
This is the error that I'm getting:
RuntimeError: result type Float can't be cast to the desired output type Long
What might my problem be? I have tried converting my labels to long tensors as follows:
y = y.to(device).long()
But it does not seem to work.
I was getting the same error doing this:
loss_fn(output, target)
where output was a torch.float32 tensor and target was a torch.int64 tensor. What solved the problem was calling the loss function like this:
loss_fn(output, target.float())
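Applied to the asker's train function above, the same idea would look like this; a minimal sketch, assuming BCEWithLogitsLoss is kept (it expects float targets with the same shape as the logits):

y = y.to(device).float()               # float targets instead of .long()
loss = criterion(y_pred.squeeze(), y)  # shapes must match: squeeze the logits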
I encountered this error while using a library (Hugging Face). In that case you do not have access to the code that computes the loss, so you cannot cast the tensors it builds internally; instead, convert the labels before handing them to the library. What worked for me was:
labels = labels.astype(np.float32).tolist()
I'm trying to implement ResNet18 in PyTorch but I'm having some trouble with it. My code is this:
device = torch.device("cuda:0")

class ResnetBlock(nn.Module):
    def __init__(self, strides, nf, nf0, reps, bn):
        super(ResnetBlock, self).__init__()
        self.adapt = strides == 2
        self.layers = []
        self.relus = []
        self.adapt_layer = nn.Conv2d(nf0, nf, kernel_size=1, stride=strides, padding=0) if self.adapt else None
        for i in range(reps):
            self.layers.append(nn.Sequential(
                nn.Conv2d(nf0, nf, kernel_size=3, stride=strides, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99),
                nn.ReLU(),
                nn.Conv2d(nf, nf, kernel_size=3, stride=1, padding=1),
                nn.BatchNorm2d(nf, eps=0.001, momentum=0.99)))
            self.relus.append(nn.ReLU())
            strides = 1
            nf0 = nf

    def forward(self, x):
        for i, (layer, relu) in enumerate(zip(self.layers, self.relus)):
            rama = layer(x)
            if self.adapt and i == 0:
                x = self.adapt_layer(x)
            x = x + rama
            x = relu(x)
        return x

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(
            nn.Conv2d(3, 64, kernel_size=7, stride=2, padding=3),
            nn.MaxPool2d(kernel_size=2, stride=2))
        self.blocks = nn.Sequential(
            ResnetBlock(1, 64, 64, 2, bn),
            ResnetBlock(2, 128, 64, 2, bn),
            ResnetBlock(2, 256, 128, 2, bn),
            ResnetBlock(2, 512, 256, 2, bn))
        self.fcout = nn.Linear(512, 10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.blocks(out)
        out = out.reshape(out.size(0), -1)
        out = self.fcout(out)
        return out
num_epochs = 50
num_classes = 10
batch_size = 50
learning_rate = 0.00001
trans = transforms.ToTensor()
train_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=True, download=True, transform=trans)
test_dataset = torchvision.datasets.CIFAR10(root="./dataset_pytorch", train=False, download=True, transform=trans)
train_loader = DataLoader(dataset=train_dataset, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
def weights_init(m):
    if isinstance(m, nn.Conv2d) or isinstance(m, nn.Linear):
        nn.init.xavier_uniform_(m.weight.data)
        nn.init.zeros_(m.bias.data)

model = ConvNet()
model.apply(weights_init)
model.to(device)
summary(model, (3, 32, 32))
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate, eps=1e-6)

# Train the model
total_step = len(train_loader)
loss_list = []
acc_list = []
acc_list_test = []
for epoch in range(num_epochs):
    total = 0
    correct = 0
    for i, (images, labels) in enumerate(train_loader):
        images = images.to(device)
        labels = labels.to(device)
        optimizer.zero_grad()

        # Run the forward pass
        outputs = model(images)
        loss = criterion(outputs, labels)
        loss_list.append(loss.item())

        # Backprop and perform Adam optimisation
        loss.backward()
        optimizer.step()

        # Track the accuracy
        total += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct += (predicted == labels).sum().item()
        acc_list.append(correct / total)

    print("Train")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct / total) * 100))

    total_test = 0
    correct_test = 0
    for i, (images, labels) in enumerate(test_loader):
        images = images.to(device)
        labels = labels.to(device)

        # Run the forward pass
        outputs = model(images)

        # Track the accuracy
        total_test += labels.size(0)
        _, predicted = torch.max(outputs.data, 1)
        correct_test += (predicted == labels).sum().item()
        acc_list_test.append(correct_test / total_test)

    print("Test")
    print('Epoch [{}/{}], Accuracy: {:.2f}%'
          .format(epoch + 1, num_epochs, (correct_test / total_test) * 100))
It's weird because it's throwing the error Input type (torch.cuda.FloatTensor) and weight type (torch.FloatTensor) should be the same even though I've moved both the model and the data to CUDA.
I guess it's related to how I defined or used ResnetBlock, because if I remove those blocks from ConvNet (removing the line out = self.blocks(out)), the code works. But I don't know what I'm doing wrong.
The problem is in how ResnetBlock stores its sublayers. One thing to note first: unlike Tensor.to(), nn.Module.to() works in place (it returns the module itself), so
model.to(device)
does move every registered parameter to the GPU; reassigning with model = model.to(device) is harmless but not the fix. The actual problem is that a vanilla Python list cannot be tracked by PyTorch: the layers appended to self.layers and self.relus are never registered as submodules, so model.to(device) never moves them (and model.parameters() never sees them). You need to use nn.ModuleList.
From
self.layers = []
self.relus = []
To
self.layers = nn.ModuleList()
self.relus = nn.ModuleList()
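With the ModuleList change, the sublayers are registered, so after model.to(device) a quick sanity check (a hypothetical snippet, not from the original answer) should report only CUDA devices:

print({p.device for p in model.parameters()})  # expect {device(type='cuda', index=0)}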