The dataset is CIFAR10. I've created a VGG-like network:
class FirstModel(nn.Module):
def __init__(self):
super(FirstModel, self).__init__()
self.vgg1 = nn.Sequential(
nn.Conv2d(3, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.Conv2d(16, 16, 3, padding=1),
nn.BatchNorm2d(16),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg2 = nn.Sequential(
nn.Conv2d(16, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.Conv2d(32, 32, 3, padding=1),
nn.BatchNorm2d(32),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.vgg3 = nn.Sequential(
nn.Conv2d(32, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.Conv2d(64, 64, 3, padding=1),
nn.BatchNorm2d(64),
nn.ReLU(),
nn.MaxPool2d(2,2),
nn.Dropout(0.2)
)
self.fc1 = nn.Linear(4 * 4 * 64, 4096)
self.relu = nn.ReLU()
self.fc2 = nn.Linear(4096, 4096)
self.fc3 = nn.Linear(4096, 10)
self.softmax = nn.Softmax()
self.dropout = nn.Dropout(0.5)
def forward(self, x):
x = self.vgg3(self.vgg2(self.vgg1(x)))
x = nn.Flatten()(x)
x = self.relu(self.fc1(x))
x = self.dropout(x)
x = self.relu(self.fc2(x))
x = self.dropout(x)
x = self.softmax(self.fc3(x))
return x
Then I train it and visualize loss and accuracy:
import matplotlib.pyplot as plt
from IPython.display import clear_output
def plot_history(train_history, val_history, title='loss'):
plt.figure()
plt.title('{}'.format(title))
plt.plot(train_history, label='train', zorder=1)
points = np.array(val_history)
steps = list(range(0, len(train_history) + 1, int(len(train_history) / len(val_history))))[1:]
plt.scatter(steps, val_history, marker='*', s=180, c='red', label='val', zorder=2)
plt.xlabel('train steps')
plt.legend(loc='best')
plt.grid()
plt.show()
def train_model(model, optimizer, train_dataloader, test_dataloader):
criterion = nn.CrossEntropyLoss()
train_loss_log = []
train_acc_log = []
val_loss_log = []
val_acc_log = []
for epoch in range(NUM_EPOCH):
model.train()
train_loss = 0.
train_size = 0
train_acc = 0.
for inputs, labels in train_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
optimizer.zero_grad()
y_pred = model(inputs)
loss = criterion(y_pred, labels)
loss.backward()
optimizer.step()
train_loss += loss.item()
train_size += y_pred.size(0)
train_loss_log.append(loss.data / y_pred.size(0))
_, pred_classes = torch.max(y_pred, 1)
train_acc += (pred_classes == labels).sum().item()
train_acc_log.append(np.mean((pred_classes == labels).cpu().numpy()))
# блок validation
val_loss = 0.
val_size = 0
val_acc = 0.
model.eval()
with torch.no_grad():
for inputs, labels in test_dataloader:
inputs, labels = inputs.to(device), labels.to(device)
y_pred = model(inputs)
loss = criterion(y_pred, labels)
val_loss += loss.item()
val_size += y_pred.size(0)
_, pred_classes = torch.max(y_pred, 1)
val_acc += (pred_classes == labels).sum().item()
val_loss_log.append(val_loss/val_size)
val_acc_log.append(val_acc/val_size)
clear_output()
plot_history(train_loss_log, val_loss_log, 'loss')
plot_history(train_acc_log, val_acc_log, 'accuracy')
print('Train loss:', train_loss / train_size)
print('Train acc:', train_acc / train_size)
print('Val loss:', val_loss / val_size)
print('Val acc:', val_acc / val_size)
Then I train the model:
first_model = FirstModel()
first_model.to(device)
optimizer = optim.RMSprop(first_model.parameters(), lr=0.001, momentum=0.9)
train_model(first_model_rms, optimizer, train_dataloader, test_dataloader)
The loss and accuracy do not change (accuracy at level of 0.1). However, if the optimizer is SGD with momentum everything works fine (loss and accuracy change). I've already tried to change momentum and lr, but it does not help.
What should be fixed? Would be grateful for any possible advice!
So first of all, you don't have to use softmax in the "model" as it is done by the nn.CrossEntropyLoss, and I also think that the RMSprop doesn't work with momentum.
try to decrease the learning rate more .....if then also there is no affect on the accuracy and loss then change the optimizer to adams or something else and play with different learning rates.
In my case, I was facing the same error. On my laptop without GPU the training was fine. When I tried on GPU the model didn’t change the accuracy and loss after the first epochs. I was using nn.CrossEntropyLoss() with Adam.
Changing Adam with SGD worked for me.
Related
I downloaded a train dataset in json format and i read it in Google Colab.
I have created a convolutional neural network and i want to train it with the train_set dataset. I also imported DataLoader because i want batches instead of images. So, i am running the following script in Google Colab:
import json
! wget -O train_set.json https://github.com/rslab-ntua/MSc_GBDA/blob/master/2020/Exercise_ML2/train_split.json
with open('train_set.json') as f:
train_set = f.read()
from torch.utils.data import DataLoader
trainloader = DataLoader(train_set, batch_size=32, shuffle=True)
from torch import nn
from torch.nn import functional as F
from torchsummary import summary
# Convolutional Neural Network definition
class CNN(nn.Module):
def __init__(self):
super().__init__()
self.conv_features = nn.Sequential(
nn.Conv2d(in_channels=1, out_channels=4, kernel_size=3, stride=1, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
nn.Conv2d(in_channels=4, out_channels=16, kernel_size=3, stride=1, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0),
nn.Conv2d(in_channels=16, out_channels=24, kernel_size=3, stride=1, padding=1),
nn.ReLU(inplace=True),
nn.MaxPool2d(kernel_size=2, stride=2, padding=0)
)
self.fc1 = nn.Linear(24*3*3,256)
self.fc2 = nn.Linear(256,10)
def forward(self, x):
x = self.conv_features(x)
x = x.view(x.size()[0], -1) # το κανω flattened
x = F.relu(self.fc1(x))
return self.fc2(x)
model = CNN()
summary(model, (1,28,28))
from torch.optim import SGD
num_epochs = 5
criterion = nn.CrossEntropyLoss()
optimizer = SGD(model.parameters(), lr=0.001, momentum = 0.9)
for epoch in range(num_epochs):
total_loss = 0
for images, labels in trainloader:
optimizer.zero_grad()
logits = model(images.to('cuda'))
loss = criterion(logits, labels.to('cuda'))
loss.backward()
optimizer.step()
total_loss += float(loss.cpu().detach())
print(f'Epoch {epoch+1}: total_mean_loss: {total_loss/len(trainloader)}')
and i get the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-12-082efbc4edf5> in <module>()
9 for epoch in range(num_epochs):
10 total_loss = 0
---> 11 for images, labels in trainloader:
12 optimizer.zero_grad()
13
ValueError: too many values to unpack (expected 2)
I can not find the problem.
Is it the object train_set that causes the problem?
I am learning pytorch and I have created binary classification algorithm. After having trained the model I have very low loss and quite good accuracy. However, on validation the accuracy is exactly 50%. I am wondering if I loaded samples incorrectly or the algorithm does not perform well.
Here you can find the plot of Training loss and accuracy.
Here is my training method:
epochs = 15
itr = 1
p_itr = 100
model.train()
total_loss = 0
loss_list = []
acc_list = []
for epoch in range(epochs):
for samples, labels in train_loader:
samples, labels = samples.to(device), labels.to(device)
optimizer.zero_grad()
output = model(samples)
labels = labels.unsqueeze(-1)
labels = labels.float()
loss = criterion(output, labels)
loss.backward()
optimizer.step()
total_loss += loss.item()
scheduler.step()
#if itr%p_itr == 0:
pred = torch.round(output)
correct = pred.eq(labels)
acc = torch.mean(correct.float())
print('[Epoch {}/{}] Iteration {} -> Train Loss: {:.4f}, Accuracy: {:.3f}'.format(epoch+1, epochs, itr, total_loss/p_itr, acc))
loss_list.append(total_loss/p_itr)
acc_list.append(acc)
total_loss = 0
itr += 1
Here, I am loading data from the path:
train_list_cats = glob.glob(os.path.join(train_cats_dir,'*.jpg'))
train_list_dogs = glob.glob(os.path.join(train_dogs_dir,'*.jpg'))
train_list = train_list_cats + train_list_dogs
val_list_cats = glob.glob(os.path.join(validation_cats_dir,'*.jpg'))
val_list_dogs = glob.glob(os.path.join(validation_dogs_dir,'*.jpg'))
val_list = val_list_cats + val_list_dogs
I am not attaching the model architecture, however I can add it if required.
I think that my training method is correct, although, I am not sure about training/validation data processing.
Edit:
The network params are as follow:
optimizer = torch.optim.RMSprop(model.parameters(), lr=0.001)
criterion = nn.BCELoss()
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[500,1000,1500], gamma=0.5)
Activation function is sigmoid.
The network architecture:
self.layer1 = nn.Sequential(
nn.Conv2d(3,16,kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.layer2 = nn.Sequential(
nn.Conv2d(16,32, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.layer3 = nn.Sequential(
nn.Conv2d(32,64, kernel_size=3),
nn.ReLU(),
nn.MaxPool2d(2),
nn.Dropout(p=0.2)
)
self.fc1 = nn.Linear(17*17*64,512)
self.fc2 = nn.Linear(512,1)
self.relu = nn.ReLU()
self.sigmoid = nn.Sigmoid()
def forward(self,x):
out = self.layer1(x)
out = self.layer2(out)
out = self.layer3(out)
out = out.view(out.size(0),-1)
out = self.relu(self.fc1(out))
out = self.fc2(out)
return torch.sigmoid(out)
Going by your "Training loss and accuracy" plot your model is overfitting. Your train loss is near zero after 25 epochs and you continue training for 200+ epochs. This is wrong way to train a model. You should rather be doing early stopping based on the validation set. ie. Run one epoch of train and one epoch of eval and repeat. Stop when your train epoch is improving and the corresponding eval epoch is not improving.
I am building a conv net that classifies dog and cat. Architecture is pretty simple. 2 Conv(with batch norm, leakyReLU, Maxpooling) to 1 fc. Input image size is resized to 64. The size is good. The problem is loss is 0.0 from the start. I have no clue what the cause is. I couldn't find any answer. I have wrote every detail that might be important. If you need anything else please tell me, I will edit.
main.py
import torch
import torch.nn as nn
from torchvision import transforms, datasets
import PIL
import matplotlib.pyplot as plt
from Dataset import Dataset
from Network import Network
# Added to avoid torch._C._cuda_init() \n RuntimeError: CUDA error: unknown error
torch.cuda.current_device()
# Hyper Parameters
batch_size = 1
img_size = 64
learning_rate = 0.001
num_epoch = 1
# Directories
trainDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/train"
testDir = "D:/Programming/python/Deep learning/datasets/dogs-vs-cats/test1"
print("Initializing...")
# Device
device = torch.device('cuda:0' if torch.cuda.is_available() else 'cpu')
# Augmentation
transforms = transforms.Compose([
transforms.Resize((img_size, img_size)),
transforms.ColorJitter(hue=.05, saturation=.05),
transforms.RandomHorizontalFlip(),
transforms.RandomRotation(20, resample=PIL.Image.BILINEAR) ,
transforms.ToTensor()
])
trainset = datasets.ImageFolder(root=trainDir, transform=transforms)
testset = datasets.ImageFolder(root=testDir, transform=transforms)
train_loader = torch.utils.data.DataLoader(
trainset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(
testset, batch_size=batch_size, shuffle=False) # test set will not be shuffled
model = Network(img_size,2).to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_step = len(train_loader)
print("Tranining started")
for epoch in range(num_epoch):
for i, (images, labels) in enumerate(train_loader):
images = images.to(device)
labels = labels.to(device)
# forward propagate
outputs = model(images)
loss = criterion(outputs, labels)
# backpropagte and optimize
optimizer.zero_grad()
loss.backward()
optimizer.step()
if (i+1) % 100 == 0:
print(
"Epoch [{}/{}], Step[{}/{}], Loss: {}".format(
epoch+1, num_epoch, i+1, total_step, loss.item()
)
)
print("Tranining complete, validation started")
with torch.no_grad():
correct = 0
total = 0
for images, labels in test_loader:
images = images.to(device)
labels = labels.to(device)
outputs = model(images)
_, predicted = torch.max(outputs.data, 1)
total += labels.size(0)
correct += (predicted == labels).sum().item()
print('Test Accuracy: {} %'.format(100 * correct / total))
#
torch.save(model.state_dict(), "model.ckpy")
Network.py
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
"""
Input size for conv
l = number of input feature maps
k = number of output feature maps
n, m = width and height of kernel
total parameter = (n*m*l+1)*k
"""
class Network(nn.Module):
def __init__(self, input_size, num_class):
super(Network, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(3, 16, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(16),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output size = (128, 128, 16)
self.conv2 = nn.Sequential(
nn.Conv2d(16, 32, kernel_size=5, stride=1, padding=2),
nn.BatchNorm2d(32),
nn.LeakyReLU(),
nn.MaxPool2d(kernel_size=2, stride=2)
) # output size = (64, 64, 32)
self.fc1 = nn.Linear(
int((input_size/4)**2*32), num_class
)
def forward(self, x):
out = self.conv1(x)
out = self.conv2(out)
out = out.view(out.size(0), -1)
out = self.fc1(out)
return out
Output
Epoch [1/1], Step[5800/25000], Loss: 0.0
Epoch [1/1], Step[5900/25000], Loss: 0.0
Epoch [1/1], Step[6000/25000], Loss: 0.0
Epoch [1/1], Step[6100/25000], Loss: 0.0
Epoch [1/1], Step[6200/25000], Loss: 0.0
Epoch [1/1], Step[6300/25000], Loss: 0.0
Epoch [1/1], Step[6400/25000], Loss: 0.0
Epoch [1/1], Step[6500/25000], Loss: 0.0
Result after each layer
outputs of conv1,2
[[ 3.0135e-01, 3.5849e-01, 4.7758e-01, ..., 3.9759e-01,
3.7988e-01, 9.7870e-01],
[ 4.3010e-01, 6.0753e-03, 4.5642e-01, ..., -8.5486e-04,
4.4537e-02, 2.9074e-01],
[ 3.8567e-01, 7.8431e-02, 2.3859e-01, ..., -3.0013e-03,
-5.5821e-03, 1.2284e-01],
...,
[ 3.9181e-01, 3.9093e-01, 1.2053e-01, ..., -4.7156e-03,
5.6266e-01, 7.7017e-01],
outputs of fc1
[[-0.0772, 0.2166]]
loss = criterion(output, target.view(-1)) # Flatten target
try this.
could you remove these two line?
images = images.to(device)
labels = labels.to(device)
self.conv1 and 2 must be sent to cuda : self.conv1(2).cuda()
Task: Using the example of the "fetch_lfw_people" dataset to write and train an autocoder.
Write an iteration code by epoch. Write code to visualize the learning process and count the metrics for validation after each epoch.
Train auto encoder. Achieve low loss on validation.
My code:
from sklearn.datasets import fetch_lfw_people
import numpy as np
import torch
from torch.utils.data import TensorDataset, DataLoader
from sklearn.model_selection import train_test_split
Data preparation:
lfw_people = fetch_lfw_people(min_faces_per_person=70, resize=0.4)
X = lfw_people['images']
X_train, X_test = train_test_split(X, test_size=0.1)
X_train = torch.tensor(X_train, dtype=torch.float32, requires_grad=True)
X_test = torch.tensor(X_test, dtype=torch.float32, requires_grad=False)
dataset_train = TensorDataset(X_train, torch.zeros(len(X_train)))
dataset_test = TensorDataset(X_test, torch.zeros(len(X_test)))
batch_size = 32
train_loader = DataLoader(dataset_train, batch_size=batch_size, shuffle=True)
test_loader = DataLoader(dataset_test, batch_size=batch_size, shuffle=False)
Сreate a network with encoding and decoding functions:
class Autoencoder(torch.nn.Module):
def __init__(self):
super(Autoencoder, self).__init__()
self.encoder = torch.nn.Sequential(
torch.nn.Conv2d(in_channels=1, out_channels=32, kernel_size=3, stride=2),
torch.nn.ReLU(),
torch.nn.Conv2d(in_channels=32, out_channels=64, stride=2, kernel_size=3),
torch.nn.ReLU(),
torch.nn.Conv2d(in_channels=64, out_channels=64, stride=2, kernel_size=3),
torch.nn.ReLU(),
torch.nn.Conv2d(in_channels=64, out_channels=64, stride=2, kernel_size=3)
)
self.decoder = torch.nn.Sequential(
torch.nn.ConvTranspose2d(in_channels=64, out_channels=64, kernel_size=3, stride=2),
torch.nn.ConvTranspose2d(in_channels=64, out_channels=64, kernel_size=(3,4), stride=2),
torch.nn.ConvTranspose2d(in_channels=64, out_channels=32, kernel_size=4, stride=2),
torch.nn.ConvTranspose2d(in_channels=32, out_channels=1, kernel_size=(4,3), stride=2)
)
def encode(self, X):
encoded_X = self.encoder(X)
batch_size = X.shape[0]
return encoded_X.reshape(batch_size, -1)
def decode(self, X):
pre_decoder = X.reshape(-1, 64, 2, 1)
return self.decoder(pre_decoder)
I check the work of the model before learning by one example:
model = Autoencoder()
sample = X_test[:1]
sample = sample[:, None]
result = model.decode(model.encode(sample)) # before train
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
ax1.imshow(sample[0][0].detach().numpy(), cmap=plt.cm.Greys_r)
ax2.imshow(result[0][0].detach().numpy(), cmap=plt.cm.Greys_r)
plt.show()
The result is unsatisfactory. I start training:
model = Autoencoder()
loss = torch.nn.MSELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.001)
history_train = []
history_test = []
for i in range(5):
for x, y in train_loader:
x = x[:, None]
model.train()
decoded_x = model.decode(model.encode(x))
mse_loss = loss(torch.tensor(decoded_x, dtype=torch.float), x)
optimizer.zero_grad()
mse_loss.backward()
optimizer.step()
history_train.append(mse_loss.detach().numpy())
model.eval()
with torch.no_grad():
for x, y in train_loader:
x = x[:, None]
result_x = model.decode(model.encode(x))
loss_test = loss(torch.tensor(result_x, dtype=torch.float), x)
history_test.append(loss_test.detach().numpy())
plt.subplot(1, 2, 1)
plt.plot(history_train)
plt.title("Optimization process for train data")
plt.subplot(1, 2, 2)
plt.plot(history_test)
plt.title("Loss for test data")
plt.show
A huge loss on the training data and on the test.
Аfter training nothing has changed:
with torch.no_grad():
model.eval()
res1 = model.decode(model.encode(sample))
fig, (ax1, ax2) = plt.subplots(nrows=1, ncols=2)
ax1.imshow(sample[0][0].detach().numpy(), cmap=plt.cm.Greys_r)
ax2.imshow(res1[0][0].detach().numpy(), cmap=plt.cm.Greys_r)
plt.show()
Why such a big loss? Reducing the input to the interval [-1, 1] does not help. I did it like this: (value / 255) * 2 - 1
Why do not change the parameters of the model after training?
Why does not change the decoded sample?
Result: before train, after train, loss
https://i.stack.imgur.com/OhdrJ.jpg
1) replace line
mse_loss = loss(torch.tensor(decoded_x, dtype=torch.float), x)
with line
mse_loss = loss(decoded_x, x)
2) replace lines
model.eval()
with torch.no_grad():
for x, y in train_loader:
with lines
replace lines
model.eval()
with torch.no_grad():
for x, y in test_loader:
I am working on a video animation project using PyTorch. My dataset contains 3904x60 mfcc audio features(input) and corresponding 3904x3 video features(output). The goal is to train a neural network model such that given an unknown audio feature, the model maps it into its corresponding video feature. In other words, the neural network performs a 60 to 3 feature mapping. I have already built the neural network following this tutorial:
class ConvNet(nn.Module):
def __init__(self):
super().__init__()
self.layer1 = nn.Sequential(
nn.Conv1d(1, 32, kernel_size=5, stride=1, padding=2),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential(
nn.Conv1d(32, 64, kernel_size=5, stride=1, padding=2),
nn.ReLU(),
nn.MaxPool1d(kernel_size=2, stride=2))
self.drop_out = nn.Dropout()
self.fc1 = nn.Linear(15 * 64, 1000)
self.fc2 = nn.Linear(1000, 3)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.drop_out(out)
out = self.fc1(out)
out = self.fc2(out)
return out
and my training code looks like:
model = ConvNet()
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
for epoch in range(num_epochs):
for i, (a, v) in enumerate(train_loader):
# Run the forward pass
a = a.float()
v = v.long()
outputs = model(a.view(a.size(0),1,a.size(1)))
loss = criterion(outputs, v)
loss_list.append(loss.item())
# Backprop and perform Adam optimisation
optimizer.zero_grad()
loss.backward()
optimizer.step()
# Track the accuracy
total = labels.size(0)
_, predicted = torch.max(outputs.data, 1)
correct = (predicted == labels).sum().item()
acc_list.append(correct / total)
if (i + 1) % 100 == 0:
print('Epoch[{}/{}],Step[{}/{}],Loss{:.4f},Accuracy{:.2f}%'
.format(epoch + 1, num_epochs, i + 1, total_step, loss.item(),
(correct / total) * 100))
but received an error in training:
---> 15 loss = criterion(outputs, v)
multi-target not supported at /Users/soumith/miniconda2/conda-bld/pytorch_1532623076075/work/aten/src/THNN/generic/ClassNLLCriterion.c:21
I defined the batch size to be 4 so each a and v in the iteration should be a 4 by 60 tensor and a 4 by 3 tensor, respectively. How do I solve this problem?
The issue could be because of the definition of the target function that you use for nn.CrossEntropyLoss(). v is a 4 x 3 tensor you say, which doesn't appear correct.
In loss = criterion(outputs, v) , the loss function expects v to be a tensor of size minibatch with each value depicting on of the C classes (i.e. 0 to C-1). See the 'Shape' tab in https://pytorch.org/docs/stable/nn.html?highlight=crossentropyloss#torch.nn.CrossEntropyLoss
Target: (N) where each value is 0≤targets[i]≤C−1