Why does my PyTorch loss grow higher and higher? - python

I use nn.BCEWithLogitsLoss(). When I train my model, the loss grows higher and higher. Why does that happen, and how can I solve this problem?
Fundamental code:
loss_fn = nn.BCEWithLogitsLoss()

def train_loop(dataloader, model, loss_fn, optimizer):
    for batch, (X, y) in enumerate(dataloader):
        # X, y = X.to(device), y.to(device)
        m = nn.Sigmoid()
        predict = model(X.float())
        loss = loss_fn(m(predict), y.unsqueeze(1).float())
        optimizer.zero_grad()
        loss.backward()   # calculate gradients
        optimizer.step()  # update weights
Full code:
import pandas as pd
import numpy as np
import torch
from torch.utils.data import Dataset
from torch.utils.data import DataLoader
import torch.nn as nn

class myDataset(Dataset):
    def __init__(self, data, label):  # , annotations_file, img_dir, transform=None, target_transform=None):
        df = pd.read_csv(data, encoding='gbk')
        df = df.fillna(value=0)
        self.data = np.array(df)
        df = pd.read_csv(label, encoding='gbk')
        df = df.fillna(value=0)
        self.label = np.array(df).reshape(-1)
        # self.transform = transform
        # self.target_transform = target_transform

    def __len__(self):
        return len(self.data)

    def __getitem__(self, idx):
        return self.data[idx], self.label[idx]

class Network(nn.Module):
    def __init__(self):
        super(Network, self).__init__()
        self.flatten = nn.Flatten()
        self.network = nn.Sequential(
            # nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5),
            nn.Linear(27, 100),
            nn.ReLU(),
            nn.Linear(100, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 1),
            nn.ReLU()
        )

    def forward(self, x):
        x = self.flatten(x)
        return self.network(x)

def train_loop(dataloader, model, loss_fn, optimizer):
    for batch, (X, y) in enumerate(dataloader):
        # X, y = X.to(device), y.to(device)
        m = nn.Sigmoid()
        predict = model(X.float())
        loss = loss_fn(m(predict), y.unsqueeze(1).float())
        optimizer.zero_grad()
        loss.backward()   # calculate gradients
        optimizer.step()  # update weights
        if batch % 100 == 0:
            loss, current = loss.item(), batch * len(X)
            print(f"loss: {loss:>7f} [{current:>5d}/{len(dataloader.dataset):>5d}]")

model = Network()
batch_size = 64
learning_rate = 1e-3
epochs = 5
loss_fn = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
trainDataloader = DataLoader(myDataset("mydata/traindata.csv", "mydata/trainlabel.csv"), batch_size=batch_size, shuffle=True)
train_loop(trainDataloader, model, loss_fn, optimizer)

I think you don't need the line m = nn.Sigmoid().
The documentation says: "This loss combines a Sigmoid layer and the BCELoss in one single class." You can check the loss here.
def train_loop(dataloader, model, loss_fn, optimizer):
    for batch, (X, y) in enumerate(dataloader):
        # X, y = X.to(device), y.to(device)
        predict = model(X.float())
        loss = loss_fn(predict, y.unsqueeze(1).float())

Related

LightningDataModule with Trainer in PytorchLightning automatically fits validation model?

I am trying to fight overfitting, which is why I looked through the documentation (https://pytorch-lightning.readthedocs.io/en/stable/common/evaluation_basic.html#train-with-the-validation-loop), where I found that you can pass both a training and a validation dataloader to Trainer.fit. The question is: should I use this method, or can I simply pass the dataloader class to Trainer.fit to prevent overfitting?
DataLoader code:
class ClassifierDataModule(pl.LightningDataModule):
    def __init__(self, train_dataset: pd.DataFrame, val_dataset: pd.DataFrame, batch_size: int):
        super().__init__()
        self.prepare_data_per_node = False
        self.train_dataset = train_dataset
        self.val_dataset = val_dataset
        self.batch_size = batch_size

    def train_dataloader(self):
        return DataLoader(self.train_dataset, batch_size=self.batch_size, shuffle=True, num_workers=os.cpu_count())

    def val_dataloader(self):
        return DataLoader(self.val_dataset, batch_size=self.batch_size, shuffle=True, num_workers=os.cpu_count())

data_module_classifier = ClassifierDataModule(train_dataset, val_dataset, test_dataset, BATCH_SIZE)
And here is my Trainer.fit():
model = MulticlassClassificationLIGHT(class_weights)
#trainer.fit(model, data_module_classifier) # SHOULD I USE THIS METHOD TO PREVENT OVERFITTING
trainer.fit(model, data_module_classifier.train_dataloader(),data_module_classifier.val_dataloader() ) # OR THIS ONE ?
My LightningModule just in case:
class MulticlassClassificationLIGHT(pl.LightningModule):
    def __init__(self, class_weights):
        super(MulticlassClassificationLIGHT, self).__init__()
        self.num_feature = 35
        self.num_class = 36
        self.layer_1 = nn.Linear(self.num_feature, 512)
        self.layer_2 = nn.Linear(512, 128)
        self.layer_3 = nn.Linear(128, 64)
        self.layer_out = nn.Linear(64, self.num_class)
        self.relu = nn.ReLU()
        self.dropout = nn.Dropout(p=0.2)
        self.batchnorm1 = nn.BatchNorm1d(512)
        self.batchnorm2 = nn.BatchNorm1d(128)
        self.batchnorm3 = nn.BatchNorm1d(64)
        self.loss = nn.CrossEntropyLoss(weight=class_weights.to(device))

    def forward(self, x):
        x = self.layer_1(x)
        x = self.batchnorm1(x)
        x = self.relu(x)
        x = self.layer_2(x)
        x = self.batchnorm2(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer_3(x)
        x = self.batchnorm3(x)
        x = self.relu(x)
        x = self.dropout(x)
        x = self.layer_out(x)
        return x

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        self.log("train_loss", loss, prog_bar=True, logger=True)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self.forward(x)
        loss = self.loss(logits, y)
        self.log("val_loss", loss, prog_bar=True, logger=True)  # I ask Trainer to "ModelCheckpoint" this loss
        return loss
Passing a validation data loader during training does not fix overfitting; it lets you measure how much the model is overfitting or underfitting. For a well-fit model, performance on the validation data should be close to performance on the training data.
Regarding the syntax, this should work:
trainer.fit(model=model, train_dataloaders=data_module_classifier.train_dataloader(), val_dataloaders=data_module_classifier.val_dataloader())
Documentation for fit is here: https://pytorch-lightning.readthedocs.io/en/latest/common/trainer.html#trainer-class-api
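As a rough sketch (using the ClassifierDataModule defined in the question), either form attaches the validation loop; neither one by itself prevents overfitting:
# Option 1: pass the LightningDataModule; Lightning calls
# train_dataloader() / val_dataloader() on it for you.
trainer.fit(model, datamodule=data_module_classifier)

# Option 2: pass the dataloaders explicitly.
trainer.fit(
    model,
    train_dataloaders=data_module_classifier.train_dataloader(),
    val_dataloaders=data_module_classifier.val_dataloader(),
)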

Write a train routine in a class with PyTorch

I want to write a train function inside a class for training a model. The following code reports an error; can anyone give me a hint for solving this issue?
import numpy as np
import os
import sys
sys.executable
sys.version

## define a neuralnet class
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision import datasets
from torchvision.transforms import ToTensor

os.chdir("/Users/zhangzhongheng/Downloads/")
os.getcwd()

# Download training data from open datasets.
training_data = datasets.FashionMNIST(
    root="data",
    train=True,
    download=True,
    transform=ToTensor(),
)

# Download test data from open datasets.
test_data = datasets.FashionMNIST(
    root="data",
    train=False,
    download=True,
    transform=ToTensor(),
)

batch_size = 64

# Create data loaders.
train_dataloader = DataLoader(training_data, batch_size=batch_size)
test_dataloader = DataLoader(test_data, batch_size=batch_size)

for X, y in test_dataloader:
    print(f"Shape of X [N, C, H, W]: {X.shape}")
    print(f"Shape of y: {y.shape} {y.dtype}")
    break

# Get cpu or gpu device for training.
device = "cuda" if torch.cuda.is_available() else "cpu"
print(f"Using {device} device")

# Define model
class NeuralNetwork(nn.Module):
    def __init__(self):
        super().__init__()
        self.flatten = nn.Flatten()
        self.linear_relu_stack = nn.Sequential(
            nn.Linear(28*28, 512),
            nn.ReLU(),
            nn.Linear(512, 512),
            nn.ReLU(),
            nn.Linear(512, 10)
        )

    def forward(self, x):
        x = self.flatten(x)
        logits = self.linear_relu_stack(x)
        return logits

    def model_train(self, dataloader):
        self.train()
        size = len(dataloader.dataset)
        optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
        for batch, (X, y) in enumerate(dataloader):
            X, y = X.to(device), y.to(device)
            # Compute prediction error
            pred = self.forward(X)
            loss = nn.CrossEntropyLoss(pred, y)
            # Backpropagation
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            if batch % 100 == 0:
                loss, current = loss.item(), batch * len(X)
                print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")

Model = NeuralNetwork()
epochs = 5
for t in range(epochs):
    print(f"Epoch {t+1}\n-------------------------------")
    Model.model_train(train_dataloader)
print("Done!")
The above code reported the following error:
Epoch 1
-------------------------------
RuntimeError: Boolean value of Tensor with more than one value is ambiguous
Done!
Changing nn.CrossEntropyLoss to nn.CrossEntropyLoss() should solve this problem.
Refer to the official documentation here.
It will look something like this: loss = nn.CrossEntropyLoss()(pred, y)
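A minimal sketch of the corrected method, assuming the NeuralNetwork class from the question; the criterion is constructed once and then called on the tensors:
def model_train(self, dataloader):
    self.train()
    size = len(dataloader.dataset)
    criterion = nn.CrossEntropyLoss()   # build the loss module once
    optimizer = torch.optim.SGD(self.parameters(), lr=1e-3)
    for batch, (X, y) in enumerate(dataloader):
        X, y = X.to(device), y.to(device)
        pred = self.forward(X)
        loss = criterion(pred, y)       # call the instance, not the class
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
        if batch % 100 == 0:
            print(f"loss: {loss.item():>7f} [{batch * len(X):>5d}/{size:>5d}]")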

PyTorch-Lightning error on trainer.predict()

I'm using an example for training a model on the MNIST dataset from pytorch-lightning's documentation (see here), to which I tried to add a prediction step. However, when performing trainer.predict(model) I get an error:
AttributeError: 'list' object has no attribute 'flatten'
I followed the instructions and examples I found online (adding the predict_step and predict_dataloader functions, and adding a stage under the setup function), and it looks pretty simple; however, it doesn't work.
Here's the code I'm running:
import os

import torch
from pytorch_lightning import LightningModule, Trainer
from pytorch_lightning.callbacks.progress import TQDMProgressBar
from pytorch_lightning.loggers import CSVLogger
from torch import nn
from torch.nn import functional as F
from torch.utils.data import DataLoader, random_split
from torchmetrics import Accuracy
from torchvision import transforms
from torchvision.datasets import MNIST

PATH_DATASETS = os.environ.get("PATH_DATASETS", ".")
BATCH_SIZE = 256 if torch.cuda.is_available() else 64

class LitMNIST(LightningModule):
    def __init__(self, data_dir=PATH_DATASETS, hidden_size=64, learning_rate=2e-4):
        super().__init__()

        # Set our init args as class attributes
        self.data_dir = data_dir
        self.hidden_size = hidden_size
        self.learning_rate = learning_rate

        # Hardcode some dataset specific attributes
        self.num_classes = 10
        self.dims = (1, 28, 28)
        channels, width, height = self.dims
        self.transform = transforms.Compose(
            [
                transforms.ToTensor(),
                transforms.Normalize((0.1307,), (0.3081,)),
            ]
        )

        # Define PyTorch model
        self.model = nn.Sequential(
            nn.Flatten(),
            nn.Linear(channels * width * height, hidden_size),
            nn.ReLU(),
            nn.Dropout(p=0.9),
            nn.Linear(hidden_size, hidden_size),
            nn.ReLU(),
            nn.Dropout(0.1),
            nn.Linear(hidden_size, self.num_classes),
        )

        self.val_accuracy = Accuracy(task='multiclass', num_classes=10)   # I fixed this since the code from the tutorial didn't work
        self.test_accuracy = Accuracy(task='multiclass', num_classes=10)  # I fixed this since the code from the tutorial didn't work

    def forward(self, x):
        x = self.model(x)
        return F.log_softmax(x, dim=1)

    def training_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        return loss

    def validation_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.val_accuracy.update(preds, y)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("val_loss", loss, prog_bar=True)
        self.log("val_acc", self.val_accuracy, prog_bar=True)

    def test_step(self, batch, batch_idx):
        x, y = batch
        logits = self(x)
        loss = F.nll_loss(logits, y)
        preds = torch.argmax(logits, dim=1)
        self.test_accuracy.update(preds, y)

        # Calling self.log will surface up scalars for you in TensorBoard
        self.log("test_loss", loss, prog_bar=True)
        self.log("test_acc", self.test_accuracy, prog_bar=True)

    def predict_step(self, batch, batch_idx, dataloader_idx=0):
        return self(batch)

    def configure_optimizers(self):
        optimizer = torch.optim.Adam(self.parameters(), lr=self.learning_rate)
        return optimizer

    ####################
    # DATA RELATED HOOKS
    ####################

    def prepare_data(self):
        # download
        MNIST(self.data_dir, train=True, download=True)
        MNIST(self.data_dir, train=False, download=True)

    def setup(self, stage=None):
        # Assign train/val datasets for use in dataloaders
        if stage == "fit" or stage is None:
            mnist_full = MNIST(self.data_dir, train=True, transform=self.transform)
            self.mnist_train, self.mnist_val = random_split(mnist_full, [55000, 5000])

        # Assign test dataset for use in dataloader(s)
        if stage == "test" or stage is None:
            self.mnist_test = MNIST(self.data_dir, train=False, transform=self.transform)

        if stage == "predict":
            self.mnist_predict = MNIST(self.data_dir, train=False, transform=self.transform)

    def train_dataloader(self):
        return DataLoader(self.mnist_train, batch_size=BATCH_SIZE)

    def val_dataloader(self):
        return DataLoader(self.mnist_val, batch_size=BATCH_SIZE)

    def test_dataloader(self):
        return DataLoader(self.mnist_test, batch_size=BATCH_SIZE)

    def predict_dataloader(self):
        return DataLoader(self.mnist_predict, batch_size=BATCH_SIZE)

model = LitMNIST(hidden_size=2)

trainer = Trainer(
    accelerator="auto",
    devices=1 if torch.cuda.is_available() else None,  # limiting got iPython runs
    max_epochs=3,
    callbacks=[TQDMProgressBar(refresh_rate=20)],
    logger=CSVLogger(save_dir="logs/"),
)

trainer.fit(model)
predict = trainer.predict(model)
What is the problem?
You forgot to separate the input and the label in the batch:
def predict_step(self, batch, batch_idx, dataloader_idx=0):
    x, y = batch
    return self(x)
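For reference, a small usage sketch (assuming the LitMNIST model above, which returns log-softmax scores): trainer.predict returns a list with one tensor per batch, which can be concatenated and turned into class labels.
batches = trainer.predict(model)       # list of per-batch log-probability tensors
log_probs = torch.cat(batches)         # shape: (num_samples, 10)
pred_labels = log_probs.argmax(dim=1)  # predicted digit for every test image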

Trying to compute the loss of an encoder/decoder model

I am attempting to create an encoder/decoder model with mini-batches. I keep encountering an error stating:
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation: [torch.FloatTensor [32, 6]], which is output 0 of AsStridedBackward0, is at version 2; expected version 1 instead. Hint: the backtrace further above shows the operation that failed to compute its gradient. The variable in question was changed in there or anywhere later. Good luck!
The traceback reveals that something is wrong with y = self.linear(out), but I am unsure what exactly. Any help would be greatly appreciated. Below is the model. Thank you.
import torch
import torch.nn as nn
import numpy as np
from torch.autograd import Variable
from sliding_window import sliding_window
from training_datasets import get_training_datasets_batch

torch.autograd.set_detect_anomaly(True)

class Encoder(nn.Module):
    def __init__(self, input_size, hidden_size, num_layers=1):
        super(Encoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)

    def forward(self, x):
        flat = x.view(x.shape[0], x.shape[1], self.input_size)
        out, h = self.gru(flat)
        return out, h

class Decoder(nn.Module):
    def __init__(self, input_size, hidden_size, output_size=6, num_layers=1):
        super(Decoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.gru = nn.GRU(input_size=input_size, hidden_size=hidden_size, num_layers=num_layers, batch_first=True)
        self.linear = nn.Linear(hidden_size, output_size)
        self.ReLU = nn.ReLU()

    def forward(self, x, h):
        x = x.unsqueeze(1)
        out, h = self.gru(x, h)
        out = out.squeeze(1)
        print(out.shape)
        y = self.linear(out)
        print(y.shape)
        y = self.ReLU(y)
        return y, h

class EncoderDecoder(nn.Module):
    def __init__(self, hidden_size, input_size=6, output_size=6):
        super(EncoderDecoder, self).__init__()
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.encoder = Encoder(input_size=input_size, hidden_size=hidden_size)
        self.decoder = Decoder(input_size=input_size, hidden_size=hidden_size, output_size=output_size)

    def train_model(self, ts, epochs, target_len, features, batch_size=64, test_len=288, method='teacher_forcing', tfr=0.5, lr=0.01, dynamic_tf=False):
        X, Y = sliding_window(ts, features=288, target_len=target_len)
        x_train, x_val, x_test, y_train, y_val, y_test = get_training_datasets_batch(X, Y, features, test_len=test_len, batch_size=batch_size)
        losses = np.full(epochs, np.nan)
        optimizer = torch.optim.SGD(filter(lambda x: x.requires_grad, self.parameters()), lr=lr)
        criterion = nn.MSELoss()

        for e in range(epochs):
            print('Starting epoch {}'.format(e))
            x_train_data = iter(x_train)
            y_train_data = iter(y_train)
            x_val_data = iter(x_val)
            y_val_data = iter(y_val)
            x_train_shape = list(x_train)[0].shape
            # predicted = torch.zeros(target_len, batch_size, x_train_shape[2])
            # print(predicted.shape)
            loss = 0
            for x_train_in in x_train_data:
                optimizer.zero_grad()
                x_train_in = Variable(x_train_in)
                y_train_in = Variable(next(y_train_data).transpose(0, 1))
                _, enc_h = self.encoder(x_train_in)
                dec_in = x_train_in[:, -1, :]
                dec_h = enc_h
                if method == 'recursive':
                    for t in range(target_len):
                        dec_out, dec_h = self.decoder(dec_in, dec_h)
                        predicted = dec_out
                        dec_in = dec_out
                        loss += criterion(predicted, y_train_in[t])
                loss.backward(retain_graph=True)
                optimizer.step()
The problem in this case was loss.backward(retain_graph=True). The code started working after adding a loss = 0 reset at the end of each batch; otherwise the loss value keeps accumulating across batches and needs to be reset.
loss.backward()
optimizer.step()
loss = 0
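Putting it together, a sketch of the corrected inner loop (same variable names as in the question): the loss is reset per batch, accumulated over the decoding steps, and backpropagated once without retain_graph=True.
for x_train_in in x_train_data:
    optimizer.zero_grad()
    y_train_in = next(y_train_data).transpose(0, 1)
    _, enc_h = self.encoder(x_train_in)
    dec_in = x_train_in[:, -1, :]
    dec_h = enc_h
    loss = 0                                   # fresh accumulator for this batch
    if method == 'recursive':
        for t in range(target_len):
            dec_out, dec_h = self.decoder(dec_in, dec_h)
            dec_in = dec_out
            loss = loss + criterion(dec_out, y_train_in[t])
    loss.backward()                            # no retain_graph needed
    optimizer.step()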

Why does my torch convolutional NN return the same outputs?

I am working through a tutorial on convolutional neural nets in PyTorch, using the MNIST dataset as an example. But I am getting the same outputs for all labels; the net always returns the same tensor as a result. I decided to reduce the complexity of the net and decrease the number of epochs, so here is my code:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils

# Ignore warnings
import warnings
warnings.filterwarnings("ignore")

plt.ion()  # interactive mode

class DigitsDataset(Dataset):
    def __init__(self, csv_file, transform=None):
        """
        Args:
            csv_file (string): Path to the csv file.
            transform (callable, optional): Optional transform to be applied
                on a sample.
        """
        self.digits_frame = pd.read_csv(csv_file)
        self.transform = transform

    def __len__(self):
        return len(self.digits_frame)

    def __getitem__(self, idx):
        if torch.is_tensor(idx):
            idx = idx.tolist()
        label = digits_df.iloc[idx, 0]
        digit_pixels = digits_df.iloc[idx, 1:]
        digit_pixels = np.asarray(digit_pixels)
        digit_pixels = digit_pixels.astype('float').reshape(28, 28)
        sample = {'label': label, 'image': digit_pixels}
        if self.transform:
            sample['image'] = self.transform(sample['image'])
        return sample

class GrayScaleTransform:
    '''Scale intensity from [0,255] to [0,1]'''
    def __init__(self, new_min, new_max):
        self.new_min = new_min
        self.new_max = new_max

    def __call__(self, x):
        return (x) * (self.new_max - self.new_min) / (255) + self.new_min

min_max_transform = GrayScaleTransform(new_min=0, new_max=1)

train_dataset = DigitsDataset(csv_file='data/train.csv', transform=min_max_transform)
test_dataset = DigitsDataset(csv_file='data/test.csv', transform=min_max_transform)

train_loader = DataLoader(train_dataset)
test_loader = DataLoader(test_dataset)

learning_rate = 0.1
num_epochs = 2

from torch import nn

class ConvNet(nn.Module):
    def __init__(self):
        super(ConvNet, self).__init__()
        self.layer1 = nn.Sequential(nn.Conv2d(1, 4, kernel_size=5, stride=1, padding=2),
                                    nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2))
        self.layer2 = nn.Sequential(nn.Conv2d(4, 8, kernel_size=5, stride=1, padding=2),
                                    nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2))
        self.drop_out = nn.Dropout()
        self.fc1 = nn.Linear(7 * 7 * 8, 10)

    def forward(self, x):
        out = self.layer1(x)
        out = self.layer2(out)
        out = out.reshape(out.size(0), -1)
        out = self.drop_out(out)
        out = self.fc1(out)
        return out

device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ConvNet()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)

total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
    for i, sample in enumerate(train_loader):
        # Forward pass
        img = sample['image'].view(-1, 1, 28, 28).float().to(device)
        label = sample['label'].to(device)
        output = model(img)
        loss = criterion(output, label)
        loss_list.append(loss.item())

        # Backpropagation and optimizer step
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
But I am still getting the same tensor as output. What am I doing wrong? Why am I always getting the same output?
