I was training a network on the usual MNIST dataset and ran into the following problem:
once I start appending valid_metrics to loss_list and accuracy_list, the amount of GPU memory in use grows every one or two epochs.
This is the code of the training loop:
def train_model(model: torch.nn.Module,
train_dataset: torch.utils.data.Dataset,
valid_dataset: torch.utils.data.Dataset,
loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
optimizer_class: Type[torch.optim.Optimizer] = torch.optim,
optimizer_params: Dict = {},
initial_lr = 0.01,
lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
lr_scheduler_params: Dict = {},
batch_size = 64,
max_epochs = 1000,
early_stopping_patience = 20):
optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr, **optimizer_params)
lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
best_valid_loss = None
best_epoch = None
loss_list = list()
accuracy_list = list()
for epoch in range(max_epochs):
print(f'Epoch {epoch}')
start = timer()
train_single_epoch(model, optimizer, loss_function, train_loader)
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
loss_list.append(valid_metrics['loss'])
accuracy_list.append(valid_metrics['accuracy'])
print('time:', timer() - start)
print(f'Validation metrics: \n{valid_metrics}')
lr_scheduler.step(valid_metrics['loss'])
if best_valid_loss is None or best_valid_loss > valid_metrics['loss']:
print(f'Best model yet, saving')
best_valid_loss = valid_metrics['loss']
best_epoch = epoch
torch.save(model, './best_model.pth')
if epoch - best_epoch > early_stopping_patience:
print('Early stopping triggered')
return loss_list, accuracy_list
and this is the code of validate_single_epoch:
def validate_single_epoch(model: torch.nn.Module,
loss_function: torch.nn.Module,
data_loader: torch.utils.data.DataLoader):
loss_total = 0
accuracy_total = 0
for data in data_loader:
X, y = data
X, y = X.view(-1, 784), y.to(device)
X = X.to(device)
output = model(X)
loss = loss_function(output, y)
loss_total += loss
y_pred = output.argmax(dim = 1, keepdim=True).to(device)
accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
loss_avg = loss_total / len(data_loader.dataset)
accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)
return {'loss' : loss_avg, 'accuracy' : accuracy_avg}
I use a GeForce MX250 as my GPU.
The problem is most likely that gradients (and the computation graph) are being kept around during validation: loss_total += loss accumulates tensors that still carry their autograd history. The easiest way to solve that is to wrap the validation call in a no_grad context:
with torch.no_grad():
valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
If you prefer, you can also decorate validate_single_epoch(...) with @torch.no_grad():
@torch.no_grad()
def validate_single_epoch(...):
# ...
Not related to your problem, but note that you're running the model in training mode during validation, which may not be what you want; a call to model.eval() seems to be missing from the validation function (with model.train() restored afterwards for the next training epoch).
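Putting both points together, a minimal sketch of what validate_single_epoch could look like with these fixes (same device, model and loaders as in your code; accumulating loss.item() instead of the loss tensor also avoids keeping anything on the GPU):

@torch.no_grad()  # no graph is built anywhere inside this function
def validate_single_epoch(model: torch.nn.Module,
                          loss_function: torch.nn.Module,
                          data_loader: torch.utils.data.DataLoader):
    model.eval()  # disable dropout, use running batch-norm statistics
    loss_total = 0.0
    accuracy_total = 0
    for X, y in data_loader:
        X = X.view(-1, 784).to(device)
        y = y.to(device)
        output = model(X)
        loss_total += loss_function(output, y).item()  # plain float, no autograd history
        y_pred = output.argmax(dim=1, keepdim=True)
        accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
    model.train()  # hand the model back in training mode
    return {'loss': loss_total / len(data_loader.dataset),
            'accuracy': 100.0 * accuracy_total / len(data_loader.dataset)}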
I've initialized two identical ANNs in PyTorch (same structure and same initial parameters), and I've noticed that hyperparameter tuning with Ray Tune returns different results for the two ANNs, even though there should be no randomness left in the initialization.
Could someone explain what I'm doing wrong? I'll attach the code:
ANN Initialization:
class Featrues_model(nn.Module):
def __init__(self, n_inputs, dim_hidden, n_outputs):
super().__init__()
self.fc1 = nn.Linear(n_inputs, dim_hidden)
self.fc2 = nn.Linear(dim_hidden, n_outputs)
def forward(self, X):
X = self.fc1(X)
X = self.fc2(X)
return X
features_model_v1 = Featrues_model(len(list_input_variables),5,6)
features_model_v2 = Featrues_model(len(list_input_variables),5,6)
features_model_v2.load_state_dict(features_model_v1.state_dict())
Hyperparameter settings
config = {
"lr": tune.choice([1e-2, 1e-5]),
"weight_decay": tune.choice([1e-2, 1e-5]),
"batch_size": tune.choice([16,64]),
"epochs": tune.choice([10,50])
}
Train & Validation Dataframe
trainset = df_final.copy()
test_abs = int(len(trainset) * 0.8)
train_subset, val_subset = random_split(
trainset, [test_abs, len(trainset) - test_abs]
)
df_train = df_final.iloc[train_subset.indices]
df_val = df_final.iloc[val_subset.indices]
Train function design
def setting_model(config, df_train, df_val, model):
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=config["lr"], weight_decay=config["weight_decay"])
BATCH_SIZE = config["batch_size"]
for epoch in range(config["epochs"]):
train_epoch_loss = 0
train_epoch_acc = 0
step = 0
for i in tqdm(range(0, df_train.shape[0], BATCH_SIZE)):
batch_X = np.array(
df_train[list_input_variables].iloc[i:i+BATCH_SIZE]
)
batch_X = torch.Tensor([x for x in batch_X])
batch_Y = np.array(
df_train[list_output_variables].iloc[i:i+BATCH_SIZE]
)
batch_Y = torch.Tensor([int(y) for y in batch_Y])
batch_Y = batch_Y.type(torch.int64)
optimizer.zero_grad()
outputs = model.forward(batch_X)
train_loss = criterion(outputs, batch_Y)
train_acc = multi_acc(outputs, batch_Y)
train_loss.backward()
optimizer.step()
train_epoch_loss += train_loss.item()
train_epoch_acc += train_acc.item()
step += 1
# print statistics
print(f"Epochs: {epoch}")
print(f"Train Loss: {train_epoch_loss/len(df_train)}")
print(f"Train Acc: {train_epoch_acc/step}")
print("\n")
# Validation loss
with torch.no_grad():
X_val = np.array(
df_val[list_input_variables]
)
X_val = torch.Tensor([x for x in X_val])
Y_val = np.array(
df_val[list_output_variables]
)
Y_val = torch.Tensor([int(y) for y in Y_val])
Y_val = Y_val.type(torch.int64)
outputs = model.forward(X_val)
_, predicted = torch.max(outputs.data, 1)
total = Y_val.size(0)
correct = (predicted == Y_val).sum().item()
loss = criterion(outputs, Y_val)
tune.report(loss=(loss.numpy()), accuracy=correct / total)
print(f"Validation Loss: {loss.numpy()/len(df_val)}")
print(f"Validation Acc: {correct / total:.3f}")
print("Finished Training")
Hyperparameter tuning
result_v1 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v1),
config=config,
fail_fast="raise",
)
result_v2 = tune.run(
partial(setting_model, df_train=df_train, df_val=df_val, model=features_model_v2),
config=config,
fail_fast="raise"
)
Output
result_v1.get_best_config()
{'lr': 1e-05, 'weight_decay': 1e-05, 'epochs': 1}
result_v2.get_best_config()
{'lr': 0.01, 'weight_decay': 1e-05, 'epochs': 1}
The issue is the use of torch's random number generator under the hood. Since you are not directly providing weight matrices for your layers, PyTorch initializes them for you. Luckily, you can make the experiment reproducible by setting
torch.manual_seed(x) # where x is an integer
You should only use a handful of random seeds, though; otherwise you risk overfitting to the seed. See the lottery ticket hypothesis (https://arxiv.org/abs/1803.03635).
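A minimal sketch of the kind of seeding that usually covers a run like this; the NumPy/random/cuDNN lines go beyond the single torch.manual_seed call above and are an assumption about where other randomness may enter (dropout, data sampling, etc.):

import random
import numpy as np
import torch

def set_seed(seed: int = 0):
    # seed every RNG the training pipeline may touch
    random.seed(seed)
    np.random.seed(seed)
    torch.manual_seed(seed)           # seeds CPU (and CUDA) generators
    torch.cuda.manual_seed_all(seed)  # explicit, for multi-GPU setups
    # optional: make cuDNN deterministic (slower, but reproducible)
    torch.backends.cudnn.deterministic = True
    torch.backends.cudnn.benchmark = False

set_seed(0)  # call before building the models

Note that each Ray Tune trial runs in its own worker process, so the seed also has to be set inside the trainable (setting_model), not only in the driver script.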
My dataset has 14 features and a target containing {0,1}.
I have trained this binary classifier:
class SimpleBinaryClassifier(nn.Module):
def __init__(self,input_shape):
super().__init__()
self.fc1 = nn.Linear(input_shape,64)
self.fc2 = nn.Linear(64,32)
self.dropout = nn.Dropout(p=0.1)
self.fc3 = nn.Linear(32,1)
def forward(self,x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = self.dropout(x)
x = self.fc3(x)
return x
with the following criterion and training loop:
criterion = nn.BCEWithLogitsLoss()
def binary_acc(y_pred, y_test):
y_pred_tag = torch.round(torch.sigmoid(y_pred))
correct_results_sum = (y_pred_tag == y_test).sum().float()
acc = correct_results_sum/y_test.shape[0]
acc = torch.round(acc * 100)
return acc
model.train()
for e in range(1, EPOCHS+1):
epoch_loss = 0
epoch_acc = 0
for X_batch, y_batch in train_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
y_pred = model(X_batch)
loss = criterion(y_pred, y_batch.unsqueeze(1))
acc = binary_acc(y_pred, y_batch.unsqueeze(1))
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()
print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
This model, when called as sigmoid(model(input_tensor)), outputs a single number in [0, 1]. The pipeline I'm working with expects the model to output two probabilities [p_class1, p_class2].
How can I adapt the model and the training loop?
If I set the output of the last layer to 2, I have problems with the criterion inside the training loop.
class SimpleBinaryClassifier2(nn.Module):
def __init__(self,input_shape):
super().__init__()
self.fc1 = nn.Linear(input_shape,64)
self.fc2 = nn.Linear(64,32)
self.dropout = nn.Dropout(p=0.1)
self.fc3 = nn.Linear(32,2) # now it's 2
def forward(self,x):
x = torch.relu(self.fc1(x))
x = torch.relu(self.fc2(x))
x = self.dropout(x)
x = self.fc3(x)
return x
I use CrossEntropyLoss:
model = SimpleBinaryClassifier2(input_shape=14)
model.to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=LEARNING_RATE)
criterion = nn.CrossEntropyLoss()
and replace y_pred_tag = torch.round(torch.sigmoid(y_pred)) with argmax(softmax)
def binary_acc2(y_pred, y_test):
y_pred_tag = torch.argmax(torch.softmax(y_pred), dim=1)
correct_results_sum = (y_pred_tag == y_test).sum().float()
acc = correct_results_sum/y_test.shape[0]
acc = torch.round(acc * 100)
return acc
Then the training loop raises an error:
model.train()
for e in range(1, EPOCHS+1):
epoch_loss = 0
epoch_acc = 0
for X_batch, y_batch in train_loader:
X_batch, y_batch = X_batch.to(device), y_batch.to(device)
optimizer.zero_grad()
y_pred = model(X_batch)
loss = criterion(y_pred, y_batch)
acc = binary_acc(y_pred, y_batch)
loss.backward()
optimizer.step()
epoch_loss += loss.item()
epoch_acc += acc.item()
print(f'Epoch {e+0:03}: | Loss: {epoch_loss/len(train_loader):.5f} | Acc: {epoch_acc/len(train_loader):.3f}')
The error is the following:
RuntimeError: "nll_loss_forward_reduce_cuda_kernel_2d_index" not implemented for 'Float'
I already looked at this other post, where the cause of that error was that the element was a float and not a tensor, but in my case the datasets are tensors:
train_dataset = GenericDataset(torch.FloatTensor(X_train), torch.FloatTensor(y_train))
test_dataset = GenericDataset(torch.FloatTensor(X_test), torch.FloatTensor(y_test))
According to the nn.CrossEntropyLoss documentation, the target is expected to contain class indices of type long, not float, while in your train_dataset you explicitly convert it to float.
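A minimal sketch of the two places where the cast could go, assuming the same GenericDataset and training loop as in the question (either change alone should be enough):

# Option 1: build the dataset with integer class indices
train_dataset = GenericDataset(torch.FloatTensor(X_train), torch.LongTensor(y_train))
test_dataset = GenericDataset(torch.FloatTensor(X_test), torch.LongTensor(y_test))

# Option 2: cast inside the training loop, right before computing the loss
loss = criterion(y_pred, y_batch.long())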
I tried to read images using CV2 and store them in a numpy array as shown in the code below. I do not wish to use ImageDataGenerator from Keras to read the images.
for image in images:
a_img = cv2.resize(cv2.imread( os.path.join(Augment_img_dir, image),0), (128,128))/255
lsri_img = cv2.resize(cv2.imread( os.path.join(LSRI_img_dir, image),0), (128,128), cv2.INTER_NEAREST)/255
hsri_img = cv2.resize(cv2.imread( os.path.join(HSRI_img_dir, image),0), (128,128))/255
img_train = [a_img, lsri_img]
img_train = np.asarray(img_train)
img_train = np.moveaxis(img_train,0, -1)
training_images.append(img_train)
target_images.append(hsri_img)
training_images = np.asarray(training_images)
target_images = np.asarray(target_images)
train_imgs, test_imgs, train_targets, test_targets = train_test_split(training_images, target_images,
test_size=.20, random_state=42)
batch_size = 8
train_img_batch = []
target_img_batch = []
len_imgs = len(train_imgs)
start = 0
temp_train = []
temp_target = []
for i in range(len_imgs+1):
if i%batch_size == 0 and i>0:
train_img_batch.append(np.asarray(temp_train))
target_img_batch.append(np.asarray(temp_target))
temp_train = []
temp_target = []
if i != len_imgs:
temp_train.append(train_imgs[i])
temp_target.append(train_targets[i])
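(For reference, a tf.data sketch that would produce the same kind of fixed-size batches from the arrays built above; drop_remainder mirrors the manual loop's behaviour of skipping the last incomplete batch. This assumes the train_imgs / train_targets arrays as defined earlier:)

import tensorflow as tf

train_ds = (tf.data.Dataset
            .from_tensor_slices((train_imgs, train_targets))
            .batch(batch_size, drop_remainder=True))

for lr_patchs, hr_patchs in train_ds:
    pass  # lr_patchs: (8, 128, 128, 2), hr_patchs: (8, 128, 128)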
I coded a GAN model training session as described in the code below and expect the model to train on GPU. However, the training is taking place on CPU.
batch_size = 8
lr = 0.0001
G_optimizer = Adam(learning_rate=lr)
n_epoch =50
iterations = 13500
def train():
G = Generator((128,128,2)).generator()
D = Discriminator((128,128,1)).discriminator()
g_optimizer_init = tf.optimizers.Adam(learning_rate=lr)
g_optimizer = tf.optimizers.Adam(learning_rate=lr)
d_optimizer = tf.optimizers.Adam(learning_rate=lr)
mse_loss = keras.losses.MeanSquaredError()
n_step_epoch = round(n_epoch // batch_size)
G_Loss_file = open("g_loss.txt",'w')
for epoch in range(n_epoch):
step_time = time.time()
for step, lr_patchs in enumerate(tf.data.Dataset.from_tensor_slices(train_img_batch)):
if lr_patchs.shape[0] != batch_size: # if the remaining data in this epoch < batch_size
break
hr_patchs = target_img_batch[step]
with tf.GradientTape() as tape:
#tape.watch(G.trainable_weights)
fake_hr_patchs = G(lr_patchs)
fake_hr_patchs = tf.reshape(fake_hr_patchs, (8,128,128))
mse_loss = tl.cost.mean_squared_error(fake_hr_patchs, hr_patchs, is_mean= True)
grad = tape.gradient(mse_loss, G.trainable_variables)
g_optimizer_init.apply_gradients(zip(grad, G.trainable_variables))
print("Epoch: [{}/{}], time: {:.2f}s, mse: {:.2f} ".format(
epoch, n_epoch, time.time() - step_time, mse_loss))
G_Loss_file.write("Epoch: [{}/{}], time: {:.2f}s, mse: {:.2f} \n".format(
epoch, n_epoch, time.time() - step_time, mse_loss))
G_Loss_file.close()
## adversarial learning (G, D)
Loss_file = open("loss.txt",'w')
n_step_epoch = round(n_epoch // batch_size)
for epoch in range(n_epoch):
step_time = time.time()
for step, lr_patchs in enumerate(train_img_batch):
if lr_patchs.shape[0] != batch_size: # if the remaining data in this epoch < batch_size
break
hr_patchs = target_img_batch[step]
with tf.GradientTape(persistent=True) as tape:
fake_patchs = G(lr_patchs)
fake_patchs = tf.reshape(fake_patchs, (8,128,128))
logits_fake = D(fake_patchs)
logits_real = D(hr_patchs)
d_Loss_int = Intensity_Loss(logits_fake, logits_real)
d_loss1 = tl.cost.sigmoid_cross_entropy(logits_fake, tf.zeros_like(logits_fake))
d_loss = tf.add(-tf.math.log(d_loss1), 0.1*d_Loss_int)
g_gan_loss = 1e-3 * tl.cost.sigmoid_cross_entropy(logits_fake, tf.ones_like(logits_fake))
mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)
g_loss = tf.add(mse_loss, g_gan_loss)
grad = tape.gradient(g_loss, G.trainable_weights)
#print(grad, len(G.trainable_weights))
g_optimizer.apply_gradients(zip(grad, G.trainable_weights))
grad = tape.gradient(d_loss, D.trainable_weights)
d_optimizer.apply_gradients(zip(grad, D.trainable_weights))
print("Epoch: [{}/{}], time: {:.3f}s, g_loss(mse:{:.3f}, adv:{:.3f}), d_loss: {:.3f}".format(
epoch, n_epoch, time.time() - step_time, mse_loss, g_gan_loss, d_loss))
Loss_file.write("Epoch: [{}/{}], time: {:.3f}s, g_loss(mse:{:.3f}, adv:{:.3f}), d_loss: {:.3f}".format(
epoch, n_epoch, time.time() - step_time, mse_loss, g_gan_loss, d_loss))
if epoch!=0 and ((epoch%5 == 0) or (epoch == n_epoch-1)):
G.save_weights("Gan_Weights/g_training_20_noise_no_contrast.h5")
D.save_weights("Gan_Weights/d_training_20_noise_no_contrast.h5")
Loss_file.close()
The model is not running on the GPU, and I want it to utilize my GPU.
This is how I call the function to train the model:
with tf.device("/device:XLA_GPU:0"):
train()
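Before anything else, it is worth verifying that TensorFlow can see the GPU at all and where the ops actually get placed. A minimal diagnostic sketch (using the standard tf.config / tf.debugging APIs, and the plain "/GPU:0" device string, which is the conventional placement target; "/device:XLA_GPU:0" is generally not meant for ordinary op placement):

import tensorflow as tf

# does TensorFlow see a GPU at all?
print(tf.config.list_physical_devices('GPU'))

# log every op's device assignment to stdout
tf.debugging.set_log_device_placement(True)

with tf.device("/GPU:0"):
    train()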
This is part of the code I'm using:
checkpoint_dir = 'training_checkpoints1'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
encoder=encoder,
decoder=decoder)
Now this is the training part
EPOCHS = 900
for epoch in range(EPOCHS):
start = time.time()
hidden = encoder.initialize_hidden_state()
total_loss = 0
for (batch, (inp, targ)) in enumerate(dataset):
loss = 0
with tf.GradientTape() as tape:
enc_output, enc_hidden = encoder(inp, hidden)
dec_hidden = enc_hidden
dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * batch_size, 1)
# Teacher forcing - feeding the target as the next input
for t in range(1, targ.shape[1]):
# passing enc_output to the decoder
predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
loss += loss_function(targ[:, t], predictions)
# using teacher forcing
dec_input = tf.expand_dims(targ[:, t], 1)
batch_loss = (loss / int(targ.shape[1]))
total_loss += batch_loss
variables = encoder.variables + decoder.variables
gradients = tape.gradient(loss, variables)
optimizer.apply_gradients(zip(gradients, variables))
if batch % 100 == 0:
print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
batch,
batch_loss.numpy()))
# saving (checkpoint) the model every 2 epochs
if (epoch + 1) % 2 == 0:
checkpoint.save(file_prefix = checkpoint_prefix)
print('Epoch {} Loss {:.4f}'.format(epoch + 1,
total_loss / num_batches))
print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
Now I want to restore, for example, this checkpoint and resume training from there, but I don't know how.
path="/content/drive/My Drive/training_checkpoints1/ckpt-9"
checkpoint.restore(path)
Result
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6653263048>
You should create a CheckpointManager at the start as:
checkpoint_path = os.path.abspath('.') + "/checkpoints" # Put your path here
ckpt = tf.train.Checkpoint(encoder=encoder,
decoder=decoder,
optimizer = optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
Now, after running for a few epochs, to restore the latest checkpoint you should get it from the CheckpointManager:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
# restoring the latest checkpoint in checkpoint_path
ckpt.restore(ckpt_manager.latest_checkpoint)
This will restore the model and optimizer state from the latest checkpoint.
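For the manager to have checkpoints to restore, the training loop should also save through it; a small sketch replacing the checkpoint.save(file_prefix=...) call from the question:

# inside the epoch loop, e.g. every 2 epochs
if (epoch + 1) % 2 == 0:
    save_path = ckpt_manager.save()
    print('Saved checkpoint for epoch {} at {}'.format(epoch + 1, save_path))

ckpt_manager.save() returns the path it wrote and automatically keeps only the last max_to_keep checkpoints.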
I have the following neural network code, and I get an "Expected device cuda:0 but got device cpu" error in PyTorch that I can't figure out. I assign the device to cuda, and the print line confirms cuda. I've also tried assigning the device as device = "cuda:0", just in case, but that had no effect. Here's the code:
def run():
torch.multiprocessing.freeze_support()
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
print(device)
metabolites = pd.read_excel("testmetabolitedata.xlsx")
subject_metadata = pd.read_excel("testsubj.xlsx")
metabolitesdf = pd.DataFrame(data=metabolites)
metabolitesdf = metabolitesdf.iloc[:, 1:9153]
subjectsdf = pd.DataFrame(data=subject_metadata)
n_samples, n_metabolites = metabolitesdf.shape
print(n_samples)
#genotypes of the target gene
print(subjectsdf['SLCO1B1_rs4149056'])
genotypes = subjectsdf['SLCO1B1_rs4149056']
print(genotypes)
# print('{} unique genotypes'.format(len(set(genotypes))))
labels = [1 if g == 1 else 0 for g in genotypes]
print('{} samples with genotype 1 out of {} samples ({:.1%})'.format(sum(labels), len(labels),
sum(labels) / len(labels)))
#Insert 0 into index 0 (first) into the list for the first row with column names
labels.insert(0, 0)
#log transform
log_metabol = np.log10(metabolitesdf + 1)
#Split data into training and validation 70% / 30%
data = torch.utils.data.TensorDataset(torch.Tensor(np.array(log_metabol)),
torch.Tensor(labels))
train, val = torch.utils.data.random_split(data, [int(0.7 * len(data)),
len(data) - int(0.7 * len(data))])
print('{:.0f}/{} training/total ({:.1%}) in training set, {:.0f}/{} val/total ({:.1%}) in validation set'.format(\
train[:][1].sum(), len(train), train[:][1].sum() / len(train),
val[:][1].sum(), len(val), val[:][1].sum() / len(val)))
class MultiLayerPredictor(torch.nn.Module):
def __init__(self, input_shape, output_shape=1, hidden_dim=1024, **kwargs):
super().__init__()
self.fc1 = torch.nn.Linear(in_features=input_shape, out_features=hidden_dim)
self.bn1 = torch.nn.BatchNorm1d(hidden_dim)
self.fc2 = torch.nn.Linear(in_features=hidden_dim, out_features=hidden_dim)
self.bn2 = torch.nn.BatchNorm1d(hidden_dim)
self.fc3 = torch.nn.Linear(in_features=hidden_dim, out_features=output_shape)
def forward(self, x):
l1 = torch.relu(self.bn1(self.fc1(x)))
l2 = torch.relu(self.bn2(self.fc2(l1)))
return torch.sigmoid(self.fc3(l2)).reshape(-1)
#load the training and validation sets
print("Load training and validation data ")
train_loader = torch.utils.data.DataLoader(train, batch_size=128,
shuffle=True, num_workers=10, pin_memory=True)
val_loader = torch.utils.data.DataLoader(val, batch_size=128,
shuffle=False, num_workers=10, pin_memory=True)
print("Loading complete, create model")
model3 = MultiLayerPredictor(input_shape=n_metabolites).to(device)
print("Model created! Moving to optimizer")
optimizer3 = torch.optim.SGD(model3.parameters(), lr=1e-2)
print("Optimizer done")
objective3 = torch.nn.BCELoss()
epochs = 30
print_stats_interval = 10
log3 = []
print("Moving to training loop")
for epoch in range(epochs):
loss = n_correct = 0
model3.train()
for batch, target in train_loader:
batch = batch.view(-1, n_metabolites).to(device)
optimizer3.zero_grad()
outputs = model3(batch) # stack trace shows the issue being either on this line
train_loss = objective3(outputs, target) # or this line
loss += train_loss.item()
n_correct += (target == (outputs.reshape(-1) > 0.5).float()).sum()
train_loss.backward()
optimizer3.step()
loss = loss / len(train_loader)
acc = (n_correct.float() / len(train)).numpy()
epoch += 1
model3.eval();
val_loss = val_n_correct = 0
with torch.no_grad():
for batch, target in val_loader:
batch = batch.view(-1, n_metabolites).to(device)
outputs = model3(batch)
val_loss += objective3(outputs, target)
val_n_correct += (target == (outputs.reshape(-1) > 0.5).float()).sum()
val_loss = (val_loss / len(val_loader)).numpy()
val_acc = (val_n_correct.float() / len(val)).numpy()
if (epoch % print_stats_interval) == 0 or epoch == epochs:
print(f'epoch={epoch:.0f}, loss={loss:.5f}, val_loss={np.round(val_loss,5):.5f}, acc={np.round(acc,5):.5f}, val_acc={np.round(val_acc,5):.5f}')
log3.append((epoch, loss, val_loss, acc, val_acc))
log3 = pd.DataFrame(log3, columns=['epoch', 'loss', 'val_loss', 'acc', 'val_acc'])
plt.figure(figsize=(6, 3))
plt.plot(log3['epoch'], log3['loss'], label='Training');
plt.plot(log3['epoch'], log3['val_loss'], label='Validation');
plt.xlabel('Epoch'); plt.ylabel('Loss')
plt.legend();
val_log_mutations = val_hcc[:][0].numpy().reshape(-1)
val_true_labels = val_hcc[:][1].numpy() + 0
res = model3(val_hcc[:][0])
predictions = (res.detach().numpy().reshape(-1) > 0.5) + 0
correct = (val_true_labels == predictions) + 0
n_correct = correct.sum()
print('{}/{} ({:.1%}) in the validation set'.format(n_correct, len(correct), n_correct / len(correct)))
print('Majority classifier accuracy: {:.1%}'.format((len(correct) - val_true_labels.sum()) / len(correct)))
if __name__ == '__main__':
run()
What's going on here? The stack trace is below:
Traceback (most recent call last):
File "//ad..fi/home/h/h/Desktop/neuralnet/neuralnet_train.py", line 142, in <module>
run()
File "//ad..fi/home/h/h/Desktop/neuralnet/neuralnet_train.py", line 99, in run
train_loss = objective3(outputs, target)
File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\modules\module.py", line 550, in __call__
result = self.forward(*input, **kwargs)
File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\modules\loss.py", line 516, in forward
return F.binary_cross_entropy(input, target, weight=self.weight, reduction=self.reduction)
File "C:\Users\h\AppData\Roaming\Python\Python38\site-packages\torch\nn\functional.py", line 2378, in binary_cross_entropy
return torch._C._nn.binary_cross_entropy(
RuntimeError: expected device cuda:0 but got device cpu
PS Microsoft.PowerShell.Core\FileSystem::\\ad..fi\home\h\h\Desktop\neuralnet>
Also move the targets to CUDA, in both the training and the validation loops: the model and inputs are on the GPU, but target stays on the CPU, which is what causes the device mismatch inside the loss.
for batch, target in train_loader:
batch,target = batch.view(-1, n_metabolites).to(device),target.to(device)
    ...
for batch, target in val_loader:
    batch, target = batch.view(-1, n_metabolites).to(device), target.to(device)
    ...