Unable to run training session on GPU: Keras and Tensorflow - python

I tried to read images using CV2 and store them in a numpy array as shown in the code below. I do not wish to use ImageDataGenerator from Keras to read the images.
for image in images:
    a_img = cv2.resize(cv2.imread(os.path.join(Augment_img_dir, image), 0), (128, 128)) / 255
    # pass the interpolation flag by keyword; as a third positional argument it is taken as `dst`
    lsri_img = cv2.resize(cv2.imread(os.path.join(LSRI_img_dir, image), 0), (128, 128),
                          interpolation=cv2.INTER_NEAREST) / 255
    hsri_img = cv2.resize(cv2.imread(os.path.join(HSRI_img_dir, image), 0), (128, 128)) / 255
    img_train = [a_img, lsri_img]
    img_train = np.asarray(img_train)
    img_train = np.moveaxis(img_train, 0, -1)   # (2, 128, 128) -> (128, 128, 2)
    training_images.append(img_train)
    target_images.append(hsri_img)

training_images = np.asarray(training_images)
target_images = np.asarray(target_images)

train_imgs, test_imgs, train_targets, test_targets = train_test_split(
    training_images, target_images, test_size=0.20, random_state=42)

# manual batching into lists of (batch_size, 128, 128, 2) arrays
batch_size = 8
train_img_batch = []
target_img_batch = []
len_imgs = len(train_imgs)
start = 0
temp_train = []
temp_target = []
for i in range(len_imgs + 1):
    if i % batch_size == 0 and i > 0:
        train_img_batch.append(np.asarray(temp_train))
        target_img_batch.append(np.asarray(temp_target))
        temp_train = []
        temp_target = []
    if i != len_imgs:
        temp_train.append(train_imgs[i])
        temp_target.append(train_targets[i])
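For reference, the same batches can also be produced with tf.data instead of the hand-rolled lists above. This is only a sketch, not the poster's code; it assumes the train_imgs/train_targets arrays built above:

import tensorflow as tf

# Build (input, target) batches directly from the arrays.
train_ds = (tf.data.Dataset
            .from_tensor_slices((train_imgs, train_targets))
            .shuffle(buffer_size=len(train_imgs))
            .batch(batch_size, drop_remainder=True))  # drop_remainder mirrors the `break` on short batches

for lr_patchs, hr_patchs in train_ds:
    pass  # feed each batch to the training step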
I coded a GAN training session as shown below and expect the model to train on the GPU. However, the training takes place on the CPU.
batch_size = 8
lr = 0.0001
G_optimizer = Adam(learning_rate=lr)
n_epoch = 50
iterations = 13500

def train():
    G = Generator((128, 128, 2)).generator()
    D = Discriminator((128, 128, 1)).discriminator()
    g_optimizer_init = tf.optimizers.Adam(learning_rate=lr)
    g_optimizer = tf.optimizers.Adam(learning_rate=lr)
    d_optimizer = tf.optimizers.Adam(learning_rate=lr)
    mse_loss = keras.losses.MeanSquaredError()
    n_step_epoch = round(n_epoch // batch_size)

    G_Loss_file = open("g_loss.txt", 'w')
    for epoch in range(n_epoch):
        step_time = time.time()
        for step, lr_patchs in enumerate(tf.data.Dataset.from_tensor_slices(train_img_batch)):
            if lr_patchs.shape[0] != batch_size:  # if the remaining data in this epoch < batch_size
                break
            hr_patchs = target_img_batch[step]
            with tf.GradientTape() as tape:
                # tape.watch(G.trainable_weights)
                fake_hr_patchs = G(lr_patchs)
                fake_hr_patchs = tf.reshape(fake_hr_patchs, (8, 128, 128))
                mse_loss = tl.cost.mean_squared_error(fake_hr_patchs, hr_patchs, is_mean=True)
            grad = tape.gradient(mse_loss, G.trainable_variables)
            g_optimizer_init.apply_gradients(zip(grad, G.trainable_variables))
        print("Epoch: [{}/{}], time: {:.2f}s, mse: {:.2f} ".format(
            epoch, n_epoch, time.time() - step_time, mse_loss))
        G_Loss_file.write("Epoch: [{}/{}], time: {:.2f}s, mse: {:.2f} \n".format(
            epoch, n_epoch, time.time() - step_time, mse_loss))
    G_Loss_file.close()

    ## adversarial learning (G, D)
    Loss_file = open("loss.txt", 'w')
    n_step_epoch = round(n_epoch // batch_size)
    for epoch in range(n_epoch):
        step_time = time.time()
        for step, lr_patchs in enumerate(train_img_batch):
            if lr_patchs.shape[0] != batch_size:  # if the remaining data in this epoch < batch_size
                break
            hr_patchs = target_img_batch[step]
            with tf.GradientTape(persistent=True) as tape:
                fake_patchs = G(lr_patchs)
                fake_patchs = tf.reshape(fake_patchs, (8, 128, 128))
                logits_fake = D(fake_patchs)
                logits_real = D(hr_patchs)
                d_Loss_int = Intensity_Loss(logits_fake, logits_real)
                d_loss1 = tl.cost.sigmoid_cross_entropy(logits_fake, tf.zeros_like(logits_fake))
                d_loss = tf.add(-tf.math.log(d_loss1), 0.1 * d_Loss_int)
                g_gan_loss = 1e-3 * tl.cost.sigmoid_cross_entropy(logits_fake, tf.ones_like(logits_fake))
                mse_loss = tl.cost.mean_squared_error(fake_patchs, hr_patchs, is_mean=True)
                g_loss = tf.add(mse_loss, g_gan_loss)
            grad = tape.gradient(g_loss, G.trainable_weights)
            # print(grad, len(G.trainable_weights))
            g_optimizer.apply_gradients(zip(grad, G.trainable_weights))
            grad = tape.gradient(d_loss, D.trainable_weights)
            d_optimizer.apply_gradients(zip(grad, D.trainable_weights))
        print("Epoch: [{}/{}], time: {:.3f}s, g_loss(mse:{:.3f}, adv:{:.3f}), d_loss: {:.3f}".format(
            epoch, n_epoch, time.time() - step_time, mse_loss, g_gan_loss, d_loss))
        Loss_file.write("Epoch: [{}/{}], time: {:.3f}s, g_loss(mse:{:.3f}, adv:{:.3f}), d_loss: {:.3f}".format(
            epoch, n_epoch, time.time() - step_time, mse_loss, g_gan_loss, d_loss))
        if epoch != 0 and ((epoch % 5 == 0) or (epoch == n_epoch - 1)):
            G.save_weights("Gan_Weights/g_training_20_noise_no_contrast.h5")
            D.save_weights("Gan_Weights/d_training_20_noise_no_contrast.h5")
    Loss_file.close()
The model is not running on the GPU, and I want to utilize my GPU. This is how I call the function to train the model:
with tf.device("/device:XLA_GPU:0"):
    train()
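A quick way to check what TensorFlow actually sees (a hedged sketch, not from the original post): list the visible GPUs and enable device-placement logging. With a standard GPU build of TensorFlow, the plain "/GPU:0" device string is typically used rather than "XLA_GPU":

import tensorflow as tf

# Does this TensorFlow build see a GPU at all?
print(tf.config.list_physical_devices('GPU'))

# Log which device each op is placed on during the run.
tf.debugging.set_log_device_placement(True)

# If a GPU is listed, the usual device string is "/GPU:0".
with tf.device("/GPU:0"):
    train()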

Related

Where to set theta manually in U-Net using torch and Adam

I am using a U-Net architecture and the Adam optimizer.
At the beginning of the training phase, I want to initialize the value of theta manually, but I couldn't find an explanation for this part. This is the code, and I need a short explanation of how to adjust the value of theta. Thanks.
def train(model, data_in, loss, optim, max_epochs, model_dir, test_interval=1, device=torch.device("cuda:0")):
    best_metric = -1
    best_metric_epoch = -1
    save_loss_train = []
    save_loss_test = []
    save_metric_train = []
    save_metric_test = []
    train_loader, test_loader = data_in
    for epoch in range(max_epochs):
        print("-" * 10)
        print(f"epoch {epoch + 1}/{max_epochs}")
        model.train()
        train_epoch_loss = 0
        train_step = 0
        epoch_metric_train = 0
        for batch_data in train_loader:
            train_step += 1
            volume = batch_data["vol"]
            label = batch_data["seg"]
            label = label != 0
            volume, label = (volume.to(device), label.to(device))
            optim.zero_grad()
            outputs = model(volume)
            train_loss = loss(outputs, label)
            train_loss.backward()
            optim.step()
            train_epoch_loss += train_loss.item()
            print(
                f"{train_step}/{len(train_loader) // train_loader.batch_size}, "
                f"Train_loss: {train_loss.item():.4f}")
            train_metric = dice_metric(outputs, label)
            epoch_metric_train += train_metric
            print(f'Train_dice: {train_metric:.4f}')
        print('-' * 20)
        train_epoch_loss /= train_step
        print(f'Epoch_loss: {train_epoch_loss:.4f}')
        save_loss_train.append(train_epoch_loss)
        np.save(os.path.join(model_dir, 'loss_train.npy'), save_loss_train)
        epoch_metric_train /= train_step
        print(f'Epoch_metric: {epoch_metric_train:.4f}')
        save_metric_train.append(epoch_metric_train)
        np.save(os.path.join(model_dir, 'metric_train.npy'), save_metric_train)
        if (epoch + 1) % test_interval == 0:
            model.eval()
            with torch.no_grad():
                test_epoch_loss = 0
                test_metric = 0
                epoch_metric_test = 0
                test_step = 0
                for test_data in test_loader:
                    test_step += 1
                    test_volume = test_data["vol"]
                    test_label = test_data["seg"]
                    test_label = test_label != 0
                    test_volume, test_label = (test_volume.to(device), test_label.to(device))
                    test_outputs = model(test_volume)
                    test_loss = loss(test_outputs, test_label)  # use test_outputs here, not the training outputs
                    test_epoch_loss += test_loss.item()
                    test_metric = dice_metric(test_outputs, test_label)
                    epoch_metric_test += test_metric
                test_epoch_loss /= test_step
                print(f'test_loss_epoch: {test_epoch_loss:.4f}')
                save_loss_test.append(test_epoch_loss)
                np.save(os.path.join(model_dir, 'loss_test.npy'), save_loss_test)
                epoch_metric_test /= test_step
                print(f'test_dice_epoch: {epoch_metric_test:.4f}')
                save_metric_test.append(epoch_metric_test)
                np.save(os.path.join(model_dir, 'metric_test.npy'), save_metric_test)
                if epoch_metric_test > best_metric:
                    best_metric = epoch_metric_test
                    best_metric_epoch = epoch + 1
                    torch.save(model.state_dict(), os.path.join(
                        model_dir, "best_metric_model.pth"))
                print(
                    f"current epoch: {epoch + 1} current mean dice: {test_metric:.4f}"
                    f"\nbest mean dice: {best_metric:.4f} "
                    f"at epoch: {best_metric_epoch}"
                )
    print(
        f"train completed, best_metric: {best_metric:.4f} "
        f"at epoch: {best_metric_epoch}")
loss_function = DiceLoss(to_onehot_y=True, sigmoid=True, squared_pred=True)
optimizer = torch.optim.Adam(model.parameters(), 1e-5, weight_decay=1e-5, amsgrad=True)
train(model, data_in, loss_function, optimizer, 500, model_dir)
I am trying to adjust the value of theta in this code, and I expect the network to start training from the manually set weights.
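If by theta you mean the network parameters, a minimal sketch (assuming the same model, loss_function, data_in, and model_dir as above) is to set them with torch.nn.init, or load a saved state_dict, before creating the optimizer and calling train:

import torch
import torch.nn as nn

def init_theta(m):
    # Manually set the initial parameters (theta) layer by layer.
    if isinstance(m, (nn.Conv2d, nn.Conv3d, nn.Linear)):
        nn.init.kaiming_normal_(m.weight)          # or nn.init.constant_(m.weight, 0.01), etc.
        if m.bias is not None:
            nn.init.zeros_(m.bias)

model.apply(init_theta)                            # option 1: initialize in place
# model.load_state_dict(torch.load("my_theta.pth"))  # option 2: start from saved weights (hypothetical file)

optimizer = torch.optim.Adam(model.parameters(), 1e-5, weight_decay=1e-5, amsgrad=True)
train(model, data_in, loss_function, optimizer, 500, model_dir)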

How is the loss backpropagated to provide effective weights for the next batch iteration?

I tried to fuse multiple losses, but they do not affect the weights for the next iteration. I would like to fuse the loss functions because I am getting multiple outputs from the generator model. How can I solve this?
def train(self, epochs, batch_size=1, sample_interval=50):
    LAMBDA = 0.1
    start_time = datetime.datetime.now()
    for epoch in range(epochs):
        if epoch % 500 == 0:
            optimizer = Adam(0.0001, 0.5)
            self.combined.compile(loss=['mae'], loss_weights=[1, 100], optimizer=optimizer)
        for batch_i, (imgs_A, imgs_B) in enumerate(self.data_loader.load_batch(batch_size)):
            # os.environ['CUDA_VISIBLE_DEVICES'] = '-1'
            fake_A1, fake_A2, fake_A3 = self.generator.predict(imgs_B)
            g1_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A1])
            fake_A2 = tf.image.resize(fake_A2, [256, 256])
            fake_A3 = tf.image.resize(fake_A3, [256, 256])
            g2_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A2])
            g3_loss = self.combined.train_on_batch([imgs_A, imgs_B], [fake_A3])
            g1_loss = tf.cast(g1_loss, tf.double)
            g2_loss = tf.cast(g2_loss, tf.double)
            g3_loss = tf.cast(g3_loss, tf.double)
            g_loss = g1_loss + g2_loss + g3_loss
            imgs_A = np.fft.fft2(imgs_A)
            fake_A1 = np.fft.fft2(fake_A1)
            fake_A2 = np.fft.fft2(fake_A2)
            fake_A3 = np.fft.fft2(fake_A3)
            mse = tf.keras.losses.MeanSquaredError()
            l1_loss = mse(imgs_A, fake_A1).numpy()
            l2_loss = mse(imgs_A, fake_A2).numpy()
            l3_loss = mse(imgs_A, fake_A3).numpy()
            l1_loss = tf.cast(l1_loss, tf.double)
            l2_loss = tf.cast(l2_loss, tf.double)
            l3_loss = tf.cast(l3_loss, tf.double)
            l_loss = l1_loss + l2_loss + l3_loss
            # Normalized loss
            g_loss = g_loss + (LAMBDA * l_loss)
            elapsed_time = datetime.datetime.now() - start_time
            print("[Epoch %d/%d] [Batch %d/%d] [G loss: %f] time: %s" % (epoch, epochs, batch_i, self.data_loader.n_batches, g_loss, elapsed_time))
How do we backpropagate the loss after normalization, which is stored in g_loss via the formula g_loss = g_loss + (LAMBDA * l_loss)? For comparison, the loss is backpropagated in the PyTorch code at the following link: https://github.com/chosj95/MIMO-UNet/blob/main/train.py
which backpropagates it using the following statements:
loss.backward()
optimizer.step()
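In TF2/Keras, the counterpart of PyTorch's loss.backward(); optimizer.step() is to build every loss term inside one tf.GradientTape and call apply_gradients on the combined scalar; losses assembled from train_on_batch results or NumPy ops (np.fft.fft2, .numpy()) live outside the tape and cannot influence the weights. A minimal sketch, assuming the generator, imgs_A, imgs_B, and LAMBDA from the code above:

import tensorflow as tf

optimizer = tf.keras.optimizers.Adam(1e-4, 0.5)
mse = tf.keras.losses.MeanSquaredError()

with tf.GradientTape() as tape:
    fake_A1, fake_A2, fake_A3 = generator(imgs_B, training=True)  # call the model, not .predict()
    fake_A2 = tf.image.resize(fake_A2, [256, 256])
    fake_A3 = tf.image.resize(fake_A3, [256, 256])
    # reconstruction term for each of the three outputs
    g_loss = mse(imgs_A, fake_A1) + mse(imgs_A, fake_A2) + mse(imgs_A, fake_A3)
    # any extra terms (e.g. frequency-domain losses) must also use TensorFlow ops
    # inside the tape; np.fft / .numpy() break the gradient flow
    total_loss = g_loss            # e.g. g_loss + LAMBDA * l_loss

# the TF2 equivalent of `loss.backward(); optimizer.step()`
grads = tape.gradient(total_loss, generator.trainable_variables)
optimizer.apply_gradients(zip(grads, generator.trainable_variables))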

Increase of GPU memory usage during training

I was training a network on the usual MNIST dataset and encountered the following problem: when I start appending valid_metrics to loss_list and accuracy_list, the amount of GPU memory in use starts increasing every one or two epochs.
This is the code of the training loop:
def train_model(model: torch.nn.Module,
                train_dataset: torch.utils.data.Dataset,
                valid_dataset: torch.utils.data.Dataset,
                loss_function: torch.nn.Module = torch.nn.CrossEntropyLoss(),
                optimizer_class: Type[torch.optim.Optimizer] = torch.optim,
                optimizer_params: Dict = {},
                initial_lr = 0.01,
                lr_scheduler_class: Any = torch.optim.lr_scheduler.ReduceLROnPlateau,
                lr_scheduler_params: Dict = {},
                batch_size = 64,
                max_epochs = 1000,
                early_stopping_patience = 20):
    optimizer = torch.optim.Adam(model.parameters(), lr=initial_lr, **optimizer_params)
    lr_scheduler = lr_scheduler_class(optimizer, **lr_scheduler_params)
    train_loader = torch.utils.data.DataLoader(train_dataset, shuffle=True, batch_size=batch_size)
    valid_loader = torch.utils.data.DataLoader(valid_dataset, batch_size=batch_size)
    best_valid_loss = None
    best_epoch = None
    loss_list = list()
    accuracy_list = list()
    for epoch in range(max_epochs):
        print(f'Epoch {epoch}')
        start = timer()
        train_single_epoch(model, optimizer, loss_function, train_loader)
        valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
        loss_list.append(valid_metrics['loss'])
        accuracy_list.append(valid_metrics['accuracy'])
        print('time:', timer() - start)
        print(f'Validation metrics: \n{valid_metrics}')
        lr_scheduler.step(valid_metrics['loss'])
        if best_valid_loss is None or best_valid_loss > valid_metrics['loss']:
            print(f'Best model yet, saving')
            best_valid_loss = valid_metrics['loss']
            best_epoch = epoch
            torch.save(model, './best_model.pth')
        if epoch - best_epoch > early_stopping_patience:
            print('Early stopping triggered')
            return loss_list, accuracy_list
and the code of validate_single_epoch:
def validate_single_epoch(model: torch.nn.Module,
                          loss_function: torch.nn.Module,
                          data_loader: torch.utils.data.DataLoader):
    loss_total = 0
    accuracy_total = 0
    for data in data_loader:
        X, y = data
        X, y = X.view(-1, 784), y.to(device)
        X = X.to(device)
        output = model(X)
        loss = loss_function(output, y)
        loss_total += loss
        y_pred = output.argmax(dim=1, keepdim=True).to(device)
        accuracy_total += y_pred.eq(y.view_as(y_pred)).sum().item()
    loss_avg = loss_total / len(data_loader.dataset)
    accuracy_avg = 100.0 * accuracy_total / len(data_loader.dataset)
    return {'loss': loss_avg, 'accuracy': accuracy_avg}
I use a GeForce MX250 GPU.
The problem is likely that gradients are being computed and stored during the validation loop. The easiest fix is to wrap the validation call in a no_grad context:
with torch.no_grad():
    valid_metrics = validate_single_epoch(model, loss_function, valid_loader)
If you prefer, you can instead decorate validate_single_epoch(...) with @torch.no_grad():
@torch.no_grad()
def validate_single_epoch(...):
    # ...
Not related to your problem, but note that the model runs in training mode during validation, which may not be what you want; there is probably a missing call to model.eval() in the validation function.
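A further detail, beyond the no_grad fix: loss_total += loss keeps each batch's computation graph alive, because loss is still attached to it. Accumulating the plain Python float releases the graph even without a no_grad block:

loss_total += loss.item()   # accumulate a detached float instead of a graph-attached tensor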

I am trying to resume training from a certain checkpoint (Tensorflow) because I'm using Colab and 12 hours aren't enough

This is some part of the code I'm using
checkpoint_dir = 'training_checkpoints1'
checkpoint_prefix = os.path.join(checkpoint_dir, "ckpt")
checkpoint = tf.train.Checkpoint(optimizer=optimizer,
                                 encoder=encoder,
                                 decoder=decoder)
Now this is the training part
EPOCHS = 900

for epoch in range(EPOCHS):
    start = time.time()
    hidden = encoder.initialize_hidden_state()
    total_loss = 0
    for (batch, (inp, targ)) in enumerate(dataset):
        loss = 0
        with tf.GradientTape() as tape:
            enc_output, enc_hidden = encoder(inp, hidden)
            dec_hidden = enc_hidden
            dec_input = tf.expand_dims([targ_lang.word2idx['<start>']] * batch_size, 1)
            # Teacher forcing - feeding the target as the next input
            for t in range(1, targ.shape[1]):
                # passing enc_output to the decoder
                predictions, dec_hidden, _ = decoder(dec_input, dec_hidden, enc_output)
                loss += loss_function(targ[:, t], predictions)
                # using teacher forcing
                dec_input = tf.expand_dims(targ[:, t], 1)
        batch_loss = (loss / int(targ.shape[1]))
        total_loss += batch_loss
        variables = encoder.variables + decoder.variables
        gradients = tape.gradient(loss, variables)
        optimizer.apply_gradients(zip(gradients, variables))
        if batch % 100 == 0:
            print('Epoch {} Batch {} Loss {:.4f}'.format(epoch + 1,
                                                         batch,
                                                         batch_loss.numpy()))
    # saving (checkpoint) the model every 2 epochs
    if (epoch + 1) % 2 == 0:
        checkpoint.save(file_prefix=checkpoint_prefix)
    print('Epoch {} Loss {:.4f}'.format(epoch + 1,
                                        total_loss / num_batches))
    print('Time taken for 1 epoch {} sec\n'.format(time.time() - start))
Now I want to restore, for example, this checkpoint and continue training from there, but I don't know how.
path="/content/drive/My Drive/training_checkpoints1/ckpt-9"
checkpoint.restore(path)
Result
<tensorflow.python.training.tracking.util.CheckpointLoadStatus at 0x7f6653263048>
You should create a CheckpointManager at the start as:
checkpoint_path = os.path.abspath('.') + "/checkpoints"  # Put your path here
ckpt = tf.train.Checkpoint(encoder=encoder,
                           decoder=decoder,
                           optimizer=optimizer)
ckpt_manager = tf.train.CheckpointManager(ckpt, checkpoint_path, max_to_keep=5)
Now, after running for a few epochs, to restore the latest checkpoint you should get it from the CheckpointManager:
start_epoch = 0
if ckpt_manager.latest_checkpoint:
    start_epoch = int(ckpt_manager.latest_checkpoint.split('-')[-1])
    # restoring the latest checkpoint in checkpoint_path
    ckpt.restore(ckpt_manager.latest_checkpoint)
This will restore your session from the latest epoch.
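To make resumption complete, the checkpoints also need to be written through the manager and the epoch loop started at start_epoch. A minimal sketch, using the names from the answer above and keeping the question's "every 2 epochs" schedule (an assumption):

# Save via the manager inside the training loop so latest_checkpoint stays current.
for epoch in range(start_epoch, EPOCHS):
    # ... run all training batches for this epoch (as in the loop above) ...
    if (epoch + 1) % 2 == 0:
        save_path = ckpt_manager.save()      # writes checkpoint_path/ckpt-<n>
        print('Saved checkpoint for epoch {} at {}'.format(epoch + 1, save_path))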

tf.io.read_file unable to find file or directory from a numpy array

For TensorFlow dataset loading, I saved a numpy array from my database.
images = train_dataset[i]
image_string = tf.io.read_file(images)
This is part of my training function, and it seems to be failing. The error is:
(iP6)21.jpg; No such file or directory
Any help is appreciated.
Edited to explain the whole code: train_dataset is a variable that holds a numpy array.
train_dataset = X_train_shuffled
For the sake of completeness, here is the entire training function:
# training
import time
import tensorflow as tf
# from tensorflow import read_file

def train():
    data = filenames_shuffled
    data
    info = y_labels_one_hot_shuffled
    # data, info = tfds.load("mnist", with_info=True, data_dir='/data/tensorflow_datasets')
    # train_data = data['train']
    train_data = X_train_filenames
    if not os.path.exists('./images'):
        os.makedirs('./images')
    # setting hyperparameters
    latent_dim = 100
    epochs = 800
    batch_size = 32
    buffer_size = 6000
    save_interval = 50
    img_shape = (32, 32, 3)
    # num_classes = info.features['label'].num_classes
    # num_classes = 10
    num_classes = y_train
    generator = Generator(num_classes)
    discriminator = Discriminator(num_classes)
    gen_optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)
    disc_optimizer = tf.keras.optimizers.Adam(0.0002, 0.5)
    # train_dataset = train_data.map(lambda x: preprocess_image(x, img_shape, num_classes)).shuffle(buffer_size).batch(batch_size)
    train_dataset = train_data
    cross_entropy = tf.keras.losses.BinaryCrossentropy(from_logits=True)

    # tf.function
    def train_step(images, labels):
        noise = tf.random.normal([batch_size, latent_dim])
        with tf.GradientTape(persistent=True) as tape:
            generated_images = generator(noise, labels)
            real_output = discriminator(images, labels)
            generated_output = discriminator(generated_images, labels)
            gen_loss = generator_loss(cross_entropy, generated_output)
            disc_loss = discriminator_loss(cross_entropy, real_output, generated_output)
        grad_gen = tape.gradient(gen_loss, generator.trainable_variables)
        grad_disc = tape.gradient(disc_loss, discriminator.trainable_variables)
        gen_optimizer.apply_gradients(zip(grad_gen, generator.trainable_variables))
        disc_optimizer.apply_gradients(zip(grad_disc, discriminator.trainable_variables))
        return gen_loss, disc_loss

    seed = tf.random.normal([16, latent_dim])
    # train_dataset.append(y_train)
    # train_dataset = np.append(train_dataset, y_train)
    for epoch in range(1, epochs + 1):
        start = time.time()
        total_gen_loss = 0
        total_disc_loss = 0
        # for images in filenames_shuffled and labels in y_labels_one_hot_shuffled:
        '''
        Trying an alternate for loop, since the numpy array cannot be called:
        for images, labels in train_dataset():
            gen_loss, disc_loss = train_step(images, labels)
            total_gen_loss += gen_loss
            total_disc_loss += disc_loss
        '''
        i = 0
        # print(train_dataset)
        # print("done")
        # print(y_train)
        while i < 2200:
            images = train_dataset[i]
            labels = y_train[i]
            '''
            added this region. Not sure what is happening:
            Input filename tensor must be scalar, but had shape: [2200] [Op:ReadFile]
            '''
            image_string = tf.io.read_file(images)
            image_decoded = tf.image.decode_jpeg(image_string, channels=3)
            images = tf.cast(image_decoded, tf.float32)
            '''
            till here
            '''
            # filenames_shuffled_numpy, y_labels_one_hot_shuffled
            # print("No error")
            gen_loss, disc_loss = train_step(images, labels)
            # print("error")
            total_gen_loss += gen_loss
            total_disc_loss += disc_loss
            i = i + 1
        print('Time for epoch {} is {} sec - gen_loss = {}, disc_loss = {}'.format(
            epoch, time.time() - start, total_gen_loss / batch_size, total_disc_loss / batch_size))
        if epoch % save_interval == 0:
            save_imgs(epoch, generator, seed)

if __name__ == "__main__":
    train()
If you need more information, I will happily provide it.
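The error suggests tf.io.read_file is being given a bare filename that does not resolve from the current working directory (and, per the other error message, sometimes the whole array rather than one scalar path). A sketch of how this is often handled, assuming a hypothetical image_dir that holds the JPEGs:

import os
import tensorflow as tf

image_dir = "/content/drive/My Drive/images"   # hypothetical: wherever the JPEG files actually live

filename = train_dataset[i]                    # one entry, e.g. "(iP6)21.jpg"
if isinstance(filename, bytes):                # numpy string arrays sometimes yield bytes
    filename = filename.decode("utf-8")

full_path = os.path.join(image_dir, filename)
print(os.path.exists(full_path))               # verify the file is really reachable first

image_string = tf.io.read_file(full_path)      # read_file expects a single (scalar) path string
image = tf.image.decode_jpeg(image_string, channels=3)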
