Discrepancy between TensorFlow model and PyTorch model - python

I trained a U-Net model on the same dataset using both TensorFlow and PyTorch. Both models show a reasonable training loss curve, but the PyTorch validation loss keeps zigzagging.
I think the TensorFlow side is fine, but I'm new to PyTorch. Please check whether I made a mistake in the PyTorch code.
Below is the TensorFlow loss curve (plot not included):
and this is the PyTorch one (plot not included):
The TensorFlow code:
class DataGen(keras.utils.Sequence):
def __init__(self, ids, path, batch_size=8, image_size=128):
self.ids = ids
self.path = path
self.batch_size = batch_size
self.image_size = image_size
self.on_epoch_end()
def __load__(self, id_name):
## Path
#image_path = os.path.join(self.path, id_name, "images", id_name) + ".png"
#/content/drive/My Drive/mycolab/training/ patient0001 / patient0001
image_path = os.path.join(self.path, id_name, id_name,) + "_2CH_ED.mhd"
#mask_path = os.path.join(self.path, id_name, "masks/")
mask_path = os.path.join(self.path, id_name, id_name,) + "_2CH_ED_gt.mhd"
# not required all_masks = os.listdir(mask_path)
## Reading Image
#image = cv2.imread(image_path, 1)
my_img1= io.imread(image_path , plugin='simpleitk')
image=my_img1[0,:,:]
#--------------image = cv2.merge((image,image,image))
#image =convert_to_3_channel( cv2.resize(image, (self.image_size, self.image_size)))
image = cv2.resize(image, (self.image_size, self.image_size))
#image = cv2.merge((image,image,image,image))
#image = cv2.merge((image,image,image,image))
# same for mask
my_mask1= io.imread(mask_path , plugin='simpleitk')
mask=my_mask1[0,:,:]
mask= cv2.resize( mask, (self.image_size, self.image_size))
#one_hot_tensor= K.one_hot(K.cast( tf.convert_to_tensor(mask, dtype=tf.int32) , 'int32'), num_classes=4)
#mask=np.asarray(one_hot_tensor, np.float32)
#mask=np.asarray(one_hot_tensor, np.int32)
masks = [(mask == v) for v in range(4) ] #self.class_values]
mask = np.stack(masks, axis=-1).astype('float')
#masks = [(mask == v) for v in range(4)]#self.class_values]
#print("mask ttttttttttt5555555555:", type(masks))
#mask = np.stack(masks, axis=-1).astype('float')
# add background if mask is not binary
#if mask.shape[-1] != 1:
# #print("adding background if mask is not binary******************++++++++__________________$$")
# background = 1 - mask.sum(axis=-1, keepdims=True)
# mask = np.concatenate((mask, background), axis=-1)
#mask = np.zeros((self.image_size, self.image_size, 1))
## Reading Masks
#for name in all_masks:
# _mask_path = mask_path + name
# _mask_image = cv2.imread(_mask_path, -1)
# _mask_image = cv2.resize(_mask_image, (self.image_size, self.image_size)) #128x128
# _mask_image = np.expand_dims(_mask_image, axis=-1)
# mask = np.maximum(mask, _mask_image)
#print("image &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
#print("image &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
#print("image &&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&&")
#print(mask)
## Normalizaing
#-------------image = image/255.0
#---- check if tis correct mask = mask/255.0
return image, mask# image.astype('float'), mask.astype('float')
def __getitem__(self, index):
#print("Index *************************************:", index )
if(index+1)*self.batch_size > len(self.ids):
self.batch_size = len(self.ids) - index*self.batch_size
files_batch = self.ids[index*self.batch_size : (index+1)*self.batch_size]
image = []
mask = []
for id_name in files_batch:
_img, _mask = self.__load__(id_name)
image.append(_img ) #.astype('float'))
mask.append(_mask ) #.astype('float'))
image = np.array(image)
mask = np.array(mask)
#print("image shape%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%",image.shape)
#print("mask shape%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%%",mask.shape)
return image , mask# image.astype('float'), mask.astype('float')
def on_epoch_end(self):
pass
def __len__(self):
return int(np.ceil(len(self.ids)/float(self.batch_size)))
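To sanity-check what the generator actually feeds the Keras model, it can help to pull one batch and print its shapes; a minimal sketch, assuming train_ids and train_path are defined as in the configuration below and point at valid data:
# inspect one batch from the generator (shapes follow from __load__/__getitem__ above)
gen = DataGen(train_ids, train_path, batch_size=1, image_size=256)
x, y = gen[0]
print(x.shape, y.shape)  # expected: (1, 256, 256) for the image, (1, 256, 256, 4) for the one-hot mask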
# you may need to change variable names
image_size = 256
train_path = "path"  # "dataset/stage1_train/"
epochs = 10  # 70 # the paper requires 30
batch_size = 1  # 32 # 8
num_class = 4
my_slice_index = 2  # class index that we are calculating
## Training ids (must be defined before they are used below)
train_ids = next(os.walk(train_path))[1]
print("train_ids length:", len(train_ids))
## Validation data size
val_data_size = 10
valid_ids = train_ids[:val_data_size]
def down_block(x, filters, kernel_size=(3, 3), padding="same", strides=1):
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
p = keras.layers.MaxPool2D((2, 2), (2, 2))(c)
return c, p
def down_block_test(x, filters, kernel_size=(3, 3), padding="same", strides=1):
residual = x
print("down_block: residual size", residual.shape)
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
print("down_block: c size", c.shape)
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
print("down_block: c2 size", c.shape)
print("down_block: residual.shape[1]", residual.shape[1])
print("down_block: residual.shape[3]", residual.shape[3])
if residual.shape[3] != c.shape[3]:
residual = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(residual)
c += residual
p = keras.layers.MaxPool2D((2, 2), (2, 2))(c)
return c, p
def up_block(x, skip, filters, kernel_size=(3, 3), padding="same", strides=1):
us = keras.layers.UpSampling2D((2, 2))(x)
concat = keras.layers.Concatenate()([us, skip])
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(concat)
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
return c
def bottleneck(x, filters, kernel_size=(3, 3), padding="same", strides=1):
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
return c
#------------------------------------ model in the paper as Unet 1 ----------------------------------------
def UNet_test():
f = [16, 32, 64, 128, 256]
#inputs = keras.layers.Input((image_size, image_size, 3))
inputs = keras.layers.Input((image_size, image_size, 1))
p0 = inputs
tf.print("********************************************p0: ")
c1, p1 = down_block(p0, f[1]) #264 -> 128
print("c1",c1.shape )
print("p1",p1.shape )
c2, p2 = down_block(p1, f[1]) #128 -> 64
print("c2",c2.shape )
print("p2",p2.shape )
c3, p3 = down_block(p2, f[2]) #64 -> 32
print("c3",c3.shape )
print("p3",p3.shape )
c4, p4 = down_block(p3, f[3]) #32->16
print("c4",c4.shape )
print("p4",p4.shape )
c5, p5 = down_block(p4, f[3]) #16->8
print("c5",c5.shape )
print("p5",p5.shape )
bn = bottleneck(p5, f[3]) # 8
u1 = up_block(bn, c5, f[3]) #8 -> 16
u2 = up_block(u1, c4, f[3]) #16 -> 32
u3 = up_block(u2, c3, f[2]) #32 -> 64
u4 = up_block(u3, c2, f[1]) #64 -> 128
u5 = up_block(u4, c1, f[0]) #128 -> 256
#outputs = keras.layers.Conv2D(1, (1, 1), padding="same", activation="sigmoid")(u4)
outputs = keras.layers.Conv2D(num_class, (1, 1), padding="same", activation="softmax")(u5)
#outputs = keras.layers.Conv2D(1, (1, 1), padding="same", activation="softmax")(u4)
model = keras.models.Model(inputs, outputs)
return model
model=UNet_test()
import segmentation_models as sm
#---------------------model =new_model(Resmodel,'sigmoid')#model_standard() # UNet_1()
#model.load_weights(weights_path)
#model.load_weights("ckpt")
LR = 0.0001
optim = keras.optimizers.Adam(LR)
dice_loss_se2 = sm.losses.DiceLoss()
mae = tf.keras.losses.MeanAbsoluteError( )
metrics = [ mae,sm.metrics.IOUScore(threshold=0.5), sm.metrics.FScore(threshold=0.5) , dice_loss_se2]
model.compile(optimizer=optim,loss= dice_loss_se2,metrics= metrics)
train_gen = DataGen(train_ids, train_path, image_size=image_size, batch_size=batch_size)
valid_gen = DataGen(valid_ids, train_path, image_size=image_size, batch_size=batch_size)
train_steps = len(train_ids)//batch_size
valid_steps = len(valid_ids)//batch_size
history =model.fit_generator(train_gen, validation_data=valid_gen, steps_per_epoch=train_steps, validation_steps=valid_steps,
epochs=epochs)
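Since the comparison is between loss curves, it can also help to plot the Keras history explicitly; a minimal sketch, assuming matplotlib is available and the default 'loss'/'val_loss' keys:
import matplotlib.pyplot as plt

# history comes from model.fit_generator(...) above
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='val loss')
plt.xlabel('epoch')
plt.ylabel('dice loss')
plt.legend()
plt.show()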
And below is the PyTorch code:
class DoubleConv(nn.Module):
"""(convolution => [BN] => ReLU) * 2"""
def __init__(self, in_channels, out_channels, mid_channels=None):
super().__init__()
if not mid_channels:
mid_channels = out_channels
self.double_conv = nn.Sequential(
nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(mid_channels),
nn.ReLU(inplace=True),
nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
nn.BatchNorm2d(out_channels),
nn.ReLU(inplace=True)
)
def forward(self, x):
return self.double_conv(x)
class Down(nn.Module):
"""Downscaling with maxpool then double conv"""
def __init__(self, in_channels, out_channels):
super().__init__()
self.maxpool_conv = nn.Sequential(
nn.MaxPool2d(2),
DoubleConv(in_channels, out_channels)
)
def forward(self, x):
return self.maxpool_conv(x)
class Up(nn.Module):
"""Upscaling then double conv"""
def __init__(self, in_channels, out_channels, bilinear=True):
super().__init__()
# if bilinear, use the normal convolutions to reduce the number of channels
if bilinear:
self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
else:
self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
self.conv = DoubleConv(in_channels, out_channels)
def forward(self, x1, x2):
x1 = self.up(x1)
# input is CHW
diffY = x2.size()[2] - x1.size()[2]
diffX = x2.size()[3] - x1.size()[3]
x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
diffY // 2, diffY - diffY // 2])
# if you have padding issues, see
# https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
# https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
x = torch.cat([x2, x1], dim=1)
return self.conv(x)
class OutConv(nn.Module):
def __init__(self, in_channels, out_channels):
super(OutConv, self).__init__()
self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)
def forward(self, x):
return self.conv(x)
class UNet_standard(nn.Module):
def __init__(self, n_channels, n_classes, bilinear=False):
super(UNet_standard, self).__init__()
self.n_channels = n_channels
self.n_classes = n_classes
self.bilinear = bilinear
self.inc = DoubleConv(n_channels, 64)
self.down1 = Down(64, 128)
self.down2 = Down(128, 256)
self.down3 = Down(256, 512)
factor = 2 if bilinear else 1
self.down4 = Down(512, 1024 // factor)
self.up1 = Up(1024, 512 // factor, bilinear)
self.up2 = Up(512, 256 // factor, bilinear)
self.up3 = Up(256, 128 // factor, bilinear)
self.up4 = Up(128, 64, bilinear)
self.outc = OutConv(64, n_classes)
def forward(self, x):
x1 = self.inc(x)
x2 = self.down1(x1)
x3 = self.down2(x2)
x4 = self.down3(x3)
x5 = self.down4(x4)
x = self.up1(x5, x4)
x = self.up2(x, x3)
x = self.up3(x, x2)
x = self.up4(x, x1)
logits = self.outc(x)
return logits
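A quick way to confirm the PyTorch model produces the layout expected by the loss (raw logits of shape [batch, 4, H, W]) is a dummy forward pass; a minimal sketch with a made-up batch:
import torch

# dummy forward pass as a shape check (hypothetical batch of 2 single-channel 256x256 images)
model = UNet_standard(n_channels=1, n_classes=4)
x = torch.randn(2, 1, 256, 256)
with torch.no_grad():
    logits = model(x)
print(logits.shape)  # torch.Size([2, 4, 256, 256])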
class DiceLoss(nn.Module):
def __init__(self, n_classes):
super(DiceLoss, self).__init__()
self.n_classes = n_classes
def _one_hot_encoder(self, input_tensor):
tensor_list = []
for i in range(self.n_classes):
temp_prob = input_tensor == i # * torch.ones_like(input_tensor)
tensor_list.append(temp_prob.unsqueeze(1))
output_tensor = torch.cat(tensor_list, dim=1)
return output_tensor.float()
def _dice_loss(self, score, target):
target = target.float()
smooth = 1e-5
intersect = torch.sum(score * target)
y_sum = torch.sum(target * target)
z_sum = torch.sum(score * score)
loss = (2 * intersect + smooth) / (z_sum + y_sum + smooth)
loss = 1 - loss
return loss
def forward(self, inputs, target, weight=None, softmax=False):
if softmax:
inputs = torch.softmax(inputs, dim=1)
target = self._one_hot_encoder(target)
if weight is None:
weight = [1] * self.n_classes
assert inputs.size() == target.size(), 'predict {} & target {} shape do not match'.format(inputs.size(), target.size())
class_wise_dice = []
loss = 0.0
for i in range(0, self.n_classes):
dice = self._dice_loss(inputs[:, i], target[:, i])
class_wise_dice.append(1.0 - dice.item())
loss += dice * weight[i]
return loss / self.n_classes
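As written, this DiceLoss expects raw logits of shape [batch, n_classes, H, W] and a class-index map of shape [batch, H, W] (it one-hot encodes the target itself); a small shape check with made-up tensors:
import torch

criterion = DiceLoss(n_classes=4)
logits = torch.randn(2, 4, 64, 64)                  # raw network outputs
target = torch.randint(0, 4, (2, 64, 64)).float()   # class indices 0..3
loss = criterion(logits, target, softmax=True)
print(loss.item())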
def iou_score(output, target):
smooth = 1e-5
if torch.is_tensor(output):
output = torch.sigmoid(output).data.cpu().numpy()
if torch.is_tensor(target):
target = target.data.cpu().numpy()
output_ = output > 0.5
target_ = target > 0.5
intersection = (output_ & target_).sum()
union = (output_ | target_).sum()
return (intersection + smooth) / (union + smooth)
class easy_Synapse_dataset(Dataset):
def __init__(self, split, transform=None):
self.transform = transform # using transform in torch!
self.split = split
if self.split == "train":
use_path="path1"
else:
#use_path="path1"
use_path="path2"
self.sample_list =next(os.walk(use_path))[1] #open(os.path.join(list_dir, self.split+'.txt')).readlines()
def __len__(self):
return len(self.sample_list)
def __getitem__(self, idx):
if self.split == "train":
use_path="path1"
else:
#use_path="path1"
use_path="path2"
'''
if self.split == "train":
slice_name = self.sample_list[idx].strip('\n')
data_path = os.path.join(self.data_dir, slice_name+'.npz')
data = np.load(data_path)
image, label = data['image'], data['label']
else:
vol_name = self.sample_list[idx].strip('\n')
filepath = self.data_dir + "/{}.npy.h5".format(vol_name)
data = h5py.File(filepath)
image, label = data['image'][:], data['label'][:]
'''
#--------------------------------------------------------------------
#
image_path = os.path.join(use_path, self.sample_list[idx], self.sample_list[idx],) + "_2CH_ED.mhd"
#print(image_path)
#mask_path = os.path.join(self.path, id_name, "masks/")
mask_path = os.path.join(use_path, self.sample_list[idx], self.sample_list[idx] ,) + "_2CH_ED_gt.mhd"
#print(mask_path)
# not required all_masks = os.listdir(mask_path)
## Reading Image
#image = cv2.imread(image_path, 1)
my_img1= iio.imread(image_path , plugin='simpleitk')
image=my_img1[0,:,:]
#--------------image = cv2.merge((image,image,image))
#image =convert_to_3_channel( cv2.resize(image, (self.image_size, self.image_size)))
image = cv2.resize(image, (img_size, img_size))
#image = cv2.merge((image,image,image))
#image = np.moveaxis(image , 2, 0)
#image = cv2.merge((image,image,image,image))
# same for mask
my_mask1= iio.imread(mask_path , plugin='simpleitk')
mask=my_mask1[0,:,:]
mask= cv2.resize( mask, (img_size, img_size))
#one_hot_tensor= K.one_hot(K.cast( tf.convert_to_tensor(mask, dtype=tf.int32) , 'int32'), num_classes=4)
#mask=np.asarray(one_hot_tensor, np.float32)
#mask=np.asarray(one_hot_tensor, np.int32)
#masks = [(mask == v) for v in range(4) ] #self.class_values]
#mask = np.stack(masks, axis=-1).astype('float')
'''
mask = torch.Tensor(mask)
mask=torch.nn.functional.one_hot(mask.to(torch.int64) , num_classes=4)
mask = mask.to(torch.float)
mask = mask.permute(2, 0, 1)
'''
label=mask
#--------------------------------------------------------------------
#print("image ", image.shape)
#print("mask ", mask.shape)
'''
transform = transforms.Compose([
transforms.ToTensor()
])
'''
image = torch.from_numpy(image.astype(np.float32)).unsqueeze(0)
label = torch.from_numpy(label.astype(np.float32))
#if self.split != "train":
# image = torch.from_numpy(image.astype(np.float32)).unsqueeze(0)
# label = torch.from_numpy(label.astype(np.float32))
sample = {'image': image, 'label': label}
if self.transform:
sample = self.transform(sample)
#print("sample[image].size() ", sample["image"].shape)
#print("sample[label].size() ", sample["label"].shape)
return sample# sample#transform(image), transform(mask).squeeze(0) #sample
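To verify the PyTorch dataset returns the same layout the Keras generator does (a [1, H, W] image tensor and an [H, W] class-index label), one sample can be inspected directly; a sketch, assuming img_size is set and "path1" points at the training folders:
db = easy_Synapse_dataset(split="train")
sample = db[0]
print(sample["image"].shape, sample["image"].dtype)  # e.g. torch.Size([1, 256, 256]) torch.float32
print(sample["label"].shape, sample["label"].dtype)  # e.g. torch.Size([256, 256]) torch.float32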
#logging.basicConfig(level=logging.NOTSET)
img_size=256# 224
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)
#------------------------
db_train = easy_Synapse_dataset( split="train",
transform=None) #transforms.Compose(
#[RandomGenerator(output_size=[img_size, img_size])]))
print("The length of train set is: {}".format(len(db_train)))
train_loader = DataLoader(db_train, batch_size=1, shuffle=True)#, num_workers=8, pin_memory=True,
#worker_init_fn=worker_init_fn)
db_test = easy_Synapse_dataset( split="test_vol")
val_loader = DataLoader(db_test, batch_size=1, shuffle=False, num_workers=1)
#---------------------
# Now we can create a model and send it at once to the device
#----model = ManualLinearRegression().to(device)
'''
vit_patches_size=16
config_vit = CONFIGS['R50-ViT-B_16']
config_vit.n_classes = 4#args.num_classes
config_vit.n_skip =3 # args.n_skip
if 'R50-ViT-B_16'.find('R50') != -1:
config_vit.patches.grid = (int(img_size / vit_patches_size), int(img_size / vit_patches_size))
'''
#----------------------
model =UNet_standard( 1, 4).to(device)
# We can also inspect its parameters using its state_dict
#print(model.state_dict())
lr =0.01# 1e-1
n_epochs = 10
loss_fn =DiceLoss(4)#num_classes) nn.MSELoss(reduction='mean')
optimizer = optim.SGD(model.parameters(), lr=lr)#optim.SGD(model.parameters(), lr=0.01, momentum=0.9, weight_decay=0.0001)
#-------------------------------------------
def make_train_step(model, loss_fn, optimizer):
# Builds function that performs a step in the train loop
def train_step(x, y):
# Sets model to TRAIN mode
model.train()
# Makes predictions
yhat = model(x)
#d0, d1, d2, d3= model(x)
#print(yhat.shape)
# Computes loss
#loss =muti_bce_loss_fusion2(d0, d1,d2, d3, y)# loss_fn(yhat, y, softmax=True) #loss_fn(y, yhat)
loss = loss_fn(yhat, y, softmax=True)
# Computes gradients
loss.backward()
# Updates parameters and zeroes gradients
optimizer.step()
optimizer.zero_grad()
# Returns the loss
return loss.item()
# Returns the function that will be called inside the train loop
return train_step
# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
y_val_average_loss = []
y_average_loss = []
x_epoch= []
# For each epoch...
for epoch in tqdm(range(n_epochs)):
losses = []
val_losses = []
iou_metric = []
#for x_batch, y_batch in train_loader:
for i_batch, sampled_batch in enumerate(train_loader):
x_batch, y_batch = sampled_batch['image'], sampled_batch['label']
#x_batch, y_batch =image_batch.cpu(), label_batch.cpu()
# the dataset "lives" in the CPU, so do our mini-batches
# therefore, we need to send those mini-batches to the
# device where the model "lives"
x_batch = x_batch.to(device)
y_batch = y_batch.to(device)
#print( x_batch.shape , " ",y_batch.shape )
loss = train_step(x_batch, y_batch)
#print("loss = ",loss )
losses.append(loss)
#print('loss : %f' % (sum(losses) / len(losses) ))
avg=sum(losses) / len(losses)
y_average_loss .append (avg)
print('loss : %f' % (avg) )
losses = [] #clear
with torch.no_grad():
#for x_val, y_val in val_loader:
for i_batch, sampled_batch2 in enumerate(val_loader):
x_val, y_val = sampled_batch2['image'], sampled_batch2['label']
x_val = x_val.to(device)
y_val = y_val.to(device)
model.eval()
d3 = model(x_val)
#d0, d1, d2, d3 = model(x_val)
val_loss =loss_fn(d3, y_val, softmax=True) # loss_fn(y_val, yhat)
iou = iou_score(d3, y_val)
val_losses.append(val_loss.item())
iou_metric.append(iou.item())
#print('Validation loss : %f' % (sum(val_losses) / len(val_losses) ))
print('Validation iou : %f' % (sum(iou_metric) / len(iou_metric) ))
val_avg=sum(val_losses) / len(val_losses)
y_val_average_loss .append (val_avg)
x_epoch.append (epoch)
print('Validation loss : %f' % ( val_avg ))
val_losses = [] # clear
iou_metric = []
# Checks model's parameters
#print(model.state_dict())
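The accumulated lists can then be plotted the same way as the Keras history, so the two frameworks can be compared side by side; a minimal sketch, assuming matplotlib:
import matplotlib.pyplot as plt

plt.plot(x_epoch, y_average_loss, label='train dice loss')
plt.plot(x_epoch, y_val_average_loss, label='val dice loss')
plt.xlabel('epoch')
plt.ylabel('loss')
plt.legend()
plt.show()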

I fixed the issue by using a different U-Net implementation from https://github.com/usuyama/pytorch-unet/blob/master/pytorch_unet.py
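For context, simpler U-Net implementations of that kind are typically built around a plain double-convolution helper; the sketch below is only an illustration of that style, not the linked repository's exact code:
import torch.nn as nn

def double_conv(in_channels, out_channels):
    # two 3x3 convolutions with ReLU (illustrative only, no claim about the repo's exact layers)
    return nn.Sequential(
        nn.Conv2d(in_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
        nn.Conv2d(out_channels, out_channels, kernel_size=3, padding=1),
        nn.ReLU(inplace=True),
    )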

Related

In pytorch, self-made dataset and testing dataset seem to exhaust all RAM

I am new to PyTorch and I wrote a ResNet program in PyTorch on MNIST for an experiment.
If I use the data loader as below, it works fine:
import torch as pt
from torch.utils.data import DataLoader, TensorDataset
import torchvision as ptv
mnist_train = ptv.datasets.MNIST(ROOT_DIR,
train=True,
transform=ptv.transforms.ToTensor(),
download=False)
dl = pt.utils.data.DataLoader(dataset=mnist_train,
batch_size=BATCH_SIZE,
shuffle=True,
drop_last=True)
If I instead use a self-made dataset, as below, so that I can evaluate a validation set at each iteration, the program exhausts all my RAM. The testing set is not used in each iteration, only at the end to evaluate the model.
mnist_test = ptv.datasets.MNIST(ROOT_DIR,
train=False,
transform=ptv.transforms.ToTensor(),
download=False)
M_TEST, PIC_H, PIC_W = mnist_test.data.shape
x_test = mnist_test.data.double() / 255.
y_test = mnist_test.targets
a = pt.randperm(M_TEST) # ATTENTION pt.randperm
x_test = x_test[a]
y_test = y_test[a]
VAL_RATE = 0.1
M_VAL = int(np.ceil(M_TEST * VAL_RATE))
M_TEST -= M_VAL
x_test, x_val = pt.split(x_test, (M_TEST, M_VAL))
y_test, y_val = pt.split(y_test, (M_TEST, M_VAL))
x_test = x_test.view(-1, 1, PIC_H, PIC_W).double()
x_val = x_val.view(-1, 1, PIC_H, PIC_W).double()
dl_test = DataLoader(TensorDataset(x_test, y_test),
batch_size=BATCH_SIZE)
def acc(ht, yt):
return (pt.argmax(ht, 1) == yt.long()).double().mean()
# in iteration:
for epoch in range(N_EPOCHS):
for i, (bx, by) in enumerate(dl):
model.train(True)
optim.zero_grad()
bx = bx.view(-1, 1, PIC_H, PIC_W).double()
ht = model(bx)
cost = criterion(ht, by)
cost.backward()
optim.step()
model.train(False)
accv = acc(ht, by)
ht_val = model(x_val)
val_cost = criterion(ht_val, y_val)
val_acc = acc(ht_val, y_val)
So I suspected that only ptv.datasets.MNIST together with pt.utils.data.DataLoader works here, and I removed the use of my self-made validation set at each iteration; after the removal, RAM usage is normal. But the test stage still exhausts all my RAM, even though I only use ptv.datasets.MNIST and pt.utils.data.DataLoader, as below:
mnist_test = ptv.datasets.MNIST(ROOT_DIR,
train=False,
transform=ptv.transforms.ToTensor(),
download=False)
dl_test = pt.utils.data.DataLoader(dataset=mnist_test,
batch_size=BATCH_SIZE,
shuffle=False,
drop_last=True)
test_cost_avg = 0.
test_acc_avg = 0.
GROUP = int(np.ceil(M_TEST / BATCH_SIZE / 10))
for i, (bx, by) in enumerate(dl_test):
bx = bx.view(-1, 1, PIC_H, PIC_W).double()
ht = model(bx)
test_cost_avg += criterion(ht, by)
test_acc_avg += acc(ht, by)
if i % GROUP == 0:
print(f'Testing # {i + 1}')
if i % GROUP != 0:
print(f'Testing # {i + 1}')
test_cost_avg /= i + 1
test_acc_avg /= i + 1
print(f'Tested: cost = {test_cost_avg}, acc = {test_acc_avg}')
print('Over')
Please give me some help. Thanks a lot!
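One pattern worth checking in the test loop above: accumulating loss tensors (rather than Python floats) keeps their autograd graphs alive and can grow memory steadily; a sketch of the same loop with torch.no_grad() and .item(), reusing the variable names from the question:
with pt.no_grad():                                   # no autograd graph is built during evaluation
    test_cost_avg = 0.
    test_acc_avg = 0.
    for i, (bx, by) in enumerate(dl_test):
        bx = bx.view(-1, 1, PIC_H, PIC_W).double()
        ht = model(bx)
        test_cost_avg += criterion(ht, by).item()    # .item() converts the loss to a plain float
        test_acc_avg += acc(ht, by).item()
    test_cost_avg /= i + 1
    test_acc_avg /= i + 1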
Update:
I suspect there is something wrong with my model, because a simple CNN model on a self-made dataset from torchvision's MNIST does not have this RAM-exhaustion problem. So I paste the model used in this problem below, FYI:
def my_conv(in_side, in_ch, out_ch, kernel, stride, padding='same'):
if 'same' == padding:
ps = kernel - 1
padding = ps // 2
else:
padding = 0
print(padding) # tmp
return pt.nn.Conv2d(in_ch, out_ch, kernel_size=kernel, stride=stride, padding=padding)
class MyResnetBlock(pt.nn.Module):
def __init__(self, residual, in_side, in_ch, out_ch, kernel=3, stride=1, **kwargs):
super().__init__(**kwargs)
self.residual = residual
self.in_side = in_side
self.in_ch = in_ch
self.out_ch = out_ch
self.kernel = kernel
self.stride = stride
self.conv1 = my_conv(in_side, in_ch, out_ch, kernel, stride)
self.bn1 = pt.nn.BatchNorm2d(out_ch)
self.relu1 = pt.nn.ReLU()
self.conv2 = my_conv(np.ceil(in_side / stride), out_ch, out_ch, kernel, 1)
self.bn2 = pt.nn.BatchNorm2d(out_ch)
self.relu2 = pt.nn.ReLU()
if residual:
self.conv_down = my_conv(in_side, in_ch, out_ch, kernel, stride)
def forward(self, input):
x = input
x = self.conv1(x)
x = self.bn1(x)
x = self.relu1(x)
x = self.conv2(x)
x = self.bn2(x)
if self.residual:
res = self.conv_down(input)
else:
res = input
x += res
x = self.relu2(x)
return x
class MyResnetByPt(pt.nn.Module):
def __init__(self, blocks_spec_list, in_side, init_in_ch, init_out_ch, **kwargs):
super().__init__(**kwargs)
self.conv1 = my_conv(in_side, init_in_ch, init_out_ch, 3, 1)
in_ch = out_ch = init_out_ch
blocks = []
for block_id, n_blocks in enumerate(blocks_spec_list):
for layer_id in range(n_blocks):
if layer_id == 0:
if block_id != 0:
out_ch *= 2
block = MyResnetBlock(True, in_side, in_ch, out_ch, 3, 2)
in_ch = out_ch
in_side = int(np.ceil(in_side / 2))
else:
block = MyResnetBlock(False, in_side, in_ch, out_ch, 3, 1)
blocks.append(block)
self.blocks = pt.nn.Sequential(*blocks)
self.final_ch = out_ch
self.avg_pool = pt.nn.AvgPool2d(kernel_size=(in_side, in_side),
stride=(1, 1),
padding=(0, 0))
self.fc = pt.nn.Linear(out_ch, N_CLS)
def forward(self, input):
x = input
x = self.conv1(x)
x = self.blocks(x)
x = self.avg_pool(x)
x = x.view(-1, self.final_ch)
x = self.fc(x)
return x
model = MyResnetByPt([2, 2, 2, 2], PIC_H, 1, 16)
model = model.double()

TensorFlow 2.2.0 Keras subclass model can train and predict, but throws an exception when saving: Dimension size must be evenly divisible by X

This model is a CNN variant that uses causal dilated convolution layers.
I can train and predict with no errors, but when I use model.save() to save the model, it throws an exception.
So I use save_weights and load_weights to save and load the model instead (a sketch of that workaround follows the training snippet below).
I wonder why this error appears:
model.save("path")
out:
ValueError: Dimension size must be evenly divisible by 2 but is 745 for '{{node conv1d_5/SpaceToBatchND}} = SpaceToBatchND[T=DT_FLOAT, Tblock_shape=DT_INT32, Tpaddings=DT_INT32](conv1d_5/Pad, conv1d_5/SpaceToBatchND/block_shape, conv1d_5/SpaceToBatchND/paddings)' with input shapes: [?,745,32], [1], [1,2] and with computed input tensors: input[1] = <2>, input[2] = <[0 0]>.
Input shape is (None,743,27)
Output shape is (None,24,1)
def slice(x, seq_length):
return x[:, -seq_length:, :]
class ResidualBlock(tf.keras.layers.Layer):
def __init__(self, n_filters, filter_width, dilation_rate):
super(ResidualBlock, self).__init__()
self.n_filters = n_filters
self.filter_width = filter_width
self.dilation_rate = dilation_rate
# preprocessing - equivalent to time-distributed dense
self.x = Conv1D(32, 1, padding='same', activation='relu')
# filter convolution
self.x_f = Conv1D(filters=n_filters,
kernel_size=filter_width,
padding='causal',
dilation_rate=dilation_rate,
activation='tanh')
# gating convolution
self.x_g = Conv1D(filters=n_filters,
kernel_size=filter_width,
padding='causal',
dilation_rate=dilation_rate,
activation='sigmoid')
# postprocessing - equivalent to time-distributed dense
self.z_p = Conv1D(32, 1, padding='same', activation='relu')
def call(self, inputs):
x = self.x(inputs)
f = self.x_f(x)
g = self.x_g(x)
z = tf.multiply(f, g)
z = self.z_p(z)
return tf.add(x, z), z
def get_config(self):
config = super(ResidualBlock, self).get_config()
config.update({"n_filters": self.n_filters,
"filter_width": self.filter_width,
"dilation_rate": self.dilation_rate})
return config
class WaveNet(tf.keras.Model):
def __init__(self, n_filters=32, filter_width=2, dilation_rates=None, drop_out=0.2, pred_length=24):
super().__init__(name='WaveNet')
# Layer Parameter
self.n_filters = n_filters
self.filter_width = filter_width
self.drop_out = drop_out
self.pred_length = pred_length
if dilation_rates is None:
self.dilation_rates = [2 ** i for i in range(8)]
else:
self.dilation_rates = dilation_rates
# Layer
self.residual_stacks = []
for dilation_rate in self.dilation_rates:
self.residual_stacks.append(ResidualBlock(self.n_filters, self.filter_width, dilation_rate))
# self.add = Add()
self.cut = Lambda(slice, arguments={'seq_length': pred_length})
self.conv_1 = Conv1D(128, 1, padding='same')
self.relu = Activation('relu')
self.drop = Dropout(drop_out)
self.skip = Lambda(lambda x: x[:, -2 * pred_length + 1:-pred_length + 1, :1])
self.conv_2 = Conv1D(1, 1, padding='same')
def _unroll(self, inputs, **kwargs):
outputs = inputs
skips = []
for residual_block in self.residual_stacks:
outputs, z = residual_block(outputs)
skips.append(z)
outputs = self.relu(Add()(skips))
outputs = self.cut(outputs)
outputs = self.conv_1(outputs)
outputs = self.relu(outputs)
outputs = self.drop(outputs)
outputs = Concatenate()([outputs, self.skip(inputs)])
outputs = self.conv_2(outputs)
outputs = self.cut(outputs)
return outputs
def _get_output(self, input_tensor):
pass
def call(self, inputs, training=False, **kwargs):
if training:
return self._unroll(inputs)
else:
return self._get_output(inputs)
Train step
model = WaveNet()
model.compile(Adam(), loss=loss)
# ok
history = model.fit(train_x, train_y,
batch_size=batch_size,
epochs=epochs,
callbacks=[cp_callback] if save else None)
# ok
result = model.predict(test_x)
# error
model.save("path")

subclass a customized model in tensorflow2: Cannot convert a Tensor of dtype resource to a NumPy array

I'm a newbie to TensorFlow 2 and use tensorflow 2.3.1, CPU version.
I defined the model in the subclassing way and, when showing the structure of my model, I encountered the error "tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array", which points to the following line in BST_DSSM.build_model:
"self.item_sequence_embeddings = tf.nn.embedding_lookup("
I have browsed through similar questions but can't find a satisfactory solution.
Any help will be appreciated :)
Below is my code.
import tensorflow as tf
class MultiHeadAttention(tf.keras.layers.Layer):
""" def multi head attention layer
q, k, v multiplied by Wq, Wk, Wv respectively -> q', k', v'
q' * k' -> w, w / sqrt(q'.shape[1]) -> w'
w' * v' -> z, z * Wz -> z'
z' add v (residual), then goes through LRelu, do a LN at last
"""
def __init__(
self,
scope_name,
num_units=8,
num_heads=1,
embed_dim=8,
has_residual=True,
dropout_keep_prob=1.0):
super(MultiHeadAttention, self).__init__()
assert num_units % num_heads == 0
assert scope_name in ["user", "item"]
self.num_heads = num_heads
self.num_units = num_units
self.embed_dim = embed_dim
self.dropout_keep_prob = dropout_keep_prob
self.Wq = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wq")
self.Wk = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wk")
self.Wv = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wv")
self.has_residual = has_residual
self.Wz = tf.keras.layers.Dense(embed_dim)
def call(self, queries, keys_, values):
"""
:param queries: of shape [batch_size, max_length, emb_dim]
:param keys_: of shape [batch_size, max_length, emb_dim]
:param values: of shape [batch_size, max_length, emb_dim]
:return:
"""
assert values.get_shape().as_list()[-1] == self.embed_dim
assert queries.get_shape().as_list()[-1] == self.embed_dim
assert keys_.get_shape().as_list()[-1] == self.embed_dim
# Linear projections
Q = self.Wq(queries)
K = self.Wk(keys_)
V = self.Wv(values)
# Split and concat
Q_ = tf.concat(tf.split(Q, self.num_heads, axis=2), axis=0)
K_ = tf.concat(tf.split(K, self.num_heads, axis=2), axis=0)
V_ = tf.concat(tf.split(V, self.num_heads, axis=2), axis=0)
# Multiplication
weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
# Scale
weights = weights / (K_.get_shape().as_list()[-1] ** 0.5)
# convert to prob vector
weights = tf.nn.softmax(weights)
# Dropouts
if 0 < self.dropout_keep_prob < 1:
weights = tf.keras.layers.AlphaDropout(
rate=1 - self.dropout_keep_prob)(weights)
# Weighted sum
# [batch_size * num_heads, max_length, num_units / num_heads]
outputs = tf.matmul(weights, V_)
# Restore shape to [batch_size, max_length, num_units]
z = tf.concat(tf.split(outputs, self.num_heads, axis=0), axis=2)
# Restore shape to [batch_size, max_length, embed_dim]
z = self.Wz(z)
# Residual connection
if self.has_residual:
z += values
z = tf.nn.leaky_relu(z)
# Normalize
z = tf.keras.layers.LayerNormalization(
beta_initializer="zeros", gamma_initializer="ones")(z)
return z
class BST_DSSM(tf.keras.Model):
"""define BST+DSSM model stucture
"""
def __init__(self, model_dir,
item_embedding=None, user_embedding=None,
embedding_size=8,
vocab_size=1000,
max_length_item=15, max_length_user=6,
epoch=10, batch_size=256, blocks=2,
learning_rate=0.001, optimizer_type="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2019,
l2_reg=0.0, has_residual=True):
"""
initialize model-related params and tensors
"""
super(BST_DSSM, self).__init__()
# denote as K, size of the feature embedding
self.embedding_size = embedding_size
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.optimizer = None
self.blocks = blocks
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.model_dir = model_dir
# self._init_graph()
self.vocab_size = vocab_size
self.max_length_item = max_length_item
self.max_length_user = max_length_user
self.has_residual = has_residual
self.model = None
self.item_embedding = item_embedding
self.user_embedding = user_embedding
self.mha_user = MultiHeadAttention("user", num_units=embedding_size)
self.mha_item = MultiHeadAttention("item", num_units=embedding_size)
def _get_item_embedding_matrix(self):
if self.item_embedding is None:
std = 0.1
minval = -std
maxval = std
emb_matrix = tf.Variable(
tf.random.uniform(
[self.vocab_size, self.embedding_size],
minval, maxval,
seed=self.random_seed,
dtype=tf.float32),
name="item_embedding")
self.item_embedding = emb_matrix
def _get_user_embedding_matrix(self):
if self.user_embedding is None:
std = 0.1
minval = -std
maxval = std
emb_matrix = tf.Variable(
tf.random.uniform(
[self.vocab_size, self.embedding_size],
minval, maxval,
seed=self.random_seed,
dtype=tf.float32),
name="user_embedding")
self.user_embedding = emb_matrix
def build_model(self):
# initialize lut
self._get_item_embedding_matrix()
self._get_user_embedding_matrix()
item_inputs = tf.keras.Input(
shape=(
self.max_length_item
),
dtype=tf.int32,
name="item_sequence_idx")
user_inputs = tf.keras.Input(
shape=(
self.max_length_user
),
dtype=tf.int32,
name="user_sequence_idx")
# user and item use different lut, similarly to DSSM
self.item_sequence_embeddings = tf.nn.embedding_lookup(
self.item_embedding, item_inputs, name="item_sequence_embeddings")
self.video_sequence_embeddings = tf.nn.embedding_lookup(
self.user_embedding, user_inputs, name="video_sequence_embeddings")
# self attn part
for i in range(self.blocks):
self.item_sequence_embeddings = self.mha_item(
queries=self.item_sequence_embeddings,
keys=self.item_sequence_embeddings,
values=self.item_sequence_embeddings)
self.video_sequence_embeddings = self.mha_user(
queries=self.video_sequence_embeddings,
keys=self.video_sequence_embeddings,
values=self.video_sequence_embeddings)
# max pooling
self.item_sequence_embeddings = tf.nn.max_pool(
self.item_sequence_embeddings,
[1, self.max_length_item, 1],
[1 for _ in range(len(self.item_sequence_embeddings.shape))],
padding="VALID")
self.video_sequence_embeddings = tf.nn.max_pool(
self.video_sequence_embeddings,
[1, self.max_length_user, 1],
[1 for _ in range(len(self.video_sequence_embeddings.shape))],
padding="VALID")
# cosine similarity
self.item_sequence_embeddings = tf.nn.l2_normalize(
self.item_sequence_embeddings, axis=2)
self.video_sequence_embeddings = tf.nn.l2_normalize(
self.video_sequence_embeddings, axis=2)
outputs = tf.matmul(
self.item_sequence_embeddings,
tf.transpose(self.video_sequence_embeddings, [0, 2, 1]))
outputs = tf.reshape(outputs, [-1, 1])
# optimizer
if self.optimizer_type == "adam":
self.optimizer = tf.keras.optimizers.Adam(
learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.keras.optimizers.Adagrad(
learning_rate=self.learning_rate,
initial_accumulator_value=1e-8)
elif self.optimizer_type == "gd":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate)
elif self.optimizer_type == "momentum":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate, momentum=0.95)
self.model = tf.keras.Model(
inputs={
"item_sequence_idx": item_inputs,
"user_sequence_idx": user_inputs
},
outputs=outputs)
self.model.compile(
optimizer=self.optimizer,
loss=self.loss_fn,
metrics=[
tf.keras.metrics.AUC(),
tf.keras.metrics.binary_accuracy()])
Although I didn't figure out why I got this error, I managed to build my model by defining a call method instead; the code is below.
from conf_loader import (
emb_dim, n_layer,
item_max_len, user_max_len,
batch_size, lr, l2_reg,
vocab_size
)
class BST_DSSM(tf.keras.Model):
"""define BST+DSSM model stucture
"""
def __init__(self,
item_embedding=None, user_embedding=None,
emb_dim=emb_dim,
vocab_size=vocab_size,
item_max_len=item_max_len, user_max_len=user_max_len,
epoch=10, batch_size=batch_size, n_layers=n_layer,
learning_rate=lr, optimizer_type="adam",
random_seed=2019,
l2_reg=l2_reg, has_residual=True):
"""
initialize model-related params and tensors
"""
super(BST_DSSM, self).__init__()
self.emb_dim = emb_dim
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.blocks = n_layers
self.random_seed = random_seed
self.vocab_size = vocab_size
self.item_max_len = item_max_len
self.user_max_len = user_max_len
self.has_residual = has_residual
self.item_embedding = item_embedding
self.user_embedding = user_embedding
self.mha_user = MultiHeadAttention(scope_name="user", embed_dim=self.emb_dim)
self.mha_item = MultiHeadAttention(scope_name="item", embed_dim=self.emb_dim)
# optimizer
if self.optimizer_type == "adam":
self.optimizer = tf.keras.optimizers.Adam(
learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.keras.optimizers.Adagrad(
learning_rate=self.learning_rate,
initial_accumulator_value=1e-8)
elif self.optimizer_type == "gd":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate)
elif self.optimizer_type == "momentum":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate, momentum=0.95)
self.user_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
self.item_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
@tf.function
def call(self, inputs, training=True):
# multiple inputs
item_inputs = inputs[0]
user_inputs = inputs[1]
item_sequence_embeddings = self.item_embedding(item_inputs)
user_sequence_embeddings = self.user_embedding(user_inputs)
# [batch_size, max_length, 16]
for i in range(self.blocks):
item_sequence_embeddings = self.mha_item(item_sequence_embeddings)
user_sequence_embeddings = self.mha_user(user_sequence_embeddings)
# [batch_size, 1, 16]
item_outputs_max = tf.nn.max_pool(
item_sequence_embeddings,
[1, self.item_max_len, 1],
[1 for _ in range(len(item_sequence_embeddings.shape))],
padding="VALID")
user_outputs_max = tf.nn.max_pool(
user_sequence_embeddings,
[1, self.user_max_len, 1],
[1 for _ in range(len(user_sequence_embeddings.shape))],
padding="VALID")
# L2 normalize to get cosine similarity
item_normalized = tf.nn.l2_normalize(
item_outputs_max, axis=2)
user_normalized = tf.nn.l2_normalize(
user_outputs_max, axis=2)
outputs = tf.matmul(
item_normalized,
user_normalized,
transpose_b=True)
return tf.reshape(outputs, [-1, 1])
def loss_fn(self, target, output):
cross_entropy = tf.keras.backend.binary_crossentropy(
target, output, from_logits=False
)
if self.l2_reg > 0:
_regularizer = tf.keras.regularizers.l2(self.l2_reg)
cross_entropy += _regularizer(self.user_embedding)
cross_entropy += _regularizer(self.item_embedding)
return cross_entropy
def debug():
x_train = [
np.random.randint(low=0, high=20, size=(5, item_max_len)),
np.random.randint(low=0, high=20, size=(5, user_max_len))]
y_train = np.random.randint(low=0, high=2, size=5).astype(dtype=float)
model = BST_DSSM()
model.compile(
optimizer=model.optimizer,
loss=model.loss_fn
)
model.fit(x_train, y_train, epochs=n_epoch)
model.summary()

Why does requires_grad turn from True to False when doing a torch.nn.conv2d operation?

I have a U-Net network which takes in MRI images of the brain, where the goal is to segment white matter in the brain. The images have shape 256x256x183 (reshaped to 183x256x256) (FLAIR and T1 images). The problem I am having is that before sending the input to the U-Net network, requires_grad=True on my PyTorch tensor, but after one torch.nn.conv2d operation requires_grad=False. This is a huge problem since the gradients will not update and the network will not learn.
from collections import OrderedDict
import torch
import torch.nn as nn
class UNet(nn.Module):
def __init__(self, in_channels=3, out_channels=1, init_features=32):
super(UNet, self).__init__()
features = init_features
self.encoder1 = UNet._block(in_channels, features, name="enc1")
self.pool1 = nn.MaxPool2d(kernel_size=2, stride=2)
self.encoder2 = UNet._block(features, features * 2, name="enc2")
self.pool2 = nn.MaxPool2d(kernel_size=2, stride=2)
self.encoder3 = UNet._block(features * 2, features * 4, name="enc3")
self.pool3 = nn.MaxPool2d(kernel_size=2, stride=2)
self.encoder4 = UNet._block(features * 4, features * 8, name="enc4")
self.pool4 = nn.MaxPool2d(kernel_size=2, stride=2)
self.bottleneck = UNet._block(features * 8, features * 16, name="bottleneck")
self.upconv4 = nn.ConvTranspose2d(
features * 16, features * 8, kernel_size=2, stride=2
)
self.decoder4 = UNet._block((features * 8) * 2, features * 8, name="dec4")
self.upconv3 = nn.ConvTranspose2d(
features * 8, features * 4, kernel_size=2, stride=2
)
self.decoder3 = UNet._block((features * 4) * 2, features * 4, name="dec3")
self.upconv2 = nn.ConvTranspose2d(
features * 4, features * 2, kernel_size=2, stride=2
)
self.decoder2 = UNet._block((features * 2) * 2, features * 2, name="dec2")
self.upconv1 = nn.ConvTranspose2d(
features * 2, features, kernel_size=2, stride=2
)
self.decoder1 = UNet._block(features * 2, features, name="dec1")
self.conv = nn.Conv2d(
in_channels=features, out_channels=out_channels, kernel_size=1
)
def forward(self, x):
print(x.requires_grad) #<---- here it is true
enc1 = self.encoder1(x)#<---- where the problem happens
print(enc1.requires_grad) #<---- here it is false
enc2 = self.encoder2(self.pool1(enc1))
print(enc2.requires_grad)
enc3 = self.encoder3(self.pool2(enc2))
print(enc3.requires_grad)
enc4 = self.encoder4(self.pool3(enc3))
print(enc4.requires_grad)
bottleneck = self.bottleneck(self.pool4(enc4))
print(bottleneck.requires_grad)
dec4 = self.upconv4(bottleneck)
print(dec4.requires_grad)
dec4 = torch.cat((dec4, enc4), dim=1)
print(dec4.requires_grad)
dec4 = self.decoder4(dec4)
print(dec4.requires_grad)
dec3 = self.upconv3(dec4)
print(dec3.requires_grad)
dec3 = torch.cat((dec3, enc3), dim=1)
print(dec3.requires_grad)
dec3 = self.decoder3(dec3)
print(dec3.requires_grad)
dec2 = self.upconv2(dec3)
print(dec2.requires_grad)
dec2 = torch.cat((dec2, enc2), dim=1)
print(dec2.requires_grad)
dec2 = self.decoder2(dec2)
print(dec2.requires_grad)
dec1 = self.upconv1(dec2)
print(dec1.requires_grad)
dec1 = torch.cat((dec1, enc1), dim=1)
print(dec1.requires_grad)
dec1 = self.decoder1(dec1)
print(dec1.requires_grad)
print("going out")
return torch.sigmoid(self.conv(dec1))
@staticmethod
def _block(in_channels, features, name):
return nn.Sequential(
OrderedDict(
[
(
name + "conv1",
nn.Conv2d(
in_channels=in_channels,
out_channels=features,
kernel_size=3,
padding=1,
bias=False,
),
),
(name + "norm1", nn.BatchNorm2d(num_features=features)),
(name + "relu1", nn.ReLU(inplace=True)),
(
name + "conv2",
nn.Conv2d(
in_channels=features,
out_channels=features,
kernel_size=3,
padding=1,
bias=False,
),
),
(name + "norm2", nn.BatchNorm2d(num_features=features)),
(name + "relu2", nn.ReLU(inplace=True)),
]
)
)
Edit:
This is the training code
class run_network:
def __init__(self, eta, epoch, batch_size, train_file_path, validation_file_path, shuffle_after_epoch = True):
self.eta = eta
self.epoch = epoch
self.batch_size = batch_size
self.train_file_path = train_file_path
self.validation_file_path = validation_file_path
self.shuffle_after_epoch = shuffle_after_epoch
def __call__(self, is_train = False):
device = torch.device("cpu" if not torch.cuda.is_available() else torch.cuda())
unet = torch.hub.load('mateuszbuda/brain-segmentation-pytorch', 'unet',
in_channels=3, out_channels=1, init_features=32, pretrained=True)
unet.to(device)
unet = unet.double()
optimizer = optim.Adam(unet.parameters(), lr=self.eta)
dsc_loss = DiceLoss()
Load_training = NiftiLoader(self.train_file_path)
Load_validation = NiftiLoader(self.validation_file_path)
mean_flair, mean_t1, std_flair, std_t1 = Load_training.average_mean_and_std(20, 79,99)
total_mean = [mean_flair, mean_t1]
total_std = [std_flair, std_t1]
loss_train = []
loss_validation = []
for current_epoch in tqdm(range(self.epoch)):
for phase in ["train", "validation"]:
if phase == "train":
mini_batch = Load_training.create_batch(self.batch_size, self.shuffle_after_epoch)
unet.train()
print("her22")
if phase == "validation":
print("her")
mini_batch = Load_validation.create_batch(self.batch_size, self.shuffle_after_epoch)
unet.eval()
dim1, dim2, dim3 = mini_batch.shape
for iteration in range(1):
if phase == "train":
current_batch = Load_training.Load_Image_batch(mini_batch, iteration)
image_batch = Load_training.image_zero_mean_normalizer(current_batch)
if phase == "validation":
current_batch = Load_validation.Load_Image_batch(mini_batch, iteration)
image_batch = Load_training.image_zero_mean_normalizer(current_batch, False, mean_list, std_list)
image_dim0, image_dim1, image_dim2, image_dim3, image_dim4 = image_batch.shape
image_batch = image_batch.reshape((
image_dim0,
image_dim1*image_dim2,
image_dim3,
image_dim4
))
image_batch = np.swapaxes(image_batch, 0,1)
image_batch = torch.as_tensor(image_batch)#.requires_grad_(True) #, requires_grad=True)
image_batch = image_batch.to(device)
print(image_batch.requires_grad)
optimizer.zero_grad()
with torch.set_grad_enabled(is_train == "train"):
for j in range(0, 10, 1):
# [183*5, 3, 256, 256] -> [12, 3, 256, 256]
# NUMBER OF ITERATIONS: (183*5/12) -> one chunk
input_image = image_batch[j:j+2,0:3,:,:]
print(input_image.requires_grad)
print("går inn")
y_predicted = unet(input_image)
print(y_predicted.requires_grad)
print(image_batch[j:j+2,3,:,:].requires_grad)
loss = dsc_loss(y_predicted.squeeze(1), image_batch[j:j+2,3,:,:])
print(loss.requires_grad)
if phase == "train":
loss_train.append(loss.item())
loss.backward()
print(loss.item())
exit()
optimizer.step()
print(loss.item())
exit()
if phase == "validation":
loss_validation.append(loss.item())
The number of iterations and the print statements are just there to experiment with what the cause could be.
It works fine for me.
'''
# I changed your code a little bit to catch up the problem.
def forward(self, x):
print("encoder1", x.requires_grad) #<---- here it is true
enc1 = self.encoder1(x)#<---- where the problem happens
print("encoder2", enc1.requires_grad) #<---- here it is false
'''
a = torch.randn(32, 3, 255, 255, requires_grad=True)
# a.requires_grads = True
print(a)
UNet()(a)
# This is the result:
encoder1 True
encoder2 True
True
True
True
True
True
Can you show me your training source? I guess that's where the problem is. And why do you need to update the input data?
The training code is fine and the input doesn't need a gradient at all, if you just want to train and update the weights.
The real problem is this line here:
with torch.set_grad_enabled(is_train == "train"):
So you want to disable the gradients if you are not training. The thing is, is_train is a bool (judging from this: def __call__(self, is_train=False):), so the comparison will always be False and gradients will never be enabled. Just change it to
with torch.set_grad_enabled(is_train):
and you will be fine.

Is this a correct reimplementation of the PyTorch Seq2Seq model?

I wrote code that somewhat changes the seq2seq tutorial script provided by PyTorch. Here's the model:
class Seq2Seq(nn.Module):
def __init__(self, encoder, batch_size, vocab_size, input_size, output_size, hidden_dim, embedding_dim, n_layers=2, dropout_p=0.5):
super(Seq2Seq, self).__init__()
self.hidden_dim = hidden_dim
self.batch_size = batch_size
self.input_length = input_size
self.output_length = output_size
self.vocab_size = vocab_size
self.encoder = encoder
self.dropout = nn.Dropout(dropout_p)
self.selu = nn.SELU()
self.decoder_embeddings = nn.Embedding(vocab_size, hidden_dim)
self.decoder_gru = nn.GRU(hidden_dim, hidden_dim)
self.out = nn.Linear(hidden_dim, vocab_size)
self.softmax = nn.LogSoftmax()
def decode(self, SOS_token, encoder_hidden, target_output, teacher_forcing_ratio=0.8):
decoder_output_full = autograd.Variable(torch.zeros(self.output_length, self.batch_size, self.vocab_size))
decoder_output_full = decoder_output_full.cuda() if use_cuda else decoder_output_full
target = target_output.permute(1,0)
use_teacher_forcing = True if random.random() < teacher_forcing_ratio else False
for idx in range(self.output_length):
if idx == 0:
decoder_input = SOS_token
decoder_hidden = encoder_hidden.unsqueeze(0)
output = self.decoder_embeddings(decoder_input).view(1, self.batch_size, -1)
output = self.dropout(output)
output = self.selu(output)
if use_teacher_forcing:
decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
temp = 1
out = self.out(decoder_output[0])
out = out + sample_gumbel(out.shape)
decoder_output = F.softmax(out / temp, dim=1)
# decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
decoder_output_full[idx, :, :] = decoder_output
decoder_input = target[idx-1] # Teacher forcing
else:
decoder_output, decoder_hidden = self.decoder_gru(output, decoder_hidden)
temp = 1
out = self.out(decoder_output[0])
out = out + sample_gumbel(out.shape)
decoder_output = F.softmax(out / temp, dim=1)
# decoder_output = (self.decoder_embeddings.weight * decoder_output.unsqueeze(1)).sum(0).view(1, 1, -1)
topv, topi = decoder_output.data.topk(1)
# print topi
ni = topi
# decoder_input_v = autograd.Variable(torch.LongTensor([[ni]]))
decoder_input = autograd.Variable(ni)
# decoder_input = decoder_input.cuda() if use_cuda else decoder_input
# print decoder_input
decoder_output_full[idx, :, :] = decoder_output
decoder_output_full = decoder_output_full.permute(1,0,2)
# gen_output = self.softmax(self.out(decoder_output_full))
return decoder_output_full
def forward(self, input, target_output, teacher_forcing_ratio=0.8):
encoder_feat, _ = self.encoder(input)
SOS_token = np.zeros((self.batch_size,1), dtype=np.int32)
SOS_token = torch.LongTensor(SOS_token.tolist())
SOS_token = autograd.Variable(SOS_token)
if use_cuda:
SOS_token = SOS_token.cuda(gpu)
gen_output = self.decode(SOS_token, encoder_feat, target_output, teacher_forcing_ratio)
return gen_output
def initHidden(self):
result = autograd.Variable(torch.zeros(1, self.batch_size, self.hidden_dim))
if use_cuda:
return result.cuda()
else:
return result
The way I calculate the NLL loss is by creating one whole output sequence first and then comparing it with the target output. Here's the loss function:
class batchNLLLoss(nn.Module):
def __init__(self):
super(batchNLLLoss, self).__init__()
def forward(self, synt, target, claim_length=20):
loss_fn = nn.NLLLoss()
loss = 0
for i in range(synt.shape[0]):
for j in range(claim_length):
loss += loss_fn(synt[i][j].unsqueeze(0), target[i][j])
return loss
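For reference, the same per-token NLL can be computed in a single call by flattening the batch and time dimensions, assuming synt already holds log-probabilities of shape [batch, seq_len, vocab] and target holds token indices of shape [batch, seq_len]:
import torch.nn as nn

def batch_nll(synt, target):
    # synt: [batch, seq_len, vocab] log-probabilities; target: [batch, seq_len] token indices
    vocab_size = synt.size(-1)
    return nn.NLLLoss(reduction='sum')(synt.reshape(-1, vocab_size), target.reshape(-1))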
The current problem is that the loss value is really small and it seems like the network learns nothing (the output is the same word repeated again and again). Any thoughts about this? Thanks in advance!
