Discrepancy between TensorFlow model and PyTorch model - python
I trained a U-Net model on the same dataset with both TensorFlow and PyTorch. Both models show a smooth training loss curve, but the PyTorch validation loss keeps zigzagging.
I think I'm fine on the TensorFlow side, but I'm new to PyTorch. Please check whether I made a mistake in the PyTorch code.
Below is the TensorFlow loss curve:
and this is the PyTorch one:
Below is the TensorFlow code:
# imports assumed by the snippets below
import os
import numpy as np
import cv2
import tensorflow as tf
from tensorflow import keras
from skimage import io

class DataGen(keras.utils.Sequence):
    def __init__(self, ids, path, batch_size=8, image_size=128):
        self.ids = ids
        self.path = path
        self.batch_size = batch_size
        self.image_size = image_size
        self.on_epoch_end()

    def __load__(self, id_name):
        # e.g. <path>/patient0001/patient0001_2CH_ED.mhd
        image_path = os.path.join(self.path, id_name, id_name) + "_2CH_ED.mhd"
        mask_path = os.path.join(self.path, id_name, id_name) + "_2CH_ED_gt.mhd"

        # Read the image (first slice of the .mhd volume) and resize it
        my_img1 = io.imread(image_path, plugin='simpleitk')
        image = my_img1[0, :, :]
        image = cv2.resize(image, (self.image_size, self.image_size))

        # Read the mask the same way
        my_mask1 = io.imread(mask_path, plugin='simpleitk')
        mask = my_mask1[0, :, :]
        mask = cv2.resize(mask, (self.image_size, self.image_size))

        # One-hot encode the mask into 4 channels (one per class)
        masks = [(mask == v) for v in range(4)]
        mask = np.stack(masks, axis=-1).astype('float')

        # NOTE: no intensity normalisation is applied here
        # (the image = image/255.0 line was left commented out)
        return image, mask

    def __getitem__(self, index):
        # shrink the batch if it would run past the end of the id list
        if (index + 1) * self.batch_size > len(self.ids):
            self.batch_size = len(self.ids) - index * self.batch_size
        files_batch = self.ids[index * self.batch_size:(index + 1) * self.batch_size]

        image = []
        mask = []
        for id_name in files_batch:
            _img, _mask = self.__load__(id_name)
            image.append(_img)
            mask.append(_mask)

        image = np.array(image)
        mask = np.array(mask)
        return image, mask

    def on_epoch_end(self):
        pass

    def __len__(self):
        return int(np.ceil(len(self.ids) / float(self.batch_size)))
# you may need to change variable names
image_size = 256
train_path = "path"  # e.g. "dataset/stage1_train/"
epochs = 10          # the paper requires 30
batch_size = 1
num_class = 4

## Training ids
train_ids = next(os.walk(train_path))[1]
print("train_ids length:", len(train_ids))

## Validation data size
val_data_size = 10
valid_ids = train_ids[:val_data_size]

my_slice_index = 2  # class index that we are calculating
def down_block(x, filters, kernel_size=(3, 3), padding="same", strides=1):
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
    p = keras.layers.MaxPool2D((2, 2), (2, 2))(c)
    return c, p

def down_block_test(x, filters, kernel_size=(3, 3), padding="same", strides=1):
    # variant with a residual connection (not used by UNet_test below)
    residual = x
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
    if residual.shape[3] != c.shape[3]:
        residual = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(residual)
    c += residual
    p = keras.layers.MaxPool2D((2, 2), (2, 2))(c)
    return c, p

def up_block(x, skip, filters, kernel_size=(3, 3), padding="same", strides=1):
    us = keras.layers.UpSampling2D((2, 2))(x)
    concat = keras.layers.Concatenate()([us, skip])
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(concat)
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
    return c

def bottleneck(x, filters, kernel_size=(3, 3), padding="same", strides=1):
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(x)
    c = keras.layers.Conv2D(filters, kernel_size, padding=padding, strides=strides, activation="relu")(c)
    return c
#------------------------------------ model in the paper as Unet 1 ----------------------------------------
def UNet_test():
    f = [16, 32, 64, 128, 256]
    inputs = keras.layers.Input((image_size, image_size, 1))

    p0 = inputs
    c1, p1 = down_block(p0, f[1])   # 256 -> 128
    c2, p2 = down_block(p1, f[1])   # 128 -> 64
    c3, p3 = down_block(p2, f[2])   # 64 -> 32
    c4, p4 = down_block(p3, f[3])   # 32 -> 16
    c5, p5 = down_block(p4, f[3])   # 16 -> 8

    bn = bottleneck(p5, f[3])       # 8

    u1 = up_block(bn, c5, f[3])     # 8 -> 16
    u2 = up_block(u1, c4, f[3])     # 16 -> 32
    u3 = up_block(u2, c3, f[2])     # 32 -> 64
    u4 = up_block(u3, c2, f[1])     # 64 -> 128
    u5 = up_block(u4, c1, f[0])     # 128 -> 256

    outputs = keras.layers.Conv2D(num_class, (1, 1), padding="same", activation="softmax")(u5)
    model = keras.models.Model(inputs, outputs)
    return model
model = UNet_test()

import segmentation_models as sm

LR = 0.0001
optim = keras.optimizers.Adam(LR)
dice_loss_se2 = sm.losses.DiceLoss()
mae = tf.keras.losses.MeanAbsoluteError()
metrics = [mae, sm.metrics.IOUScore(threshold=0.5), sm.metrics.FScore(threshold=0.5), dice_loss_se2]
model.compile(optimizer=optim, loss=dice_loss_se2, metrics=metrics)

train_gen = DataGen(train_ids, train_path, image_size=image_size, batch_size=batch_size)
valid_gen = DataGen(valid_ids, train_path, image_size=image_size, batch_size=batch_size)

train_steps = len(train_ids) // batch_size
valid_steps = len(valid_ids) // batch_size

history = model.fit_generator(train_gen, validation_data=valid_gen,
                              steps_per_epoch=train_steps, validation_steps=valid_steps,
                              epochs=epochs)
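For reference, the training and validation curves on the TensorFlow side can be plotted directly from the history object returned above; a minimal sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

# 'loss' and 'val_loss' are recorded by Keras because validation_data was passed
plt.plot(history.history['loss'], label='train loss')
plt.plot(history.history['val_loss'], label='validation loss')
plt.xlabel('epoch')
plt.ylabel('Dice loss')
plt.legend()
plt.show()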
And below is the PyTorch code:
# imports assumed by the snippets below
import os
import numpy as np
import cv2
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import Dataset, DataLoader
from tqdm import tqdm
from skimage import io as iio


class DoubleConv(nn.Module):
    """(convolution => [BN] => ReLU) * 2"""

    def __init__(self, in_channels, out_channels, mid_channels=None):
        super().__init__()
        if not mid_channels:
            mid_channels = out_channels
        self.double_conv = nn.Sequential(
            nn.Conv2d(in_channels, mid_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(mid_channels),
            nn.ReLU(inplace=True),
            nn.Conv2d(mid_channels, out_channels, kernel_size=3, padding=1, bias=False),
            nn.BatchNorm2d(out_channels),
            nn.ReLU(inplace=True)
        )

    def forward(self, x):
        return self.double_conv(x)


class Down(nn.Module):
    """Downscaling with maxpool then double conv"""

    def __init__(self, in_channels, out_channels):
        super().__init__()
        self.maxpool_conv = nn.Sequential(
            nn.MaxPool2d(2),
            DoubleConv(in_channels, out_channels)
        )

    def forward(self, x):
        return self.maxpool_conv(x)


class Up(nn.Module):
    """Upscaling then double conv"""

    def __init__(self, in_channels, out_channels, bilinear=True):
        super().__init__()
        # if bilinear, use the normal convolutions to reduce the number of channels
        if bilinear:
            self.up = nn.Upsample(scale_factor=2, mode='bilinear', align_corners=True)
            self.conv = DoubleConv(in_channels, out_channels, in_channels // 2)
        else:
            self.up = nn.ConvTranspose2d(in_channels, in_channels // 2, kernel_size=2, stride=2)
            self.conv = DoubleConv(in_channels, out_channels)

    def forward(self, x1, x2):
        x1 = self.up(x1)
        # input is CHW
        diffY = x2.size()[2] - x1.size()[2]
        diffX = x2.size()[3] - x1.size()[3]
        x1 = F.pad(x1, [diffX // 2, diffX - diffX // 2,
                        diffY // 2, diffY - diffY // 2])
        # if you have padding issues, see
        # https://github.com/HaiyongJiang/U-Net-Pytorch-Unstructured-Buggy/commit/0e854509c2cea854e247a9c615f175f76fbb2e3a
        # https://github.com/xiaopeng-liao/Pytorch-UNet/commit/8ebac70e633bac59fc22bb5195e513d5832fb3bd
        x = torch.cat([x2, x1], dim=1)
        return self.conv(x)


class OutConv(nn.Module):
    def __init__(self, in_channels, out_channels):
        super(OutConv, self).__init__()
        self.conv = nn.Conv2d(in_channels, out_channels, kernel_size=1)

    def forward(self, x):
        return self.conv(x)


class UNet_standard(nn.Module):
    def __init__(self, n_channels, n_classes, bilinear=False):
        super(UNet_standard, self).__init__()
        self.n_channels = n_channels
        self.n_classes = n_classes
        self.bilinear = bilinear

        self.inc = DoubleConv(n_channels, 64)
        self.down1 = Down(64, 128)
        self.down2 = Down(128, 256)
        self.down3 = Down(256, 512)
        factor = 2 if bilinear else 1
        self.down4 = Down(512, 1024 // factor)
        self.up1 = Up(1024, 512 // factor, bilinear)
        self.up2 = Up(512, 256 // factor, bilinear)
        self.up3 = Up(256, 128 // factor, bilinear)
        self.up4 = Up(128, 64, bilinear)
        self.outc = OutConv(64, n_classes)

    def forward(self, x):
        x1 = self.inc(x)
        x2 = self.down1(x1)
        x3 = self.down2(x2)
        x4 = self.down3(x3)
        x5 = self.down4(x4)
        x = self.up1(x5, x4)
        x = self.up2(x, x3)
        x = self.up3(x, x2)
        x = self.up4(x, x1)
        logits = self.outc(x)
        return logits
class DiceLoss(nn.Module):
    def __init__(self, n_classes):
        super(DiceLoss, self).__init__()
        self.n_classes = n_classes

    def _one_hot_encoder(self, input_tensor):
        tensor_list = []
        for i in range(self.n_classes):
            temp_prob = input_tensor == i
            tensor_list.append(temp_prob.unsqueeze(1))
        output_tensor = torch.cat(tensor_list, dim=1)
        return output_tensor.float()

    def _dice_loss(self, score, target):
        target = target.float()
        smooth = 1e-5
        intersect = torch.sum(score * target)
        y_sum = torch.sum(target * target)
        z_sum = torch.sum(score * score)
        loss = (2 * intersect + smooth) / (z_sum + y_sum + smooth)
        loss = 1 - loss
        return loss

    def forward(self, inputs, target, weight=None, softmax=False):
        if softmax:
            inputs = torch.softmax(inputs, dim=1)
        target = self._one_hot_encoder(target)
        if weight is None:
            weight = [1] * self.n_classes
        assert inputs.size() == target.size(), 'predict {} & target {} shape do not match'.format(inputs.size(), target.size())
        class_wise_dice = []
        loss = 0.0
        for i in range(0, self.n_classes):
            dice = self._dice_loss(inputs[:, i], target[:, i])
            class_wise_dice.append(1.0 - dice.item())
            loss += dice * weight[i]
        return loss / self.n_classes
def iou_score(output, target):
    smooth = 1e-5
    if torch.is_tensor(output):
        output = torch.sigmoid(output).data.cpu().numpy()
    if torch.is_tensor(target):
        target = target.data.cpu().numpy()
    output_ = output > 0.5
    target_ = target > 0.5
    intersection = (output_ & target_).sum()
    union = (output_ | target_).sum()
    return (intersection + smooth) / (union + smooth)
class easy_Synapse_dataset(Dataset):
    def __init__(self, split, transform=None):
        self.transform = transform  # using transform in torch!
        self.split = split
        if self.split == "train":
            use_path = "path1"
        else:
            use_path = "path2"
        self.sample_list = next(os.walk(use_path))[1]

    def __len__(self):
        return len(self.sample_list)

    def __getitem__(self, idx):
        if self.split == "train":
            use_path = "path1"
        else:
            use_path = "path2"

        # e.g. <use_path>/patient0001/patient0001_2CH_ED.mhd
        image_path = os.path.join(use_path, self.sample_list[idx], self.sample_list[idx]) + "_2CH_ED.mhd"
        mask_path = os.path.join(use_path, self.sample_list[idx], self.sample_list[idx]) + "_2CH_ED_gt.mhd"

        # Read the image (first slice of the .mhd volume) and resize it
        my_img1 = iio.imread(image_path, plugin='simpleitk')
        image = my_img1[0, :, :]
        image = cv2.resize(image, (img_size, img_size))

        # Read the mask the same way; it stays as an integer class map
        # (unlike the TF pipeline, no one-hot encoding is done here;
        # that is left to DiceLoss._one_hot_encoder)
        my_mask1 = iio.imread(mask_path, plugin='simpleitk')
        mask = my_mask1[0, :, :]
        mask = cv2.resize(mask, (img_size, img_size))
        label = mask

        image = torch.from_numpy(image.astype(np.float32)).unsqueeze(0)  # [1, H, W]
        label = torch.from_numpy(label.astype(np.float32))               # [H, W]

        sample = {'image': image, 'label': label}
        if self.transform:
            sample = self.transform(sample)
        return sample
img_size = 256  # 224
torch.manual_seed(42)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
print(device)

#------------------------
db_train = easy_Synapse_dataset(split="train", transform=None)
print("The length of train set is: {}".format(len(db_train)))
train_loader = DataLoader(db_train, batch_size=1, shuffle=True)

db_test = easy_Synapse_dataset(split="test_vol")
val_loader = DataLoader(db_test, batch_size=1, shuffle=False, num_workers=1)

#---------------------
# Now we can create the model and send it at once to the device
model = UNet_standard(1, 4).to(device)

lr = 0.01  # 1e-1
n_epochs = 10
loss_fn = DiceLoss(4)  # 4 classes
optimizer = optim.SGD(model.parameters(), lr=lr)
#-------------------------------------------
def make_train_step(model, loss_fn, optimizer):
    # Builds a function that performs a step in the train loop
    def train_step(x, y):
        # Sets model to TRAIN mode
        model.train()
        # Makes predictions
        yhat = model(x)
        # Computes loss (the loss applies softmax to the logits internally)
        loss = loss_fn(yhat, y, softmax=True)
        # Computes gradients
        loss.backward()
        # Updates parameters and zeroes gradients
        optimizer.step()
        optimizer.zero_grad()
        # Returns the loss
        return loss.item()
    # Returns the function that will be called inside the train loop
    return train_step

# Creates the train_step function for our model, loss function and optimizer
train_step = make_train_step(model, loss_fn, optimizer)
y_val_average_loss = []
y_average_loss = []
x_epoch = []

# For each epoch...
for epoch in tqdm(range(n_epochs)):
    losses = []
    val_losses = []
    iou_metric = []

    for i_batch, sampled_batch in enumerate(train_loader):
        x_batch, y_batch = sampled_batch['image'], sampled_batch['label']
        # the dataset "lives" in the CPU, and so do our mini-batches;
        # we therefore send each mini-batch to the device where the model "lives"
        x_batch = x_batch.to(device)
        y_batch = y_batch.to(device)

        loss = train_step(x_batch, y_batch)
        losses.append(loss)

    avg = sum(losses) / len(losses)
    y_average_loss.append(avg)
    print('loss : %f' % avg)
    losses = []  # clear

    with torch.no_grad():
        for i_batch, sampled_batch2 in enumerate(val_loader):
            x_val, y_val = sampled_batch2['image'], sampled_batch2['label']
            x_val = x_val.to(device)
            y_val = y_val.to(device)

            model.eval()
            d3 = model(x_val)

            val_loss = loss_fn(d3, y_val, softmax=True)
            iou = iou_score(d3, y_val)
            val_losses.append(val_loss.item())
            iou_metric.append(iou.item())

        print('Validation iou : %f' % (sum(iou_metric) / len(iou_metric)))
        val_avg = sum(val_losses) / len(val_losses)
        y_val_average_loss.append(val_avg)
        x_epoch.append(epoch)
        print('Validation loss : %f' % val_avg)
        val_losses = []  # clear
        iou_metric = []

# Checks model's parameters
#print(model.state_dict())
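The zigzag in the PyTorch validation curve described at the top can be inspected directly from the per-epoch averages collected above (x_epoch, y_average_loss, y_val_average_loss); a minimal plotting sketch, assuming matplotlib is available:

import matplotlib.pyplot as plt

# per-epoch averages collected in the training loop above
plt.plot(x_epoch, y_average_loss, label='train loss')
plt.plot(x_epoch, y_val_average_loss, label='validation loss')
plt.xlabel('epoch')
plt.ylabel('Dice loss')
plt.legend()
plt.show()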
I fixed the issue by using a different U-Net implementation, taken from https://github.com/usuyama/pytorch-unet/blob/master/pytorch_unet.py
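A rough sketch of how that implementation could be dropped into the training script above, assuming pytorch_unet.py from the linked repository is on the Python path, that its UNet class takes the number of output classes, and that its expected number of input channels is matched (e.g. by adapting its first convolution or repeating the grayscale slice) to the single-channel images used here:

from pytorch_unet import UNet  # file from the linked repository (assumed importable)

# hypothetical swap-in; the rest of the script (DiceLoss(4), SGD, make_train_step)
# is reused unchanged apart from the input-channel adjustment mentioned above
model = UNet(4).to(device)
optimizer = optim.SGD(model.parameters(), lr=lr)
train_step = make_train_step(model, loss_fn, optimizer)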