About the memory usage of MobileNet - Python

I'm building MobileNetV1 with PyTorch, and my memory runs out every time I train the model (the log shows "Killed!" and the process suddenly crashes).
This is my code.
Config file (YAML):
n_gpu: 0
arch:
  type: MobileNet
  args:
    in_channels: 3
    num_classes: 26
data_loader:
  type: BallDataLoader
  args:
    data_dir: data/balls/
    batch_size: 64
    shuffle: true
    validation_split: 0.2
    num_workers: 0
    resize:
      - 224
      - 224
optimizer:
  type: Adam
  args:
    lr: 1.0e-2
    weight_decay: 0
    amsgrad: true
loss: nll_loss
metrics:
  - accuracy
  - top_k_acc
lr_scheduler:
  type: StepLR
  args:
    step_size: 50
    gamma: 0.1
trainer:
  epochs: 50
  save_dir: saved/
  save_period: 2
  verbosity: 2
  monitor: min val_loss
  early_stop: 10
  tensorboard: true
modules.py:
import torch.nn as nn
import torch.nn.functional as F

class DepthwiseSeparableConv(nn.Module):
    def __init__(self, in_channels, out_channels, kernel_size=3, stride=1, padding=None):
        super().__init__()
        if padding is None:
            padding = kernel_size // 2
        self.depth_wise_conv = nn.Conv2d(in_channels, in_channels, kernel_size, stride, padding, groups=in_channels)
        self.bn1 = nn.BatchNorm2d(in_channels)
        self.point_wise_conv = nn.Conv2d(in_channels, out_channels, (1, 1), 1, 0)
        self.bn2 = nn.BatchNorm2d(out_channels)
        self.in_channels = in_channels
        self.out_channels = out_channels

    def forward(self, x):
        x = self.depth_wise_conv(x)
        x = self.bn1(x)
        x = F.relu(x)
        x = self.point_wise_conv(x)
        x = self.bn2(x)
        x = F.relu(x)
        return x
model.py:
class MobileNet(ImageNet):
    def __init__(self, in_channels=3, num_classes=1000):
        super().__init__()
        self.convs = nn.Sequential(
            nn.Conv2d(in_channels, 32, kernel_size=3, padding=1, stride=1),
            nn.BatchNorm2d(32),
            nn.ReLU(inplace=True),
            DepthwiseSeparableConv(32, 64),
            DepthwiseSeparableConv(64, 128, stride=2),
            DepthwiseSeparableConv(128, 128),
            DepthwiseSeparableConv(128, 256),
            DepthwiseSeparableConv(256, 256),
            DepthwiseSeparableConv(256, 512, stride=2),
            DepthwiseSeparableConv(512, 512),
            DepthwiseSeparableConv(512, 512),
            DepthwiseSeparableConv(512, 512),
            DepthwiseSeparableConv(512, 512),
            DepthwiseSeparableConv(512, 512),
            DepthwiseSeparableConv(512, 1024, stride=1),
            DepthwiseSeparableConv(1024, 1024, stride=2),
            nn.AdaptiveAvgPool2d(1)
        )
        self.fc = nn.Linear(1024, num_classes)

    def forward(self, x):
        x = self.convs(x)
        x = x.view(-1, 1024)
        x = self.fc(x)
        x = F.log_softmax(x, dim=1)
        return x
So I found a model at https://github.com/jmjeon94/MobileNet-Pytorch, and it worked. After hours I still can't figure out why this happens, as the two models are nearly identical, and since the MobileNet architecture is fairly light, I supposed it shouldn't take much memory to run. Is there any chance this is caused by the Python interpreter, or is there actually something wrong with my code?

I think it's because of your batch size. Try using a smaller batch size, like 32, 16, 8, 4, or 2.
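For intuition about why the batch size dominates here: with the config above (224x224 inputs, batch size 64) and the stride-1 first conv in the model, the very first feature map is already a 64 x 32 x 224 x 224 float32 tensor. A rough back-of-the-envelope estimate (a sketch, ignoring gradients, later layers, and allocator overhead):
import math

# Shapes taken from the config (batch 64, 224x224 input) and the model
# (32 output channels; stride 1 with padding 1 keeps the spatial size).
batch, channels, height, width = 64, 32, 224, 224
bytes_per_float32 = 4

mib = batch * channels * height * width * bytes_per_float32 / 2**20
print(f"first feature map alone: {mib:.0f} MiB")  # ~392 MiB
Halving the batch size halves every activation tensor like this one, which on a CPU-only run (n_gpu: 0) is usually the quickest way to stay under the limit at which the OS kills the process.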

I deleted the line nn.Conv2d(in_channels, 32, kernel_size=3, padding=1, stride=1), rewrote the same line, and the code ran. I still don't know why, but it seems it was the interpreter or the text editor that caused the error. Thank you for your attention.
And special thanks to Mr. Anmol Narang for your effort; I really appreciate it.

Related

cGAN: RuntimeError: The size of tensor a (100) must match the size of tensor b (7) at non-singleton dimension 0

I am new to cGAN architectures. I tried one on MNIST, which worked out quite fine, but not on my own data set. The data set has 7 classes and an input size of (64, 64, 3) per image. I am using PyTorch with Python 3.10:
class GeneratorModel(nn.Module):
    def __init__(self):
        super(GeneratorModel, self).__init__()
        input_dim = 100 + 7
        output_dim = 12288
        self.label_embedding = nn.Embedding(7, 7)
        self.hidden_layer1 = nn.Sequential(
            nn.Linear(input_dim, 256),
            nn.LeakyReLU(0.2)
        )
        ...
        self.hidden_layer4 = nn.Sequential(
            nn.Linear(1024, output_dim),
            nn.Tanh()
        )

    def forward(self, x, labels):
        c = self.label_embedding(labels)
        x = torch.cat([x, c], 1)
        output = self.hidden_layer1(x)
        output = self.hidden_layer2(output)
        output = self.hidden_layer3(output)
        output = self.hidden_layer4(output)
        return output.to(device)


class DiscriminatorModel(nn.Module):
    def __init__(self):
        super(DiscriminatorModel, self).__init__()
        input_dim = 12288 + 7
        output_dim = 1
        self.label_embedding = nn.Embedding(7, 7)
        self.hidden_layer1 = nn.Sequential(
            nn.Linear(input_dim, 1024),
            nn.LeakyReLU(0.2),
            nn.Dropout(0.3)
        )
        ...
        self.hidden_layer4 = nn.Sequential(
            nn.Linear(256, output_dim),
            nn.Sigmoid()
        )

    def forward(self, x, labels):
        c = self.label_embedding(labels)
        x = torch.cat([x, c], 1)
        output = self.hidden_layer1(x)
        output = self.hidden_layer2(output)
        output = self.hidden_layer3(output)
        output = self.hidden_layer4(output)
        return output.to(device)
This is the error:
RuntimeError: The size of tensor a (100) must match the size of tensor b (7) at non-singleton dimension 0

Output from BERT into a CNN model

I am trying to concatenate a BERT model with a 1-D CNN using PyTorch. I used this code, but I do not understand the meaning of in_channels and out_channels in nn.Conv1d,
if the input shape into the CNN model is torch.Size([256, 64, 768]):
class MixModel(nn.Module):
    def __init__(self, pre_trained='distilbert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=256 - 5 + 1)
        self.dropout = nn.Dropout(0.3)
        self.clf = nn.Linear(self.hidden_size * 2, 6)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs
        # x = torch.cat(cls_hs[0])  # x = [416, 64, 768]
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x
Edit:
I used the recommended answer and changed the parameters, but I got an error.
class MixModel(nn.Module):
    def __init__(self, pre_trained='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=64 - 5 + 1)
        print(11)
        self.dropout = nn.Dropout(0.3)
        print(12)
        self.clf = nn.Linear(self.hidden_size * 2, 6)
        print(13)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs[0]
        print(cls_hs[0])
        print(len(cls_hs[0]))
        print(cls_hs[0].size())
        # x = torch.cat(cls_hs, 0)  # x = [416, 64, 768]
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x
The error is:
5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: mat1 and mat2 shapes cannot be multiplied (65536x1 and 1536x6)
The output of BERT (and many other transformer-based models) has shape batch x seq-len x feature-dim: that is, your input is a batch of 256 sequences of length 64 tokens (probably with padding), where each token is represented by a feature vector of dimension 768.
In order to apply a 1-d convolution along the seq-len dimension, you first need to permute x to shape batch x dim x len:
x = x.permute(0, 2, 1)
Now you can apply nn.Conv1d, where in_channels is the feature dimension of x, i.e. 768. out_channels is up to you: it will be the hidden dimension of your model.
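A minimal shape check of that recipe, using the dimensions from the question (batch 256, sequence length 64, feature dimension 768); out_channels=256 here is just for illustration:
import torch
import torch.nn as nn

x = torch.randn(256, 64, 768)   # batch x seq-len x dim, as BERT returns it
x = x.permute(0, 2, 1)          # -> batch x dim x seq-len = (256, 768, 64)

conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5)
y = conv(x)                     # -> (256, 256, 64 - 5 + 1) = (256, 256, 60)
print(y.shape)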

Pytorch transfer learning error: The size of tensor a (16) must match the size of tensor b (128) at non-singleton dimension 2

Currently, I'm working on an image motion deblurring problem with PyTorch. I have two kinds of images: blurry images (variable = blur_image), which are the input, and the sharp versions of the same images (variable = sharp_image), which should be the output. Now I wanted to try out transfer learning, but I can't get it to work.
Here is the code for my dataloaders:
train_loader = torch.utils.data.DataLoader(train_dataset,
                                           batch_size=batch_size,
                                           shuffle=True)
validation_loader = torch.utils.data.DataLoader(valid_dataset,
                                                batch_size=batch_size,
                                                shuffle=False)
test_loader = torch.utils.data.DataLoader(test_dataset,
                                          batch_size=batch_size,
                                          shuffle=False)
Their shape:
Trainloader - Shape of blur_image [N, C, H, W]: torch.Size([16, 3, 128, 128])
Trainloader - Shape of sharp_image [N, C, H, W]: torch.Size([16, 3, 128, 128]) torch.float32
Validationloader - Shape of blur_image [N, C, H, W]: torch.Size([16, 3, 128, 128])
Validationloader - Shape of sharp_image [N, C, H, W]: torch.Size([16, 3, 128, 128]) torch.float32
Testloader- Shape of blur_image [N, C, H, W]: torch.Size([16, 3, 128, 128])
Testloader- Shape of sharp_image [N, C, H, W]: torch.Size([16, 3, 128, 128]) torch.float32
The way I use transfer learning (I thought that for 'in_features' I have to put in the number of pixels):
model = models.alexnet(pretrained=True)
model.classifier[6] = torch.nn.Linear(model.classifier[6].in_features, 128)
device_string = "cuda" if torch.cuda.is_available() else "cpu"
device = torch.device(device_string)
model = model.to(device)
The way I define my training process:
# Define the loss function (MSE was chosen due to the comparison of pixels
# between blurred and sharp images)
criterion = nn.MSELoss()

# Define the optimizer and learning rate
optimizer = optim.Adam(model.parameters(), lr=0.001)

# Learning rate schedule - if the loss value does not improve after 5 epochs
# back-to-back, then the new learning rate will be: previous_rate * 0.5
# scheduler = torch.optim.lr_scheduler.StepLR(optimizer, step_size=7, gamma=0.1)
scheduler = torch.optim.lr_scheduler.ReduceLROnPlateau(
    optimizer,
    mode='min',
    patience=5,
    factor=0.5,
    verbose=True
)
def training(model, trainDataloader, epoch):
    """ Function to define the model training
    Args:
        model (Model object): The model that is going to be trained.
        trainDataloader (Dataloader object): Dataloader object of the trainset.
        epoch (Integer): Number of training epochs.
    """
    # Change model into training mode
    model.train()
    # Supporting variables to display the loss for each epoch
    running_loss = 0.0
    running_psnr = 0.0
    for i, data in tqdm(enumerate(trainDataloader),
                        total=int(len(train_dataset)/trainDataloader.batch_size)):
        blur_image = data[0]
        sharp_image = data[1]
        # Transfer the blurred and sharp image instances to the device
        blur_image = blur_image.to(device)
        sharp_image = sharp_image.to(device)
        # Set the gradients of the tensors to zero
        optimizer.zero_grad()
        outputs = model(blur_image)
        loss = criterion(outputs, sharp_image)
        # Perform backpropagation
        loss.backward()
        # Update the weights
        optimizer.step()
        # Add the loss that was calculated during the training run
        running_loss += loss.item()
        # Calculate batch PSNR (once every `batch_size` iterations)
        batch_psnr = psnr(sharp_image, blur_image)
        running_psnr += batch_psnr
    # Display training loss
    trainings_loss = running_loss/len(trainDataloader.dataset)
    final_psnr = running_psnr/int(len(train_dataset)/trainDataloader.batch_size)
    final_ssim = ssim(sharp_image, blur_image, data_range=1, size_average=True)
    print(f"Trainings loss: {trainings_loss:.5f}")
    print(f"Train PSNR: {final_psnr:.5f}")
    print(f"Train SSIM: {final_ssim:.5f}")
    return trainings_loss, final_psnr, final_ssim
And here is my way to start the training:
train_loss = []
val_loss = []
train_PSNR_score = []
train_SSIM_score = []
val_PSNR_score = []
val_SSIM_score = []
start = time.time()
for epoch in range(nb_epochs):
print(f"Epoch {epoch+1}\n-------------------------------")
train_epoch_loss = training(model, train_loader, nb_epochs)
val_epoch_loss = validation(model, validation_loader, nb_epochs)
train_loss.append(train_epoch_loss[0])
val_loss.append(val_epoch_loss[0])
train_PSNR_score.append(train_epoch_loss[1])
train_SSIM_score.append(train_epoch_loss[2])
val_PSNR_score.append(val_epoch_loss[1])
val_SSIM_score.append(val_epoch_loss[2])
scheduler.step(train_epoch_loss[0])
scheduler.step(val_epoch_loss[0])
end = time.time()
print(f"Took {((end-start)/60):.3f} minutes to train")
But every time I try to run the training, I receive the following error:
0%| | 0/249 [00:00<?, ?it/s]Epoch 1
-------------------------------
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py:528: UserWarning: Using a target size (torch.Size([16, 3, 128, 128])) that is different to the input size (torch.Size([16, 128])). This will likely lead to incorrect results due to broadcasting. Please ensure they have the same size.
return F.mse_loss(input, target, reduction=self.reduction)
---------------------------------------------------------------------------
RuntimeError Traceback (most recent call last)
<ipython-input-195-ff0214e227cd> in <module>()
9 for epoch in range(nb_epochs):
10 print(f"Epoch {epoch+1}\n-------------------------------")
---> 11 train_epoch_loss = training(model, train_loader, nb_epochs)
12 val_epoch_loss = validation(model, validation_loader, nb_epochs)
13 train_loss.append(train_epoch_loss[0])
<ipython-input-170-dfa2c212ad23> in training(model, trainDataloader, epoch)
25 optimizer.zero_grad()
26 outputs = model(blur_image)
---> 27 loss = criterion(outputs, sharp_image)
28
29 # Perform backpropagation
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/module.py in _call_impl(self, *input, **kwargs)
887 result = self._slow_forward(*input, **kwargs)
888 else:
--> 889 result = self.forward(*input, **kwargs)
890 for hook in itertools.chain(
891 _global_forward_hooks.values(),
/usr/local/lib/python3.7/dist-packages/torch/nn/modules/loss.py in forward(self, input, target)
526
527 def forward(self, input: Tensor, target: Tensor) -> Tensor:
--> 528 return F.mse_loss(input, target, reduction=self.reduction)
529
530
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in mse_loss(input, target, size_average, reduce, reduction)
2926 reduction = _Reduction.legacy_get_string(size_average, reduce)
2927
-> 2928 expanded_input, expanded_target = torch.broadcast_tensors(input, target)
2929 return torch._C._nn.mse_loss(expanded_input, expanded_target, _Reduction.get_enum(reduction))
2930
/usr/local/lib/python3.7/dist-packages/torch/functional.py in broadcast_tensors(*tensors)
72 if has_torch_function(tensors):
73 return handle_torch_function(broadcast_tensors, tensors, *tensors)
---> 74 return _VF.broadcast_tensors(tensors) # type: ignore
75
76
RuntimeError: The size of tensor a (16) must match the size of tensor b (128) at non-singleton dimension 2
Model structure:
AlexNet(
  (features): Sequential(
    (0): Conv2d(3, 64, kernel_size=(11, 11), stride=(4, 4), padding=(2, 2))
    (1): ReLU(inplace=True)
    (2): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (3): Conv2d(64, 192, kernel_size=(5, 5), stride=(1, 1), padding=(2, 2))
    (4): ReLU(inplace=True)
    (5): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
    (6): Conv2d(192, 384, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (7): ReLU(inplace=True)
    (8): Conv2d(384, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (9): ReLU(inplace=True)
    (10): Conv2d(256, 256, kernel_size=(3, 3), stride=(1, 1), padding=(1, 1))
    (11): ReLU(inplace=True)
    (12): MaxPool2d(kernel_size=3, stride=2, padding=0, dilation=1, ceil_mode=False)
  )
  (avgpool): AdaptiveAvgPool2d(output_size=(6, 6))
  (classifier): Sequential(
    (0): Dropout(p=0.5, inplace=False)
    (1): Linear(in_features=9216, out_features=4096, bias=True)
    (2): ReLU(inplace=True)
    (3): Dropout(p=0.5, inplace=False)
    (4): Linear(in_features=4096, out_features=4096, bias=True)
    (5): ReLU(inplace=True)
    (6): Linear(in_features=4096, out_features=128, bias=True)
  )
)
I'm a newbie in terms of using PyTorch (and image deblurring in general), so I'm rather confused about the meaning of the error message and how to fix it. I tried changing my parameters and nothing worked. Does anyone have any advice on how to solve this problem?
I would appreciate every input :)
You can't use AlexNet for this task, because the output of your model and sharp_image need to have the same shape. The convnet encodes your image into embeddings, and fully connected layers cannot convert those embeddings back to a full-size image: you can't use fully connected layers for decoding. To get an output of the same size, you need ConvTranspose2d() layers.
Your encoder should be:
class ConvEncoder(nn.Module):
    """
    A simple Convolutional Encoder Model
    """
    def __init__(self):
        super().__init__()
        self.conv1 = nn.Conv2d(3, 16, (3, 3), padding=(1, 1))
        self.relu1 = nn.ReLU(inplace=True)
        self.maxpool1 = nn.MaxPool2d((2, 2))
        self.conv2 = nn.Conv2d(16, 32, (3, 3), padding=(1, 1))
        self.relu2 = nn.ReLU(inplace=True)
        self.maxpool2 = nn.MaxPool2d((2, 2))
        self.conv3 = nn.Conv2d(32, 64, (3, 3), padding=(1, 1))
        self.relu3 = nn.ReLU(inplace=True)
        self.maxpool3 = nn.MaxPool2d((2, 2))
        self.conv4 = nn.Conv2d(64, 128, (3, 3), padding=(1, 1))
        self.relu4 = nn.ReLU(inplace=True)
        self.maxpool4 = nn.MaxPool2d((2, 2))

    def forward(self, x):
        # Downscale the image with conv + maxpool etc.
        x = self.conv1(x)
        x = self.relu1(x)
        x = self.maxpool1(x)
        x = self.conv2(x)
        x = self.relu2(x)
        x = self.maxpool2(x)
        x = self.conv3(x)
        x = self.relu3(x)
        x = self.maxpool3(x)
        x = self.conv4(x)
        x = self.relu4(x)
        x = self.maxpool4(x)
        return x
And your decoder should be:
class ConvDecoder(nn.Module):
    """
    A simple Convolutional Decoder Model
    """
    def __init__(self):
        super().__init__()
        # Channel counts mirror the encoder (which ends at 128 channels),
        # and the last layer maps back to a 3-channel RGB image.
        self.deconv1 = nn.ConvTranspose2d(128, 64, (2, 2), stride=(2, 2))
        self.relu1 = nn.ReLU(inplace=True)
        self.deconv2 = nn.ConvTranspose2d(64, 32, (2, 2), stride=(2, 2))
        self.relu2 = nn.ReLU(inplace=True)
        self.deconv3 = nn.ConvTranspose2d(32, 16, (2, 2), stride=(2, 2))
        self.relu3 = nn.ReLU(inplace=True)
        self.deconv4 = nn.ConvTranspose2d(16, 3, (2, 2), stride=(2, 2))
        self.relu4 = nn.ReLU(inplace=True)

    def forward(self, x):
        # Upscale the image with ConvTranspose2d etc.
        x = self.deconv1(x)
        x = self.relu1(x)
        x = self.deconv2(x)
        x = self.relu2(x)
        x = self.deconv3(x)
        x = self.relu3(x)
        x = self.deconv4(x)
        x = self.relu4(x)
        return x
encoder = ConvEncoder()
decoder = ConvDecoder()
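The training loop below also assumes a loss function and an optimizer have been created. A minimal sketch (the names loss_fn and optimizer are assumptions; note the optimizer has to cover the parameters of BOTH modules, since they are trained jointly):
import itertools
import torch
import torch.nn as nn

loss_fn = nn.MSELoss()  # pixel-wise loss between reconstruction and sharp image
optimizer = torch.optim.Adam(
    itertools.chain(encoder.parameters(), decoder.parameters()), lr=1e-3
)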
You can train your model like this:
encoder.train()
decoder.train()
running_loss = 0.0
for batch_idx, (train_img, target_img) in enumerate(train_loader):
    # Move images to device
    train_img = train_img.to(device)
    target_img = target_img.to(device)
    # Zero-grad the optimizer
    optimizer.zero_grad()
    # Feed the train images to the encoder
    enc_output = encoder(train_img)
    # The output of the encoder is the input to the decoder!
    dec_output = decoder(enc_output)
    # The decoder output is the reconstructed image;
    # compute the loss between it and the original (target) image.
    loss = loss_fn(dec_output, target_img)
    # Backpropagate
    loss.backward()
    # Apply the optimizer to the network by calling step.
    optimizer.step()
    # Track the loss
    running_loss += loss.item()
You might want to visit this for help with your project.

Converting from TF 1.x to TF 2.0 keras

I have a model written in TF 1.x code that also uses the tf-slim API. Is it possible to convert it to tf.keras in TF 2.0 EXACTLY the way it is, for instance, with exactly the same number of parameters and the same training behavior?
In my case, I've tried doing so, but my model in tf.keras actually has about 5% FEWER parameters than the one in TF 1.x. I also noticed that my tf.keras model has a much less smooth training stage. Any thoughts? Thanks.
Maybe I'm setting some of the parameters that initialize the layers differently? Any other suggestions would be greatly appreciated.
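One way to narrow down where the missing ~5% of parameters lives is to compare trainable-parameter counts on both sides. A sketch of the totals (keras_model is an assumed, already-built instance of the converted model; note that Keras count_params() also includes non-trainable weights such as BatchNorm moving statistics, so compare trainable weights specifically):
import numpy as np
import tensorflow as tf

# TF 1.x side: sum over all trainable variables in the graph.
tf1_params = int(np.sum([np.prod(v.get_shape().as_list())
                         for v in tf.compat.v1.trainable_variables()]))

# tf.keras side: sum over trainable weights only (BatchNorm moving
# mean/variance are non-trainable and would inflate count_params()).
keras_params = int(np.sum([np.prod(w.shape.as_list())
                           for w in keras_model.trainable_weights]))

print(tf1_params, keras_params)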
This isn't my full model, but I use a lot of the components below:
Original TF 1.x model:
import tensorflow as tf
from tensorflow.contrib import slim

def batch_norm_relu(inputs, is_training):
    net = slim.batch_norm(inputs, is_training=is_training)
    net = tf.nn.relu(net)
    return net

def conv2d_transpose(inputs, output_channels, kernel_size):
    upsamp = tf.contrib.slim.conv2d_transpose(
        inputs,
        num_outputs=output_channels,
        kernel_size=kernel_size,
        stride=2,
    )
    return upsamp

def conv2d_fixed_padding(inputs, filters, kernel_size, stride, rate):
    net = slim.conv2d(inputs,
                      filters,
                      kernel_size,
                      stride=stride,
                      rate=rate,
                      padding=('SAME' if stride == 1 else 'VALID'),
                      activation_fn=None)
    return net

def block(inputs, filters, is_training, projection_shortcut, stride):
    inputs = batch_norm_relu(inputs, is_training)
    shortcut = inputs
    if projection_shortcut is not None:
        shortcut = projection_shortcut(inputs)
    conv_k1_s1_r1 = shortcut
    conv_k3_s1_r1 = slim.conv2d(shortcut,
                                filters,
                                kernel_size=3,
                                stride=1,
                                rate=1,
                                padding=('SAME' if stride == 1 else 'VALID'),
                                activation_fn=None)
    conv_k3_s1_r3 = slim.conv2d(shortcut,
                                filters,
                                kernel_size=3,
                                stride=1,
                                rate=3,
                                padding=('SAME' if stride == 1 else 'VALID'),
                                activation_fn=None)
    conv_k3_s1_r5 = slim.conv2d(shortcut,
                                filters,
                                kernel_size=3,
                                stride=1,
                                rate=5,
                                padding=('SAME' if stride == 1 else 'VALID'),
                                activation_fn=None)
    net = conv_k1_s1_r1 + conv_k3_s1_r1 + conv_k3_s1_r3 + conv_k3_s1_r5
    net = batch_norm_relu(net, is_training)
    net = conv2d_fixed_padding(inputs=net, filters=filters, kernel_size=1, stride=1, rate=1)
    outputs = shortcut + net
    return outputs
Attempted TF 2.x tf.keras model, same component:
import tensorflow as tf

class BatchNormRelu(tf.keras.layers.Layer):
    """Batch normalization + ReLU"""
    def __init__(self, name=None):
        super(BatchNormRelu, self).__init__(name=name)
        self.bnorm = tf.keras.layers.BatchNormalization(momentum=0.999,
                                                        scale=False)
        self.relu = tf.keras.layers.ReLU()

    def call(self, inputs, is_training):
        x = self.bnorm(inputs, training=is_training)
        x = self.relu(x)
        return x


class Conv2DTranspose(tf.keras.layers.Layer):
    """Conv2DTranspose layer"""
    def __init__(self, output_channels, kernel_size, name=None):
        super(Conv2DTranspose, self).__init__(name=name)
        self.tconv1 = tf.keras.layers.Conv2DTranspose(
            filters=output_channels,
            kernel_size=kernel_size,
            strides=2,
            padding='same',
            activation=tf.keras.activations.relu
        )

    def call(self, inputs):
        x = self.tconv1(inputs)
        return x


class Conv2DFixedPadding(tf.keras.layers.Layer):
    """Conv2D Fixed Padding layer"""
    def __init__(self, filters, kernel_size, stride, rate, name=None):
        super(Conv2DFixedPadding, self).__init__(name=name)
        self.conv1 = tf.keras.layers.Conv2D(filters,
                                            kernel_size,
                                            strides=stride,
                                            dilation_rate=rate,
                                            padding=('same' if stride == 1 else 'valid'),
                                            activation=None)

    def call(self, inputs):
        x = self.conv1(inputs)
        return x


class block(tf.keras.layers.Layer):
    def __init__(self,
                 filters,
                 stride,
                 projection_shortcut=True,
                 name=None):
        super(block, self).__init__(name=name)
        self.projection_shortcut = projection_shortcut
        self.brelu1 = BatchNormRelu()
        self.brelu2 = BatchNormRelu()
        self.conv1 = tf.keras.layers.Conv2D(filters,
                                            kernel_size=3,
                                            strides=1,
                                            dilation_rate=1,
                                            padding=('same' if stride == 1 else 'valid'),
                                            activation=None)
        self.conv2 = tf.keras.layers.Conv2D(filters,
                                            kernel_size=3,
                                            strides=1,
                                            dilation_rate=3,
                                            padding=('same' if stride == 1 else 'valid'),
                                            activation=None)
        self.conv3 = tf.keras.layers.Conv2D(filters,
                                            kernel_size=3,
                                            strides=1,
                                            dilation_rate=5,
                                            padding=('same' if stride == 1 else 'valid'),
                                            activation=None)
        self.conv4 = Conv2DFixedPadding(filters, 1, 1, 1)
        self.conv_sc = Conv2DFixedPadding(filters, 1, stride, 1)

    def call(self, inputs, is_training):
        x = self.brelu1(inputs, is_training)
        shortcut = x
        if self.projection_shortcut:
            shortcut = self.conv_sc(x)
        conv_k1_s1_r1 = shortcut
        conv_k3_s1_r1 = self.conv1(shortcut)
        conv_k3_s1_r3 = self.conv2(shortcut)
        conv_k3_s1_r5 = self.conv3(shortcut)
        x = conv_k1_s1_r1 + conv_k3_s1_r1 + conv_k3_s1_r3 + conv_k3_s1_r5
        x = self.brelu2(x, is_training)
        x = self.conv4(x)
        outputs = shortcut + x
        return outputs

Saving a model in PyTorch with weight decay

This is my model:
# basic LeNet5 network
class LeNet5_mode0(nn.Module):
    # constructor
    def __init__(self):
        super(LeNet5_mode0, self).__init__()  # call to super constructor
        # define layers
        # 6 @ 28x28
        self.conv1 = nn.Sequential(
            # LeNet's first conv layer expects 3x32x32; squeeze color channels into 1 and pad 2
            nn.Conv2d(in_channels=1, out_channels=6, kernel_size=5, stride=1, padding=2),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        # 16 @ 10x10
        self.conv2 = nn.Sequential(
            nn.Conv2d(in_channels=6, out_channels=16, kernel_size=5, stride=1, padding=0),
            nn.ReLU(),
            nn.MaxPool2d(kernel_size=2, stride=2)
        )
        self.fc1 = nn.Sequential(
            nn.Linear(in_features=16*5*5, out_features=120),
            nn.ReLU()
        )
        self.fc2 = nn.Sequential(
            nn.Linear(in_features=120, out_features=84),
            nn.ReLU()
        )
        self.classifier = nn.Sequential(
            nn.Linear(in_features=84, out_features=10),
            nn.Softmax(dim=1)  # dim=1 means softmax over the columns of the 84x10 output
        )

    # define forward function
    def forward(self, x):
        x = self.conv1(x)
        x = self.conv2(x)
        x = x.view(-1, 16*5*5)  # reshape the tensor to [-1, 16*5*5]
        x = self.fc1(x)
        x = self.fc2(x)
        x = self.classifier(x)
        return x
and I train this model once with:
criterion = nn.CrossEntropyLoss()  # aka LogLoss
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005)
scheduler = torch.optim.lr_scheduler.MultiStepLR(optimizer, milestones=[5, 10, 15], gamma=0.5)
and then save it with:
torch.save(model.state_dict(), savepath)
and load it with:
model.load_state_dict(torch.load(loadpath))
So far, no problem. But when I change the optimizer slightly to:
optimizer = torch.optim.Adam(model.parameters(), lr=0.0005, weight_decay=0.0005)
and use the same save & load method,
I receive the following error:
in loading state_dict for LeNet5_mode0:
Unexpected key(s) in state_dict: "conv1.1.weight", "conv1.1.bias", "conv1.1.running_mean", "conv1.1.running_var", "conv1.1.num_batches_tracked", "conv2.1.weight", "conv2.1.bias", "conv2.1.running_mean", "conv2.1.running_var", "conv2.1.num_batches_tracked".
How can this be fixed? And why does a different optimizer have that effect on the saving of the trained network?
