I have a model written in TF 1.x code using tf-slim API as well. Is it possible to convert that to tf.keras in TF 2.0 EXACTLY the way it is? For instance, have exactly the same amount of parameters and training?
In my case, I've tried doing so, but my model in tf.keras actually has about 5% LESS parameters than the one in TF 1.x. I also noticed my model in tf.keras has a much less smooth training stage too. Any thoughts? Thanks
Maybe I'm setting some of the parameters to initialize the layers differently? Any other suggestions would be greatly appreciated
This isn't my full model, but I use a lot of the components below:
Original TF.1x model:
import tensorflow as tf
from tensorflow.contrib import slim
def batch_norm_relu(inputs, is_training):
net = slim.batch_norm(inputs, is_training=is_training)
net = tf.nn.relu(net)
return net
def conv2d_transpose(inputs, output_channels, kernel_size):
upsamp = tf.contrib.slim.conv2d_transpose(
inputs,
num_outputs=output_channels,
kernel_size=kernel_size,
stride=2,
)
return upsamp
def conv2d_fixed_padding(inputs, filters, kernel_size, stride, rate):
net = slim.conv2d(inputs,
filters,
kernel_size,
stride=stride,
rate = rate,
padding=('SAME' if stride == 1 else 'VALID'),
activation_fn=None
)
return net
def block(inputs, filters, is_training, projection_shortcut, stride):
inputs = batch_norm_relu(inputs, is_training)
shortcut = inputs
if projection_shortcut is not None:
shortcut = projection_shortcut(inputs)
conv_k1_s1_r1 = shortcut
conv_k3_s1_r1 = slim.conv2d(shortcut,
filters,
kernel_size = 3,
stride = 1,
rate = 1,
padding=('SAME' if stride == 1 else 'VALID'),
activation_fn=None
)
conv_k3_s1_r3 = slim.conv2d(shortcut,
filters,
kernel_size = 3,
stride = 1,
rate = 3,
padding=('SAME' if stride == 1 else 'VALID'),
activation_fn=None
)
conv_k3_s1_r5 = slim.conv2d(shortcut,
filters,
kernel_size = 3,
stride = 1,
rate = 5,
padding=('SAME' if stride == 1 else 'VALID'),
activation_fn=None
)
net = conv_k1_s1_r1 + conv_k3_s1_r1 + conv_k3_s1_r3 + conv_k3_s1_r5
net = batch_norm_relu(net, is_training)
net = conv2d_fixed_padding(inputs=net, filters=filters, kernel_size=1, stride=1, rate = 1)
outputs = shortcut + net
return outputs
Attempted TF 2.x.keras model same component:
import tensorflow as tf
class BatchNormRelu(tf.keras.layers.Layer):
"""Batch normalization + ReLu"""
def __init__(self, name=None):
super(BatchNormRelu, self).__init__(name=name)
self.bnorm = tf.keras.layers.BatchNormalization(momentum=0.999,
scale=False)
self.relu = tf.keras.layers.ReLU()
def call(self, inputs, is_training):
x = self.bnorm(inputs, training=is_training)
x = self.relu(x)
return x
class Conv2DTranspose(tf.keras.layers.Layer):
"""Conv2DTranspose layer"""
def __init__(self, output_channels, kernel_size, name=None):
super(Conv2DTranspose, self).__init__(name=name)
self.tconv1 = tf.keras.layers.Conv2DTranspose(
filters=output_channels,
kernel_size=kernel_size,
strides=2,
padding='same',
activation=tf.keras.activations.relu
)
def call(self, inputs):
x = self.tconv1(inputs)
return x
class Conv2DFixedPadding(tf.keras.layers.Layer):
"""Conv2D Fixed Padding layer"""
def __init__(self, filters, kernel_size, stride, rate, name=None):
super(Conv2DFixedPadding, self).__init__(name=name)
self.conv1 = tf.keras.layers.Conv2D(filters,
kernel_size,
strides=stride,
dilation_rate=rate,
padding=('same' if stride==1 else 'valid'),
activation=None
)
def call(self, inputs):
x = self.conv1(inputs)
return x
class block(tf.keras.layers.Layer):
def __init__(self,
filters,
stride,
projection_shortcut=True,
name=None):
super(block, self).__init__(name=name)
self.projection_shortcut = projection_shortcut
self.brelu1 = BatchNormRelu()
self.brelu2 = BatchNormRelu()
self.conv1 = tf.keras.layers.Conv2D(filters,
kernel_size=3,
strides=1,
dilation_rate=1,
padding=('same' if stride==1 else 'valid'),
activation=None
)
self.conv2 = tf.keras.layers.Conv2D(filters,
kernel_size=3,
strides=1,
dilation_rate=3,
padding=('same' if stride==1 else 'valid'),
activation=None
)
self.conv3 = tf.keras.layers.Conv2D(filters,
kernel_size=3,
strides=1,
dilation_rate=5,
padding=('same' if stride==1 else 'valid'),
activation=None
)
self.conv4 = Conv2DFixedPadding(filters, 1, 1, 1)
self.conv_sc = Conv2DFixedPadding(filters, 1, stride, 1)
def call(self, inputs, is_training):
x = self.brelu1(inputs, is_training)
shortcut = x
if self.projection_shortcut:
shortcut = self.conv_sc(x)
conv_k1_s1_r1 = shortcut
conv_k3_s1_r1 = self.conv1(shortcut)
conv_k3_s1_r3 = self.conv2(shortcut)
conv_k3_s1_r5 = self.conv3(shortcut)
x = conv_k1_s1_r1 + conv_k3_s1_r1 + conv_k3_s1_r3 + conv_k3_s1_r5
x = self.brelu2(x, is_training)
x = self.conv4(x)
outputs = shortcut + x
return outputs
Related
I am trying to customize a Resnet 50 with an attention layer. Please find my code below:
IMAGE_SIZE = [224, 224]
resnet = ResNet50(input_shape=IMAGE_SIZE + [3], weights='imagenet', include_top=False)
# don't train existing weights
for layer in resnet.layers:
layer.trainable = False
import torch
import math
import torch.nn as nn
class BasicConv(nn.Module):
def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1,
groups=1, relu=True, bn=True, bias=False):
super(BasicConv, self).__init__()
self.out_channels = out_planes
self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=bias)
self.bn = nn.BatchNorm2d(out_planes,eps=1e-5, momentum=0.01, affine=True) if bn else
None
self.relu = nn.ReLU() if relu else None
def forward(self, x):
x = self.conv(x)
if self.bn is not None:
x = self.bn(x)
if self.relu is not None:
x = self.relu(x)
return x
class Flatten(nn.Module):
def forward(self, x):
return x.view(x.size(0), -1)
class ChannelGate(nn.Module):
def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']):
super(ChannelGate, self).__init__()
self.gate_channels = gate_channels
self.mlp = nn.Sequential(
Flatten(),
nn.Linear(gate_channels, gate_channels // reduction_ratio),
nn.ReLU(),
nn.Linear(gate_channels // reduction_ratio, gate_channels)
)
self.pool_types = pool_types
def forward(self, x):
channel_att_sum = None
for pool_type in self.pool_types:
if pool_type=='avg':
avg_pool = F.avg_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2),
x.size(3)))
channel_att_raw = self.mlp( avg_pool )
elif pool_type=='max':
max_pool = F.max_pool2d( x, (x.size(2), x.size(3)), stride=(x.size(2),
x.size(3)))
channel_att_raw = self.mlp( max_pool )
elif pool_type=='lp':
lp_pool = F.lp_pool2d( x, 2, (x.size(2), x.size(3)), stride=(x.size(2),
x.size(3)))
channel_att_raw = self.mlp( lp_pool )
elif pool_type=='lse':
# LSE pool only
lse_pool = logsumexp_2d(x)
channel_att_raw = self.mlp( lse_pool )
if channel_att_sum is None:
channel_att_sum = channel_att_raw
else:
channel_att_sum = channel_att_sum + channel_att_raw
scale = F.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x)
return x * scale
def logsumexp_2d(tensor):
tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1)
s, _ = torch.max(tensor_flatten, dim=2, keepdim=True)
outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log()
return outputs
class ChannelPool(nn.Module):
def forward(self, x):
return torch.cat( (torch.max(x,1)[0].unsqueeze(1), torch.mean(x,1).unsqueeze(1)),
dim=1 )
class SpatialGate(nn.Module):
def __init__(self):
super(SpatialGate, self).__init__()
kernel_size = 7
self.compress = ChannelPool()
self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False)
def forward(self, x):
x_compress = self.compress(x)
x_out = self.spatial(x_compress)
scale = F.sigmoid(x_out) # broadcasting
return x * scale
class CBAM(nn.Module):
def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'],
no_spatial=False):
super(CBAM, self).__init__()
self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types)
self.no_spatial=no_spatial
if not no_spatial:
self.SpatialGate = SpatialGate()
def forward(self, x):
x_out = self.ChannelGate(x)
if not self.no_spatial:
x_out = self.SpatialGate(x_out)
return x_out
flat1 = Flatten()(resnet.output)
class1 = Dense(256, activation='relu')(flat1)
class1=BatchNormalization()(class1)
# receive 3D and output 3D
class2 = Dense(128, activation='relu')(class1)
class2=BatchNormalization()(class2)
class2=CBAM(128,8)(class2)
output = Dense(len(folders), activation='softmax')(class2)
I am getting the following error message while implementing the code:
flat1 = Flatten()(resnet.output)
in forward(self, x)
class Flatten(nn.Module):
def forward(self, x):
---> return x.view(x.size(0), -1)
AttributeError: 'KerasTensor' object has no attribute 'view'
Recently, I want to implement CDCN in CVPR2020 using tensorflow2.8 + python3.9. This requires my custom layer acquire current conv2d layer's weight.
However, when I try to add my custom layer to the sequential model, error occurred:NotImplementedError: numpy() is only available when eager execution is enabled.
This is my code. Can anyone helps me? I have tried to add tf.compat.v1.enable_eager_execution(), but it doesn't work.
import numpy as np
import tensorflow.keras as tfk
import tensorflow as tf
class CDC(tfk.layers.Layer):
def __init__(self, output_dim, kernel_size=(3, 3), padding='same', activation=None, theta=0.7, **kwargs):
super(CDC, self).__init__()
self.theta = theta
self.activation = None
self.output_dim = output_dim
self.kernel_size = kernel_size
self.padding = padding
if activation is not None:
self.activation = tfk.activations.get(activation)
def build(self, input_shape):
self.conv = tfk.layers.Conv2D(self.output_dim, self.kernel_size, padding=self.padding, input_shape=input_shape)
self.conv.build(input_shape=input_shape)
self._kernel = self.conv.kernel
super(CDC, self).build(input_shape)
self.built = True
def call(self, inputs, training=None, mask=None):
vanillaOutput = self.conv(inputs)
weightSum = self.conv.kernel.numpy().sum(axis=0).sum(axis=0).sum(axis=0)
weightSum = np.reshape(weightSum, (1, 1, 1, self.output_dim))
weightSum = tf.constant(weightSum, dtype=tf.float32)
cDiff = tf.nn.conv2d(inputs, filters=weightSum, strides=self.conv.strides, padding=self.conv.padding.upper())
result = vanillaOutput - self.theta * cDiff
if self.activation is not None:
return self.activation(result)
return vanillaOutput
If you just want the sum of all elements in kernel
tf.math.reduce_sum()
Also replace the lines
weightSum = self.conv.kernel.numpy().sum(axis=0).sum(axis=0).sum(axis=0)
weightSum = np.reshape(weightSum, (1, 1, 1, self.output_dim))
weightSum = tf.constant(weightSum, dtype=tf.float32)
...
weightSum = tf.math.reduce_sum(tf.math.reduce_sum(tf.math.reduce_sum(self.conv.kernel,axis=0),axis=0),axis=0)
weightSum = tf.reshape(weightSum, (1, 1, 1, self.output_dim))
I am trying to apply channel attention and position attention layer on last convolutional layer of VGG16. But stuck badly at the above error. I am new to coding in python and deep learning.
I am confused about shape values in Class CAM and class PAM.
Here is my code:
from keras import initializers
from keras import regularizers
from keras import constraints
class PAM(Layer):**strong text**
def __init__(self,
gamma_initializer=tf.zeros_initializer(),
gamma_regularizer=None,
gamma_constraint=None,
**kwargs):
super(PAM, self).__init__(**kwargs)
self.gamma_initializer = gamma_initializer
self.gamma_regularizer = gamma_regularizer
self.gamma_constraint = gamma_constraint
def build(self, input_shape):
self.gamma = self.add_weight(shape=(512,),
initializer=self.gamma_initializer,
name='gamma',
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint)
self.built = True
def compute_output_shape(self, input_shape):
return input_shape
def call(self, input):
input_shape = input.get_shape().as_list()
_, h, w, filters = input_shape
b = Conv2D(512, 3, use_bias=False, kernel_initializer='he_normal')(att_input)
c = Conv2D(512, 3, use_bias=False, kernel_initializer='he_normal')(att_input)
d = Conv2D(512, 3, use_bias=False, kernel_initializer='he_normal')(att_input)
vec_b = K.reshape(b, (-1, h * w, 512))
vec_cT = tf.transpose(K.reshape(c, (-1, h * w,512)), (0, 2, 1))
bcT = K.batch_dot(vec_b, vec_cT)
softmax_bcT = Activation('softmax')(bcT)
vec_d = K.reshape(d, (-1, h * w, 512))
bcTd = K.batch_dot(softmax_bcT, vec_d)
bcTd = K.reshape(bcTd, (-1, h, w, 512))
out = self.gamma*bcTd + att_input
return out
class CAM(Layer):
def __init__(self,
gamma_initializer=tf.zeros_initializer(),
gamma_regularizer=None,
gamma_constraint=None,
**kwargs):
super(CAM, self).__init__(**kwargs)
self.gamma_initializer = gamma_initializer
self.gamma_regularizer = gamma_regularizer
self.gamma_constraint = gamma_constraint
def build(self, input_shape):
self.gamma = self.add_weight(shape=(512,),
initializer=self.gamma_initializer,
name='gamma',
regularizer=self.gamma_regularizer,
constraint=self.gamma_constraint)
self.built = True
def compute_output_shape(self, input_shape):
return input_shape
def call(self, input):
input_shape = input.get_shape().as_list()
_, h, w, filters = input_shape
vec_a = K.reshape(input, (-1, h * w, 512))
vec_aT = tf.transpose(vec_a, (0, 2, 1))
aTa = K.batch_dot(vec_aT, vec_a)
softmax_aTa = Activation('softmax')(aTa)
aaTa = K.batch_dot(vec_a, softmax_aTa)
aaTa = K.reshape(aaTa, (-1, h, w, 512))
out = self.gamma*aaTa + att_input
return out
pam = PAM()(att_input)
pam = Conv2D(512, 3, padding='same', use_bias=False, kernel_initializer='he_normal')(pam)
pam = BatchNormalization(axis=3)(pam)
pam = Activation('relu')(pam)
pam = Dropout(0.5)(pam)
pam = Conv2D(512, 3, padding='same', use_bias=False, kernel_initializer='he_normal')(pam)
cam = CAM()(att_input)
cam = Conv2D(512, 3, padding='same', use_bias=False, kernel_initializer='he_normal')(cam)
cam = BatchNormalization(axis=3)(cam)
cam = Activation('relu')(cam)
cam = Dropout(0.5)(cam)
cam = Conv2D(512, 3, padding='same', use_bias=False, kernel_initializer='he_normal')(cam)
Make sure you have the input dimensions correct. Can't say exactly where the error is without looking at your code but whenever I've had an error like that it has almost always been a case where I overlooked one of the dimensions. Think carefully about how your input changes through the layers. Printing out the model summary might help. Good luck
class BasicBlock(nn.Module):
def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
super(BasicBlock, self).__init__()
self.bn1 = nn.BatchNorm2d(in_planes)
self.relu1 = nn.ReLU(inplace=True)
self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride,
padding=1, bias=False) # 1
self.bn2 = nn.BatchNorm2d(out_planes)
self.relu2 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1,
padding=1, bias=False)
self.droprate = dropRate
self.equalInOut = (in_planes == out_planes)
self.convShortcut = (not self.equalInOut) and nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
padding=0, bias=False) or None
def forward(self, x):
if not self.equalInOut:
x = self.relu1(self.bn1(x))
else:
out = self.relu1(self.bn1(x))
out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else x)))
if self.droprate > 0:
out = F.dropout(out, p=self.droprate, training=self.training)
out = self.conv2(out)
if self.convShortcut is not None:
return torch.add(x if self.equalInOut else self.convShortcut(x), out)
class NetworkBlock(nn.Module):
def __init__(self, nb_layers, in_planes, out_planes, block, stride, dropRate=0.0):
super(NetworkBlock, self).__init__()
self.layer = self._make_layer(block, in_planes, out_planes, nb_layers, stride, dropRate)
def _make_layer(self, block, in_planes, out_planes, nb_layers, stride, dropRate):
layers = []
for i in range(int(nb_layers)):
layers.append(block(i == 0 and in_planes or out_planes, out_planes, i == 0 and stride or 1, dropRate))
return nn.Sequential(*layers)
def forward(self, x):
return self.layer(x)
class WideResNet(nn.Module):
def __init__(self, depth=34, num_classes=10, widen_factor=10, dropRate=0.0):
super(WideResNet, self).__init__()
nChannels = [16, 16 * widen_factor, 32 * widen_factor, 64 * widen_factor]
assert ((depth - 4) % 6 == 0)
n = (depth - 4) / 6
block = BasicBlock
# 1st conv before any network block
self.conv1 = nn.Conv2d(3, nChannels[0], kernel_size=3, stride=1,
padding=1, bias=False)
# 1st block
self.block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, dropRate)
# 1st sub-block
self.sub_block1 = NetworkBlock(n, nChannels[0], nChannels[1], block, 1, dropRate)
# 2nd block
self.block2 = NetworkBlock(n, nChannels[1], nChannels[2], block, 2, dropRate) # 2
# 3rd block
self.block3 = NetworkBlock(n, nChannels[2], nChannels[3], block, 2, dropRate) # 2
# global average pooling and classifier
self.bn1 = nn.BatchNorm2d(nChannels[3])
self.relu = nn.ReLU(inplace=True)
self.fc = nn.Linear(nChannels[3], num_classes)
self.nChannels = nChannels[3]
for m in self.modules():
if isinstance(m, nn.Conv2d):
n = m.kernel_size[0] * m.kernel_size[1] * m.out_channels
m.weight.data.normal_(0, math.sqrt(2. / n))
elif isinstance(m, nn.BatchNorm2d):
m.weight.data.fill_(1)
m.bias.data.zero_()
elif isinstance(m, nn.Linear):
m.bias.data.zero_()
def forward(self, x):
out = self.conv1(x)
out = self.block1(out)
out = self.block2(out)
out = self.block3(out)
out = self.relu(self.bn1(out))
out = F.avg_pool2d(out, 8)
out = out.view(-1, self.nChannels)
return self.fc(out)
def _conv(self, name, x, filter_size, in_filters, out_filters, strides, padding='SAME'):
"""Convolution."""
with tf.variable_scope(name):
n = filter_size * filter_size * out_filters
kernel = tf.get_variable(
'DW', [filter_size, filter_size, in_filters, out_filters],
tf.float32, initializer=tf.random_normal_initializer(
stddev=np.sqrt(2.0/n)))
return tf.nn.conv2d(x, kernel, strides, padding=padding)
def _residual(self, x, in_filter, out_filter, stride,
activate_before_residual=False, is_log=False):
"""Residual unit with 2 sub layers."""
if activate_before_residual:
x = self._batch_norm('bn1', x)
x = self._relu(x)
orig_x = x
else:
orig_x = x
x = self._batch_norm('bn1', x)
x = self._relu(x)
x = self._conv('conv1', x, 3, in_filter, out_filter, stride)
x = self._batch_norm('bn2', x)
x = self._relu(x)
x = self._conv('conv2', x, 3, out_filter, out_filter, [1, 1, 1, 1])
if in_filter != out_filter:
orig_x = self._conv('shortcut_conv', orig_x, filter_size=1, in_filters=in_filter, out_filters=out_filter,
strides=stride, padding="VALID")
x += orig_x
return x
def _build_model(self):
assert self.mode == 'train' or self.mode == 'eval'
with tf.variable_scope('input'):
self.x_input = tf.placeholder(tf.float32, shape=[None, 32, 32, 3])
self.y_input = tf.placeholder(tf.float32, shape=[None, 10])
self.is_training = tf.placeholder(tf.bool, shape=None)
x = self._conv('conv1.weight', self.x_input, 3, 3, 16, self._stride_arr(1))
strides = [1, 2, 2]
activate_before_residual = [True, True, True]
res_func = self._residual
# wide residual network (https://arxiv.org/abs/1605.07146v1)
filters = [16, 160, 320, 640]
with tf.variable_scope('block1.layer.0'):
x = res_func(x, filters[0], filters[1], self._stride_arr(strides[0]),
activate_before_residual[0])
for i in range(1, 5):
with tf.variable_scope('block1.layer.%d' % i):
x = res_func(x, filters[1], filters[1], self._stride_arr(1), False)
with tf.variable_scope('block2.layer.0'):
x = res_func(x, filters[1], filters[2], self._stride_arr(strides[1]),
activate_before_residual[1], is_log=True)
for i in range(1, 5):
with tf.variable_scope('block2.layer.%d' % i):
x = res_func(x, filters[2], filters[2], self._stride_arr(1), False)
with tf.variable_scope('block3.layer.0'):
x = res_func(x, filters[2], filters[3], self._stride_arr(strides[2]),
activate_before_residual[2])
for i in range(1, 5):
with tf.variable_scope('block3.layer.%d' % i):
x = res_func(x, filters[3], filters[3], self._stride_arr(1), False)
x = self._batch_norm('bn1', x)
x = self._relu(x)
x = self._global_avg_pool(x)
with tf.variable_scope('fc'):
self.pre_softmax = self._fully_connected(x, 10)
I'm doing experiment on "adversarial defense", and I checked that the performances of pytorch and tensorflow is different with same weights (I exported it as numpy and loaded to pytorch and tensorflow) I printed out each result of WideResNet34 and calculate the difference of each output, then, the above output of below image comes out
The results start to be different from block2. Then, I only change the stride of each block to all 1 (stride of block 2 and 3), the below output of above image comes out
The differences are negligible at all layers, so I think the difference appear only when stride=2. I don't know why there is no difference when stride=1 but different when stride=2... Who knows about this thing?
I finally found that the problem was the "padding". Tensorflow's "SAME" padding zero-pads assymmetrically (left=0, right=1, top=0, bottom=1) when symmetric padding results in odd number... While, pytorch do not support assymmetric padding in nn.conv2d, so it zero-pads symmetrically (left=1, right=1, top=1, bottom=1)..
So, I think that when input size=8, filter size=3, and stride=2, index of left-top of filter in tensorflow would be 0,2,4,6 but in pytorch it would be -1(zero-pad), 1, 3, 5... I checked that when I zero-pads assymetrically using nn.Zero-pad2d , it gives almost same results (2-norm diff < 1e-2)
class BasicBlock(nn.Module):
def __init__(self, in_planes, out_planes, stride, dropRate=0.0):
super(BasicBlock, self).__init__()
self.bn1 = nn.BatchNorm2d(in_planes)
self.relu1 = nn.ReLU(inplace=True)
if stride==1:
self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False) # 1
else:
self.conv1 = nn.Conv2d(in_planes, out_planes, kernel_size=3, stride=stride, padding=0, bias=False) # 1
self.pad1 = nn.ZeroPad2d((0,1,0,1)) # 0,1,0,1
self.stride = stride
self.bn2 = nn.BatchNorm2d(out_planes)
self.relu2 = nn.ReLU(inplace=True)
self.conv2 = nn.Conv2d(out_planes, out_planes, kernel_size=3, stride=1,
padding=1, bias=False)
self.droprate = dropRate
self.equalInOut = (in_planes == out_planes)
self.convShortcut = (not self.equalInOut) and nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride,
padding=0, bias=False) or None
def forward(self, x):
if not self.equalInOut:
x = self.relu1(self.bn1(x))
else:
out = self.relu1(self.bn1(x))
if self.stride==1:
out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else x)))
else:
out = self.relu2(self.bn2(self.conv1(out if self.equalInOut else self.pad1(x))))
if self.droprate > 0:
out = F.dropout(out, p=self.droprate, training=self.training)
out = self.conv2(out)
return torch.add(x if self.equalInOut else self.convShortcut(x), out)
I used pytorch to build a segmentation model that uses the BatchNormalization layer. I found that when I set model.eval() on the test, the test result will be 0. If I don't set model.eval(), it will perform well.
I tried to search for related questions, but I got the conclusion that model.eval() can fix the parameters of BN, but I am still confused about how to solve this problem.
My batchsize is 1 and this is my model:
import torch
import torch.nn as nn
class Encode_Block(nn.Module):
def __init__(self, in_feat, out_feat):
super(Encode_Block, self).__init__()
self.conv1 = Res_Block(in_feat, out_feat)
self.conv2 = Res_Block_identity(out_feat, out_feat)
def forward(self, inputs):
outputs = self.conv1(inputs)
outputs = self.conv2(outputs)
return outputs
class Decode_Block(nn.Module):
def __init__(self, in_feat, out_feat):
super(Decode_Block, self).__init__()
self.conv1 = Res_Block(in_feat, out_feat)
self.conv2 = Res_Block_identity(out_feat, out_feat)
def forward(self, inputs):
outputs = self.conv1(inputs)
outputs = self.conv2(outputs)
return outputs
class Conv_Block(nn.Module):
def __init__(self, in_feat, out_feat):
super(Conv_Block, self).__init__()
self.conv1 = nn.Sequential(
nn.Conv2d(in_feat, out_feat, kernel_size=3, stride=1, padding=1),
nn.LeakyReLU(),
)
def forward(self, inputs):
outputs = self.conv1(inputs)
return outputs
def conv3x3(in_planes, out_planes, stride=1):
"""3x3 convolution with padding"""
return nn.Conv2d(
in_planes, out_planes, kernel_size=3, stride=stride, padding=1, bias=False
)
def conv1x1(in_planes, out_planes, stride=1):
"""1x1 convolution with padding"""
return nn.Conv2d(in_planes, out_planes, kernel_size=1, stride=stride, bias=False)
class Res_Block(nn.Module):
def __init__(self, inplanes, planes, stride=1):
super(Res_Block, self).__init__()
self.conv_input = conv1x1(inplanes, planes)
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn = nn.BatchNorm2d(planes)
self.relu = nn.LeakyReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.conv3 = conv1x1(planes, planes)
self.stride = stride
def forward(self, x):
residual = self.conv_input(x)
out = self.conv1(x)
out = self.bn(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn(out)
out += residual
out = self.relu(out)
return out
class Res_Block_identity(nn.Module):
def __init__(self, inplanes, planes, stride=1):
super(Res_Block_identity, self).__init__()
self.conv1 = conv3x3(inplanes, planes, stride)
self.bn = nn.BatchNorm2d(planes)
self.relu = nn.LeakyReLU(inplace=True)
self.conv2 = conv3x3(planes, planes)
self.conv3 = conv1x1(planes, planes)
self.stride = stride
def forward(self, x):
residual = x
out = self.conv1(x)
out = self.bn(out)
out = self.relu(out)
out = self.conv2(out)
out = self.bn(out)
out = self.relu(out)
out = self.conv3(out)
out = self.bn(out)
out += residual
out = self.relu(out)
return out
class UpConcat(nn.Module):
def __init__(self, in_feat, out_feat):
super(UpConcat, self).__init__()
self.de_conv = nn.ConvTranspose2d(in_feat, out_feat, kernel_size=2, stride=2)
def forward(self, inputs, down_outputs):
outputs = self.de_conv(inputs)
out = torch.cat([down_outputs, outputs], 1)
return out
class Res_UNet(nn.Module):
def __init__(self, num_channels=1, num_classes=1):
super(Res_UNet, self).__init__()
flt = 64
self.down1 = Encode_Block(num_channels, flt)
self.down2 = Encode_Block(flt, flt * 2)
self.down3 = Encode_Block(flt * 2, flt * 4)
self.down4 = Encode_Block(flt * 4, flt * 8)
self.down_pool = nn.MaxPool2d(kernel_size=2)
self.bottom = Encode_Block(flt * 8, flt * 16)
self.up_cat1 = UpConcat(flt * 16, flt * 8)
self.up_conv1 = Decode_Block(flt * 16, flt * 8)
self.up_cat2 = UpConcat(flt * 8, flt * 4)
self.up_conv2 = Decode_Block(flt * 8, flt * 4)
self.up_cat3 = UpConcat(flt * 4, flt * 2)
self.up_conv3 = Decode_Block(flt * 4, flt * 2)
self.up_cat4 = UpConcat(flt * 2, flt)
self.up_conv4 = Decode_Block(flt * 2, flt)
self.final = nn.Sequential(
nn.Conv2d(flt, num_classes, kernel_size=1), nn.Sigmoid()
)
def forward(self, inputs):
down1_feat = self.down1(inputs)
pool1_feat = self.down_pool(down1_feat)
down2_feat = self.down2(pool1_feat)
pool2_feat = self.down_pool(down2_feat)
down3_feat = self.down3(pool2_feat)
pool3_feat = self.down_pool(down3_feat)
down4_feat = self.down4(pool3_feat)
pool4_feat = self.down_pool(down4_feat)
bottom_feat = self.bottom(pool4_feat)
up1_feat = self.up_cat1(bottom_feat, down4_feat)
up1_feat = self.up_conv1(up1_feat)
up2_feat = self.up_cat2(up1_feat, down3_feat)
up2_feat = self.up_conv2(up2_feat)
up3_feat = self.up_cat3(up2_feat, down2_feat)
up3_feat = self.up_conv3(up3_feat)
up4_feat = self.up_cat4(up3_feat, down1_feat)
up4_feat = self.up_conv4(up4_feat)
outputs = self.final(up4_feat)
return outputs
The model completely fails to segmentation after setting model.eval(), but the model is good after model.eval() is removed. I am confused about this, and is model.eval() necessary in the test?
BatchNorm layers keeps running estimates of its computed mean and variance during training model.train(), which are then used for normalization during evaluation model.eval().
Each layer has it own statistics of the mean and variance of its outputs/activations.
Since you are reusing your BatchNorm layer self.bn = nn.BatchNorm2d(planes) multiple times, the statics get mixed up and don't represent the actual mean and variance.
So you should create a new BatchNorm layer for every time you use it.
EDIT: I just read that your batch_size is 1, which could also be the core of your problem: see Tensorflow and Batch Normalization with Batch Size==1 => Outputs all zeros