I implemented self-attention in TensorFlow Keras in two distinct ways: first as a plain function, and later as a Keras layer class. Let me present both approaches first, and then I will describe the problem.
My task:
My goal is to process TensorSpec(shape=(None, 8, 6, 64)) through self-attention, taking the 8 time stamps one by one (each a 6 × 64 slice), getting a self-attention feature map for every time stamp, and then concatenating them again into an output tensor of shape (None, 8, 6, 64).
First implementation, using a function:
def conv1d(x, channels, ks=1, strides=1, padding='same'):
conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
kernel_initializer='HeNormal')(x)
return conv
# Self attention
def my_self_attention(x, channels):
size = x.shape
x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
f = conv1d(x, channels)
g = conv1d(x, channels)
h = conv1d(x, channels)
attention_weights = tf.keras.activations.softmax(
tf.matmul(g, Permute((2, 1))(f))) # query multiply with key and then softmax on it
sensor_att_fm = tf.matmul(attention_weights, h)
gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
o = gamma * sensor_att_fm + x
# return tf.reshape(o, shape = [-1, 1, x.shape[1], x.shape[2]])
return tf.reshape(o, shape = [-1, 1, x.shape[1], x.shape[2]])
refined_fm = tf.concat([my_self_attention(tf.expand_dims(my_input[:, t, :, :], 1), 64) for t in range(my_input.shape[1])], 1)
Second implementation, using a class:
def conv1d(channels, ks=1, strides=1, padding='same'):
conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
kernel_initializer='HeNormal')
return conv
class my_self_attention(tf.keras.layers.Layer):
def __init__(self, channels):
super(my_self_attention, self).__init__()
self.query = conv1d(channels)
self.key = conv1d(channels)
self.value = conv1d(channels)
self.gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
def call(self, x):
x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
f = self.query(x)
g = self.key(x)
h = self.value(x)
attention_weights = tf.keras.activations.softmax(
tf.matmul(g, Permute((2, 1))(f))) # query multiply with key and then softmax on it
sensor_att_fm = tf.matmul(attention_weights, h)
o = self.gamma * sensor_att_fm + x
# return tf.reshape(o, shape = [-1, 1, x.shape[1], x.shape[2]])
return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])
sa = my_self_attention(channels)
refined_fm = tf.concat([sa(tf.expand_dims(my_input[:, t, :, :], 1)) for t in range(my_input.shape[1])], 1)
Problem
From my perspective, I implemented the same method in two separate ways, so the models' performance should be similar. However, performance dropped by over 3% with the class implementation, and I'm not sure why. Could someone please explain?
The first method has far more operations, layers, and trainable weights, since my_self_attention builds a fresh set of layers every time it is called in the loop over timesteps. Check out model.summary() and you will quickly see the differences:
First model with way more parameters:
import tensorflow as tf
from tensorflow.keras.layers import Permute
def conv1d(x, channels, ks=1, strides=1, padding='same'):
conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
kernel_initializer='HeNormal')(x)
return conv
def my_self_attention(x, channels):
size = x.shape
x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
f = conv1d(x, channels)
g = conv1d(x, channels)
h = conv1d(x, channels)
attention_weights = tf.keras.activations.softmax(
tf.matmul(g, Permute((2, 1))(f))) # query multiply with key and then softmax on it
sensor_att_fm = tf.matmul(attention_weights, h)
gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
o = gamma * sensor_att_fm + x
# return tf.reshape(o, shape = [-1, 1, x.shape[1], x.shape[2]])
return tf.reshape(o, shape = [-1, 1, x.shape[1], x.shape[2]])
inputs = tf.keras.layers.Input((8, 6, 64))
outputs = tf.concat([my_self_attention(tf.expand_dims(inputs[:, t, :, :], 1), 64) for t in range(inputs.shape[1])], 1)
model = tf.keras.Model(inputs, outputs)
print(model.summary())
....
Total params: 98,304
Trainable params: 98,304
Non-trainable params: 0
__________________________________________________________________________________________________
None
Second model with fewer parameters:
def conv1d(channels, ks=1, strides=1, padding='same'):
conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
kernel_initializer='HeNormal')
return conv
class my_self_attention(tf.keras.layers.Layer):
def __init__(self, channels):
super(my_self_attention, self).__init__()
self.query = conv1d(channels)
self.key = conv1d(channels)
self.value = conv1d(channels)
self.gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
def call(self, x):
x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
f = self.query(x)
g = self.key(x)
h = self.value(x)
attention_weights = tf.keras.activations.softmax(
tf.matmul(g, Permute((2, 1))(f))) # query multiply with key and then softmax on it
sensor_att_fm = tf.matmul(attention_weights, h)
o = self.gamma * sensor_att_fm + x
return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])
inputs = tf.keras.layers.Input((8, 6, 64))
sa = my_self_attention(64)
outputs = tf.concat([sa(tf.expand_dims(inputs[:, t, :, :], 1)) for t in range(inputs.shape[1])], 1)
model = tf.keras.Model(inputs, outputs)
print(model.summary())
...
Total params: 12,289
Trainable params: 12,289
Non-trainable params: 0
__________________________________________________________________________________________________
None
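One way to see where the two totals come from: each Conv1D here uses kernel size 1, no bias, and maps 64 channels to 64 channels, so it holds 64 * 64 = 4,096 weights, and the three query/key/value convolutions of one attention block together hold 12,288. The class version builds that block once, and its tracked gamma adds one parameter: 12,288 + 1 = 12,289. The functional version builds a brand-new block on every call, i.e. once per time step: 8 * 12,288 = 98,304 (the gamma created with tf.compat.v1.get_variable inside the function is not tracked by the Keras model, so it does not appear in the summary). The two graphs are therefore genuinely different models rather than two implementations of the same one.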
Related
I am trying to concatenate a BERT model with a 1D CNN using PyTorch. I used the code below, but I do not understand the meaning of in_channels and out_channels in nn.Conv1d when the input shape into the CNN model is (256, 64, 768).
class MixModel(nn.Module):
def __init__(self,pre_trained='distilbert-base-uncased'):
super().__init__()
self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
self.hidden_size = self.bert.config.hidden_size
self.conv = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=5, padding='valid', stride=1)
self.relu = nn.ReLU()
self.pool = nn.MaxPool1d(kernel_size= 256- 5 + 1)
self.dropout = nn.Dropout(0.3)
self.clf = nn.Linear(self.hidden_size*2,6)
def forward(self,inputs, mask , labels):
cls_hs = self.bert(input_ids=inputs,attention_mask=mask, return_dict= False)
x=cls_hs
# x = torch.cat(cls_hs[0]) # x= [416, 64, 768]
x = self.conv(x)
x = self.relu(x)
x = self.pool(x)
x = self.dropout(x)
x = self.clf(x)
return x
Edit
I used the recommended answer and changed the parameters, but I got an error:
class MixModel(nn.Module):
def __init__(self,pre_trained='bert-base-uncased'):
super().__init__()
self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
self.hidden_size = self.bert.config.hidden_size
self.conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5, padding='valid', stride=1)
self.relu = nn.ReLU()
self.pool = nn.MaxPool1d(kernel_size= 64- 5 + 1)
print(11)
self.dropout = nn.Dropout(0.3)
print(12)
self.clf = nn.Linear(self.hidden_size*2,6)
print(13)
def forward(self,inputs, mask , labels):
cls_hs = self.bert(input_ids=inputs,attention_mask=mask, return_dict= False)
x=cls_hs[0]
print(cls_hs[0])
print(len(cls_hs[0]))
print(cls_hs[0].size())
#x = torch.cat(cls_hs,0) # x= [416, 64, 768]
x = x.permute(0, 2, 1)
x = self.conv(x)
x = self.relu(x)
x = self.pool(x)
x = self.dropout(x)
x = self.clf(x)
return x
The error is:
5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: mat1 and mat2 shapes cannot be multiplied (65536x1 and 1536x6)
The output of BERT (and many other transformer-based models) has shape batch × seq-len × feature-dim. That is, your input is a batch of 256 sequences, each 64 tokens long (probably with padding), where each token is represented by a feature vector of dimension 768.
In order to apply a 1D convolution along the sequence-length dimension, you first need to permute x to shape batch × dim × len:
x = x.permute(0, 2, 1)
Now you can apply nn.Conv1d, where in_channels is the feature dimension of x, i.e. 768. The out_channels is up to you: it will be the hidden dimension of your model.
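A minimal shape check of that suggestion (the sizes are taken from the question; this is only an illustrative sketch, not the full model):
import torch
import torch.nn as nn

x = torch.randn(256, 64, 768)   # batch x seq_len x feature_dim, as BERT returns it
x = x.permute(0, 2, 1)          # -> batch x feature_dim x seq_len = (256, 768, 64)
conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5, stride=1)
out = conv(x)
print(out.shape)                # torch.Size([256, 256, 60]), since 64 - 5 + 1 = 60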
I want to implement the hierarchical attention mechanism for document classification presented by Yang et al., but I want to replace the LSTM with a Transformer.
I used Apoorv Nandan's text classification with Transformer:
https://keras.io/examples/nlp/text_classification_with_transformer/
I have implemented the Transformer hierarchically for classification: one Transformer for sentence representation and another one for document representation. The code is as follows:
import tensorflow as tf
from tensorflow import keras
from tensorflow.keras import layers
from keras.utils.np_utils import to_categorical
class MultiHeadSelfAttention(layers.Layer):
def __init__(self, embed_dim, num_heads=8):
super(MultiHeadSelfAttention, self).__init__()
self.embed_dim = embed_dim
self.num_heads = num_heads
if embed_dim % num_heads != 0:
raise ValueError(
f"embedding dimension = {embed_dim} should be divisible by number of heads = {num_heads}"
)
self.projection_dim = embed_dim // num_heads
self.query_dense = layers.Dense(embed_dim)
self.key_dense = layers.Dense(embed_dim)
self.value_dense = layers.Dense(embed_dim)
self.combine_heads = layers.Dense(embed_dim)
def attention(self, query, key, value):
score = tf.matmul(query, key, transpose_b=True)
dim_key = tf.cast(tf.shape(key)[-1], tf.float32)
scaled_score = score / tf.math.sqrt(dim_key)
weights = tf.nn.softmax(scaled_score, axis=-1)
output = tf.matmul(weights, value)
return output, weights
def separate_heads(self, x, batch_size):
x = tf.reshape(x, (batch_size, -1, self.num_heads, self.projection_dim))
return tf.transpose(x, perm=[0, 2, 1, 3])
def call(self, inputs):
# x.shape = [batch_size, seq_len, embedding_dim]
batch_size = tf.shape(inputs)[0]
query = self.query_dense(inputs) # (batch_size, seq_len, embed_dim)
key = self.key_dense(inputs) # (batch_size, seq_len, embed_dim)
value = self.value_dense(inputs) # (batch_size, seq_len, embed_dim)
query = self.separate_heads(
query, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
key = self.separate_heads(
key, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
value = self.separate_heads(
value, batch_size
) # (batch_size, num_heads, seq_len, projection_dim)
attention, weights = self.attention(query, key, value)
attention = tf.transpose(
attention, perm=[0, 2, 1, 3]
) # (batch_size, seq_len, num_heads, projection_dim)
concat_attention = tf.reshape(
attention, (batch_size, -1, self.embed_dim)
) # (batch_size, seq_len, embed_dim)
output = self.combine_heads(
concat_attention
) # (batch_size, seq_len, embed_dim)
return output
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, dropout_rate, name=None):
super(TransformerBlock, self).__init__(name=name)
self.att = MultiHeadSelfAttention(embed_dim, num_heads)
self.ffn = keras.Sequential(
[layers.Dense(ff_dim, activation="relu"), layers.Dense(embed_dim), ]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(dropout_rate)
self.dropout2 = layers.Dropout(dropout_rate)
def call(self, inputs, training):
attn_output = self.att(inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
def compute_output_shape(self, input_shape):
# it does not change the shape of its input
return input_shape
class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim, name=None):
super(TokenAndPositionEmbedding, self).__init__(name=name)
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
def compute_output_shape(self, input_shape):
# it changes the shape from (batch_size, maxlen) to (batch_size, maxlen, embed_dim)
return input_shape + (self.pos_emb.output_dim,)
# Lower level (produce a representation of each sentence):
embed_dim = 100 # Embedding size for each token
num_heads = 2 # Number of attention heads
ff_dim = 64 # Hidden layer size in feed forward network inside transformer
L1_dense_units = 100 # Size of the sentence-level representations output by the word-level model
dropout_rate = 0.1
vocab_size = 1000
class_number = 5
max_docs = 10000
max_sentences = 15
max_words = 60
word_input = layers.Input(shape=(max_words,), name='word_input')
word_embedding = TokenAndPositionEmbedding(maxlen=max_words, vocab_size=vocab_size,
embed_dim=embed_dim, name='word_embedding')(word_input)
word_transformer = TransformerBlock(embed_dim=embed_dim, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='word_transformer')(word_embedding)
word_pool = layers.GlobalAveragePooling1D(name='word_pooling')(word_transformer)
word_drop = layers.Dropout(dropout_rate, name='word_drop')(word_pool)
word_dense = layers.Dense(L1_dense_units, activation="relu", name='word_dense')(word_drop)
word_encoder = keras.Model(word_input, word_dense)
word_encoder.summary()
# =========================================================================
# Upper level (produce a representation of each document):
L2_dense_units = 100
sentence_input = layers.Input(shape=(max_sentences, max_words), name='sentence_input')
# This is the line producing "NotImplementedError":
sentence_encoder = tf.keras.layers.TimeDistributed(word_encoder, name='sentence_encoder')(sentence_input)
sentence_transformer = TransformerBlock(embed_dim=L1_dense_units, num_heads=num_heads, ff_dim=ff_dim,
dropout_rate=dropout_rate, name='sentence_transformer')(sentence_encoder)
sentence_dense = layers.TimeDistributed(layers.Dense(int(L2_dense_units)),name='sentence_dense')(sentence_transformer)
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)
preds = layers.Dense(class_number , activation='softmax', name='sentence_output')(sentence_out)
model = keras.Model(sentence_input, preds)
model.summary()
#==========================================================================
Everything is OK (for testing you can copy and paste it into Google Colab). But when I compile and fit the model with the following code, it throws an error:
X = tf.random.uniform(shape=(max_docs, max_sentences, max_words), minval=1, maxval=1000, dtype=tf.dtypes.int32, seed=1)
y = tf.random.uniform(shape=(max_docs, ), minval=0, maxval=class_number , dtype=tf.dtypes.int32, seed=1)
y = to_categorical(y)
model.compile(optimizer="adam", loss="categorical_crossentropy", metrics=["accuracy"])
history = model.fit(
X, y, batch_size=32, epochs=25,
)
The error is:
ValueError: Shapes (None, 5) and (None, 15, 5) are incompatible
When I had a similar error, I found that a Flatten() layer helped; I had incompatible shapes of (None, x, y) and (None, y).
If you add a Flatten layer to the part that gives you the (None, 15, 5), it should output something like (None, 75).
The Flatten layer merely collapses dimensions: when I did this I got an output of (None, x*y), and because of the way TensorFlow works it was able to match both shapes, since y is a factor of x*y.
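A minimal sketch of how that could be applied to the model above, assuming a single label per document is wanted (the Flatten placement and the name sentence_flatten are illustrative, not part of the original answer):
sentence_out = layers.Dropout(dropout_rate)(sentence_dense)                        # (None, max_sentences, L2_dense_units)
sentence_flat = layers.Flatten(name='sentence_flatten')(sentence_out)              # (None, max_sentences * L2_dense_units)
preds = layers.Dense(class_number, activation='softmax', name='sentence_output')(sentence_flat)  # (None, class_number)
model = keras.Model(sentence_input, preds)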
This model is a variant of a CNN and uses causal dilated convolution layers.
I can train and predict without errors, but when I use model.save() to save the model, it throws an exception.
So for now I use save_weights and load_weights to save and load the model.
I wonder why this error appears:
model.save("path")
out:
ValueError: Dimension size must be evenly divisible by 2 but is 745 for '{{node conv1d_5/SpaceToBatchND}} = SpaceToBatchND[T=DT_FLOAT, Tblock_shape=DT_INT32, Tpaddings=DT_INT32](conv1d_5/Pad, conv1d_5/SpaceToBatchND/block_shape, conv1d_5/SpaceToBatchND/paddings)' with input shapes: [?,745,32], [1], [1,2] and with computed input tensors: input[1] = <2>, input[2] = <[0 0]>.
Input shape is (None,743,27)
Output shape is (None,24,1)
def slice(x, seq_length):
return x[:, -seq_length:, :]
class ResidualBlock(tf.keras.layers.Layer):
def __init__(self, n_filters, filter_width, dilation_rate):
super(ResidualBlock, self).__init__()
self.n_filters = n_filters
self.filter_width = filter_width
self.dilation_rate = dilation_rate
# preprocessing - equivalent to time-distributed dense
self.x = Conv1D(32, 1, padding='same', activation='relu')
# filter convolution
self.x_f = Conv1D(filters=n_filters,
kernel_size=filter_width,
padding='causal',
dilation_rate=dilation_rate,
activation='tanh')
# gating convolution
self.x_g = Conv1D(filters=n_filters,
kernel_size=filter_width,
padding='causal',
dilation_rate=dilation_rate,
activation='sigmoid')
# postprocessing - equivalent to time-distributed dense
self.z_p = Conv1D(32, 1, padding='same', activation='relu')
def call(self, inputs):
x = self.x(inputs)
f = self.x_f(x)
g = self.x_g(x)
z = tf.multiply(f, g)
z = self.z_p(z)
return tf.add(x, z), z
def get_config(self):
config = super(ResidualBlock, self).get_config()
config.update({"n_filters": self.n_filters,
"filter_width": self.filter_width,
"dilation_rate": self.dilation_rate})
return config
class WaveNet(tf.keras.Model):
def __init__(self, n_filters=32, filter_width=2, dilation_rates=None, drop_out=0.2, pred_length=24):
super().__init__(name='WaveNet')
# Layer Parameter
self.n_filters = n_filters
self.filter_width = filter_width
self.drop_out = drop_out
self.pred_length = pred_length
if dilation_rates is None:
self.dilation_rates = [2 ** i for i in range(8)]
else:
self.dilation_rates = dilation_rates
# Layer
self.residual_stacks = []
for dilation_rate in self.dilation_rates:
self.residual_stacks.append(ResidualBlock(self.n_filters, self.filter_width, dilation_rate))
# self.add = Add()
self.cut = Lambda(slice, arguments={'seq_length': pred_length})
self.conv_1 = Conv1D(128, 1, padding='same')
self.relu = Activation('relu')
self.drop = Dropout(drop_out)
self.skip = Lambda(lambda x: x[:, -2 * pred_length + 1:-pred_length + 1, :1])
self.conv_2 = Conv1D(1, 1, padding='same')
def _unroll(self, inputs, **kwargs):
outputs = inputs
skips = []
for residual_block in self.residual_stacks:
outputs, z = residual_block(outputs)
skips.append(z)
outputs = self.relu(Add()(skips))
outputs = self.cut(outputs)
outputs = self.conv_1(outputs)
outputs = self.relu(outputs)
outputs = self.drop(outputs)
outputs = Concatenate()([outputs, self.skip(inputs)])
outputs = self.conv_2(outputs)
outputs = self.cut(outputs)
return outputs
def _get_output(self, input_tensor):
pass
def call(self, inputs, training=False, **kwargs):
if training:
return self._unroll(inputs)
else:
return self._get_output(inputs)
Train step
model = WaveNet()
model.compile(Adam(), loss=loss)
# ok
history = model.fit(train_x, train_y,
batch_size=batch_size,
epochs=epochs,
callbacks=[cp_callback] if save else None)
# ok
result = model.predict(test_x)
# error
model.save("path")
I'm currently testing some modified versions of dropout in Keras, and one of them involves adjusting the weights during the training of a customized dense layer. However, I have not been able to run it without errors yet. I suspect it has something to do with eager execution, but I'm not sure.
class Linear(keras.layers.Layer):
def __init__(self, units, **kwargs):
super(Linear, self).__init__(**kwargs)
self.units = units
def build(self, input_shape):
self.w = self.add_weight(
shape=(input_shape[-1], self.units),
initializer="random_normal",
trainable=True,
)
self.b = self.add_weight(
shape=(self.units,), initializer="random_normal", trainable=True
)
def call(self, inputs, training=False):
prob = 0.0/10
if training:
w = np.matrix(self.w)
# w = self.w
shape = w.shape
size = shape[0] * shape[1]
arr = np.random.choice([0,1], size=size, p=[prob, 1 - prob]) #random array of 1's and 0's
arr = arr.reshape(shape) #reshape it to same dimensions as weights
new_weights = np.multiply(arr, w) #element wise multiplication
self.w = new_weights
return tf.matmul(inputs, self.w) + self.b
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(128, (3, 3), activation='relu',padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(4, (3, 3), activation='relu',padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(Linear(3)) #Custom layer
model.add(layers.Dense(10, activation='softmax'))
model.compile(loss = 'CategoricalCrossentropy',
optimizer = 'adam',
metrics=['accuracy'])
epochs = 1
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)
Error: TypeError: Expected binary or unicode string, got <tf.Tensor 'sequential_3/linear_3/mul:0' shape=(4, 3) dtype=float32>
self.w has to remain a tf.Variable. However, after the multiplication in call() it becomes a tf.Tensor. You just need to find another way to do the same thing in call().
Try this code:
def call(self, inputs, training=False):
prob = 0.0/10
if training:
w = np.matrix(self.w)
shape = w.shape
size = shape[0] * shape[1]
arr = np.random.choice([0,1], size=size, p=[prob, 1 - prob]) #random array of 1's and 0's
arr = arr.reshape(shape) #reshape it to same dimensions as weights
# CHANGED 3 LINES BELOW:
arr = tf.convert_to_tensor(arr, dtype=tf.float32)
new_weights = tf.multiply(arr, self.w)
self.w.assign(new_weights) # Assign preserves tf.Variable
return tf.matmul(inputs, self.w) + self.b
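If the intent is a dropout-style mask that should only affect the current forward pass (an assumption about the goal, not something stated above), another option is to skip the assignment entirely and use the masked weights just for this multiplication, leaving self.w itself unchanged:
# instead of self.w.assign(new_weights):
masked_w = tf.multiply(arr, self.w)   # arr has already been converted to a float32 tensor above
return tf.matmul(inputs, masked_w) + self.b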
I want to build a network like the one in the picture.
Here the height is 200, but I would like the width of the network to depend on the input (i.e. be None).
This network is different from an ordinary CNN.
The ConvCapsuleLayer is defined as follows:
class ConvCapsuleLayer(layers.Layer):
def __init__(self, kernel_size, num_capsule, num_atoms, strides=1, padding='same', routings=3,
kernel_initializer='he_normal', **kwargs):
super(ConvCapsuleLayer, self).__init__(**kwargs)
self.kernel_size = kernel_size
self.num_capsule = num_capsule
self.num_atoms = num_atoms
self.strides = strides
self.padding = padding
self.routings = routings
self.kernel_initializer = initializers.get(kernel_initializer)
def build(self, input_shape):
assert len(input_shape) == 5, "The input Tensor should have shape=[None, input_height, input_width," \
" input_num_capsule, input_num_atoms]"
self.input_height = input_shape[1]
self.input_width = input_shape[2] #None
self.input_num_capsule = input_shape[3]
self.input_num_atoms = input_shape[4]
# Transform matrix
self.W = self.add_weight(shape=[self.kernel_size, self.kernel_size,
self.input_num_atoms, self.num_capsule * self.num_atoms],
initializer=self.kernel_initializer,
name='W')
self.b = self.add_weight(shape=[1, 1, self.num_capsule, self.num_atoms],
initializer=initializers.constant(0.1),
name='b')
self.built = True
def call(self, input_tensor, training=None):
input_transposed = tf.transpose(input_tensor, [3, 0, 1, 2, 4]) #[capsule_num, batch_num, input_height, input_width, atoms_num]
input_shape = K.shape(input_transposed)
input_tensor_reshaped = K.reshape(input_transposed, [
input_shape[0] * input_shape[1], self.input_height, -1, self.input_num_atoms]) #[capsule_num*batch_num, input_height, input_width, atoms_num]
input_tensor_reshaped.set_shape((None, self.input_height, None, self.input_num_atoms))
conv = K.conv2d(input_tensor_reshaped, self.W, (self.strides, self.strides),
padding=self.padding, data_format='channels_last') #kernel_shape: shape of self.W
votes_shape = K.shape(conv)
_, conv_height, conv_width, _ = conv.get_shape()
votes = K.reshape(conv, [input_shape[1], input_shape[0], votes_shape[1], votes_shape[2],
self.num_capsule, self.num_atoms]) #[input_batch_num, input_capsule_num, input_height, input_width, num_capsule, num_atoms]
votes.set_shape((None, self.input_num_capsule, conv_height.value, conv_width,
self.num_capsule, self.num_atoms))
logit_shape = K.stack([
input_shape[1], input_shape[0], votes_shape[1], votes_shape[2], self.num_capsule])
biases_replicated = K.tile(self.b, [conv_height.value, -1 , 1, 1]) #[input_height, input_width, num_capsule, num_atoms]
activations = update_routing(
votes=votes,
biases=biases_replicated,
logit_shape=logit_shape,
num_dims=6,
input_dim=self.input_num_capsule,
output_dim=self.num_capsule,
num_routing=self.routings)
return activations
def compute_output_shape(self, input_shape):
space = input_shape[1:-2]
new_space = []
for i in range(len(space)):
new_dim = conv_output_length(
space[i],
self.kernel_size,
padding=self.padding,
stride=self.strides,
dilation=1)
new_space.append(new_dim)
return (input_shape[0],) + tuple(new_space) + (self.num_capsule, self.num_atoms)
def get_config(self):
config = {
'kernel_size': self.kernel_size,
'num_capsule': self.num_capsule,
'num_atoms': self.num_atoms,
'strides': self.strides,
'padding': self.padding,
'routings': self.routings,
'kernel_initializer': initializers.serialize(self.kernel_initializer)
}
base_config = super(ConvCapsuleLayer, self).get_config()
return dict(list(base_config.items()) + list(config.items()))
There is an error in the line biases_replicated = K.tile(self.b, [conv_height.value, -1, 1, 1]):
InvalidArgumentError: Expected multiples[1] >= 0, but got -1
[[{{node conv_cap_4_3_1/Tile}} = Tile[T=DT_FLOAT, Tmultiples=DT_INT32, _class=["loc:#training/Adam/gradients/conv_cap_4_3_1/Tile_grad/Sum"], _device="/job:localhost/replica:0/task:0/device:GPU:0"](conv_cap_4_3_1/b/read, conv_cap_4_3_1/Tile/multiples)]]
[[{{node metrics/acc/Mean/_375}} = _Recv[client_terminated=false, recv_device="/job:localhost/replica:0/task:0/device:CPU:0", send_device="/job:localhost/replica:0/task:0/device:GPU:0", send_device_incarnation=1, tensor_name="edge_8437_metrics/acc/Mean", tensor_type=DT_FLOAT, _device="/job:localhost/replica:0/task:0/device:CPU:0"]()]]
However, if I change the -1 to 1 in the tile call, it is not what I want: the shape becomes (200, 1, 2, 16), but the shape I want is (200, width_size, 2, 16). How can I do this?
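A minimal sketch of one way this might be handled, assuming the runtime width can be read from votes_shape (which already comes from K.shape(conv)) instead of from the static shape, and then passed to the tile as a tensor of multiples:
# inside call(), after votes_shape = K.shape(conv):
multiples = K.stack([votes_shape[1], votes_shape[2], 1, 1])   # [conv_height, conv_width, 1, 1], known only at run time
biases_replicated = tf.tile(self.b, multiples)                # -> (conv_height, conv_width, num_capsule, num_atoms)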