I'm currently testing some modified versions of dropout in Keras, and one of them involves adjusting the weights during the training of a customized dense layer. However, I have not been able to run it without error yet. I suspect it has something to do with eager execution, but I'm not sure.
class Linear(keras.layers.Layer):
    def __init__(self, units, **kwargs):
        super(Linear, self).__init__(**kwargs)
        self.units = units

    def build(self, input_shape):
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs, training=False):
        prob = 0.0 / 10
        if training:
            w = np.matrix(self.w)
            # w = self.w
            shape = w.shape
            size = shape[0] * shape[1]
            arr = np.random.choice([0, 1], size=size, p=[prob, 1 - prob])  # random array of 1's and 0's
            arr = arr.reshape(shape)  # reshape it to the same dimensions as the weights
            new_weights = np.multiply(arr, w)  # element-wise multiplication
            self.w = new_weights
        return tf.matmul(inputs, self.w) + self.b
model = models.Sequential()
model.add(layers.Conv2D(32, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(64, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(128, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Conv2D(4, (3, 3), activation='relu', padding='same'))
model.add(layers.MaxPooling2D())
model.add(layers.Flatten())
model.add(Linear(3))  # custom layer
model.add(layers.Dense(10, activation='softmax'))

model.compile(loss='CategoricalCrossentropy',
              optimizer='adam',
              metrics=['accuracy'])

epochs = 1
history = model.fit(train_dataset, validation_data=validation_dataset, epochs=epochs)
Error: TypeError: Expected binary or unicode string, got <tf.Tensor 'sequential_3/linear_3/mul:0' shape=(4, 3) dtype=float32>
self.w has to be a tf.Variable. However, after the multiplication in call() it becomes a tf.Tensor, so you need to find another way to do the same thing in call().
Try this code:
def call(self, inputs, training=False):
    prob = 0.0 / 10
    if training:
        w = np.matrix(self.w)
        shape = w.shape
        size = shape[0] * shape[1]
        arr = np.random.choice([0, 1], size=size, p=[prob, 1 - prob])  # random array of 1's and 0's
        arr = arr.reshape(shape)  # reshape it to the same dimensions as the weights
        # CHANGED 3 LINES BELOW:
        arr = tf.convert_to_tensor(arr, dtype=tf.float32)
        new_weights = tf.multiply(arr, self.w)
        self.w.assign(new_weights)  # assign() preserves the tf.Variable
    return tf.matmul(inputs, self.w) + self.b
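Note that assign() overwrites the stored weights in place, so zeroed entries accumulate across training steps (also, prob = 0.0/10 evaluates to 0.0, meaning nothing is ever dropped). If the intent is dropout-style masking that leaves the underlying weights intact, here is a minimal sketch of a call() that builds the mask with TensorFlow ops and never mutates self.w; the drop probability of 0.1 is an assumed placeholder:

def call(self, inputs, training=False):
    prob = 0.1  # assumed drop probability, for illustration only
    if training:
        # Bernoulli mask of 0's and 1's with the same shape as the kernel
        mask = tf.cast(tf.random.uniform(tf.shape(self.w)) >= prob, self.w.dtype)
        w = self.w * mask  # a plain tensor; self.w itself stays a tf.Variable
    else:
        w = self.w
    return tf.matmul(inputs, w) + self.b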
In this case, I want to use tf.matmul between the weights and a mathematical operation. I have the following custom layer:
rotationSpeed = 2 * np.pi * 70  # Hz

class PhysicalLayer(keras.layers.Layer):
    def __init__(self, units=1):
        super(PhysicalLayer, self).__init__()
        self.units = units

    def build(self, input_shape):
        print(input_shape[0])
        self.w = self.add_weight(
            shape=(input_shape[-1], self.units),
            initializer="random_normal",
            trainable=True,
        )
        self.b = self.add_weight(
            shape=(self.units,), initializer="random_normal", trainable=True
        )

    def call(self, inputs):
        rotationSpeedSquare = tf.math.square(rotationSpeed)
        maximumVibration = tf.math.reduce_max(inputs, axis=1, keepdims=True)
        stiff = rotationSpeedSquare / maximumVibration
        return tf.matmul(stiff, self.w) + self.b
and the following model:
class modelMaximum(tf.keras.Model):
    def __init__(self, num_classes=50):
        super(modelMaximum, self).__init__()
        self.dense1 = tf.keras.layers.Dense(num_classes, activation=tf.nn.relu)
        self.physical = PhysicalLayer()
        self.dense2 = tf.keras.layers.Dense(128, activation=tf.nn.relu)
        self.dense3 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense4 = tf.keras.layers.Dense(64, activation=tf.nn.relu)
        self.dense5 = tf.keras.layers.Dense(32, activation=tf.nn.relu)
        self.dense6 = tf.keras.layers.Dense(1)

    def call(self, inputs):
        x = self.dense1(inputs)
        x = self.physical(x)
        x = self.dense2(x)
        x = self.dense3(x)
        x = self.dense4(x)
        x = self.dense5(x)
        return self.dense6(x)
When I try to fit this model with the training set:
modelMax = modelMaximum()
modelMax.compile(optimizer='adam', loss='mae', metrics=[tf.keras.metrics.RootMeanSquaredError()])
modelMax.fit(train, trainRUL, batch_size=64, epochs=50, verbose=1)
I obtain the following error:
ValueError: Dimensions must be equal, but are 1 and 50 for '{{node model_maximum_1/physical_layer_1/MatMul}} = MatMul[T=DT_FLOAT, transpose_a=false, transpose_b=false](model_maximum_1/physical_layer_1/truediv, model_maximum_1/physical_layer_1/MatMul/ReadVariableOp)' with input shapes: [?,1], [50,1].
How should I fix this problem?
Thanks
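For context on the error: tf.matmul requires the inner dimensions to agree, and stiff has shape [batch, 1] while self.w has shape (input_shape[-1], units) = (50, 1). A sketch of a build() whose kernel shape matches stiff (an assumption about the intended design, not a confirmed fix):

def build(self, input_shape):
    # stiff = rotationSpeedSquare / maximumVibration has shape (batch, 1),
    # so the kernel's first dimension must be 1 rather than input_shape[-1]
    self.w = self.add_weight(
        shape=(1, self.units), initializer="random_normal", trainable=True
    )
    self.b = self.add_weight(
        shape=(self.units,), initializer="random_normal", trainable=True
    )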
This is my attention layer code:
class Attention(nn.Module):
    def __init__(self, feature_dim, step_dim, bias=True, **kwargs):
        super(Attention, self).__init__(**kwargs)
        self.supports_masking = True
        self.bias = bias
        self.feature_dim = feature_dim
        self.step_dim = step_dim
        self.features_dim = 0
        weight = torch.zeros(feature_dim, 1)
        nn.init.kaiming_uniform_(weight)
        self.weight = nn.Parameter(weight)
        if bias:
            self.b = nn.Parameter(torch.zeros(step_dim))

    def forward(self, x, mask=None):
        feature_dim = self.feature_dim
        step_dim = self.step_dim
        eij = torch.mm(
            x.contiguous().view(-1, feature_dim),
            self.weight
        ).view(-1, step_dim)
        if self.bias:
            eij = eij + self.b
        eij = torch.tanh(eij)
        a = torch.exp(eij)
        if mask is not None:
            a = a * mask
        a = a / (torch.sum(a, 1, keepdim=True) + 1e-10)
        weighted_input = x * torch.unsqueeze(a, -1)
        return torch.sum(weighted_input, 1)
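(For reference, a quick shape check of this layer with made-up sizes: for x of shape (batch, step_dim, feature_dim) = (32, 50, 128), eij comes out as (32, 50), the attention weights a are (32, 50), and the returned weighted sum is (32, 128).)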
This is the RNN code:
# Instantiate the model w/ hyperparams
weights_matrix = weights_matrix
output_size = 13  # number of classes to predict
hidden_dim = 64
drop_prob = 0.5

# The RNN model that will be used to perform classification
class AttentionLSTM(nn.Module):
    def __init__(self, weights_matrix, output_size, hidden_dim, drop_prob):
        super(AttentionLSTM, self).__init__()
        # embedding layers
        self.embedding, self.num_embeddings, self.embeddings_size = create_emb_layer(weights_matrix, True)
        # embedding dropout
        self.dropout = nn.Dropout2d(drop_prob)
        # first LSTM and GRU layers
        self.lstm1 = nn.LSTM(self.embeddings_size, hidden_dim, batch_first=True, bidirectional=True)
        self.gru1 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        # attention layer
        self.attention = Attention(hidden_dim * 2, seq_length)
        # second LSTM and GRU layers
        self.lstm2 = nn.LSTM(hidden_dim * 2, hidden_dim, batch_first=True, bidirectional=True)
        self.gru2 = nn.GRU(hidden_dim * 2, hidden_dim, bidirectional=True, batch_first=True)
        # linear layers
        self.fc = nn.Linear(hidden_dim * 2, hidden_dim * 2)
        self.out = nn.Linear(hidden_dim * 2, output_size)
        # activation functions
        self.sigmoid = nn.Sigmoid()  # for hidden layers
        self.softmax = nn.Softmax(dim=1)  # for output layer

    def forward(self, x):
        batch_size = x.size(0)
        # embedding output
        x = x.long()
        embeds = self.embedding(x)
        embeds = torch.squeeze(torch.unsqueeze(embeds, 0))
        # LSTM and GRU outputs
        lstm_out1, _ = self.lstm1(embeds)
        gru_out1, _ = self.gru1(lstm_out1)
        gru_out1 = gru_out1.view(batch_size, -1, hidden_dim * 2)
        attention_out = self.attention(gru_out1, seq_length)
        attention_out = attention_out.view(batch_size, -1, hidden_dim * 2)
        attention_out = self.sigmoid(attention_out)
        lstm_out2, _ = self.lstm2(attention_out)
        # slice lstm_out to just get the output of the last element of the input sequence
        lstm_out2 = lstm_out2[:, -1]
        gru_out2, _ = self.gru2(lstm_out2)
        # linear outputs
        fc_out = self.softmax(self.fc(gru_out2))
        final_out = self.out(fc_out)
        return final_out
I am sure that my dataset is balanced after the pre-processing step, but my model always predicts the same output. Precision and F-score change for each input; however, this problem makes my recall score 1.0, since the output is always the same whatever the input is.
I would appreciate any help.
It took some time to build a network from your requirements, but here are a few samples for creating a custom layer or model. You start from an embedding layer, and random slices of the data create a different input every time. GRU and LSTM learning layers may provide good results when they have:
- Matching input and target layers and parameters.
- Learning scope that lets them differentiate the input and the repetition of the gated recurrence; LSTM is specifically useful when the patterns in the data are significant, such as pictures or continuous data.
- Suitable activations: Linear and Sigmoid provide contrasting differentiation, and softmax is sometimes required when values are compared as a distribution; this is supposed to create contrasting output, with softmax applied to the weighted values.
- A loss function that matches the output dimension/expectation.
[ Sample ]:
class create_emb_layer(tf.keras.layers.Embedding):
    def __init__(self, weights_matrix, bidirectional=True):
        self.num_embeddings = weights_matrix[0]
        self.embeddings_size = weights_matrix[1]
        self.bidirectional = bidirectional
        super(create_emb_layer, self).__init__(self.embeddings_size, self.num_embeddings)

    def build(self, input_shape):
        self.kernel = self.add_weight("kernel",
                                      shape=[int(input_shape[-1]), self.input_dim])

    def call(self, inputs):
        return tf.matmul(inputs, self.kernel)
[ My model ]:
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
: Model Initialize
"""""""""""""""""""""""""""""""""""""""""""""""""""""""""
model = tf.keras.models.Sequential([
    tf.keras.layers.InputLayer(input_shape=(32, 32, 4)),
    tf.keras.layers.Normalization(mean=3., variance=2.),
    tf.keras.layers.Normalization(mean=4., variance=6.),
    tf.keras.layers.Conv2D(32, (3, 3), activation='relu'),
    tf.keras.layers.MaxPooling2D((2, 2)),
    tf.keras.layers.Dense(128, activation='relu'),
    tf.keras.layers.Reshape((128, 225)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96, return_sequences=True, return_state=False)),
    tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(96)),
    tf.keras.layers.Flatten(),
    tf.keras.layers.Dense(192, activation='relu'),
    tf.keras.layers.Dense(10),
])
Recently, I have been trying to implement CDCN (CVPR 2020) using TensorFlow 2.8 + Python 3.9. This requires my custom layer to access the current Conv2D layer's weights.
However, when I try to add my custom layer to the sequential model, an error occurs: NotImplementedError: numpy() is only available when eager execution is enabled.
This is my code. Can anyone help me? I have tried adding tf.compat.v1.enable_eager_execution(), but it doesn't work.
import numpy as np
import tensorflow.keras as tfk
import tensorflow as tf

class CDC(tfk.layers.Layer):
    def __init__(self, output_dim, kernel_size=(3, 3), padding='same', activation=None, theta=0.7, **kwargs):
        super(CDC, self).__init__()
        self.theta = theta
        self.activation = None
        self.output_dim = output_dim
        self.kernel_size = kernel_size
        self.padding = padding
        if activation is not None:
            self.activation = tfk.activations.get(activation)

    def build(self, input_shape):
        self.conv = tfk.layers.Conv2D(self.output_dim, self.kernel_size, padding=self.padding, input_shape=input_shape)
        self.conv.build(input_shape=input_shape)
        self._kernel = self.conv.kernel
        super(CDC, self).build(input_shape)
        self.built = True

    def call(self, inputs, training=None, mask=None):
        vanillaOutput = self.conv(inputs)
        weightSum = self.conv.kernel.numpy().sum(axis=0).sum(axis=0).sum(axis=0)
        weightSum = np.reshape(weightSum, (1, 1, 1, self.output_dim))
        weightSum = tf.constant(weightSum, dtype=tf.float32)
        cDiff = tf.nn.conv2d(inputs, filters=weightSum, strides=self.conv.strides, padding=self.conv.padding.upper())
        result = vanillaOutput - self.theta * cDiff
        if self.activation is not None:
            return self.activation(result)
        return vanillaOutput
If you just want the sum of all the elements in the kernel, use tf.math.reduce_sum().
Replace the lines
weightSum = self.conv.kernel.numpy().sum(axis=0).sum(axis=0).sum(axis=0)
weightSum = np.reshape(weightSum, (1, 1, 1, self.output_dim))
weightSum = tf.constant(weightSum, dtype=tf.float32)
with:

weightSum = tf.math.reduce_sum(self.conv.kernel, axis=[0, 1, 2])  # sum over height, width, and input channels
weightSum = tf.reshape(weightSum, (1, 1, 1, self.output_dim))
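Putting it together, here is a sketch of a graph-safe call() with the .numpy() path replaced by pure TensorFlow ops; everything else is unchanged from the question, except that returning result instead of vanillaOutput in the last line is my assumption about the intent:

def call(self, inputs, training=None, mask=None):
    vanillaOutput = self.conv(inputs)
    # sum the kernel over height, width, and input channels -> shape (output_dim,)
    weightSum = tf.math.reduce_sum(self.conv.kernel, axis=[0, 1, 2])
    weightSum = tf.reshape(weightSum, (1, 1, 1, self.output_dim))
    cDiff = tf.nn.conv2d(inputs, filters=weightSum, strides=self.conv.strides, padding=self.conv.padding.upper())
    result = vanillaOutput - self.theta * cDiff
    if self.activation is not None:
        return self.activation(result)
    return result  # assumed intent; the original returned vanillaOutput here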
I implemented self-attention in TensorFlow Keras, initially with just one function and later with the help of a class; that is, I implemented the method in two distinct ways (1: function and 2: class). Let me present both approaches first, and then I will describe the problem.
What is my task:
My goal is to process TensorSpec(shape=(None, 8, 6, 64)) (8 time stamps, one by one, each of shape (6 * 64)) through self-attention, get a self-attention feature map for every time stamp, and then concatenate them again into an output tensor of shape (None, 8, 6, 64).
First implementation, with the help of a function:
def conv1d(x, channels, ks=1, strides=1, padding='same'):
    conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
                                  kernel_initializer='HeNormal')(x)
    return conv

# Self attention
def my_self_attention(x, channels):
    size = x.shape
    x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
    f = conv1d(x, channels)
    g = conv1d(x, channels)
    h = conv1d(x, channels)
    attention_weights = tf.keras.activations.softmax(
        tf.matmul(g, Permute((2, 1))(f)))  # multiply query with key, then apply softmax
    sensor_att_fm = tf.matmul(attention_weights, h)
    gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
    o = gamma * sensor_att_fm + x
    return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])

refined_fm = tf.concat([my_self_attention(tf.expand_dims(my_input[:, t, :, :], 1), 64) for t in range(my_input.shape[1])], 1)
Second implementation, with the help of a class:
def conv1d(channels, ks=1, strides=1, padding='same'):
    conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
                                  kernel_initializer='HeNormal')
    return conv

class my_self_attention(tf.keras.layers.Layer):
    def __init__(self, channels):
        super(my_self_attention, self).__init__()
        self.query = conv1d(channels)
        self.key = conv1d(channels)
        self.value = conv1d(channels)
        self.gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))

    def call(self, x):
        x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
        f = self.query(x)
        g = self.key(x)
        h = self.value(x)
        attention_weights = tf.keras.activations.softmax(
            tf.matmul(g, Permute((2, 1))(f)))  # multiply query with key, then apply softmax
        sensor_att_fm = tf.matmul(attention_weights, h)
        o = self.gamma * sensor_att_fm + x
        return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])

sa = my_self_attention(channels)
refined_fm = tf.concat([sa(tf.expand_dims(my_input[:, t, :, :], 1)) for t in range(my_input.shape[1])], 1)
Problem
From my perspective, I implemented the same method in two separate ways, so the models' performance should be similar. However, performance dropped by over 3% with the class implementation. I'm not sure why. Could someone please explain?
The first method has far more operations, layers, and trainable weights, since my_self_attention creates fresh Conv1D layers each time it is called in the loop over timesteps. Check out model.summary() and you will quickly see the differences:
First model with way more parameters:
import tensorflow as tf
from tensorflow.keras.layers import Permute

def conv1d(x, channels, ks=1, strides=1, padding='same'):
    conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
                                  kernel_initializer='HeNormal')(x)
    return conv

def my_self_attention(x, channels):
    size = x.shape
    x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
    f = conv1d(x, channels)
    g = conv1d(x, channels)
    h = conv1d(x, channels)
    attention_weights = tf.keras.activations.softmax(
        tf.matmul(g, Permute((2, 1))(f)))  # multiply query with key, then apply softmax
    sensor_att_fm = tf.matmul(attention_weights, h)
    gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))
    o = gamma * sensor_att_fm + x
    return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])

inputs = tf.keras.layers.Input((8, 6, 64))
outputs = tf.concat([my_self_attention(tf.expand_dims(inputs[:, t, :, :], 1), 64) for t in range(inputs.shape[1])], 1)
model = tf.keras.Model(inputs, outputs)
print(model.summary())
....
Total params: 98,304
Trainable params: 98,304
Non-trainable params: 0
__________________________________________________________________________________________________
None
Second model with fewer parameters:
def conv1d(channels, ks=1, strides=1, padding='same'):
    conv = tf.keras.layers.Conv1D(channels, ks, strides, padding, activation='relu', use_bias=False,
                                  kernel_initializer='HeNormal')
    return conv

class my_self_attention(tf.keras.layers.Layer):
    def __init__(self, channels):
        super(my_self_attention, self).__init__()
        self.query = conv1d(channels)
        self.key = conv1d(channels)
        self.value = conv1d(channels)
        self.gamma = tf.compat.v1.get_variable("gamma", [1], initializer=tf.constant_initializer(0.0))

    def call(self, x):
        x = tf.reshape(x, shape=[-1, x.shape[2], x.shape[3]])
        f = self.query(x)
        g = self.key(x)
        h = self.value(x)
        attention_weights = tf.keras.activations.softmax(
            tf.matmul(g, Permute((2, 1))(f)))  # multiply query with key, then apply softmax
        sensor_att_fm = tf.matmul(attention_weights, h)
        o = self.gamma * sensor_att_fm + x
        return tf.reshape(o, shape=[-1, 1, x.shape[1], x.shape[2]])

inputs = tf.keras.layers.Input((8, 6, 64))
sa = my_self_attention(64)
outputs = tf.concat([sa(tf.expand_dims(inputs[:, t, :, :], 1)) for t in range(inputs.shape[1])], 1)
model = tf.keras.Model(inputs, outputs)
print(model.summary())
...
Total params: 12,289
Trainable params: 12,289
Non-trainable params: 0
__________________________________________________________________________________________________
None
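The numbers line up with the layer counts: the functional version builds 3 Conv1D layers (query, key, value) for each of the 8 timesteps, i.e. 8 × 3 × (64 × 64) = 98,304 weights, while the class version builds them once, i.e. 3 × (64 × 64) + 1 gamma = 12,289.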
I am trying to concatenate a BERT model with a 1D CNN using PyTorch. I used this code, but I do not understand the meaning of in_channels and out_channels in nn.Conv1d, given that the input shape into the CNN model is torch.Size([256, 64, 768]).
class MixModel(nn.Module):
    def __init__(self, pre_trained='distilbert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=1, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=256 - 5 + 1)
        self.dropout = nn.Dropout(0.3)
        self.clf = nn.Linear(self.hidden_size * 2, 6)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs
        # x = torch.cat(cls_hs[0])  # x = [416, 64, 768]
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x
Edit
I used the recommended answer and changed the parameters, but I got an error:
class MixModel(nn.Module):
    def __init__(self, pre_trained='bert-base-uncased'):
        super().__init__()
        self.bert = AutoModel.from_pretrained('distilbert-base-uncased')
        self.hidden_size = self.bert.config.hidden_size
        self.conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5, padding='valid', stride=1)
        self.relu = nn.ReLU()
        self.pool = nn.MaxPool1d(kernel_size=64 - 5 + 1)
        print(11)
        self.dropout = nn.Dropout(0.3)
        print(12)
        self.clf = nn.Linear(self.hidden_size * 2, 6)
        print(13)

    def forward(self, inputs, mask, labels):
        cls_hs = self.bert(input_ids=inputs, attention_mask=mask, return_dict=False)
        x = cls_hs[0]
        print(cls_hs[0])
        print(len(cls_hs[0]))
        print(cls_hs[0].size())
        # x = torch.cat(cls_hs, 0)  # x = [416, 64, 768]
        x = x.permute(0, 2, 1)
        x = self.conv(x)
        x = self.relu(x)
        x = self.pool(x)
        x = self.dropout(x)
        x = self.clf(x)
        return x
The error is:
5 frames
/usr/local/lib/python3.7/dist-packages/torch/nn/functional.py in linear(input, weight, bias)
1846 if has_torch_function_variadic(input, weight, bias):
1847 return handle_torch_function(linear, (input, weight, bias), input, weight, bias=bias)
-> 1848 return torch._C._nn.linear(input, weight, bias)
1849
1850
RuntimeError: mat1 and mat2 shapes cannot be multiplied (65536x1 and 1536x6)
The dimension of the output prediction of BERT (and many other transformer-based models) is batch × seq-len × feature-dim: that is, your input is a batch of 256 sequences of 64 tokens each (probably with padding), where each token is represented by a feature vector of dimension 768.
In order to apply a 1D convolution along the sequence-length dimension, you first need to permute x to be of shape batch × dim × len:
x = x.permute(0, 2, 1)
Now you can apply nn.Conv1d, where in_channels is the feature dimension of x, i.e. 768. The out_channels is up to you: it is going to be the hidden dimension of your model.
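A minimal, self-contained sketch of the shape bookkeeping, using the sizes from the question (the tensor is random, purely to illustrate the shapes):

import torch
import torch.nn as nn

x = torch.randn(256, 64, 768)  # (batch, seq_len, feature_dim), as BERT returns it
x = x.permute(0, 2, 1)         # -> (256, 768, 64): channels must come second for Conv1d
conv = nn.Conv1d(in_channels=768, out_channels=256, kernel_size=5)
y = conv(x)                    # -> (256, 256, 60): 64 - 5 + 1 = 60 positions survive a 'valid' conv
print(y.shape)                 # torch.Size([256, 256, 60])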