Why LSTM+transformer is not working well?

Why LSTM+transformer is not working well? - python

I'm trying to combine an LSTM with a transformer for binary classification, but it does not perform better than a plain LSTM model, and sometimes it performs even worse. The training data has shape (3014, 48, 178) and is time-series medical data. The following code is for the transformer.
class TokenAndPositionEmbedding(layers.Layer):
    """Add a learned position embedding to a learned token embedding.

    Args:
        maxlen: number of rows in the position-embedding table.
        vocab_size: number of rows in the token-embedding table.
        embed_dim: output width of both embedding tables.
    """

    def __init__(self, maxlen, vocab_size, embed_dim):
        super().__init__()
        self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
        self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)

    def call(self, x):
        """Return token embeddings of `x` plus position embeddings."""
        # Position ids run over the size of the LAST axis of `x`.
        seq_len = tf.shape(x)[-1]
        position_ids = tf.range(start=0, limit=seq_len, delta=1)
        position_vectors = self.pos_emb(position_ids)
        token_vectors = self.token_emb(x)
        return token_vectors + position_vectors
class TransformerBlock(layers.Layer):
    """Self-attention plus feed-forward block with residuals and layer norm.

    Args:
        embed_dim: `key_dim` of the attention layer and output width of the
            feed-forward network.
        num_heads: number of attention heads.
        ff_dim: hidden width of the feed-forward network.
        rate: dropout rate applied after attention and after the
            feed-forward network.
    """

    def __init__(self, embed_dim, num_heads, ff_dim, rate=0.001):
        super().__init__()
        self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
        self.ffn = Sequential(
            [
                layers.Dense(ff_dim, activation="relu"),
                layers.Dense(embed_dim),
            ]
        )
        self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
        self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
        self.dropout1 = layers.Dropout(rate)
        self.dropout2 = layers.Dropout(rate)

    def call(self, inputs, training):
        """Apply attention and feed-forward sub-layers to `inputs`."""
        # Sub-layer 1: self-attention (query == value == inputs).
        attended = self.dropout1(self.att(inputs, inputs), training=training)
        residual = self.layernorm1(inputs + attended)
        # Sub-layer 2: position-wise feed-forward network.
        transformed = self.dropout2(self.ffn(residual), training=training)
        return self.layernorm2(residual + transformed)
class PositionEmbeddingFixedWeights(layers.Layer):
    """Non-trainable sinusoidal position embedding.

    Args:
        sequence_length: number of positions in the lookup table.
        output_dim: width of each position vector.
        **kwargs: forwarded to the base layer constructor.
    """

    def __init__(self, sequence_length, output_dim, **kwargs):
        super().__init__(**kwargs)
        sinusoid_table = self.get_position_encoding(sequence_length, output_dim)
        self.position_embedding_layer = layers.Embedding(
            input_dim=sequence_length,
            output_dim=output_dim,
            weights=[sinusoid_table],
            trainable=False,
        )

    def get_position_encoding(self, seq_len, d, n=10000):
        """Build a (seq_len, d) table of interleaved sin/cos values.

        Even columns hold sin, odd columns hold cos. When `d` is odd the
        last column stays zero because only int(d/2) pairs are filled.
        """
        table = np.zeros((seq_len, d))
        pair_count = int(d / 2)
        for pos in range(seq_len):
            for pair in np.arange(pair_count):
                scale = np.power(n, 2 * pair / d)
                table[pos, 2 * pair] = np.sin(pos / scale)
                table[pos, 2 * pair + 1] = np.cos(pos / scale)
        return table

    def call(self, inputs):
        """Return the position vectors for the second-to-last axis of `inputs`.

        Only the position encodings are returned; they are not added to
        `inputs` here.
        """
        position_ids = tf.range(tf.shape(inputs)[-2])
        return self.position_embedding_layer(position_ids)
Model code is
# Stack: batch norm -> GRU (full sequence) -> transformer block -> mean pool
# over the sequence axis -> dense head with a sigmoid output.
# NOTE(review): the surrounding text mentions an LSTM, but the recurrent layer
# used here is a GRU.
# NOTE(review): TransformerBlock's positional arguments bind to embed_dim=128,
# num_heads=48 and ff_dim=178 — confirm 48 heads / 178 hidden units are
# intended rather than the (48, 178) input shape.
model = Sequential(
    [
        tf.keras.Input(shape=(48, 178)),
        BatchNormalization(),
        tf.keras.layers.GRU(
            units=128,
            activation='tanh',
            dropout=0.5,
            recurrent_dropout=0.5,
            return_sequences=True,
            activity_regularizer=regularizers.L2(0.01),
        ),
        TransformerBlock(128, 48, 178),
        tf.keras.layers.GlobalAveragePooling1D(),
        Dense(60, activation='tanh'),
        Dense(1, activation='sigmoid'),
    ]
)
This has troubled me for a long time. I'm trying to use the LSTM to handle the time-series features and the transformer to learn the importance between features, but it does not seem to work.

Related

How to plot Graph-neural-network model-graph when using tensorflow Model Subclass API with spektral layers?

I am unable to plot a graph neural network. I have seen a few related questions (1, 2, 3) on this topic, but their answers do not apply to graph neural networks.
What makes it different is that the input includes objects of different dimensions, e.g. the properties matrix has dimension [n_nodes, n_node_features], the adjacency matrix has dimension [n_nodes, n_nodes], etc. Here is an example of my model:
class GIN0(Model):
    """Stack of GINConv layers followed by global pooling and a dense head.

    Args:
        channels: width used for every GINConv layer and both dense layers.
        n_layers: total number of GINConv layers (one fixed plus
            `n_layers - 1` extra).
    """

    def __init__(self, channels, n_layers):
        super().__init__()
        self.conv1 = GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
        self.convs = [
            GINConv(channels, epsilon=0, mlp_hidden=[channels, channels])
            for _ in range(1, n_layers)
        ]
        self.pool = GlobalAvgPool()
        self.dense1 = Dense(channels, activation="relu")
        self.dropout = Dropout(0.5)
        self.dense2 = Dense(channels, activation="relu")

    def call(self, inputs):
        """Run the network on an `(x, a, i)` triple.

        `x` and `a` are passed to every convolution; `i` is passed only to
        the pooling layer.
        """
        x, a, i = inputs
        hidden = self.conv1([x, a])
        for extra_conv in self.convs:
            hidden = extra_conv([hidden, a])
        pooled = self.pool([hidden, i])
        return self.dense2(self.dropout(self.dense1(pooled)))
One of the answers in 2 suggested to add build_graph function as follows:
class my_model(Model):
    """VGG16 backbone with a small binary-classification head.

    Args:
        dim: input shape without the batch axis, passed to VGG16 as
            `input_shape` and reused by `build_graph`.
    """

    def __init__(self, dim):
        super(my_model, self).__init__()
        # Remember the shape so build_graph() does not depend on a
        # module-level variable that happens to be called `dim`.
        self._input_dim = tuple(dim)
        self.Base = VGG16(input_shape=(dim), include_top = False, weights = 'imagenet')
        self.GAP = L.GlobalAveragePooling2D()
        self.BAT = L.BatchNormalization()
        self.DROP = L.Dropout(rate=0.1)
        self.DENS = L.Dense(256, activation='relu', name = 'dense_A')
        self.OUT = L.Dense(1, activation='sigmoid')

    def call(self, inputs):
        """Backbone -> global average pool -> batch norm -> dropout -> dense head."""
        x = self.Base(inputs)
        g = self.GAP(x)
        b = self.BAT(g)
        d = self.DROP(b)
        d = self.DENS(d)
        return self.OUT(d)

    # AFAIK: The most convenient method to print model.summary()
    # similar to the sequential or functional API like.
    def build_graph(self):
        """Wrap `call` in a functional Model so summary()/plot_model work."""
        x = Input(shape=tuple(self._input_dim))
        return Model(inputs=[x], outputs=self.call(x))
# Build the subclassed model for 124x124 RGB input and print a summary of
# its functional wrapper.
dim = (124, 124, 3)
model = my_model(dim)
model.build((None,) + dim)
model.build_graph().summary()
However, I am not sure how to define dim or Input Layer using tf.keras.layers.Input for such a hybrid data-structure as described above.
Any suggestions?

Here is the minimal code to plot such a subclassed multi-input model. Note that, as stated in the comment above, there are some issues with your GINConv, which comes from spektral and is not related to the main query. So I will give a general solution for such multi-input modeling scenarios. To make it work with spektral, please reach out to the package author for further discussion.
From the spektral repo, here, I got the shapes of the input tensors.
# Pull one batch and record the shape of each of its three input tensors.
x, y = next(iter(loader_tr))
bs_x, bs_y, bs_z = (list(x[k].shape) for k in range(3))
bs_x, bs_y, bs_z
([1067, 4], [1067, 1067], [1067])
Similar model, it also takes same amount of inputs and with same shape. But without GINConv.
class GIN0(Model):
    """Three-input stand-in model that uses Conv1D instead of GINConv.

    Args:
        channels: filters of both Conv1D layers and units of the first
            dense layer.
        n_layers: accepted for signature parity; not used by this block.

    The output width `n_out` and the input shapes `bs_x`, `bs_y`, `bs_z`
    are read from module-level names.
    """

    def __init__(self, channels, n_layers):
        super().__init__()
        self.conv1 = tf.keras.layers.Conv1D(channels, 3, activation='relu')
        self.conv2 = tf.keras.layers.Conv1D(channels, 3, activation='relu')
        # Create the pooling/concatenate layers once here instead of
        # instantiating new layer objects on every call().
        self.pool_x = tf.keras.layers.GlobalAveragePooling1D()
        self.pool_a = tf.keras.layers.GlobalAveragePooling1D()
        self.concat = tf.keras.layers.Concatenate(axis=1)
        self.dense1 = Dense(channels, activation="relu")
        self.dropout = Dropout(0.5)
        self.dense2 = Dense(n_out, activation="softmax")

    def call(self, inputs):
        """Convolve and pool `x` and `a`, concatenate with `i`, then classify."""
        x, a, i = inputs
        x = self.pool_x(self.conv1(x))
        a = self.pool_a(self.conv2(a))
        x = self.concat([a, x, i])
        x = self.dense1(x)
        x = self.dropout(x)
        return self.dense2(x)

    def build_graph(self):
        """Wrap `call` in a functional Model so summary()/plot_model work."""
        x = tf.keras.Input(shape=bs_x)
        y = tf.keras.Input(shape=bs_y)
        z = tf.keras.Input(shape=bs_z)
        return tf.keras.Model(
            inputs=[x, y, z],
            outputs=self.call([x, y, z])
        )
model = GIN0(channels, layers)

# One (batch, *shape) entry per model input.
input_shapes = [(None, *shape) for shape in (bs_x, bs_y, bs_z)]
model.build(input_shapes)

# OK
model.build_graph().summary()

# OK
tf.keras.utils.plot_model(model.build_graph(), show_shapes=True)

neural network no attribute weight

I have a one hop GCN layer
class GCN_AISUMMER(nn.Module):
    """One-hop graph convolution: linear projection, then Laplacian aggregation.

    Args:
        in_features: input width of the linear projection.
        out_features: output width of the linear projection.
        bias: whether the linear projection has a bias term.
    """

    def __init__(self, in_features, out_features, bias=True):
        super().__init__()
        self.linear = nn.Linear(in_features, out_features, bias=bias)

    def forward(self, X, A):
        """Propagate the graph signal through the layer.

        A: adjacency matrix
        X: graph signal

        torch.bmm is used for aggregation, so the Laplacian and the
        projected signal must be batched 3-D tensors.
        """
        laplacian = create_graph_lapl_norm(A)
        # Row sums of the Laplacian are used as the normaliser.
        row_totals = laplacian.sum(dim=-1, keepdims=True)
        projected = self.linear(X)
        aggregated = torch.bmm(laplacian, projected)
        return aggregated / row_totals
which is used in the following neural net
class GNN(nn.Module):
    """Three GCN_AISUMMER layers, mean pooling over nodes, and a linear classifier.

    Args:
        in_features: width of the input node features.
        hidden_dim: width of every graph layer.
        classes: number of output logits.
        dropout: dropout probability applied after the third graph layer.
    """

    def __init__(self,
                 in_features=12,
                 hidden_dim=128,
                 classes=2,
                 dropout=0.5):
        super().__init__()
        self.conv1 = GCN_AISUMMER(in_features, hidden_dim)
        self.conv2 = GCN_AISUMMER(hidden_dim, hidden_dim)
        self.conv3 = GCN_AISUMMER(hidden_dim, hidden_dim)
        self.fc = nn.Linear(hidden_dim, classes)
        self.dropout = dropout

    def forward(self, x, A):
        """Return class logits for the graph signal `x` and adjacency `A`."""
        # ReLU follows the first two graph layers only.
        hidden = F.relu(self.conv1(x, A))
        hidden = F.relu(self.conv2(hidden, A))
        hidden = self.conv3(hidden, A)
        hidden = F.dropout(hidden, p=self.dropout, training=self.training)
        # aggregate node embeddings
        graph_embedding = hidden.mean(dim=1)
        # final classification layer
        return self.fc(graph_embedding)
I tried to print out the weights applied to the input data after training. I tried print(model.conv1.weight) and got AttributeError: 'GCN_AISUMMER' object has no attribute 'weight'.
I also tried print(model.trainable_weights) and got AttributeError: 'GNN' object has no attribute 'trainable_weights'.
I can get the weight of fc1 with print(model.fc1.weight), but I want to get the weights applied to the input data after training.

Combining two pytorch functions into one

I have two functions like this (code source of the functions is here):
device = torch.device('cuda')

dataset = TUDataset(root='/tmp/MUTAG', name='MUTAG', use_node_attr=True)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

# just for testing: the same dataset stands in for every split
train_dataset = val_dataset = test_dataset = dataset

graph_train_loader = DataLoader(train_dataset, batch_size=8, shuffle=True)
graph_val_loader = DataLoader(val_dataset, batch_size=8)

# Lookup from a layer name to the graph-layer class it selects.
gnn_layer_by_name = dict(
    GCN=geom_nn.GCNConv,
    GAT=geom_nn.GATConv,
    GraphConv=geom_nn.GraphConv,
)
class GCNLayer(nn.Module):
    """Dense graph-convolution layer: project, sum over neighbours, average.

    Args:
        c_in: width of the input node features.
        c_out: width of the output node features.
    """

    def __init__(self, c_in, c_out):
        super().__init__()
        self.projection = nn.Linear(c_in, c_out)

    def forward(self, node_feats, adj_matrix):
        """Return neighbour-averaged projected features.

        Args:
            node_feats: batched node features; torch.bmm requires a 3-D
                tensor whose last axis has size `c_in`.
            adj_matrix: batched adjacency matrices (3-D). Row `i` selects
                the nodes whose features are summed into node `i`.

        Returns:
            Tensor with the same leading axes as `node_feats` and last
            axis `c_out`. Rows of `adj_matrix` that sum to zero produce
            zeros instead of NaN.
        """
        num_neighbours = adj_matrix.sum(dim=-1, keepdim=True)
        # A node with no neighbours (and no self-loop) would divide 0 by 0;
        # use 1 as the divisor there so its output is 0 rather than NaN.
        num_neighbours = num_neighbours.masked_fill(num_neighbours == 0, 1)
        node_feats = self.projection(node_feats)
        node_feats = torch.bmm(adj_matrix, node_feats)
        return node_feats / num_neighbours
class GNNModel(nn.Module):
    """Configurable stack of graph layers with activation and dropout in between.

    Args:
        c_in: width of the input node features.
        c_hidden: width of the hidden node features.
        c_out: width of the output node features.
        num_layers: number of graph layers, including the output layer.
        activation_function: string that is evaluated to obtain the
            activation module placed after each hidden graph layer.
        optimizer_name: accepted but not used in this block.
        learning_rate: accepted but not used in this block.
        dp_rate_linear: dropout probability after each hidden graph layer.
        layer_name: key into `gnn_layer_by_name` selecting the layer class.
        **kwargs: forwarded to every graph-layer constructor.
    """

    def __init__(self, c_in, c_hidden, c_out, num_layers, activation_function, optimizer_name, learning_rate, dp_rate_linear, layer_name="GCN", **kwargs):
        super().__init__()
        gnn_layer = gnn_layer_by_name[layer_name]
        layers = []
        # SECURITY: eval() executes arbitrary code; only pass trusted strings.
        activation_function = eval(activation_function)  ##not great to use
        in_channels, out_channels = c_in, c_hidden
        for l_idx in range(num_layers - 1):
            layers += [
                gnn_layer(in_channels=in_channels,
                          out_channels=out_channels,
                          **kwargs),
                activation_function,
                nn.Dropout(p=dp_rate_linear)
            ]
            in_channels = c_hidden
        layers += [gnn_layer(in_channels=in_channels,
                             out_channels=c_out,
                             **kwargs)]
        self.layers = nn.ModuleList(layers)

    def forward(self, x, edge_index):
        """Apply every layer in order; only graph layers receive `edge_index`."""
        for layer in self.layers:
            if isinstance(layer, geom_nn.MessagePassing):
                x = layer(x, edge_index)
            else:
                x = layer(x)
        return x
class GraphGNNModel(nn.Module):
    """Graph-level model: GNNModel body, mean pooling per graph, linear head.

    Args:
        c_in: width of the input node features.
        c_hidden: width of the hidden features; also the body's output width.
        c_out: width of the final prediction.
        dp_rate_linear: dropout probability, used in the body and before
            the linear head.
        **kwargs: forwarded to GNNModel.
    """

    def __init__(self, c_in, c_hidden, c_out, dp_rate_linear, **kwargs):
        super().__init__()
        # The body emits c_hidden features; the head maps them to c_out.
        self.GNN = GNNModel(
            c_in=c_in,
            c_hidden=c_hidden,
            c_out=c_hidden,
            dp_rate_linear=dp_rate_linear,
            **kwargs,
        )
        self.head = nn.Sequential(
            nn.Dropout(p=dp_rate_linear),
            nn.Linear(c_hidden, c_out),
        )

    def forward(self, x, edge_index, batch_idx):
        """Embed nodes, average them per graph via `batch_idx`, then classify."""
        node_embeddings = self.GNN(x, edge_index)
        graph_embeddings = geom_nn.global_mean_pool(node_embeddings, batch_idx)
        return self.head(graph_embeddings)
As you can see, I really don't need GNNModel and GraphGNNModel to be two separate classes; the second class just adds a sequential head to the end of the first.
I tried combining the two classes by doing:
class GNNModel(nn.Module):
    """Attempted merge of the graph-layer stack and the classification head."""

    def __init__(self, c_in, c_hidden, c_out, num_layers, activation_function, optimizer_name, learning_rate, dp_rate_linear, layer_name="GCN", **kwargs):
        """
        Inputs:
            c_in - Dimension of input features
            c_hidden - Dimension of hidden features
            c_out - Dimension of the output features. Usually number of classes in classification
            num_layers - Number of "hidden" graph layers
            layer_name - String of the graph layer to use
            dp_rate_linear - Dropout rate to apply throughout the network
            kwargs - Additional arguments for the graph layer (e.g. number of heads for GAT; not used with GAT here)
            activation_function - Activation function, given as a string that is evaluated
        """
        super().__init__()
        layer_cls = gnn_layer_by_name[layer_name]
        activation_function = eval(activation_function)  ##not great to use
        stack = []
        in_channels, out_channels = c_in, c_hidden
        for _ in range(num_layers - 1):
            stack.append(layer_cls(in_channels=in_channels,
                                   out_channels=out_channels,
                                   **kwargs))
            stack.append(activation_function)
            stack.append(nn.Dropout(p=dp_rate_linear))
            in_channels = c_hidden
        # NOTE(review): this last graph layer emits c_out features while the
        # head's Linear below expects c_hidden — confirm the intended sizes.
        stack.append(layer_cls(in_channels=in_channels,
                               out_channels=c_out,
                               **kwargs))
        self.layers = nn.ModuleList(stack)
        self.head = nn.Sequential(
            nn.Dropout(p=dp_rate_linear),
            nn.Linear(c_hidden, c_out)
        )

    def forward(self, x, edge_index):
        # NOTE(review): `batch_idx` is not a parameter of this method, so
        # calling it with three tensors raises the TypeError quoted below.
        for layer in self.layers:
            takes_graph = isinstance(layer, geom_nn.MessagePassing)  # passing data between conv
            x = layer(x, edge_index) if takes_graph else layer(x)
        # NOTE(review): this class never assigns `self.GNN`; the loop above
        # has already applied the graph layers.
        x = self.GNN(x, edge_index)
        x = geom_nn.global_mean_pool(x, batch_idx)
        x = self.head(x)
        return x
But I get the error:
TypeError: forward() takes 3 positional arguments but 4 were given
Could someone show me the correct way to combine these? (The exact explanation of the code is in the "Graph-level tasks: Graph classification" section of the tutorial linked here.)

Try adding batch_idx as param in your new forward function. I noted some other inconsistencies like, where is geom_nn being passed to the function? you probably want to use self.geom_nn, and for that you need to fix the __init__() part as well.
def forward(self, x, edge_index, batch_idx):  # here you must pass batch_idx
    """Run the graph layers, mean-pool per graph, and apply the head.

    Args:
        x: node features.
        edge_index: graph connectivity, passed only to message-passing layers.
        batch_idx: graph assignment of each node, used by the pooling step.

    The merged class keeps its graph layers in `self.layers` and defines no
    `self.GNN` attribute, so the stack is applied exactly once by the loop
    below; there is no separate `self.GNN(...)` call.
    """
    for layer in self.layers:
        if isinstance(layer, geom_nn.MessagePassing):  # passing data between conv
            x = layer(x, edge_index)
        else:
            x = layer(x)
    # `geom_nn` is resolved as a module-level name here.
    x = geom_nn.global_mean_pool(x, batch_idx)  # here you use batch_idx
    x = self.head(x)
    return x

Pytorch variable size input for GRUCell

I'm new to PyTorch and I'm trying to train a model with variable-size inputs. I want to use nn.GRUCell, not nn.GRU. I first post-padded the inputs so that they all have the same length. I want to know whether my Model class is correct or not. What I'm doing is storing the output after every time step in output_sequence and then keeping only the output at each sample's trajectory_length point.
class Model(nn.Module):
    """Step-by-step recurrent model for padded, variable-length sequences.

    Args:
        cell: recurrent cell class, called as `cell(input_size, hidden_size)`.
        length: number of time steps the forward pass unrolls.
        input_size: width of each time step's input.
        hidden_size: width of the hidden state.
        num_layers: stored on the instance; not used by this block.
        output_size: width of the output layer.
    """

    def __init__(self, cell, length, input_size, hidden_size, num_layers, output_size):
        super(Model, self).__init__()
        self.cell = cell
        self.length = length
        self.input_size = input_size
        self.hidden_size = hidden_size
        self.num_layers = num_layers
        self.output_size = output_size
        self.rnn = cell(self.input_size, self.hidden_size)
        # linear1 is not used in forward(); it is kept so the module's
        # parameter set stays unchanged.
        self.linear1 = nn.Linear(self.hidden_size + 1, self.hidden_size)
        self.linear2 = nn.Linear(self.hidden_size, self.output_size)

    def forward(self, state, batch_size, action_masks, trajectory_length):
        """Return masked outputs taken at each sample's last real time step.

        Args:
            state: input sequences; axes 0 and 1 are swapped before
                stepping, so time must be axis 1.
            batch_size: number of sequences in the batch.
            action_masks: per-sample boolean masks; entries that are False
                are set to -inf in the output.
            trajectory_length: per-sample number of real (unpadded) steps.

        Returns:
            Tensor with one row per entry of `trajectory_length` and
            `output_size` columns.
        """
        state = torch.transpose(state, 0, 1).float()
        # Allocate the initial hidden state from the input tensor so it has
        # the same device and dtype as the input.
        hidden_layer = state.new_zeros(batch_size, self.hidden_size)
        output_sequence = []
        for step in range(self.length):
            hidden_layer = self.rnn(state[step], hidden_layer)
            output_sequence.append(hidden_layer)
        # Hidden state after step l-1 is the state at the end of a
        # trajectory of length l; later (padded) steps are ignored.
        output = torch.stack(
            [output_sequence[l - 1][i] for i, l in enumerate(trajectory_length)]
        )
        output = self.linear2(output)
        for i in range(batch_size):
            output[i][~action_masks[i]] = float('-inf')
        return output

No gradients provided Tensorflow Keras with custom Training Step

I am trying to experiment with different implementations of VAE in tensorflow Keras. In the following model I get an error that no gradients are being provided for any variables in any layer.
tfkl = tf.keras.layers
class sampling2(tfk.layers.Layer):
    """Draw z = mean + exp(0.5 * log_var) * eps with eps from a standard normal."""

    def call(self, inputs):
        """Return one sample per row of the `(z_mean, z_log_var)` pair."""
        z_mean, z_log_var = inputs
        # The noise tensor is shaped from the first two axes of z_mean.
        mean_shape = tf.shape(z_mean)
        noise = tf.keras.backend.random_normal(shape=(mean_shape[0], mean_shape[1]))
        return z_mean + tf.exp(0.5 * z_log_var) * noise
class encoder2(tfk.layers.Layer):
    """Dense encoder producing a latent mean, log-variance and a sample.

    Args:
        latent_dim: width of the mean and log-variance outputs.
        intermediate_dim: width of the hidden dense layer.
        name: layer name.
        **kwargs: forwarded to the base layer constructor.
    """

    def __init__(self, latent_dim=30, intermediate_dim=200, name='encoder2', **kwargs):
        super().__init__(name=name, **kwargs)
        self.dense_1 = tfkl.Dense(intermediate_dim, activation="relu")
        self.dense_mean = tfkl.Dense(latent_dim)
        self.dense_log_var = tfkl.Dense(latent_dim)
        self.sampling = sampling2()

    def call(self, inputs):
        """Return `(z_mean, z_log_var, z)` for `inputs`."""
        hidden = self.dense_1(inputs)
        z_mean = self.dense_mean(hidden)
        z_log_var = self.dense_log_var(hidden)
        return z_mean, z_log_var, self.sampling((z_mean, z_log_var))
class decoder2(tfk.layers.Layer):
    """Dense decoder mapping a latent vector back to `original_dim` values.

    Args:
        original_dim: width of the output layer.
        intermediate_dim: width of the hidden dense layer.
        name: layer name.
        **kwargs: forwarded to the base layer constructor.
    """

    def __init__(self, original_dim, intermediate_dim=200, name='decoder2', **kwargs):
        super().__init__(name=name, **kwargs)
        self.dense_1 = tfkl.Dense(intermediate_dim, activation='relu')
        self.dense_output = tfkl.Dense(original_dim, activation='sigmoid')

    def call(self, inputs):
        """Return the sigmoid-activated reconstruction of `inputs`."""
        hidden = self.dense_1(inputs)
        # The output layer applies a sigmoid, so these are not raw logits.
        return self.dense_output(hidden)
class VAE2(tfk.Model):
    """Variational autoencoder built from encoder2 and decoder2.

    Args:
        original_dim: width of the input and of the reconstruction.
        intermediate_dim: hidden width used by encoder and decoder.
        latent_dim: width of the latent space.
        name: model name.
        **kwargs: forwarded to the base model constructor.
    """

    def __init__(self, original_dim, intermediate_dim=800, latent_dim=50,
                 name='VAE2', **kwargs):
        super().__init__(name=name, **kwargs)
        self.original_dim = original_dim
        self.encoder = encoder2(latent_dim=latent_dim, intermediate_dim=intermediate_dim)
        self.decoder = decoder2(original_dim, intermediate_dim=intermediate_dim)

    def call(self, inputs):
        """Encode `inputs`, then decode the sampled latent vector."""
        _, _, z = self.encoder(inputs)
        return self.decoder(z)

    # NOTE(review): the method is named `training_step`; confirm whether the
    # Keras hook `train_step` was intended, since fit() is used below.
    def training_step(self, inputs):
        """Convert the sparse batch to dense, compute the KL term, apply gradients."""
        dense_train_batch = tf.sparse.to_dense(inputs)
        with tf.GradientTape() as tape:
            z_mean, z_log_var, z = self.encoder(dense_train_batch)
            reconstructed = self.decoder(z)
            kl_loss = -0.5 * tf.reduce_mean(
                z_log_var - tf.square(z_mean) - tf.exp(z_log_var) + 1
            )
            self.add_loss(kl_loss)
        # NOTE(review): `loss` is not assigned anywhere in this method, and
        # `optimizer` is resolved as a module-level name.
        grads = tape.gradient(loss, self.trainable_weights)
        optimizer.apply_gradients(zip(grads, self.trainable_weights))
# Loss and optimizer objects referenced by name elsewhere in the snippet.
loss_fn = keras.losses.BinaryCrossentropy()
optimizer = tf.keras.optimizers.Adam(learning_rate=1e-3)

vae2 = VAE2(df_track_names_reduced.shape[0])

# compile() is given its own Adam instance, separate from `optimizer` above.
vae2.compile(
    optimizer=keras.optimizers.Adam(learning_rate=0.001, amsgrad=True),
    loss=loss_fn,
)
vae2.fit(train_dataset, epochs=20)
Below is the error message:
ValueError: No gradients provided for any variable: ['VAE2/encoder2/dense_8/kernel:0','VAE2/encoder2/dense_8/bias:0', 'VAE2/encoder2/dense_9/kernel:0', 'VAE2/encoder2/dense_9/bias:0', 'VAE2/encoder2/dense_10/kernel:0', 'VAE2/encoder2/dense_10/bias:0', 'VAE2/decoder2
/dense_11/kernel:0', 'VAE2/decoder2/dense_11/bias:0', 'VAE2/decoder2/dense_12/kernel:0',
'VAE2/decoder2/dense_12/bias:0'].

You have to pass a loss tensor to tape.gradient not a function. Calculate binary loss, add it to kl_loss: loss = binary_loss + kl_loss and then pass to tape.gradient().
If you apply gradients manually - you should not call model.compile(), model.fit(). Build your custom loop instead. See here: https://keras.io/guides/writing_a_training_loop_from_scratch/.
But I don't think you really need applying gradients manually. I would just add kl_loss within call function. See here: https://keras.io/api/losses/

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Why LSTM+transformer is not working well? - python

Related

How to plot Graph-neural-network model-graph when using tensorflow Model Subclass API with spektral layers?

neural network no attribute weight

Combining two pytorch functions into one

Pytorch variable size input for GRUCell

No gradients provided Tensorflow Keras with custom Training Step

Categories

Resources