I'm trying to train an MLP-Mixer on a custom dataset, based on this repository.
The code I have so far is shown below. How can I save the trained model so that I can later use it on test images?
import torch
import numpy as np
from torch import nn
from einops.layers.torch import Rearrange
import glob
import cv2
from torch.utils.data import Dataset, DataLoader
class customDataset(Dataset):
def __init__(self):
self.imags_path = '/path_to_dataset/'
file_list = glob.glob(self.imags_path + '*')
self.data = []
for class_path in file_list:
class_name = class_path.split('/')[-1]
for img_path in glob.glob(class_path + '/*.jpg'):
self.data.append([img_path,class_name])
self.class_map = {'dogs':0, 'cats':1}
self.img_dim = (416,416)
def __len__(self):
return len(self.data)
def __getitem__(self,idx):
img_path,class_name = self.data[idx]
img = cv2.imread(img_path)
img = cv2.resize(img,self.img_dim)
class_id = self.class_map[class_name]
img_tensor = torch.from_numpy(img)
img_tensor = img_tensor.permute(2, 0, 1)
class_id = torch.tensor([class_id])
return img_tensor, class_id
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, dropout = 0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
class MixerBlock(nn.Module):
def __init__(self, dim, num_patch, token_dim, channel_dim, dropout = 0.):
super().__init__()
self.token_mix = nn.Sequential(
nn.LayerNorm(dim),
Rearrange('b n d -> b d n'),
FeedForward(num_patch, token_dim, dropout),
Rearrange('b d n -> b n d')
)
self.channel_mix = nn.Sequential(
nn.LayerNorm(dim),
FeedForward(dim, channel_dim, dropout),
)
def forward(self, x):
x = x + self.token_mix(x)
x = x + self.channel_mix(x)
return x
class MLPMixer(nn.Module):
def __init__(self, in_channels, dim, num_classes, patch_size, image_size, depth, token_dim, channel_dim):
super().__init__()
assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
self.num_patch = (image_size // patch_size) ** 2
self.to_patch_embedding = nn.Sequential(
nn.Conv2d(in_channels, dim, patch_size, patch_size),
Rearrange('b c h w -> b (h w) c'),
)
self.mixer_blocks = nn.ModuleList([])
for _ in range(depth):
self.mixer_blocks.append(MixerBlock(dim, self.num_patch, token_dim, channel_dim))
self.layer_norm = nn.LayerNorm(dim)
self.mlp_head = nn.Sequential(
nn.Linear(dim, num_classes)
)
def forward(self, x):
x = self.to_patch_embedding(x)
for mixer_block in self.mixer_blocks:
x = mixer_block(x)
x = self.layer_norm(x)
x = x.mean(dim=1)
return self.mlp_head(x)
if __name__ == '__main__':
dataset = customDataset()
train_loader = DataLoader(dataset,batch_size=1,shuffle=True)
mixer_model = MLPMixer(in_channels=3,
image_size=416,
patch_size=16,
num_classes=2,
dim=512,
depth=8,
token_dim=256,
channel_dim=2048)
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
mixer_model = mixer_model.to(device)
for i, data in enumerate(train_loader, 0):
    inputs, labels = data
    inputs, labels = inputs.to(device), labels.to(device)
    inputs = inputs.float()  # keep the class labels as integers for a classification loss
    outputs = mixer_model(inputs)
Thanks.
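For the saving part, here is a minimal sketch of the standard PyTorch checkpointing approach (the loss, optimizer, learning rate, file name, and test_batch below are placeholders, not part of the original code):
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(mixer_model.parameters(), lr=1e-4)
# ... inside the training loop above: loss = criterion(outputs, labels.view(-1));
#     optimizer.zero_grad(); loss.backward(); optimizer.step() ...

# Save only the weights once training is done
torch.save(mixer_model.state_dict(), 'mlp_mixer.pth')

# Later, for test images: rebuild the model with the same arguments and load the weights
model = MLPMixer(in_channels=3, image_size=416, patch_size=16, num_classes=2,
                 dim=512, depth=8, token_dim=256, channel_dim=2048)
model.load_state_dict(torch.load('mlp_mixer.pth'))
model.eval()
with torch.no_grad():
    preds = model(test_batch.float()).argmax(dim=1)  # test_batch: a (N, 3, 416, 416) image tensor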
Related
I'm trying to use an LSTM and a transformer to do binary classification, but it does not improve performance over a plain LSTM model; sometimes it is even worse. The input shape of the training data is (3014, 48, 178); the input is time-series medical data. The following code is for the transformer.
import numpy as np
import tensorflow as tf
from tensorflow.keras import Sequential, layers, regularizers
from tensorflow.keras.layers import Dense, BatchNormalization

class TokenAndPositionEmbedding(layers.Layer):
def __init__(self, maxlen, vocab_size, embed_dim):
super(TokenAndPositionEmbedding, self).__init__()
self.token_emb = layers.Embedding(input_dim=vocab_size, output_dim=embed_dim)
self.pos_emb = layers.Embedding(input_dim=maxlen, output_dim=embed_dim)
def call(self, x):
maxlen = tf.shape(x)[-1]
positions = tf.range(start=0, limit=maxlen, delta=1)
positions = self.pos_emb(positions)
x = self.token_emb(x)
return x + positions
class TransformerBlock(layers.Layer):
def __init__(self, embed_dim, num_heads, ff_dim, rate=0.001):
super(TransformerBlock, self).__init__()
self.att = layers.MultiHeadAttention(num_heads=num_heads, key_dim=embed_dim)
self.ffn = Sequential(
[layers.Dense(ff_dim, activation="relu"),layers.Dense(embed_dim),]
)
self.layernorm1 = layers.LayerNormalization(epsilon=1e-6)
self.layernorm2 = layers.LayerNormalization(epsilon=1e-6)
self.dropout1 = layers.Dropout(rate)
self.dropout2 = layers.Dropout(rate)
def call(self, inputs, training):
attn_output = self.att(inputs, inputs)
attn_output = self.dropout1(attn_output, training=training)
out1 = self.layernorm1(inputs + attn_output)
ffn_output = self.ffn(out1)
ffn_output = self.dropout2(ffn_output, training=training)
return self.layernorm2(out1 + ffn_output)
class PositionEmbeddingFixedWeights(layers.Layer):
def __init__(self, sequence_length, output_dim, **kwargs):
super(PositionEmbeddingFixedWeights, self).__init__(**kwargs)
position_embedding_matrix = self.get_position_encoding(sequence_length, output_dim)
self.position_embedding_layer = layers.Embedding(
input_dim=sequence_length, output_dim=output_dim,
weights=[position_embedding_matrix],
trainable=False
)
def get_position_encoding(self, seq_len, d, n=10000):
P = np.zeros((seq_len, d))
for k in range(seq_len):
for i in np.arange(int(d/2)):
denominator = np.power(n, 2*i/d)
P[k, 2*i] = np.sin(k/denominator)
P[k, 2*i+1] = np.cos(k/denominator)
return P
def call(self, inputs):
position_indices = tf.range(tf.shape(inputs)[-2])
embedded_indices = self.position_embedding_layer(position_indices)
return embedded_indices
Model code is
model = Sequential([tf.keras.Input(shape=(48,178)),
BatchNormalization(),
tf.keras.layers.GRU(units = 128,recurrent_dropout=0.5,activation='tanh', dropout=0.5,return_sequences = True,activity_regularizer=regularizers.L2(0.01)),
TransformerBlock(128, 48, 178),
tf.keras.layers.GlobalAveragePooling1D(),
Dense(60,activation='tanh'),
Dense(1,activation='sigmoid')])
This has troubled me for a long time. I'm trying to use the LSTM to handle the time-series structure and the transformer to learn the importance of the relationships between features, but it doesn't seem to work.
I want to manually loop over the varying sequence lengths of the input sequences, but TensorFlow automatically sets the time axis to None after noticing the varying sequence lengths. Is there any workaround for this?
A minimal example:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def call(self, inputs):
states = (tf.zeros((1, self.int_dim)),
tf.zeros((1, self.int_dim)))
outputs = []
for t in range(inputs.shape[1]):
lstm_out, states = self.lstm(inputs[:, t, :], states)
d2_out = self.d2(lstm_out)
outputs.append(d2_out)
output_stack = tf.stack(outputs, 1)
return output_stack
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
model.fit(generator(), batch_size=1)
Here is a fix for Eager Execution mode:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def call(self, inputs):
states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
tf.zeros((tf.shape(inputs)[0], self.int_dim)))
outputs = []
for t in range(tf.shape(inputs)[1]):
lstm_out, states = self.lstm(inputs[:, t, :], states)
d2_out = self.d2(lstm_out)
outputs.append(d2_out)
output_stack = tf.stack(outputs, 1)
return output_stack
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy', run_eagerly=True)
model.fit(generator(), batch_size=1)
A Graph mode solution could look like this:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def some_logic(self, i, inputs, s1, s2, o):
lstm_out, s = self.lstm(inputs[:, i, :], (s1, s2))
d2_out = self.d2(lstm_out)
o = o.write(o.size(), d2_out)
s1, s2 = s
return tf.add(i, 1), inputs, s1, s2, o
def call(self, inputs):
states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
tf.zeros((tf.shape(inputs)[0], self.int_dim)))
s1, s2 = states
outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
i = tf.constant(0)
while_condition = lambda i, inputs, s1, s2, outputs: tf.less(i, tf.shape(inputs)[1])
_, _, _, _, result = tf.while_loop(while_condition, self.some_logic, loop_vars=(i, inputs, s1, s2, outputs))
return result.stack()
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
model.fit(generator(), batch_size=1)
I am trying to customize a ResNet50 with an attention layer. Please find my code below:
from tensorflow.keras.applications import ResNet50

IMAGE_SIZE = [224, 224]
resnet = ResNet50(input_shape=IMAGE_SIZE + [3], weights='imagenet', include_top=False)
# don't train existing weights
for layer in resnet.layers:
layer.trainable = False
import torch
import math
import torch.nn as nn
import torch.nn.functional as F
class BasicConv(nn.Module):
def __init__(self, in_planes, out_planes, kernel_size, stride=1, padding=0, dilation=1,
groups=1, relu=True, bn=True, bias=False):
super(BasicConv, self).__init__()
self.out_channels = out_planes
self.conv = nn.Conv2d(in_planes, out_planes, kernel_size=kernel_size, stride=stride,
padding=padding, dilation=dilation, groups=groups, bias=bias)
self.bn = nn.BatchNorm2d(out_planes, eps=1e-5, momentum=0.01, affine=True) if bn else None
self.relu = nn.ReLU() if relu else None
def forward(self, x):
x = self.conv(x)
if self.bn is not None:
x = self.bn(x)
if self.relu is not None:
x = self.relu(x)
return x
class Flatten(nn.Module):
def forward(self, x):
return x.view(x.size(0), -1)
class ChannelGate(nn.Module):
def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max']):
super(ChannelGate, self).__init__()
self.gate_channels = gate_channels
self.mlp = nn.Sequential(
Flatten(),
nn.Linear(gate_channels, gate_channels // reduction_ratio),
nn.ReLU(),
nn.Linear(gate_channels // reduction_ratio, gate_channels)
)
self.pool_types = pool_types
def forward(self, x):
channel_att_sum = None
for pool_type in self.pool_types:
if pool_type=='avg':
avg_pool = F.avg_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
channel_att_raw = self.mlp( avg_pool )
elif pool_type=='max':
max_pool = F.max_pool2d(x, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
channel_att_raw = self.mlp( max_pool )
elif pool_type=='lp':
lp_pool = F.lp_pool2d(x, 2, (x.size(2), x.size(3)), stride=(x.size(2), x.size(3)))
channel_att_raw = self.mlp( lp_pool )
elif pool_type=='lse':
# LSE pool only
lse_pool = logsumexp_2d(x)
channel_att_raw = self.mlp( lse_pool )
if channel_att_sum is None:
channel_att_sum = channel_att_raw
else:
channel_att_sum = channel_att_sum + channel_att_raw
scale = F.sigmoid( channel_att_sum ).unsqueeze(2).unsqueeze(3).expand_as(x)
return x * scale
def logsumexp_2d(tensor):
tensor_flatten = tensor.view(tensor.size(0), tensor.size(1), -1)
s, _ = torch.max(tensor_flatten, dim=2, keepdim=True)
outputs = s + (tensor_flatten - s).exp().sum(dim=2, keepdim=True).log()
return outputs
class ChannelPool(nn.Module):
def forward(self, x):
return torch.cat((torch.max(x, 1)[0].unsqueeze(1), torch.mean(x, 1).unsqueeze(1)), dim=1)
class SpatialGate(nn.Module):
def __init__(self):
super(SpatialGate, self).__init__()
kernel_size = 7
self.compress = ChannelPool()
self.spatial = BasicConv(2, 1, kernel_size, stride=1, padding=(kernel_size-1) // 2, relu=False)
def forward(self, x):
x_compress = self.compress(x)
x_out = self.spatial(x_compress)
scale = F.sigmoid(x_out) # broadcasting
return x * scale
class CBAM(nn.Module):
def __init__(self, gate_channels, reduction_ratio=16, pool_types=['avg', 'max'],
no_spatial=False):
super(CBAM, self).__init__()
self.ChannelGate = ChannelGate(gate_channels, reduction_ratio, pool_types)
self.no_spatial=no_spatial
if not no_spatial:
self.SpatialGate = SpatialGate()
def forward(self, x):
x_out = self.ChannelGate(x)
if not self.no_spatial:
x_out = self.SpatialGate(x_out)
return x_out
flat1 = Flatten()(resnet.output)
class1 = Dense(256, activation='relu')(flat1)
class1=BatchNormalization()(class1)
# receive 3D and output 3D
class2 = Dense(128, activation='relu')(class1)
class2=BatchNormalization()(class2)
class2=CBAM(128,8)(class2)
output = Dense(len(folders), activation='softmax')(class2)
I am getting the following error message when I run the code:
flat1 = Flatten()(resnet.output)
in forward(self, x)
class Flatten(nn.Module):
def forward(self, x):
---> return x.view(x.size(0), -1)
AttributeError: 'KerasTensor' object has no attribute 'view'
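For context, the Flatten in this traceback is the torch.nn.Module defined earlier being applied to resnet.output, which is a Keras tensor and has no .view method. Purely as an illustration of the framework mismatch (not a full fix, since the torch-based CBAM block would also need a Keras-side counterpart), the Keras-native flatten would look like this sketch:
# Hypothetical sketch: Keras layers accept KerasTensors; the torch modules above do not.
from tensorflow.keras.layers import Flatten as KerasFlatten
flat1 = KerasFlatten()(resnet.output)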
I am working through a tutorial on convolutional neural nets in PyTorch, using the MNIST dataset as an example, but I was getting the same output for all labels: the net always returns the same tensor as its result. I decided to reduce the complexity of the net and decrease the number of epochs, so here is my code:
from __future__ import print_function, division
import os
import torch
import pandas as pd
from skimage import io, transform
import numpy as np
import matplotlib.pyplot as plt
from torch.utils.data import Dataset, DataLoader
from torchvision import transforms, utils
# Ignore warnings
import warnings
warnings.filterwarnings("ignore")
plt.ion() # interactive mode
class DigitsDataset(Dataset):
def __init__(self, csv_file, transform=None):
"""
Args:
csv_file (string): Path to the csv file.
transform (callable, optional): Optional transform to be applied
on a sample.
"""
self.digits_frame = pd.read_csv(csv_file)
self.transform = transform
def __len__(self):
return len(self.digits_frame)
def __getitem__(self, idx):
if torch.is_tensor(idx):
idx = idx.tolist()
label = self.digits_frame.iloc[idx, 0]
digit_pixels = self.digits_frame.iloc[idx, 1:]
digit_pixels = np.asarray(digit_pixels)
digit_pixels = digit_pixels.astype('float').reshape(28, 28)
sample = {'label' : label, 'image' : digit_pixels}
if self.transform:
sample['image'] = self.transform(sample['image'])
return sample
class GrayScaleTransform:
''' Scale intensity from [0,255] to [0,1]'''
def __init__(self, new_min, new_max):
self.new_min = new_min
self.new_max = new_max
def __call__(self, x):
return (x) * (self.new_max - self.new_min) / (255) + self.new_min
min_max_transform = GrayScaleTransform(new_min = 0, new_max = 1)
train_dataset = DigitsDataset(csv_file='data/train.csv', transform = min_max_transform)
test_dataset = DigitsDataset(csv_file='data/test.csv', transform = min_max_transform)
train_loader = DataLoader(train_dataset)
test_loader = DataLoader(test_dataset)
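# Note: called with no extra arguments, DataLoader defaults to batch_size=1 and shuffle=False.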
learning_rate = 0.1
num_epochs = 2
from torch import nn
class ConvNet(nn.Module):
def __init__(self):
super(ConvNet, self).__init__()
self.layer1 = nn.Sequential( nn.Conv2d(1, 4, kernel_size=5, stride=1, padding=2),
nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2))
self.layer2 = nn.Sequential( nn.Conv2d(4, 8, kernel_size=5, stride=1, padding=2),
nn.ReLU(), nn.MaxPool2d(kernel_size=2, stride=2))
self.drop_out = nn.Dropout()
self.fc1 = nn.Linear(7 * 7 * 8, 10)
def forward(self, x):
out = self.layer1(x)
out = self.layer2(out)
out = out.reshape(out.size(0), -1)
out = self.drop_out(out)
out = self.fc1(out)
return out
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
model = ConvNet()
model.to(device)
criterion = nn.CrossEntropyLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=learning_rate)
total_step = len(train_loader)
loss_list = []
acc_list = []
for epoch in range(num_epochs):
for i, sample in enumerate(train_loader):
# Forward pass
img = sample['image'].view(-1, 1, 28, 28).float().to(device)
label = sample['label'].to(device)
output = model(img)
loss = criterion(output, label)
loss_list.append(loss.item())
# Backpropagation and optimizer step
optimizer.zero_grad()
loss.backward()
optimizer.step()
But I am still getting the same tensor as output. What am I doing wrong? Why am I always getting the same output?
I'm following the guide to Transformers and the Colab project https://colab.research.google.com/drive/1XBP0Zh8K4g_n0A2p1UlGFf3dij0EX_Kt, but when I run the cell with the line multi_head = build_model() I get the error below.
This is the output from the console:
NameError                                 Traceback (most recent call last)
in ()
----> 1 multi_head = build_model()
5 frames in (x)
40 self.dropout = Dropout(attn_dropout)
41 def call(self, q, k, v, mask):
---> 42 attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
43 if mask is not None:
44 mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
NameError: name 'K' is not defined
This cell runs right after the model-architecture code, which is what the error refers to.
Can you see where this K should be defined?
import random, os, sys
import numpy as np
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
try:
from dataloader import TokenList, pad_to_longest
# for transformer
except: pass
embed_size = 60
class LayerNormalization(Layer):
def __init__(self, eps=1e-6, **kwargs):
self.eps = eps
super(LayerNormalization, self).__init__(**kwargs)
def build(self, input_shape):
self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
initializer=Ones(), trainable=True)
self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
initializer=Zeros(), trainable=True)
super(LayerNormalization, self).build(input_shape)
def call(self, x):
mean = K.mean(x, axis=-1, keepdims=True)
std = K.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
def compute_output_shape(self, input_shape):
return input_shape
class ScaledDotProductAttention():
def __init__(self, d_model, attn_dropout=0.1):
self.temper = np.sqrt(d_model)
self.dropout = Dropout(attn_dropout)
def __call__(self, q, k, v, mask):
attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
if mask is not None:
mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
attn = Add()([attn, mmask])
attn = Activation('softmax')(attn)
attn = self.dropout(attn)
output = Lambda(lambda x:K.batch_dot(x[0], x[1]))([attn, v])
return output, attn
class MultiHeadAttention():
# mode 0 - big matrices, faster; mode 1 - clearer implementation
def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
self.mode = mode
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.dropout = dropout
if mode == 0:
self.qs_layer = Dense(n_head*d_k, use_bias=False)
self.ks_layer = Dense(n_head*d_k, use_bias=False)
self.vs_layer = Dense(n_head*d_v, use_bias=False)
elif mode == 1:
self.qs_layers = []
self.ks_layers = []
self.vs_layers = []
for _ in range(n_head):
self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
self.attention = ScaledDotProductAttention(d_model)
self.layer_norm = LayerNormalization() if use_norm else None
self.w_o = TimeDistributed(Dense(d_model))
def __call__(self, q, k, v, mask=None):
d_k, d_v = self.d_k, self.d_v
n_head = self.n_head
if self.mode == 0:
qs = self.qs_layer(q) # [batch_size, len_q, n_head*d_k]
ks = self.ks_layer(k)
vs = self.vs_layer(v)
def reshape1(x):
s = tf.shape(x) # [batch_size, len_q, n_head * d_k]
x = tf.reshape(x, [s[0], s[1], n_head, d_k])
x = tf.transpose(x, [2, 0, 1, 3])
x = tf.reshape(x, [-1, s[1], d_k]) # [n_head * batch_size, len_q, d_k]
return x
qs = Lambda(reshape1)(qs)
ks = Lambda(reshape1)(ks)
vs = Lambda(reshape1)(vs)
if mask is not None:
mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
head, attn = self.attention(qs, ks, vs, mask=mask)
def reshape2(x):
s = tf.shape(x) # [n_head * batch_size, len_v, d_v]
x = tf.reshape(x, [n_head, -1, s[1], s[2]])
x = tf.transpose(x, [1, 2, 0, 3])
x = tf.reshape(x, [-1, s[1], n_head*d_v]) # [batch_size, len_v, n_head * d_v]
return x
head = Lambda(reshape2)(head)
elif self.mode == 1:
heads = []; attns = []
for i in range(n_head):
qs = self.qs_layers[i](q)
ks = self.ks_layers[i](k)
vs = self.vs_layers[i](v)
head, attn = self.attention(qs, ks, vs, mask)
heads.append(head); attns.append(attn)
head = Concatenate()(heads) if n_head > 1 else heads[0]
attn = Concatenate()(attns) if n_head > 1 else attns[0]
outputs = self.w_o(head)
outputs = Dropout(self.dropout)(outputs)
if not self.layer_norm: return outputs, attn
# outputs = Add()([outputs, q]) # sl: fix
return self.layer_norm(outputs), attn
class PositionwiseFeedForward():
def __init__(self, d_hid, d_inner_hid, dropout=0.1):
self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
self.w_2 = Conv1D(d_hid, 1)
self.layer_norm = LayerNormalization()
self.dropout = Dropout(dropout)
def __call__(self, x):
output = self.w_1(x)
output = self.w_2(output)
output = self.dropout(output)
output = Add()([output, x])
return self.layer_norm(output)
class EncoderLayer():
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
self.pos_ffn_layer = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
def __call__(self, enc_input, mask=None):
output, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
output = self.pos_ffn_layer(output)
return output, slf_attn
def GetPosEncodingMatrix(max_len, d_emb):
pos_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
if pos != 0 else np.zeros(d_emb)
for pos in range(max_len)
])
pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
return pos_enc
def GetPadMask(q, k):
ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
mask = K.cast(K.expand_dims(K.not_equal(k, 0), 1), 'float32')
mask = K.batch_dot(ones, mask, axes=[2,1])
return mask
def GetSubMask(s):
len_s = tf.shape(s)[1]
bs = tf.shape(s)[:1]
mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
return mask
class Transformer():
def __init__(self, len_limit, embedding_matrix, d_model=embed_size, \
d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1, \
share_word_emb=False, **kwargs):
self.name = 'Transformer'
self.len_limit = len_limit
self.src_loc_info = False # True # sl: fix later
self.d_model = d_model
self.decode_model = None
d_emb = d_model
pos_emb = Embedding(len_limit, d_emb, trainable=False, \
weights=[GetPosEncodingMatrix(len_limit, d_emb)])
i_word_emb = Embedding(max_features, d_emb, weights=[embedding_matrix]) # Add Kaggle provided embedding here
self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
word_emb=i_word_emb, pos_emb=pos_emb)
def get_pos_seq(self, x):
mask = K.cast(K.not_equal(x, 0), 'int32')
pos = K.cumsum(K.ones_like(x, 'int32'), 1)
return pos * mask
def compile(self, active_layers=999):
src_seq_input = Input(shape=(None, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(src_seq_input)
# LSTM before attention layers
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x, slf_attn = MultiHeadAttention(n_head=3, d_model=300, d_k=64, d_v=64, dropout=0.1)(x, x, x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
conc = Dense(64, activation="relu")(conc)
x = Dense(1, activation="sigmoid")(conc)
self.model = Model(inputs=src_seq_input, outputs=x)
self.model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['accuracy'])
If you look at where K is being used you will see:
K.expand_dims
K.cumsum
K.batch_dot
These are Keras backend functions. The code is missing a from keras import backend as K (or, to match the tensorflow.keras imports used elsewhere in this code, from tensorflow.keras import backend as K); K is the standard abbreviation for the backend module.
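A minimal sketch of the fix, assuming you keep the tensorflow.keras imports already at the top of the file:
# Place this alongside the other imports at the top of the script.
from tensorflow.keras import backend as K
# After this, K.batch_dot, K.expand_dims, K.cumsum, K.mean, K.std, etc. resolve
# inside ScaledDotProductAttention, LayerNormalization, GetPadMask and GetSubMask.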