NameError: name 'K' is not defined - python

I'm following the guide to Transformers and the colab project https://colab.research.google.com/drive/1XBP0Zh8K4g_n0A2p1UlGFf3dij0EX_Kt
but when I run the cell with the line multi_head = build_model() I get the error.
this is the output from the console:
NameError Traceback (most recent call
last) in ()
----> 1 multi_head = build_model()
5 frames in (x)
40 self.dropout = Dropout(attn_dropout)
41 def call(self, q, k, v, mask):
---> 42 attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
43 if mask is not None:
44 mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
NameError: name 'K' is not defined
It just runs after the model architecture code, which the error refers to.
Can you see where this K should be defined?
import random, os, sys
import numpy as np
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
try:
    # for transformer
    from dataloader import TokenList, pad_to_longest
except ImportError:
    # The helper module is optional: only a failed import is tolerated, so
    # any other error raised while importing it is no longer hidden.
    pass

# Embedding width; also used below as the default d_model of Transformer.
embed_size = 60
class LayerNormalization(Layer):
    """Normalise the last axis of the input, then apply a learned scale and shift.

    NOTE: relies on a module-level ``K`` (the Keras backend), which the
    imports visible in this listing do not provide.
    """

    def __init__(self, eps=1e-6, **kwargs):
        # eps is added to the standard deviation so the division is never by zero.
        self.eps = eps
        super().__init__(**kwargs)

    def build(self, input_shape):
        """Create one gain (gamma) and one offset (beta) per feature of the last axis."""
        feature_shape = input_shape[-1:]
        self.gamma = self.add_weight(
            name='gamma', shape=feature_shape, initializer=Ones(), trainable=True)
        self.beta = self.add_weight(
            name='beta', shape=feature_shape, initializer=Zeros(), trainable=True)
        super().build(input_shape)

    def call(self, x):
        """Return gamma * (x - mean) / (std + eps) + beta, statistics over the last axis."""
        centred = x - K.mean(x, axis=-1, keepdims=True)
        spread = K.std(x, axis=-1, keepdims=True) + self.eps
        return self.gamma * centred / spread + self.beta

    def compute_output_shape(self, input_shape):
        # Normalisation is element-wise, so the shape is unchanged.
        return input_shape
class ScaledDotProductAttention():
    """Scaled dot-product attention assembled from Keras functional layers."""

    def __init__(self, d_model, attn_dropout=0.1):
        # Scores are divided by sqrt(d_model) before the softmax.
        self.temper = np.sqrt(d_model)
        self.dropout = Dropout(attn_dropout)

    def __call__(self, q, k, v, mask):
        """Return ``(output, attn)`` for queries q, keys k and values v.

        When ``mask`` is given, positions where it is 0 receive a -1e10
        penalty before the softmax; positions where it is 1 are unchanged.
        """
        def scaled_scores(pair):
            return K.batch_dot(pair[0], pair[1], axes=[2, 2]) / self.temper

        def weighted_sum(pair):
            return K.batch_dot(pair[0], pair[1])

        attn = Lambda(scaled_scores)([q, k])
        if mask is not None:
            penalty = Lambda(lambda m: (-1e+10) * (1 - m))(mask)
            attn = Add()([attn, penalty])
        attn = Activation('softmax')(attn)
        attn = self.dropout(attn)
        output = Lambda(weighted_sum)([attn, v])
        return output, attn
class MultiHeadAttention():
    """Multi-head attention built from Keras layers.

    mode 0 - big matrices, faster; mode 1 - clearer implementation with one
    projection per head.
    """

    def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
        self.mode = mode
        self.n_head = n_head
        self.d_k = d_k
        self.d_v = d_v
        self.dropout = dropout
        if mode == 0:
            # One projection per role, covering all heads at once.
            self.qs_layer = Dense(n_head * d_k, use_bias=False)
            self.ks_layer = Dense(n_head * d_k, use_bias=False)
            self.vs_layer = Dense(n_head * d_v, use_bias=False)
        elif mode == 1:
            # One small projection per head and role.
            self.qs_layers = []
            self.ks_layers = []
            self.vs_layers = []
            for _ in range(n_head):
                self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
                self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
        self.attention = ScaledDotProductAttention(d_model)
        self.layer_norm = LayerNormalization() if use_norm else None
        self.w_o = TimeDistributed(Dense(d_model))

    def __call__(self, q, k, v, mask=None):
        """Return ``(outputs, attn)`` for the given query/key/value tensors."""
        d_k, d_v = self.d_k, self.d_v
        n_head = self.n_head
        if self.mode == 0:
            qs = self.qs_layer(q)  # [batch_size, len_q, n_head * d_k]
            ks = self.ks_layer(k)  # [batch_size, len_k, n_head * d_k]
            vs = self.vs_layer(v)  # [batch_size, len_v, n_head * d_v]

            def split_heads(depth):
                # The per-head depth is passed explicitly: queries/keys use
                # d_k while values use d_v. Reshaping values with d_k (as the
                # previous single helper did) fails whenever d_k != d_v.
                def reshape1(x):
                    s = tf.shape(x)  # [batch_size, len, n_head * depth]
                    x = tf.reshape(x, [s[0], s[1], n_head, depth])
                    x = tf.transpose(x, [2, 0, 1, 3])
                    x = tf.reshape(x, [-1, s[1], depth])  # [n_head * batch_size, len, depth]
                    return x
                return reshape1

            qs = Lambda(split_heads(d_k))(qs)
            ks = Lambda(split_heads(d_k))(ks)
            vs = Lambda(split_heads(d_v))(vs)
            if mask is not None:
                # Repeat the mask once per head along the (merged) batch axis.
                mask = Lambda(lambda x: K.repeat_elements(x, n_head, 0))(mask)
            head, attn = self.attention(qs, ks, vs, mask=mask)

            def reshape2(x):
                s = tf.shape(x)  # [n_head * batch_size, len_v, d_v]
                x = tf.reshape(x, [n_head, -1, s[1], s[2]])
                x = tf.transpose(x, [1, 2, 0, 3])
                x = tf.reshape(x, [-1, s[1], n_head * d_v])  # [batch_size, len_v, n_head * d_v]
                return x

            head = Lambda(reshape2)(head)
        elif self.mode == 1:
            heads = []
            attns = []
            for i in range(n_head):
                qs = self.qs_layers[i](q)
                ks = self.ks_layers[i](k)
                vs = self.vs_layers[i](v)
                head, attn = self.attention(qs, ks, vs, mask)
                heads.append(head)
                attns.append(attn)
            head = Concatenate()(heads) if n_head > 1 else heads[0]
            attn = Concatenate()(attns) if n_head > 1 else attns[0]
        outputs = self.w_o(head)
        outputs = Dropout(self.dropout)(outputs)
        if not self.layer_norm:
            return outputs, attn
        # outputs = Add()([outputs, q]) # sl: fix
        return self.layer_norm(outputs), attn
class PositionwiseFeedForward():
    """Two kernel-size-1 convolutions with dropout, a residual add and layer norm."""

    def __init__(self, d_hid, d_inner_hid, dropout=0.1):
        self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
        self.w_2 = Conv1D(d_hid, 1)
        self.layer_norm = LayerNormalization()
        self.dropout = Dropout(dropout)

    def __call__(self, x):
        """Return layer_norm(dropout(w_2(w_1(x))) + x)."""
        hidden = self.w_2(self.w_1(x))
        hidden = self.dropout(hidden)
        summed = Add()([hidden, x])
        return self.layer_norm(summed)
class EncoderLayer():
    """One encoder block: self-attention followed by the position-wise feed-forward."""

    def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
        self.self_att_layer = MultiHeadAttention(
            n_head, d_model, d_k, d_v, dropout=dropout)
        self.pos_ffn_layer = PositionwiseFeedForward(
            d_model, d_inner_hid, dropout=dropout)

    def __call__(self, enc_input, mask=None):
        """Return ``(output, self_attention)`` for ``enc_input``."""
        # Queries, keys and values are all the encoder input (self-attention).
        attended, slf_attn = self.self_att_layer(
            enc_input, enc_input, enc_input, mask=mask)
        return self.pos_ffn_layer(attended), slf_attn
def GetPosEncodingMatrix(max_len, d_emb):
    """Return the (max_len, d_emb) sinusoidal position-encoding matrix.

    Row 0 is all zeros. For every position >= 1, even columns hold
    sin(pos / 10000 ** (2 * (j // 2) / d_emb)) and odd columns hold the
    cosine of the same angle.

    The computation is vectorised and returns an empty (0, d_emb) matrix
    for ``max_len == 0`` instead of raising IndexError.
    """
    positions = np.arange(max_len, dtype=np.float64)[:, None]
    # Column j and column j + 1 (j even) share the same frequency.
    exponents = 2 * (np.arange(d_emb) // 2) / d_emb
    pos_enc = positions / np.power(10000, exponents)
    # Row 0 is 0 / x == 0 and is deliberately left untouched (no cos(0) = 1).
    pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2])  # dim 2i
    pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2])  # dim 2i+1
    return pos_enc
def GetPadMask(q, k):
    """Return a float mask that is 1 where the key id is non-zero, else 0.

    The key mask is repeated for every query position via a batched outer
    product with a tensor of ones shaped like ``q``.
    Assumes q and k are (batch, length) id tensors - TODO confirm with callers.
    """
    query_ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
    key_is_token = K.expand_dims(K.not_equal(k, 0), 1)
    key_mask = K.cast(key_is_token, 'float32')
    return K.batch_dot(query_ones, key_mask, axes=[2, 1])
def GetSubMask(s):
    """Return, per batch element, a lower-triangular (incl. diagonal) matrix of ones.

    The matrix is the cumulative sum of an identity matrix along axis 1 and
    has side length ``tf.shape(s)[1]``.
    """
    dynamic_shape = tf.shape(s)
    seq_len = dynamic_shape[1]
    batch_shape = dynamic_shape[:1]
    identity = tf.eye(seq_len, batch_shape=batch_shape)
    return K.cumsum(identity, 1)
class Transformer():
    """Sequence classifier: embedding -> two BiLSTMs -> multi-head attention -> pooling.

    NOTE(review): ``max_features`` and ``Encoder`` are read from module scope
    and are not defined in the visible part of this file - confirm they exist.
    """

    def __init__(self, len_limit, embedding_matrix, d_model=embed_size,
                 d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1,
                 share_word_emb=False, **kwargs):
        self.name = 'Transformer'
        self.len_limit = len_limit
        self.src_loc_info = False  # True # sl: fix later
        self.d_model = d_model
        self.decode_model = None
        # Kept on the instance so compile() uses the matrix this object was
        # built with instead of depending on a same-named module global.
        self.embedding_matrix = embedding_matrix
        d_emb = d_model
        pos_emb = Embedding(len_limit, d_emb, trainable=False,
                            weights=[GetPosEncodingMatrix(len_limit, d_emb)])
        # Add Kaggle provided embedding here
        i_word_emb = Embedding(max_features, d_emb, weights=[embedding_matrix])
        self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout,
                               word_emb=i_word_emb, pos_emb=pos_emb)

    def get_pos_seq(self, x):
        """Return 1-based positions for non-zero entries of ``x`` and 0 elsewhere."""
        mask = K.cast(K.not_equal(x, 0), 'int32')
        pos = K.cumsum(K.ones_like(x, 'int32'), 1)
        return pos * mask

    def compile(self, active_layers=999):
        """Build ``self.model`` and compile it with Adam and mean squared error."""
        src_seq_input = Input(shape=(None, ))
        x = Embedding(max_features, embed_size,
                      weights=[self.embedding_matrix])(src_seq_input)
        # LSTM before attention layers
        x = Bidirectional(LSTM(128, return_sequences=True))(x)
        x = Bidirectional(LSTM(64, return_sequences=True))(x)
        x, slf_attn = MultiHeadAttention(
            n_head=3, d_model=300, d_k=64, d_v=64, dropout=0.1)(x, x, x)
        avg_pool = GlobalAveragePooling1D()(x)
        max_pool = GlobalMaxPooling1D()(x)
        conc = concatenate([avg_pool, max_pool])
        conc = Dense(64, activation="relu")(conc)
        x = Dense(1, activation="sigmoid")(conc)
        self.model = Model(inputs=src_seq_input, outputs=x)
        self.model.compile(optimizer='adam', loss='mean_squared_error', metrics=['accuracy'])

If you look at where K is being used you will see:
K.expand_dims
K.cumsum
K.batch_dot
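These are Keras backend functions. The code is missing the import that defines K. Because the rest of the file imports from tensorflow.keras, add from tensorflow.keras import backend as K (or from keras import backend as K when using standalone Keras); K is the conventional alias for the backend module.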

Related

PyTorch T5 Transformer Implementation

I have been working on an implementation of the T5 architecture in PyTorch. I am having some issues properly implementing the Cross Attention Layers and Decoder.
If anyone who is familiar with the architecture could provide any advice it would be greatly appreciated.
I am sometimes receiving this error as well:
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
Thank you!
Code for T5 in PyTorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
    """Return True when *val* is anything other than None."""
    if val is None:
        return False
    return True
def default(val, d):
    """Return *val*, or the fallback *d* when *val* is None."""
    if val is None:
        return d
    return val
# residual wrapper
class Residual(nn.Module):
    """Wrap a callable so that its input is added back onto its output."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        # Keyword arguments are forwarded untouched to the wrapped callable.
        branch = self.fn(x, **kwargs)
        return branch + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
    """Layer norm over the last dimension with a learned gain and a fixed zero bias."""

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))
        # beta is a buffer, not a Parameter, so it is never trained.
        self.register_buffer("beta", torch.zeros(dim))

    def forward(self, x):
        feature_shape = x.shape[-1:]
        return F.layer_norm(x, feature_shape, weight = self.gamma, bias = self.beta)
class PreNorm(nn.Module):
    """Apply T5LayerNorm to the input before handing it to the wrapped callable."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = T5LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        # Only x is normalised; keyword arguments are passed through as-is.
        return self.fn(normed, **kwargs)
# feedforward layer
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, dim, mult = 4, dropout = 0.):
        super().__init__()
        inner_dim = int(dim * mult)
        stages = [
            nn.Linear(dim, inner_dim),
            nn.ReLU(),
            nn.Dropout(dropout),  # optional dropout
            nn.Linear(inner_dim, dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
    """Learned relative-position bias that is added to attention logits.

    Relative key/query offsets are mapped to ``num_buckets`` bucket ids; each
    bucket id looks up one learned scalar per head.
    """

    def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
        super().__init__()
        self.scale = scale
        self.causal = causal
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        self.relative_attention_bias = nn.Embedding(num_buckets, heads)

    # The decorator is required: forward() calls this through ``self`` with
    # keyword arguments, which would otherwise bind ``self`` to
    # ``relative_position`` and raise a TypeError.
    @staticmethod
    def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
        """Map a long tensor of relative positions (key - query) to bucket ids.

        Small distances get one bucket each; larger distances share
        logarithmically sized buckets, capped at the last bucket. When
        ``causal`` is False, half of the buckets are used for each sign.
        """
        ret = 0
        n = -relative_position
        if not causal:
            num_buckets //= 2
            ret += (n < 0).long() * num_buckets
            n = torch.abs(n)
        else:
            # Keys after the query (negative n) all collapse to distance 0.
            n = torch.max(n, torch.zeros_like(n))
        max_exact = num_buckets // 2
        is_small = n < max_exact
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).long()
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        ret += torch.where(is_small, n, val_if_large)
        return ret

    def forward(self, qk_dots):
        """Return ``qk_dots`` plus the scaled bias; the last two dims are (queries, keys)."""
        i, j, device = *qk_dots.shape[-2:], qk_dots.device
        # Queries are aligned with the last i key positions.
        q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
        k_pos = torch.arange(j, dtype = torch.long, device = device)
        rel_pos = k_pos[None, :] - q_pos[:, None]
        rp_bucket = self._relative_position_bucket(
            rel_pos,
            causal = self.causal,
            num_buckets = self.num_buckets,
            max_distance = self.max_distance
        )
        values = self.relative_attention_bias(rp_bucket)
        bias = rearrange(values, 'i j h -> h i j')
        return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
    """Multi-head self-attention with a T5 relative position bias."""

    def __init__(
        self,
        *,
        dim,
        heads = 12,
        dim_head = 64,
        causal = False,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.causal = causal
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_k = nn.Linear(dim, inner_dim, bias = False)
        self.to_v = nn.Linear(dim, inner_dim, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)
        self.relative_position_bias = T5RelativePositionBias(
            scale = dim_head ** -0.5,
            causal = causal,
            heads = heads
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask = None):
        """Attend over ``x``.

        ``mask`` is a boolean tensor, True for positions to keep. A rank-2
        mask is treated as a (batch, key_len) padding mask; any other rank
        must already broadcast against the (batch, heads, i, j) logits.
        """
        h = self.heads
        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h = h) for t in (q, k, v))
        q = q * self.scale
        sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        sim = self.relative_position_bias(sim)
        # mask
        mask_value = -torch.finfo(sim.dtype).max
        if mask is not None:
            if mask.ndim == 2:
                # Insert head and query axes; a bare (batch, key_len) mask
                # would otherwise align batch with the query axis and only
                # work for batch size 1.
                mask = mask[:, None, None, :]
            sim = sim.masked_fill(~mask, mask_value)
        if self.causal:
            i, j = sim.shape[-2:]
            # True above the diagonal: each query may not see later keys.
            causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
            sim = sim.masked_fill(causal_mask, mask_value)
        # attention
        attn = sim.softmax(dim = -1)
        attn = self.dropout(attn)
        # aggregate
        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        # merge heads
        out = rearrange(out, 'b h n d -> b n (h d)')
        # combine heads and linear output
        return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
    """Multi-head attention from ``x`` (queries) over ``context`` (keys/values).

    NOTE(review): this listing adds a relative position bias to the
    cross-attention logits; the later "working" listing in this document
    leaves it out.
    """

    def __init__(
        self,
        *,
        dim,
        context_dim = None,
        heads = 12,
        dim_head = 64,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, dim)
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)
        self.relative_position_bias = T5RelativePositionBias(
            scale = dim_head ** -0.5,
            causal = False,
            heads = heads
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context, mask = None, context_mask = None):
        """Attend from ``x`` to ``context`` (falls back to ``x`` when context is None).

        Both masks are boolean, True for positions to keep. A rank-2 ``mask``
        is treated as (batch, query_len) and a rank-2 ``context_mask`` as
        (batch, key_len); each is expanded to broadcast against the
        (batch, heads, i, j) logits.
        """
        h = self.heads
        kv_input = default(context, x)
        q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h = h) for t in (q, k, v))
        q = q * self.scale
        sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        sim = self.relative_position_bias(sim)
        # mask
        mask_value = -torch.finfo(sim.dtype).max
        if mask is not None:
            if mask.ndim == 2:
                # (batch, i) -> (batch, 1, i, 1): one flag per query row.
                mask = mask[:, None, :, None]
            sim = sim.masked_fill(~mask, mask_value)
        if context_mask is not None:
            if context_mask.ndim == 2:
                # (batch, j) -> (batch, 1, 1, j). The former [:, None, :]
                # aligned batch with the head axis for batch sizes > 1.
                context_mask = context_mask[:, None, None, :]
            else:
                context_mask = context_mask[:, None, :]
            sim = sim.masked_fill(~context_mask, mask_value)
        # attention
        attn = sim.softmax(dim = -1)
        attn = self.dropout(attn)
        # aggregate
        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        # merge heads
        out = rearrange(out, 'b h n d -> b n (h d)')
        # combine heads and linear output
        return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
    """Token embedding followed by ``depth`` pre-norm residual blocks and a final norm.

    No absolute position embedding is added here (that code path is disabled).
    """

    def __init__(
        self,
        *,
        dim,
        num_tokens,
        depth,
        heads = 12,
        dim_head = 64,
        causal = False,
        mlp_mult = 4,
        dropout = 0.
    ):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.layer = nn.ModuleList([])
        for _ in range(depth):
            attention = Residual(PreNorm(dim, T5SelfAttention(
                dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout)))
            feed_forward = Residual(PreNorm(dim, FeedForward(
                dim = dim, mult = mlp_mult, dropout = dropout)))
            self.layer.append(nn.ModuleList([attention, feed_forward]))
        self.final_norm = T5LayerNorm(dim)

    def forward(self, x, mask = None):
        """Embed token ids ``x`` and run them through every block."""
        hidden = self.token_emb(x)
        for block in self.layer:
            attn, mlp = block
            hidden = attn(hidden, mask = mask)
            hidden = mlp(hidden)
        return self.final_norm(hidden)
# T5 Decoder
class T5Decoder(nn.Module):
    """Token embedding followed by ``depth`` blocks of self-attention,
    cross-attention and feed-forward, then a final norm.

    No absolute position embedding is added here (that code path is disabled).
    """

    def __init__(
        self,
        *,
        dim,
        num_tokens,
        depth,
        heads = 12,
        dim_head = 64,
        causal = True,
        mlp_mult = 4,
        dropout = 0.
    ):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.layer = nn.ModuleList([])
        for _ in range(depth):
            self_attention = Residual(PreNorm(dim, T5SelfAttention(
                dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout)))
            cross_attention = Residual(PreNorm(dim, T5CrossAttention(
                dim = dim, heads = heads, dim_head = dim_head, dropout = dropout)))
            feed_forward = Residual(PreNorm(dim, FeedForward(
                dim = dim, mult = mlp_mult, dropout = dropout)))
            self.layer.append(nn.ModuleList([self_attention, cross_attention, feed_forward]))
        self.final_norm = T5LayerNorm(dim)

    def forward(self, x, context, mask = None, context_mask = None):
        """Embed token ids ``x`` and decode them against ``context``."""
        hidden = self.token_emb(x)
        for block in self.layer:
            attn, cross_attn, mlp = block
            hidden = attn(hidden, mask = mask)
            hidden = cross_attn(hidden, context = context, mask = mask, context_mask = context_mask)
            hidden = mlp(hidden)
        return self.final_norm(hidden)
# T5
class T5(nn.Module):
    """Encoder-decoder model that returns per-token logits over the decoder vocabulary."""

    def __init__(
        self,
        *,
        dim,
        enc_num_tokens,
        enc_depth,
        enc_heads,
        enc_dim_head,
        enc_mlp_mult,
        dec_num_tokens,
        dec_depth,
        dec_heads,
        dec_dim_head,
        dec_mlp_mult,
        dropout = 0.,
        tie_token_emb = True
    ):
        super().__init__()
        # Not used by forward(); kept so the module's parameters and
        # state-dict keys stay the same.
        self.embedding = nn.Embedding(enc_num_tokens, dim)
        self.encoder = T5Encoder(
            dim = dim,
            num_tokens = enc_num_tokens,
            depth = enc_depth,
            heads = enc_heads,
            dim_head = enc_dim_head,
            mlp_mult = enc_mlp_mult,
            dropout = dropout
        )
        self.decoder = T5Decoder(
            dim = dim,
            num_tokens = dec_num_tokens,
            depth = dec_depth,
            heads = dec_heads,
            dim_head = dec_dim_head,
            mlp_mult = dec_mlp_mult,
            dropout = dropout
        )
        self.to_logits = nn.Linear(dim, dec_num_tokens)
        # tie weights: the encoder embedding reuses the decoder's weight tensor
        if tie_token_emb:
            self.encoder.token_emb.weight = self.decoder.token_emb.weight

    def forward(self, src, tgt, mask = None, context_mask = None):
        """Encode ``src``, decode ``tgt`` against it and project to logits.

        ``mask`` is passed to both the encoder and the decoder;
        ``context_mask`` is passed to the decoder only.
        """
        # The encoder embeds ``src`` itself. The extra self.embedding(src)
        # lookup that used to run here was overwritten straight away, so it
        # has been removed.
        encoded = self.encoder(src, mask = mask)
        decoded = self.decoder(tgt, encoded, mask = mask, context_mask = context_mask)
        return self.to_logits(decoded)
if __name__ == '__main__':
    from opendelta import Visualization

    # Smoke test: random token ids through a 6-layer encoder / 6-layer decoder.
    vocab_size = 512
    seq_len = 1024
    model = T5(
        dim = 768,
        enc_num_tokens = vocab_size,
        enc_depth = 6,
        enc_heads = 12,
        enc_dim_head = 64,
        enc_mlp_mult = 4,
        dec_num_tokens = vocab_size,
        dec_depth = 6,
        dec_heads = 12,
        dec_dim_head = 64,
        dec_mlp_mult = 4,
        dropout = 0.,
        tie_token_emb = True
    )
    src = torch.randint(0, vocab_size, (1, seq_len))
    src_mask = torch.ones_like(src).bool()
    tgt = torch.randint(0, vocab_size, (1, seq_len))
    # Despite its name, ``loss`` holds the model output (per-token logits).
    loss = model(src, tgt, mask = src_mask)
    Visualization(model).structure_graph()
    print(loss.shape)  # torch.Size([1, 1024, 512])
Working implementation of T5 in pytorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
    """Return True when *val* is anything other than None."""
    if val is None:
        return False
    return True
def default(val, d):
    """Return *val*, or the fallback *d* when *val* is None."""
    if val is None:
        return d
    return val
# residual wrapper
class Residual(nn.Module):
    """Wrap a callable so that its input is added back onto its output."""

    def __init__(self, fn):
        super().__init__()
        self.fn = fn

    def forward(self, x, **kwargs):
        # Keyword arguments are forwarded untouched to the wrapped callable.
        branch = self.fn(x, **kwargs)
        return branch + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
    """Layer norm over the last dimension with a learned gain and a fixed zero bias."""

    def __init__(self, dim):
        super().__init__()
        self.gamma = nn.Parameter(torch.ones(dim))
        # beta is a buffer, not a Parameter, so it is never trained.
        self.register_buffer("beta", torch.zeros(dim))

    def forward(self, x):
        feature_shape = x.shape[-1:]
        return F.layer_norm(x, feature_shape, weight = self.gamma, bias = self.beta)
class PreNorm(nn.Module):
    """Apply T5LayerNorm to the input before handing it to the wrapped callable."""

    def __init__(self, dim, fn):
        super().__init__()
        self.norm = T5LayerNorm(dim)
        self.fn = fn

    def forward(self, x, **kwargs):
        normed = self.norm(x)
        # Only x is normalised; keyword arguments are passed through as-is.
        return self.fn(normed, **kwargs)
# feedforward layer
class FeedForward(nn.Module):
    """Position-wise MLP: Linear -> ReLU -> Dropout -> Linear."""

    def __init__(self, dim, mult = 4, dropout = 0.):
        super().__init__()
        inner_dim = int(dim * mult)
        stages = [
            nn.Linear(dim, inner_dim),
            nn.ReLU(),
            nn.Dropout(dropout),  # optional dropout
            nn.Linear(inner_dim, dim),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
    """Learned relative-position bias that is added to attention logits.

    Relative key/query offsets are mapped to ``num_buckets`` bucket ids; each
    bucket id looks up one learned scalar per head.
    """

    def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
        super().__init__()
        self.scale = scale
        self.causal = causal
        self.num_buckets = num_buckets
        self.max_distance = max_distance
        self.relative_attention_bias = nn.Embedding(num_buckets, heads)

    # The decorator is required: forward() calls this through ``self`` with
    # keyword arguments, which would otherwise bind ``self`` to
    # ``relative_position`` and raise a TypeError.
    @staticmethod
    def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
        """Map a long tensor of relative positions (key - query) to bucket ids.

        Small distances get one bucket each; larger distances share
        logarithmically sized buckets, capped at the last bucket. When
        ``causal`` is False, half of the buckets are used for each sign.
        """
        ret = 0
        n = -relative_position
        if not causal:
            num_buckets //= 2
            ret += (n < 0).long() * num_buckets
            n = torch.abs(n)
        else:
            # Keys after the query (negative n) all collapse to distance 0.
            n = torch.max(n, torch.zeros_like(n))
        max_exact = num_buckets // 2
        is_small = n < max_exact
        val_if_large = max_exact + (
            torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
        ).long()
        val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
        ret += torch.where(is_small, n, val_if_large)
        return ret

    def forward(self, qk_dots):
        """Return ``qk_dots`` plus the scaled bias; the last two dims are (queries, keys)."""
        i, j, device = *qk_dots.shape[-2:], qk_dots.device
        # Queries are aligned with the last i key positions.
        q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
        k_pos = torch.arange(j, dtype = torch.long, device = device)
        rel_pos = k_pos[None, :] - q_pos[:, None]
        rp_bucket = self._relative_position_bucket(
            rel_pos,
            causal = self.causal,
            num_buckets = self.num_buckets,
            max_distance = self.max_distance
        )
        values = self.relative_attention_bias(rp_bucket)
        bias = rearrange(values, 'i j h -> h i j')
        return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
    """Multi-head self-attention with a T5 relative position bias."""

    def __init__(
        self,
        *,
        dim,
        heads = 12,
        dim_head = 64,
        causal = False,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.causal = causal
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_k = nn.Linear(dim, inner_dim, bias = False)
        self.to_v = nn.Linear(dim, inner_dim, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)
        self.relative_position_bias = T5RelativePositionBias(
            scale = dim_head ** -0.5,
            causal = causal,
            heads = heads
        )
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, mask = None):
        """Attend over ``x``.

        ``mask`` is a boolean tensor, True for positions to keep. A rank-2
        mask is treated as a (batch, key_len) padding mask; any other rank
        must already broadcast against the (batch, heads, i, j) logits.
        """
        h = self.heads
        q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h = h) for t in (q, k, v))
        q = q * self.scale
        sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        sim = self.relative_position_bias(sim)
        # mask
        mask_value = -torch.finfo(sim.dtype).max
        if mask is not None:
            if mask.ndim == 2:
                # Insert head and query axes; a bare (batch, key_len) mask
                # would otherwise align batch with the query axis and only
                # work for batch size 1.
                mask = mask[:, None, None, :]
            sim = sim.masked_fill(~mask, mask_value)
        if self.causal:
            i, j = sim.shape[-2:]
            # True above the diagonal: each query may not see later keys.
            causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
            sim = sim.masked_fill(causal_mask, mask_value)
        # attention
        attn = sim.softmax(dim = -1)
        attn = self.dropout(attn)
        # aggregate
        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        # merge heads
        out = rearrange(out, 'b h n d -> b n (h d)')
        # combine heads and linear output
        return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
    """Multi-head attention from ``x`` (queries) over ``context`` (keys/values).

    No relative position bias is applied to the cross-attention logits.
    """

    def __init__(
        self,
        *,
        dim,
        context_dim = None,
        heads = 12,
        dim_head = 64,
        dropout = 0.
    ):
        super().__init__()
        inner_dim = dim_head * heads
        context_dim = default(context_dim, dim)
        self.heads = heads
        self.scale = dim_head ** -0.5
        self.to_q = nn.Linear(dim, inner_dim, bias = False)
        self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
        self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
        self.to_out = nn.Linear(inner_dim, dim)
        self.dropout = nn.Dropout(dropout)

    def forward(self, x, context, mask = None, context_mask = None):
        """Attend from ``x`` to ``context`` (falls back to ``x`` when context is None).

        Both masks are boolean, True for positions to keep. A rank-2 ``mask``
        is treated as (batch, query_len) and a rank-2 ``context_mask`` as
        (batch, key_len); each is expanded to broadcast against the
        (batch, heads, i, j) logits.
        """
        h = self.heads
        kv_input = default(context, x)
        q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
        q, k, v = (rearrange(t, 'b n (h d) -> b h n d', h = h) for t in (q, k, v))
        q = q * self.scale
        sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
        # mask
        mask_value = -torch.finfo(sim.dtype).max
        if mask is not None:
            if mask.ndim == 2:
                # (batch, i) -> (batch, 1, i, 1): one flag per query row.
                mask = mask[:, None, :, None]
            sim = sim.masked_fill(~mask, mask_value)
        if context_mask is not None:
            if context_mask.ndim == 2:
                # (batch, j) -> (batch, 1, 1, j). The former [:, None, :]
                # aligned batch with the head axis for batch sizes > 1.
                context_mask = context_mask[:, None, None, :]
            else:
                context_mask = context_mask[:, None, :]
            sim = sim.masked_fill(~context_mask, mask_value)
        # attention
        attn = sim.softmax(dim = -1)
        attn = self.dropout(attn)
        # aggregate
        out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
        # merge heads
        out = rearrange(out, 'b h n d -> b n (h d)')
        # combine heads and linear output
        return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
    """Token embedding followed by ``depth`` pre-norm residual blocks and a final norm.

    No absolute position embedding is added here (that code path is disabled).
    """

    def __init__(
        self,
        *,
        dim,
        num_tokens,
        depth,
        heads = 12,
        dim_head = 64,
        causal = False,
        mlp_mult = 4,
        dropout = 0.
    ):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.layer = nn.ModuleList([])
        for _ in range(depth):
            attention = Residual(PreNorm(dim, T5SelfAttention(
                dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout)))
            feed_forward = Residual(PreNorm(dim, FeedForward(
                dim = dim, mult = mlp_mult, dropout = dropout)))
            self.layer.append(nn.ModuleList([attention, feed_forward]))
        self.final_norm = T5LayerNorm(dim)

    def forward(self, x, mask = None):
        """Embed token ids ``x`` and run them through every block."""
        hidden = self.token_emb(x)
        for block in self.layer:
            attn, mlp = block
            hidden = attn(hidden, mask = mask)
            hidden = mlp(hidden)
        return self.final_norm(hidden)
# T5 Decoder
class T5Decoder(nn.Module):
    """Token embedding followed by ``depth`` blocks of self-attention,
    cross-attention and feed-forward, then a final norm.

    No absolute position embedding is added here (that code path is disabled).
    """

    def __init__(
        self,
        *,
        dim,
        num_tokens,
        depth,
        heads = 12,
        dim_head = 64,
        causal = True,
        mlp_mult = 4,
        dropout = 0.
    ):
        super().__init__()
        self.token_emb = nn.Embedding(num_tokens, dim)
        self.layer = nn.ModuleList([])
        for _ in range(depth):
            self_attention = Residual(PreNorm(dim, T5SelfAttention(
                dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout)))
            cross_attention = Residual(PreNorm(dim, T5CrossAttention(
                dim = dim, heads = heads, dim_head = dim_head, dropout = dropout)))
            feed_forward = Residual(PreNorm(dim, FeedForward(
                dim = dim, mult = mlp_mult, dropout = dropout)))
            self.layer.append(nn.ModuleList([self_attention, cross_attention, feed_forward]))
        self.final_norm = T5LayerNorm(dim)

    def forward(self, x, context, mask = None, context_mask = None):
        """Embed token ids ``x`` and decode them against ``context``."""
        hidden = self.token_emb(x)
        for block in self.layer:
            attn, cross_attn, mlp = block
            hidden = attn(hidden, mask = mask)
            hidden = cross_attn(hidden, context = context, mask = mask, context_mask = context_mask)
            hidden = mlp(hidden)
        return self.final_norm(hidden)
# T5
class T5(nn.Module):
    """Encoder-decoder model that returns per-token logits over the decoder vocabulary."""

    def __init__(
        self,
        *,
        dim,
        enc_num_tokens,
        enc_depth,
        enc_heads,
        enc_dim_head,
        enc_mlp_mult,
        dec_num_tokens,
        dec_depth,
        dec_heads,
        dec_dim_head,
        dec_mlp_mult,
        dropout = 0.,
        tie_token_emb = True
    ):
        super().__init__()
        # Not used by forward(); kept so the module's parameters and
        # state-dict keys stay the same.
        self.embedding = nn.Embedding(enc_num_tokens, dim)
        self.encoder = T5Encoder(
            dim = dim,
            num_tokens = enc_num_tokens,
            depth = enc_depth,
            heads = enc_heads,
            dim_head = enc_dim_head,
            mlp_mult = enc_mlp_mult,
            dropout = dropout
        )
        self.decoder = T5Decoder(
            dim = dim,
            num_tokens = dec_num_tokens,
            depth = dec_depth,
            heads = dec_heads,
            dim_head = dec_dim_head,
            mlp_mult = dec_mlp_mult,
            dropout = dropout
        )
        self.to_logits = nn.Linear(dim, dec_num_tokens)
        # tie weights: the encoder embedding reuses the decoder's weight tensor
        if tie_token_emb:
            self.encoder.token_emb.weight = self.decoder.token_emb.weight

    def forward(self, src, tgt, mask = None, context_mask = None):
        """Encode ``src``, decode ``tgt`` against it and project to logits.

        ``mask`` is passed to both the encoder and the decoder;
        ``context_mask`` is passed to the decoder only.
        """
        # The encoder embeds ``src`` itself. The extra self.embedding(src)
        # lookup that used to run here was overwritten straight away, so it
        # has been removed.
        encoded = self.encoder(src, mask = mask)
        decoded = self.decoder(tgt, encoded, mask = mask, context_mask = context_mask)
        return self.to_logits(decoded)
if __name__ == '__main__':
    # Smoke test: random token ids through a 6-layer encoder / 6-layer decoder.
    vocab_size = 512
    seq_len = 1024
    model = T5(
        dim = 768,
        enc_num_tokens = vocab_size,
        enc_depth = 6,
        enc_heads = 12,
        enc_dim_head = 64,
        enc_mlp_mult = 4,
        dec_num_tokens = vocab_size,
        dec_depth = 6,
        dec_heads = 12,
        dec_dim_head = 64,
        dec_mlp_mult = 4,
        dropout = 0.,
        tie_token_emb = True
    )
    src = torch.randint(0, vocab_size, (1, seq_len))
    src_mask = torch.ones_like(src).bool()
    tgt = torch.randint(0, vocab_size, (1, seq_len))
    # Despite its name, ``loss`` holds the model output (per-token logits).
    loss = model(src, tgt, mask = src_mask)
    print(loss.shape)  # torch.Size([1, 1024, 512])

Keras LSTM - looping over variable sequence length

I want to manually loop over the varying sequence lengths of the input sequences but Tensorflow automatically makes the time axis to None after noticing varying sequence lengths. Is there any work around for this?
Sample example
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
    """Run an LSTMCell and a Dense layer step by step over the time axis."""

    def __init__(self, int_dim, **kwargs):
        super().__init__(**kwargs)
        self.int_dim = int_dim
        self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
        self.d2 = tf.keras.layers.Dense(self.int_dim)

    def call(self, inputs):
        # The initial cell state is hard-wired to a batch size of 1.
        state_shape = (1, self.int_dim)
        states = (tf.zeros(state_shape), tf.zeros(state_shape))
        # NOTE: the loop length is the *static* time dimension; per the
        # surrounding text it becomes None for variable-length sequences.
        num_steps = inputs.shape[1]
        outputs = []
        for t in range(num_steps):
            step_input = inputs[:, t, :]
            lstm_out, states = self.lstm(step_input, states)
            outputs.append(self.d2(lstm_out))
        # Stack along axis 1 so time becomes the second dimension.
        return tf.stack(outputs, 1)
def generator():
    """Yield an endless stream of random (X, Y) pairs.

    Each pair has shape (1, seq_len, 5) with seq_len drawn from 2..9.
    """
    while True:
        steps = np.random.randint(2, 10)
        sample_shape = (1, steps, 5)
        features = tf.random.uniform(sample_shape)
        targets = tf.random.uniform(sample_shape)
        yield features, targets
# Build the model with int_dim=5 and train it on the random generator.
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
# NOTE(review): generator() never terminates and no steps_per_epoch is
# passed here - confirm how training is expected to stop.
model.fit(generator(), batch_size=1)
Here is a fix for Eager Execution mode:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
    """Run an LSTMCell and a Dense layer step by step over the time axis.

    The loop length is read from the dynamic shape, so this version is meant
    to be run eagerly.
    """

    def __init__(self, int_dim, **kwargs):
        super().__init__(**kwargs)
        self.int_dim = int_dim
        self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
        self.d2 = tf.keras.layers.Dense(self.int_dim)

    def call(self, inputs):
        # Initial state is sized from the runtime batch dimension.
        states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
                  tf.zeros((tf.shape(inputs)[0], self.int_dim)))
        num_steps = tf.shape(inputs)[1]
        outputs = []
        for t in range(num_steps):
            step_input = inputs[:, t, :]
            lstm_out, states = self.lstm(step_input, states)
            outputs.append(self.d2(lstm_out))
        # Stack along axis 1 so time becomes the second dimension.
        return tf.stack(outputs, 1)
def generator():
    """Yield an endless stream of random (X, Y) pairs.

    Each pair has shape (1, seq_len, 5) with seq_len drawn from 2..9.
    """
    while True:
        steps = np.random.randint(2, 10)
        sample_shape = (1, steps, 5)
        features = tf.random.uniform(sample_shape)
        targets = tf.random.uniform(sample_shape)
        yield features, targets
# Build the model with int_dim=5; run_eagerly=True makes call() execute
# eagerly, which the Python loop over tf.shape(inputs)[1] depends on.
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy', run_eagerly=True)
# NOTE(review): generator() never terminates and no steps_per_epoch is
# passed here - confirm how training is expected to stop.
model.fit(generator(), batch_size=1)
A Graph mode solution could look like this:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
    """Run an LSTMCell and a Dense layer over the time axis with tf.while_loop."""

    def __init__(self, int_dim, **kwargs):
        super(MyExample, self).__init__(**kwargs)
        self.int_dim = int_dim
        self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
        self.d2 = tf.keras.layers.Dense(self.int_dim)

    def some_logic(self, i, inputs, s1, s2, o):
        """Loop body: process time step ``i`` and append its output to ``o``."""
        lstm_out, s = self.lstm(inputs[:, i, :], (s1, s2))
        d2_out = self.d2(lstm_out)
        o = o.write(o.size(), d2_out)
        s1, s2 = s
        return tf.add(i, 1), inputs, s1, s2, o

    def call(self, inputs):
        """Return the per-step Dense outputs with time as the second axis."""
        states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
                  tf.zeros((tf.shape(inputs)[0], self.int_dim)))
        s1, s2 = states
        outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
        i = tf.constant(0)
        while_condition = lambda i, inputs, s1, s2, outputs: tf.less(i, tf.shape(inputs)[1])
        _, _, _, _, result = tf.while_loop(
            while_condition, self.some_logic, loop_vars=(i, inputs, s1, s2, outputs))
        # TensorArray.stack() puts the write index (time) first. Swap the
        # first two axes so the result is batch-major, matching the targets
        # and the tf.stack(outputs, 1) used by the eager variant.
        return tf.transpose(result.stack(), [1, 0, 2])
def generator():
    """Yield an endless stream of random (X, Y) pairs.

    Each pair has shape (1, seq_len, 5) with seq_len drawn from 2..9.
    """
    while True:
        steps = np.random.randint(2, 10)
        sample_shape = (1, steps, 5)
        features = tf.random.uniform(sample_shape)
        targets = tf.random.uniform(sample_shape)
        yield features, targets
# Build the model with int_dim=5 and train it on the random generator.
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
# NOTE(review): generator() never terminates and no steps_per_epoch is
# passed here - confirm how training is expected to stop.
model.fit(generator(), batch_size=1)

how to convert k.variable, addweights to pytorch code

I have code written in TensorFlow that I need to convert to PyTorch.
I think I am almost done, but I'm not sure about the details.
first i have
class adaptive_implicit_trans(layers.Layer):
    """1x1 convolution with a fixed 64x64 cosine kernel, scaled per input
    channel by learned non-negative weights."""

    def __init__(self, **kwargs):
        super(adaptive_implicit_trans, self).__init__(**kwargs)

    def build(self, input_shape):
        """Create the trainable channel weights and the fixed cosine kernel."""
        conv_shape = (1, 1, 64, 64)
        # One non-negative scale per input channel, initialised to 1.
        self.it_weights = self.add_weight(
            shape=(1, 1, 64, 1),
            initializer=initializers.get('ones'),
            constraint=constraints.NonNeg(),
            name='ait_conv')
        kernel = np.zeros(conv_shape)
        r1 = sqrt(1.0 / 8)
        r2 = sqrt(2.0 / 8)
        for i in range(8):
            _u = 2 * i + 1
            for j in range(8):
                _v = 2 * j + 1
                index = i * 8 + j
                for u in range(8):
                    for v in range(8):
                        index2 = u * 8 + v
                        t = cos(_u * u * pi / 16) * cos(_v * v * pi / 16)
                        t = t * r1 if u == 0 else t * r2
                        t = t * r1 if v == 0 else t * r2
                        # Layout is (h, w, in_channel, out_channel).
                        kernel[0, 0, index2, index] = t
        self.kernel = k.variable(value=kernel, dtype='float32')

    def call(self, inputs):
        # Use a local for the scaled kernel. Assigning the product back to
        # self.kernel would multiply the weights in again on every call.
        scaled_kernel = self.kernel * self.it_weights
        y = k.conv2d(inputs,
                     scaled_kernel,
                     padding='same',
                     data_format='channels_last')
        return y

    def compute_output_shape(self, input_shape):
        return input_shape
and i need to convert it to pytorch code.
so i made this.
class It_Weight(nn.Module):
    """Learned non-negative per-channel scale, multiplied onto its input.

    The weights are passed through ReLU before the multiplication, so only
    the weights are kept non-negative. Applying ReLU to the product (as
    before) also zeroed every negative entry of the input.
    NOTE(review): ReLU on the weights approximates the Keras ``NonNeg``
    constraint used by the TensorFlow version - confirm this is close enough.
    """

    def __init__(self):
        super().__init__()
        self.ReLU = nn.ReLU()
        self.it_weights = nn.Parameter(torch.ones((1, 64, 1, 1)))

    def forward(self, input):
        # Follow the device of the parameter instead of hard-coding 'cuda'.
        scale = self.ReLU(self.it_weights)
        return input.to(self.it_weights.device) * scale

    def compute_output_shape(self, input_shape):
        return input_shape
class Kernel(nn.Module):
    """Fixed (64, 64, 1, 1) cosine-basis kernel for a 1x1 convolution.

    The kernel is a non-persistent buffer: it moves with ``module.to(device)``
    but adds no state-dict key. It is built on the CPU rather than with a
    hard-coded ``.cuda()`` call.
    """

    def __init__(self):
        super().__init__()
        conv_shape = (64, 64, 1, 1)
        kernel = torch.zeros(conv_shape)
        r1 = sqrt(1.0 / 8)
        r2 = sqrt(2.0 / 8)
        for i in range(8):
            _u = 2 * i + 1
            for j in range(8):
                _v = 2 * j + 1
                index = i * 8 + j
                for u in range(8):
                    for v in range(8):
                        index2 = u * 8 + v
                        t = cos(_u * u * pi / 16) * cos(_v * v * pi / 16)
                        t = t * r1 if u == 0 else t * r2
                        t = t * r1 if v == 0 else t * r2
                        # Layout is (out_channel, in_channel, h, w).
                        kernel[index, index2, 0, 0] = t
        self.register_buffer('kernel', kernel, persistent=False)

    def forward(self):
        return self.kernel

    def compute_output_shape(self, input_shape):
        return input_shape
class adaptive_implicit_trans(nn.Module):
    """Convolve the input with ``Kernel``'s tensor after ``It_Weight`` scaling."""

    def __init__(self):
        super().__init__()
        self.it_weights = It_Weight()
        self.kernel = Kernel()

    def forward(self, inputs):
        """Return ``F.conv2d(inputs, kernel1)``; ``kernel1`` is kept on ``self``."""
        base_kernel = self.kernel()
        # Per the original note, the result has the same size as the kernel.
        self.kernel1 = self.it_weights(base_kernel)
        return F.conv2d(inputs, self.kernel1)

    def compute_output_shape(self, input_shape):
        """Return ``input_shape`` unchanged."""
        return input_shape
The number of classes increased, but this was the best I could do to avoid errors.
I don't know what is wrong.
I have been trying to train this PyTorch code, but its performance is lower than that of the TensorFlow code.
What should I change?
Please help me.

MLP mixer - Saving the training model

I'm trying to train the MLP mixer on a custom dataset based on this repository.
The code I have so far is shown below. How can I save the training model to further use it on test images?
import torch
import numpy as np
from torch import nn
from einops.layers.torch import Rearrange
import glob
import cv2
from torch.utils.data import Dataset, DataLoader
class customDataset(Dataset):
    """Image dataset built from one sub-directory per class.

    Every entry directly under ``imags_path`` is treated as a class folder
    whose name (the text after the last ``/``) is the label; each ``*.jpg``
    inside it becomes one ``[path, class_name]`` sample.
    """

    def __init__(self):
        self.imags_path = '/path_to_dataset/'
        self.data = []
        for class_dir in glob.glob(self.imags_path + '*'):
            label = class_dir.rsplit('/', 1)[-1]
            self.data.extend(
                [jpg_path, label] for jpg_path in glob.glob(class_dir + '/*.jpg')
            )
        self.class_map = {'dogs': 0, 'cats': 1}
        self.img_dim = (416, 416)

    def __len__(self):
        """Return the number of indexed samples."""
        return len(self.data)

    def __getitem__(self, idx):
        """Load sample ``idx`` as ``(image_tensor, class_id_tensor)``.

        The image is read with ``cv2.imread``, resized to ``img_dim`` and
        permuted from (H, W, C) to (C, H, W); the class id is wrapped in a
        one-element tensor.
        """
        img_path, class_name = self.data[idx]
        resized = cv2.resize(cv2.imread(img_path), self.img_dim)
        label = torch.tensor([self.class_map[class_name]])
        pixels = torch.from_numpy(resized).permute(2, 0, 1)
        return pixels, label
class FeedForward(nn.Module):
    """Two-layer MLP: Linear -> GELU -> Dropout -> Linear -> Dropout.

    Args:
        dim: size of the input and output features.
        hidden_dim: size of the intermediate features.
        dropout: dropout probability used after both linear layers.
    """

    def __init__(self, dim, hidden_dim, dropout = 0.):
        super().__init__()
        expand = nn.Linear(dim, hidden_dim)
        project = nn.Linear(hidden_dim, dim)
        stages = [
            expand,
            nn.GELU(),
            nn.Dropout(dropout),
            project,
            nn.Dropout(dropout),
        ]
        self.net = nn.Sequential(*stages)

    def forward(self, x):
        """Apply the MLP to ``x``."""
        return self.net(x)
class MixerBlock(nn.Module):
    """One mixer block: token mixing then channel mixing, each residual.

    Token mixing normalises over ``dim``, swaps the last two axes
    (``b n d -> b d n``) so the feed-forward acts across the ``num_patch``
    axis, and swaps them back. Channel mixing normalises and applies a
    feed-forward across ``dim``.
    """

    def __init__(self, dim, num_patch, token_dim, channel_dim, dropout = 0.):
        super().__init__()
        token_stages = [
            nn.LayerNorm(dim),
            Rearrange('b n d -> b d n'),
            FeedForward(num_patch, token_dim, dropout),
            Rearrange('b d n -> b n d'),
        ]
        self.token_mix = nn.Sequential(*token_stages)
        channel_stages = [
            nn.LayerNorm(dim),
            FeedForward(dim, channel_dim, dropout),
        ]
        self.channel_mix = nn.Sequential(*channel_stages)

    def forward(self, x):
        """Return ``x`` after both residual mixing steps."""
        after_tokens = x + self.token_mix(x)
        return after_tokens + self.channel_mix(after_tokens)
class MLPMixer(nn.Module):
    """Patch embedding, ``depth`` mixer blocks, LayerNorm, mean pool, linear head.

    The image is split into non-overlapping ``patch_size`` x ``patch_size``
    patches by a strided convolution and flattened to ``b (h w) c``.
    ``image_size`` must be divisible by ``patch_size``.
    """

    def __init__(self, in_channels, dim, num_classes, patch_size, image_size, depth, token_dim, channel_dim):
        super().__init__()
        assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
        patches_per_side = image_size // patch_size
        self.num_patch = patches_per_side ** 2
        patchify = nn.Conv2d(in_channels, dim, patch_size, patch_size)
        self.to_patch_embedding = nn.Sequential(
            patchify,
            Rearrange('b c h w -> b (h w) c'),
        )
        self.mixer_blocks = nn.ModuleList(
            MixerBlock(dim, self.num_patch, token_dim, channel_dim)
            for _ in range(depth)
        )
        self.layer_norm = nn.LayerNorm(dim)
        self.mlp_head = nn.Sequential(nn.Linear(dim, num_classes))

    def forward(self, x):
        """Return the head's output for image batch ``x``."""
        tokens = self.to_patch_embedding(x)
        for block in self.mixer_blocks:
            tokens = block(tokens)
        # Average over the patch axis before the classification head.
        pooled = self.layer_norm(tokens).mean(dim=1)
        return self.mlp_head(pooled)
if __name__ == '__main__':
    # `device` was used below without ever being defined; pick the GPU when
    # one is available and fall back to the CPU otherwise.
    device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
    dataset = customDataset()
    train_loader = DataLoader(dataset, batch_size=1, shuffle=True)
    mixer_model = MLPMixer(in_channels=3,
                           image_size=416,
                           patch_size=16,
                           num_classes=2,
                           dim=512,
                           depth=8,
                           token_dim=256,
                           channel_dim=2048)
    # The model must live on the same device as the batches it receives.
    mixer_model = mixer_model.to(device)
    for i, data in enumerate(train_loader, 0):
        inputs, labels = data
        inputs, labels = inputs.to(device), labels.to(device)
        inputs, labels = inputs.float(), labels.float()
        outputs = mixer_model(inputs)
    # NOTE: this loop only runs the forward pass; no loss, backward pass or
    # optimizer step is present. Once training is added, the weights can be
    # persisted with torch.save(mixer_model.state_dict(), <path>) and restored
    # with mixer_model.load_state_dict(torch.load(<path>)).
Thanks.

subclass a customized model in tensorflow2: Cannot convert a Tensor of dtype resource to a NumPy array

I'm a newbie to TensorFlow 2 and am using TensorFlow 2.3.1 (CPU version).
I defined the model in subclassing way and, when showing the structure of my model, I encountered the error "tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array", which points to the following line in BST_DSSM.build_model
"self.item_sequence_embeddings = tf.nn.embedding_lookup("
I have browsed through similar questions but can't find satisfactory solution.
Any help will be appreciated :)
Below is my code.
import tensorflow as tf
class MultiHeadAttention(tf.keras.layers.Layer):
    """ def multi head attention layer
    q, k, v multiplied by Wq, Wk, Wv respectively -> q', k', v'
    q' * k' -> w, w / sqrt(q'.shape[1]) -> w'
    w' * v' -> z, z * Wz -> z'
    z' add v (residual), then goes through LRelu, do a LN at last

    All sub-layers (projections, dropout, layer normalisation) are created
    once in ``__init__`` so their variables are tracked by Keras and reused
    on every call instead of being re-created inside ``call``.
    """

    def __init__(
            self,
            scope_name,
            num_units=8,
            num_heads=1,
            embed_dim=8,
            has_residual=True,
            dropout_keep_prob=1.0):
        """
        :param scope_name: "user" or "item"; used as prefix of the projection layer names
        :param num_units: output width of the Wq/Wk/Wv projections, must be divisible by num_heads
        :param num_heads: number of attention heads
        :param embed_dim: size of the last axis of the inputs and of the output
        :param has_residual: add ``values`` to the projected output when True
        :param dropout_keep_prob: keep probability; dropout is only used when strictly between 0 and 1
        """
        super(MultiHeadAttention, self).__init__()
        assert num_units % num_heads == 0
        assert scope_name in ["user", "item"]
        self.num_heads = num_heads
        self.num_units = num_units
        self.embed_dim = embed_dim
        self.dropout_keep_prob = dropout_keep_prob
        self.Wq = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wq")
        self.Wk = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wk")
        self.Wv = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wv")
        self.has_residual = has_residual
        self.Wz = tf.keras.layers.Dense(embed_dim)
        # Created here rather than in call(): a layer instantiated inside
        # call() gets fresh, untracked variables on every invocation.
        if 0 < self.dropout_keep_prob < 1:
            self.attn_dropout = tf.keras.layers.AlphaDropout(
                rate=1 - self.dropout_keep_prob)
        else:
            self.attn_dropout = None
        self.layer_norm = tf.keras.layers.LayerNormalization(
            beta_initializer="zeros", gamma_initializer="ones")

    def call(self, queries, keys_=None, values=None):
        """
        :param queries: of shape [batch_size, max_length, emb_dim]
        :param keys_: of shape [batch_size, max_length, emb_dim]; defaults to ``queries``
        :param values: of shape [batch_size, max_length, emb_dim]; defaults to ``queries``
        :return: tensor of shape [batch_size, max_length, emb_dim]
        """
        # Self-attention shortcut: a single tensor serves as q, k and v.
        if keys_ is None:
            keys_ = queries
        if values is None:
            values = queries
        assert values.get_shape().as_list()[-1] == self.embed_dim
        assert queries.get_shape().as_list()[-1] == self.embed_dim
        assert keys_.get_shape().as_list()[-1] == self.embed_dim
        # Linear projections
        Q = self.Wq(queries)
        K = self.Wk(keys_)
        V = self.Wv(values)
        # Split and concat
        Q_ = tf.concat(tf.split(Q, self.num_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, self.num_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, self.num_heads, axis=2), axis=0)
        # Multiplication
        weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        # Scale
        weights = weights / (K_.get_shape().as_list()[-1] ** 0.5)
        # convert to prob vector
        weights = tf.nn.softmax(weights)
        # Dropouts
        if self.attn_dropout is not None:
            weights = self.attn_dropout(weights)
        # Weighted sum
        # [batch_size * num_heads, max_length, num_units / num_heads]
        outputs = tf.matmul(weights, V_)
        # Restore shape to [batch_size, max_length, num_units]
        z = tf.concat(tf.split(outputs, self.num_heads, axis=0), axis=2)
        # Restore shape to [batch_size, max_length, embed_dim]
        z = self.Wz(z)
        # Residual connection
        if self.has_residual:
            z += values
        z = tf.nn.leaky_relu(z)
        # Normalize
        z = self.layer_norm(z)
        return z
class BST_DSSM(tf.keras.Model):
    """define BST+DSSM model structure

    ``build_model`` assembles a functional Keras model with two integer
    inputs ("item_sequence_idx", "user_sequence_idx"). Each input is
    embedded with its own table, passed ``blocks`` times through its
    attention layer, max-pooled over the sequence axis and L2-normalised;
    the output is the dot product of the two pooled vectors, reshaped to
    ``[-1, 1]``. The compiled model is stored in ``self.model``.
    """

    def __init__(self, model_dir,
                 item_embedding=None, user_embedding=None,
                 embedding_size=8,
                 vocab_size=1000,
                 max_length_item=15, max_length_user=6,
                 epoch=10, batch_size=256, blocks=2,
                 learning_rate=0.001, optimizer_type="adam",
                 batch_norm=0, batch_norm_decay=0.995,
                 verbose=False, random_seed=2019,
                 l2_reg=0.0, has_residual=True):
        """
        initial model related parms and tensors

        ``item_embedding`` / ``user_embedding`` are optional initial
        embedding matrices; when None, ``build_model`` draws random ones.
        ``optimizer_type`` is one of "adam", "adagrad", "gd", "momentum".
        The remaining arguments are stored as attributes; several of them
        (e.g. batch_norm, verbose, model_dir) are not read by the code in
        this class.
        """
        super(BST_DSSM, self).__init__()
        # denote as K, size of the feature embedding
        self.embedding_size = embedding_size
        self.l2_reg = l2_reg
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.optimizer = None
        self.blocks = blocks
        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay
        self.verbose = verbose
        self.random_seed = random_seed
        self.model_dir = model_dir
        self.vocab_size = vocab_size
        self.max_length_item = max_length_item
        self.max_length_user = max_length_user
        self.has_residual = has_residual
        self.model = None
        self.item_embedding = item_embedding
        self.user_embedding = user_embedding
        # embed_dim must match the embedding width, otherwise the shape
        # assertions in MultiHeadAttention.call fail for embedding_size != 8.
        self.mha_user = MultiHeadAttention(
            "user", num_units=embedding_size, embed_dim=embedding_size)
        self.mha_item = MultiHeadAttention(
            "item", num_units=embedding_size, embed_dim=embedding_size)

    def _random_embedding_matrix(self, name):
        """Return a [vocab_size, embedding_size] variable drawn uniformly from [-0.1, 0.1)."""
        std = 0.1
        return tf.Variable(
            tf.random.uniform(
                [self.vocab_size, self.embedding_size],
                -std, std,
                seed=self.random_seed,
                dtype=tf.float32),
            name=name)

    def _get_item_embedding_matrix(self):
        """Create ``self.item_embedding`` if the caller did not supply one."""
        if self.item_embedding is None:
            self.item_embedding = self._random_embedding_matrix("item_embedding")

    def _get_user_embedding_matrix(self):
        """Create ``self.user_embedding`` if the caller did not supply one."""
        if self.user_embedding is None:
            self.user_embedding = self._random_embedding_matrix("user_embedding")

    @staticmethod
    def _embedding_layer(matrix, name):
        """Wrap an initial embedding matrix in a Keras ``Embedding`` layer.

        Looking up a raw ``tf.Variable`` with Keras symbolic inputs is what
        raises "Cannot convert a Tensor of dtype resource to a NumPy array";
        a layer that owns its weights avoids that and keeps the table
        trainable as part of the functional model. The layer starts from a
        copy of ``matrix``; ``matrix`` itself is not updated by training.
        """
        initial = tf.convert_to_tensor(matrix, dtype=tf.float32).numpy()
        return tf.keras.layers.Embedding(
            input_dim=initial.shape[0],
            output_dim=initial.shape[1],
            embeddings_initializer=tf.keras.initializers.Constant(initial),
            name=name)

    def _build_optimizer(self):
        """Return the optimizer selected by ``self.optimizer_type``.

        :raises ValueError: for an unknown ``optimizer_type`` (previously
            this silently left the optimizer as None).
        """
        if self.optimizer_type == "adam":
            return tf.keras.optimizers.Adam(
                learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
        if self.optimizer_type == "adagrad":
            return tf.keras.optimizers.Adagrad(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8)
        if self.optimizer_type == "gd":
            return tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate)
        if self.optimizer_type == "momentum":
            return tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate, momentum=0.95)
        raise ValueError(f"Unknown optimizer_type: {self.optimizer_type!r}")

    def loss_fn(self, target, output):
        """Binary cross-entropy between ``target`` and ``output`` (not logits).

        ``build_model`` passes this method to ``compile``; it was referenced
        there without being defined. ``l2_reg`` is not applied here.
        """
        return tf.keras.backend.binary_crossentropy(
            target, output, from_logits=False)

    def build_model(self):
        """Build and compile the functional model and store it in ``self.model``."""
        # initialize lut
        self._get_item_embedding_matrix()
        self._get_user_embedding_matrix()
        # `shape` must be a tuple; `(n)` without the trailing comma is an int.
        item_inputs = tf.keras.Input(
            shape=(self.max_length_item,),
            dtype=tf.int32,
            name="item_sequence_idx")
        user_inputs = tf.keras.Input(
            shape=(self.max_length_user,),
            dtype=tf.int32,
            name="user_sequence_idx")
        # user and item use different lut, similarly to DSSM
        item_lookup = self._embedding_layer(
            self.item_embedding, "item_sequence_embeddings")
        user_lookup = self._embedding_layer(
            self.user_embedding, "video_sequence_embeddings")
        self.item_sequence_embeddings = item_lookup(item_inputs)
        self.video_sequence_embeddings = user_lookup(user_inputs)
        # self attn part
        # The same tensor is passed positionally as queries, keys and values;
        # the previous `keys=` keyword did not match the layer's `keys_`
        # parameter.
        for i in range(self.blocks):
            self.item_sequence_embeddings = self.mha_item(
                self.item_sequence_embeddings,
                self.item_sequence_embeddings,
                self.item_sequence_embeddings)
            self.video_sequence_embeddings = self.mha_user(
                self.video_sequence_embeddings,
                self.video_sequence_embeddings,
                self.video_sequence_embeddings)
        # max pooling
        self.item_sequence_embeddings = tf.nn.max_pool(
            self.item_sequence_embeddings,
            [1, self.max_length_item, 1],
            [1 for _ in range(len(self.item_sequence_embeddings.shape))],
            padding="VALID")
        self.video_sequence_embeddings = tf.nn.max_pool(
            self.video_sequence_embeddings,
            [1, self.max_length_user, 1],
            [1 for _ in range(len(self.video_sequence_embeddings.shape))],
            padding="VALID")
        # cosine similarity
        self.item_sequence_embeddings = tf.nn.l2_normalize(
            self.item_sequence_embeddings, axis=2)
        self.video_sequence_embeddings = tf.nn.l2_normalize(
            self.video_sequence_embeddings, axis=2)
        outputs = tf.matmul(
            self.item_sequence_embeddings,
            tf.transpose(self.video_sequence_embeddings, [0, 2, 1]))
        outputs = tf.reshape(outputs, [-1, 1])
        # optimizer
        self.optimizer = self._build_optimizer()
        self.model = tf.keras.Model(
            inputs={
                "item_sequence_idx": item_inputs,
                "user_sequence_idx": user_inputs
            },
            outputs=outputs)
        # NOTE(review): the output is a cosine similarity and may be
        # negative; confirm that AUC and the cross-entropy loss, which expect
        # predictions in [0, 1], behave as intended on it.
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_fn,
            metrics=[
                tf.keras.metrics.AUC(),
                # binary_accuracy is a function of (y_true, y_pred) and
                # cannot be called without arguments; use the metric class.
                tf.keras.metrics.BinaryAccuracy()])
Although I didn't figure out why I got such an error, I have built my model by defining a call method and the code is as below
from conf_loader import (
emb_dim, n_layer,
item_max_len, user_max_len,
batch_size, lr, l2_reg,
vocab_size
)
class BST_DSSM(tf.keras.Model):
    """define BST+DSSM model structure

    Subclassed variant: item and user index sequences are embedded with
    separate ``Embedding`` layers, passed ``n_layers`` times through their
    attention layer, max-pooled over the sequence axis, L2-normalised and
    combined with a dot product.
    """

    def __init__(self,
                 item_embedding=None, user_embedding=None,
                 emb_dim=emb_dim,
                 vocab_size=vocab_size,
                 item_max_len=item_max_len, user_max_len=user_max_len,
                 epoch=10, batch_size=batch_size, n_layers=n_layer,
                 learning_rate=lr, optimizer_type="adam",
                 random_seed=2019,
                 l2_reg=l2_reg, has_residual=True):
        """
        initial model related parms and tensors

        ``item_embedding`` and ``user_embedding`` are accepted for interface
        compatibility but not used: both attributes are set to freshly
        created ``Embedding`` layers.
        ``optimizer_type`` is one of "adam", "adagrad", "gd", "momentum".

        :raises ValueError: for an unknown ``optimizer_type``.
        """
        super(BST_DSSM, self).__init__()
        self.emb_dim = emb_dim
        self.l2_reg = l2_reg
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.blocks = n_layers
        self.random_seed = random_seed
        self.vocab_size = vocab_size
        self.item_max_len = item_max_len
        self.user_max_len = user_max_len
        self.has_residual = has_residual
        self.mha_user = MultiHeadAttention(scope_name="user", embed_dim=self.emb_dim)
        self.mha_item = MultiHeadAttention(scope_name="item", embed_dim=self.emb_dim)
        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.keras.optimizers.Adagrad(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate, momentum=0.95)
        else:
            # Previously an unknown name left `optimizer` unset and only
            # failed later, at compile time.
            raise ValueError(f"Unknown optimizer_type: {self.optimizer_type!r}")
        self.user_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
        self.item_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)

    def call(self, inputs, training=True):
        """Return the item/user similarity, reshaped to ``[-1, 1]``.

        :param inputs: sequence whose element 0 holds the item indices and
            element 1 the user indices
        :param training: accepted for the Keras call signature; not read here
        """
        # multiple inputs
        item_inputs = inputs[0]
        user_inputs = inputs[1]
        item_sequence_embeddings = self.item_embedding(item_inputs)
        user_sequence_embeddings = self.user_embedding(user_inputs)
        # [batch_size, max_length, 16]
        # Self-attention: the attention layer takes queries, keys and values,
        # so the same tensor is passed for all three.
        for i in range(self.blocks):
            item_sequence_embeddings = self.mha_item(
                item_sequence_embeddings,
                item_sequence_embeddings,
                item_sequence_embeddings)
            user_sequence_embeddings = self.mha_user(
                user_sequence_embeddings,
                user_sequence_embeddings,
                user_sequence_embeddings)
        # [batch_size, 1, 16]
        item_outputs_max = tf.nn.max_pool(
            item_sequence_embeddings,
            [1, self.item_max_len, 1],
            [1 for _ in range(len(item_sequence_embeddings.shape))],
            padding="VALID")
        user_outputs_max = tf.nn.max_pool(
            user_sequence_embeddings,
            [1, self.user_max_len, 1],
            [1 for _ in range(len(user_sequence_embeddings.shape))],
            padding="VALID")
        # L2 normalize to get cosine similarity
        item_normalized = tf.nn.l2_normalize(
            item_outputs_max, axis=2)
        user_normalized = tf.nn.l2_normalize(
            user_outputs_max, axis=2)
        outputs = tf.matmul(
            item_normalized,
            user_normalized,
            transpose_b=True)
        return tf.reshape(outputs, [-1, 1])

    def loss_fn(self, target, output):
        """Binary cross-entropy plus optional L2 penalty on both embedding tables."""
        cross_entropy = tf.keras.backend.binary_crossentropy(
            target, output, from_logits=False
        )
        if self.l2_reg > 0:
            _regularizer = tf.keras.regularizers.l2(self.l2_reg)
            # The regularizer operates on weight tensors, not on the layer
            # objects themselves.
            cross_entropy += _regularizer(self.user_embedding.embeddings)
            cross_entropy += _regularizer(self.item_embedding.embeddings)
        return cross_entropy
def debug():
    """Smoke-test BST_DSSM on five random samples and print its summary.

    Inputs are random indices in [0, 20) of shape (5, item_max_len) and
    (5, user_max_len); targets are random 0/1 floats.
    """
    x_train = [
        np.random.randint(low=0, high=20, size=(5, item_max_len)),
        np.random.randint(low=0, high=20, size=(5, user_max_len))]
    y_train = np.random.randint(low=0, high=2, size=5).astype(dtype=float)
    model = BST_DSSM()
    model.compile(
        optimizer=model.optimizer,
        loss=model.loss_fn
    )
    # `n_epoch` is not defined or imported anywhere in this snippet; use the
    # epoch count the model was configured with instead.
    model.fit(x_train, y_train, epochs=model.epoch)
    model.summary()

Categories

Resources