I have been working on an implementation of the T5 architecture in PyTorch. I am having some issues properly implementing the Cross Attention Layers and Decoder.
If anyone who is familiar with the architecture could provide any advice it would be greatly appreciated.
I am sometimes receiving this error as well:
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
Thank you!
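For context, that RuntimeError is raised by nn.Embedding itself: embedding lookups require Long or Int indices, so it fires whenever float-valued token ids reach the lookup. A minimal, self-contained sketch (separate from the model below) that reproduces and fixes it:
import torch
from torch import nn

emb = nn.Embedding(10, 4)                  # vocabulary of 10 tokens, 4-dim embeddings
bad_ids = torch.tensor([[1.0, 2.0, 3.0]])  # float dtype
# emb(bad_ids) raises the RuntimeError above, because indices must be Long/Int
good_ids = bad_ids.long()                  # cast the token ids before the lookup
print(emb(good_ids).shape)                 # torch.Size([1, 3, 4])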
Code for T5 in PyTorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
    @staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = False,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
from opendelta import Visualization
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
Visualization(model).structure_graph()
print(loss.shape) #torch.Size([1, 1024, 512])
Working implementation of T5 in PyTorch. The main change from the code above is that the relative position bias is dropped from the cross-attention layer (left commented out below); T5 applies its relative position bias only in the self-attention layers:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
    @staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
# self.relative_position_bias = T5RelativePositionBias(
# scale = dim_head ** -0.5,
# causal = False,
# heads = heads
# )
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
#sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
print(loss.shape) #torch.Size([1, 1024, 512])
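One follow-up note that is not in the original answer: despite the variable name, the tensor returned by model(src, tgt, mask = src_mask) holds the raw logits, not a loss. A minimal sketch of turning it into a training loss, reusing the names from the __main__ block above and assuming tgt holds the target token ids:
logits = model(src, tgt, mask = src_mask)  # (batch, seq_len, dec_num_tokens)
# F.cross_entropy expects (batch, classes, seq_len), so move the vocabulary axis
loss = F.cross_entropy(logits.transpose(1, 2), tgt)
loss.backward()
# in a real setup the decoder input would be the targets shifted right (teacher forcing)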
I want to manually loop over the time axis of input sequences with varying lengths, but TensorFlow automatically sets the time axis to None once it notices the varying sequence lengths. Is there any workaround for this?
Sample example
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def call(self, inputs):
states = (tf.zeros((1, self.int_dim)),
tf.zeros((1, self.int_dim)))
outputs = []
for t in range(inputs.shape[1]):
lstm_out, states = self.lstm(inputs[:, t, :], states)
d2_out = self.d2(lstm_out)
outputs.append(d2_out)
output_stack = tf.stack(outputs, 1)
return output_stack
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
model.fit(generator(), batch_size=1)
Here is a fix for eager execution mode: with run_eagerly=True, tf.shape(inputs) yields concrete values inside call, so the Python for loop can iterate over the dynamic time axis and the initial states can be sized to the actual batch:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def call(self, inputs):
states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
tf.zeros((tf.shape(inputs)[0], self.int_dim)))
outputs = []
for t in range(tf.shape(inputs)[1]):
lstm_out, states = self.lstm(inputs[:, t, :], states)
d2_out = self.d2(lstm_out)
outputs.append(d2_out)
output_stack = tf.stack(outputs, 1)
return output_stack
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy', run_eagerly=True)
model.fit(generator(), batch_size=1)
A graph-mode solution could look like this, replacing the Python loop with tf.while_loop and writing each step's output into a TensorArray so that the loop length can stay dynamic:
import tensorflow as tf
import numpy as np
class MyExample(tf.keras.Model):
def __init__(self, int_dim, **kwargs):
super(MyExample, self).__init__(**kwargs)
self.int_dim = int_dim
self.lstm = tf.keras.layers.LSTMCell(self.int_dim)
self.d2 = tf.keras.layers.Dense(self.int_dim)
def some_logic(self, i, inputs, s1, s2, o):
lstm_out, s = self.lstm(inputs[:, i, :], (s1, s2))
d2_out = self.d2(lstm_out)
o = o.write(o.size(), d2_out)
s1, s2 = s
return tf.add(i, 1), inputs, s1, s2, o
def call(self, inputs):
states = (tf.zeros((tf.shape(inputs)[0], self.int_dim)),
tf.zeros((tf.shape(inputs)[0], self.int_dim)))
s1, s2 = states
outputs = tf.TensorArray(dtype=tf.float32, size=0, dynamic_size=True)
i = tf.constant(0)
while_condition = lambda i, inputs, s1, s2, outputs: tf.less(i, tf.shape(inputs)[1])
_, _, _, _, result = tf.while_loop(while_condition, self.some_logic, loop_vars=(i, inputs, s1, s2, outputs))
return result.stack()
def generator():
while True:
seq_len = np.random.randint(2, 10)
X = tf.random.uniform((1, seq_len, 5))
Y = tf.random.uniform((1, seq_len, 5))
yield X, Y
model = MyExample(5)
model.compile('adam', 'BinaryCrossentropy')
model.fit(generator(), batch_size=1)
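A quick sanity check, not part of the original answer, assuming the graph-mode MyExample defined above: calling the model directly on inputs of two different lengths shows that the tf.while_loop adapts to the dynamic time axis.
check_model = MyExample(5)
for seq_len in (3, 7):
    x = tf.random.uniform((1, seq_len, 5))
    y = check_model(x)       # the while_loop body runs seq_len times
    print(seq_len, y.shape)  # the stacked TensorArray grows with seq_len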
I'm a newbie to TensorFlow 2 and use TensorFlow 2.3.1, CPU version.
I defined the model using the subclassing API and, when showing the structure of my model, I encountered the error "tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array", which points to the following line in BST_DSSM.build_model:
"self.item_sequence_embeddings = tf.nn.embedding_lookup("
I have browsed through similar questions but can't find a satisfactory solution.
Any help will be appreciated :)
Below is my code.
import tensorflow as tf
class MultiHeadAttention(tf.keras.layers.Layer):
""" def multi head attention layer
q, k, v multiplied by Wq, Wk, Wv respectively -> q', k', v'
q' * k' -> w, w / sqrt(q'.shape[1]) -> w'
w' * v' -> z, z * Wz -> z'
z' add v (residual), then goes through LRelu, do a LN at last
"""
def __init__(
self,
scope_name,
num_units=8,
num_heads=1,
embed_dim=8,
has_residual=True,
dropout_keep_prob=1.0):
super(MultiHeadAttention, self).__init__()
assert num_units % num_heads == 0
assert scope_name in ["user", "item"]
self.num_heads = num_heads
self.num_units = num_units
self.embed_dim = embed_dim
self.dropout_keep_prob = dropout_keep_prob
self.Wq = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wq")
self.Wk = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wk")
self.Wv = tf.keras.layers.Dense(
units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wv")
self.has_residual = has_residual
self.Wz = tf.keras.layers.Dense(embed_dim)
def call(self, queries, keys_, values):
"""
:param queries: of shape [batch_size, max_length, emb_dim]
:param keys_: of shape [batch_size, max_length, emb_dim]
:param values: of shape [batch_size, max_length, emb_dim]
:return:
"""
assert values.get_shape().as_list()[-1] == self.embed_dim
assert queries.get_shape().as_list()[-1] == self.embed_dim
assert keys_.get_shape().as_list()[-1] == self.embed_dim
# Linear projections
Q = self.Wq(queries)
K = self.Wk(keys_)
V = self.Wv(values)
# Split and concat
Q_ = tf.concat(tf.split(Q, self.num_heads, axis=2), axis=0)
K_ = tf.concat(tf.split(K, self.num_heads, axis=2), axis=0)
V_ = tf.concat(tf.split(V, self.num_heads, axis=2), axis=0)
# Multiplication
weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
# Scale
weights = weights / (K_.get_shape().as_list()[-1] ** 0.5)
# convert to prob vector
weights = tf.nn.softmax(weights)
# Dropouts
if 0 < self.dropout_keep_prob < 1:
weights = tf.keras.layers.AlphaDropout(
rate=1 - self.dropout_keep_prob)(weights)
# Weighted sum
# [batch_size * num_heads, max_length, num_units / num_heads]
outputs = tf.matmul(weights, V_)
# Restore shape to [batch_size, max_length, num_units]
z = tf.concat(tf.split(outputs, self.num_heads, axis=0), axis=2)
# Restore shape to [batch_size, max_length, embed_dim]
z = self.Wz(z)
# Residual connection
if self.has_residual:
z += values
z = tf.nn.leaky_relu(z)
# Normalize
z = tf.keras.layers.LayerNormalization(
beta_initializer="zeros", gamma_initializer="ones")(z)
return z
class BST_DSSM(tf.keras.Model):
"""define BST+DSSM model stucture
"""
def __init__(self, model_dir,
item_embedding=None, user_embedding=None,
embedding_size=8,
vocab_size=1000,
max_length_item=15, max_length_user=6,
epoch=10, batch_size=256, blocks=2,
learning_rate=0.001, optimizer_type="adam",
batch_norm=0, batch_norm_decay=0.995,
verbose=False, random_seed=2019,
l2_reg=0.0, has_residual=True):
"""
        initialize model-related params and tensors
"""
super(BST_DSSM, self).__init__()
# denote as K, size of the feature embedding
self.embedding_size = embedding_size
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.optimizer = None
self.blocks = blocks
self.batch_norm = batch_norm
self.batch_norm_decay = batch_norm_decay
self.verbose = verbose
self.random_seed = random_seed
self.model_dir = model_dir
# self._init_graph()
self.vocab_size = vocab_size
self.max_length_item = max_length_item
self.max_length_user = max_length_user
self.has_residual = has_residual
self.model = None
self.item_embedding = item_embedding
self.user_embedding = user_embedding
self.mha_user = MultiHeadAttention("user", num_units=embedding_size)
self.mha_item = MultiHeadAttention("item", num_units=embedding_size)
def _get_item_embedding_matrix(self):
if self.item_embedding is None:
std = 0.1
minval = -std
maxval = std
emb_matrix = tf.Variable(
tf.random.uniform(
[self.vocab_size, self.embedding_size],
minval, maxval,
seed=self.random_seed,
dtype=tf.float32),
name="item_embedding")
self.item_embedding = emb_matrix
def _get_user_embedding_matrix(self):
if self.user_embedding is None:
std = 0.1
minval = -std
maxval = std
emb_matrix = tf.Variable(
tf.random.uniform(
[self.vocab_size, self.embedding_size],
minval, maxval,
seed=self.random_seed,
dtype=tf.float32),
name="user_embedding")
self.user_embedding = emb_matrix
def build_model(self):
# initialize lut
self._get_item_embedding_matrix()
self._get_user_embedding_matrix()
item_inputs = tf.keras.Input(
shape=(
self.max_length_item
),
dtype=tf.int32,
name="item_sequence_idx")
user_inputs = tf.keras.Input(
shape=(
self.max_length_user
),
dtype=tf.int32,
name="user_sequence_idx")
# user and item use different lut, similarly to DSSM
self.item_sequence_embeddings = tf.nn.embedding_lookup(
self.item_embedding, item_inputs, name="item_sequence_embeddings")
self.video_sequence_embeddings = tf.nn.embedding_lookup(
self.user_embedding, user_inputs, name="video_sequence_embeddings")
# self attn part
for i in range(self.blocks):
self.item_sequence_embeddings = self.mha_item(
queries=self.item_sequence_embeddings,
keys=self.item_sequence_embeddings,
values=self.item_sequence_embeddings)
self.video_sequence_embeddings = self.mha_user(
queries=self.video_sequence_embeddings,
keys=self.video_sequence_embeddings,
values=self.video_sequence_embeddings)
# max pooling
self.item_sequence_embeddings = tf.nn.max_pool(
self.item_sequence_embeddings,
[1, self.max_length_item, 1],
[1 for _ in range(len(self.item_sequence_embeddings.shape))],
padding="VALID")
self.video_sequence_embeddings = tf.nn.max_pool(
self.video_sequence_embeddings,
[1, self.max_length_user, 1],
[1 for _ in range(len(self.video_sequence_embeddings.shape))],
padding="VALID")
# cosine similarity
self.item_sequence_embeddings = tf.nn.l2_normalize(
self.item_sequence_embeddings, axis=2)
self.video_sequence_embeddings = tf.nn.l2_normalize(
self.video_sequence_embeddings, axis=2)
outputs = tf.matmul(
self.item_sequence_embeddings,
tf.transpose(self.video_sequence_embeddings, [0, 2, 1]))
outputs = tf.reshape(outputs, [-1, 1])
# optimizer
if self.optimizer_type == "adam":
self.optimizer = tf.keras.optimizers.Adam(
learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.keras.optimizers.Adagrad(
learning_rate=self.learning_rate,
initial_accumulator_value=1e-8)
elif self.optimizer_type == "gd":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate)
elif self.optimizer_type == "momentum":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate, momentum=0.95)
self.model = tf.keras.Model(
inputs={
"item_sequence_idx": item_inputs,
"user_sequence_idx": user_inputs
},
outputs=outputs)
self.model.compile(
optimizer=self.optimizer,
loss=self.loss_fn,
metrics=[
tf.keras.metrics.AUC(),
tf.keras.metrics.binary_accuracy()])
Although I didn't figure out why I got such an error, I got the model working by defining a call method instead; the code is below.
import tensorflow as tf
import numpy as np

from conf_loader import (
emb_dim, n_layer,
item_max_len, user_max_len,
batch_size, lr, l2_reg,
vocab_size
)
class BST_DSSM(tf.keras.Model):
"""define BST+DSSM model stucture
"""
def __init__(self,
item_embedding=None, user_embedding=None,
emb_dim=emb_dim,
vocab_size=vocab_size,
item_max_len=item_max_len, user_max_len=user_max_len,
epoch=10, batch_size=batch_size, n_layers=n_layer,
learning_rate=lr, optimizer_type="adam",
random_seed=2019,
l2_reg=l2_reg, has_residual=True):
"""
        initialize model-related params and tensors
"""
super(BST_DSSM, self).__init__()
self.emb_dim = emb_dim
self.l2_reg = l2_reg
self.epoch = epoch
self.batch_size = batch_size
self.learning_rate = learning_rate
self.optimizer_type = optimizer_type
self.blocks = n_layers
self.random_seed = random_seed
self.vocab_size = vocab_size
self.item_max_len = item_max_len
self.user_max_len = user_max_len
self.has_residual = has_residual
self.item_embedding = item_embedding
self.user_embedding = user_embedding
self.mha_user = MultiHeadAttention(scope_name="user", embed_dim=self.emb_dim)
self.mha_item = MultiHeadAttention(scope_name="item", embed_dim=self.emb_dim)
# optimizer
if self.optimizer_type == "adam":
self.optimizer = tf.keras.optimizers.Adam(
learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
elif self.optimizer_type == "adagrad":
self.optimizer = tf.keras.optimizers.Adagrad(
learning_rate=self.learning_rate,
initial_accumulator_value=1e-8)
elif self.optimizer_type == "gd":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate)
elif self.optimizer_type == "momentum":
self.optimizer = tf.keras.optimizers.SGD(
learning_rate=self.learning_rate, momentum=0.95)
self.user_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
self.item_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
    @tf.function
def call(self, inputs, training=True):
# multiple inputs
item_inputs = inputs[0]
user_inputs = inputs[1]
item_sequence_embeddings = self.item_embedding(item_inputs)
user_sequence_embeddings = self.user_embedding(user_inputs)
# [batch_size, max_length, 16]
for i in range(self.blocks):
item_sequence_embeddings = self.mha_item(item_sequence_embeddings)
user_sequence_embeddings = self.mha_user(user_sequence_embeddings)
# [batch_size, 1, 16]
item_outputs_max = tf.nn.max_pool(
item_sequence_embeddings,
[1, self.item_max_len, 1],
[1 for _ in range(len(item_sequence_embeddings.shape))],
padding="VALID")
user_outputs_max = tf.nn.max_pool(
user_sequence_embeddings,
[1, self.user_max_len, 1],
[1 for _ in range(len(user_sequence_embeddings.shape))],
padding="VALID")
# L2 normalize to get cosine similarity
item_normalized = tf.nn.l2_normalize(
item_outputs_max, axis=2)
user_normalized = tf.nn.l2_normalize(
user_outputs_max, axis=2)
outputs = tf.matmul(
item_normalized,
user_normalized,
transpose_b=True)
return tf.reshape(outputs, [-1, 1])
def loss_fn(self, target, output):
cross_entropy = tf.keras.backend.binary_crossentropy(
target, output, from_logits=False
)
if self.l2_reg > 0:
_regularizer = tf.keras.regularizers.l2(self.l2_reg)
cross_entropy += _regularizer(self.user_embedding)
cross_entropy += _regularizer(self.item_embedding)
return cross_entropy
def debug():
x_train = [
np.random.randint(low=0, high=20, size=(5, item_max_len)),
np.random.randint(low=0, high=20, size=(5, user_max_len))]
y_train = np.random.randint(low=0, high=2, size=5).astype(dtype=float)
model = BST_DSSM()
model.compile(
optimizer=model.optimizer,
loss=model.loss_fn
)
model.fit(x_train, y_train, epochs=n_epoch)
model.summary()
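For completeness, a hypothetical way to run the debug helper above; n_epoch is not defined in the posted snippet, so a placeholder is used here (in the original setup it presumably comes from conf_loader).
n_epoch = 2  # placeholder; not defined in the posted code

if __name__ == "__main__":
    debug()  # builds BST_DSSM on random ids, fits briefly, then prints the summary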