How to convert K.variable and add_weight to PyTorch code - python

I have code written in TensorFlow (Keras) that I need to convert to PyTorch.
I think I am almost done with this, but I am not sure about the details.
First I have:
class adaptive_implicit_trans(layers.Layer):
    def __init__(self, **kwargs):
        super(adaptive_implicit_trans, self).__init__(**kwargs)

    def build(self, input_shape):
        conv_shape = (1,1,64,64)
        self.it_weights = self.add_weight(
            shape = (1,1,64,1),
            initializer = initializers.get('ones'),
            constraint = constraints.NonNeg(),
            name = 'ait_conv')
        kernel = np.zeros(conv_shape)
        r1 = sqrt(1.0/8)
        r2 = sqrt(2.0/8)
        for i in range(8):
            _u = 2*i+1
            for j in range(8):
                _v = 2*j+1
                index = i*8+j
                for u in range(8):
                    for v in range(8):
                        index2 = u*8+v
                        t = cos(_u*u*pi/16)*cos(_v*v*pi/16)
                        t = t*r1 if u==0 else t*r2
                        t = t*r1 if v==0 else t*r2
                        kernel[0,0,index2,index] = t
        self.kernel = k.variable(value = kernel, dtype = 'float32')

    def call(self, inputs):
        self.kernel = self.kernel*self.it_weights
        y = k.conv2d(inputs,
                     self.kernel,
                     padding = 'same',
                     data_format='channels_last')
        return y

    def compute_output_shape(self, input_shape):
        return input_shape
and I need to convert it to PyTorch code, so I made this:
class It_Weight(nn.Module):
    def __init__(self):
        super().__init__()
        self.ReLU = nn.ReLU()
        self.it_weights = nn.Parameter(torch.autograd.Variable(torch.ones((1, 64, 1, 1)),
                                                               requires_grad=True))

    def forward(self, input):
        y = input.to('cuda') * self.it_weights
        return self.ReLU(y)

    def compute_output_shape(self, input_shape):
        return input_shape


class Kernel(nn.Module):
    def __init__(self):
        super().__init__()
        conv_shape = (64, 64, 1, 1)
        kernel = torch.zeros(conv_shape).cuda()
        r1 = sqrt(1.0 / 8)
        r2 = sqrt(2.0 / 8)
        for i in range(8):
            _u = 2 * i + 1
            for j in range(8):
                _v = 2 * j + 1
                index = i * 8 + j
                for u in range(8):
                    for v in range(8):
                        index2 = u * 8 + v
                        t = cos(_u * u * pi / 16) * cos(_v * v * pi / 16)
                        t = t * r1 if u == 0 else t * r2
                        t = t * r1 if v == 0 else t * r2
                        kernel[index, index2, 0, 0] = t
        self.kernel = torch.autograd.Variable(kernel)

    def forward(self):
        return self.kernel

    def compute_output_shape(self, input_shape):
        return input_shape


class adaptive_implicit_trans(nn.Module):
    def __init__(self):
        super().__init__()
        self.it_weights = It_Weight()
        self.kernel = Kernel()

    def forward(self, inputs):
        self.kernel1 = self.it_weights(self.kernel())  # output has the same shape as the kernel
        y = F.conv2d(inputs, self.kernel1)
        return y

    def compute_output_shape(self, input_shape):
        return input_shape
The number of classes increased, but this was the best I could do to avoid errors.
I don't know where it is wrong.
I have been training this PyTorch code, but its performance is lower than the TensorFlow code's.
Where should I edit it?
Please help me.
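For comparison, here is a minimal single-module sketch of one possible PyTorch port (an illustration under assumptions, not a drop-in fix): it assumes channels-first NCHW input with 64 channels, keeps the fixed DCT-like kernel as a non-trainable buffer, and clamps the learnable weights to mimic Keras's NonNeg constraint instead of applying a ReLU.
import math
import torch
import torch.nn as nn
import torch.nn.functional as F

class AdaptiveImplicitTrans(nn.Module):
    def __init__(self):
        super().__init__()
        # learnable per-input-channel scale, initialized to ones (like the Keras add_weight)
        self.it_weights = nn.Parameter(torch.ones(1, 64, 1, 1))
        # fixed DCT-like basis in PyTorch layout: (out_channels, in_channels, kH, kW)
        kernel = torch.zeros(64, 64, 1, 1)
        r1 = math.sqrt(1.0 / 8)
        r2 = math.sqrt(2.0 / 8)
        for i in range(8):
            _u = 2 * i + 1
            for j in range(8):
                _v = 2 * j + 1
                index = i * 8 + j
                for u in range(8):
                    for v in range(8):
                        index2 = u * 8 + v
                        t = math.cos(_u * u * math.pi / 16) * math.cos(_v * v * math.pi / 16)
                        t = t * r1 if u == 0 else t * r2
                        t = t * r1 if v == 0 else t * r2
                        kernel[index, index2, 0, 0] = t
        # buffer: moves with .to(device) and is saved in state_dict, but is not trained
        self.register_buffer('kernel', kernel)

    def forward(self, x):  # x: (N, 64, H, W)
        # clamp enforces non-negativity; the effective kernel is rebuilt every call
        # instead of overwriting self.kernel in place
        weights = self.it_weights.clamp(min=0.0)
        return F.conv2d(x, self.kernel * weights)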

Related

PyTorch T5 Transformer Implementation

I have been working on an implementation of the T5 architecture in PyTorch. I am having some issues properly implementing the Cross Attention Layers and Decoder.
If anyone familiar with the architecture could provide any advice, it would be greatly appreciated.
I am sometimes receiving this error as well:
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
Thank you!
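For reference, that RuntimeError usually just means float values are being fed where token indices are expected: nn.Embedding requires Long/Int indices. A tiny standalone illustration (not taken from the code below):
import torch
from torch import nn

emb = nn.Embedding(512, 768)             # vocab of 512 tokens, embedding dim 768
tokens = torch.randint(0, 512, (1, 16))  # LongTensor of indices: works
out = emb(tokens)
# emb(tokens.float())                    # raises: Expected ... Long, Int; but got torch.FloatTensor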
Code for T5 in PyTorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
@staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = False,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
from opendelta import Visualization
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
Visualization(model).structure_graph()
print(loss.shape) #torch.Size([1, 1024, 512])
Working implementation of T5 in PyTorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
@staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
# self.relative_position_bias = T5RelativePositionBias(
# scale = dim_head ** -0.5,
# causal = False,
# heads = heads
# )
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
#sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
print(loss.shape) #torch.Size([1, 1024, 512])

Debugging Neural Network's feedforward propagation

I am implementing a Neural Network's forward propagation.
This is my train method:
def train(self, x, y):
for layer in self.layers:
prediction = layer.forward_prop(x.T) # column vector
and this is the forward_prop method:
def forward_prop(self, x):
if self.is_input:
self.out = x
else:
# bug here: x has to be updated according to previous layer's output!
w_sum = np.dot(self.weight, x) + self.bias
self.out = self.activation(w_sum)
return self.out
There is an error, since layer 2 is trying to multiply self.weight and x instead of the previous layer's output:
ValueError: shapes (4,4) and (2,1) not aligned: 4 (dim 1) != 2 (dim 0)
That is, I want:
Layer 0 (input layer): returns z^(0) = x
Layer 1 (first hidden layer): returns z^(1) = sig(W^(1) @ z^(0) + b^(1))
Layer 2 (second hidden layer): returns z^(2) = sig(W^(2) @ z^(1) + b^(2))
Layer 3 (output): returns z^(3) = sig(W^(3) @ z^(2) + b^(3))
...
Layer i: returns z^(i) = sig(W^(i) @ z^(i-1) + b^(i))
...
How can I store the previous layer's output?
This is my replicable example:
import numpy as np
def sigmoid(x):
return 1 / (1 + np.exp(-x))
class NeuralNetwork:
def __init__(self):
self.layers = []
def add_layer(self, layer):
self.layers.append(layer)
def create(self):
for i, layer in enumerate(self.layers):
if i == 0:
layer.is_input = True
else:
layer.init_parameters(self.layers[i - 1].neurons)
def summary(self):
for i, layer in enumerate(self.layers):
print("Layer", i)
print("neurons:", layer.neurons)
print("is_input:", layer.is_input)
print("act:", layer.activation)
print("weight:", np.shape(layer.weight))
print(layer.weight)
print("bias:", np.shape(layer.bias))
print(layer.bias)
print("")
def train(self, x, y):
for layer in self.layers:
prediction = layer.forward_prop(x.T) # column vector
class Layer:
def __init__(self, neurons, is_input=False, activation=None):
self.out = None
self.weight = None
self.bias = None
self.neurons = neurons
self.is_input = is_input
self.activation = activation
def init_parameters(self, prev_layer_neurons):
self.weight = np.asmatrix(np.random.normal(0, 0.5, (self.neurons, prev_layer_neurons)))
self.bias = np.asmatrix(np.random.normal(0, 0.5, self.neurons)).T # column vector
if self.activation is None:
self.activation = sigmoid
def forward_prop(self, x):
if self.is_input:
self.out = x
else:
w_sum = np.dot(self.weight, x) + self.bias # Bug
self.out = self.activation(w_sum)
return self.out
if __name__ == '__main__':
net = NeuralNetwork()
d = 2 # input dimension
c = 1 # output
for m in (d, 4, 4, c):
layer = Layer(m)
net.add_layer(layer)
net.create()
# net.summary() # dbg
# Training set
X = np.asmatrix([
[0, 0],
[0, 1],
[1, 0],
[1, 1]
])
y = np.asarray([0, 0, 1, 0])
net.train(X[2], y[2])
I simply solved it this way:
def train(self, x, y):
z = x.T
for layer in self.layers:
z = layer.forward_prop(z)

MLP Mixer - Saving the trained model

I'm trying to train the MLP Mixer on a custom dataset based on this repository.
The code I have so far is shown below. How can I save the trained model so I can later use it on test images?
import torch
import numpy as np
from torch import nn
from einops.layers.torch import Rearrange
import glob
import cv2
from torch.utils.data import Dataset, DataLoader
class customDataset(Dataset):
def __init__(self):
self.imags_path = '/path_to_dataset/'
file_list = glob.glob(self.imags_path + '*')
self.data = []
for class_path in file_list:
class_name = class_path.split('/')[-1]
for img_path in glob.glob(class_path + '/*.jpg'):
self.data.append([img_path,class_name])
self.class_map = {'dogs':0, 'cats':1}
self.img_dim = (416,416)
def __len__(self):
return len(self.data)
def __getitem__(self,idx):
img_path,class_name = self.data[idx]
img = cv2.imread(img_path)
img = cv2.resize(img,self.img_dim)
class_id = self.class_map[class_name]
img_tensor = torch.from_numpy(img)
img_tensor = img_tensor.permute(2, 0, 1)
class_id = torch.tensor([class_id])
return img_tensor, class_id
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, dropout = 0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
class MixerBlock(nn.Module):
def __init__(self, dim, num_patch, token_dim, channel_dim, dropout = 0.):
super().__init__()
self.token_mix = nn.Sequential(
nn.LayerNorm(dim),
Rearrange('b n d -> b d n'),
FeedForward(num_patch, token_dim, dropout),
Rearrange('b d n -> b n d')
)
self.channel_mix = nn.Sequential(
nn.LayerNorm(dim),
FeedForward(dim, channel_dim, dropout),
)
def forward(self, x):
x = x + self.token_mix(x)
x = x + self.channel_mix(x)
return x
class MLPMixer(nn.Module):
def __init__(self, in_channels, dim, num_classes, patch_size, image_size, depth, token_dim, channel_dim):
super().__init__()
assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
self.num_patch = (image_size // patch_size) ** 2
self.to_patch_embedding = nn.Sequential(
nn.Conv2d(in_channels, dim, patch_size, patch_size),
Rearrange('b c h w -> b (h w) c'),
)
self.mixer_blocks = nn.ModuleList([])
for _ in range(depth):
self.mixer_blocks.append(MixerBlock(dim, self.num_patch, token_dim, channel_dim))
self.layer_norm = nn.LayerNorm(dim)
self.mlp_head = nn.Sequential(
nn.Linear(dim, num_classes)
)
def forward(self, x):
x = self.to_patch_embedding(x)
for mixer_block in self.mixer_blocks:
x = mixer_block(x)
x = self.layer_norm(x)
x = x.mean(dim=1)
return self.mlp_head(x)
if __name__ == '__main__':
dataset = customDataset()
train_loader = DataLoader(dataset,batch_size=1,shuffle=True)
mixer_model = MLPMixer(in_channels=3,
image_size=416,
patch_size=16,
num_classes=2,
dim=512,
depth=8,
token_dim=256,
channel_dim=2048)
for i, data in enumerate(train_loader,0):
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
inputs, labels = inputs.float(), labels.float()
outputs = mixer_model(inputs)
Thanks.
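One common pattern (a sketch, not taken from the repository; the file name and test_batch are placeholders) is to save the model's state_dict after training and reload it for evaluation:
# after training
torch.save(mixer_model.state_dict(), 'mlp_mixer.pth')

# later, for test images
model = MLPMixer(in_channels=3, image_size=416, patch_size=16, num_classes=2,
                 dim=512, depth=8, token_dim=256, channel_dim=2048)
model.load_state_dict(torch.load('mlp_mixer.pth'))
model.eval()
with torch.no_grad():
    logits = model(test_batch)  # test_batch: a (N, 3, 416, 416) float tensor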

NameError: name 'K' is not defined

I'm following the guide to Transformers and the Colab project https://colab.research.google.com/drive/1XBP0Zh8K4g_n0A2p1UlGFf3dij0EX_Kt,
but when I run the cell with the line multi_head = build_model() I get the error.
This is the output from the console:
NameError                       Traceback (most recent call last)
in ()
----> 1 multi_head = build_model()
5 frames
in (x)
40 self.dropout = Dropout(attn_dropout)
41 def call(self, q, k, v, mask):
---> 42 attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
43 if mask is not None:
44 mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
NameError: name 'K' is not defined
It just runs after the model architecture code, which the error refers to.
Can you see where this K should be defined?
import random, os, sys
import numpy as np
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
try:
from dataloader import TokenList, pad_to_longest
# for transformer
except: pass
embed_size = 60
class LayerNormalization(Layer):
def __init__(self, eps=1e-6, **kwargs):
self.eps = eps
super(LayerNormalization, self).__init__(**kwargs)
def build(self, input_shape):
self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
initializer=Ones(), trainable=True)
self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
initializer=Zeros(), trainable=True)
super(LayerNormalization, self).build(input_shape)
def call(self, x):
mean = K.mean(x, axis=-1, keepdims=True)
std = K.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
def compute_output_shape(self, input_shape):
return input_shape
class ScaledDotProductAttention():
def __init__(self, d_model, attn_dropout=0.1):
self.temper = np.sqrt(d_model)
self.dropout = Dropout(attn_dropout)
def __call__(self, q, k, v, mask):
attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
if mask is not None:
mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
attn = Add()([attn, mmask])
attn = Activation('softmax')(attn)
attn = self.dropout(attn)
output = Lambda(lambda x:K.batch_dot(x[0], x[1]))([attn, v])
return output, attn
class MultiHeadAttention():
# mode 0 - big martixes, faster; mode 1 - more clear implementation
def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
self.mode = mode
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.dropout = dropout
if mode == 0:
self.qs_layer = Dense(n_head*d_k, use_bias=False)
self.ks_layer = Dense(n_head*d_k, use_bias=False)
self.vs_layer = Dense(n_head*d_v, use_bias=False)
elif mode == 1:
self.qs_layers = []
self.ks_layers = []
self.vs_layers = []
for _ in range(n_head):
self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
self.attention = ScaledDotProductAttention(d_model)
self.layer_norm = LayerNormalization() if use_norm else None
self.w_o = TimeDistributed(Dense(d_model))
def __call__(self, q, k, v, mask=None):
d_k, d_v = self.d_k, self.d_v
n_head = self.n_head
if self.mode == 0:
qs = self.qs_layer(q) # [batch_size, len_q, n_head*d_k]
ks = self.ks_layer(k)
vs = self.vs_layer(v)
def reshape1(x):
s = tf.shape(x) # [batch_size, len_q, n_head * d_k]
x = tf.reshape(x, [s[0], s[1], n_head, d_k])
x = tf.transpose(x, [2, 0, 1, 3])
x = tf.reshape(x, [-1, s[1], d_k]) # [n_head * batch_size, len_q, d_k]
return x
qs = Lambda(reshape1)(qs)
ks = Lambda(reshape1)(ks)
vs = Lambda(reshape1)(vs)
if mask is not None:
mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
head, attn = self.attention(qs, ks, vs, mask=mask)
def reshape2(x):
s = tf.shape(x) # [n_head * batch_size, len_v, d_v]
x = tf.reshape(x, [n_head, -1, s[1], s[2]])
x = tf.transpose(x, [1, 2, 0, 3])
x = tf.reshape(x, [-1, s[1], n_head*d_v]) # [batch_size, len_v, n_head * d_v]
return x
head = Lambda(reshape2)(head)
elif self.mode == 1:
heads = []; attns = []
for i in range(n_head):
qs = self.qs_layers[i](q)
ks = self.ks_layers[i](k)
vs = self.vs_layers[i](v)
head, attn = self.attention(qs, ks, vs, mask)
heads.append(head); attns.append(attn)
head = Concatenate()(heads) if n_head > 1 else heads[0]
attn = Concatenate()(attns) if n_head > 1 else attns[0]
outputs = self.w_o(head)
outputs = Dropout(self.dropout)(outputs)
if not self.layer_norm: return outputs, attn
# outputs = Add()([outputs, q]) # sl: fix
return self.layer_norm(outputs), attn
class PositionwiseFeedForward():
def __init__(self, d_hid, d_inner_hid, dropout=0.1):
self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
self.w_2 = Conv1D(d_hid, 1)
self.layer_norm = LayerNormalization()
self.dropout = Dropout(dropout)
def __call__(self, x):
output = self.w_1(x)
output = self.w_2(output)
output = self.dropout(output)
output = Add()([output, x])
return self.layer_norm(output)
class EncoderLayer():
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
self.pos_ffn_layer = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
def __call__(self, enc_input, mask=None):
output, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
output = self.pos_ffn_layer(output)
return output, slf_attn
def GetPosEncodingMatrix(max_len, d_emb):
pos_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
if pos != 0 else np.zeros(d_emb)
for pos in range(max_len)
])
pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
return pos_enc
def GetPadMask(q, k):
ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
mask = K.cast(K.expand_dims(K.not_equal(k, 0), 1), 'float32')
mask = K.batch_dot(ones, mask, axes=[2,1])
return mask
def GetSubMask(s):
len_s = tf.shape(s)[1]
bs = tf.shape(s)[:1]
mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
return mask
class Transformer():
def __init__(self, len_limit, embedding_matrix, d_model=embed_size, \
d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1, \
share_word_emb=False, **kwargs):
self.name = 'Transformer'
self.len_limit = len_limit
self.src_loc_info = False # True # sl: fix later
self.d_model = d_model
self.decode_model = None
d_emb = d_model
pos_emb = Embedding(len_limit, d_emb, trainable=False, \
weights=[GetPosEncodingMatrix(len_limit, d_emb)])
i_word_emb = Embedding(max_features, d_emb, weights=[embedding_matrix]) # Add Kaggle provided embedding here
self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
word_emb=i_word_emb, pos_emb=pos_emb)
def get_pos_seq(self, x):
mask = K.cast(K.not_equal(x, 0), 'int32')
pos = K.cumsum(K.ones_like(x, 'int32'), 1)
return pos * mask
def compile(self, active_layers=999):
src_seq_input = Input(shape=(None, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(src_seq_input)
# LSTM before attention layers
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x, slf_attn = MultiHeadAttention(n_head=3, d_model=300, d_k=64, d_v=64, dropout=0.1)(x, x, x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
conc = Dense(64, activation="relu")(conc)
x = Dense(1, activation="sigmoid")(conc)
self.model = Model(inputs=src_seq_input, outputs=x)
self.model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['accuracy'])
If you look at where K is being used you will see:
K.expand_dims
K.cumsum
K.batch_dot
These are Keras backend functions. The code is missing from keras import backend as K, which I think is a standard abbreviation.
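Since this script already imports everything else from tensorflow.keras, the matching import (placed with the other imports at the top; my assumption, not taken from the original code) would presumably be:
from tensorflow.keras import backend as K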

My LSTM code gives flat predictions and I don't know what is wrong. Can anyone help take a look?

Here is the code. I think the class mylstm has a problem, but I cannot find it... The input is simple: just 7 columns of data.
I tried to print out all the tensors but did not find what was wrong. Thanks for the help!
class mylstm(nn.Module):
def __init__(self, input_size, hidden_size, T, logger):
super(mylstm, self).__init__()
self.T = T
self.input_size = input_size
self.hidden_size = hidden_size
self.logger = logger
self.lstm_layer = nn.LSTM(input_size = 7, hidden_size = hidden_size)
self.fc = nn.Linear(hidden_size, 1)
#self.fc.weight.data.normal_()
def forward(self, input_data):
hidden = self.init_hidden(input_data)
cell = self.init_hidden(input_data)
for t in range(self.T - 1):
if t < self.T - 1:
self.lstm_layer.flatten_parameters()
_, lstm_output = self.lstm_layer(input_data[:,t,:].unsqueeze(0), (hidden, cell))
hidden = lstm_output[0]
cell = lstm_output[1]
y_pred = self.fc(hidden[0])
return y_pred
def init_hidden(self, x):
return Variable(x.data.new(1, x.size(0), self.hidden_size).zero_())
# Train the model
class rnn:
def __init__(self, file_data, logger, input_size = 7, hidden_size = 64, T = 10,
learning_rate = 0.01, batch_size = 128, parallel = True, debug = False):
self.T = T
dat = pd.read_csv(file_data, nrows = 100 if debug else None)
self.logger = logger
self.logger.info("Shape of data: %s.\nMissing in data: %s.", dat.shape, dat.isnull().sum().sum())
self.X = dat.loc[:, [x for x in dat.columns.tolist()]].values
self.y = np.array(dat.rtm_spp)
self.batch_size = batch_size
self.lstm1 = mylstm(input_size = input_size,
hidden_size = hidden_size,
T = T, logger = logger)
if parallel:
self.lstm1 = nn.DataParallel(self.lstm1)
self.lstm1_optimizer = optim.Adam(params = filter(lambda p: p.requires_grad, self.lstm1.parameters()),
lr = learning_rate)
self.train_size = 20000
self.y = self.y - np.mean(self.y[:self.train_size]) # Question: why Adam requires data to be normalized?
self.logger.info("Training size: %d.", self.train_size)
def train(self, n_epochs = 10):
iter_per_epoch = int(np.ceil(self.train_size * 1. / self.batch_size))
logger.info("Iterations per epoch: %3.3f ~ %d.", self.train_size * 1. / self.batch_size, iter_per_epoch)
self.iter_losses = np.zeros(n_epochs * iter_per_epoch)
self.epoch_losses = np.zeros(n_epochs)
self.loss_func = nn.MSELoss()
n_iter = 0
learning_rate = 1.
for i in range(n_epochs):
perm_idx = np.random.permutation(self.train_size - self.T-1)
j = 0
while j < self.train_size:
batch_idx = perm_idx[j:(j + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
y_target = self.y[batch_idx + self.T]
for k in range(len(batch_idx)):
X[k, :, :] = self.X[batch_idx[k] : (batch_idx[k] + self.T - 1), :]
loss = self.train_iteration(X, y_target)
self.iter_losses[i * iter_per_epoch + j // self.batch_size] = loss
#if (j / self.batch_size) % 50 == 0:
j += self.batch_size
n_iter += 1
if n_iter % 10000 == 0 and n_iter > 0:
for param_group in self.lstm1_optimizer.param_groups:
param_group['lr'] = param_group['lr'] * 0.9
self.epoch_losses[i] = np.mean(self.iter_losses[range(i * iter_per_epoch, (i + 1) * iter_per_epoch)])
if i % 10 == 0:
self.logger.info("Epoch %d, loss: %3.3f.", i, self.epoch_losses[i])
y_train_pred = self.predict(on_train = True)
y_test_pred = self.predict(on_train = False)
def train_iteration(self, X,y_target):
self.lstm1_optimizer.zero_grad()
y_pred = self.lstm1(Variable(torch.from_numpy(X).type(torch.FloatTensor)))
y_true = Variable(torch.from_numpy(y_target).type(torch.FloatTensor))
y_true = y_true.view(y_true.shape[0],1)
y_pred=y_pred.squeeze(0)
print(y_pred)
loss = self.loss_func(y_pred, y_true)
loss.backward()
self.lstm1_optimizer.step()
return loss.data[0]
def predict(self, on_train = False):
if on_train:
y_pred = np.zeros(self.train_size - self.T +1)
else:
y_pred = np.zeros(self.X.shape[0] - self.train_size)
i = 0
while i < len(y_pred):
batch_idx = np.array(range(len(y_pred)))[i : (i + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
for j in range(len(batch_idx)):
if on_train:
X[j, :, :] = self.X[range(batch_idx[j], batch_idx[j] + self.T - 1), :]
else:
X[j, :, :] = self.X[range(batch_idx[j] + self.train_size - self.T, batch_idx[j] + self.train_size - 1), :]
input_data = Variable(torch.from_numpy(X).type(torch.FloatTensor))
# print(self.lstm1(torch.randn(128,9,7)))
#print(self.lstm1(X).data.numpy())
y_pred[i:(i + self.batch_size)] = self.lstm1(input_data).data.numpy()[:,0]
i += self.batch_size
return y_pred
model = rnn(file_data = 'L.csv', logger = logger, parallel = False,
learning_rate = .001)
model.train(n_epochs = 1000)
y_pred = model.predict()
It might be good if you could reduce your code to the simplest form that still reproduces your problem. Asking people to debug over 200 lines of code may be too big an ask. If you can give a small example of your problem, using a very simple NN model instead of the current one, many others will be willing to look into your code and help identify the issue.
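For example, a minimal reproduction could look something like the sketch below (entirely illustrative, not taken from the question): a tiny LSTM regressor on synthetic data, small enough to check whether predictions collapse to a constant.
import torch
from torch import nn, optim

class TinyLSTM(nn.Module):
    def __init__(self, input_size=7, hidden_size=16):
        super().__init__()
        self.lstm = nn.LSTM(input_size, hidden_size, batch_first=True)
        self.fc = nn.Linear(hidden_size, 1)

    def forward(self, x):              # x: (batch, seq_len, input_size)
        out, _ = self.lstm(x)
        return self.fc(out[:, -1, :])  # regress from the last time step

model = TinyLSTM()
opt = optim.Adam(model.parameters(), lr=1e-3)
loss_fn = nn.MSELoss()
x = torch.randn(256, 9, 7)
y = x[:, -1, :1] * 2.0                 # a simple learnable target
for step in range(200):
    opt.zero_grad()
    loss = loss_fn(model(x), y)
    loss.backward()
    opt.step()
print(model(x)[:5])                    # should vary across samples, not be flat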
