PyTorch T5 Transformer Implementation

PyTorch T5 Transformer Implementation - python

I have been working on an implementation of the T5 architecture in PyTorch. I am having some issues properly implementing the Cross Attention Layers and Decoder.
If anyone who is familiar with the architecture could provide any advice it would be greatly appreciated.
I am sometimes receiving this error as well:
return torch.embedding(weight, input, padding_idx, scale_grad_by_freq, sparse)
RuntimeError: Expected tensor for argument #1 'indices' to have one of the following scalar types: Long, Int; but got torch.FloatTensor instead (while checking arguments for embedding)
Thank you!
Code for T5 in PyTorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
#staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = False,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
from opendelta import Visualization
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
Visualization(model).structure_graph()
print(loss.shape) #torch.Size([1, 1024, 512])

Working implementation of T5 in pytorch:
import torch
from torch import nn
import torch.nn.functional as F
import math
from einops import rearrange
def exists(val):
return val is not None
def default(val, d):
return val if exists(val) else d
# residual wrapper
class Residual(nn.Module):
def __init__(self, fn):
super().__init__()
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(x, **kwargs) + x
# pre-normalization wrapper
# they use layernorm without bias
class T5LayerNorm(nn.Module):
def __init__(self, dim):
super().__init__()
self.gamma = nn.Parameter(torch.ones(dim))
self.register_buffer("beta", torch.zeros(dim))
def forward(self, x):
return F.layer_norm(x, x.shape[-1:], self.gamma, self.beta)
class PreNorm(nn.Module):
def __init__(self, dim, fn):
super().__init__()
self.norm = T5LayerNorm(dim)
self.fn = fn
def forward(self, x, **kwargs):
return self.fn(self.norm(x), **kwargs)
# feedforward layer
class FeedForward(nn.Module):
def __init__(self, dim, mult = 4, dropout = 0.):
super().__init__()
inner_dim = int(dim * mult)
self.net = nn.Sequential(
nn.Linear(dim, inner_dim),
nn.ReLU(),
nn.Dropout(dropout), # optional dropout
nn.Linear(inner_dim, dim)
)
def forward(self, x):
return self.net(x)
# T5 relative positional bias
class T5RelativePositionBias(nn.Module):
def __init__(self, scale, causal = False, num_buckets = 32, max_distance = 128, heads = 12):
super().__init__()
self.scale = scale
self.causal = causal
self.num_buckets = num_buckets
self.max_distance = max_distance
self.relative_attention_bias = nn.Embedding(num_buckets, heads)
#staticmethod
def _relative_position_bucket(relative_position, causal = True, num_buckets = 32, max_distance = 128):
ret = 0
n = -relative_position
if not causal:
num_buckets //= 2
ret += (n < 0).long() * num_buckets
n = torch.abs(n)
else:
n = torch.max(n, torch.zeros_like(n))
max_exact = num_buckets // 2
is_small = n < max_exact
val_if_large = max_exact + (
torch.log(n.float() / max_exact) / math.log(max_distance / max_exact) * (num_buckets - max_exact)
).long()
val_if_large = torch.min(val_if_large, torch.full_like(val_if_large, num_buckets - 1))
ret += torch.where(is_small, n, val_if_large)
return ret
def forward(self, qk_dots):
i, j, device = *qk_dots.shape[-2:], qk_dots.device
q_pos = torch.arange(j - i, j, dtype = torch.long, device = device)
k_pos = torch.arange(j, dtype = torch.long, device = device)
rel_pos = k_pos[None, :] - q_pos[:, None]
rp_bucket = self._relative_position_bucket(
rel_pos,
causal = self.causal,
num_buckets = self.num_buckets,
max_distance = self.max_distance
)
values = self.relative_attention_bias(rp_bucket)
bias = rearrange(values, 'i j h -> h i j')
return qk_dots + (bias * self.scale)
# T5 Self Attention
class T5SelfAttention(nn.Module):
def __init__(
self,
*,
dim,
heads = 12,
dim_head = 64,
causal = False,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
self.heads = heads
self.scale = dim_head ** -0.5
self.causal = causal
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(dim, inner_dim, bias = False)
self.to_v = nn.Linear(dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
self.relative_position_bias = T5RelativePositionBias(
scale = dim_head ** -0.5,
causal = causal,
heads = heads
)
self.dropout = nn.Dropout(dropout)
def forward(self, x, mask = None):
b, n, _, h = *x.shape, self.heads
q, k, v = self.to_q(x), self.to_k(x), self.to_v(x)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if self.causal:
i, j = sim.shape[-2:]
causal_mask = torch.ones((i, j), dtype = torch.bool, device = x.device).triu(j - i + 1)
sim = sim.masked_fill(causal_mask, mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Cross Attention
class T5CrossAttention(nn.Module):
def __init__(
self,
*,
dim,
context_dim = None,
heads = 12,
dim_head = 64,
dropout = 0.
):
super().__init__()
inner_dim = dim_head * heads
context_dim = default(context_dim, dim)
self.heads = heads
self.scale = dim_head ** -0.5
self.to_q = nn.Linear(dim, inner_dim, bias = False)
self.to_k = nn.Linear(context_dim, inner_dim, bias = False)
self.to_v = nn.Linear(context_dim, inner_dim, bias = False)
self.to_out = nn.Linear(inner_dim, dim)
# self.relative_position_bias = T5RelativePositionBias(
# scale = dim_head ** -0.5,
# causal = False,
# heads = heads
# )
self.dropout = nn.Dropout(dropout)
def forward(self, x, context, mask = None, context_mask = None):
b, n, _, h = *x.shape, self.heads
kv_input = default(context, x)
q, k, v = self.to_q(x), self.to_k(kv_input), self.to_v(kv_input)
q, k, v = map(lambda t: rearrange(t, 'b n (h d) -> b h n d', h = h), (q, k, v))
q = q * self.scale
sim = torch.einsum('b h i d, b h j d -> b h i j', q, k)
#sim = self.relative_position_bias(sim)
# mask
mask_value = -torch.finfo(sim.dtype).max
if mask is not None:
sim = sim.masked_fill_(~mask, mask_value)
if context_mask is not None:
sim = sim.masked_fill_(~context_mask[:, None, :], mask_value)
# attention
attn = sim.softmax(dim = -1)
attn = self.dropout(attn)
# aggregate
out = torch.einsum('b h i j, b h j d -> b h i d', attn, v)
# merge heads
out = rearrange(out, 'b h n d -> b n (h d)')
# combine heads and linear output
return self.to_out(out)
# T5 Encoder
class T5Encoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = False,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, mlp in self.layer:
x = attn(x, mask = mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5 Decoder
class T5Decoder(nn.Module):
def __init__(
self,
*,
dim,
num_tokens,
#max_seq_len,
depth,
heads = 12,
dim_head = 64,
causal = True,
mlp_mult = 4,
dropout = 0.
):
super().__init__()
self.token_emb = nn.Embedding(num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.layer = nn.ModuleList([])
for _ in range(depth):
self.layer.append(nn.ModuleList([
Residual(PreNorm(dim, T5SelfAttention(dim = dim, heads = heads, dim_head = dim_head, causal = causal, dropout = dropout))),
Residual(PreNorm(dim, T5CrossAttention(dim = dim, heads = heads, dim_head = dim_head, dropout = dropout))),
Residual(PreNorm(dim, FeedForward(dim = dim, mult = mlp_mult, dropout = dropout))),
]))
self.final_norm = T5LayerNorm(dim)
def forward(self, x, context, mask = None, context_mask = None):
x = self.token_emb(x)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
for attn, cross_attn, mlp in self.layer:
x = attn(x, mask = mask)
x = cross_attn(x, context = context, mask = mask, context_mask = context_mask)
x = mlp(x)
x = self.final_norm(x)
return x
# T5
class T5(nn.Module):
def __init__(
self,
*,
dim,
#max_seq_len,
enc_num_tokens,
enc_depth,
enc_heads,
enc_dim_head,
enc_mlp_mult,
dec_num_tokens,
dec_depth,
dec_heads,
dec_dim_head,
dec_mlp_mult,
dropout = 0.,
tie_token_emb = True
):
super().__init__()
self.embedding = nn.Embedding(enc_num_tokens, dim)
#self.pos_emb = nn.Embedding(max_seq_len, dim)
self.encoder = T5Encoder(
dim = dim,
#max_seq_len = max_seq_len,
num_tokens = enc_num_tokens,
depth = enc_depth,
heads = enc_heads,
dim_head = enc_dim_head,
mlp_mult = enc_mlp_mult,
dropout = dropout
)
self.decoder = T5Decoder(
dim = dim,
#max_seq_len= max_seq_len,
num_tokens = dec_num_tokens,
depth = dec_depth,
heads = dec_heads,
dim_head = dec_dim_head,
mlp_mult = dec_mlp_mult,
dropout = dropout
)
self.to_logits = nn.Linear(dim, dec_num_tokens)
# tie weights
if tie_token_emb:
self.encoder.token_emb.weight = self.decoder.token_emb.weight
def forward(self, src, tgt, mask = None, context_mask = None):
x = self.embedding(src)
#x = x + self.pos_emb(torch.arange(x.shape[1], device = x.device))
x = self.encoder(src, mask = mask)
x = self.decoder(tgt, x, mask = mask, context_mask = context_mask)
x = self.to_logits(x)
return x
if __name__ == '__main__':
model = T5(
dim = 768,
#max_seq_len = 1024,
enc_num_tokens = 512,
enc_depth = 6,
enc_heads = 12,
enc_dim_head = 64,
enc_mlp_mult = 4,
dec_num_tokens = 512,
dec_depth = 6,
dec_heads = 12,
dec_dim_head = 64,
dec_mlp_mult = 4,
dropout = 0.,
tie_token_emb = True
)
src = torch.randint(0, 512, (1, 1024))
src_mask = torch.ones_like(src).bool()
tgt = torch.randint(0, 512, (1, 1024))
loss = model(src, tgt, mask = src_mask)
print(loss.shape) #torch.Size([1, 1024, 512])

Related

how to convert k.variable, addweights to pytorch code

i have a code written by tensorflow code i need to convert to pytorch.
i think i almost done with this but i'm not sure about detail.
first i have
class adaptive_implicit_trans(layers.Layer):
def __init__(self, **kwargs):
super(adaptive_implicit_trans, self).__init__(**kwargs)
def build(self, input_shape):
conv_shape = (1,1,64,64)
self.it_weights = self.add_weight(
shape = (1,1,64,1),
initializer = initializers.get('ones'),
constraint = constraints.NonNeg(),
name = 'ait_conv')
kernel = np.zeros(conv_shape)
r1 = sqrt(1.0/8)
r2 = sqrt(2.0/8)
for i in range(8):
_u = 2*i+1
for j in range(8):
_v = 2*j+1
index = i*8+j
for u in range(8):
for v in range(8):
index2 = u*8+v
t = cos(_u*u*pi/16)*cos(_v*v*pi/16)
t = t*r1 if u==0 else t*r2
t = t*r1 if v==0 else t*r2
kernel[0,0,index2,index] = t
self.kernel = k.variable(value = kernel, dtype = 'float32')
def call(self, inputs):
self.kernel = self.kernel*self.it_weights
y = k.conv2d(inputs,
self.kernel,
padding = 'same',
data_format='channels_last')
return y
def compute_output_shape(self, input_shape):
return input_shape
and i need to convert it to pytorch code.
so i made this.
class It_Weight(nn.Module):
def __init__(self):
super().__init__()
self.ReLU = nn.ReLU()
self.it_weights = nn.Parameter(torch.autograd.Variable(torch.ones((1, 64, 1, 1)),
requires_grad=True))
def forward(self, input):
y = input.to('cuda') * self.it_weights
return self.ReLU(y)
def compute_output_shape(self, input_shape):
return input_shape
class Kernel(nn.Module):
def __init__(self):
super().__init__()
conv_shape = (64, 64, 1, 1)
kernel = torch.zeros(conv_shape).cuda()
r1 = sqrt(1.0 / 8)
r2 = sqrt(2.0 / 8)
for i in range(8):
_u = 2 * i + 1
for j in range(8):
_v = 2 * j + 1
index = i * 8 + j
for u in range(8):
for v in range(8):
index2 = u * 8 + v
t = cos(_u * u * pi / 16) * cos(_v * v * pi / 16)
t = t * r1 if u == 0 else t * r2
t = t * r1 if v == 0 else t * r2
kernel[index, index2, 0, 0] = t
self.kernel = torch.autograd.Variable(kernel)
def forward(self):
return self.kernel
def compute_output_shape(self, input_shape):
return input_shape
class adaptive_implicit_trans(nn.Module):
def __init__(self):
super().__init__()
self.it_weights = It_Weight()
self.kernel = Kernel()
def forward(self, inputs):
self.kernel1 = self.it_weights(self.kernel()) # 출력도 kernel사이즈랑 동일
y = F.conv2d(inputs, self.kernel1)
return y
def compute_output_shape(self, input_shape):
return input_shape
the number of class was increased but it was my best in order to avoid error.
i don't know where is wrong.
i have been trying to train this pytorch code. but performance is lower than tensorflow code.
where should i edit ??
please help me.

MLP mixer - Saving the training model

I'm trying to train the MLP mixer on a custom dataset based on this repository.
The code I have so far is shown below. How can I save the training model to further use it on test images?
import torch
import numpy as np
from torch import nn
from einops.layers.torch import Rearrange
import glob
import cv2
from torch.utils.data import Dataset, DataLoader
class customDataset(Dataset):
def __init__(self):
self.imags_path = '/path_to_dataset/'
file_list = glob.glob(self.imags_path + '*')
self.data = []
for class_path in file_list:
class_name = class_path.split('/')[-1]
for img_path in glob.glob(class_path + '/*.jpg'):
self.data.append([img_path,class_name])
self.class_map = {'dogs':0, 'cats':1}
self.img_dim = (416,416)
def __len__(self):
return len(self.data)
def __getitem__(self,idx):
img_path,class_name = self.data[idx]
img = cv2.imread(img_path)
img = cv2.resize(img,self.img_dim)
class_id = self.class_map[class_name]
img_tensor = torch.from_numpy(img)
img_tensor = img_tensor.permute(2, 0, 1)
class_id = torch.tensor([class_id])
return img_tensor, class_id
class FeedForward(nn.Module):
def __init__(self, dim, hidden_dim, dropout = 0.):
super().__init__()
self.net = nn.Sequential(
nn.Linear(dim, hidden_dim),
nn.GELU(),
nn.Dropout(dropout),
nn.Linear(hidden_dim, dim),
nn.Dropout(dropout)
)
def forward(self, x):
return self.net(x)
class MixerBlock(nn.Module):
def __init__(self, dim, num_patch, token_dim, channel_dim, dropout = 0.):
super().__init__()
self.token_mix = nn.Sequential(
nn.LayerNorm(dim),
Rearrange('b n d -> b d n'),
FeedForward(num_patch, token_dim, dropout),
Rearrange('b d n -> b n d')
)
self.channel_mix = nn.Sequential(
nn.LayerNorm(dim),
FeedForward(dim, channel_dim, dropout),
)
def forward(self, x):
x = x + self.token_mix(x)
x = x + self.channel_mix(x)
return x
class MLPMixer(nn.Module):
def __init__(self, in_channels, dim, num_classes, patch_size, image_size, depth, token_dim, channel_dim):
super().__init__()
assert image_size % patch_size == 0, 'Image dimensions must be divisible by the patch size.'
self.num_patch = (image_size // patch_size) ** 2
self.to_patch_embedding = nn.Sequential(
nn.Conv2d(in_channels, dim, patch_size, patch_size),
Rearrange('b c h w -> b (h w) c'),
)
self.mixer_blocks = nn.ModuleList([])
for _ in range(depth):
self.mixer_blocks.append(MixerBlock(dim, self.num_patch, token_dim, channel_dim))
self.layer_norm = nn.LayerNorm(dim)
self.mlp_head = nn.Sequential(
nn.Linear(dim, num_classes)
)
def forward(self, x):
x = self.to_patch_embedding(x)
for mixer_block in self.mixer_blocks:
x = mixer_block(x)
x = self.layer_norm(x)
x = x.mean(dim=1)
return self.mlp_head(x)
if __name__ == '__main__':
dataset = customDataset()
train_loader = DataLoader(dataset,batch_size=1,shuffle=True)
mixer_model = MLPMixer(in_channels=3,
image_size=416,
patch_size=16,
num_classes=2,
dim=512,
depth=8,
token_dim=256,
channel_dim=2048)
for i, data in enumerate(train_loader,0):
inputs, labels = data
inputs, labels = inputs.to(device), labels.to(device)
inputs, labels = inputs.float(), labels.float()
outputs = mixer_model(inputs)
Thanks.

My tensorflow Convolutional Neural Network does not train

I tried to implement a class based convolutional neural network for face expression recognition data on kaggle using tensorflow. However, for some reason my network does not train and I keep getting the same cost and error rates at each iteration.
I tried using one hot vectors for labels, changing hyperparameters but they did not have any effect on the result.
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import tensorflow as tf
from sklearn.utils import shuffle
def get_data():
df = pd.read_csv('../large_files/fer2013/fer2013.csv')
Y = df.emotion.to_numpy()
XX = df.pixels
X = []
for i in range(len(XX)):
X.append(XX[i].split())
X = np.array(X).astype(np.float)
Z = df.Usage
train = (Z == 'Training').to_list()
test = [not i for i in train]
Xtrain = X[train].astype(np.float32)
Xtrain = Xtrain.reshape((Xtrain.shape[0], int(np.sqrt(Xtrain.shape[1])), int(np.sqrt(Xtrain.shape[1])), 1))
Xtest = X[test].astype(np.float32)
Xtest = Xtest.reshape((Xtest.shape[0], int(np.sqrt(Xtest.shape[1])), int(np.sqrt(Xtest.shape[1])), 1))
Ytrain = Y[train].astype(np.int32)
Ytest = Y[test].astype(np.int32)
return Xtrain / 255, Xtest / 255, Ytrain, Ytest
def convpool(X, W, b,poolsz):
conv_out = tf.nn.conv2d(X, W, strides = [1,1,1,1], padding = 'SAME')
conv_out = tf.nn.bias_add(conv_out, b)
pool_out = tf.nn.max_pool(conv_out, ksize=[1,poolsz,poolsz,1], strides=[1,poolsz,poolsz,1], padding = 'SAME')
return tf.nn.relu(pool_out)
def init_filter(shape):
w = np.random.rand(*shape) * np.sqrt(2 / np.prod(shape[:-1]))
return w.astype(np.float32)
def error_rate(Y,T):
return np.mean(Y != T)
class FullyConnectedLayer():
def __init__(self, M1, M2, activation = tf.nn.relu):
W = np.random.randn(M1,M2) / np.sqrt(M1 + M2)
self.W = tf.Variable(W.astype(np.float32))
b = np.zeros(M2)
self.b = tf.Variable(b.astype(np.float32))
self.activation = activation
def forward(self, X):
if self.activation == None:
return tf.matmul(X, self.W) + self.b
else:
return self.activation(tf.matmul(X, self.W) + self.b)
class ConvolutionLayer():
def __init__(self, filter_shape, b, poolsz = 2):
W = init_filter(filter_shape)
self.W = tf.Variable(W)
self.b = tf.Variable(b.astype(np.float32))
self.poolsize = poolsz
def forward(self, X):
return convpool(X, self.W, self.b, self.poolsize)
class CNN():
def __init__(self, filter_shapes, dense_layer_sizes):
self.filter_shapes = filter_shapes #List of shapes
self.dense_layer_sizes = dense_layer_sizes # List of hidden units for dense layers
def fit(self, trainset, testset, learning_rate = 0.001, momentum = 0.9, decay = 0.99, batch_sz = 200, poolsize = 2):
learning_rate = np.float32(learning_rate)
momentum = np.float32(momentum)
decay = np.float32(decay)
Xtrain = trainset[0]
Ytrain = trainset[1]
Xtest = testset[0]
Ytest = testset[1]
K = len(set(Ytrain))
# Crop Train and Test sets for divisibility to batch size
Ntrain = len(Ytrain)
Ntrain = Ntrain // batch_sz * batch_sz
Xtrain = Xtrain[:Ntrain,]
Ytrain = Ytrain[:Ntrain]
Ntest = len(Ytest)
Ntest = Ntest//batch_sz * batch_sz
Xtest = Xtest[:Ntest,]
Ytest = Ytest[:Ntest]
X_shape = Xtrain.shape
width = X_shape[1]
height = X_shape[2]
# Create Convolution Layers and Store Them
self.convolutionlayers = []
for shape in self.filter_shapes:
b = np.zeros(shape[-1], dtype = np.float32)
conv = ConvolutionLayer(shape, b, poolsz = poolsize)
self.convolutionlayers.append(conv)
# Size of both width and height is halved in each max pooling so in input size of first fully connected layer is found like this
final_filter_shape = self.filter_shapes[-1]
num_convs = len(self.convolutionlayers)
M1 = int((width/(2**num_convs)) * (height/(2**num_convs)) * final_filter_shape[-1])
# Create Fully Connected Layers and Store Them
self.vanillalayers = []
for M2 in self.dense_layer_sizes:
layer = FullyConnectedLayer(M1,M2)
self.vanillalayers.append(layer)
M1 = M2
final_layer = FullyConnectedLayer(M1, K, activation = None)
self.vanillalayers.append(final_layer)
self.AllLayers = self.convolutionlayers + self.vanillalayers
tfX = tf.placeholder(dtype=tf.float32, shape= (batch_sz, width, height, 1))
tfT = tf.placeholder(dtype=tf.int32, shape = (batch_sz,))
Yish = self.forward(tfX)
cost = tf.reduce_sum(tf.nn.sparse_softmax_cross_entropy_with_logits(logits = Yish, labels=tfT))
train_op = tf.train.RMSPropOptimizer(learning_rate=learning_rate, decay=decay, momentum=momentum).minimize(cost)
predict_op = self.predict(tfX)
max_epoch = 10
print_period = 20
num_batches = Ntrain // batch_sz
TestCosts = []
init = tf.global_variables_initializer()
with tf.Session() as sess:
sess.run(init)
for i in range(max_epoch):
Xtrain, Ytrain = shuffle(Xtrain, Ytrain)
for j in range(num_batches):
Xbatch = Xtrain[j * batch_sz: (j + 1)*batch_sz,]
Ybatch = Ytrain[j * batch_sz: (j + 1)*batch_sz,]
sess.run(train_op, feed_dict = {tfX : Xbatch, tfT : Ybatch})
if j % print_period == 0:
test_cost = 0
prediction = np.zeros(Ntest)
for k in range(Ntest // batch_sz):
Xtestbatch = Xtest[k*batch_sz:(k*batch_sz + batch_sz),]
Ytestbatch = Ytest[k*batch_sz:(k*batch_sz + batch_sz),]
test_cost += sess.run(cost, feed_dict={tfX: Xtestbatch, tfT: Ytestbatch})
prediction[k*batch_sz:(k*batch_sz + batch_sz)] = sess.run(
predict_op, feed_dict={tfX: Xtestbatch})
err = error_rate(prediction, Ytest)
print("Cost / err at iteration i=%d, j=%d: %.3f / %.3f" % (i, j, test_cost, err))
TestCosts.append(test_cost)
plt.plot(TestCosts)
plt.show()
def forward(self, X):
Z = X
count = 0
for layer in self.AllLayers:
# If next layer is fully connected layer, reshape Z
if count >= len(self.convolutionlayers):
Z_shape = Z.get_shape().as_list()
Z = tf.reshape(Z, [Z_shape[0], np.prod(Z_shape[1:])])
Z = layer.forward(Z)
count += 1
return Z
def predict(self, X):
out = self.forward(X)
return tf.math.argmax(out, axis = 1)
def main():
Xtrain, Xtest, Ytrain, Ytest = get_data()
trainset = [Xtrain, Ytrain]
testset = [Xtest, Ytest]
filtershapes = [(5,5,1,10), (5,5,10,20), (5,5,20,40)]
fullylayers = [500,500]
cnn = CNN(filtershapes, fullylayers)
cnn.fit(trainset, testset)
if __name__ == '__main__':
main()

NameError: name 'K' is not defined

I'm following the guide to Transformers and the colab project https://colab.research.google.com/drive/1XBP0Zh8K4g_n0A2p1UlGFf3dij0EX_Kt
but when I run the cell with the line multi_head = build_model() I get the error.
this is the output from the console:
NameError Traceback (most recent call
last) in ()
----> 1 multi_head = build_model()
5 frames in (x)
40 self.dropout = Dropout(attn_dropout)
41 def call(self, q, k, v, mask):
---> 42 attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
43 if mask is not None:
44 mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
NameError: name 'K' is not defined
It just runs after the model architecture code, which the error refers to.
Can you see where this Kshould be defined?
import random, os, sys
import numpy as np
from tensorflow.keras.models import *
from tensorflow.keras.layers import *
from tensorflow.keras.callbacks import *
from tensorflow.keras.initializers import *
import tensorflow as tf
from tensorflow.python.keras.layers import Layer
try:
from dataloader import TokenList, pad_to_longest
# for transformer
except: pass
embed_size = 60
class LayerNormalization(Layer):
def __init__(self, eps=1e-6, **kwargs):
self.eps = eps
super(LayerNormalization, self).__init__(**kwargs)
def build(self, input_shape):
self.gamma = self.add_weight(name='gamma', shape=input_shape[-1:],
initializer=Ones(), trainable=True)
self.beta = self.add_weight(name='beta', shape=input_shape[-1:],
initializer=Zeros(), trainable=True)
super(LayerNormalization, self).build(input_shape)
def call(self, x):
mean = K.mean(x, axis=-1, keepdims=True)
std = K.std(x, axis=-1, keepdims=True)
return self.gamma * (x - mean) / (std + self.eps) + self.beta
def compute_output_shape(self, input_shape):
return input_shape
class ScaledDotProductAttention():
def __init__(self, d_model, attn_dropout=0.1):
self.temper = np.sqrt(d_model)
self.dropout = Dropout(attn_dropout)
def __call__(self, q, k, v, mask):
attn = Lambda(lambda x:K.batch_dot(x[0],x[1],axes=[2,2])/self.temper)([q, k])
if mask is not None:
mmask = Lambda(lambda x:(-1e+10)*(1-x))(mask)
attn = Add()([attn, mmask])
attn = Activation('softmax')(attn)
attn = self.dropout(attn)
output = Lambda(lambda x:K.batch_dot(x[0], x[1]))([attn, v])
return output, attn
class MultiHeadAttention():
# mode 0 - big martixes, faster; mode 1 - more clear implementation
def __init__(self, n_head, d_model, d_k, d_v, dropout, mode=0, use_norm=True):
self.mode = mode
self.n_head = n_head
self.d_k = d_k
self.d_v = d_v
self.dropout = dropout
if mode == 0:
self.qs_layer = Dense(n_head*d_k, use_bias=False)
self.ks_layer = Dense(n_head*d_k, use_bias=False)
self.vs_layer = Dense(n_head*d_v, use_bias=False)
elif mode == 1:
self.qs_layers = []
self.ks_layers = []
self.vs_layers = []
for _ in range(n_head):
self.qs_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.ks_layers.append(TimeDistributed(Dense(d_k, use_bias=False)))
self.vs_layers.append(TimeDistributed(Dense(d_v, use_bias=False)))
self.attention = ScaledDotProductAttention(d_model)
self.layer_norm = LayerNormalization() if use_norm else None
self.w_o = TimeDistributed(Dense(d_model))
def __call__(self, q, k, v, mask=None):
d_k, d_v = self.d_k, self.d_v
n_head = self.n_head
if self.mode == 0:
qs = self.qs_layer(q) # [batch_size, len_q, n_head*d_k]
ks = self.ks_layer(k)
vs = self.vs_layer(v)
def reshape1(x):
s = tf.shape(x) # [batch_size, len_q, n_head * d_k]
x = tf.reshape(x, [s[0], s[1], n_head, d_k])
x = tf.transpose(x, [2, 0, 1, 3])
x = tf.reshape(x, [-1, s[1], d_k]) # [n_head * batch_size, len_q, d_k]
return x
qs = Lambda(reshape1)(qs)
ks = Lambda(reshape1)(ks)
vs = Lambda(reshape1)(vs)
if mask is not None:
mask = Lambda(lambda x:K.repeat_elements(x, n_head, 0))(mask)
head, attn = self.attention(qs, ks, vs, mask=mask)
def reshape2(x):
s = tf.shape(x) # [n_head * batch_size, len_v, d_v]
x = tf.reshape(x, [n_head, -1, s[1], s[2]])
x = tf.transpose(x, [1, 2, 0, 3])
x = tf.reshape(x, [-1, s[1], n_head*d_v]) # [batch_size, len_v, n_head * d_v]
return x
head = Lambda(reshape2)(head)
elif self.mode == 1:
heads = []; attns = []
for i in range(n_head):
qs = self.qs_layers[i](q)
ks = self.ks_layers[i](k)
vs = self.vs_layers[i](v)
head, attn = self.attention(qs, ks, vs, mask)
heads.append(head); attns.append(attn)
head = Concatenate()(heads) if n_head > 1 else heads[0]
attn = Concatenate()(attns) if n_head > 1 else attns[0]
outputs = self.w_o(head)
outputs = Dropout(self.dropout)(outputs)
if not self.layer_norm: return outputs, attn
# outputs = Add()([outputs, q]) # sl: fix
return self.layer_norm(outputs), attn
class PositionwiseFeedForward():
def __init__(self, d_hid, d_inner_hid, dropout=0.1):
self.w_1 = Conv1D(d_inner_hid, 1, activation='relu')
self.w_2 = Conv1D(d_hid, 1)
self.layer_norm = LayerNormalization()
self.dropout = Dropout(dropout)
def __call__(self, x):
output = self.w_1(x)
output = self.w_2(output)
output = self.dropout(output)
output = Add()([output, x])
return self.layer_norm(output)
class EncoderLayer():
def __init__(self, d_model, d_inner_hid, n_head, d_k, d_v, dropout=0.1):
self.self_att_layer = MultiHeadAttention(n_head, d_model, d_k, d_v, dropout=dropout)
self.pos_ffn_layer = PositionwiseFeedForward(d_model, d_inner_hid, dropout=dropout)
def __call__(self, enc_input, mask=None):
output, slf_attn = self.self_att_layer(enc_input, enc_input, enc_input, mask=mask)
output = self.pos_ffn_layer(output)
return output, slf_attn
def GetPosEncodingMatrix(max_len, d_emb):
pos_enc = np.array([
[pos / np.power(10000, 2 * (j // 2) / d_emb) for j in range(d_emb)]
if pos != 0 else np.zeros(d_emb)
for pos in range(max_len)
])
pos_enc[1:, 0::2] = np.sin(pos_enc[1:, 0::2]) # dim 2i
pos_enc[1:, 1::2] = np.cos(pos_enc[1:, 1::2]) # dim 2i+1
return pos_enc
def GetPadMask(q, k):
ones = K.expand_dims(K.ones_like(q, 'float32'), -1)
mask = K.cast(K.expand_dims(K.not_equal(k, 0), 1), 'float32')
mask = K.batch_dot(ones, mask, axes=[2,1])
return mask
def GetSubMask(s):
len_s = tf.shape(s)[1]
bs = tf.shape(s)[:1]
mask = K.cumsum(tf.eye(len_s, batch_shape=bs), 1)
return mask
class Transformer():
def __init__(self, len_limit, embedding_matrix, d_model=embed_size, \
d_inner_hid=512, n_head=10, d_k=64, d_v=64, layers=2, dropout=0.1, \
share_word_emb=False, **kwargs):
self.name = 'Transformer'
self.len_limit = len_limit
self.src_loc_info = False # True # sl: fix later
self.d_model = d_model
self.decode_model = None
d_emb = d_model
pos_emb = Embedding(len_limit, d_emb, trainable=False, \
weights=[GetPosEncodingMatrix(len_limit, d_emb)])
i_word_emb = Embedding(max_features, d_emb, weights=[embedding_matrix]) # Add Kaggle provided embedding here
self.encoder = Encoder(d_model, d_inner_hid, n_head, d_k, d_v, layers, dropout, \
word_emb=i_word_emb, pos_emb=pos_emb)
def get_pos_seq(self, x):
mask = K.cast(K.not_equal(x, 0), 'int32')
pos = K.cumsum(K.ones_like(x, 'int32'), 1)
return pos * mask
def compile(self, active_layers=999):
src_seq_input = Input(shape=(None, ))
x = Embedding(max_features, embed_size, weights=[embedding_matrix])(src_seq_input)
# LSTM before attention layers
x = Bidirectional(LSTM(128, return_sequences=True))(x)
x = Bidirectional(LSTM(64, return_sequences=True))(x)
x, slf_attn = MultiHeadAttention(n_head=3, d_model=300, d_k=64, d_v=64, dropout=0.1)(x, x, x)
avg_pool = GlobalAveragePooling1D()(x)
max_pool = GlobalMaxPooling1D()(x)
conc = concatenate([avg_pool, max_pool])
conc = Dense(64, activation="relu")(conc)
x = Dense(1, activation="sigmoid")(conc)
self.model = Model(inputs=src_seq_input, outputs=x)
self.model.compile(optimizer = 'adam', loss = 'mean_squared_error', metrics=['accuracy'])

If you look at where K is being used you will see:
K.expand_dims
K.cumsum
K.batch_dot
These are Keras backend functions. The code is missing a from keras import backend as K, which I think is a standard abbreviation.

My LSTM code give flat prediction and don't know what is wrong..Anyone can help take a look?

Here is the code. I think the class mylstm has problem but I can not find it... The input is simple, which is just 7 columns data.
I tried to print out all the tensors but did not find what was wrong. Thanks for help!
class mylstm(nn.Module):
def __init__(self, input_size, hidden_size, T, logger):
super(mylstm, self).__init__()
self.T = T
self.input_size = input_size
self.hidden_size = hidden_size
self.logger = logger
self.lstm_layer = nn.LSTM(input_size = 7, hidden_size = hidden_size)
self.fc = nn.Linear(hidden_size, 1)
#self.fc.weight.data.normal_()
def forward(self, input_data):
hidden = self.init_hidden(input_data)
cell = self.init_hidden(input_data)
for t in range(self.T - 1):
if t < self.T - 1:
self.lstm_layer.flatten_parameters()
_, lstm_output = self.lstm_layer(input_data[:,t,:].unsqueeze(0), (hidden, cell))
hidden = lstm_output[0]
cell = lstm_output[1]
y_pred = self.fc(hidden[0])
return y_pred
def init_hidden(self, x):
return Variable(x.data.new(1, x.size(0), self.hidden_size).zero_())
# Train the model
class rnn:
def __init__(self, file_data, logger, input_size = 7, hidden_size = 64, T = 10,
learning_rate = 0.01, batch_size = 128, parallel = True, debug = False):
self.T = T
dat = pd.read_csv(file_data, nrows = 100 if debug else None)
self.logger = logger
self.logger.info("Shape of data: %s.\nMissing in data: %s.", dat.shape, dat.isnull().sum().sum())
self.X = dat.loc[:, [x for x in dat.columns.tolist()]].values
self.y = np.array(dat.rtm_spp)
self.batch_size = batch_size
self.lstm1 = mylstm(input_size = input_size,
hidden_size = hidden_size,
T = T, logger = logger)
if parallel:
self.lstm1 = nn.DataParallel(self.lstm1)
self.lstm1_optimizer = optim.Adam(params = filter(lambda p: p.requires_grad, self.lstm1.parameters()),
lr = learning_rate)
self.train_size = 20000
self.y = self.y - np.mean(self.y[:self.train_size]) # Question: why Adam requires data to be normalized?
self.logger.info("Training size: %d.", self.train_size)
def train(self, n_epochs = 10):
iter_per_epoch = int(np.ceil(self.train_size * 1. / self.batch_size))
logger.info("Iterations per epoch: %3.3f ~ %d.", self.train_size * 1. / self.batch_size, iter_per_epoch)
self.iter_losses = np.zeros(n_epochs * iter_per_epoch)
self.epoch_losses = np.zeros(n_epochs)
self.loss_func = nn.MSELoss()
n_iter = 0
learning_rate = 1.
for i in range(n_epochs):
perm_idx = np.random.permutation(self.train_size - self.T-1)
j = 0
while j < self.train_size:
batch_idx = perm_idx[j:(j + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
y_target = self.y[batch_idx + self.T]
for k in range(len(batch_idx)):
X[k, :, :] = self.X[batch_idx[k] : (batch_idx[k] + self.T - 1), :]
loss = self.train_iteration(X, y_target)
self.iter_losses[i * iter_per_epoch + j // self.batch_size] = loss
#if (j / self.batch_size) % 50 == 0:
j += self.batch_size
n_iter += 1
if n_iter % 10000 == 0 and n_iter > 0:
for param_group in self.lstm1_optimizer.param_groups:
param_group['lr'] = param_group['lr'] * 0.9
self.epoch_losses[i] = np.mean(self.iter_losses[range(i * iter_per_epoch, (i + 1) * iter_per_epoch)])
if i % 10 == 0:
self.logger.info("Epoch %d, loss: %3.3f.", i, self.epoch_losses[i])
y_train_pred = self.predict(on_train = True)
y_test_pred = self.predict(on_train = False)
def train_iteration(self, X,y_target):
self.lstm1_optimizer.zero_grad()
y_pred = self.lstm1(Variable(torch.from_numpy(X).type(torch.FloatTensor)))
y_true = Variable(torch.from_numpy(y_target).type(torch.FloatTensor))
y_true = y_true.view(y_true.shape[0],1)
y_pred=y_pred.squeeze(0)
print(y_pred)
loss = self.loss_func(y_pred, y_true)
loss.backward()
self.lstm1_optimizer.step()
return loss.data[0]
def predict(self, on_train = False):
if on_train:
y_pred = np.zeros(self.train_size - self.T +1)
else:
y_pred = np.zeros(self.X.shape[0] - self.train_size)
i = 0
while i < len(y_pred):
batch_idx = np.array(range(len(y_pred)))[i : (i + self.batch_size)]
X = np.zeros((len(batch_idx), self.T - 1, self.X.shape[1]))
#y_history = np.zeros((len(batch_idx), self.T - 1))
for j in range(len(batch_idx)):
if on_train:
X[j, :, :] = self.X[range(batch_idx[j], batch_idx[j] + self.T - 1), :]
else:
X[j, :, :] = self.X[range(batch_idx[j] + self.train_size - self.T, batch_idx[j] + self.train_size - 1), :]
input_data = Variable(torch.from_numpy(X).type(torch.FloatTensor))
# print(self.lstm1(torch.randn(128,9,7)))
#print(self.lstm1(X).data.numpy())
y_pred[i:(i + self.batch_size)] = self.lstm1(input_data).data.numpy()[:,0]
i += self.batch_size
return y_pred
model = rnn(file_data = 'L.csv', logger = logger, parallel = False,
learning_rate = .001)
model.train(n_epochs = 1000)
y_pred = model.predict()

It might be good if you can reduce your codes into the simplest form which still reproduce your problem. Asking people to debug over 200 lines of codes may be too big an ask. If you can give a small example of your problem, using a very simple NN model instead of the current one, many others will be willing to look into your codes and help identify the issue.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

PyTorch T5 Transformer Implementation - python

Related

how to convert k.variable, addweights to pytorch code

MLP mixer - Saving the training model

My tensorflow Convolutional Neural Network does not train

NameError: name 'K' is not defined

My LSTM code give flat prediction and don't know what is wrong..Anyone can help take a look?

Categories

Resources