I'm implementing the ELMo model (the architecture from the paper, but with GRUs) in PyTorch for a binary sentiment analysis task (2 classes).
My problem: after training the model for 3 epochs (which takes almost 7 hours), the parameters are almost constant. They do get updated, but the gradient for every parameter is almost zero, so the parameters update very slowly.
After training the model on almost 100 samples (just as a test, since every epoch takes so long), I printed the model's output on the trained samples (64 sentences). As you can see, all outputs are almost 0.61 or 0.62 (the model's output before applying the sigmoid is almost zero):
[0.6190, 0.6177, 0.6218, 0.6209, 0.6216, 0.6177, 0.6218, 0.6248, 0.6187,
0.6209, 0.6208, 0.6197, 0.6208, 0.6201, 0.6164, 0.6204, 0.6187, 0.6186,
0.6172, 0.6227, 0.6180, 0.6176, 0.6177, 0.6189, 0.6167, 0.6162, 0.6204,
0.6212, 0.6212, 0.6170, 0.6175, 0.6188, 0.6200, 0.6207, 0.6211, 0.6186,
0.6171, 0.6190, 0.6171, 0.6215, 0.6204, 0.6166, 0.6169, 0.6189, 0.6192,
0.6171, 0.6198, 0.6210, 0.6217, 0.6182, 0.6205, 0.6167, 0.6185, 0.6185,
0.6247, 0.6201, 0.6183, 0.6172, 0.6248, 0.6156, 0.6187, 0.6221, 0.6184,
0.6200]
Mean gradient value for the first layer (character-based embedding) over 7 iterations (with batch size 4):
-3.2057e-08
-1.0591e-07
8.0309e-10
-3.1149e-08
1.7176e-08
1.0479e-08
-5.9668e-08
Loss values:
0.6922
0.6888
0.6932
0.6933
0.705
0.6812
0.7068
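(Note: these losses all hover around ln 2 ≈ 0.6931, which is exactly the binary cross-entropy of a model that constantly predicts 0.5, i.e. one that has learned nothing about the labels. A quick check:)

import math
print(-math.log(0.5))  # 0.6931... = BCE loss of a constant 0.5 prediction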
First-layer parameters (before training):
Parameter containing:
tensor([[-0.8127, 0.0848, -1.8994, ..., -0.4188, 0.0737, 1.7480],
[-0.9858, 1.2334, -1.5336, ..., -0.1520, -0.8097, 1.5319],
[-0.3637, 0.2356, -0.6203, ..., -0.2677, 0.3540, -0.8167],
...,
[ 0.5995, 0.0444, 0.5533, ..., -0.6380, -0.2782, 0.4377],
[-1.1214, 0.1163, 0.6494, ..., 0.9082, 0.0925, -2.0435],
[ 1.1774, 2.0876, 1.2902, ..., 0.1933, 0.6906, -0.9966]],
device='cuda:0', requires_grad=True)
First-layer parameters (after 1000 training iterations):
Parameter containing:
tensor([[ 0.4986, -0.1885, -2.1546, ..., 1.6023, 1.0103, -0.0118],
[-0.2110, -0.0524, -0.5779, ..., -1.7709, -0.6997, 1.7685],
[-0.8088, -0.0187, 0.4958, ..., 0.2945, -0.8318, 0.5191],
...,
[ 0.0324, 0.6847, 0.7107, ..., -0.5620, 1.1643, -0.1883],
[ 0.3290, -1.5829, -1.2789, ..., -0.6205, -1.9693, -0.8639],
[ 1.1525, 1.1839, 1.4262, ..., 0.1396, -0.0622, -1.1427]],
device='cuda:0', requires_grad=True)
Conv1d_Embed module (embedding + 1D convolution):
class Conv1d_Embed(nn.Module):
    def __init__(self, embed_dim, filters_list):
        super(Conv1d_Embed, self).__init__()
        self.filters_list = filters_list
        self.embed = nn.Embedding(num_embeddings=chars_count, embedding_dim=embed_dim, device=device)
        self.conv_list = nn.ModuleList(modules=None)
        self.conv_norm_layer = nn.LayerNorm([100, np.sum(np.array(self.filters_list)[:, 0])])
        for filter in filters_list:
            conv = nn.Conv1d(in_channels=embed_dim, out_channels=filter[0], kernel_size=filter[1], stride=1, padding=0, dilation=1, device=device)
            self.conv_list.append(conv)

    def forward(self, X):
        X = self.embed(X).permute(0, 1, 3, 2)
        X_conv = torch.empty(size=(X.shape[0], X.shape[1], np.sum(np.array(self.filters_list)[:, 0])))
        for sentence_idx in range(X.shape[0]):
            idx_sum = 0
            for convolution in self.conv_list:
                torch.cuda.empty_cache()
                conv_result = convolution(X[sentence_idx])
                conv_result = torch.max(conv_result, dim=2).values
                seq_columns = convolution.out_channels
                X_conv[sentence_idx][:, idx_sum:idx_sum + seq_columns] = conv_result
                idx_sum += seq_columns
        X_conv = self.conv_norm_layer(X_conv)
        X_conv = torch.relu(X_conv)
        torch.cuda.empty_cache()
        return X_conv
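(As a point of comparison, not the original code: a minimal vectorized sketch of the same forward pass, assuming X holds character ids shaped (batch, words=100, chars). It runs every convolution once over the flattened batch and keeps X_conv on the same device as the convolution outputs, instead of filling a freshly allocated tensor sentence by sentence:)

    def forward(self, X):
        B, W = X.shape[0], X.shape[1]
        E = self.embed(X)                                           # (B, W, chars, embed_dim)
        E = E.view(B * W, E.shape[2], E.shape[3]).permute(0, 2, 1)  # (B*W, embed_dim, chars)
        feats = [torch.max(conv(E), dim=2).values for conv in self.conv_list]
        X_conv = torch.cat(feats, dim=1).view(B, W, -1)             # stays on the convs' device
        # note: conv_norm_layer must live on the same device as X_conv
        return torch.relu(self.conv_norm_layer(X_conv))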
Highway network module:
class Highway_Network(nn.Module):
    def __init__(self, H_act: str, in_dim: int):
        super(Highway_Network, self).__init__()
        if H_act == 'relu': self.H_act = nn.ReLU()
        elif H_act == 'tanh': self.H_act = nn.Tanh()
        else: self.H_act = nn.Sigmoid()
        self.in_dim = in_dim
        self.H = nn.Linear(in_features=in_dim, out_features=in_dim, bias=False, device=device)
        self.T = nn.Linear(in_features=in_dim, out_features=in_dim, bias=True, device=device)

    def forward(self, X):
        T = torch.sigmoid(self.T(X))
        H = self.H_act(self.H(X))
        y = (H * T) + (X * (1 - T))
        torch.cuda.empty_cache()
        return y
ELMo module:
class ELMo(nn.Module):
    def __init__(self, in_dim_for_highway, embed_dim, filters_list, proj_size, rnn_hidden_size):
        super(ELMo, self).__init__()
        self.conv1d_embed = Conv1d_Embed(embed_dim, filters_list)
        self.highway_layer1 = Highway_Network(H_act='tanh', in_dim=in_dim_for_highway)
        self.highway_layer2 = Highway_Network(H_act='tanh', in_dim=in_dim_for_highway)
        self.proj_after_highway = nn.Linear(in_features=in_dim_for_highway, out_features=proj_size, bias=True, device=device)
        self.norm_after_highway = nn.LayerNorm([100, proj_size], device=device)
        self.rnn_layer1_forward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                         batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer1_backward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                          batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer2_forward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                         batch_first=True, dropout=0, bidirectional=False, device=device)
        self.rnn_layer2_backward = nn.GRU(input_size=proj_size, hidden_size=rnn_hidden_size, num_layers=1, bias=True,
                                          batch_first=True, dropout=0, bidirectional=False, device=device)
        self.proj_after_rnn1_forward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn1_backward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn2_forward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.proj_after_rnn2_backward = nn.Linear(in_features=rnn_hidden_size, out_features=proj_size, bias=True, device=device)
        self.output_layer = nn.Linear(in_features=102400, out_features=1, bias=True, device=device)

    def forward(self, X):
        output = self.conv1d_embed(X).to(device)
        output = self.highway_layer1(output)
        output = self.highway_layer2(output)
        output = self.proj_after_highway(output)
        output = self.norm_after_highway(output)
        output = torch.relu(output)
        forward_output = self.rnn_layer1_forward(output)[0]  # forward
        forward_output = torch.relu(forward_output)
        forward_output = self.proj_after_rnn1_forward(forward_output)
        forward_output = torch.relu(forward_output)
        backward_output = self.rnn_layer1_backward(torch.flip(output, dims=[1]))[0]  # backward
        backward_output = torch.relu(backward_output)
        backward_output = self.proj_after_rnn1_backward(backward_output)
        backward_output = torch.relu(backward_output)
        forward_output = self.rnn_layer2_forward(forward_output)[0]
        forward_output = torch.relu(forward_output)
        forward_output = self.proj_after_rnn2_forward(forward_output)
        forward_output = torch.relu(forward_output)
        backward_output = self.rnn_layer2_backward(backward_output)[0]
        backward_output = torch.relu(backward_output)
        backward_output = self.proj_after_rnn2_backward(backward_output)
        backward_output = torch.relu(backward_output)
        backward_output = torch.flip(backward_output, dims=[1])
        output = torch.concat((forward_output, backward_output), dim=2)
        output = output.reshape((output.shape[0], output.shape[1] * output.shape[2]))
        output = self.output_layer(output)
        output = torch.sigmoid(output)
        return output
Some other details:
embed_dim = 50
model_location = 'drive/MyDrive/elmo_dataset_words_lower_100/elmo_model.mdl'
optimizer_location = 'drive/MyDrive/elmo_dataset_words_lower_100/elmo_optimizer.optm'
filters_list = [[32, 1], [32, 2], [64, 3], [128, 4], [256, 5], [512, 6], [1024, 7]]
in_dim_for_highway = np.sum(np.array(filters_list)[:, 0])
proj_size = 512
rnn_hidden_size = 4096
Training loop (forward + backward passes):
model = ELMo(in_dim_for_highway, embed_dim, filters_list, proj_size, rnn_hidden_size)
optimizer = optim.Adam(params=model.parameters(), lr=1e-5)
# model.load_state_dict(torch.load(model_location))
# optimizer.load_state_dict(torch.load(optimizer_location))
print(summary(model))

batch_size = 4
epochs = 5  # Started by 5
bce = nn.BCELoss()
new_slices = slices = pd.read_csv('drive/MyDrive/elmo_dataset_words_lower_100/slice_list.csv').drop(columns=['Unnamed: 0'])  # slice 10 is for test

for slice_idx in range(len(slices)):
    slice_path = slices.iloc[slice_idx, :].values[0]
    print(f'Training ELMo on {slice_path}...')
    dataset = np.load(slice_path)
    labels = torch.Tensor(dataset['labels'].astype(np.float32)).to('cpu')
    dataset = torch.Tensor(dataset['data']).type(torch.int32).to('cpu')
    for label_idx in range(len(labels)):
        if labels[label_idx] == -1: labels[label_idx] = 0
        # elif labels[label_idx] == 0: labels[label_idx] = 1
        elif labels[label_idx] == 1: labels[label_idx] = 1
    dataset_size = dataset.shape[0]
    dataset_loss = list()
    idx = torch.randperm(dataset.shape[0])
    dataset = dataset[idx]  # Randomization
    labels = labels[idx]  # Randomization
    for batch in range(batch_size, dataset.shape[0] + batch_size, batch_size):
        optimizer.zero_grad()
        X = dataset[batch - batch_size:batch].to(device)
        y = labels[batch - batch_size:batch].to(device)
        output = model(X).squeeze()
        loss = bce(output, y)
        loss.backward()
        optimizer.step()
        print(torch.mean(list(model.parameters())[0].grad))
        loss_value = loss.item()
        dataset_loss.append(loss_value)
        print(f'Batch: {batch} - Loss: {loss_value} - Dataset size: {dataset_size}')
        print('---------------------')
    torch.save(model.state_dict(), model_location)
    torch.save(optimizer.state_dict(), optimizer_location)
    print(f'Dataset slice: {slice_path} - Loss: {np.mean(dataset_loss)}')
    print(f'Trained model saved in {model_location}')
    print(f'Optimizer saved in {optimizer_location}')
    print('---------------------')
    new_slices = new_slices.drop(index=slice_idx)
    new_slices.to_csv('drive/MyDrive/elmo_dataset_words_lower_100/slice_list.csv')
    del X, y, dataset, labels, output
    collect()
I have tested every hyperparameter you can think of (batch size, learning rate, activation functions, projection size, etc.) and checked the labels.
What is the problem? I think there is a mistake in how I'm using PyTorch modules, something that breaks autograd...
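(For anyone debugging this, a small diagnostic sketch that prints per-parameter gradient norms right after loss.backward(); model is assumed to be the ELMo instance above. It shows whether gradients vanish everywhere or only in the early layers, and whether any parameter is detached from the graph:)

for name, param in model.named_parameters():
    if param.grad is None:
        print(f'{name}: no grad (detached from the graph?)')
    else:
        print(f'{name}: grad norm {param.grad.norm().item():.3e}')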
Related
I want to train on 100 features over 3000 days and predict the 100 values for the next day, but I don't know why this error happens or how to resolve it. Please help me.
batchsize = 100, hidden_dim = 10, seq_length = 60, data_dim = 100, output_dim = 100
class Net(nn.Module):
    def __init__(self, input_dim, hidden_dim, batch_size, output_dim, layers):
        super(Net, self).__init__()
        self.hidden_dim = hidden_dim
        self.batch_size = batch_size
        self.output_dim = output_dim
        self.layers = layers
        self.lstm = nn.LSTM(input_dim, hidden_dim, num_layers=layers,
                            batch_first=True)
        self.fc = nn.Linear(hidden_dim, output_dim, bias=True)
        self.hidden = self.reset_hidden_state()

    def reset_hidden_state(self):
        return (
            torch.zeros(self.layers, self.batch_size, self.hidden_dim),
            torch.zeros(self.layers, self.batch_size, self.hidden_dim))

    def forward(self, x):
        x, self.hidden = self.lstm(x, self.hidden)
        x = self.fc(x[:, -1, :])  # [batch_size, seq_len, hidden_dim]
        return x
# Train part
def train_model(model, train_df, num_epochs=None, lr=None, verbose=10, patience=10):
    criterion = nn.MSELoss().to(device)
    optimizer = optim.Adam(model.parameters(), lr=lr)
    nb_epochs = num_epochs
    train_hist = np.zeros(nb_epochs)
    for epoch in range(nb_epochs):
        avg_cost = 0
        total_batch = len(train_df)
        for batch_idx, samples in enumerate(train_df):
            x_train, y_train = samples
            # reset hidden state for every sequence
            model.reset_hidden_state()
            model.hidden = [hidden.to(device) for hidden in model.reset_hidden_state()]
            outputs = model(x_train)
            loss = criterion(outputs, y_train)
            optimizer.zero_grad()
            loss.backward()
            optimizer.step()
            avg_cost += loss / total_batch
        train_hist[epoch] = avg_cost
        if epoch % verbose == 0:
            print('Epoch:', '%04d' % (epoch), 'train loss :', '{:.4f}'.format(avg_cost))
        if (epoch % patience == 0) & (epoch != 0):
            if train_hist[epoch - patience] < train_hist[epoch]:
                print('\n Early Stopping')
                break
    return model.eval(), train_hist
Prediction part:
net = Net(data_dim, hidden_dim, batch, output_dim, 1).to(device)
model, train_hist = train_model(net, dataloader, num_epochs=nb_epochs, lr=learning_rate, verbose=20, patience=10)

with torch.no_grad():
    pred = np.zeros((2940, 1, 100))
    for pr in range(len(trainX_tensor)):
        model.reset_hidden_state()
        predicted = model(torch.unsqueeze(trainX_tensor[pr], 0)).cpu()
        print(predicted.shape)
        # predicted = predicted.item()
        pred[pr, :, :] = predicted
        # print(pr)
    # INVERSE
    pred_inverse = scaler.inverse_transform(pred)
Expected hidden[0] size (1, 1, 10), got [1, 100, 10]
How can I fix this hidden[0] size mismatch?
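(For reference, nn.LSTM expects each hidden state shaped (num_layers, batch_size, hidden_dim), where batch_size must match the batch dimension of the current input. A minimal sketch of a reset_hidden_state that derives the batch size from the input instead of a fixed self.batch_size; names follow the code above:)

    def reset_hidden_state(self, x):
        # x: (batch_size, seq_len, data_dim) since batch_first=True
        return (
            torch.zeros(self.layers, x.size(0), self.hidden_dim, device=x.device),
            torch.zeros(self.layers, x.size(0), self.hidden_dim, device=x.device))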
I'm trying to train a CNN with PyTorch. The error message I'm getting is:
Given groups=1, weight of size [8, 32, 3], expected input[1, 9999, 5024] to have 32 channels, but got 9999 channels instead
Before starting to train my architecture, I hand my data and labels over as follows:
images_batch = torch.from_numpy(np.array(X))
labels_batch = torch.from_numpy(np.array(y))
dataset_train = TensorDataset(images_batch, labels_batch)
train_loader = DataLoader(dataset_train, batch_size=32, shuffle=True)
The dimension of X is (5024, 9999, 1), with 5024 being the number of instances and 9999 the sequence length. The dimension of y is (5024, 1).
My current code for the model is the following:
class Model(nn.Module):
    def __init__(self, **kwargs):
        super().__init__()
        self.conv1 = nn.Conv1d(32, 8, kernel_size=3, stride=1, padding=0)
        self.conv2 = nn.Conv1d(8, 16, kernel_size=3, stride=1, padding=0)
        self.conv3 = nn.Conv1d(16, 32, kernel_size=3, stride=1, padding=0)
        # self.fc1 = nn.Linear(32, 2)

    def forward(self, X):
        X = F.relu(self.conv1(X))
        X = F.relu(self.conv2(X))
        X = F.max_pool2d(X, 2)
        X = self.conv3(X)
        X = F.max_pool2d(X, 2)
        # X = self.fc1(X)
        return F.softmax(X, dim=1)
device = torch.device('cuda') if torch.cuda.is_available() else 'cpu'
model = Model().to('cpu')
# loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=lr)
def check_accuracy(loader, model):
    num_correct = 0
    num_samples = 0
    model.eval()
    with torch.no_grad():
        for x, y in loader:
            x = x.to(device=device)
            y = y.to(device=device)
            scores = model(x)
            _, predictions = scores.max(1)
            num_correct += (predictions == y).sum()
            num_samples += predictions.size(0)
        print(
            f"Got {num_correct} / {num_samples} with accuracy"
            f" {float(num_correct) / float(num_samples) * 100:.2f}"
        )
    model.train()

check_accuracy(train_loader, model)
for epoch in range(num_epochs):
    for batch_idx, (data, targets) in enumerate(train_loader):
        data.to(device=device)
        targets.to(device=device)
        data = data.reshape(data[0], 1)
        scores = model(data)
        loss = criterion(scores, targets)
        optimizer.zero_grad()
        loss.backward()
        optimizer.step()
I'm aware that the tensor dimension ordering in TensorFlow is different from PyTorch's.
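(For reference, nn.Conv1d expects input shaped (batch, channels, length). A minimal, self-contained sketch of moving the trailing dimension of data shaped like the above into the channel position; the in_channels=1 is an assumption that each sequence is a single channel, unlike the 32 in the model above:)

import torch
import torch.nn as nn

x = torch.randn(32, 9999, 1)   # (batch, length, channels) as described above
x = x.permute(0, 2, 1)         # -> (batch, channels=1, length=9999)
conv = nn.Conv1d(in_channels=1, out_channels=8, kernel_size=3)
print(conv(x).shape)           # torch.Size([32, 8, 9997])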
This is my CNN class:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[0], emb_dim))
        self.conv_1 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[1], emb_dim))
        self.conv_2 = nn.Conv2d(in_channels=1, out_channels=out_channels, kernel_size=(kernel_sizes[2], emb_dim))
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        print('embedded', embedded.shape)
        embedded = embedded.unsqueeze(1)  # may be reshape here
        print('embedded', embedded.shape)
        conved_0 = F.relu(self.conv_0(embedded)).squeeze(3)  # may be reshape here
        print('conved_0', conved_0.shape)
        conved_1 = F.relu(self.conv_1(embedded)).squeeze(3)  # may be reshape here
        print('conved_1', conved_1.shape)
        conved_2 = F.relu(self.conv_2(embedded)).squeeze(3)  # may be reshape here
        print('conved_2', conved_2.shape)
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        print('pooled_0', pooled_0.shape)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        print('pooled_1', pooled_1.shape)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        print('pooled_2', pooled_2.shape)
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
        print('cat', cat.shape)
        return self.fc(cat)
Variables:
kernel_sizes = [3, 4, 5]
vocab_size = len(TEXT.vocab)
out_channels = 64
dropout = 0.2
dim = 300
model = CNN(vocab_size=vocab_size, emb_dim=dim, out_channels=out_channels,
            kernel_sizes=kernel_sizes, dropout=dropout)
And training:
import numpy as np

min_loss = np.inf
cur_patience = 0

for epoch in range(1, max_epochs + 1):
    train_loss = 0.0
    model.train()
    pbar = tqdm(enumerate(train_iter), total=len(train_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        opt.zero_grad()
        input = batch.text[0].to(device)
        output = model(input)
        train_loss = loss_func(output, batch.label)
        train_loss.backward()
        opt.step()
    train_loss /= len(train_iter)
    val_loss = 0.0
    model.eval()
    pbar = tqdm(enumerate(valid_iter), total=len(valid_iter), leave=False)
    pbar.set_description(f"Epoch {epoch}")
    for it, batch in pbar:
        # YOUR CODE GOES HERE
        input = batch.text[0].to(device)
        output = model(input)
        val_loss = loss_fn(output, batch.label)
    val_loss /= len(valid_iter)
    if val_loss < min_loss:
        min_loss = val_loss
        best_model = model.state_dict()
    else:
        cur_patience += 1
        if cur_patience == patience:
            cur_patience = 0
            break
    print('Epoch: {}, Training Loss: {}, Validation Loss: {}'.format(epoch, train_loss, val_loss))

model.load_state_dict(best_model)
I get this error:
RuntimeError: Expected 4-dimensional input for 4-dimensional weight [64, 1, 3, 300], but got 3-dimensional input of size [894, 1, 300] instead
in this line:
---> 32 conved_0 = F.relu(self.conv_0(embedded)).squeeze(3)
I've tried using Conv1d, but still had problems with dimensions. Could somebody please explain what I should fix here for the network to train?
EDIT:
This is my class but with Conv1d:
class CNN(nn.Module):
    def __init__(
        self,
        vocab_size,
        emb_dim,
        out_channels,
        kernel_sizes,
        dropout,
    ):
        super().__init__()
        self.embedding = nn.Embedding(vocab_size, emb_dim)
        self.conv_0 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[0])
        self.conv_1 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[1])
        self.conv_2 = nn.Conv1d(in_channels=1, out_channels=out_channels, kernel_size=kernel_sizes[2])
        self.fc = nn.Linear(len(kernel_sizes) * out_channels, 1)
        self.dropout = nn.Dropout(dropout)

    def forward(self, text):
        embedded = self.embedding(text)
        print('embedded', embedded.shape)
        embedded = embedded.unsqueeze(1)  # may be reshape here
        print('embedded', embedded.shape)
        conved_0 = F.relu(self.conv_0(embedded))  # may be reshape here
        print('conved_0', conved_0.shape)
        conved_1 = F.relu(self.conv_1(embedded))  # may be reshape here
        print('conved_1', conved_1.shape)
        conved_2 = F.relu(self.conv_2(embedded))  # may be reshape here
        print('conved_2', conved_2.shape)
        pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)
        print('pooled_0', pooled_0.shape)
        pooled_1 = F.max_pool1d(conved_1, conved_1.shape[2]).squeeze(2)
        print('pooled_1', pooled_1.shape)
        pooled_2 = F.max_pool1d(conved_2, conved_2.shape[2]).squeeze(2)
        print('pooled_2', pooled_2.shape)
        cat = self.dropout(torch.cat((pooled_0, pooled_1, pooled_2), dim=1))
        print('cat', cat.shape)
        return self.fc(cat)
Dimensions output:
embedded torch.Size([1115, 300])
embedded torch.Size([1115, 1, 300])
conved_0 torch.Size([1115, 64, 298])
conved_1 torch.Size([1115, 64, 297])
conved_2 torch.Size([1115, 64, 296])
pooled_0 torch.Size([1115, 64])
pooled_1 torch.Size([1115, 64])
pooled_2 torch.Size([1115, 64])
cat torch.Size([1115, 192])
Error:
ValueError: Target size (torch.Size([128])) must be the same as input size (torch.Size([1115, 1]))
What I was missing is that I had the batch_first parameter set to True, which SWAPPED batch_size and seq_len. Once I set it to False, everything worked perfectly.
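(For comparison, the more common Conv1d pattern for text treats the embedding dimension as the channel axis rather than adding a dummy channel of size 1; a minimal sketch under that assumption, with made-up sizes:)

import torch
import torch.nn as nn
import torch.nn.functional as F

emb_dim, out_channels, kernel_size = 300, 64, 3
embedding = nn.Embedding(1000, emb_dim)
conv_0 = nn.Conv1d(in_channels=emb_dim, out_channels=out_channels, kernel_size=kernel_size)

text = torch.randint(0, 1000, (128, 50))       # (batch, seq_len)
embedded = embedding(text).permute(0, 2, 1)    # (batch, emb_dim, seq_len)
conved_0 = F.relu(conv_0(embedded))            # (batch, out_channels, seq_len - kernel_size + 1)
pooled_0 = F.max_pool1d(conved_0, conved_0.shape[2]).squeeze(2)  # (batch, out_channels)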
I'm a newbie to TensorFlow 2, using TensorFlow 2.3.1 (CPU version).
I defined the model with the subclassing API and, when showing the structure of my model, I encountered the error "tensorflow.python.framework.errors_impl.InvalidArgumentError: Cannot convert a Tensor of dtype resource to a NumPy array", which points to the following line in BST_DSSM.build_model:
"self.item_sequence_embeddings = tf.nn.embedding_lookup("
I have browsed through similar questions but can't find a satisfactory solution.
Any help will be appreciated :)
Below is my code.
import tensorflow as tf
class MultiHeadAttention(tf.keras.layers.Layer):
    """ def multi head attention layer

    q, k, v multiplied by Wq, Wk, Wv respectively -> q', k', v'
    q' * k' -> w, w / sqrt(q'.shape[1]) -> w'
    w' * v' -> z, z * Wz -> z'
    z' add v (residual), then goes through LRelu, do a LN at last
    """
    def __init__(
            self,
            scope_name,
            num_units=8,
            num_heads=1,
            embed_dim=8,
            has_residual=True,
            dropout_keep_prob=1.0):
        super(MultiHeadAttention, self).__init__()
        assert num_units % num_heads == 0
        assert scope_name in ["user", "item"]
        self.num_heads = num_heads
        self.num_units = num_units
        self.embed_dim = embed_dim
        self.dropout_keep_prob = dropout_keep_prob
        self.Wq = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wq")
        self.Wk = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wk")
        self.Wv = tf.keras.layers.Dense(
            units=self.num_units, activation=tf.nn.leaky_relu, name=f"{scope_name}_Wv")
        self.has_residual = has_residual
        self.Wz = tf.keras.layers.Dense(embed_dim)

    def call(self, queries, keys_, values):
        """
        :param queries: of shape [batch_size, max_length, emb_dim]
        :param keys_: of shape [batch_size, max_length, emb_dim]
        :param values: of shape [batch_size, max_length, emb_dim]
        :return:
        """
        assert values.get_shape().as_list()[-1] == self.embed_dim
        assert queries.get_shape().as_list()[-1] == self.embed_dim
        assert keys_.get_shape().as_list()[-1] == self.embed_dim
        # Linear projections
        Q = self.Wq(queries)
        K = self.Wk(keys_)
        V = self.Wv(values)
        # Split and concat
        Q_ = tf.concat(tf.split(Q, self.num_heads, axis=2), axis=0)
        K_ = tf.concat(tf.split(K, self.num_heads, axis=2), axis=0)
        V_ = tf.concat(tf.split(V, self.num_heads, axis=2), axis=0)
        # Multiplication
        weights = tf.matmul(Q_, tf.transpose(K_, [0, 2, 1]))
        # Scale
        weights = weights / (K_.get_shape().as_list()[-1] ** 0.5)
        # convert to prob vector
        weights = tf.nn.softmax(weights)
        # Dropouts
        if 0 < self.dropout_keep_prob < 1:
            weights = tf.keras.layers.AlphaDropout(
                rate=1 - self.dropout_keep_prob)(weights)
        # Weighted sum
        # [batch_size * num_heads, max_length, num_units / num_heads]
        outputs = tf.matmul(weights, V_)
        # Restore shape to [batch_size, max_length, num_units]
        z = tf.concat(tf.split(outputs, self.num_heads, axis=0), axis=2)
        # Restore shape to [batch_size, max_length, embed_dim]
        z = self.Wz(z)
        # Residual connection
        if self.has_residual:
            z += values
        z = tf.nn.leaky_relu(z)
        # Normalize
        z = tf.keras.layers.LayerNormalization(
            beta_initializer="zeros", gamma_initializer="ones")(z)
        return z
class BST_DSSM(tf.keras.Model):
    """define BST+DSSM model structure
    """
    def __init__(self, model_dir,
                 item_embedding=None, user_embedding=None,
                 embedding_size=8,
                 vocab_size=1000,
                 max_length_item=15, max_length_user=6,
                 epoch=10, batch_size=256, blocks=2,
                 learning_rate=0.001, optimizer_type="adam",
                 batch_norm=0, batch_norm_decay=0.995,
                 verbose=False, random_seed=2019,
                 l2_reg=0.0, has_residual=True):
        """
        initialize model-related params and tensors
        """
        super(BST_DSSM, self).__init__()
        # denote as K, size of the feature embedding
        self.embedding_size = embedding_size
        self.l2_reg = l2_reg
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.optimizer = None
        self.blocks = blocks
        self.batch_norm = batch_norm
        self.batch_norm_decay = batch_norm_decay
        self.verbose = verbose
        self.random_seed = random_seed
        self.model_dir = model_dir
        # self._init_graph()
        self.vocab_size = vocab_size
        self.max_length_item = max_length_item
        self.max_length_user = max_length_user
        self.has_residual = has_residual
        self.model = None
        self.item_embedding = item_embedding
        self.user_embedding = user_embedding
        self.mha_user = MultiHeadAttention("user", num_units=embedding_size)
        self.mha_item = MultiHeadAttention("item", num_units=embedding_size)

    def _get_item_embedding_matrix(self):
        if self.item_embedding is None:
            std = 0.1
            minval = -std
            maxval = std
            emb_matrix = tf.Variable(
                tf.random.uniform(
                    [self.vocab_size, self.embedding_size],
                    minval, maxval,
                    seed=self.random_seed,
                    dtype=tf.float32),
                name="item_embedding")
            self.item_embedding = emb_matrix

    def _get_user_embedding_matrix(self):
        if self.user_embedding is None:
            std = 0.1
            minval = -std
            maxval = std
            emb_matrix = tf.Variable(
                tf.random.uniform(
                    [self.vocab_size, self.embedding_size],
                    minval, maxval,
                    seed=self.random_seed,
                    dtype=tf.float32),
                name="user_embedding")
            self.user_embedding = emb_matrix

    def build_model(self):
        # initialize lut
        self._get_item_embedding_matrix()
        self._get_user_embedding_matrix()
        item_inputs = tf.keras.Input(
            shape=(
                self.max_length_item
            ),
            dtype=tf.int32,
            name="item_sequence_idx")
        user_inputs = tf.keras.Input(
            shape=(
                self.max_length_user
            ),
            dtype=tf.int32,
            name="user_sequence_idx")
        # user and item use different luts, similarly to DSSM
        self.item_sequence_embeddings = tf.nn.embedding_lookup(
            self.item_embedding, item_inputs, name="item_sequence_embeddings")
        self.video_sequence_embeddings = tf.nn.embedding_lookup(
            self.user_embedding, user_inputs, name="video_sequence_embeddings")
        # self-attention part
        for i in range(self.blocks):
            self.item_sequence_embeddings = self.mha_item(
                queries=self.item_sequence_embeddings,
                keys=self.item_sequence_embeddings,
                values=self.item_sequence_embeddings)
            self.video_sequence_embeddings = self.mha_user(
                queries=self.video_sequence_embeddings,
                keys=self.video_sequence_embeddings,
                values=self.video_sequence_embeddings)
        # max pooling
        self.item_sequence_embeddings = tf.nn.max_pool(
            self.item_sequence_embeddings,
            [1, self.max_length_item, 1],
            [1 for _ in range(len(self.item_sequence_embeddings.shape))],
            padding="VALID")
        self.video_sequence_embeddings = tf.nn.max_pool(
            self.video_sequence_embeddings,
            [1, self.max_length_user, 1],
            [1 for _ in range(len(self.video_sequence_embeddings.shape))],
            padding="VALID")
        # cosine similarity
        self.item_sequence_embeddings = tf.nn.l2_normalize(
            self.item_sequence_embeddings, axis=2)
        self.video_sequence_embeddings = tf.nn.l2_normalize(
            self.video_sequence_embeddings, axis=2)
        outputs = tf.matmul(
            self.item_sequence_embeddings,
            tf.transpose(self.video_sequence_embeddings, [0, 2, 1]))
        outputs = tf.reshape(outputs, [-1, 1])
        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.keras.optimizers.Adagrad(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate, momentum=0.95)
        self.model = tf.keras.Model(
            inputs={
                "item_sequence_idx": item_inputs,
                "user_sequence_idx": user_inputs
            },
            outputs=outputs)
        self.model.compile(
            optimizer=self.optimizer,
            loss=self.loss_fn,
            metrics=[
                tf.keras.metrics.AUC(),
                tf.keras.metrics.BinaryAccuracy()])
Although I didn't figure out why I got this error, I built my model by defining a call method instead; the code is below.
from conf_loader import (
    emb_dim, n_layer,
    item_max_len, user_max_len,
    batch_size, lr, l2_reg,
    vocab_size
)
import numpy as np  # used in debug() below


class BST_DSSM(tf.keras.Model):
    """define BST+DSSM model structure
    """
    def __init__(self,
                 item_embedding=None, user_embedding=None,
                 emb_dim=emb_dim,
                 vocab_size=vocab_size,
                 item_max_len=item_max_len, user_max_len=user_max_len,
                 epoch=10, batch_size=batch_size, n_layers=n_layer,
                 learning_rate=lr, optimizer_type="adam",
                 random_seed=2019,
                 l2_reg=l2_reg, has_residual=True):
        """
        initialize model-related params and tensors
        """
        super(BST_DSSM, self).__init__()
        self.emb_dim = emb_dim
        self.l2_reg = l2_reg
        self.epoch = epoch
        self.batch_size = batch_size
        self.learning_rate = learning_rate
        self.optimizer_type = optimizer_type
        self.blocks = n_layers
        self.random_seed = random_seed
        self.vocab_size = vocab_size
        self.item_max_len = item_max_len
        self.user_max_len = user_max_len
        self.has_residual = has_residual
        self.item_embedding = item_embedding
        self.user_embedding = user_embedding
        self.mha_user = MultiHeadAttention(scope_name="user", embed_dim=self.emb_dim)
        self.mha_item = MultiHeadAttention(scope_name="item", embed_dim=self.emb_dim)
        # optimizer
        if self.optimizer_type == "adam":
            self.optimizer = tf.keras.optimizers.Adam(
                learning_rate=self.learning_rate, beta_1=0.9, beta_2=0.999, epsilon=1e-8)
        elif self.optimizer_type == "adagrad":
            self.optimizer = tf.keras.optimizers.Adagrad(
                learning_rate=self.learning_rate,
                initial_accumulator_value=1e-8)
        elif self.optimizer_type == "gd":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate)
        elif self.optimizer_type == "momentum":
            self.optimizer = tf.keras.optimizers.SGD(
                learning_rate=self.learning_rate, momentum=0.95)
        self.user_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)
        self.item_embedding = tf.keras.layers.Embedding(input_dim=self.vocab_size, output_dim=self.emb_dim)

    # @tf.function
    def call(self, inputs, training=True):
        # multiple inputs
        item_inputs = inputs[0]
        user_inputs = inputs[1]
        item_sequence_embeddings = self.item_embedding(item_inputs)
        user_sequence_embeddings = self.user_embedding(user_inputs)
        # [batch_size, max_length, 16]
        for i in range(self.blocks):
            item_sequence_embeddings = self.mha_item(item_sequence_embeddings)
            user_sequence_embeddings = self.mha_user(user_sequence_embeddings)
        # [batch_size, 1, 16]
        item_outputs_max = tf.nn.max_pool(
            item_sequence_embeddings,
            [1, self.item_max_len, 1],
            [1 for _ in range(len(item_sequence_embeddings.shape))],
            padding="VALID")
        user_outputs_max = tf.nn.max_pool(
            user_sequence_embeddings,
            [1, self.user_max_len, 1],
            [1 for _ in range(len(user_sequence_embeddings.shape))],
            padding="VALID")
        # L2 normalize to get cosine similarity
        item_normalized = tf.nn.l2_normalize(
            item_outputs_max, axis=2)
        user_normalized = tf.nn.l2_normalize(
            user_outputs_max, axis=2)
        outputs = tf.matmul(
            item_normalized,
            user_normalized,
            transpose_b=True)
        return tf.reshape(outputs, [-1, 1])

    def loss_fn(self, target, output):
        cross_entropy = tf.keras.backend.binary_crossentropy(
            target, output, from_logits=False
        )
        if self.l2_reg > 0:
            _regularizer = tf.keras.regularizers.l2(self.l2_reg)
            cross_entropy += _regularizer(self.user_embedding)
            cross_entropy += _regularizer(self.item_embedding)
        return cross_entropy


def debug():
    x_train = [
        np.random.randint(low=0, high=20, size=(5, item_max_len)),
        np.random.randint(low=0, high=20, size=(5, user_max_len))]
    y_train = np.random.randint(low=0, high=2, size=5).astype(dtype=float)

    model = BST_DSSM()
    model.compile(
        optimizer=model.optimizer,
        loss=model.loss_fn
    )
    model.fit(x_train, y_train, epochs=n_epoch)
    model.summary()
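(For what it's worth, the original build_model error is consistent with mixing a raw tf.Variable lookup via tf.nn.embedding_lookup with symbolic tf.keras.Input tensors in the functional API; a tf.keras.layers.Embedding layer avoids that. A minimal sketch with hypothetical sizes:)

import tensorflow as tf

item_inputs = tf.keras.Input(shape=(15,), dtype=tf.int32, name="item_sequence_idx")
item_embedding = tf.keras.layers.Embedding(input_dim=1000, output_dim=8)
item_sequence_embeddings = item_embedding(item_inputs)  # (None, 15, 8); works on symbolic inputs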
I have a network which outputs a vector of length two. My targets are ones or zeros, referring to two possible categories. What is the best way to compute the loss? I.e., should I transform the targets, for example into a 2-dimensional vector, or should I transform the output of the network, e.g. take the location of the max number as the output?
My network looks like:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 32)
        self.fc2 = nn.Linear(32, 1)
        self.dropout = nn.Dropout(p=0.2)
        self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
        self.batch_normalisation2 = nn.BatchNorm1d(2)
        self.activation = nn.Softmax(dim=2)

    def forward(self, x):
        h0, c0 = self.init_hidden(x)
        out, (hn1, cn1) = self.lstm1(x, (h0, c0))
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        h1, c1 = self.init_hidden(out)
        out, (hn2, cn2) = self.lstm2(out, (h1, c1))
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        h2, c2 = self.init_hidden(out)
        out, (hn3, cn3) = self.lstm2(out, (h2, c2))
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        out = self.fc1(out[:, -1, :])
        out = self.dropout(out)
        out = self.fc2(out)
        return out

    def init_hidden(self, x):
        h0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        c0 = torch.zeros(self.layer_dim, x.size(0), self.hidden_dim)
        return [t for t in (h0, c0)]

    def pred(self, x):
        out = self(x)
        return out > 0
An example of input to this network is:
tensor([[[0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00, 0.0000e+00],
[2.3597e-04, 1.1507e-02, 8.7719e-02, 6.1093e-02, 9.5556e-01],
[2.1474e-03, 5.3805e-03, 9.6491e-02, 2.2508e-01, 8.2222e-01]]])
which has shape torch.Size([1, 3, 5]). The target is currently 1 or 0. However, the network outputs a vector such as:
tensor([[0.5293, 0.4707]], grad_fn=<SoftmaxBackward>)
What would be the best way to set up the loss between this target and the network output?
Update:
I can now train the model as suggested in the answers:
model = LSTMClassifier(5, 128, 3, 1)
Epochs = 10
batch_size = 32
criterion = nn.BCEWithLogitsLoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01, weight_decay=1e-6)

for epoch in range(Epochs):
    if epoch == 0:
        accurate = 0
        for X_instance, y_instance in zip(val_x, val_y):
            if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
                accurate += 1
        print(f"Untrained accuracy test set: {accurate/len(val_x)}")
    print(f"Epoch {epoch + 1}")
    for n, (X, y) in enumerate(train_batches):
        model.train()
        optimizer.zero_grad()
        y_pred = model(X)
        loss = criterion(y_pred, y)
        loss.backward()
        optimizer.step()
    model.eval()
    accurate = 0
    for X_instance, y_instance in zip(val_x, val_y):
        if int(y_instance) == 1 and model.pred(X_instance.view(-1, 3, 5)).item():
            accurate += 1
    print(f"Accuracy test set: {accurate/len(val_x)}")
You shouldn't use any activation at the end of your network; output just a single neuron instead of two and train it with BCEWithLogitsLoss.
Below is your neural network code with commentary and unnecessary parts removed:
class LSTMClassifier(nn.Module):
    def __init__(self, input_dim, hidden_dim, layer_dim, output_dim):
        super().__init__()
        self.hidden_dim = hidden_dim
        self.layer_dim = layer_dim
        self.lstm1 = nn.LSTM(input_dim, hidden_dim, layer_dim, batch_first=True)
        self.lstm2 = nn.LSTM(hidden_dim, hidden_dim, layer_dim, batch_first=True)
        self.fc1 = nn.Linear(hidden_dim, 32)
        # Output 1 neuron instead of two
        self.fc2 = nn.Linear(32, 1)
        # Model should not depend on batch size
        # self.batch_size = None
        # You are not using this variable
        # self.hidden = None
        self.dropout = nn.Dropout(p=0.2)
        self.batch_normalisation1 = nn.BatchNorm1d(layer_dim)
        self.batch_normalisation2 = nn.BatchNorm1d(2)

    def forward(self, x):
        # Hidden are initialized with 0 explicitly
        # h0, c0 = self.init_hidden(x)
        out, _ = self.lstm1(x)
        # No need for initial values
        # out, (hn1, cn1) = self.lstm1(x, (h0, c0))
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        # Same for all other cells you re-init with zeros, it's implicit
        out, _ = self.lstm2(out)
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        out, _ = self.lstm2(out)
        out = self.dropout(out)
        out = self.batch_normalisation1(out)
        out = self.fc1(out[:, -1, :])
        out = self.dropout(out)
        # No need for activation
        # out = F.softmax(self.fc2(out))
        out = self.fc2(out)
        return out

    # Return True (1) or False (0)
    def pred(self, x):
        return self(x) > 0
I have also added a pred method which transforms logits into targets (e.g. for use with some metrics).
Basically, if your logit is lower than 0 it is False; otherwise it is True. No need for an activation in this case.
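(A quick usage illustration of that setup, with made-up shapes, not part of the original answer: BCEWithLogitsLoss consumes the raw single-logit outputs and float targets of the same shape:)

import torch
import torch.nn as nn

criterion = nn.BCEWithLogitsLoss()
logits = torch.randn(8, 1)                     # raw model outputs, no sigmoid applied
targets = torch.randint(0, 2, (8, 1)).float()  # 0./1. labels, same shape as logits
loss = criterion(logits, targets)
preds = logits > 0                             # equivalent to sigmoid(logits) > 0.5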