Loss doesn't decrease in Pytorch CNN - python

I'm doing a CNN with Pytorch for a task, but it won't learn and improve the accuracy. I made a version working with the MNIST dataset so I could post it here. I'm just looking for an answer as to why it's not working. The architecture is fine, I implemented it in Keras and I had over 92% accuracy after 3 epochs. Note: I reshaped the MNIST into 60x60 pictures because that's how the pictures are in my "real" problem.
import numpy as np
from PIL import Image
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.autograd import Variable
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
def resize(pics):
pictures = []
for image in pics:
image = Image.fromarray(image).resize((dim, dim))
image = np.array(image)
pictures.append(image)
return np.array(pictures)
dim = 60
x_train, x_test = resize(x_train), resize(x_test) # because my real problem is in 60x60
x_train = x_train.reshape(-1, 1, dim, dim).astype('float32') / 255
x_test = x_test.reshape(-1, 1, dim, dim).astype('float32') / 255
y_train, y_test = y_train.astype('float32'), y_test.astype('float32')
if torch.cuda.is_available():
x_train = torch.from_numpy(x_train)[:10_000]
x_test = torch.from_numpy(x_test)[:4_000]
y_train = torch.from_numpy(y_train)[:10_000]
y_test = torch.from_numpy(y_test)[:4_000]
class ConvNet(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
self.fc1 = nn.Linear(5*5*128, 1024)
self.fc2 = nn.Linear(1024, 2048)
self.fc3 = nn.Linear(2048, 1)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.dropout(x, 0.5)
x = torch.sigmoid(self.fc3(x))
return x
net = ConvNet()
optimizer = optim.Adam(net.parameters(), lr=0.03)
loss_function = nn.BCELoss()
class FaceTrain:
def __init__(self):
self.len = x_train.shape[0]
self.x_train = x_train
self.y_train = y_train
def __getitem__(self, index):
return x_train[index], y_train[index].unsqueeze(0)
def __len__(self):
return self.len
class FaceTest:
def __init__(self):
self.len = x_test.shape[0]
self.x_test = x_test
self.y_test = y_test
def __getitem__(self, index):
return x_test[index], y_test[index].unsqueeze(0)
def __len__(self):
return self.len
train = FaceTrain()
test = FaceTest()
train_loader = DataLoader(dataset=train, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=64, shuffle=True)
epochs = 10
steps = 0
train_losses, test_losses = [], []
for e in range(epochs):
running_loss = 0
for images, labels in train_loader:
optimizer.zero_grad()
log_ps = net(images)
loss = loss_function(log_ps, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
test_loss = 0
accuracy = 0
with torch.no_grad():
for images, labels in test_loader:
log_ps = net(images)
test_loss += loss_function(log_ps, labels)
ps = torch.exp(log_ps)
top_p, top_class = ps.topk(1, dim=1)
equals = top_class.type('torch.LongTensor') == labels.type(torch.LongTensor).view(*top_class.shape)
accuracy += torch.mean(equals.type('torch.FloatTensor'))
train_losses.append(running_loss/len(train_loader))
test_losses.append(test_loss/len(test_loader))
print("[Epoch: {}/{}] ".format(e+1, epochs),
"[Training Loss: {:.3f}] ".format(running_loss/len(train_loader)),
"[Test Loss: {:.3f}] ".format(test_loss/len(test_loader)),
"[Test Accuracy: {:.3f}]".format(accuracy/len(test_loader)))

First the major issues...
1. The main issue with this code is that you're using the wrong output shape and the wrong loss function for classification.
nn.BCELoss computes the binary cross entropy loss. This is applicable when you have one or more targets which are either 0 or 1 (hence the binary). In your case the target is a single integer between 0 and 9. Since there are only a small number of potential target values, the most common approach is to use categorical cross-entropy loss (nn.CrossEntropyLoss). The "theoretical" definition of cross entropy loss expects the network outputs and the targets to both be 10 dimensional vectors where the target is all zeros except in one location (one-hot encoded). However for computational stability and space efficiency reasons, pytorch's nn.CrossEntropyLoss directly takes the integer as a target. However, you still need to provide it with a 10 dimensional output vector from your network.
# pseudo code (ignoring batch dimension)
loss = nn.functional.cross_entropy_loss(<output 10d vector>, <integer target>)
To fix this issue in your code we need to have fc3 output a 10 dimensional feature, and we need the labels to be integers (not floats). Also, there's no need to use .sigmoid on fc3 since pytorch's cross-entropy loss function internally applies log-softmax before computing the final loss value.
2. As pointed out by Serget Dymchenko, you need to switch the network to eval mode during inference and train mode during train. This mainly affects dropout and batch_norm layers since they behave differently during training and inference.
3. A learning rate of 0.03 is probably a little too high. It works just fine with a learning rate of 0.001 and in a couple experiments I saw the training diverge at 0.03.
To accommodate these fixes a number of changes needed to be made. The minimal corrections to the code are shown below. I commented any lines which were changed with #### followed by a short description of the change.
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.autograd import Variable
from keras.datasets import mnist
(x_train, y_train), (x_test, y_test) = mnist.load_data()
def resize(pics):
pictures = []
for image in pics:
image = Image.fromarray(image).resize((dim, dim))
image = np.array(image)
pictures.append(image)
return np.array(pictures)
dim = 60
x_train, x_test = resize(x_train), resize(x_test) # because my real problem is in 60x60
x_train = x_train.reshape(-1, 1, dim, dim).astype('float32') / 255
x_test = x_test.reshape(-1, 1, dim, dim).astype('float32') / 255
#### float32 -> int64
y_train, y_test = y_train.astype('int64'), y_test.astype('int64')
#### no reason to test for cuda before converting to numpy
#### I assume you were taking a subset for debugging? No reason to not use all the data
x_train = torch.from_numpy(x_train)
x_test = torch.from_numpy(x_test)
y_train = torch.from_numpy(y_train)
y_test = torch.from_numpy(y_test)
class ConvNet(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
self.fc1 = nn.Linear(5*5*128, 1024)
self.fc2 = nn.Linear(1024, 2048)
#### 1 -> 10
self.fc3 = nn.Linear(2048, 10)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.dropout(x, 0.5)
#### removed sigmoid
x = self.fc3(x)
return x
net = ConvNet()
#### 0.03 -> 1e-3
optimizer = optim.Adam(net.parameters(), lr=1e-3)
#### BCELoss -> CrossEntropyLoss
loss_function = nn.CrossEntropyLoss()
class FaceTrain:
def __init__(self):
self.len = x_train.shape[0]
self.x_train = x_train
self.y_train = y_train
def __getitem__(self, index):
#### .unsqueeze(0) removed
return x_train[index], y_train[index]
def __len__(self):
return self.len
class FaceTest:
def __init__(self):
self.len = x_test.shape[0]
self.x_test = x_test
self.y_test = y_test
def __getitem__(self, index):
#### .unsqueeze(0) removed
return x_test[index], y_test[index]
def __len__(self):
return self.len
train = FaceTrain()
test = FaceTest()
train_loader = DataLoader(dataset=train, batch_size=64, shuffle=True)
test_loader = DataLoader(dataset=test, batch_size=64, shuffle=True)
epochs = 10
steps = 0
train_losses, test_losses = [], []
for e in range(epochs):
running_loss = 0
#### put net in train mode
net.train()
for idx, (images, labels) in enumerate(train_loader):
optimizer.zero_grad()
log_ps = net(images)
loss = loss_function(log_ps, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
test_loss = 0
accuracy = 0
#### put net in eval mode
net.eval()
with torch.no_grad():
for images, labels in test_loader:
log_ps = net(images)
test_loss += loss_function(log_ps, labels)
#### removed torch.exp() since exponential is monotone, taking it doesn't change the order of outputs. Similarly with torch.softmax()
top_p, top_class = log_ps.topk(1, dim=1)
#### convert to float/long using proper methods. what you have won't work for cuda tensors.
equals = top_class.long() == labels.long().view(*top_class.shape)
accuracy += torch.mean(equals.float())
train_losses.append(running_loss/len(train_loader))
test_losses.append(test_loss/len(test_loader))
print("[Epoch: {}/{}] ".format(e+1, epochs),
"[Training Loss: {:.3f}] ".format(running_loss/len(train_loader)),
"[Test Loss: {:.3f}] ".format(test_loss/len(test_loader)),
"[Test Accuracy: {:.3f}]".format(accuracy/len(test_loader)))
Results of training are now...
[Epoch: 1/10] [Training Loss: 0.139] [Test Loss: 0.046] [Test Accuracy: 0.986]
[Epoch: 2/10] [Training Loss: 0.046] [Test Loss: 0.042] [Test Accuracy: 0.987]
[Epoch: 3/10] [Training Loss: 0.031] [Test Loss: 0.040] [Test Accuracy: 0.988]
[Epoch: 4/10] [Training Loss: 0.022] [Test Loss: 0.029] [Test Accuracy: 0.990]
[Epoch: 5/10] [Training Loss: 0.017] [Test Loss: 0.066] [Test Accuracy: 0.987]
[Epoch: 6/10] [Training Loss: 0.015] [Test Loss: 0.056] [Test Accuracy: 0.985]
[Epoch: 7/10] [Training Loss: 0.018] [Test Loss: 0.039] [Test Accuracy: 0.991]
[Epoch: 8/10] [Training Loss: 0.012] [Test Loss: 0.057] [Test Accuracy: 0.988]
[Epoch: 9/10] [Training Loss: 0.012] [Test Loss: 0.041] [Test Accuracy: 0.991]
[Epoch: 10/10] [Training Loss: 0.007] [Test Loss: 0.048] [Test Accuracy: 0.992]
Some other issues that will improve your performance and code.
4. You're never moving the model to the GPU. This means you won't be getting GPU acceleration.
5. torchvision is designed with all the standard transforms and datasets and is built to be used with PyTorch. I recommend using it. This also removes the dependency on keras in your code.
6. Normalize your data by subtracting the mean and dividing by the standard deviation to improve performance of your network. With torchvision you can use transforms.Normalize. This won't make a big difference in MNIST because its already too easy. But in more difficult problems it turns out to be important.
Further improved code is show below (much faster on GPU).
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
from torch.utils.data import DataLoader
from torchvision.datasets import MNIST
from torchvision import transforms
dim = 60
class ConvNet(nn.Module):
def __init__(self):
super().__init__()
self.conv1 = nn.Conv2d(1, 32, 3)
self.conv2 = nn.Conv2d(32, 64, 3)
self.conv3 = nn.Conv2d(64, 128, 3)
self.fc1 = nn.Linear(5 * 5 * 128, 1024)
self.fc2 = nn.Linear(1024, 2048)
self.fc3 = nn.Linear(2048, 10)
def forward(self, x):
x = F.max_pool2d(F.relu(self.conv1(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv2(x)), (2, 2))
x = F.max_pool2d(F.relu(self.conv3(x)), (2, 2))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.dropout(x, 0.5)
x = self.fc3(x)
return x
net = ConvNet()
if torch.cuda.is_available():
net.cuda()
optimizer = optim.Adam(net.parameters(), lr=1e-3)
loss_function = nn.CrossEntropyLoss()
train_dataset = MNIST('./data', train=True, download=True,
transform=transforms.Compose([
transforms.Resize((dim, dim)),
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
test_dataset = MNIST('./data', train=False, download=True,
transform=transforms.Compose([
transforms.Resize((dim, dim)),
transforms.ToTensor(),
transforms.Normalize((0.1307,), (0.3081,))
]))
train_loader = DataLoader(dataset=train_dataset, batch_size=64, shuffle=True, num_workers=8)
test_loader = DataLoader(dataset=test_dataset, batch_size=64, shuffle=False, num_workers=8)
epochs = 10
steps = 0
train_losses, test_losses = [], []
for e in range(epochs):
running_loss = 0
net.train()
for images, labels in train_loader:
if torch.cuda.is_available():
images, labels = images.cuda(), labels.cuda()
optimizer.zero_grad()
log_ps = net(images)
loss = loss_function(log_ps, labels)
loss.backward()
optimizer.step()
running_loss += loss.item()
else:
test_loss = 0
accuracy = 0
net.eval()
with torch.no_grad():
for images, labels in test_loader:
if torch.cuda.is_available():
images, labels = images.cuda(), labels.cuda()
log_ps = net(images)
test_loss += loss_function(log_ps, labels)
top_p, top_class = log_ps.topk(1, dim=1)
equals = top_class.flatten().long() == labels
accuracy += torch.mean(equals.float()).item()
train_losses.append(running_loss/len(train_loader))
test_losses.append(test_loss/len(test_loader))
print("[Epoch: {}/{}] ".format(e+1, epochs),
"[Training Loss: {:.3f}] ".format(running_loss/len(train_loader)),
"[Test Loss: {:.3f}] ".format(test_loss/len(test_loader)),
"[Test Accuracy: {:.3f}]".format(accuracy/len(test_loader)))
Updated results of training...
[Epoch: 1/10] [Training Loss: 0.125] [Test Loss: 0.045] [Test Accuracy: 0.987]
[Epoch: 2/10] [Training Loss: 0.043] [Test Loss: 0.031] [Test Accuracy: 0.991]
[Epoch: 3/10] [Training Loss: 0.030] [Test Loss: 0.030] [Test Accuracy: 0.991]
[Epoch: 4/10] [Training Loss: 0.024] [Test Loss: 0.046] [Test Accuracy: 0.990]
[Epoch: 5/10] [Training Loss: 0.020] [Test Loss: 0.032] [Test Accuracy: 0.992]
[Epoch: 6/10] [Training Loss: 0.017] [Test Loss: 0.046] [Test Accuracy: 0.991]
[Epoch: 7/10] [Training Loss: 0.015] [Test Loss: 0.034] [Test Accuracy: 0.992]
[Epoch: 8/10] [Training Loss: 0.011] [Test Loss: 0.048] [Test Accuracy: 0.992]
[Epoch: 9/10] [Training Loss: 0.012] [Test Loss: 0.037] [Test Accuracy: 0.991]
[Epoch: 10/10] [Training Loss: 0.013] [Test Loss: 0.038] [Test Accuracy: 0.992]

One thing I noticed that you test the model in train mode. You need to call net.eval() to disable dropouts (and then net.train() again to put it back in the train mode).
Maybe there are other issues. Is training loss going down? Have you tried to overfit on a single example?

Related

Convert Tensoflow model to PyTorch model - model isn't learning

I'm trying to port a tensorflow neural network to pytorch, as an exercise to familiarize myself with both / their nuances. This is the tensorflow network I'm porting to pytorch:
import pandas as pd
import tensorflow as tf
from tensorflow.keras.preprocessing import sequence
from tensorflow.keras.models import Sequential
from tensorflow.keras.layers import Dense, Dropout, Activation
from tensorflow.keras.layers import Embedding
from tensorflow.keras.layers import Conv1D, GlobalMaxPooling1D
from tensorflow.keras.datasets import imdb
(x_train, y_train), (x_test, y_test) = imdb.load_data(num_words=5000)
x_train = sequence.pad_sequences(x_train, maxlen=400, padding="post")
x_test = sequence.pad_sequences(x_test, maxlen=400, padding="post")
model = Sequential()
model.add(Embedding(5000, 50, input_length=400))
model.add(Dropout(0.2))
model.add(Conv1D(250, 3, padding='valid',activation='relu',strides=1))
model.add(GlobalMaxPooling1D())
model.add(Dense(250))
model.add(Dropout(0.2))
model.add(Activation('relu'))
model.add(Dense(1))
model.add(Activation('sigmoid'))
model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
h2 = model.fit(x_train, y_train, batch_size=32, epochs=10, validation_data=(x_test, y_test))
The shapes of each layer is shown below:
Model: "sequential"
_________________________________________________________________
Layer (type) Output Shape Param #
=================================================================
embedding (Embedding) (None, 400, 50) 250000
dropout (Dropout) (None, 400, 50) 0
conv1d (Conv1D) (None, 398, 250) 37750
global_max_pooling1d (Globa (None, 250) 0
lMaxPooling1D)
dense (Dense) (None, 250) 62750
dropout_1 (Dropout) (None, 250) 0
activation (Activation) (None, 250) 0
dense_1 (Dense) (None, 1) 251
activation_1 (Activation) (None, 1) 0
=================================================================
Total params: 350,751
Trainable params: 350,751
Non-trainable params: 0
And the output of the tensorflow model is:
Epoch 1/10
loss: 0.4043 - accuracy: 0.8021 - val_loss: 0.2764 - val_accuracy: 0.8854
Epoch 2/10
loss: 0.2332 - accuracy: 0.9052 - val_loss: 0.2690 - val_accuracy: 0.8888
Epoch 3/10
loss: 0.1598 - accuracy: 0.9389 - val_loss: 0.2948 - val_accuracy: 0.8832
Epoch 4/10
loss: 0.1112 - accuracy: 0.9600 - val_loss: 0.3015 - val_accuracy: 0.8906
Epoch 5/10
loss: 0.0810 - accuracy: 0.9700 - val_loss: 0.3057 - val_accuracy: 0.8868
Epoch 6/10
loss: 0.0537 - accuracy: 0.9811 - val_loss: 0.4055 - val_accuracy: 0.8868
Epoch 7/10
loss: 0.0408 - accuracy: 0.9860 - val_loss: 0.4083 - val_accuracy: 0.8852
Epoch 8/10
loss: 0.0411 - accuracy: 0.9845 - val_loss: 0.4789 - val_accuracy: 0.8789
Epoch 9/10
loss: 0.0380 - accuracy: 0.9862 - val_loss: 0.4828 - val_accuracy: 0.8827
Epoch 10/10
loss: 0.0329 - accuracy: 0.9879 - val_loss: 0.4999 - val_accuracy: 0.8825
Here's what I have in my PyTorch port over:
from torch.utils.data import DataLoader
from torch.utils.data import Dataset
import torch
from tqdm import tqdm
import torch.nn.functional as F
from sklearn.metrics import accuracy_score
class CustomDataset(Dataset):
def __init__(self, x, y):
self.x = x
self.y = y
def __len__(self):
return len(self.y)
def __getitem__(self, idx):
return self.x[idx], self.y[idx]
train_dataloader = DataLoader(CustomDataset(torch.Tensor(x_train), torch.Tensor(y_train)), batch_size=32, shuffle=True)
test_dataloader = DataLoader(CustomDataset(torch.Tensor(x_test), torch.Tensor(y_test)), batch_size=32, shuffle=True)
class MyModel(torch.nn.Module):
def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
super(MyModel, self).__init__()
self.embedding_dims = embedding_dims
self.input_len = input_len
self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
self.dropout1 = torch.nn.Dropout(p=0.2)
self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
self.pool = torch.nn.AdaptiveMaxPool1d(1)
self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
self.dropout2 = torch.nn.Dropout(p=0.2)
self.activation = torch.nn.ReLU()
self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
self.activation2 = torch.nn.Sigmoid()
def forward(self, x):
x = self.dropout1(self.embedding(x.type(torch.LongTensor)))
x = self.conv1d(x.view(-1, self.embedding_dims, self.input_len))
x = self.pool(x)
x = self.activation(self.dropout2(self.linear1(x.view(-1,x.size()[1]))))
x = self.activation2(self.output(x))
return x
class FitTorchModel():
def __init__(self, model, num_epochs=10, steps_per_epoch=782):
self.model = model
self.epochs = num_epochs
self.steps_per_epoch = steps_per_epoch
def fit(self, train_dataloader, test_dataloader):
opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
crit = torch.nn.BCELoss(reduction = "mean")
history_df = pd.DataFrame(columns = ["Loss", "Accuracy", "Val_Loss", "Val_Acc"])
for epoch in range(self.epochs):
self.model.train()
print(f"Epoch {epoch}")
epoch_loss = 0
epoch_acc = 0
it = iter(train_dataloader)
for step in tqdm(range(self.steps_per_epoch)):
opt.zero_grad()
x, y = next(it)
y_pred = self.model(x).view(-1)
loss = crit(y_pred, y)
epoch_loss += loss.item()
epoch_acc += accuracy_score(y==1, y_pred > 0.5)
loss.backward()
opt.step()
val_loss, val_acc = self.predict_proba(test_dataloader, crit)
df = pd.DataFrame({"Loss": epoch_loss/(step+1),
"Accuracy": epoch_acc/(step+1),
"Val_Loss": val_loss, "Val_Acc": val_acc}, index=[0])
history_df = pd.concat((history_df, df), ignore_index=True)
return history_df
def predict_proba(self, test_dataloader, crit):
self.model.eval()
val_loss = 0
val_acc = 0
it = iter(test_dataloader)
with torch.no_grad():
for step in tqdm(range(self.steps_per_epoch)):
x,y = next(it)
y_pred = self.model(x).view(-1)
batch_loss = crit(y_pred, y)
val_loss += batch_loss.item()
val_acc += accuracy_score(y==1, y_pred > 0.5)
return val_loss/(step+1), val_acc/(step+1)
ftm = FitTorchModel(model=MyModel(), num_epochs=10, steps_per_epoch=782)
history_df = ftm.fit(train_dataloader, test_dataloader)
The shape of each layer is:
After embedding layer: torch.Size([32, 400, 50])
After dropout1 layer: torch.Size([32, 400, 50])
After convolution1d layer: torch.Size([32, 250, 398])
After maxpooling layer: torch.Size([32, 250, 1])
After linear1 layer: torch.Size([32, 250])
After dropout2 layer: torch.Size([32, 250])
After activation layer: torch.Size([32, 250])
After output layer: torch.Size([32, 1])
After activation2 layer: torch.Size([32, 1])
The output of the pytorch model training is:
Loss Accuracy Val_Loss Val_Acc
0 0.697899 0.505874 0.692495 0.511629
1 0.693063 0.503477 0.693186 0.503637
2 0.693190 0.496044 0.693149 0.499201
3 0.693181 0.501359 0.693082 0.502038
4 0.693169 0.503237 0.693234 0.495964
5 0.693177 0.500240 0.693154 0.500679
6 0.693069 0.507473 0.693258 0.498881
7 0.693948 0.500320 0.693145 0.501598
8 0.693196 0.499640 0.693164 0.496324
9 0.693170 0.500759 0.693140 0.501918
Couple things: the accuracy hovers around guessing (this is a binary classification task), no matter how many epochs have passed. Secondly, the training loss barely improves. I set the learning rate to the default learning rate described by tensorflow's Adam Optimizer docs. What else am I missing here? I had some trouble with the input / output dimensions for the various layers - did I mess those up at all?
Some observations:
Use BCEWithLogitsLoss as loss on the output of the last linear layer, before the sigmoid. This includes the sigmoid activation in a more numerically stable fashion.
The TensorFlow model has a ReLU after the Convolution, the pytorch implementations does not.
In general, for debugging, one might want to look at weight.grad of some of your weights after the loss.backward() and see if gradients calculated. Also printing out the value of one of the weights in each iteration to see if your optimizer actually changes the weights can help...
Also, it can depend on the input data:
(Are you sure that x_test is scaled correctly?)
If you are transforming your inputs to Long before embedding them and all x_test, for example, are floats between 0 and 1, they will all be converted to 0! And the network will have a hard time predicting the labels from all zeros as constant input!
But now to the actual issue in this particular case:
Be careful with .view! It might not do what you expect. It just reshapes the tensor but does not move the data around.
What you really want is .moveaxes(-1,2) instead!!
Loss Accuracy Val_Loss Val_Acc
0 0.573489 0.671715 0.402601 0.819413
1 0.376908 0.830163 0.33786 0.850783
2 0.308343 0.868646 0.296171 0.872323
3 0.258806 0.893342 0.319121 0.865849
4 0.227044 0.907649 0.3172 0.868326
5 0.202789 0.918478 0.281184 0.886549
6 0.179744 0.928549 0.291027 0.886589
7 0.161205 0.93702 0.329196 0.879156
8 0.145447 0.944094 0.294914 0.889746
9 0.133034 0.949568 0.291476 0.889826
After adding the relu after the convolution and, more importantly, fixing the view!
class MyModel(torch.nn.Module):
def __init__(self, vocab_size=5000, input_len=400, embedding_dims=50, kernel_size=3, filters=250, hidden_dims=250):
super(MyModel, self).__init__()
self.embedding_dims = embedding_dims
self.input_len = input_len
self.embedding = torch.nn.Embedding(num_embeddings=vocab_size, embedding_dim=embedding_dims)
self.dropout1 = torch.nn.Dropout(p=0.2)
self.conv1d = torch.nn.Conv1d(in_channels=embedding_dims, out_channels=filters, kernel_size=kernel_size, padding=(0,), stride=1)
self.pool = torch.nn.AdaptiveMaxPool1d(1)
self.linear1 = torch.nn.Linear(in_features=hidden_dims, out_features=hidden_dims)
self.dropout2 = torch.nn.Dropout(p=0.2)
self.activation = torch.nn.ReLU()
self.output = torch.nn.Linear(in_features=hidden_dims, out_features=1)
self.activation2 = torch.nn.Sigmoid()
def forward(self, x):
x = self.dropout1(self.embedding(x.type(torch.LongTensor)))
x = self.activation(self.conv1d(x.moveaxis(-1,-2)))
x = self.pool(x).squeeze(-1)
x = self.activation(self.dropout2(self.linear1(x)))
x = self.activation2(self.output(x))
return x
What is tinymodel you init opt with in fit function:
opt = torch.optim.Adam(tinymodel.parameters(), lr=0.001)
It seems like your optimizer is not working on the right model (see this answer on the relation between the optimizer and the parameters of the model).
You need to replace this line in fit function:
def fit(self, train_dataloader, test_dataloader):
opt = torch.optim.Adam(self.model.parameters(), lr=0.001)
# ...
Additionally, you are using Dropout layer that has different behavior in train and test.
You should add self.model.train() and self.model.eval() at the beginning of your fit and predict_proba functions respectively.

Simulate streaming learning using Tensorflow's fit() and evaluate() built-in methods

What I'm trying to achieve is to simulate a streaming learning method using Tensorflow's fit() and evaluate() methods.
What I have until now is a script like this, after getting some help from the community here:
import pandas as pd
import tensorflow as tf
df = pd.read_csv('labeled_tweets_processed.csv')
labels = df.pop('class')
dataset = tf.data.Dataset.from_tensor_slices((df, labels))
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(dataset.map(lambda text, label: text))
BUFFER_SIZE = 2
BATCH_SIZE = 1
train_dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(
input_dim=len(encoder.get_vocabulary()),
output_dim=64,
# Use masking to handle the variable sequence lengths
mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1)
])
model.compile(loss=tf.keras.losses.BinaryCrossentropy(from_logits=True),
optimizer=tf.keras.optimizers.Adam(1e-4),
metrics=['accuracy'])
to setup the model and training the model using this command:
history = model.fit(train_dataset, epochs=1)
What I actually want to do is to simulate a Streaming environment where I have a pipeline like Predict -> Fit into the model.
I thought it could be accomplished by using a method like:
for x, y in enumerate(train_dataset):
test_loss, test_acc = model.evaluate([x, y])
model.fit(y)
but it doesn't seems to work right like this.
What is the right way to simulate the described environment?
What is the best way to iterate through dataset's each entry and input to the desired methods?
Thank you very much in advance!
Update 1:
What I have right now, but resulting in very low model accuracy. Not sure if the metrics are updated the right way.
for idx, (x, y) in enumerate(train_dataset):
pred = model.predict_on_batch(x)
print(model.test_on_batch(x, pred, reset_metrics=False, return_dict=True))
model.train_on_batch(x, y, reset_metrics=False)
print(f"After {idx} entries")
You can try something like this:
for idx, (x, y) in enumerate(train_dataset):
test_loss, test_acc = model.evaluate(x, y)
model.fit(x, y, epochs=1)
Update 1:
Maybe try using a custom training loop:
import pandas as pd
import tensorflow as tf
df = pd.DataFrame(data = {'texts': ['Some text ssss', 'Some text', 'Some text', 'Some text', 'Some text'],
'class': [0, 0, 1, 1, 1]})
labels = df.pop('class')
dataset = tf.data.Dataset.from_tensor_slices((df, labels))
VOCAB_SIZE = 1000
encoder = tf.keras.layers.TextVectorization(
max_tokens=VOCAB_SIZE)
encoder.adapt(dataset.map(lambda text, label: text))
BUFFER_SIZE = 2
BATCH_SIZE = 3
train_dataset = dataset.shuffle(BUFFER_SIZE).batch(BATCH_SIZE).prefetch(tf.data.AUTOTUNE)
model = tf.keras.Sequential([
encoder,
tf.keras.layers.Embedding(
input_dim=len(encoder.get_vocabulary()),
output_dim=64,
# Use masking to handle the variable sequence lengths
mask_zero=True),
tf.keras.layers.Bidirectional(tf.keras.layers.LSTM(64)),
tf.keras.layers.Dense(64, activation='relu'),
tf.keras.layers.Dense(1, activation='sigmoid')
])
opt = tf.keras.optimizers.Adam(1e-4)
loss_fn = tf.keras.losses.BinaryCrossentropy()
train_acc_metric = tf.keras.metrics.BinaryAccuracy()
test_acc_metric = tf.keras.metrics.BinaryAccuracy()
epochs = 2
for epoch in range(epochs):
print("\nStart of epoch %d" % (epoch + 1,))
for step, (x_batch_train, y_batch_train) in enumerate(train_dataset):
pred = model(x_batch_train)
test_acc_metric.update_state(y_batch_train, pred)
print("Current test acc: %.4f" % (float(test_acc_metric.result()),))
with tf.GradientTape() as tape:
logits = model(x_batch_train, training=True)
loss_value = loss_fn(y_batch_train, logits)
grads = tape.gradient(loss_value, model.trainable_weights)
opt.apply_gradients(zip(grads, model.trainable_weights))
train_acc_metric.update_state(y_batch_train, logits)
print("Current train acc: %.4f" % (float(train_acc_metric.result()),))
test_acc = test_acc_metric.result()
print("Total test acc over epoch: %.4f" % (float(test_acc),))
test_acc_metric.reset_states()
train_acc = train_acc_metric.result()
print("Total train acc over epoch: %.4f" % (float(train_acc),))
train_acc_metric.reset_states()
Start of epoch 1
Current test acc: 0.6922
Current train acc: 0.6922
Current test acc: 0.6936
Current train acc: 0.6936
Current test acc: 0.6928
Current train acc: 0.6928
Current test acc: 0.6934
Current train acc: 0.6934
Current test acc: 0.6938
Current train acc: 0.6938
Total test acc over epoch: 0.6938
Total train acc over epoch: 0.6938
Start of epoch 2
Current test acc: 0.6914
Current train acc: 0.6914
Current test acc: 0.6914
Current train acc: 0.6914
Current test acc: 0.6926
Current train acc: 0.6926
Current test acc: 0.6932
Current train acc: 0.6932
Current test acc: 0.6936
Current train acc: 0.6936
Total test acc over epoch: 0.6936
Total train acc over epoch: 0.6936

Why I am getting a negative loss and negative validation loss in my model

I am training a variational autoencoder using USPS dataset of shape (7291, 16, 16). Below is my code snipet. I also tried the same code snipet on MNIST dataset of shape (60000,28,28) and everything seems to work fine. Both are gray scale images. I can figure out why I am getting the a negative value for training loss and validation loss for USPS dataset. The code execution is quite straight forwards, The only changes from MNIST model is mnist.load_data() to usps.load_data().
I also have also tried reducing the number of layers in both the encoder and decoder network but the result for the USPS model appears the same. I can figure out what exactly I am getting wrong. please I need your assistance to understand the reason for the negative values.
!pip install extra_keras_datasets
#######################################
from extra_keras_datasets import usps
import keras
from keras.layers import Conv2D, Conv2DTranspose, Input, Flatten, Dense, Lambda, Reshape
#from keras.layers import BatchNormalization
from keras.models import Model
from keras.datasets import mnist
import tensorflow.compat.v1.keras.backend as K
import tensorflow as tf
tf.compat.v1.disable_eager_execution()
import numpy as np
import matplotlib.pyplot as plt
# Load MNIST
# (x_train, y_train), (x_test, y_test) = mnist.load_data()
(x_train, y_train), (x_test, y_test) = usps.load_data()
#Normalize and reshape ============
#Norm.
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
x_train = x_train / 255
x_test = x_test / 255
# Reshape
img_width = x_train.shape[1]
img_height = x_train.shape[2]
num_channels = 1 #MNIST --> grey scale so 1 channel
x_train = x_train.reshape(x_train.shape[0], img_height, img_width, num_channels)
x_test = x_test.reshape(x_test.shape[0], img_height, img_width, num_channels)
input_shape = (img_height, img_width, num_channels)
# ========================
# BUILD THE MODEL
# # ================= #############
# # Encoder
#Let us define 4 conv2D, flatten and then dense
# # ================= ############
latent_dim = 2 # Number of latent dim parameters
#Create the model
input_img = Input(shape=input_shape, name='encoder_input')
x = Conv2D(32, 3, padding='same', activation='relu')(input_img)
x = Conv2D(64, 3, padding='same', activation='relu',strides=(2, 2))(x)
x = Conv2D(64, 3, padding='same', activation='relu')(x)
x = Conv2D(64, 3, padding='same', activation='relu')(x)
conv_shape = K.int_shape(x) #Shape of conv to be provided to decoder
print(conv_shape)
#Flatten
x = Flatten()(x)
x = Dense(32, activation='relu')(x)
# Two outputs, for latent mean and log variance (std. dev.)
#Use these to sample random variables in latent space to which inputs are mapped.
z_mu = Dense(latent_dim, name='latent_mu')(x) #Mean values of encoded input
z_sigma = Dense(latent_dim, name='latent_sigma')(x) #Std dev. (variance) of encoded input
#REPARAMETERIZATION TRICK
# Define sampling function to sample from the distribution
# Reparameterize sample based on the process defined by Gunderson and Huang
# into the shape of: mu + sigma squared x eps
#This is to allow gradient descent to allow for gradient estimation accurately.
def sample_z(args):
z_mu, z_sigma = args
eps = K.random_normal(shape=(K.shape(z_mu)[0], K.int_shape(z_mu)[1]))
return z_mu + K.exp(z_sigma / 2) * eps
# sample vector from the latent distribution
# z is the labda custom layer we are adding for gradient descent calculations
# using mu and variance (sigma)
z = Lambda(sample_z, output_shape=(latent_dim, ), name='z')([z_mu, z_sigma])
#Z (lambda layer) will be the last layer in the encoder.
# Define and summarize encoder model.
encoder = Model(input_img, [z_mu, z_sigma, z], name='encoder')
print(encoder.summary())
Decoder
x = Dense(conv_shape[1]*conv_shape[2]*conv_shape[3], activation='relu')(decoder_input)
# reshape to the shape of last conv. layer in the encoder, so we can
x = Reshape((conv_shape[1], conv_shape[2], conv_shape[3]))(x)
# upscale (conv2D transpose) back to original shape
# use Conv2DTranspose to reverse the conv layers defined in the encoder
x = Conv2DTranspose(32, 3, padding='same', activation='relu',strides=(2, 2))(x)
#Can add more conv2DTranspose layers, if desired.
#Using sigmoid activation
x = Conv2DTranspose(num_channels, 3, padding='same', activation='sigmoid', name='decoder_output')(x)
# Define and summarize decoder model
decoder = Model(decoder_input, x, name='decoder')
# apply the decoder to the latent sample
z_decoded = decoder(z)
decoder.summary()
custom loss and model fitting
#VAE is trained using two loss functions reconstruction loss and KL divergence
#Let us add a class to define a custom layer with loss
class CustomLayer(keras.layers.Layer):
def vae_loss(self, x, z_decoded):
x = K.flatten(x)
z_decoded = K.flatten(z_decoded)
# Reconstruction loss (as we used sigmoid activation we can use binarycrossentropy)
recon_loss = keras.metrics.binary_crossentropy(x, z_decoded)
# KL divergence
kl_loss = -5e-4 * K.mean(1 + z_sigma - K.square(z_mu) - K.exp(z_sigma), axis=-1)
return K.mean(recon_loss + kl_loss)
# add custom loss to the class
def call(self, inputs):
x = inputs[0]
z_decoded = inputs[1]
loss = self.vae_loss(x, z_decoded)
self.add_loss(loss, inputs=inputs)
return x
# apply the custom loss to the input images and the decoded latent distribution sample
y = CustomLayer()([input_img, z_decoded])
# y is basically the original image after encoding input img to mu, sigma, z
# and decoding sampled z values.
#This will be used as output for vae
# =================
# VAE
# =================
vae = Model(input_img, y, name='vae')
# Compile VAE
vae.compile(optimizer='adam', loss=None)
vae.summary()
# Train autoencoder
vae.fit(x_train, None, epochs = 10, batch_size = 32, validation_split = 0.2)
Here is my training History.
5832/5832 [==============================] - 5s 928us/sample - loss: 0.0345 - val_loss: -0.0278
Epoch 2/10
5832/5832 [==============================] - 4s 740us/sample - loss: -0.0301 - val_loss: -0.0292
Epoch 3/10
5832/5832 [==============================] - 4s 746us/sample - loss: -0.0307 - val_loss: -0.0293
Epoch 4/10
5832/5832 [==============================] - 4s 751us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 5/10
5832/5832 [==============================] - 4s 753us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 6/10
5832/5832 [==============================] - 4s 746us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 7/10
5832/5832 [==============================] - 4s 750us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 8/10
5832/5832 [==============================] - 4s 742us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 9/10
5832/5832 [==============================] - 4s 751us/sample - loss: -0.0307 - val_loss: -0.0294
Epoch 10/10
5832/5832 [==============================] - 4s 748us/sample - loss: -0.0307 - val_loss: -0.0294

CNN Classifier only guesses one thing - PyTorch

I'm trying to make a model predict the race of a 75x75 image's ethnicity, but when ever I train the model, the accuracy always stays completely still at 53.2%. I didn't realize why until I actually ran it on some of photos. It turned out, that no matter what the photo was, it would always predict 'other'. I'm not entirely sure why though.
I copied the code over from the official PyTorch Quickstart tutorial, and in that dataset or the standard MNIST data, it worked fine. I changed the dataset to the UTKFace, and then it started only predicting one label, all the time.
Here's my code:
import torch
from torch import nn
from torch.utils.data import DataLoader
from torchvision.datasets import ImageFolder
from torchvision.transforms import ToTensor
import torch.nn.functional as F
training_data = ImageFolder(
root = "data_training/",
transform = ToTensor(),
)
testing_data = ImageFolder(
root = "data_testing/",
transform = ToTensor()
)
training_dataloader = DataLoader(training_data, batch_size=64, shuffle=True)
test_dataloader = DataLoader(testing_data, batch_size=64, shuffle=True)
class NeuralNetwork(nn.Module):
def __init__(self):
super(NeuralNetwork, self).__init__()
self.conv1 = nn.Conv2d(3, 6, 5)
self.pool = nn.MaxPool2d(2, 2)
self.conv2 = nn.Conv2d(6, 16, 5)
self.fc1 = nn.Linear(1296, 1024)
self.fc2 = nn.Linear(1024, 1024)
self.fc3 = nn.Linear(1024, 512)
self.fc4 = nn.Linear(512, 84)
self.fc5 = nn.Linear(84, 5)
def forward(self, x):
x = self.pool(F.relu(self.conv1(x)))
x = self.pool(F.relu(self.conv2(x)))
x = x.view(x.size(0), -1)
x = F.relu(self.fc1(x))
x = F.relu(self.fc2(x))
x = F.relu(self.fc3(x))
x = F.relu(self.fc4(x))
x = self.fc5(x)
return x
model = NeuralNetwork().to("cpu")
loss_fn = nn.CrossEntropyLoss()
optimizer = torch.optim.SGD(model.parameters(), lr=1e-3)
def train(dataloader, model, loss_fn, optimizer):
size = len(dataloader.dataset)
for batch, (X, y) in enumerate(dataloader):
X, y = X.to("cpu"), y.to("cpu")
# Compute prediction error
pred = model(X)
loss = loss_fn(pred, y)
# Backpropagation
optimizer.zero_grad()
loss.backward()
optimizer.step()
if batch % 100 == 0:
loss, current = loss.item(), batch * len(X)
print(f"loss: {loss:>7f} [{current:>5d}/{size:>5d}]")
def tests(dataloader, model):
size = len(dataloader.dataset)
model.eval()
test_loss, correct = 0, 0
with torch.no_grad():
for X, y in dataloader:
X, y = X.to("cpu"), y.to("cpu")
pred = model(X)
test_loss += loss_fn(pred, y).item()
correct += (pred.argmax(1) == y).type(torch.float).sum().item()
test_loss /= size
correct /= size
print(f"Test Error: \n Accuracy: {(100*correct):>0.1f}%, Avg loss: {test_loss:>8f} \n")
epochs = 10
for t in range(epochs):
print(f"Epoch {t+1}\n-------------------------------")
train(training_dataloader, model, loss_fn, optimizer)
tests(test_dataloader, model)
torch.save(model.state_dict(), "model.pth")
The training logs:
Epoch 1
-------------------------------
loss: 1.628994 [ 0/23705]
loss: 1.620698 [ 6400/23705]
loss: 1.615423 [12800/23705]
loss: 1.596390 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.024725
Epoch 2
-------------------------------
loss: 1.593613 [ 0/23705]
loss: 1.581375 [ 6400/23705]
loss: 1.583656 [12800/23705]
loss: 1.591942 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.024165
Epoch 3
-------------------------------
loss: 1.541260 [ 0/23705]
loss: 1.592345 [ 6400/23705]
loss: 1.540908 [12800/23705]
loss: 1.540741 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.023705
Epoch 4
-------------------------------
loss: 1.566888 [ 0/23705]
loss: 1.524875 [ 6400/23705]
loss: 1.540764 [12800/23705]
loss: 1.510044 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.023323
Epoch 5
-------------------------------
loss: 1.530084 [ 0/23705]
loss: 1.498773 [ 6400/23705]
loss: 1.537755 [12800/23705]
loss: 1.508989 [19200/23705]
Test Error:
Accuracy: 53.2%, Avg loss: 0.022993
....
No matter how many epochs I set it to, or how many layers I add in to try to get it to overfit, it always just seems to guess the same thing over and over again, with no signs of improvement.
I separated the UTKFace dataset into folders based on the ethnicity category of the name. There are 23705 images in the training data and 10134 in the testing.
I'm not sure why this is happening. Is my dataset not large enough? Are there not enough layers?
The number of layers and the dataset size don't explain this behavior for this example. Your CNN is behaving as a constant function, so far I don't know why, but these might be some clues:
Since you have separated your data by label into folders, if you are training your model using only one of those folders you will obtain a constant function.
The last layer of your neural network has no activation function! This is, in method forward you are doing x = self.fc5(x) instead of x = F.<function>(self.fc5(x)).
Where do you indicate, when loading the training data, which is the label for each image? Are you sure that training_dataloader is loading the images with their correct label?
few comments:
Did you check the ground truth in the test data (the shape may be different)
can you check the output probabilities to see if the predictions are unanimous ? (Btw you don't necessarily need a activation function at the end in this case as the pytorch crossentropy already contains a logsoftmax)
Did you try conv2d with more filters (like 16,32 or 64)
The % of error seems fine as in the link you put, the accuracy is around 35%
It does seems a bit weird not to be able to over-fit.

Siamese network on MNIST dataset is not getting trained

I train Siamese network with constructive loss on two classes of MNIST dataset to identify whether two images are similar or not. Although the loss is decreasing in the beginning, it freezes later with accuracy around 0.5.
The model is trained on pairs of images and a label (0.0 for different, 1.0 for identical). I used only two classes for simplicity (zeros and ones) and prepared the dataset, so that it contains every pair of images. I've checked that the dataset is consistent (image pairs from dataset). I've also experimented with data normalization, different batch sizes, learning rates, initializations and regularization constants with no luck.
This is the model:
class Encoder(Model):
"""
A network that finds a 50-dimensional representation of the input images
so that the distances between them minimize the constructive loss
"""
def __init__(self):
super(Encoder, self).__init__(name='encoder')
self.cv = Conv2D(32, (3, 3), activation='relu', padding='Same',
input_shape=(28, 28, 1),
kernel_regularizer=tf.keras.regularizers.l2(0.01))
self.pool = MaxPooling2D((2, 2))
self.flatten = Flatten()
self.dense = Dense(50, activation=None,
kernel_regularizer=tf.keras.regularizers.l2(0.01))
def call(self, inputs, training=None, mask=None):
""" Forward pass for one image """
x = self.cv(inputs)
x = self.pool(x)
x = self.flatten(x)
x = self.dense(x)
return x
#staticmethod
def distance(difference):
""" The D function from the paper which is used in loss """
distance = tf.sqrt(tf.reduce_sum(tf.pow(difference, 2), 0))
return distance
The loss and accuracy:
def simnet_loss(target, x1, x2):
difference = x1 - x2
distance_vector = tf.map_fn(lambda x: Encoder.distance(x), difference)
loss = tf.map_fn(lambda distance: target * tf.square(distance) +
(1.0 - target) * tf.square(tf.maximum(0.0, 1.0 - distance)), distance_vector)
average_loss = tf.reduce_mean(loss)
return average_loss
def accuracy(y_true, y_pred):
distance_vector = tf.map_fn(lambda x: Encoder.distance(x), y_pred)
accuracy = tf.keras.metrics.binary_accuracy(y_true, distance_vector)
return accuracy
Training:
def train_step(images, labels):
with tf.GradientTape() as tape:
x1, x2 = images[:, 0, :, :, :], images[:, 1, :, :, :]
x1 = model(x1)
x2 = model(x2)
loss = simnet_loss(labels, x1, x2)
gradients = tape.gradient(loss, model.trainable_variables)
optimizer.apply_gradients(zip(gradients, model.trainable_variables))
return loss
model = Encoder()
optimizer = tf.keras.optimizers.Adam(learning_rate=learning_rate)
for epoch in range(n_epoch):
epoch_loss = 0
n_batches = int(x_train.shape[0]/batch_size)
for indices in np.array_split(np.arange(x_train.shape[0]), indices_or_sections=n_batches):
x = np.take(x_train, indices, axis=0)
y = np.take(y_train, indices, axis=0)
epoch_loss += train_step(x, y)
epoch_loss = epoch_loss / n_batches
accuracy = test_step(x_train, y_train)
val_accuracy = test_step(x_test, y_test)
tf.print("epoch:", epoch, "loss:", epoch_loss, "accuracy:", accuracy,
"val_accuracy:", val_accuracy, output_stream=sys.stdout)
The code above produces:
epoch: 0 loss: 0.755419433 accuracy: 0.318898171 val_accuracy:
0.310316473
epoch: 1 loss: 0.270610392 accuracy: 0.369466901 val_accuracy:
0.360871345
epoch: 2 loss: 0.262594223 accuracy: 0.430587918 val_accuracy:
0.418002456
epoch: 3 loss: 0.258690506 accuracy: 0.428258181 val_accuracy:
0.427044809
epoch: 4 loss: 0.25654456 accuracy: 0.43497327 val_accuracy:
0.44800657
epoch: 5 loss: 0.255373538 accuracy: 0.444840342 val_accuracy:
0.454993844
epoch: 6 loss: 0.254594624 accuracy: 0.453885168 val_accuracy:
0.454171807

Categories

Resources