Can I implement an iterative training loop for the ELM classifier? - python

I am using the Extreme Learning Machine classifier for hand gestures recognition but I still have 20% as accuracy.Can anyone help me to implement an iterative training loop to improve the accuracy?I am a beginner and Here the code I am using:I split the dataset that I prepared into train and test parts after normalization and I train it using the train function by calculating the Moore Penrose inverse and then predict the class of each gesture using the prediction function.
# -*- coding: utf-8 -*-
"""
Created on Sat Jul 4 17:52:25 2020
#author: lenovo
"""
# -*- coding: utf-8 -*-
__author__ = 'Sarra'
import numpy as np
class ELM(object):
def __init__(self, inputSize, outputSize, hiddenSize):
"""
Initialize weight and bias between input layer and hidden layer
Parameters:
inputSize: int
The number of input layer dimensions or features in the training data
outputSize: int
The number of output layer dimensions
hiddenSize: int
The number of hidden layer dimensions
"""
self.inputSize = inputSize
self.outputSize = outputSize
self.hiddenSize = hiddenSize
# Initialize random weight with range [-0.5, 0.5]
self.weight = np.matrix(np.random.uniform(-0.5, 0.5, (self.hiddenSize, self.inputSize)))
# Initialize random bias with range [0, 1]
self.bias = np.matrix(np.random.uniform(0, 1, (1, self.hiddenSize)))
self.H = 0
self.beta = 0
def sigmoid(self, x):
"""
Sigmoid activation function
Parameters:
x: array-like or matrix
The value that the activation output will look for
Returns:
The results of activation using sigmoid function
"""
return 1 / (1 + np.exp(-1 * x))
def predict(self, X):
"""
Predict the results of the training process using test data
Parameters:
X: array-like or matrix
Test data that will be used to determine output using ELM
Returns:
Predicted results or outputs from test data
"""
X = np.matrix(X)
y = self.sigmoid((X * self.weight.T) + self.bias) * self.beta
return y
def train(self, X, y):
"""
Extreme Learning Machine training process
Parameters:
X: array-like or matrix
Training data that contains the value of each feature
y: array-like or matrix
Training data that contains the value of the target (class)
Returns:
The results of the training process
"""
X = np.matrix(X)
y = np.matrix(y)
# Calculate hidden layer output matrix (Hinit)
self.H = (X * self.weight.T) + self.bias
# Sigmoid activation function
self.H = self.sigmoid(self.H)
# Calculate the Moore-Penrose pseudoinverse matriks
H_moore_penrose = np.linalg.pinv(self.H.T * self.H) * self.H.T
# Calculate the output weight matrix beta
self.beta = H_moore_penrose * y
return self.H * self.beta
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score
from sklearn import preprocessing
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
from sklearn.preprocessing import MinMaxScaler
# read the dataset
database = pd.read_csv(r"C:\\Users\\lenovo\\tensorflow\\tensorflow1\\Numpy-ELM\\hand_gestures_database.csv")
#separate data from labels
data = database.iloc[:, 1:].values.astype('float64')
#normalize data
#n_data = preprocessing.minmax_scale(data, feature_range=(0, 1), axis=0, copy=True)
scaler = MinMaxScaler()
scaler.fit(data)
n_data = scaler.transform(data)
#identify the labels
label = database.iloc[:, 0]
#encoding labels to transform each label to a value between 0 to number of labels-1
def prepare_targets(n):
le =preprocessing.LabelEncoder()
le.fit(n)
label_enc = le.transform(n)
return label_enc
label_enc = prepare_targets(label)
CLASSES = 10
#transform the value of each label to a binary vector
target = np.zeros([label_enc.shape[0], CLASSES])
for i in range(label_enc.shape[0]):
target[i][label_enc[i]] = 1
target.view(type=np.matrix)
print("target",target)
# Create instance of ELM object with 10 hidden neuron
maxx=0
for u in range(10):
elm = ELM(45, 10, 10)
# Train test split 80:20
X_train, X_test, y_train, y_test = train_test_split(n_data, target, test_size=0.34, random_state=1)
elm.train(X_train,y_train)
y_pred = elm.predict(X_test)
# Train data
correct = 0
total = y_pred.shape[0]
for i in range(total):
predicted = np.argmax(y_pred[i])
test = np.argmax(y_test[i])
correct = correct + (1 if predicted == test else 0)
print('Accuracy: {:f}'.format(correct/total))
if(correct/total>maxx):
maxx=correct/total
print(maxx)
###confusion matrix
import seaborn as sns
y_pred=np.argmax(y_pred, axis=1)
y_true=(np.argmax(y_test, axis=1))
target_names=["G1","G2","G3","G4","G5","G6","G7","G8","G9","G10"]
cm=confusion_matrix(y_true, y_pred)
#cmn = cm.astype('float')/cm.sum(axis=1)[:, np.newaxis]*100
fig, ax = plt.subplots(figsize=(15,8))
sns.heatmap(cm/np.sum(cm), annot=True, fmt='.2f',xticklabels=target_names, yticklabels=target_names, cmap='Blues')
#sns.heatmap(cmn, annot=True, fmt='.2%', xticklabels=target_names, yticklabels=target_names)
plt.ylabel('Actual')
plt.xlabel('Predicted')
plt.ylim(-0.5, len(target_names) + 0.5)
plt.show(block=False)
def perf_measure(y_actual, y_pred):
TP = 0
FP = 0
TN = 0
FN = 0
for i in range(len(y_pred)):
if y_actual[i]==y_pred[i]==1:
TP += 1
if y_pred[i]==1 and y_actual[i]!=y_pred[i]:
FP += 1
if y_actual[i]==y_pred[i]==0:
TN += 1
if y_pred[i]==0 and y_actual[i]!=y_pred[i]:
FN += 1
return(TP, FP, TN, FN)
TP, FP, TN, FN=perf_measure(y_true, y_pred)
print("precision",TP/(TP+FP))
print("sensivity",TP/(TP+FN))
print("specifity",TN/(TN+FP))
print("accuracy",(TP+TN)/(TN+FP+FN+TP))

To your question about whether you can implement an iterative training loop for an ELM:
No, you can not. An ELM consists of one random layer followed by an output layer. Because the first layer is fixed, this is essentially a linear model and we can find the optimal output weights by using the pseudo-inverse, as you pointed out.
However, since you already find the perfect solution for this model in one step, there is no direct way to iteratively improve this result.
I would, however, not advise using an extreme learning machine.
Besides the controversy about their origin, they are very limited in the functions they can learn.
There are other well-established approaches for gesture classification that are likely more useful to pursue.

Related

RuntimeError: shape '[265, 265]' is invalid for input of size 265

I am trying to train a model using Gaussian Process regression in Python, and I have some problems in predicting. The training set is of size 265, and after having trained the model, I am trying to predict on the test set.
I get the error "RuntimeError: shape '[265, 265]' is invalid for input of size 265" and I don't really understand why.
I also get a vector as loss while training (instead than a scalar). I solved the loss problem by doing loss.mean() (even if I still don't understand why the loss is returned as a vector), but I have not managed to solve the other problem.
265 is the size of the training set (and not of the test set), so I imagine the error is related to that and not to the test set that I give in input.
Thank you really much for your help! I send all my code, the problem is in the function predict. PS: I am sorry but I don't know how to upload the training and test data here.
'''
import os
import typing
#from sklearn.gaussian_process.kernels import *
import numpy as np
#from sklearn.gaussian_process import GaussianProcessRegressor
#from sklearn.model_selection import cross_val_score
#from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
from matplotlib import cm
import gpytorch
import matplotlib.pyplot as plt
from matplotlib import rcParams
import numpy as np
import torch
import pandas as pd
class Model(object): # were we allowed to change here?
"""
Model for this task.
You need to implement the fit_model and predict methods
without changing their signatures, but are allowed to create additional methods.
"""
def __init__(self): # are we allowed to change here?
"""
Initialize your model here.
We already provide a random number generator for reproducibility.
"""
self.likelihood = gpytorch.likelihoods.GaussianLikelihood()
# self.mean_module = gpytorch.means.ZeroMean() #prior mean
# self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.RBFKernel()) #covariance (which kernel to use?)
self.model = None
self.rng = np.random.default_rng(seed=0) # it was already present
# TODO: Add custom initialization for your model here if necessary
def predict(self, x: np.ndarray) -> typing.Tuple[np.ndarray, np.ndarray, np.ndarray]:
"""
Predict the pollution concentration for a given set of locations.
:param x: Locations as a 2d NumPy float array of shape (NUM_SAMPLES, 2)
#TODO: for us it is 1-d!!!
:return:
Tuple of three 1d NumPy float arrays, each of shape (NUM_SAMPLES,),
containing your predictions, the GP posterior mean, and the GP posterior stddev (in that order)
"""
# TODO: Use your GP to estimate the posterior mean and stddev for each location here
gp_mean = np.zeros(x.shape[0], dtype=float)
gp_std = np.zeros(x.shape[0], dtype=float)
x = torch.Tensor(x)
# Turn the model into evaluation mode
self.model.eval()
self.likelihood.eval()
# The gpytorch.settings.fast_pred_var flag activates LOVE (for fast variances)
# See https://arxiv.org/abs/1803.06058
with torch.no_grad(), gpytorch.settings.fast_pred_var():
# Obtain the predictive mean and covariance matrix
#print(x.size())
f_preds = self.model(x) #TODO: check the input!!!
print(type(f_preds))
gp_mean = np.array(f_preds.mean)
gp_std = torch.sqrt(f_preds.variance) # check if it is right
gp_std = np.array(gp_std)
# Make predictions by feeding model through likelihood
# observed_pred = self.likelihood(f_preds) #I think it is useless
# TODO: Use the GP posterior to form your predictions here
predictions = gp_mean
# check if we want to change the predictions for the asymmetric cost
'''for i in range(0, len(predictions)):
if (predictions[i] + 3 * gp_std[i] > 35.5) and (predictions[i] < 35.5):
predictions[i] = 35.51
elif ((predictions[i] - gp_std[i]) > 35.5):
predictions[i] = predictions[i] - gp_std[i]'''
return predictions, gp_mean, gp_std
def fit_model(self, train_x: np.ndarray, train_y: np.ndarray):
"""
Fit your model on the given training data.
:param train_x: Training features as a 2d NumPy float array of shape (NUM_SAMPLES, 2) #TODO: 1-d for us!!!
:param train_y: Training pollution concentrations as a 1d NumPy float array of shape (NUM_SAMPLES,)
"""
# TODO: Fit your model here
# Put the model into training mode
# initialize likelihood and model (necessary?)
train_x = torch.Tensor(train_x)
train_y = torch.Tensor(train_y)
# torch.Tensor prende in input matrici di numpy e le trasforma in elementi di torch,
# dentro pytorch hai di default un algoritmo di propagation
# clustering
# samplesize=5000
# train_x = train_x[0:samplesize, :]
# train_y = train_y[0:samplesize]
#
self.model = ExactGPModel(train_x, train_y, self.likelihood)
self.model.train()
self.likelihood.train()
# understand how to manage these last 4 rows
# Use the SGD optimizer
optimizer = torch.optim.Adam([
{'params': self.model.parameters()}, # Includes GaussianLikelihood parameters
],
lr=0.1) # if instead of "Adam" we put "SGD" it uses stochastic gradient descent! Oppure minibatch gradient descent
#TODO: choose which optimizer to use
# we can always use stochastic, minibatch or standard(batch)
# we can use gridsearch, change parameters like lr,beta1,beta2,... in order to have a better approximation
# "Loss" for GPs - the marginal log likelihood
mll = gpytorch.mlls.ExactMarginalLogLikelihood(self.likelihood, self.model)
training_iter = 1200 # numero di epochs
for i in range(training_iter):
# Zero gradients from previous iteration
optimizer.zero_grad()
# Output from model
output = self.model(train_x)
# Calc loss and backprop gradients
loss = -mll(output, train_y)
loss = torch.mean(loss) #TODO: understand how to deal with this loss, we shouldn't do the mean!!!
#print(loss)
#print(type(loss))
loss.backward()
print('Iter %d/%d - Loss: %.3f lengthscale: %.3f noise: %.3f' % (
i + 1, training_iter, loss.item(),
self.model.covar_module.base_kernel.lengthscale.item(),
self.model.likelihood.noise.item()))
optimizer.step()
# Auxiliary class
class ExactGPModel(gpytorch.models.ExactGP):
# every object of ExactGPmodel will have every module of models.ExactGP
def __init__(self, train_x, train_y, likelihood):
super(ExactGPModel, self).__init__(train_x, train_y, likelihood)
self.mean_module = gpytorch.means.ZeroMean()
self.covar_module = gpytorch.kernels.ScaleKernel(gpytorch.kernels.MaternKernel(nu=1 / 2))
def forward(self, x):
mean_x = self.mean_module(x)
covar_x = self.covar_module(x)
return gpytorch.distributions.MultivariateNormal(mean_x, covar_x)
# Load the training dateset and test features
trainfilename="TrainingData.txt"
testfilename="TestingData.txt"
'''train_set = pd.read_csv(trainfilename)
test_set = pd.read_csv(testfilename)
# Convert Pandas DataFrame to NumpyArray
train_set = train_set.to_numpy()
test_set = test_set.to_numpy()
meant = np.mean(train_set[:,0:1])
sdt = np.std(train_set[:,0:1])
train_set[:,0:1] = (train_set[:,0:1]-meant)/sdt
train_set = torch.from_numpy(train_set)
meant = np.mean(test_set[:,0:1])
sdt = np.std(test_set[:,0:1])
test_set[:,0:1] = (test_set[:,0:1]-meant)/sdt
test_set = torch.from_numpy(test_set)
train_x = train_set[:,0:1]
train_y = train_set[:,1:2]
test_x = test_set[:,0:1]'''
train_set = np.loadtxt(trainfilename, delimiter=',', skiprows=1)
test_x = np.loadtxt(testfilename, delimiter=',', skiprows=1)
train_x = train_set[:,0:1]
train_y = train_set[:,1:2]
#print(type(train_x)) #the type looks fine
# Fit the model
print('Fitting model')
model = Model()
model.fit_model(train_x, train_y) #TODO: how to give the input here???
# Predict on the test features
print('Predicting on test features')
#print(test_x.size())
print(test_x.shape)
predicted_y = model.predict(test_x) #TODO: how to give the input here???
print(predicted_y)
#TODO: RuntimeError: shape '[265, 265]' is invalid for input of size 265
#TODO: I think it does not get the covariance matrix
'''

Negative loss when trying to implement aleatoric uncertainty estimation according to Kendall et al

I'm trying to implement a neural network with aleatoric uncertainty estimation for regression with pytorch according to
Kendall et al.: "What Uncertainties Do We Need in Bayesian Deep
Learning for Computer Vision?" (Link).
However, while the predicted regression values fit the desired ground truth values quite well, the predicted variance looks weird and the loss gets negative during training.
The paper suggests to have two outputs mean and variance instead of only predicting the regression value. To be more precise, it is suggested to predict mean and log(variance) due to stability reasons. Therefore, my network looks as follows:
class ReferenceResNet(nn.Module):
def __init__(self):
super().__init__()
self.fcl1 = nn.Linear(1, 32)
self.fcl2 = nn.Linear(32, 64)
self.fcl3 = nn.Linear(64, 128)
self.fcl_mean = nn.Linear(128,1)
self.fcl_var = nn.Linear(128,1)
def forward(self, x):
x = torch.tanh(self.fcl1(x))
x = torch.tanh(self.fcl2(x))
x = torch.tanh(self.fcl3(x))
mean = self.fcl_mean(x)
log_var = self.fcl_var(x)
return mean, log_var
According to the paper, given these outputs, the corresponding loss function consists of a residual regression-part and a regularization term:
where si is the log(variance) predicted by the network.
I implemented this loss-function accordingly:
def loss_function(pred_mean, pred_log_var, y):
return 1/len(pred_mean)*(0.5 * torch.exp(-pred_log_var)*torch.sqrt(torch.pow(y-pred_mean, 2))+0.5*pred_log_var).sum()
I tried this code on a self-generated toy dataset (see image with results), however, the loss gets negative during training and when I plot the variance over the dataset after training, for me it does not really make sense while the corresponding mean values fit the ground truth quite well:
I already figured out that the negative loss comes from the regularization term as logarithms are negative for values between 0 and 1, however, I don't believe that the absolute value of the regularization term is supposed to grow bigger than the regression part. Does anyone know what is the reason for this and how I can prevent this from happening? And why does my variance look so weird?
For reproduction, my full code looks as follows:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import TensorDataset
from torchvision import datasets, transforms
import math
import numpy as np
import torch.nn.functional as F
import matplotlib.pyplot as plt
from tqdm import tqdm
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReferenceRegNet(nn.Module):
def __init__(self):
super().__init__()
self.fcl1 = nn.Linear(1, 32)
self.fcl2 = nn.Linear(32, 64)
self.fcl3 = nn.Linear(64, 128)
self.fcl_mean = nn.Linear(128,1)
self.fcl_var = nn.Linear(128,1)
def forward(self, x):
x = torch.tanh(self.fcl1(x))
x = torch.tanh(self.fcl2(x))
x = torch.tanh(self.fcl3(x))
mean = self.fcl_mean(x)
log_var = self.fcl_var(x)
return mean, log_var
def toy_function(x):
return math.sin(x/15-4)+2 + math.sin(x/10-5)
def loss_function(x_mean, x_log_var, y):
return 1/len(x_mean)*(0.5 * torch.exp(-x_log_var)*torch.sqrt(torch.pow(y-x_mean, 2))+0.5*x_log_var).sum()
BATCH_SIZE = 10
EVAL_BATCH_SIZE = 10
CLASSES = 1
TRAIN_EPOCHS = 50
# generate toy dataset: A train-set in form of a complex sin-curve
x_train_data = np.array([])
y_train_data = np.array([])
for repeat in range(2):
for i in range(50, 150):
for j in range(100):
sampled_x = i+np.random.randint(101)/100
sampled_y = toy_function(sampled_x)+np.random.normal(0,0.2)
x_train_data = np.append(x_train_data, sampled_x)
y_train_data = np.append(y_train_data, sampled_y)
x_eval_data = list(np.arange(50.0, 150.0, 0.1))
y_eval_data = [toy_function(x) for x in x_eval_data]
LOADER_KWARGS = {'num_workers': 0, 'pin_memory': False} if torch.cuda.is_available() else {}
train_set = TensorDataset(torch.Tensor(x_train_data),torch.Tensor(y_train_data))
eval_set = TensorDataset(torch.Tensor(x_eval_data), torch.Tensor(y_eval_data))
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, **LOADER_KWARGS)
eval_loader = torch.utils.data.DataLoader(eval_set, batch_size=EVAL_BATCH_SIZE, shuffle=False, **LOADER_KWARGS)
TRAIN_SIZE = len(train_loader.dataset)
EVAL_SIZE = len(eval_loader.dataset)
assert (TRAIN_SIZE % BATCH_SIZE) == 0
assert (EVAL_SIZE % EVAL_BATCH_SIZE) == 0
net = ReferenceRegNet().to(DEVICE)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
losses = {}
# train network
for epoch in range(1,TRAIN_EPOCHS+1):
net.train()
mean_epoch_loss = 0
mean_epoch_mse = 0
# train batches
for batch_idx, (data, target) in enumerate(tqdm(train_loader), start=1):
data, target = (data.to(DEVICE)).unsqueeze(dim=1), (target.to(DEVICE)).unsqueeze(dim=1)
optimizer.zero_grad()
output_means, output_log_var = net(data)
target_np = target.detach().cpu().numpy()
output_means_np = output_means.detach().cpu().numpy()
loss = loss_function(output_means, output_log_var, target)
loss_value = loss.item() # get raw float-value out of loss-tensor
mean_epoch_loss += loss_value
# optimize network
loss.backward()
optimizer.step()
mean_epoch_loss = mean_epoch_loss / len(train_loader)
losses.update({epoch:mean_epoch_loss})
print("Epoch " + str(epoch) + ": Train-Loss = " + str(mean_epoch_loss))
net.eval()
with torch.no_grad():
mean_loss = 0
mean_mse = 0
for data, target in eval_loader:
data, target = (data.to(DEVICE)).unsqueeze(dim=1), (target.to(DEVICE)).unsqueeze(dim=1)
output_means, output_log_var = net(data) # perform prediction
target_np = target.detach().cpu().numpy()
output_means_np = output_means.detach().cpu().numpy()
mean_loss += loss_function(output_means, output_log_var, target).item()
mean_loss = mean_loss/len(eval_loader)
#print("Epoch " + str(epoch) + ": Eval-loss = " + str(mean_loss))
fig = plt.figure(figsize=(40,12)) # create a 30x30 inch figure
ax = fig.add_subplot(1,3,1)
ax.set_title("regression value")
ax.set_xlabel("x")
ax.set_ylabel("regression mean")
ax.plot(x_train_data, y_train_data, 'x', color='black')
ax.plot(x_eval_data, y_eval_data, color='red')
pred_means_list = []
output_vars_list_train = []
output_vars_list_test = []
for x_test in sorted(x_train_data):
x_test = (torch.Tensor([x_test]).to(DEVICE))
pred_means, output_log_vars = net.forward(x_test)
pred_means_list.append(pred_means.detach().cpu())
output_vars_list_train.append(torch.exp(output_log_vars).detach().cpu())
ax.plot(sorted(x_train_data), pred_means_list, color='blue', label = 'training_perform')
pred_means_list = []
for x_test in x_eval_data:
x_test = (torch.Tensor([x_test]).to(DEVICE))
pred_means, output_log_vars = net.forward(x_test)
pred_means_list.append(pred_means.detach().cpu())
output_vars_list_test.append(torch.exp(output_log_vars).detach().cpu())
ax.plot(sorted(x_eval_data), pred_means_list, color='green', label = 'eval_perform')
plt.tight_layout()
plt.legend()
ax = fig.add_subplot(1,3,2)
ax.set_title("variance")
ax.set_xlabel("x")
ax.set_ylabel("regression var")
ax.plot(sorted(x_train_data), output_vars_list_train, label = 'training data')
ax.plot(x_eval_data, output_vars_list_test, label = 'test data')
plt.tight_layout()
plt.legend()
ax = fig.add_subplot(1,3,3)
ax.set_title("training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
lists = sorted(losses.items())
epoch, loss = zip(*lists)
ax.plot(epoch, loss, label = 'loss')
plt.tight_layout()
plt.legend()
plt.savefig('ref_test.png')
TLDR: The optimization drives the loss to a minimum where the gradient
becomes zero, regardless of what the nominal loss value is.
A comprehensive explanation by K.Frank:
A smaller loss – algebraically less positive or algebraically more
negative – means (or should mean) better predictions. The
optimization step uses some version of gradient descent to make
your loss smaller. The overall level of the loss doesn’t matter as
far as the optimization goes. The gradient tells the optimizer how
to change the model parameters to reduce the loss, and it doesn’t
care about the overall level of the loss.
An example from the same source:
Consider, for example, optimizing with lossA = MSELoss. Now
imagine optimizing with lossB = lossA - 17.2. The 17.2 doesn’t
really change anything at all. It is true that “perfect” predictions
will yield lossB = -17.2 rather than zero. (lossA will, of course,
be zero for “perfect” predictions.) But who cares?
In your example: you are right, the negative loss value comes from the logarithmic term. This is completely OK and it means that your training is dominated by contributions of high-confidence loss terms. Regarding the high values of variance - can't comment much on it but it should be fine since the loss curve drops as expected.

Error when checking input: expected input_12 to have 4 dimensions, but got array with shape (None, None, None, None, None)

I am trying to classify 24 RGB images belonging to 2 classes. Each image was originally of dimension 400 X 400, but has been resized to 32 X 32 in the code. Iam using the metric-learning for image similarity search algorithm. However, I obtain the error " Error when checking input.....", when I run the line history = model.fit(AnchorPositivePairs(num_batchs=2), epochs=20) at the end of the program. What could be causing this error?
Here is my code!
import random
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from collections import defaultdict
from PIL import Image
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow import keras
from tensorflow.keras import layers
from tensorflow.keras import utils
import glob
import os
import tqdm
IMG_DIR = "C:/Temp2/AGAIN_4/" # load the images from one directory
IM_WIDTH = 32
IM_HEIGHT = 32
#batch_size = 2
num_classes = 2
#epochs = 15
def read_images(directory, resize_to=(32, 32)): # extract image labels
"""
Reads images and labels from the given directory
:param directory directory from which to read the files
:param resize_to a tuple of width, height to resize the images
: returns a tuple of list of images and labels
"""
files = glob.glob(directory + "*.jpg")
images = []
labels = []
for f in tqdm.tqdm_notebook(files):
im = Image.open(f)
im = im.resize(resize_to)
im = np.array(im) / 255.0
im = im.astype("float32")
images.append(im)
label = 1 if 'microwave' in f.lower() else 0
labels.append(label)
return np.array(images), np.array(labels)
x, y = read_images(directory=IMG_DIR, resize_to=(IM_WIDTH, IM_HEIGHT))
# make sure we have 25000 images if we are reading the full data set.
# Change the number accordingly if you have created a subset
assert len(x) == len(y) == 24 #25000
from sklearn.model_selection import train_test_split # extract train and test data
x_train, x_test, y_train, y_test =train_test_split(x, y, test_size=0.25)
# remove X and y since we don't need them anymore
# otherwise it will just use the memory
del x
del y
x_train = x_train.astype('float32')
x_test = x_test.astype('float32')
# Display some of the images
height_width = 32
def show_collage(examples):
box_size = height_width + 2
num_rows, num_cols = examples.shape[:2]
collage = Image.new(
mode="RGB",
size=(num_cols * box_size, num_rows * box_size),
color=(250, 250, 250),
)
for row_idx in range(num_rows):
for col_idx in range(num_cols):
array = (np.array(examples[row_idx, col_idx]) * 255).astype(np.uint8)
collage.paste(
Image.fromarray(array), (col_idx * box_size, row_idx * box_size)
)
# Double size for visualisation.
collage = collage.resize((2 * num_cols * box_size, 2 * num_rows * box_size))
return collage
# Show a collage of 3x3 random images.
sample_idxs = np.random.randint(0, 15, size=(3, 3))
examples = x_train[sample_idxs]
show_collage(examples) # Displays 9 images
class_idx_to_train_idxs = defaultdict(list)
for y_train_idx, y in enumerate(y_train):
class_idx_to_train_idxs[y].append(y_train_idx)
class_idx_to_test_idxs = defaultdict(list)
for y_test_idx, y in enumerate(y_test):
class_idx_to_test_idxs[y].append(y_test_idx)
num_classes = 2
class AnchorPositivePairs(keras.utils.Sequence):
def __init__(self, num_batchs):
self.num_batchs = num_batchs
def __len__(self):
return self.num_batchs
def __getitem__(self, _idx):
x = np.empty((2, num_classes, height_width, height_width, 3), dtype=np.float32)
for class_idx in range(num_classes):
examples_for_class = class_idx_to_train_idxs[class_idx]
anchor_idx = random.choice(examples_for_class)
positive_idx = random.choice(examples_for_class)
while positive_idx == anchor_idx:
positive_idx = random.choice(examples_for_class)
x[0, class_idx] = x_train[anchor_idx]
x[1, class_idx] = x_train[positive_idx]
return x
examples = next(iter(AnchorPositivePairs(num_batchs=1)))
show_collage(examples)
class EmbeddingModel(keras.Model):
def train_step(self, data):
# Note: Workaround for open issue, to be removed.
if isinstance(data, tuple):
data = data[0]
anchors, positives = data[0], data[1]
with tf.GradientTape() as tape:
# Run both anchors and positives through model.
anchor_embeddings = self(anchors, training=True)
positive_embeddings = self(positives, training=True)
# Calculate cosine similarity between anchors and positives. As they have
# been normalised this is just the pair wise dot products.
similarities = tf.einsum(
"ae,pe->ap", anchor_embeddings, positive_embeddings
)
# Since we intend to use these as logits we scale them by a temperature.
# This value would normally be chosen as a hyper parameter.
temperature = 0.2
similarities /= temperature
# We use these similarities as logits for a softmax. The labels for
# this call are just the sequence [0, 1, 2, ..., num_classes] since we
# want the main diagonal values, which correspond to the anchor/positive
# pairs, to be high. This loss will move embeddings for the
# anchor/positive pairs together and move all other pairs apart.
sparse_labels = tf.range(num_classes)
loss = self.compiled_loss(sparse_labels, similarities)
# Calculate gradients and apply via optimizer.
gradients = tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# Update and return metrics (specifically the one for the loss value).
self.compiled_metrics.update_state(sparse_labels, similarities)
return {m.name: m.result() for m in self.metrics}
inputs = layers.Input(shape=(height_width, height_width, 3))
#inputs = layers.Input(shape=(32, 32, 3))
x = layers.Conv2D(filters=32, kernel_size=3, strides=2, activation="relu")(inputs)
x = layers.Conv2D(filters=64, kernel_size=3, strides=2, activation="relu")(x)
x = layers.Conv2D(filters=128, kernel_size=3, strides=2, activation="relu")(x)
x = layers.GlobalAveragePooling2D()(x)
embeddings = layers.Dense(units=8, activation=None)(x)
embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
model = EmbeddingModel(inputs, embeddings)
model.summary()
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-3),
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
history = model.fit(AnchorPositivePairs(num_batchs=2), epochs=20)
plt.plot(history.history["loss"])
plt.show()
I have used the cifar10 dataset as input instead of my local directory images as shown in the next code, but I still get the same error.
import random
import matplotlib.pyplot as plt
import numpy as np
import tensorflow as tf
from collections import defaultdict
from PIL import Image
from sklearn.metrics import ConfusionMatrixDisplay
from tensorflow import keras
from tensorflow.keras import layers
"""
## Dataset
For this example we will be using the
[CIFAR-10](https://www.cs.toronto.edu/~kriz/cifar.html) dataset.
"""
from tensorflow.keras.datasets import cifar10
(x_train, y_train), (x_test, y_test) = cifar10.load_data()
x_train = x_train.astype("float32") / 255.0
y_train = np.squeeze(y_train)
x_test = x_test.astype("float32") / 255.0
y_test = np.squeeze(y_test)
"""
To get a sense of the dataset we can visualise a grid of 25 random examples.
"""
height_width = 32
def show_collage(examples):
box_size = height_width + 2
num_rows, num_cols = examples.shape[:2]
collage = Image.new(
mode="RGB",
size=(num_cols * box_size, num_rows * box_size),
color=(250, 250, 250),
)
for row_idx in range(num_rows):
for col_idx in range(num_cols):
array = (np.array(examples[row_idx, col_idx]) * 255).astype(np.uint8)
collage.paste(
Image.fromarray(array), (col_idx * box_size, row_idx * box_size)
)
# Double size for visualisation.
collage = collage.resize((2 * num_cols * box_size, 2 * num_rows * box_size))
return collage
# Show a collage of 5x5 random images.
sample_idxs = np.random.randint(0, 50000, size=(5, 5))
examples = x_train[sample_idxs]
show_collage(examples)
"""
Metric learning provides training data not as explicit `(X, y)` pairs but instead uses
multiple instances that are related in the way we want to express similarity. In our
example we will use instances of the same class to represent similarity; a single
training instance will not be one image, but a pair of images of the same class. When
referring to the images in this pair we'll use the common metric learning names of the
`anchor` (a randomly chosen image) and the `positive` (another randomly chosen image of
the same class).
To facilitate this we need to build a form of lookup that maps from classes to the
instances of that class. When generating data for training we will sample from this
lookup.
"""
class_idx_to_train_idxs = defaultdict(list)
for y_train_idx, y in enumerate(y_train):
class_idx_to_train_idxs[y].append(y_train_idx)
class_idx_to_test_idxs = defaultdict(list)
for y_test_idx, y in enumerate(y_test):
class_idx_to_test_idxs[y].append(y_test_idx)
"""
For this example we are using the simplest approach to training; a batch will consist of
`(anchor, positive)` pairs spread across the classes. The goal of learning will be to
move the anchor and positive pairs closer together and further away from other instances
in the batch. In this case the batch size will be dictated by the number of classes; for
CIFAR-10 this is 10.
"""
num_classes = 10
class AnchorPositivePairs(keras.utils.Sequence):
def __init__(self, num_batchs):
self.num_batchs = num_batchs
def __len__(self):
return self.num_batchs
def __getitem__(self, _idx):
x = np.empty((2, num_classes, height_width, height_width, 3), dtype=np.float32)
for class_idx in range(num_classes):
examples_for_class = class_idx_to_train_idxs[class_idx]
anchor_idx = random.choice(examples_for_class)
positive_idx = random.choice(examples_for_class)
while positive_idx == anchor_idx:
positive_idx = random.choice(examples_for_class)
x[0, class_idx] = x_train[anchor_idx]
x[1, class_idx] = x_train[positive_idx]
return x
"""
We can visualise a batch in another collage. The top row shows randomly chosen anchors
from the 10 classes, the bottom row shows the corresponding 10 positives.
"""
examples = next(iter(AnchorPositivePairs(num_batchs=1)))
show_collage(examples)
"""
## Embedding model
We define a custom model with a `train_step` that first embeds both anchors and positives
and then uses their pairwise dot products as logits for a softmax.
"""
class EmbeddingModel(keras.Model):
def train_step(self, data):
# Note: Workaround for open issue, to be removed.
if isinstance(data, tuple):
data = data[0]
anchors, positives = data[0], data[1]
with tf.GradientTape() as tape:
# Run both anchors and positives through model.
anchor_embeddings = self(anchors, training=True)
positive_embeddings = self(positives, training=True)
# Calculate cosine similarity between anchors and positives. As they have
# been normalised this is just the pair wise dot products.
similarities = tf.einsum(
"ae,pe->ap", anchor_embeddings, positive_embeddings
)
# Since we intend to use these as logits we scale them by a temperature.
# This value would normally be chosen as a hyper parameter.
temperature = 0.2
similarities /= temperature
# We use these similarities as logits for a softmax. The labels for
# this call are just the sequence [0, 1, 2, ..., num_classes] since we
# want the main diagonal values, which correspond to the anchor/positive
# pairs, to be high. This loss will move embeddings for the
# anchor/positive pairs together and move all other pairs apart.
sparse_labels = tf.range(num_classes)
loss = self.compiled_loss(sparse_labels, similarities)
# Calculate gradients and apply via optimizer.
gradients = tape.gradient(loss, self.trainable_variables)
self.optimizer.apply_gradients(zip(gradients, self.trainable_variables))
# Update and return metrics (specifically the one for the loss value).
self.compiled_metrics.update_state(sparse_labels, similarities)
return {m.name: m.result() for m in self.metrics}
"""
Next we describe the architecture that maps from an image to an embedding. This model
simply consists of a sequence of 2d convolutions followed by global pooling with a final
linear projection to an embedding space. As is common in metric learning we normalise the
embeddings so that we can use simple dot products to measure similarity. For simplicity
this model is intentionally small.
"""
inputs = layers.Input(shape=(height_width, height_width, 3))
x = layers.Conv2D(filters=32, kernel_size=3, strides=2, activation="relu")(inputs)
x = layers.Conv2D(filters=64, kernel_size=3, strides=2, activation="relu")(x)
x = layers.Conv2D(filters=128, kernel_size=3, strides=2, activation="relu")(x)
x = layers.GlobalAveragePooling2D()(x)
embeddings = layers.Dense(units=8, activation=None)(x)
embeddings = tf.nn.l2_normalize(embeddings, axis=-1)
model = EmbeddingModel(inputs, embeddings)
"""
Finally we run the training. On a Google Colab GPU instance this takes about a minute.
"""
model.compile(
optimizer=keras.optimizers.Adam(learning_rate=1e-3),
loss=keras.losses.SparseCategoricalCrossentropy(from_logits=True),
)
history = model.fit(AnchorPositivePairs(num_batchs=1000), epochs=20)
plt.plot(history.history["loss"])
plt.show()
"""
## Testing
We can review the quality of this model by applying it to the test set and considering
near neighbours in the embedding space.
First we embed the test set and calculate all near neighbours. Recall that since the
embeddings are unit length we can calculate cosine similarity via dot products.
"""
near_neighbours_per_example = 10
embeddings = model.predict(x_test)
gram_matrix = np.einsum("ae,be->ab", embeddings, embeddings)
near_neighbours = np.argsort(gram_matrix.T)[:, -(near_neighbours_per_example + 1) :]
"""
As a visual check of these embeddings we can build a collage of the near neighbours for 5
random examples. The first column of the image below is a randomly selected image, the
following 10 columns show the nearest neighbours in order of similarity.
"""
num_collage_examples = 5
examples = np.empty(
(
num_collage_examples,
near_neighbours_per_example + 1,
height_width,
height_width,
3,
),
dtype=np.float32,
)
for row_idx in range(num_collage_examples):
examples[row_idx, 0] = x_test[row_idx]
anchor_near_neighbours = reversed(near_neighbours[row_idx][:-1])
for col_idx, nn_idx in enumerate(anchor_near_neighbours):
examples[row_idx, col_idx + 1] = x_test[nn_idx]
show_collage(examples)
"""
We can also get a quantified view of the performance by considering the correctness of
near neighbours in terms of a confusion matrix.
Let us sample 10 examples from each of the 10 classes and consider their near neighbours
as a form of prediction; that is, does the example and its near neighbours share the same
class?
We observe that each animal class does generally well, and is confused the most with the
other animal classes. The vehicle classes follow the same pattern.
"""
confusion_matrix = np.zeros((num_classes, num_classes))
# For each class.
for class_idx in range(num_classes):
# Consider 10 examples.
example_idxs = class_idx_to_test_idxs[class_idx][:10]
for y_test_idx in example_idxs:
# And count the classes of its near neighbours.
for nn_idx in near_neighbours[y_test_idx][:-1]:
nn_class_idx = y_test[nn_idx]
confusion_matrix[class_idx, nn_class_idx] += 1
# Display a confusion matrix.
labels = [
"Airplane",
"Automobile",
"Bird",
"Cat",
"Deer",
"Dog",
"Frog",
"Horse",
"Ship",
"Truck",
]
disp = ConfusionMatrixDisplay(confusion_matrix=confusion_matrix, display_labels=labels)
disp.plot(include_values=True, cmap="viridis", ax=None, xticks_rotation="vertical")
plt.show()
ValueError: Error when checking input: expected input_16 to have 4 dimensions, but got array with shape (None, None, None, None, None)
I m not sure but as i understand it your own data generator cause this error. You should also pass to try your data size in generator, try this:
def __len__(self):
if self.batch_size > self.X.shape[0]:
print("Batch size is greater than data size!!")
return -1
return int(np.floor(self.X.shape[0] / self.batch_size))

Gradient descent using many polynomials is not converging

context: I am trying to create a generic function to optimize the cost of any regression problem using polynomial regression (of any specified degree).
I am trying to fit my model to the load_boston dataset (with the house price as the label and 13 features).
I used multiple degrees of polynomials, and multiple learning rates and epochs (with gradient descent) and the MSE is coming out to be so high even on the training dataset (I am using 100% of the data to train the model, and I am checking the cost on the same data, but the MSE cost is still very high).
import tensorflow as tf
from sklearn.datasets import load_boston
def polynomial(x, coeffs):
y = 0
for i in range(len(coeffs)):
y += coeffs[i]*x**i
return y
def initial_parameters(dimensions, data_type, degree): # list number of dims/features and degree
thetas = [tf.Variable(0, dtype=data_type)] # the constant theta/bias
for i in range(degree):
thetas.append(tf.Variable( tf.zeros([dimensions, 1], dtype=data_type)))
return thetas
def regression_error(x, y, thetas):
hx = thetas[0] # constant thetas - no need to have 1 for each variable (e.g x^0*th + y^0*th...)
for i in range(1, len(thetas)):
hx = tf.add(hx, tf.matmul( tf.pow(x, i), thetas[i]))
return tf.reduce_mean(tf.squared_difference(hx, y))
def polynomial_regression(x, y, data_type, degree, learning_rate, epoch): #features=dimensions=variables
thetas = initial_parameters(x.shape[1], data_type, degree)
cost = regression_error(x, y, thetas)
init = tf.initialize_all_variables()
optimizer = tf.train.GradientDescentOptimizer(learning_rate).minimize(cost)
with tf.Session() as sess:
sess.run(init)
for epoch in range(epoch):
sess.run(optimizer)
return cost.eval()
x, y = load_boston(True) # yes just use the entire dataset
for deg in range(1, 2):
for lr in range(-8, -5):
error = polynomial_regression(x, y, tf.float64, deg, 10**lr, 100 )
print (deg, lr, error)
It outputs 97.3 even though most of the labels are around 30 (degree = 1, learning rate = 10^-6).
what is wrong with the code?
The problem is that the different features are on different orders of magnitude and hence are not compatible with the learning rate which is the same for all features. Even more, when using a non-zero variable initialization, one has to make sure that these initial values are as well compatible with the feature values.
In [1]: from sklearn.datasets import load_boston
In [2]: x, y = load_boston(True)
In [3]: x.std(axis=0)
Out[3]:
array([8.58828355e+00, 2.32993957e+01, 6.85357058e+00, 2.53742935e-01,
1.15763115e-01, 7.01922514e-01, 2.81210326e+01, 2.10362836e+00,
8.69865112e+00, 1.68370495e+02, 2.16280519e+00, 9.12046075e+01,
7.13400164e+00])
In [4]: x.mean(axis=0)
Out[4]:
array([3.59376071e+00, 1.13636364e+01, 1.11367787e+01, 6.91699605e-02,
5.54695059e-01, 6.28463439e+00, 6.85749012e+01, 3.79504269e+00,
9.54940711e+00, 4.08237154e+02, 1.84555336e+01, 3.56674032e+02,
1.26530632e+01])
A common approach is to normalize the input data (e.g. to have zero mean and unit variance) and to choose the initial weights randomly (e.g. normal distribution, std.dev. = 1). sklearn.preprocessing offers various functionality for these cases.
PolynomialFeatures can be used to generate the polynomial features automatically.
StandardScaler scales the data to zero mean and unit variance.
pipeline.Pipeline can be used for convenience to combine these preprocessing steps.
The polynomial_regression function then reduces to:
pipeline = Pipeline([
('poly', PolynomialFeatures(degree)),
('scaler', StandardScaler())
])
x = pipeline.fit_transform(x)
thetas = tf.Variable(tf.random_normal([x.shape[1], 1], dtype=data_type))
cost = tf.reduce_mean(tf.squared_difference(tf.matmul(x, thetas), y))
# Perform variable initialization and optimizer instantiation here.
# Run optimization over epochs.

Why Skorch show NAN in the every epoch?

I want to create my own dataset class based on Dataset class of Skorch because I want to differentiate categorical columns and continuous columns. These categorical columns will be passed through the embedding layers in the model. The result is weird because it show NAN
like this:
epoch train_loss valid_loss dur
------- ------------ ------------ ------
1 nan nan 0.2187
2 nan nan 0.1719
3 nan nan 0.1719
4 nan nan 0.1562
5 nan nan 0.1406
Can you help me fix it ? I am using data from this kaggle:
Here
from skorch import NeuralNetRegressor
from skorch.dataset import Dataset
import torch
import torch.nn as nn
import torch.nn.functional as F
import pandas as pd
import numpy as np
from sklearn.preprocessing import LabelEncoder
class TabularDataset(Dataset):
def __init__(self, data, cat_cols=None, output_col=None):
self.n = data.shape[0]
if output_col:
self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
else:
self.y = np.zeros((self.n, 1))
self.cat_cols = cat_cols if cat_cols else []
self.cont_cols = [col for col in data.columns
if col not in self.cat_cols + [output_col]]
if self.cont_cols:
self.cont_X = data[self.cont_cols].astype(np.float32).values
else:
self.cont_X = np.zeros((self.n, 1))
if self.cat_cols:
self.cat_X = data[self.cat_cols].astype(np.int64).values
else:
self.cat_X = np.zeros((self.n, 1))
def __len__(self):
# Denotes the total number of sampoes
return self.n
def __getitem__(self, idx):
# generates one sample of data
return [self.cont_X[idx], self.cat_X[idx]], self.y[idx]
class FeedForwardNN(nn.Module):
def __init__(self, emb_dims, no_of_cont, lin_layer_sizes,
output_size, emb_dropout, lin_layer_dropouts):
"""
Parameters
----------
emb_dims: List of two element tuples
This list will contain a two element tuple for each
categorical feature. The first element of a tuple will
denote the number of unique values of the categorical
feature. The second element will denote the embedding
dimension to be used for that feature.
no_of_cont: Integer
The number of continuous features in the data.
lin_layer_sizes: List of integers.
The size of each linear layer. The length will be equal
to the total number
of linear layers in the network.
output_size: Integer
The size of the final output.
emb_dropout: Float
The dropout to be used after the embedding layers.
lin_layer_dropouts: List of floats
The dropouts to be used after each linear layer.
"""
super().__init__()
# Embedding layers
self.emb_layers = nn.ModuleList([nn.Embedding(x, y)
for x, y in emb_dims])
no_of_embs = sum([y for x, y in emb_dims])
self.no_of_embs = no_of_embs
self.no_of_cont = no_of_cont
# Linear Layers
first_lin_layer = nn.Linear(self.no_of_embs + self.no_of_cont,
lin_layer_sizes[0])
self.lin_layers = \
nn.ModuleList([first_lin_layer] + \
[nn.Linear(lin_layer_sizes[i], lin_layer_sizes[i + 1])
for i in range(len(lin_layer_sizes) - 1)])
for lin_layer in self.lin_layers:
nn.init.kaiming_normal_(lin_layer.weight.data)
# Output Layer
self.output_layer = nn.Linear(lin_layer_sizes[-1],
output_size)
nn.init.kaiming_normal_(self.output_layer.weight.data)
# Batch Norm Layers
self.first_bn_layer = nn.BatchNorm1d(self.no_of_cont)
self.bn_layers = nn.ModuleList([nn.BatchNorm1d(size)
for size in lin_layer_sizes])
# Dropout Layers
self.emb_dropout_layer = nn.Dropout(emb_dropout)
self.droput_layers = nn.ModuleList([nn.Dropout(size)
for size in lin_layer_dropouts])
def forward(self, X):
cont_data = X[0]
cat_data = X[1]
if self.no_of_embs != 0:
x = [emb_layer(cat_data[:, i])
for i, emb_layer in enumerate(self.emb_layers)]
x = torch.cat(x, 1)
x = self.emb_dropout_layer(x)
if self.no_of_cont != 0:
normalized_cont_data = self.first_bn_layer(cont_data)
if self.no_of_embs != 0:
x = torch.cat([x, normalized_cont_data], 1)
else:
x = normalized_cont_data
for lin_layer, dropout_layer, bn_layer in \
zip(self.lin_layers, self.droput_layers, self.bn_layers):
x = F.relu(lin_layer(x))
x = bn_layer(x)
x = dropout_layer(x)
x = self.output_layer(x)
return x
# Read data
data = pd.read_csv("data/train.csv", usecols=["SalePrice", "MSSubClass", "MSZoning", "LotFrontage", "LotArea",
"Street", "YearBuilt", "LotShape", "1stFlrSF", "2ndFlrSF"]).dropna()
categorical_features = ["MSSubClass", "MSZoning", "Street", "LotShape", "YearBuilt"]
output_feature = "SalePrice"
# Label Encode Categorial Features
label_encoders = {}
for cat_col in categorical_features:
label_encoders[cat_col] = LabelEncoder()
data[cat_col] = label_encoders[cat_col].fit_transform(data[cat_col])
# feed Forward NN
cat_dims = [int(data[col].nunique()) for col in categorical_features]
emb_dims = [(x, min(50, (x + 1) // 2)) for x in cat_dims]
net = FeedForwardNN(emb_dims, no_of_cont=4, lin_layer_sizes=[50, 100],
output_size=1, emb_dropout=0.04,
lin_layer_dropouts=[0.001, 0.01])
# Fit
ds = TabularDataset(data=data, cat_cols=categorical_features,
output_col=output_feature)
X = data.drop(['SalePrice'], axis=1)
y = data['SalePrice'].values.reshape(-1, 1)
net = NeuralNetRegressor(
net,
max_epochs=5,
lr=0.1,
dataset=ds
)
net.fit(X, y)
For anyone who's experiencing a similar issue in case of classification problems, the loss function (criterion) used by Skorch by default is NLLLoss, which computes the log for you (doc and related issue). Therefore it is expected that a Softmax (in case of multiple classes) layer is present as last step in your architecture to be able to produce probabilities.
You can either:
Add a Softmax layer to produce probabilities and leave the default NLLLoss;
Change the default loss to CrossEntropyLoss:
net = NeuralNetClassifier(
...
criterion=torch.nn.CrossEntropyLoss
)
The problem is not with skorch but your data. You have to scale your inputs and, in this case, especially the targets to avoid huge losses and exploding gradients. As a start I suggest using, for example, sklearn.preprocessing.StandardScaler:
from sklearn.preprocessing import StandardScaler
class TabularDataset(Dataset):
def __init__(self, data, cat_cols=None, output_col=None):
self.n = data.shape[0]
# [...]
if output_col:
scaler_y = StandardScaler()
self.y = data[output_col].astype(np.float32).values.reshape(-1, 1)
scaler_y.fit(self.y)
self.y = scaler_y.transform(self.y)
# [...]
if self.cont_cols:
scaler_X_cont = StandardScaler()
self.cont_X = data[self.cont_cols].astype(np.float32).values
scaler_X_cont.fit(self.cont_X)
self.cont_X = scaler_X_cont.transform(self.cont_X)
# [...]
As a side note, you don't need X and y when you have a dataset that provides the actual data, you can simply pass it to net.fit (with the exception of using a stratified CV split):
net = NeuralNetRegressor(
net,
max_epochs=5,
lr=0.00001,
)
net.fit(ds, y=None)

Categories

Resources