I am creating a GRU to do some classification for a project, and I'm relatively new to PyTorch and implementing GRUs. I know similar questions like this one have been answered already, but I can't seem to carry the same solution over to my own problem. I understand that there is an issue with the shape/order of the tensor going into my fc layer, but after trying to change things I can no longer see the wood for the trees. I would appreciate it if someone could point me in the right direction.
Below I have attached my code and the error. The datasets I'm using contain 24 features, with the label in the 25th column.
# Imports
import pandas as pd
import numpy as np
import torch
import torchvision # torch package for vision related things
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # All neural network modules
from torch.utils.data import Dataset, DataLoader # Gives easier dataset management by creating mini batches etc.
from tqdm import tqdm # For a nice progress bar
from sklearn.preprocessing import StandardScaler
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 24
hidden_size = 128
num_layers = 1
num_classes = 2
sequence_length = 1
learning_rate = 0.005
batch_size = 8
num_epochs = 3
# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_GRU, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
x = x.unsqueeze(0)
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate GRU
out, _ = self.gru(x, h0)
out = out.reshape(out.shape[0], -1)
# Decode the hidden state of the last time step
out = self.fc(out)
return out
class MyDataset(Dataset):
def __init__(self,file_name):
stats_df=pd.read_csv(file_name)
x=stats_df.iloc[:,0:24].values
y=stats_df.iloc[:,24].values
self.x_train=torch.tensor(x,dtype=torch.float32)
self.y_train=torch.tensor(y,dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self,idx):
return self.x_train[idx],self.y_train[idx]
nomDs=MyDataset("nomStats.csv")
atkDs=MyDataset("atkStats.csv")
train_loader=DataLoader(dataset=nomDs,batch_size=batch_size)
test_loader=DataLoader(dataset=atkDs,batch_size=batch_size)
# Initialize network (try out just using simple RNN, or GRU, and then compare with LSTM)
model = RNN_GRU(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent update step/adam step
optimizer.step()
# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
# Set model to eval
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
# Toggle model back to train
model.train()
return num_correct / num_samples
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:2f}")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}")
Traceback (most recent call last):
File "TESTGRU.py", line 87, in <module>
scores = model(data)
File "C:\Users\steph\anaconda3\envs\FYP\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "TESTGRU.py", line 47, in forward
out = self.fc(out)
File "C:\Users\steph\anaconda3\envs\FYP\lib\site-packages\torch\nn\modules\module.py", line 889, in _call_impl
result = self.forward(*input, **kwargs)
File "C:\Users\steph\anaconda3\envs\FYP\lib\site-packages\torch\nn\modules\linear.py", line 94, in forward
return F.linear(input, self.weight, self.bias)
File "C:\Users\steph\anaconda3\envs\FYP\lib\site-packages\torch\nn\functional.py", line 1753, in linear
return torch._C._nn.linear(input, weight, bias)
RuntimeError: mat1 and mat2 shapes cannot be multiplied (1x1024 and 128x2)
It seems like these lines
# Forward propagate GRU
out, _ = self.gru(x, h0)
out = out.reshape(out.shape[0], -1)
are the problem.
It appears that you only want to feed the hidden state of the last time step into your fc layer. That can be read off the GRU's output in two ways:
If you want the hidden states of all layers at the last time step, use the second return value of out, _ = self.gru(x, h0), not the first.
If you only want the last layer's output at the last time step (which seems to be the case here), use out[:, -1, :]. With this change, you no longer need the reshape operation.
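For instance, a minimal sketch of that second option (note the unsqueeze(1) rather than unsqueeze(0), so the batch dimension stays first and each 24-feature row becomes a sequence of length 1):
def forward(self, x):
    # (batch, input_size) -> (batch, seq_len=1, input_size) to match batch_first=True
    x = x.unsqueeze(1)
    h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
    out, _ = self.gru(x, h0)   # out: (batch, seq_len, hidden_size)
    out = out[:, -1, :]        # hidden state of the last time step, last layer
    return self.fc(out)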
I'm having a hard time setting up a neural network; most of the examples out there are for images. My problem has 3 inputs, each of size N x M, where N is the number of samples and M is the number of features. I have a separate CSV file with a 1 x N binary target (0, 1).
The network I'm trying to configure should have two hidden layers with 100 and 50 neurons, respectively, sigmoid activation functions, and cross-entropy to check performance. The result should just be a single probability output.
Please help?
EDIT:
import torch
import torch.nn as nn
import torch.optim as optim
import torch.autograd as autograd
import torch.nn.functional as F
#from torch.autograd import Variable
import pandas as pd
# Import Data
Input1 = pd.read_csv(r'...')
Input2 = pd.read_csv(r'...')
Input3 = pd.read_csv(r'...')
Target = pd.read_csv(r'...' )
# Convert to Tensor
Input1_tensor = torch.tensor(Input1.to_numpy()).float()
Input2_tensor = torch.tensor(Input2.to_numpy()).float()
Input3_tensor = torch.tensor(Input3.to_numpy()).float()
Target_tensor = torch.tensor(Target.to_numpy()).float()
# Transpose to have signal as columns instead of rows
input1 = Input1_tensor
input2 = Input2_tensor
input3 = Input3_tensor
y = Target_tensor
# Define the model
class Net(nn.Module):
def __init__(self, num_inputs, hidden1_size, hidden2_size, num_classes):
# Initialize super class
super(Net, self).__init__()
#self.criterion = nn.CrossEntropyLoss()
# Add hidden layer
self.layer1 = nn.Linear(num_inputs,hidden1_size)
# Activation
self.sigmoid = torch.nn.Sigmoid()
# Add output layer
self.layer2 = nn.Linear(hidden1_size,hidden2_size)
# Activation
self.sigmoid2 = torch.nn.Sigmoid()
self.layer3 = nn.Linear(hidden2_size, num_classes)
def forward(self, x1, x2, x3):
# implement the forward pass
in1 = self.layer1(x1)
in2 = self.layer1(x2)
in3 = self.layer1(x3)
xyz = torch.cat((in1,in2,in3),1)
return xyz
# Define loss function
loss_function = nn.CrossEntropyLoss()
# Define optimizer
optimizer = optim.SGD(model.parameters(), lr=1e-4)
for t in range(num_epochs):
# Forward pass: Compute predicted y by passing x to the model
y_pred = model(input1, input2, input3)
# Compute and print loss
loss = loss_function(y_pred, y)
print(t, loss.item())
# Zero gradients, perform a backward pass, and update the weights.
optimizer.zero_grad()
# Calculate gradient using backward pass
loss.backward()
# Update model parameters (weights)
optimizer.step()
Here I am getting the error
RuntimeError: 0D or 1D target tensor expected, multi-target not supported
for the line loss = loss_function(y_pred, y),
where y_pred is [20000, 375] and y is [20000, 1].
You can use PyTorch, a Python library for deep learning and neural networks. The code below defines a network like the one you describe:
from torch import nn
import torch

class network(nn.Module):
    def __init__(self, M):
        # M is the dimension of the input feature
        super(network, self).__init__()
        self.layer1 = nn.Linear(M, 100)
        self.layer2 = nn.Linear(100, 50)
        self.out = nn.Linear(50, 1)

    def forward(self, x):
        # sigmoid activations on both hidden layers, single-probability output
        x = torch.sigmoid(self.layer1(x))
        x = torch.sigmoid(self.layer2(x))
        return torch.sigmoid(self.out(x))
----------
You can then refer to the PyTorch documentation and finish the rest of the training code.
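For instance, a minimal sketch of wiring it up for training (the optimizer, learning rate, BCELoss, and dummy data here are assumptions for illustration, not from the original post; BCELoss matches the single-probability sigmoid output):
import torch
from torch import nn

M = 125                                   # placeholder: use your actual feature dimension
model = network(M)                        # the class defined above
optimizer = torch.optim.SGD(model.parameters(), lr=1e-2)
criterion = nn.BCELoss()                  # binary target, single probability output

x = torch.randn(8, M)                     # dummy batch of 8 samples
y = torch.randint(0, 2, (8, 1)).float()   # dummy 0/1 targets

y_hat = model(x)
loss = criterion(y_hat, y)
optimizer.zero_grad()
loss.backward()
optimizer.step()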
Edit:
As for the RuntimeError, you can squeeze the target tensor with y.squeeze(). This removes the redundant dimension, e.g. [20000, 1] -> [20000].
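A minimal sketch of that fix at the call site (assuming y holds integer class labels, since CrossEntropyLoss also expects Long targets rather than floats):
loss = loss_function(y_pred, y.squeeze().long())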
I am trying to implement a Custom Loss function that uses multiple predictions/forward propagations of images for an image classification model.
The general concept of this loss function is to evaluate the model's consistency on non-augmented and augmented images. That is to say, the model is given 2 images: the original image and its augmented counterpart. Then both images are forward propagated through the model. The more different the two outputs are from each other, the higher the loss.
What this meant was a fairly low-level change, and the most apparent way of solving it, to me, was model subclassing. I created a subclass of the keras.Model class and changed the train_step() method to include a small algorithm for locating the respective augmented counterpart of each original image (not relevant to the issue at all), and, more significantly, a line that gave a prediction on the augmented counterpart:
with tf.GradientTape() as tape:
y_pred = self(x, training=True)
y_aug = self(self.augmented_data[aug_index:aug_index+self.batch_size], training=True)
loss = self.comparative_loss(y, y_pred, y_aug)
The whole self.augmented_data[aug_index:aug_index+self.batch_size] expression isn't relevant at all; it can be thought of simply as the augmented data input. The intent was for the comparative_loss method to take the two predictions and then perform the aforementioned loss calculation on them.
The issue came when I tried to compile the model: compile() requires a loss parameter, but it refused to accept my custom loss method because it takes 3 parameters. I couldn't go with the standard fix of wrapping the function in a closure like this:
def new_loss(extra_parameter):
def loss(y_true, y_pred):
return loss_value
return loss
since my "extra_parameter" was not just a standard output of the model; it was a completely separate forward propagation on it, that relied on my custom train_step() method.
TL;DR:
What I'm most confused about is: why does model.compile() even require a loss function if my train_step() method doesn't use it? The train_step() method in my custom subclass has the loss built in, so is there a way to override compile()'s loss parameter and have it work without my having to give it a method? If not, what other solutions are there?
The full code is below, though I sincerely apologize to anyone that reads it, as it's not quite finished:
# -*- coding: utf-8 -*-
"""
Created on Fri Feb 18 11:37:08 2022
Custom Loss Function
Description:
For each element of y_true, compare the y_predict of
the original image and the complemented one, then return
a loss accordingly using the Euclidian distance
between the predictions for the original images and the complements.
y_predict are labels for the images, these labels can
come in any form: CIFAR labels, species labels, or labels of which
individual a given image is.
y_predict will be in the shape (batch_size, number_of_classes), using the
#author: hudso
"""
import tensorflow as tf
import keras
from keras.layers import Conv2D, MaxPooling2D, Dense, Flatten, BatchNormalization
import ssl
import numpy as np
import cv2 as cv
class CustomModel(keras.Model):
def __init__(self, classes):
super().__init__() #call parent constructor
self.conv_1 = Conv2D(32,(3,3),activation='relu',padding='same')
self.batch_1 = BatchNormalization()
self.conv_2 = Conv2D(32,(3,3),activation='relu',padding='same')
self.batch_2 = BatchNormalization()
self.pool_1 = MaxPooling2D((2,2))
self.conv_3 = Conv2D(64,(3,3),activation='relu',padding='same')
self.batch_3 = BatchNormalization()
self.conv_4 = Conv2D(64,(3,3),activation='relu',padding='same')
self.batch_4 = BatchNormalization()
self.pool_2 = MaxPooling2D((2,2))
self.conv_5 = Conv2D(128,(3,3),activation='relu',padding='same')
self.batch_5 = BatchNormalization()
self.conv_6 = Conv2D(128,(3,3),activation='relu',padding='same')
self.batch_6 = BatchNormalization()
self.flatten = Flatten()
self.layer_1 = keras.layers.Dropout(0.2)
self.layer_2 = Dense(256,activation='relu')
self.dropout = keras.layers.Dropout(0.2)
self.outputs = Dense(classes, activation='softmax') #no. of classes
self.classes = classes #Initializes the number of classes variable
#essentially the Functional API forward-pass call-structure shenanigans
#called each forward propagation (calculating loss, training, etc.)
def call(self, inputs):
#print("INPUTS: " + str(inputs))
x = self.conv_1(inputs)
x = self.batch_1(x)
x = self.conv_2(x)
x = self.batch_2(x)
x = self.pool_1(x)
x = self.conv_3(x)
x = self.batch_3(x)
x = self.conv_4(x)
x = self.batch_4(x)
x = self.pool_2(x)
x = self.conv_5(x)
x = self.batch_5(x)
x = self.conv_6(x)
x = self.batch_6(x)
x = self.flatten(x)
x = self.layer_1(x)
x = self.layer_2(x)
x = self.dropout(x)
x = self.outputs(x)
return x #returns the constructed model
#Imports necessary data (It's hard to gain access of the values handed to .fit())
def data_import(self, augmented_data, x_all, batch_size):
self.augmented_data = augmented_data
self.x_all = np.asarray(x_all, dtype=np.float32)
self.batch_size = batch_size
#Very useful advice: https://stackoverflow.com/questions/65889381/going-from-a-tensorarray-to-a-tensor
def comparative_loss(self, y_true, y_pred, y_aug):
output_loss = tf.TensorArray(tf.float32, size=self.classes)
batch_loss = tf.TensorArray(tf.float32, size=self.batch_size)
for n in range(self.batch_size):
for i in range(self.classes):
output_loss = output_loss.write(i, tf.square(tf.abs(tf.subtract(y_pred[n][i], y_aug[n][i])))) #finds Euclidean Distance for each prediction, then averages the loss across all iterations in the batch
indexes = tf.keras.backend.arange(0, self.classes, step=1, dtype='int32')
output_loss_tensor = output_loss.gather(indexes)
batch_loss = batch_loss.write(n, tf.math.reduce_sum(output_loss_tensor))
indexes = tf.keras.backend.arange(0, self.batch_size, step=1, dtype='int32')
batch_loss_tensor = batch_loss.gather(indexes)
total_loss = tf.math.reduce_sum(batch_loss_tensor)
total_loss = tf.math.divide(total_loss, self.batch_size)
print("TOTAL LOSS: " + str(total_loss))
return total_loss
def train_step(self, data):
x, y = data #Current batch
#Finds the range of indexes for the complements of the current batch of images
#A lower level implementation could make this significantly more efficient by avoiding searching each time
aug_index = 0
x_arr = x.numpy() #Turns the input data iterable Tensor into a numpy array, Eager Execution must be enabled for this to work
for i in range(np.size(self.x_all, axis = 0)):
difference = cv.subtract(self.x_all[i], x_arr[0])
if np.count_nonzero(difference) == 0: #In the .fit() line for this CustomModel, shuffle = False for this to work
aug_index = i #Lower bound of the batch of images
found = True
if found == False:
print("Yikes mate the x_arr wasn't found in x_all... probably a rounding error")
print("\nCurrent Index: " + str(aug_index))
#Forward pass/predictions + loss calculation
with tf.GradientTape() as tape:
y_pred = self(x, training=True)
y_aug = self(self.augmented_data[aug_index:aug_index+self.batch_size], training=True)
loss = self.comparative_loss(y, y_pred, y_aug) #Computes the actual loss value
#I didn't touch any of this code
trainable_vars = self.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
self.compiled_metrics.update_state(y, y_pred)
return {m.name: m.result() for m in self.metrics}
#Essentially emulates the environment that the model would normally be running in
#E.g. Creates the dataset, does Image Augmentation, etc.
#In the actual implementation, only the "CustomModel" class will be used, this is purely for testing purposes
class shrek_is_love:
def __init__(self):
self.complements = []
self.create_dataset()
#automatically runs
def create_dataset(self):
ssl._create_default_https_context = ssl._create_unverified_context
(images, labels), (_, _) = keras.datasets.cifar10.load_data() #only uses the training sets and then splits it again later since that'll be what we'll be dealing with in the happywhale dataset anyways
self.labels = labels
self.images = images
self.data_aug()
#NOT MY CODE this is liam's image data generator (thx liam ur cool)
#automatically runs
def data_aug(self):
imageGen = keras.preprocessing.image.ImageDataGenerator(width_shift_range=.3, height_shift_range=.3, horizontal_flip=True, zoom_range=.3)
imagees = np.zeros(shape=(1, 32, 32, 3))
for l in range(np.size(self.images, 0)):
# adjust the tuple inside of cv.resize to adjust resolution
temp = cv.resize(self.images[l], (32, 32))
imagees[0] = (cv.cvtColor(temp, cv.COLOR_BGR2RGB))
it = imageGen.flow(imagees)
im = it.next()
im = im[0].astype('float32')
im = im / 255.0
self.complements.append(im)
self.complements = np.asarray(self.complements, dtype=np.float)
self.images = self.images.astype(np.float)
self.images = self.images / 255.0
self.preprocessor()
def preprocessor(self):
from sklearn.preprocessing import OneHotEncoder
onehot_encoder = OneHotEncoder(sparse=False)
self.labels = onehot_encoder.fit_transform(np.reshape(self.labels, (-1, 1)))
from sklearn.model_selection import train_test_split
shared_seed = 5 #the indexes of complements_train and image_train have to line up, so that labels_train can apply to both
self.complements_train, self.complements_test = train_test_split(self.complements, test_size=0.25, random_state=shared_seed)
self.images_train, self.images_test, self.labels_train, self.labels_test = train_test_split(self.images, self.labels, test_size=0.25, random_state=shared_seed)
#The following code will be all that is necessary to run the CustomModel class
batch_size = 32
shrek_is_life = shrek_is_love()
model = CustomModel(10) #10 classes
model.data_import(shrek_is_life.complements_train, shrek_is_life.images_train, batch_size) #the model will not be training on aug_data, essentially turning it into a secondary test set
model.compile(optimizer='adam', loss=None, metrics=['accuracy'], run_eagerly=True) #loss=None brings up an error, but I have no idea what else to put in there
model.fit(x = shrek_is_life.images_train, y = shrek_is_life.labels_train, shuffle = False, batch_size = batch_size, epochs = 1)
EDIT:
Running it without a .compile line yields this error:
Traceback (most recent call last):
File "D:\Downloads\untitled0.py", line 191, in <module>
model.fit(x = shrek_is_life.images_train, y = shrek_is_life.labels_train, shuffle = False, batch_size = batch_size, epochs = 1)
File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 1150, in fit
x, y, sample_weights = self._standardize_user_data(
File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 508, in _standardize_user_data
raise RuntimeError('You must compile a model before '
RuntimeError: You must compile a model before training/testing. Use `model.compile(optimizer, loss)`.
Running .compile without the loss argument or with loss=None yields:
File "C:\Users\hudso\anaconda3\envs\mlTens\lib\site-packages\keras\engine\training.py", line 706, in _prepare_total_loss
raise ValueError('The model cannot be compiled '
ValueError: The model cannot be compiled because it has no loss to optimize.
I am creating a GRU to predict whether data derived from traffic packets from a device is safe or anomalous. I plan to do this by training a model only on safe/normal operating data and then having it classify new, unseen traffic (testing). I wish to train on only the safe data (one class) because an attack could take many forms, and I don't want to train the model on labelled attack data and then have it miss an attack type that I didn't train it on (basically I want to overfit on the normal operating data). As such, I need it to be able to check whether the incoming unlabelled data matches the one class it has already trained on (i.e. whether the incoming data matches the normal operation of the device) or whether it is anomalous.
The issue I am having is that, as the model is being trained on only one class, it has trouble differentiating the anomalous unseen data from normal data and considers virtually all data it sees as normal (the same as the class it trained on).
As such, I would appreciate it if anyone has any ideas or could point out flaws in the way I have implemented my model.
# Imports
import pandas as pd
import numpy as np
import torch
import torchvision # torch package for vision related things
import torch.nn.functional as F # Parameterless functions, like (some) activation functions
import torchvision.datasets as datasets # Standard datasets
import torchvision.transforms as transforms # Transformations we can perform on our dataset for augmentation
from torch import optim # For optimizers like SGD, Adam, etc.
from torch import nn # All neural network modules
from torch.utils.data import Dataset, DataLoader # Gives easier dataset management by creating mini batches etc.
from tqdm import tqdm # For a nice progress bar
from sklearn.preprocessing import StandardScaler
# Set device
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# Hyperparameters
input_size = 24
hidden_size = 128
num_layers = 1
num_classes = 2
sequence_length = 1
learning_rate = 0.005
batch_size = 8
num_epochs = 5
# Recurrent neural network with GRU (many-to-one)
class RNN_GRU(nn.Module):
def __init__(self, input_size, hidden_size, num_layers, num_classes):
super(RNN_GRU, self).__init__()
self.hidden_size = hidden_size
self.num_layers = num_layers
self.gru = nn.GRU(input_size, hidden_size, num_layers, batch_first=True)
self.fc = nn.Linear(hidden_size * sequence_length, num_classes)
def forward(self, x):
# Set initial hidden and cell states
x = x.unsqueeze(1)
h0 = torch.zeros(self.num_layers, x.size(0), self.hidden_size).to(device)
# Forward propagate GRU
out, _ = self.gru(x, h0)
out = out[:, -1, :]
# Decode the hidden state of the last time step
out = self.fc(out)
return out
class MyDataset(Dataset):
def __init__(self,file_name):
stats_df=pd.read_csv(file_name)
x=stats_df.iloc[:,0:24].values
y=stats_df.iloc[:,24].values
self.x_train=torch.tensor(x,dtype=torch.float32)
self.y_train=torch.tensor(y,dtype=torch.float32)
def __len__(self):
return len(self.y_train)
def __getitem__(self,idx):
return self.x_train[idx],self.y_train[idx]
nomDs=MyDataset("nomStats.csv")
atkDs=MyDataset("atkStats.csv")
train_loader=DataLoader(dataset=nomDs,batch_size=batch_size)
test_loader=DataLoader(dataset=atkDs,batch_size=batch_size)
# Initialize network
model = RNN_GRU(input_size, hidden_size, num_layers, num_classes).to(device)
# Loss and optimizer
criterion = nn.CrossEntropyLoss()
optimizer = optim.Adam(model.parameters(), lr=learning_rate)
# Train Network
for epoch in range(num_epochs):
for batch_idx, (data, targets) in enumerate(tqdm(train_loader)):
# Get data to cuda if possible
data = data.to(device=device).squeeze(1)
targets = targets.to(device=device)
targets = targets.to(dtype=torch.long)
# forward
scores = model(data)
loss = criterion(scores, targets)
# backward
optimizer.zero_grad()
loss.backward()
# gradient descent update step/adam step
optimizer.step()
# Check accuracy on training & test to see how good our model
def check_accuracy(loader, model):
num_correct = 0
num_samples = 0
# Set model to eval
model.eval()
with torch.no_grad():
for x, y in loader:
x = x.to(device=device).squeeze(1)
y = y.to(device=device)
scores = model(x)
_, predictions = scores.max(1)
num_correct += (predictions == y).sum()
num_samples += predictions.size(0)
# Toggle model back to train
model.train()
return num_correct / num_samples
print(f"Accuracy on training set: {check_accuracy(train_loader, model)*100:.2f}%")
print(f"Accuracy on test set: {check_accuracy(test_loader, model)*100:.2f}%")
I'm trying to create a contractive autoencoder in PyTorch. I found this thread and tried to follow it. This is the snippet I wrote based on that thread:
import datetime
import numpy as np
import torch
import torchvision
from torchvision import datasets, transforms
from torchvision.utils import save_image, make_grid
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim
import matplotlib.pyplot as plt
%matplotlib inline
dataset_train = datasets.MNIST(root='MNIST',
train=True,
transform = transforms.ToTensor(),
download=True)
dataset_test = datasets.MNIST(root='MNIST',
train=False,
transform = transforms.ToTensor(),
download=True)
batch_size = 128
num_workers = 2
dataloader_train = torch.utils.data.DataLoader(dataset_train,
batch_size = batch_size,
shuffle=True,
num_workers = num_workers,
pin_memory=True)
dataloader_test = torch.utils.data.DataLoader(dataset_test,
batch_size = batch_size,
num_workers = num_workers,
pin_memory=True)
def view_images(imgs, labels, rows = 4, cols =11):
imgs = imgs.detach().cpu().numpy().transpose(0,2,3,1)
fig = plt.figure(figsize=(8,4))
for i in range(imgs.shape[0]):
ax = fig.add_subplot(rows, cols, i+1, xticks=[], yticks=[])
ax.imshow(imgs[i].squeeze(), cmap='Greys_r')
ax.set_title(labels[i].item())
# now let's view some
imgs, labels = next(iter(dataloader_train))
view_images(imgs, labels,13,10)
class Contractive_AutoEncoder(nn.Module):
def __init__(self):
super().__init__()
self.encoder = nn.Linear(784, 512)
self.decoder = nn.Linear(512, 784)
def forward(self, input):
# flatten the input
shape = input.shape
input = input.view(input.size(0), -1)
output_e = F.relu(self.encoder(input))
output = F.sigmoid(self.decoder(output_e))
output = output.view(*shape)
return output_e, output
def loss_function(output_e, outputs, imgs, device):
output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
criterion = nn.MSELoss()
assert outputs.shape == imgs.shape ,f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'
imgs.grad.requires_grad = True
loss1 = criterion(outputs, imgs)
print(imgs.grad)
loss2 = torch.mean(pow(imgs.grad,2))
loss = loss1 + loss2
return loss
epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model = Contractive_AutoEncoder().to(device)
optimizer = optim.Adam(model.parameters(), lr =0.001)
for e in range(epochs):
for i, (imgs, labels) in enumerate(dataloader_train):
imgs = imgs.to(device)
labels = labels.to(device)
outputs_e, outputs = model(imgs)
loss = loss_function(outputs_e, outputs, imgs,device)
optimizer.zero_grad()
loss.backward()
optimizer.step()
if i%interval:
print('')
print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')
For the sake of brevity I just used one layer each for the encoder and the decoder. It should work regardless of the number of layers in either of them, obviously!
But the catch here is that, aside from the fact that I don't know whether this is the correct way of doing it (calculating gradients with respect to the input), I get an error which makes the former solution wrong/not applicable.
That is:
imgs.grad.requires_grad = True
produces the error:
AttributeError: 'NoneType' object has no attribute 'requires_grad'
I also tried the second method suggested in that thread which is as follows:
class Contractive_Encoder(nn.Module):
def __init__(self):
super().__init__()
self.encoder = nn.Linear(784, 512)
def forward(self, input):
# flatten the input
input = input.view(input.size(0), -1)
output_e = F.relu(self.encoder(input))
return output_e
class Contractive_Decoder(nn.Module):
def __init__(self):
super().__init__()
self.decoder = nn.Linear(512, 784)
def forward(self, input):
# flatten the input
output = F.sigmoid(self.decoder(input))
output = output.view(-1,1,28,28)
return output
epochs = 50
interval = 2000
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
model_enc = Contractive_Encoder().to(device)
model_dec = Contractive_Decoder().to(device)
optimizer = optim.Adam([{"params":model_enc.parameters()},
{"params":model_dec.parameters()}], lr =0.001)
optimizer_cond = optim.Adam(model_enc.parameters(), lr = 0.001)
criterion = nn.MSELoss()
for e in range(epochs):
for i, (imgs, labels) in enumerate(dataloader_train):
imgs = imgs.to(device)
labels = labels.to(device)
outputs_e = model_enc(imgs)
outputs = model_dec(outputs_e)
loss_rec = criterion(outputs, imgs)
optimizer.zero_grad()
loss_rec.backward()
optimizer.step()
imgs.requires_grad_(True)
y = model_enc(imgs)
optimizer_cond.zero_grad()
y.backward(torch.ones(imgs.view(-1,28*28).size()))
imgs.grad.requires_grad = True
loss = torch.mean([pow(imgs.grad,2)])
optimizer_cond.zero_grad()
loss.backward()
optimizer_cond.step()
if i%interval:
print('')
print(f'epoch/epoechs: {e}/{epochs} loss : {loss.item():.4f} ')
but I face the error :
RuntimeError: invalid gradient at index 0 - got [128, 784] but expected shape compatible with [128, 512]
How should I go about this in Pytorch?
Summary
The final implementation for contractive loss that I wrote is as follows:
def loss_function(output_e, outputs, imgs, lamda = 1e-4, device=torch.device('cuda')):
criterion = nn.MSELoss()
assert outputs.shape == imgs.shape ,f'outputs.shape : {outputs.shape} != imgs.shape : {imgs.shape}'
loss1 = criterion(outputs, imgs)
output_e.backward(torch.ones(output_e.size()).to(device), retain_graph=True)
# Frobenious norm, the square root of sum of all elements (square value)
# in a jacobian matrix
loss2 = torch.sqrt(torch.sum(torch.pow(imgs.grad,2)))
imgs.grad.data.zero_()
loss = loss1 + (lamda*loss2)
return loss
and inside training loop you need to do:
for e in range(epochs):
for i, (imgs, labels) in enumerate(dataloader_train):
imgs = imgs.to(device)
labels = labels.to(device)
imgs.requires_grad_(True)
imgs.retain_grad()
outputs_e, outputs = model(imgs)
loss = loss_function(outputs_e, outputs, imgs, lamda=1e-4, device=device)
imgs.requires_grad_(False)
optimizer.zero_grad()
loss.backward()
optimizer.step()
print(f'epoch/epochs: {e}/{epochs} loss: {loss.item():.4f}')
Full explanation
As it turns out, and as @akshayk07 rightly pointed out in the comments, the implementation found on the PyTorch forum was wrong in multiple places. Most notably, it wasn't implementing the actual contractive loss introduced in the paper Contractive Auto-Encoders: Explicit Invariance During Feature Extraction, and aside from that, the implementation wouldn't work at all, for reasons explained in a moment.
The changes are hopefully self-explanatory, but I'll walk through what's going on here. First of all, note that imgs is not a leaf node, so gradients would not be retained in its .grad attribute; grad is only populated for leaf tensors.
In order to retain gradients for non-leaf nodes, you should use retain_grad(). Also, imgs.retain_grad() should be called before the forward pass, as it instructs autograd to store gradients for non-leaf nodes.
Update
Thanks to @Michael for pointing out that the correct calculation of the Frobenius norm is actually (from ScienceDirect):
the square root of the sum of the squares of all the matrix entries
and not
the square root of the sum of the absolute values of all the matrix entries, as explained here.
In PyTorch 1.5.0, a high-level torch.autograd.functional.jacobian API was added. This should make the contractive objective easier to implement for an arbitrary encoder. For torch >= 1.5.0, the contractive loss would look like this:
contractive_loss = torch.norm(torch.autograd.functional.jacobian(self.encoder, imgs, create_graph=True))
The create_graph argument makes the jacobian differentiable.
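For completeness, a minimal sketch of a full loss built around that call (the MSE reconstruction term and the lamda weight are assumptions carried over from the snippets above, not part of the jacobian API itself):
def cae_loss_jacobian(encoder, outputs, imgs, lamda=1e-4):
    # reconstruction term
    loss1 = nn.MSELoss()(outputs, imgs)
    # Jacobian of the code w.r.t. the input; create_graph=True keeps it differentiable.
    # Note: this computes the Jacobian over the whole batch, which can be slow.
    J = torch.autograd.functional.jacobian(encoder, imgs, create_graph=True)
    loss2 = torch.norm(J)  # Frobenius norm over all entries
    return loss1 + lamda * loss2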
The main challenge in implementing the contractive autoencoder is in calculating the Frobenius norm of the Jacobian, which is the gradient of the code or bottleneck layer (vector) with respect to the input layer (vector). This is the regularization term in the loss function. Fortunately, you have done the hard work in solving this for me. Thank you! You are using MSE loss for the first term. Cross entropy loss is sometimes used instead. It's worth considering. I think you are almost there with the Frobenius norm, except that you need to take the square root of the sum of the squares of the Jacobian, where you are calculating the square root of the sum of the absolute values. Here's how I'd define the loss function (sorry I changed notation a little to keep myself straight):
def cae_loss_fcn(code, img_out, img_in, lamda=1e-4, device=torch.device('cuda')):
# First term in the loss function, for ensuring representational fidelity
criterion=nn.MSELoss()
assert img_out.shape == img_in.shape, f'img_out.shape : {img_out.shape} != img_in.shape : {img_in.shape}'
loss1 = criterion(img_out, img_in)
# Second term in the loss function, for enforcing contraction of representation
code.backward(torch.ones(code.size()).to(device), retain_graph=True)
# Frobenius norm of Jacobian of code with respect to input image
loss2 = torch.sqrt(torch.sum(torch.pow(img_in.grad, 2))) # THE CORRECTION
img_in.grad.data.zero_()
# Total loss, the sum of the two loss terms, with weight applied to second term
loss = loss1 + (lamda*loss2)
return loss
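And a brief sketch of how it would be called inside the training loop, mirroring the loop shown earlier (the names here simply follow the renamed arguments):
img_in = img_in.to(device)
img_in.requires_grad_(True)      # input gradients are needed for the Jacobian term
code, img_out = model(img_in)    # assumes the model returns (code, reconstruction)
loss = cae_loss_fcn(code, img_out, img_in, lamda=1e-4, device=device)
img_in.requires_grad_(False)
optimizer.zero_grad()
loss.backward()
optimizer.step()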
I'm new to PyTorch. I tried to write my own training routine, but I ran into the error below.
import numpy as np
import torch
import torch.nn as nn
from torch.autograd import Variable
import torch.nn.functional as F
import torch.optim as optim
import tqdm
class MLPNet(nn.Module):
def __init__(self):
super(MLPNet, self).__init__()
self.first_fully_connected = nn.Linear(8*8, 100)
self.last_fully_connected = nn.Linear(100, 10)
def forward(self, x):
x = x.view(-1, 8*8) # reshape input tensor to the size (batch_size, 8*8)
x = F.sigmoid(self.first_fully_connected(x))
x = F.sigmoid(self.last_fully_connected(x))
return x
def training(mlp, X, y, epochs=1, lr=.2, batch_size=101):
# solver
# loss
solver = torch.optim.SGD(mlp.parameters(), lr=lr, momentum=0.9)
loss = nn.CrossEntropyLoss() # nn.NLLLoss()
n_batches = (len(X) + batch_size - 1) // batch_size
for epoch in tqdm.tqdm(range(epochs)):
for i in range(n_batches):
slice_ = np.s_[i::n_batches]
X_batch = Variable(torch.from_numpy(X[slice_])).float()
y_batch = Variable(torch.from_numpy(y[slice_, np.newaxis])).float()
# X_batch = Variable(torch.from_numpy(X[slice_])).long()
# y_batch = Variable(torch.from_numpy(y[slice_, np.newaxis])).long()
print(type(X_batch.data))
print(type(y_batch.data))
### BEGIN: your optim step here. do not forget to reset gradients
# Clear gradients w.r.t. parameters
solver.zero_grad()
prediction = mlp(X_batch)
# Forward pass to get output/logits
#outputs = mlp(X_batch)
# Calculate Loss: softmax --> cross entropy loss
#loss = criterion(outputs, y_batch)
loss_f = loss(prediction, y_batch)
# Getting gradients w.r.t. parameters
loss_f.backward()
# Updating parameters
solver.step()
### END
return mlp
mlp = nn.Sequential(
#### Your net here
nn.Linear(2, 64),
nn.ReLU(),
nn.Linear(64, 2)
)
model_mlp = training(mlp, X_std, y_std)
But I got the error below.
I tried changing the types, but I still faced the same error.
I also tried changing the loss function, but the error persisted.
RuntimeError Traceback (most recent call last)
in ()
----> 1 model_mlp = fit(mlp, X_std, y_std)
RuntimeError: Expected object of type Variable[torch.FloatTensor] but found type Variable[torch.LongTensor] for argument #1 'mat1'
I really appreciate any help you can provide.
Thank you so much
As edited in the question by @Ioannis Nasios, you originally had
X_batch = Variable(torch.from_numpy(X[slice_])).long()
which means that the input tensors to your MLP were long integers, but the network requires floats. So you need to have:
X_batch = Variable(torch.from_numpy(X[slice_])).float()
And this should solve your error.
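As a side note, a minimal sketch of the batch preparation with that fix, assuming y holds integer class labels (nn.CrossEntropyLoss also expects the targets as 1D Long class indices rather than floats with an extra dimension, and the Variable wrapper is no longer needed in recent PyTorch):
X_batch = torch.from_numpy(X[slice_]).float()  # float32 features for the linear layers
y_batch = torch.from_numpy(y[slice_]).long()   # 1D Long class indices for CrossEntropyLoss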