Let's say I have following (relatively) small lstm model:
First, let's create some pseudo input/target data:
import torch
# create pseudo input data (features)
features = torch.rand(size = (64, 24, 3)) # of shape (batch_size, num_time_steps, num_features)
# create pseudo target data
targets = torch.ones(size = (64, 24, 1)) # of shape (batch_size, num_time_steps, num_targets)
# store num. of time steps
num_time_steps = features.shape[1]
Now, let's define a simple lstm model:
# create a simple lstm model with lstm_cell
class SmallModel(torch.nn.Module):
def __init__(self):
super().__init__() # initialize the parent class
# define the layers
self.lstm_cell = torch.nn.LSTMCell(input_size = features.shape[2], hidden_size = 16)
self.fc = torch.nn.Linear(in_features = 16, out_features = targets.shape[2])
def forward(self, features):
# initialise states
hx = torch.randn(64, 16)
cx = torch.randn(64, 16)
# empty list to collect final preds
a_s = []
b_s = []
c_s = []
for t in range(num_time_steps): # loop through each time step
# select features at the current time step t
features_t = features[:, t, :]
# forward computation at the current time step t
hx, cx = self.lstm_cell(features_t, (hx, cx))
out_t = torch.relu(self.fc(hx))
# do some computation with the output
a = out_t * 0.8 + 20
b = a * 2
c = b * 0.9
a_s.append(a)
b_s.append(b)
c_s.append(c)
a_s = torch.stack(a_s, dim = 1) # of shape (batch_size, num_time_steps, num_targets)
b_s = torch.stack(b_s, dim = 1)
c_s = torch.stack(c_s, dim = 1)
return a_s, b_s, c_s
Instantiating model, loss fun. and optimizer:
# instantiate the model
model = SmallModel()
# loss function
loss_fn = torch.nn.MSELoss()
# optimizer
optimizer = torch.optim.SGD(model.parameters(), lr=0.001, momentum=0.9)
Now, during the training loop, I want to print the gradients of the intermediate (a_s.grad, b_s.grad) outputs for each epoch:
# number of epochs
n_epoch = 10
# training loop
for epoch in range(n_epoch): # loop through each epoch
# zero out the grad because pytorch accumulates them
optimizer.zero_grad()
# make predictions
a_s, b_s, c_s = model(features)
# retain the gradients of intermediate outputs
a_s.retain_grad()
b_s.retain_grad()
c_s.retain_grad()
# compute loss
loss = loss_fn(c_s, targets)
# backward computation
loss.backward()
# print gradients of outpus at each epoch
print(a_s.grad)
print(b_s.grad)
# update the weights
optimizer.step()
But I get the following:
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
None
How can I get the actual gradients of the intermediate outputs?
c_s is not a function of a_s and b_s that is the problem.
In your code:
loss = func(c_s, *)
c_s = func(a, b)
# c_s = func(a_s, b_s) is not true
Hence during backward pass no grad will be calculated for variables a_s and b_s.
Try this modified forward function to get gradients for a_s and b_s where c_s = func(a_s, b_s):
def forward(self, features):
# initialise states
hx = torch.randn(64, 16)
cx = torch.randn(64, 16)
# empty list to collect final preds
a_s = []
b_s = []
c_s = []
for t in range(num_time_steps): # loop through each time step
# select features at the current time step t
features_t = features[:, t, :]
# forward computation at the current time step t
hx, cx = self.lstm_cell(features_t, (hx, cx))
out_t = torch.relu(self.fc(hx))
# do some computation with the output
a = out_t * 0.8 + 20
# b = a * 2
# c = b * 0.9
a_s.append(a)
# b_s.append(b)
# c_s.append(c)
a_s = torch.stack(a_s, dim = 1) # of shape (batch_size, num_time_steps, num_targets)
##########################################
## c_s = func(a_s, b_s)
##########################################
b_s = a_s * 2
c_s = b_s * 0.9
##########################################
##########################################
return a_s, b_s, c_s
Related
I try to use torch.nn.lstm to predict future retail prices. My data cover different stores over time. In each iteration I draw batch_size of 2 stores, then I train my LSTM sequentially over the periods which are covered in the data.
My problem is this - in each iteration I create a month X 2-stores dataset (for the sake of this post), and run it through the model.
Before I start, I initialize it using the init function. In that case, the resulting tensor is full of nans. On the other hand, if I'm using init every time, the results are actual numbers.
The lstm model is
n_features = 68
n_steps = 3
batch_size = 2
seq_len = n_steps*batch_size
n_hidden = 2 # number of hidden states
n_layers = 2 # number of LSTM layers (stacked)
# 2. Build the Model
class SmallLSTM(torch.nn.Module):
def __init__(self,n_features,seq_len, n_hidden, n_layers, n_steps, batch_size):
super(SmallLSTM, self).__init__()
self.n_features = n_features
self.seq_len = seq_len
self.n_hidden = n_hidden # number of hidden states
self.n_layers = n_layers # number of LSTM layers (stacked)
self.n_steps = n_steps
self.batch_size = batch_size
self.l_lstm = torch.nn.LSTM(input_size = self.n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True,
dropout = 0.1)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_steps*self.batch_size * self.n_hidden, self.batch_size)
def init_hidden(self, batch_size):
# even with batch_first = True this remains same as docs
hidden_state = torch.zeros(self.n_layers,1,self.n_hidden)
cell_state = torch.zeros(self.n_layers,1,self.n_hidden)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
#batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
# lstm_out(with batch_first = True) is
# (batch_size,seq_len,num_directions * hidden_size)
# for following linear layer we want to keep batch_size dimension and merge rest
# .contiguous() -> solves tensor compatibility error
lstm_out = lstm_out.reshape((1,self.n_steps*self.batch_size * self.n_hidden))
lstm_out = self.l_linear(lstm_out)
#self.hidden = [elem.detach_() for elem in self.hidden]
return lstm_out
Without the per-iteration initialization the code looks like that
batch_size = 2
stores_drawn_idx = 2
Stores_train_batch = Stores_train.iloc[stores_drawn_idx:stores_drawn_idx+batch_size]
Stores_train_batch.reset_index(inplace=True, drop = True)
stores_drawn_idx += batch_size
Months = Xy['Month'].sort_values().unique()
n_steps = 3
mv_net = SmallLSTM(n_features = n_features,\
seq_len = seq_len, \
n_hidden=n_hidden, \
n_layers = n_layers, \
n_steps = n_steps, \
batch_size = batch_size)
mv_net.init_hidden(1)
train_batch = pd.DataFrame(columns = Xy.columns)
for j in range(Stores_train_batch.shape[0]):
X_ = Xy_temp_month.drop(['origin_address', 'Retailer_origin', 'Month', 'target'], axis = 1).values.astype('float32')
y_ = Xy_temp_month[Xy_temp_month['Month'] == months_temp[-1]]['target']
X_ = torch.from_numpy(X_.reshape(1,n_steps*batch_size,X_.shape[1]))
X_ = torch.tensor(X_,dtype=torch.float32)
y_ = torch.tensor(y_.values.astype('float32'), dtype=torch.float32).reshape([len(y_)])
n_features = 68
seq_len = n_steps*batch_size
n_hidden = 2 # number of hidden states
n_layers = 2 # number of LSTM layers (stacked)
criterion = torch.nn.MSELoss(reduction='sum')
optimizer = torch.optim.Adam(mv_net.parameters(), lr=0.05, weight_decay=1500)
valid_loss_min = 1000
#mv_net.init_hidden(1)
output = mv_net(X_)
loss = criterion(output, y_)
print(loss.item())
Under this scenario the results are nan for each time the line print(loss.item()) runs.
If I add that line mv_net.init_hidden(1) I get values for the loss.
What should I do?
Thanks!!!
I'm making a CNN and I've got this error that the matrices don't align and i understand the error but i don't know how to fix it. Here is the code:
import numpy as np
import nnfs
import emnist
import os
import cv2
import pickle
import copy
nnfs.init()
# Dense layer
class Layer_Dense:
# Layer initialization
def __init__(self, n_inputs, n_neurons,
weight_regularizer_l1=0, weight_regularizer_l2=0,
bias_regularizer_l1=0, bias_regularizer_l2=0):
# Initialize weights and biases
self.weights = 0.01 * np.random.randn(n_inputs, n_neurons)
self.biases = np.zeros((1, n_neurons))
# Set regularization strength
self.weight_regularizer_l1 = weight_regularizer_l1
self.weight_regularizer_l2 = weight_regularizer_l2
self.bias_regularizer_l1 = bias_regularizer_l1
self.bias_regularizer_l2 = bias_regularizer_l2
# Forward pass
def forward(self, inputs, training):
# Remember input values
self.inputs = inputs
# Calculate output values from inputs, weights and biases
self.output = np.dot(inputs, self.weights) + self.biases
# Backward pass
def backward(self, dvalues):
# Gradients on parameters
self.dweights = np.dot(self.inputs.T, dvalues)
self.dbiases = np.sum(dvalues, axis=0, keepdims=True)
# Gradients on regularization
# L1 on weights
if self.weight_regularizer_l1 > 0:
dL1 = np.ones_like(self.weights)
dL1[self.weights < 0] = -1
self.dweights += self.weight_regularizer_l1 * dL1
# L2 on weights
if self.weight_regularizer_l2 > 0:
self.dweights += 2 * self.weight_regularizer_l2 * \
self.weights
# L1 on biases
if self.bias_regularizer_l1 > 0:
dL1 = np.ones_like(self.biases)
dL1[self.biases < 0] = -1
self.dbiases += self.bias_regularizer_l1 * dL1
# L2 on biases
if self.bias_regularizer_l2 > 0:
self.dbiases += 2 * self.bias_regularizer_l2 * \
self.biases
# Gradient on values
self.dinputs = np.dot(dvalues, self.weights.T)
# Retrieve layer parameters
def get_parameters(self):
return self.weights, self.biases
# Set weights and biases in a layer instance
def set_parameters(self, weights, biases):
self.weights = weights
self.biases = biases
# Dropout
class Layer_Dropout:
# Init
def __init__(self, rate):
# Store rate, we invert it as for example for dropout
# of 0.1 we need success rate of 0.9
self.rate = 1 - rate
# Forward pass
def forward(self, inputs, training):
# Save input values
self.inputs = inputs
# If not in the training mode - return values
if not training:
self.output = inputs.copy()
return
# Generate and save scaled mask
self.binary_mask = np.random.binomial(1, self.rate,size=inputs.shape) / self.rate
# Apply mask to output values
self.output = inputs * self.binary_mask
# Backward pass
def backward(self, dvalues):
# Gradient on values
self.dinputs = dvalues * self.binary_mask
#Input "layer"
class Layer_Input:
# Forward pass
def forward(self, inputs, training):
self.output = inputs
# ReLU activation
class Activation_ReLU:
# Forward pass
def forward(self, inputs, training):
# Remember input values
self.inputs = inputs
# Calculate output values from inputs
self.output = np.maximum(0, inputs)
# Backward pass
def backward(self, dvalues):
# Since we need to modify original variable,
# let's make a copy of values first
self.dinputs = dvalues.copy()
# Zero gradient where input values were negative
self.dinputs[self.inputs <= 0] = 0
# Calculate predictions for outputs
def predictions(self, outputs):
return outputs
# Softmax activation
class Activation_Softmax:
# Forward pass
def forward(self, inputs, training):
# Remember input values
self.inputs = inputs
# Get unnormalized probabilities
exp_values = np.exp(inputs - np.max(inputs, axis=1,keepdims=True))
# Normalize them for each sample
probabilities = exp_values / np.sum(exp_values, axis=1,keepdims=True)
self.output = probabilities
# Backward pass
def backward(self, dvalues):
# Create uninitialized array
self.dinputs = np.empty_like(dvalues)
# Enumerate outputs and gradients
for index, (single_output, single_dvalues) in enumerate(zip(self.output, dvalues)):
# Flatten output array
single_output = single_output.reshape(-1, 1)
# Calculate Jacobian matrix of the output
jacobian_matrix = np.diagflat(single_output) - np.dot(single_output, single_output.T)
# Calculate sample-wise gradient
# and add it to the array of sample gradients
self.dinputs[index] = np.dot(jacobian_matrix,single_dvalues)
# Calculate predictions for outputs
def predictions(self, outputs):
return np.argmax(outputs, axis=1)
# Adam optimizer
class Optimizer_Adam:
# Initialize optimizer - set settings
def __init__(self, learning_rate=0.001, decay=0., epsilon=1e-7,
beta_1=0.9, beta_2=0.999):
self.learning_rate = learning_rate
self.current_learning_rate = learning_rate
self.decay = decay
self.iterations = 0
self.epsilon = epsilon
self.beta_1 = beta_1
self.beta_2 = beta_2
# Call once before any parameter updates
def pre_update_params(self):
if self.decay:
self.current_learning_rate = self.learning_rate * (1. / (1. + self.decay * self.iterations))
# Update parameters
def update_params(self, layer):
# If layer does not contain cache arrays,
# create them filled with zeros
if not hasattr(layer, 'weight_cache'):
layer.weight_momentums = np.zeros_like(layer.weights)
layer.weight_cache = np.zeros_like(layer.weights)
layer.bias_momentums = np.zeros_like(layer.biases)
layer.bias_cache = np.zeros_like(layer.biases)
# Update momentum with current gradients
layer.weight_momentums = self.beta_1 * layer.weight_momentums + (1 - self.beta_1) * layer.dweights
layer.bias_momentums = self.beta_1 * layer.bias_momentums + (1 - self.beta_1) * layer.dbiases
# Get corrected momentum
# self.iteration is 0 at first pass
# and we need to start with 1 here
weight_momentums_corrected = layer.weight_momentums / (1 - self.beta_1 ** (self.iterations + 1))
bias_momentums_corrected = layer.bias_momentums / (1 - self.beta_1 ** (self.iterations + 1))
# Update cache with squared current gradients
layer.weight_cache = self.beta_2 * layer.weight_cache + (1 - self.beta_2) * layer.dweights**2
layer.bias_cache = self.beta_2 * layer.bias_cache + (1 - self.beta_2) * layer.dbiases**2
# Get corrected cache
weight_cache_corrected = layer.weight_cache / (1 - self.beta_2 ** (self.iterations + 1))
bias_cache_corrected = layer.bias_cache / (1 - self.beta_2 ** (self.iterations + 1))
# Vanilla SGD parameter update + normalization
# with square rooted cache
layer.weights += -self.current_learning_rate * weight_momentums_corrected / (np.sqrt(weight_cache_corrected) + self.epsilon)
layer.biases += -self.current_learning_rate * bias_momentums_corrected / (np.sqrt(bias_cache_corrected) + self.epsilon)
# Call once after any parameter updates
def post_update_params(self):
self.iterations += 1
# Common loss class
class Loss:
# Regularization loss calculation
def regularization_loss(self):
# 0 by default
regularization_loss = 0
# Calculate regularization loss
# iterate all trainable layers
for layer in self.trainable_layers:
# L1 regularization - weights
# calculate only when factor greater than 0
if layer.weight_regularizer_l1 > 0:
regularization_loss += layer.weight_regularizer_l1 * np.sum(np.abs(layer.weights))
# L2 regularization - weights
if layer.weight_regularizer_l2 > 0:
regularization_loss += layer.weight_regularizer_l2 * np.sum(layer.weights * layer.weights)
# L1 regularization - biases
# calculate only when factor greater than 0
if layer.bias_regularizer_l1 > 0:
regularization_loss += layer.bias_regularizer_l1 * np.sum(np.abs(layer.biases))
# L2 regularization - biases
if layer.bias_regularizer_l2 > 0:
regularization_loss += layer.bias_regularizer_l2 * np.sum(layer.biases * layer.biases)
return regularization_loss
# Set/remember trainable layers
def remember_trainable_layers(self, trainable_layers):
self.trainable_layers = trainable_layers
# Calculates the data and regularization losses
# given model output and ground truth values
def calculate(self, output, y, *, include_regularization=False):
# Calculate sample losses
sample_losses = self.forward(output, y)
# Calculate mean loss
data_loss = np.mean(sample_losses)
# Add accumulated sum of losses and sample count
self.accumulated_sum += np.sum(sample_losses)
self.accumulated_count += len(sample_losses)
# If just data loss - return it
if not include_regularization:
return data_loss
# Return the data and regularization losses
return data_loss, self.regularization_loss()
# Calculates accumulated loss
def calculate_accumulated(self, *, include_regularization=False):
# Calculate mean loss
data_loss = self.accumulated_sum / self.accumulated_count
# If just data loss - return it
if not include_regularization:
return data_loss
# Return the data and regularization losses
return data_loss, self.regularization_loss()
# Reset variables for accumulated loss
def new_pass(self):
self.accumulated_sum = 0
self.accumulated_count = 0
# Cross-entropy loss
class Loss_CategoricalCrossentropy(Loss):
# Forward pass
def forward(self, y_pred, y_true):
# Number of samples in a batch
samples = len(y_pred)
# Clip data to prevent division by 0
# Clip both sides to not drag mean towards any value
y_pred_clipped = np.clip(y_pred, 1e-7, 1 - 1e-7)
# Probabilities for target values -
# only if categorical labels
if len(y_true.shape) == 1:
correct_confidences = y_pred_clipped[range(samples),y_true]
# Mask values - only for one-hot encoded labels
elif len(y_true.shape) == 2:
correct_confidences = np.sum(y_pred_clipped * y_true,axis=1)
# Losses
negative_log_likelihoods = -np.log(correct_confidences)
return negative_log_likelihoods
# Backward pass
def backward(self, dvalues, y_true):
# Number of samples
samples = len(dvalues)
# Number of labels in every sample
# We'll use the first sample to count them
labels = len(dvalues[0])
# If labels are sparse, turn them into one-hot vector
if len(y_true.shape) == 1:
y_true = np.eye(labels)[y_true]
# Calculate gradient
self.dinputs = -y_true / dvalues
# Normalize gradient
self.dinputs = self.dinputs / samples
# Softmax classifier - combined Softmax activation
# and cross-entropy loss for faster backward step
class Activation_Softmax_Loss_CategoricalCrossentropy():
# Backward pass
def backward(self, dvalues, y_true):
# Number of samples
samples = len(dvalues)
# If labels are one-hot encoded,
# turn them into discrete values
if len(y_true.shape) == 2:
y_true = np.argmax(y_true, axis=1)
# Copy so we can safely modify
self.dinputs = dvalues.copy()
# Calculate gradient
self.dinputs[range(samples), y_true] -= 1
# Normalize gradient
self.dinputs = self.dinputs / samples
# Common accuracy class
class Accuracy:
# Calculates an accuracy
# given predictions and ground truth values
def calculate(self, predictions, y):
# Get comparison results
comparisons = self.compare(predictions, y)
# Calculate an accuracy
accuracy = np.mean(comparisons)
# Add accumulated sum of matching values and sample count
self.accumulated_sum += np.sum(comparisons)
self.accumulated_count += len(comparisons)
# Return accuracy
return accuracy
# Calculates accumulated accuracy
def calculate_accumulated(self):
# Calculate an accuracy
accuracy = self.accumulated_sum / self.accumulated_count
# Return the data and regularization losses
return accuracy
# Reset variables for accumulated accuracy
def new_pass(self):
self.accumulated_sum = 0
self.accumulated_count = 0
# Accuracy calculation for classification model
class Accuracy_Categorical(Accuracy):
def __init__(self, *, binary=False):
# Binary mode?
self.binary = binary
# No initialization is needed
def init(self, y):
pass
# Compares predictions to the ground truth values
def compare(self, predictions, y):
if not self.binary and len(y.shape) == 2:
y = np.argmax(y, axis=1)
return predictions == y
# Model class
class Model:
def __init__(self):
# Create a list of network objects
self.layers = []
# Softmax classifier's output object
self.softmax_classifier_output = None
# Add objects to the model
def add(self, layer):
self.layers.append(layer)
#
# Set loss, optimizer and accuracy
def set(self, *, loss=None, optimizer=None, accuracy=None):
if loss is not None:
self.loss = loss
if optimizer is not None:
self.optimizer = optimizer
if accuracy is not None:
self.accuracy = accuracy
# Finalize the model
def finalize(self):
# Create and set the input layer
self.input_layer = Layer_Input()
# Count all the objects
layer_count = len(self.layers)
# Initialize a list containing trainable layers:
self.trainable_layers = []
# Iterate the objects
for i in range(layer_count):
# If it's the first layer,
# the previous layer object is the input layer
if i == 0:
self.layers[i].prev = self.input_layer
self.layers[i].next = self.layers[i+1]
# All layers except for the first and the last
elif i < layer_count - 1:
self.layers[i].prev = self.layers[i-1]
self.layers[i].next = self.layers[i+1]
# The last layer - the next object is the loss
# Also let's save aside the reference to the last object
# whose output is the model's output
else:
self.layers[i].prev = self.layers[i-1]
self.layers[i].next = self.loss
self.output_layer_activation = self.layers[i]
# If layer contains an attribute called "weights",
# it's a trainable layer -
# add it to the list of trainable layers
# We don't need to check for biases -
# checking for weights is enough
if hasattr(self.layers[i], 'weights'):
self.trainable_layers.append(self.layers[i])
# Update loss object with trainable layers
if self.loss is not None:
self.loss.remember_trainable_layers(self.trainable_layers)
# If output activation is Softmax and
# loss function is Categorical Cross-Entropy
# create an object of combined activation
# and loss function containing
# faster gradient calculation
if isinstance(self.layers[-1], Activation_Softmax) and isinstance(self.loss, Loss_CategoricalCrossentropy):
# Create an object of combined activation
# and loss functions
self.softmax_classifier_output = Activation_Softmax_Loss_CategoricalCrossentropy()
# Train the model
def train(self, X, y, *, epochs=1, batch_size=None,print_every=1, validation_data=None):
# Initialize accuracy object
self.accuracy.init(y)
# Default value if batch size is not being set
train_steps = 1
# Calculate number of steps
if batch_size is not None:
train_steps = len(X) // batch_size
# Dividing rounds down. If there are some remaining
# data but not a full batch, this won't include it
# Add `1` to include this not full batch
if train_steps * batch_size < len(X):
train_steps += 1
# Main training loop
for epoch in range(1, epochs+1):
# Print epoch number
print(f'epoch: {epoch}')
# Reset accumulated values in loss and accuracy objects
self.loss.new_pass()
self.accuracy.new_pass()
# Iterate over steps
for step in range(train_steps):
# If batch size is not set -
# train using one step and full dataset
if batch_size is None:
batch_X = X
batch_y = y
# Otherwise slice a batch
else:
batch_X = X[step*batch_size:(step+1)*batch_size]
atch_y = y[step*batch_size:(step+1)*batch_size]
# Perform the forward pass
output = self.forward(batch_X, training=True)
# Calculate loss
data_loss, regularization_loss = self.loss.calculate(output, batch_y,include_regularization=True)
loss = data_loss + regularization_loss
# Get predictions and calculate an accuracy
predictions = self.output_layer_activation.predictions(output)
accuracy = self.accuracy.calculate(predictions,batch_y)
# Perform backward pass
self.backward(output, batch_y)
# Optimize (update parameters)
self.optimizer.pre_update_params()
for layer in self.trainable_layers:
self.optimizer.update_params(layer)
self.optimizer.post_update_params()
# Print a summary
if not step % print_every or step == train_steps - 1:
print(f'step: {step}, ' +
f'acc: {accuracy:.3f}, ' +
f'loss: {loss:.3f} (' +
f'data_loss: {data_loss:.3f}, ' +
f'reg_loss: {regularization_loss:.3f}), ' +
f'lr: {self.optimizer.current_learning_rate}')
# Get and print epoch loss and accuracy
epoch_data_loss, epoch_regularization_loss = self.loss.calculate_accumulated(include_regularization=True)
epoch_loss = epoch_data_loss + epoch_regularization_loss
epoch_accuracy = self.accuracy.calculate_accumulated()
print(f'training, ' +
f'acc: {epoch_accuracy:.3f}, ' +
f'loss: {epoch_loss:.3f} (' +
f'data_loss: {epoch_data_loss:.3f}, ' +
f'reg_loss: {epoch_regularization_loss:.3f}), ' +
f'lr: {self.optimizer.current_learning_rate}')
# If there is the validation data
if validation_data is not None:
# Evaluate the model:
self.evaluate(*validation_data,batch_size=batch_size)
# Evaluates the model using passed-in dataset
def evaluate(self, X_val, y_val, *, batch_size=None):
# Default value if batch size is not being set
validation_steps = 1
# Calculate number of steps
if batch_size is not None:
validation_steps = len(X_val) // batch_size
# Dividing rounds down. If there are some remaining
# data but not a full batch, this won't include it
# Add `1` to include this not full batch
if validation_steps * batch_size < len(X_val):
validation_steps += 1
# Reset accumulated values in loss
# and accuracy objects
self.loss.new_pass()
self.accuracy.new_pass()
# Iterate over steps
for step in range(validation_steps):
# If batch size is not set -
# train using one step and full dataset
if batch_size is None:
batch_X = X_val
batch_y = y_val
# Otherwise slice a batch
else:
batch_X = X_val[step*batch_size:(step+1)*batch_size]
batch_y = y_val[step*batch_size:(step+1)*batch_size]
# Perform the forward pass
output = self.forward(batch_X, training=False)
# Calculate the loss
self.loss.calculate(output, batch_y)
# Get predictions and calculate an accuracy
predictions = self.output_layer_activation.predictions(output)
self.accuracy.calculate(predictions, batch_y)
# Get and print validation loss and accuracy
validation_loss = self.loss.calculate_accumulated()
validation_accuracy = self.accuracy.calculate_accumulated()
# Print a summary
print(f'validation, ' +
f'acc: {validation_accuracy:.3f}, ' +
f'loss: {validation_loss:.3f}')
# Predicts on the samples
def predict(self, X, *, batch_size=None):
# Default value if batch size is not being set
prediction_steps = 1
# Calculate number of steps
if batch_size is not None:
prediction_steps = len(X) // batch_size
# Dividing rounds down. If there are some remaining
# data but not a full batch, this won't include it
# Add `1` to include this not full batch
if prediction_steps * batch_size < len(X):
prediction_steps += 1
# Model outputs
output = []
# Iterate over steps
for step in range(prediction_steps):
# If batch size is not set -
# train using one step and full dataset
if batch_size is None:
batch_X = X
# Otherwise slice a batch
else:
batch_X = X[step*batch_size:(step+1)*batch_size]
# Perform the forward pass
batch_output = self.forward(batch_X, training=False)
# Append batch prediction to the list of predictions
output.append(batch_output)
# Stack and return results
return np.vstack(output)
# Performs forward pass
def forward(self, X, training):
# Call forward method on the input layer
# this will set the output property that
# the first layer in "prev" object is expecting
self.input_layer.forward(X, training)
# Call forward method of every object in a chain
# Pass output of the previous object as a parameter
for layer in self.layers:
layer.forward(layer.prev.output, training)
# "layer" is now the last object from the list,
# return its output
# Performs backward pass
def backward(self, output, y):
# If softmax classifier
if self.softmax_classifier_output is not None:
# First call backward method
# on the combined activation/loss
# this will set dinputs property
self.softmax_classifier_output.backward(output, y)
# Since we'll not call backward method of the last layer
# which is Softmax activation
# as we used combined activation/loss
# object, let's set dinputs in this object
self.layers[-1].dinputs = self.softmax_classifier_output.dinputs
# Call backward method going through
# all the objects but last
# in reversed order passing dinputs as a parameter
for layer in reversed(self.layers[:-1]):
layer.backward(layer.next.dinputs)
return
# First call backward method on the loss
# this will set dinputs property that the last
# layer will try to access shortly
self.loss.backward(output, y)
# Call backward method going through all the objects
# in reversed order passing dinputs as a parameter
for layer in reversed(self.layers):
layer.backward(layer.next.dinputs)
# Retrieves and returns parameters of trainable layers
def get_parameters(self):
# Create a list for parameters
parameters = []
# Iterable trainable layers and get their parameters
for layer in self.trainable_layers:
parameters.append(layer.get_parameters())
# Return a list
return parameters
#Updates the model with new parameters
def set_parameters(self, parameters):
# Iterate over the parameters and layers
# and update each layers with each set of the parameters
for parameter_set, layer in zip(parameters,self.trainable_layers):
layer.set_parameters(*parameter_set)
# Saves the parameters to a file
def save_parameters(self, path):
# Open a file in the binary-write mode
# and save parameters into it
with open(path, 'wb') as f:
pickle.dump(self.get_parameters(), f)
# Loads the weights and updates a model instance with them
def load_parameters(self, path):
# Open file in the binary-read mode,
# load weights and update trainable layers
with open(path, 'rb') as f:
self.set_parameters(pickle.load(f))
# Saves the model
def save(self, path):
# Make a deep copy of current model instance
model = copy.deepcopy(self)
# Reset accumulated values in loss and accuracy objects
model.loss.new_pass()
model.accuracy.new_pass()
# Remove data from the input layer
# and gradients from the loss object
model.input_layer.__dict__.pop('output', None)
model.loss.__dict__.pop('dinputs', None)
# For each layer remove inputs, output and dinputs properties
for layer in model.layers:
for property in ['inputs', 'output', 'dinputs','dweights', 'dbiases']:
layer.__dict__.pop(property, None)
# Open a file in the binary-write mode and save the model
with open(path, 'wb') as f:
pickle.dump(model, f)
# Loads and returns a model
#staticmethod
def load(path):
# Open file in the binary-read mode, load a model
with open(path, 'rb') as f:
model = pickle.load(f)
# Return a model
return model
# Create dataset
X, y = emnist.extract_training_samples('digits')
X_test, y_test = emnist.extract_test_samples('digits')
# Instantiate the model
model = Model()
# Add layers
model.add(Layer_Dense(2, 512, weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4))
model.add(Activation_ReLU())
model.add(Layer_Dropout(0.1))
model.add(Layer_Dense(512, 3))
model.add(Activation_Softmax())
# Set loss, optimizer and accuracy objects
model.set(
loss=Loss_CategoricalCrossentropy(),
optimizer=Optimizer_Adam(learning_rate=0.05, decay=5e-5),
accuracy=Accuracy_Categorical()
)
# Finalize the model
model.finalize()
# Train the model
model.train(X, y, validation_data=(X_test, y_test),epochs=10000, print_every=100)
And this is the error i get in sublime text:
epoch: 1
Traceback (most recent call last):
File "/media/luke/New Volume/final project/untitled.py", line 654, in <module>
model.train(X, y, validation_data=(X_test, y_test),epochs=10000, print_every=100)
File "/media/luke/New Volume/final project/untitled.py", line 430, in train
output = self.forward(batch_X, training=True)
File "/media/luke/New Volume/final project/untitled.py", line 545, in forward
layer.forward(layer.prev.output, training)
File "/media/luke/New Volume/final project/untitled.py", line 29, in forward
self.output = np.dot(inputs, self.weights) + self.biases
File "/home/luke/.local/lib/python3.8/site-packages/nnfs/core.py", line 22, in dot
return orig_dot(*[a.astype('float64') for a in args], **kwargs).astype('float32')
File "<__array_function__ internals>", line 5, in dot
ValueError: shapes (240000,28,28) and (2,512) not aligned: 28 (dim 2) != 2 (dim 0)
As you can see it gets to epoch 1 then when trying to do the numpy dot product and then cant do it.
I'd appreciate any help
Thanks :)
Firstly, you should flatten your input so its shape is (240000, 28*28) = (240000, 784). After that, the problem is in this line:
model.add(Layer_Dense(2, 512, weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4))
You set your input size to 2, when it should be 784 which is the number of pixels in each image (assuming you're using MNIST).
model.add(Layer_Dense(784, 512, weight_regularizer_l2=5e-4,bias_regularizer_l2=5e-4))
Should work correctly if your inputs are flattened.
Edit: To flatten your inputs I would use np.reshape as demonstrated here https://stackoverflow.com/a/18758049/11777402.
X.reshape(240000, 784)
I am trying to create a denoising autoencoder for 1d cyclic signals like cos(x) etc.
The process of creating the dataset is that I pass a list of cyclic functions and for each example generated it rolls random coefficients for each function in the list so every function generated is different yet cyclic. eg - 0.856cos(x) - 1.3cos(0.1x)
Then I add noise and normalize the signal to be between [0, 1).
Next, I train my autoencoder on it but it learns to output a constant (usually 0.5). my guess is that it happens because 0.5 is the usual mean value of the normalized functions. But this is not the result im aspiring to get at all.
I am providing the code I wrote for the autoencoder, the data generator and the training loop as well as two pictures depicting the problem im having.
first example:
second example:
Linear autoencoder:
class LinAutoencoder(nn.Module):
def __init__(self, in_channels, K, B, z_dim, out_channels):
super(LinAutoencoder, self).__init__()
self.in_channels = in_channels
self.K = K # number of samples per 2pi interval
self.B = B # how many intervals
self.out_channels = out_channels
encoder_layers = []
decoder_layers = []
encoder_layers += [
nn.Linear(in_channels * K * B, 2*z_dim, bias=True),
nn.ReLU(),
nn.Linear(2*z_dim, z_dim, bias=True),
nn.ReLU(),
nn.Linear(z_dim, z_dim, bias=True),
nn.ReLU()
]
decoder_layers += [
nn.Linear(z_dim, z_dim, bias=True),
nn.ReLU(),
nn.Linear(z_dim, 2*z_dim, bias=True),
nn.ReLU(),
nn.Linear(2*z_dim, out_channels * K * B, bias=True),
nn.Tanh()
]
self.encoder = nn.Sequential(*encoder_layers)
self.decoder = nn.Sequential(*decoder_layers)
def forward(self, x):
batch_size = x.shape[0]
x_flat = torch.flatten(x, start_dim=1)
enc = self.encoder(x_flat)
dec = self.decoder(enc)
res = dec.view((batch_size, self.out_channels, self.K * self.B))
return res
The data generator:
def lincomb_generate_data(batch_size, intervals, sample_length, functions, noise_type="gaussian", **kwargs)->torch.tensor:
channels = 1
mul_term = 2 * np.pi / sample_length
positions = np.arange(0, sample_length * intervals)
x_axis = positions * mul_term
X = np.tile(x_axis, (channels, 1))
y = X
Y = np.repeat(y[np.newaxis, :], batch_size, axis=0)
if noise_type == "gaussian":
# defaults to 0, 0.4
noise_mean = kwargs.get("noise_mean", 0)
noise_std = kwargs.get("noise_std", 0.4)
noise = np.random.normal(noise_mean, noise_std, Y.shape)
if noise_type == "uniform":
# defaults to 0, 1
noise_low = kwargs.get("noise_low", 0)
noise_high = kwargs.get("noise_high", 1)
noise = np.random.uniform(noise_low, noise_high, Y.shape)
coef_lo = -2
coef_hi = 2
coef_mat = np.random.uniform(coef_lo, coef_hi, (batch_size, len(functions))) # creating a matrix of coefficients
coef_mat = np.where(np.abs(coef_mat) < 10**-1, 0, coef_mat)
for i in range(batch_size):
curr_res = np.zeros((channels, sample_length * intervals))
for func_id, function in enumerate(functions):
curr_func = functions[func_id]
curr_coef = coef_mat[i][func_id]
curr_res += curr_coef * curr_func(Y[i, :, :])
Y[i, :, :] = curr_res
clean = Y
noisy = clean + noise
# Normalizing
clean -= clean.min(axis=2, keepdims=2)
clean /= clean.max(axis=2, keepdims=2) + 1e-5 #avoiding zero division
noisy -= noisy.min(axis=2, keepdims=2)
noisy /= noisy.max(axis=2, keepdims=2) + 1e-5 #avoiding zero division
clean = torch.from_numpy(clean)
noisy = torch.from_numpy(noisy)
return x_axis, clean, noisy
Training loop:
functions = [lambda x: np.cos(0.1*x),
lambda x: np.cos(x),
lambda x: np.cos(3*x)]
num_epochs = 200
lin_loss_list = []
criterion = torch.nn.MSELoss()
lin_optimizer = torch.optim.SGD(lin_model.parameters(), lr=0.01, momentum=0.9)
_, val_clean, val_noisy = util.lincomb_generate_data(batch_size, B, K, functions, noise_type="gaussian")
print("STARTED TRAINING")
for epoch in range(num_epochs):
# generate data returns the x-axis used for plotting as well as the clean and noisy data
_, t_clean, t_noisy = util.lincomb_generate_data(batch_size, B, K, functions, noise_type="gaussian")
# ===================forward=====================
lin_output = lin_model(t_noisy.float())
lin_loss = criterion(lin_output.float(), t_clean.float())
lin_loss_list.append(lin_loss.data)
# ===================backward====================
lin_optimizer.zero_grad()
lin_loss.backward()
lin_optimizer.step()
val_lin_loss = F.mse_loss(lin_model(val_noisy.float()), val_clean.float())
print("DONE TRAINING")
edit: shared the parameters requested
L = 1
K = 512
B = 2
batch_size = 64
z_dim = 64
noise_mean = 0
noise_std = 0.4
The problem was I didnt use nn.BatchNorm1d in my model so i guess something wrong happened during training (probably vanishing gradients).
I am trying to complete an implementation of a neural network class that uses pytorch.
But the upgrade step is causing the error to pop up related to None Type.
I am using Pytorch Pkg with Python 3.73 using Jupyter Notebook.
The problem is in the step where I have to ake weight update step, then zero the gradient values.
class NNet(torch.nn.Module):
def __init__(self, n_inputs, n_hiddens_per_layer, n_outputs, act_func='tanh'):
super().__init__() # call parent class (torch.nn.Module) constructor
# Set self.n_hiddens_per_layer to [] if argument is 0, [], or [0]
if n_hiddens_per_layer == 0 or n_hiddens_per_layer == [] or n_hiddens_per_layer == [0]:
self.n_hiddens_per_layer = []
else:
self.n_hiddens_per_layer = n_hiddens_per_layer
self.hidden_layers = torch.nn.ModuleList() # necessary for model.to('cuda')
for nh in self.n_hiddens_per_layer:
self.hidden_layers.append( torch.nn.Sequential(
torch.nn.Linear(n_inputs, nh),
torch.nn.Tanh() if act_func == 'tanh' else torch.nn.ReLU()))
n_inputs = nh
self.output_layer = torch.nn.Linear(n_inputs, n_outputs)
self.Xmeans = None
self.Xstds = None
self.Tmeans = None
self.Tstds = None
self.error_trace = []
def forward(self, X):
Y = X
for hidden_layer in self.hidden_layers:
Y = hidden_layer(Y)
Y = self.output_layer(Y)
return Y
def train(self, X, T, n_epochs, learning_rate, verbose=True):
# Set data matrices to torch.tensors if not already.
if not isinstance(X, torch.Tensor):
X = torch.from_numpy(X).float()
if not isinstance(T, torch.Tensor):
T = torch.from_numpy(T).float()
W = torch.zeros((2, 1), requires_grad=True)
print(W.requires_grad)
# Calculate standardization parameters if not already calculated
if self.Xmeans is None:
self.Xmeans = X.mean(0)
self.Xstds = X.std(0)
self.Xstds[self.Xstds == 0] = 1
self.Tmeans = T.mean(0)
self.Tstds = T.std(0)
self.Tstds[self.Tstds == 0] = 1
# Standardize inputs and targets
X = (X - self.Xmeans) / self.Xstds
T = (T - self.Tmeans) / self.Tstds
# Set optimizer to Adam and loss functions to MSELoss
optimizer = torch.optim.Adam(self.parameters(), lr=learning_rate)
mse_func = torch.nn.MSELoss()
# For each epoch:
# Do forward pass to calculate output Y.
# Calculate mean squared error loss, mse.
# Calculate gradient of mse with respect to all weights by calling mse.backward().
# Take weight update step, then zero the gradient values.
# Unstandardize the mse error and save in self.error_trace
# Print epoch+1 and unstandardized error if verbose is True and
# (epoch+1 is n_epochs or epoch+1 % (n_epochs // 10) == 0)
for epoch in range(n_epochs):
# Do forward pass to calculate output Y.
Y = self.forward(X)
print("Y = \n",Y)
# Calculate mean squared error loss, mse.
mse = ((T - Y)**2).mean()
#mse = torch.mean((T - Y[-1]) ** 2)
print("Y shape = \n",Y.shape)
print("Tshape = \n",T.shape)
print("MSE = \n",mse)
# Calculate gradient of mse with respect to all weights by calling mse.backward().
#W.retain_grad()
mse.backward(torch.ones(100))
#print("mse.backward(torch.ones(100))",mse.backward(torch.ones(100)))
# Take weight update step, then zero the gradient values.
#print("W.grad = ",W.grad())
with torch.no_grad():
W = learning_rate*W.grad()
print("kuttu",W.requires_grad)
W -= learning_rate * W.grad()
W.grad.zero_()
# Unstandardize the mse error and save in self.error_trace
self.error_trace = mse * self.Tstds
#. . .
def use(self, X):
# Set input matrix to torch.tensors if not already.
if not isinstance(X, torch.Tensor):
X = torch.from_numpy(X).float()
# Standardize X
print("here=\n",type(X))
X = (X - torch.mean(X)) / self.Xstds
# Do forward pass and unstandardize resulting output. Assign to variable Y.
# Return output Y after detaching from computation graph and converting to numpy
return Y.detach().numpy()
*<ipython-input-20-6e1e577f866d> in train(self, X, T, n_epochs, learning_rate, verbose)
86 # Take weight update step, then zero the gradient values.
87 with torch.no_grad():
---> 88 W = learning_rate*W.grad()
89 print("w",W.requires_grad)
90 W -= learning_rate * W.grad()*
TypeError: 'NoneType' object is not callable
I am following a tutorial on rnn's in TensorFlow but I have a question concerning the input formats.
They are taking raw_x (one hot vector) and basically first cutting that up in pieces of length 200 (batch_size) to form data_x. That is good.
Then they further cut up data_x in pieces of length 5 (num_step, or graph width) with:
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
However, if I look in the data, the slices of x do not match data_x. The first one does, but then they diverge.
Am I misunderstanding the above code? I would like to understand how x is being created or what it is supposed to look like.
I had expected the second item to be 0 1 0 1 0.
Also, I thought an epoch is when you go through the data completely, from this it seems that they split up the data in 1000 parts (epoch size)?
If it helps, this is my full code. I am trying to figure out what is going on in x. at line 48:
import numpy as np
import tensorflow as tf
# %matplotlib inline
import matplotlib.pyplot as plt
# Global config variables
num_steps = 5 # number of truncated backprop steps ('n' in the discussion above)
batch_size = 200
num_classes = 2
state_size = 4
learning_rate = 0.1
def gen_data(size=1000000):
print('generating data');
X = np.array(np.random.choice(2, size=(size,)))
Y = []
for i in range(size):
threshold = 0.5
if X[i-3] == 1:
threshold += 0.5
if X[i-8] == 1:
threshold -= 0.25
if np.random.rand() > threshold:
Y.append(0)
else:
Y.append(1)
return X, np.array(Y)
# adapted from https://github.com/tensorflow/tensorflow/blob/master/tensorflow/models/rnn/ptb/reader.py
def gen_batch(raw_data, batch_size, num_steps):
print('generating batches');
raw_x, raw_y = raw_data
data_length = len(raw_x)
# partition raw data into batches and stack them vertically in a data matrix
batch_partition_length = data_length // batch_size
data_x = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
data_y = np.zeros([batch_size, batch_partition_length], dtype=np.int32)
for i in range(batch_size):
data_x[i] = raw_x[batch_partition_length * i:batch_partition_length * (i + 1)]
data_y[i] = raw_y[batch_partition_length * i:batch_partition_length * (i + 1)]
# further divide batch partitions into num_steps for truncated backprop
epoch_size = batch_partition_length // num_steps
for i in range(epoch_size):
x = data_x[:, i * num_steps:(i + 1) * num_steps]
y = data_y[:, i * num_steps:(i + 1) * num_steps]
yield (x, y)
def gen_epochs(n, num_steps):
for i in range(n):
yield gen_batch(gen_data(), batch_size, num_steps)
"""
Placeholders
"""
x = tf.placeholder(tf.int32, [batch_size, num_steps], name='input_placeholder')
y = tf.placeholder(tf.int32, [batch_size, num_steps], name='labels_placeholder')
init_state = tf.zeros([batch_size, state_size])
"""
RNN Inputs
"""
# Turn our x placeholder into a list of one-hot tensors:
# rnn_inputs is a list of num_steps tensors with shape [batch_size, num_classes]
x_one_hot = tf.one_hot(x, num_classes)
rnn_inputs = tf.unstack(x_one_hot, axis=1)
"""
Definition of rnn_cell
This is very similar to the __call__ method on Tensorflow's BasicRNNCell. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn_cell.py
"""
with tf.variable_scope('rnn_cell'):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
def rnn_cell(rnn_input, state):
with tf.variable_scope('rnn_cell', reuse=True):
W = tf.get_variable('W', [num_classes + state_size, state_size])
b = tf.get_variable('b', [state_size], initializer=tf.constant_initializer(0.0))
return tf.tanh(tf.matmul(tf.concat(axis=1, values=[rnn_input, state]), W) + b)
"""
Adding rnn_cells to graph
This is a simplified version of the "rnn" function from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/rnn.py
"""
state = init_state
rnn_outputs = []
for rnn_input in rnn_inputs:
state = rnn_cell(rnn_input, state)
rnn_outputs.append(state)
final_state = rnn_outputs[-1]
"""
Predictions, loss, training step
Losses and total_loss are simlar to the "sequence_loss_by_example" and "sequence_loss"
functions, respectively, from Tensorflow's api. See:
https://github.com/tensorflow/tensorflow/blob/master/tensorflow/python/ops/seq2seq.py
"""
#logits and predictions
with tf.variable_scope('softmax'):
W = tf.get_variable('W', [state_size, num_classes])
b = tf.get_variable('b', [num_classes], initializer=tf.constant_initializer(0.0))
logits = [tf.matmul(rnn_output, W) + b for rnn_output in rnn_outputs]
predictions = [tf.nn.softmax(logit) for logit in logits]
# Turn our y placeholder into a list labels
y_as_list = [tf.squeeze(i, axis=[1]) for i in tf.split(axis=1, num_or_size_splits=num_steps, value=y)]
#losses and train_step
losses = [tf.nn.sparse_softmax_cross_entropy_with_logits(logits=logit,labels=label) for \
logit, label in zip(logits, y_as_list)]
total_loss = tf.reduce_mean(losses)
train_step = tf.train.AdagradOptimizer(learning_rate).minimize(total_loss)
"""
Function to train the network
"""
def train_network(num_epochs, num_steps, state_size=4, verbose=True):
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
training_losses = []
for idx, epoch in enumerate(gen_epochs(num_epochs, num_steps)):
training_loss = 0
training_state = np.zeros((batch_size, state_size))
if verbose:
print("\nEPOCH", idx)
for step, (X, Y) in enumerate(epoch):
tr_losses, training_loss_, training_state, _ = \
sess.run([losses,
total_loss,
final_state,
train_step],
feed_dict={x:X, y:Y, init_state:training_state})
training_loss += training_loss_
if step % 100 == 0 and step > 0:
if verbose:
print("Average loss at step", step,
"for last 250 steps:", training_loss/100)
training_losses.append(training_loss/100)
training_loss = 0
return training_losses
training_losses = train_network(1,num_steps)
plt.plot(training_losses)
Seems like the batches are actually transposed.
So the first elements of the x-matrix (200 x 5) will fit the first 5 elements of x_raw.
Then only in the next iteration, the next 5-10 elements of x_raw will be in the first elements (again) of x.