How to handle the last batch using keras fit_generator - python

I am using a customised batch generator in an attempt to fix the problem of incompatible shapes (a BroadcastGradientArgs error) that occurs with the standard model.fit() function because of the small size of the last batch in the training data. I used the batch generator mentioned here with the model.fit_generator() function:
import math
import numpy as np
from keras.utils import Sequence

class Generator(Sequence):
    # Class is a dataset wrapper for better training performance
    def __init__(self, x_set, y_set, batch_size=256):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return math.floor(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]  # Line A
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return batch_x, batch_y

    def on_epoch_end(self):
        np.random.shuffle(self.indices)
But it seems that it discards the last batch if its size is smaller than the provided batch size. How can I update it to include the last batch and expand it (for example) with some repeated samples?
Also, somehow I don't get how "Line A" works!
Update:
Here is how I am using the generator with my model:
# dummy model
input_1 = Input(shape=(None,))
...
dense_1 = Dense(10, activation='relu')(input_1)
output_1 = Dense(1, activation='sigmoid')(dense_1)
model = Model(input_1, output_1)
print(model.summary())
#Compile and fit_generator
model.compile(optimizer='adam', loss='binary_crossentropy')
train_data_gen = Generator(x1_train, y_train, batch_size)
test_data_gen = Generator(x1_test, y_test, batch_size)
model.fit_generator(generator=train_data_gen, validation_data = test_data_gen, epochs=epochs, shuffle=False, verbose=1)
loss, accuracy = model.evaluate_generator(generator=test_data_gen)
print('Test Loss: %0.5f Accuracy: %0.5f' % (loss, accuracy))

I think the culprit is this line:
return math.floor(self.x.shape[0] / self.batch_size)
Replacing it with this should work:
return math.ceil(self.x.shape[0] / self.batch_size)
Imagine you have 100 samples and a batch size of 32. That should be divided into 3.125 batches, but if you use math.floor it becomes 3 and the remaining 0.125 of a batch (the last 4 samples) is discarded.
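To make the arithmetic concrete (a quick check you can run yourself):
import math
samples, batch_size = 100, 32
print(math.floor(samples / batch_size))  # 3 -> the last 100 - 3*32 = 4 samples are never seen
print(math.ceil(samples / batch_size))   # 4 -> the last batch simply contains those 4 samples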
As for Line A: if the batch size is 32, then when idx is 1 the slice [idx * self.batch_size:(idx + 1) * self.batch_size] becomes [32:64]; in other words, it picks the 33rd to 64th elements of self.indices.
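To address the padding part of the question directly, one option (a minimal sketch, and only one of several reasonable choices; the class name PaddedGenerator is mine) is to keep math.ceil in __len__ and, inside __getitem__, top the final slice up with randomly repeated indices until it reaches batch_size:
import math
import numpy as np
from keras.utils import Sequence

class PaddedGenerator(Sequence):
    # Like Generator, but repeats random samples so every batch has batch_size items
    def __init__(self, x_set, y_set, batch_size=256):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return math.ceil(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]
        if len(inds) < self.batch_size:
            # last (short) batch: pad it with randomly repeated indices
            extra = np.random.choice(self.indices, self.batch_size - len(inds))
            inds = np.concatenate([inds, extra])
        return self.x[inds], self.y[inds]

    def on_epoch_end(self):
        np.random.shuffle(self.indices)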
Update 2: changed the input to have a None shape, used an LSTM, and added evaluation:
import os
os.environ['CUDA_VISIBLE_DEVICES'] = ""
import math
import numpy as np
from keras.models import Model
from keras.utils import Sequence
from keras.layers import Input, Dense, LSTM

class Generator(Sequence):
    # Class is a dataset wrapper for better training performance
    def __init__(self, x_set, y_set, batch_size=256):
        self.x, self.y = x_set, y_set
        self.batch_size = batch_size
        self.indices = np.arange(self.x.shape[0])

    def __len__(self):
        return math.ceil(self.x.shape[0] / self.batch_size)

    def __getitem__(self, idx):
        inds = self.indices[idx * self.batch_size:(idx + 1) * self.batch_size]  # Line A
        batch_x = self.x[inds]
        batch_y = self.y[inds]
        return batch_x, batch_y

    def on_epoch_end(self):
        np.random.shuffle(self.indices)

# dummy model
input_1 = Input(shape=(None, 10))
x = LSTM(90)(input_1)
x = Dense(10)(x)
x = Dense(1, activation='sigmoid')(x)
model = Model(input_1, x)
print(model.summary())

# Compile and fit_generator
model.compile(optimizer='adam', loss='binary_crossentropy')
x1_train = np.random.rand(1590, 20, 10)
x1_test = np.random.rand(90, 20, 10)
y_train = np.random.rand(1590, 1)
y_test = np.random.rand(90, 1)
train_data_gen = Generator(x1_train, y_train, 256)
test_data_gen = Generator(x1_test, y_test, 256)
model.fit_generator(generator=train_data_gen,
                    validation_data=test_data_gen,
                    epochs=5,
                    shuffle=False,
                    verbose=1)
loss = model.evaluate_generator(generator=test_data_gen)
print('Test Loss: %0.5f' % loss)
This runs without any problem.
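If you want to confirm what the generator actually yields, a Sequence can be indexed directly. For example, with the generators defined above (a quick sanity check, nothing more):
# With 1590 samples and batch_size=256, math.ceil gives 7 batches:
# six of size 256 and a final one of size 54.
print(len(train_data_gen))
print([train_data_gen[i][0].shape[0] for i in range(len(train_data_gen))])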

Apart from the strategy in the other answers, such an issue can be tackled in different ways, depending on your intention.
If you wish to repeat some samples in the last batch (until the last batch's size is equal to batch_size), as you suggested in your question, you could (for example) check whether the index has gone past the last sample in the dataset and, if so, do something about it. E.g.:
batch_size = 32
N_batches = int(np.ceil(len(dataset) / batch_size))
batch_counter = 0
while True:
    current_batch = []
    idx_start = batch_size * batch_counter
    idx_end = batch_size * (batch_counter + 1)
    for idx in range(idx_start, idx_end):
        ## Next line sets idx to the index of the last sample in the dataset:
        idx = len(dataset) - 1 if (idx > len(dataset) - 1) else idx
        current_batch.append(dataset[idx])
    ...
    batch_counter += 1
    if (batch_counter == N_batches):
        batch_counter = 0
Obviously, it doesn't need to be the last sample, it could (for example) be a random sample from the dataset:
idx = random.randint(0, len(dataset) - 1) if (idx > len(dataset) - 1) else idx
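For completeness, here is a self-contained sketch of that idea as a plain Python generator (the name padded_batch_generator and the assumption that dataset is an indexable sequence of samples are mine, just for illustration):
import math
import random

def padded_batch_generator(dataset, batch_size):
    n_batches = math.ceil(len(dataset) / batch_size)
    batch_counter = 0
    while True:
        current_batch = []
        idx_start = batch_size * batch_counter
        idx_end = batch_size * (batch_counter + 1)
        for idx in range(idx_start, idx_end):
            if idx > len(dataset) - 1:
                # past the end: fill the batch up with a random sample instead
                idx = random.randint(0, len(dataset) - 1)
            current_batch.append(dataset[idx])
        yield current_batch
        batch_counter = (batch_counter + 1) % n_batches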
Hope this helps.

Related

Can I use Keras model.fit() on a multi-output model with custom loss that uses all outputs' targets and predictions in Tensorflow 2?

I have tried and failed to make Keras model.fit() work on my multi-output model with a custom loss that uses all outputs' targets and predictions (specifically for 2 outputs) in TF 2.
When I tried to do this on a model made with the Keras functional API, I get the error: "SymbolicException: Inputs to eager execution function cannot be Keras symbolic tensors, but found ..."
meaning I can't use my loss function because it returns an eager tensor to a Keras DAG that works with symbolic tensors (functional API model). To get around this, I used model.add_loss() instead of passing my loss function into model.compile(), but I believe this hogged GPU memory and caused OOM errors.
I've tried workarounds, where I put my functional API model inside a Keras subclassed model or make a completely new Keras subclassed model.
Workaround 1 is shown in the code below; it runs, yet gives me NaNs across the training epochs for a variety of gradient-clipping values, and produces 0-valued outputs.
Workaround 2 gives me an error inside the overridden call() method, because the inputs param has different shapes at model compile time and run time, since my model (in a quirky way) has 3 inputs: one is the actual input to the DLNN, and the other two are the targets for the input sample. This is so that I can get the targets of each sample into the loss function.
from scipy.io import wavfile
import scipy.signal as sg
import matplotlib.pyplot as plt
import tensorflow as tf
from tensorflow.keras.layers import Input, SimpleRNN, Dense, Lambda, TimeDistributed, Layer, LSTM, Bidirectional, BatchNormalization, Concatenate
from tensorflow.keras.models import Model
from tensorflow.keras.activations import relu
from tensorflow.keras.callbacks import EarlyStopping
import numpy as np
import datetime
import math
import random
import json
import os
import sys
# Loss function
def discriminative_loss(piano_true, noise_true, piano_pred, noise_pred, loss_const):
last_dim = piano_pred.shape[1] * piano_pred.shape[2]
return (
tf.math.reduce_mean(tf.reshape(noise_pred - noise_true, shape=(-1, last_dim)) ** 2, axis=-1) -
(loss_const * tf.math.reduce_mean(tf.reshape(noise_pred - piano_true, shape=(-1, last_dim)) ** 2, axis=-1)) +
tf.math.reduce_mean(tf.reshape(piano_pred - piano_true, shape=(-1, last_dim)) ** 2, axis=-1) -
(loss_const * tf.math.reduce_mean(tf.reshape(piano_pred - noise_true, shape=(-1, last_dim)) ** 2, axis=-1))
)
def make_model(features, sequences, name='Model'):
input_layer = Input(shape=(sequences, features), dtype='float32',
name='piano_noise_mixed')
piano_true = Input(shape=(sequences, features), dtype='float32',
name='piano_true')
noise_true = Input(shape=(sequences, features), dtype='float32',
name='noise_true')
x = SimpleRNN(features // 2,
activation='relu',
return_sequences=True) (input_layer)
piano_pred = TimeDistributed(Dense(features), name='piano_hat') (x) # source 1 branch
noise_pred = TimeDistributed(Dense(features), name='noise_hat') (x) # source 2 branch
model = Model(inputs=[input_layer, piano_true, noise_true],
outputs=[piano_pred, noise_pred])
return model
# Model "wrapper" for many-input loss function
class RestorationModel2(Model):
def __init__(self, model, loss_const):
super(RestorationModel2, self).__init__()
self.model = model
self.loss_const = loss_const
def call(self, inputs):
return self.model(inputs)
def compile(self, optimizer, loss):
super(RestorationModel2, self).compile()
self.optimizer = optimizer
self.loss = loss
def train_step(self, data):
# Unpack data - what the generator yields
x, piano_true, noise_true = data
with tf.GradientTape() as tape:
piano_pred, noise_pred = self.model((x, piano_true, noise_true), training=True)
loss = self.loss(piano_true, noise_true, piano_pred, noise_pred, self.loss_const)
trainable_vars = self.model.trainable_variables
gradients = tape.gradient(loss, trainable_vars)
self.optimizer.apply_gradients(zip(gradients, trainable_vars))
return {'loss': loss}
def test_step(self, data):
x, piano_true, noise_true = data
piano_pred, noise_pred = self.model((x, piano_true, noise_true), training=False)
loss = self.loss(piano_true, noise_true, piano_pred, noise_pred, self.loss_const)
return {'loss': loss}
def make_imp_model(features, sequences, loss_const=0.05,
optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.7),
name='Restoration Model', epsilon=10 ** (-10)):
# NEW Semi-imperative model
model = RestorationModel2(make_model(features, sequences, name='Training Model'),
loss_const=loss_const)
model.compile(optimizer=optimizer, loss=discriminative_loss)
return model
# MODEL TRAIN & EVAL FUNCTION
def evaluate_source_sep(train_generator, validation_generator,
num_train, num_val, n_feat, n_seq, batch_size,
loss_const, epochs=20,
optimizer=tf.keras.optimizers.RMSprop(clipvalue=0.75),
patience=10, epsilon=10 ** (-10)):
print('Making model...') # IMPERATIVE MODEL - Customize Fit
model = make_imp_model(n_feat, n_seq, loss_const=loss_const, optimizer=optimizer, epsilon=epsilon)
print('Going into training now...')
hist = model.fit(train_generator,
steps_per_epoch=math.ceil(num_train / batch_size),
epochs=epochs,
validation_data=validation_generator,
validation_steps=math.ceil(num_val / batch_size),
callbacks=[EarlyStopping('val_loss', patience=patience, mode='min')])
print(model.summary())
# NEURAL NETWORK DATA GENERATOR
def my_dummy_generator(num_samples, batch_size, train_seq, train_feat):
while True:
for offset in range(0, num_samples, batch_size):
# Initialise x, y1 and y2 arrays for this batch
x, y1, y2 = (np.empty((batch_size, train_seq, train_feat)),
np.empty((batch_size, train_seq, train_feat)),
np.empty((batch_size, train_seq, train_feat)))
yield (x, y1, y2)
def main():
epsilon = 10 ** (-10)
train_batch_size = 5
loss_const, epochs, val_split = 0.05, 10, 0.25
optimizer = tf.keras.optimizers.RMSprop(clipvalue=0.9)
TRAIN_SEQ_LEN, TRAIN_FEAT_LEN = 1847, 2049
TOTAL_SMPLS = 60
# Validation & Training Split
indices = list(range(TOTAL_SMPLS))
val_indices = indices[:math.ceil(TOTAL_SMPLS * val_split)]
num_val = len(val_indices)
num_train = TOTAL_SMPLS - num_val
train_seq, train_feat = TRAIN_SEQ_LEN, TRAIN_FEAT_LEN
print('Train Input Stats:')
print('N Feat:', train_feat, 'Seq Len:', train_seq, 'Batch Size:', train_batch_size)
# Create data generators and evaluate model with them
train_generator = my_dummy_generator(num_train,
batch_size=train_batch_size, train_seq=train_seq,
train_feat=train_feat)
validation_generator = my_dummy_generator(num_val,
batch_size=train_batch_size, train_seq=train_seq,
train_feat=train_feat)
evaluate_source_sep(train_generator, validation_generator, num_train, num_val,
n_feat=train_feat, n_seq=train_seq,
batch_size=train_batch_size,
loss_const=loss_const, epochs=epochs,
optimizer=optimizer, epsilon=epsilon)
if __name__ == '__main__':
main()
Thanks for the help!
Solution: don't pass your loss into model.add_loss(). Instead, concatenate your outputs together, which lets you pass your custom loss into model.compile(), and then deal with the outputs inside the custom loss function.
class TimeFreqMasking(Layer):
# Init is for input-independent variables
def __init__(self, epsilon, **kwargs):
super(TimeFreqMasking, self).__init__(**kwargs)
self.epsilon = epsilon
# No build method, b/c passing in multiple inputs to layer (no single shape)
def call(self, inputs):
y_hat_self, y_hat_other, x_mixed = inputs
mask = tf.abs(y_hat_self) / (tf.abs(y_hat_self) + tf.abs(y_hat_other) + self.epsilon)
y_tilde_self = mask * x_mixed
return y_tilde_self
def discrim_loss(y_true, y_pred):
piano_true, noise_true = tf.split(y_true, num_or_size_splits=2, axis=-1)
loss_const = y_pred[-1, :, :][0][0]
piano_pred, noise_pred = tf.split(y_pred[:-1, :, :], num_or_size_splits=2, axis=0)
last_dim = piano_pred.shape[1] * piano_pred.shape[2]
return (
tf.math.reduce_mean(tf.reshape(noise_pred - noise_true, shape=(-1, last_dim)) ** 2) -
(loss_const * tf.math.reduce_mean(tf.reshape(noise_pred - piano_true, shape=(-1, last_dim)) ** 2)) +
tf.math.reduce_mean(tf.reshape(piano_pred - piano_true, shape=(-1, last_dim)) ** 2) -
(loss_const * tf.math.reduce_mean(tf.reshape(piano_pred - noise_true, shape=(-1, last_dim)) ** 2))
)
def make_model(features, sequences, epsilon, loss_const):
input_layer = Input(shape=(sequences, features), name='piano_noise_mixed')
x = SimpleRNN(features // 2,
activation='relu',
return_sequences=True) (input_layer)
x = SimpleRNN(features // 2,
activation='relu',
return_sequences=True) (x)
piano_hat = TimeDistributed(Dense(features), name='piano_hat') (x) # source 1 branch
noise_hat = TimeDistributed(Dense(features), name='noise_hat') (x) # source 2 branch
piano_pred = TimeFreqMasking(epsilon=epsilon,
name='piano_pred') ((piano_hat, noise_hat, input_layer))
noise_pred = TimeFreqMasking(epsilon=epsilon,
name='noise_pred') ((noise_hat, piano_hat, input_layer))
preds_and_gamma = Concatenate(axis=0) ([piano_pred,
noise_pred,
# loss_const_tensor
tf.broadcast_to(tf.constant(loss_const), [1, sequences, features])
])
model = Model(inputs=input_layer, outputs=preds_and_gamma)
model.compile(optimizer=optimizer, loss=discrim_loss)
return model
def dummy_generator(num_samples, batch_size, num_seq, num_feat):
while True:
for _ in range(0, num_samples, batch_size):
x, y1, y2 = (np.random.rand(batch_size, num_seq, num_feat),
np.random.rand(batch_size, num_seq, num_feat),
np.random.rand(batch_size, num_seq, num_feat))
yield ([x, np.concatenate((y1, y2), axis=-1)])
total_samples = 6
batch_size = 2
time_steps = 3
features = 4
loss_const = 2
epochs = 10
val_split = 0.25
epsilon = 10 ** (-10)
model = make_model(features, time_steps, epsilon, loss_const)
print(model.summary())
num_val = math.ceil(total_samples * val_split)
num_train = total_samples - num_val
train_dataset = dummy_generator(num_train, batch_size, time_steps, features)
val_dataset = dummy_generator(num_val, batch_size, time_steps, features)
model.fit(train_dataset,
steps_per_epoch=math.ceil(num_train / batch_size),
epochs=epochs,
validation_data=val_dataset,
validation_steps=math.ceil(num_val / batch_size))

Pytorch simple model not improving

I am making a simple PyTorch neural net to approximate the sine function on x = [0, 2pi]. This is a simple architecture I use with different deep learning libraries to test whether I understand how to use it or not. The neural net, when untrained, always produces a straight horizontal line, and when trained, produces a straight line at y = 0. In general, it always produces a straight line at y = (The mean of the function). This leads me to believe something is wrong with the forward prop portion of it, as the boundary should not just be a straight line when untrained. Here is the code for the net:
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.model = nn.Sequential(
nn.Linear(1, 20),
nn.Sigmoid(),
nn.Linear(20, 50),
nn.Sigmoid(),
nn.Linear(50, 50),
nn.Sigmoid(),
nn.Linear(50, 1)
)
def forward(self, x):
x = self.model(x)
return x
Here is the training loop
def train(net, trainloader, valloader, learningrate, n_epochs):
net = net.train()
loss = nn.MSELoss()
optimizer = torch.optim.SGD(net.parameters(), lr = learningrate)
for epoch in range(n_epochs):
for X, y in trainloader:
X = X.reshape(-1, 1)
y = y.view(-1, 1)
optimizer.zero_grad()
outputs = net(X)
error = loss(outputs, y)
error.backward()
#net.parameters() net.parameters() * learningrate
optimizer.step()
total_loss = 0
for X, y in valloader:
X = X.reshape(-1, 1).float()
y = y.view(-1, 1)
outputs = net(X)
error = loss(outputs, y)
total_loss += error.data
print('Val loss for epoch', epoch, 'is', total_loss / len(valloader) )
it is called as:
net = Net()
losslist = train(net, trainloader, valloader, .0001, n_epochs = 4)
Where trainloader and valloader are the training and validation loaders. Can anyone help me see what's wrong with this? I know it's not the learning rate, since it's the one I use in other frameworks, and I know it's not the fact that I'm using SGD or sigmoid activation functions, although I have a suspicion the error is in the activation functions somewhere.
Does anyone know how to fix this? Thanks.
After a while playing with some hyperparameters, modifying the net, and changing the optimizer (following this excellent recipe), I ended up changing the line optimizer = torch.optim.SGD(net.parameters(), lr = learningrate) to optimizer = torch.optim.Adam(net.parameters()) (the default optimizer parameters were used), running for 100 epochs with a batch size of 1.
The following code was used (tested on CPU only):
import torch
import torch.nn as nn
from torch.utils import data
import numpy as np
import matplotlib.pyplot as plt
# for reproducibility
torch.manual_seed(0)
np.random.seed(0)
class Dataset(data.Dataset):
def __init__(self, init, end, n):
self.n = n
self.x = np.random.rand(self.n, 1) * (end - init) + init
self.y = np.sin(self.x)
def __len__(self):
return self.n
def __getitem__(self, idx):
x = self.x[idx, np.newaxis]
y = self.y[idx, np.newaxis]
return torch.Tensor(x), torch.Tensor(y)
class Net(nn.Module):
def __init__(self):
super(Net, self).__init__()
self.model = nn.Sequential(
nn.Linear(1, 20),
nn.Sigmoid(),
nn.Linear(20, 50),
nn.Sigmoid(),
nn.Linear(50, 50),
nn.Sigmoid(),
nn.Linear(50, 1)
)
def forward(self, x):
x = self.model(x)
return x
def train(net, trainloader, valloader, n_epochs):
loss = nn.MSELoss()
# Switch the two following lines and run the code
# optimizer = torch.optim.SGD(net.parameters(), lr = 0.0001)
optimizer = torch.optim.Adam(net.parameters())
for epoch in range(n_epochs):
net.train()
for x, y in trainloader:
optimizer.zero_grad()
outputs = net(x).view(-1)
error = loss(outputs, y)
error.backward()
optimizer.step()
net.eval()
total_loss = 0
for x, y in valloader:
outputs = net(x)
error = loss(outputs, y)
total_loss += error.data
print('Val loss for epoch', epoch, 'is', total_loss / len(valloader) )
net.eval()
f, (ax1, ax2) = plt.subplots(1, 2, sharey=True)
def plot_result(ax, dataloader):
out, xx, yy = [], [], []
for x, y in dataloader:
out.append(net(x))
xx.append(x)
yy.append(y)
out = torch.cat(out, dim=0).detach().numpy().reshape(-1)
xx = torch.cat(xx, dim=0).numpy().reshape(-1)
yy = torch.cat(yy, dim=0).numpy().reshape(-1)
ax.scatter(xx, yy, facecolor='green')
ax.scatter(xx, out, facecolor='red')
xx = np.linspace(0.0, 3.14159*2, 1000)
ax.plot(xx, np.sin(xx), color='green')
plot_result(ax1, trainloader)
plot_result(ax2, valloader)
plt.show()
train_dataset = Dataset(0.0, 3.14159*2, 100)
val_dataset = Dataset(0.0, 3.14159*2, 30)
params = {'batch_size': 1,
'shuffle': True,
'num_workers': 4}
trainloader = data.DataLoader(train_dataset, **params)
valloader = data.DataLoader(val_dataset, **params)
net = Net()
losslist = train(net, trainloader, valloader, n_epochs = 100)
Result with Adam optimizer:
Result with SGD optimizer:
In general, it always produces a straight line at y = (The mean of the function).
Usually, this means that the NN has only successfully trained the final layer so far. You need to train it for longer or with better optimization, as ViniciusArruda shows here.
Edit: To explain further: when only the final layer has been trained, the NN is effectively trying to guess the output y with no knowledge of the input X. In this case, the best guess it can make is the mean value of the targets, since that is the constant that minimizes its MSE loss.
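That last claim is easy to check numerically (a quick NumPy sanity check, independent of the model above): among all constant predictions, the mean of the targets gives the lowest MSE.
import numpy as np

y = np.sin(np.linspace(0.0, 2 * np.pi, 1000))   # the targets
candidates = np.linspace(-1.0, 1.0, 201)        # constant guesses
mse = [np.mean((y - c) ** 2) for c in candidates]
best = candidates[np.argmin(mse)]
print(best, y.mean())  # the best constant guess is (approximately) the mean of y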

Keras vs PyTorch LSTM different results

Trying to get similar results on the same dataset with Keras and PyTorch.
Data
from numpy import array
from numpy import hstack
from sklearn.model_selection import train_test_split
# split a multivariate sequence into samples
def split_sequences(sequences, n_steps):
X, y = list(), list()
for i in range(len(sequences)):
# find the end of this pattern
end_ix = i + n_steps
# check if we are beyond the dataset
if end_ix > len(sequences):
break
# gather input and output parts of the pattern
seq_x, seq_y = sequences[i:end_ix, :-1], sequences[end_ix-1, -1]
X.append(seq_x)
y.append(seq_y)
return array(X), array(y)
def get_data():
# define input sequence
in_seq1 = array([x for x in range(0,500,10)])/1
in_seq2 = array([x for x in range(5,505,10)])/1
out_seq = array([in_seq1[i]+in_seq2[i] for i in range(len(in_seq1))])
# convert to [rows, columns] structure
in_seq1 = in_seq1.reshape((len(in_seq1), 1))
in_seq2 = in_seq2.reshape((len(in_seq2), 1))
out_seq = out_seq.reshape((len(out_seq), 1))
# horizontally stack columns
dataset = hstack((in_seq1, in_seq2, out_seq))
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# convert into input/output
X, y = split_sequences(dataset, n_timesteps)
print(X.shape, y.shape)
X_train,x_test,Y_train, y_test = train_test_split(X,y,test_size = 0.2,shuffle=False)
return X_train,x_test,Y_train, y_test
Keras
import keras
from keras.models import Sequential
from keras.layers import LSTM
from keras.layers import Dense
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!!change this!!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
# define model
model = Sequential()
model.add(LSTM(1024, activation='relu',
input_shape=(n_timesteps, n_features),
kernel_initializer='uniform',
recurrent_initializer='uniform'))
model.add(Dense(512, activation='relu'))
model.add(Dense(1))
opt = keras.optimizers.Adam(lr=0.001,
beta_1=0.9,
beta_2=0.999,
epsilon=keras.optimizers.K.epsilon(),
decay=0.0,
amsgrad=False)
model.compile(optimizer=opt, loss='mse')
# fit model
model.fit(X_train, Y_train, epochs=200, verbose=1,validation_data=(x_test,y_test))
yhat = model.predict(x_test, verbose=0)
mean_squared_error(y_test, yhat)
PyTorch - module class
import numpy as np
import torch
import torch.nn.functional as F
from sklearn.metrics import mean_squared_error
import testing.TimeSeries.datacreator as dc # !!!! change this !!!!
X_train,x_test,Y_train, y_test = dc.get_data()
n_features = 2 # this is number of parallel inputs
n_timesteps = 3 # this is number of timesteps
class MV_LSTM(torch.nn.Module):
def __init__(self,n_features,seq_length):
super(MV_LSTM, self).__init__()
self.n_features = n_features # number of parallel inputs
self.seq_len = seq_length # number of timesteps
self.n_hidden = 1024 # number of hidden states
self.n_layers = 1 # number of LSTM layers (stacked)
self.l_lstm = torch.nn.LSTM(input_size = n_features,
hidden_size = self.n_hidden,
num_layers = self.n_layers,
batch_first = True)
# according to pytorch docs LSTM output is
# (batch_size,seq_len, num_directions * hidden_size)
# when considering batch_first = True
self.l_linear = torch.nn.Linear(self.n_hidden*self.seq_len, 512)
# self.l_linear1 = torch.nn.Linear(512, 512)
self.l_linear2 = torch.nn.Linear(512, 1)
def init_hidden(self, batch_size):
# even with batch_first = True this remains same as docs
hidden_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
cell_state = torch.zeros(self.n_layers,batch_size,self.n_hidden).to(next(self.parameters()).device)
self.hidden = (hidden_state, cell_state)
def forward(self, x):
batch_size, seq_len, _ = x.size()
lstm_out, self.hidden = self.l_lstm(x,self.hidden)
# lstm_out(with batch_first = True) is
# (batch_size,seq_len,num_directions * hidden_size)
# for following linear layer we want to keep batch_size dimension and merge rest
# .contiguous() -> solves tensor compatibility error
x = lstm_out.contiguous().view(batch_size,-1)
x = F.relu(x)
x = F.relu(self.l_linear(x))
# x = F.relu(self.l_linear1(x))
x = self.l_linear2(x)
return x
PyTorch - init and train
# create NN
mv_net = MV_LSTM(n_features,n_timesteps)
criterion = torch.nn.MSELoss()
import keras # for epsilon constant
optimizer = torch.optim.Adam(mv_net.parameters(),
lr=1e-3,
betas=[0.9,0.999],
eps=keras.optimizers.K.epsilon(),
weight_decay=0,
amsgrad=False)
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
mv_net.to(device)
train_episodes = 200
batch_size = 32
eval_batch_size = 32
for t in range(train_episodes):
# TRAIN
mv_net.train()
for b in range(0,len(X_train),batch_size):
inpt = X_train[b:b+batch_size,:,:]
target = Y_train[b:b+batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
loss = criterion(output.view(-1), y_batch)
loss.backward()
optimizer.step()
optimizer.zero_grad()
# EVAL
mv_net.eval()
mv_net.init_hidden(eval_batch_size)
acc = 0
for b in range(0,len(x_test),eval_batch_size):
inpt = x_test[b:b+eval_batch_size,:,:]
target = y_test[b:b+eval_batch_size]
x_batch = torch.tensor(inpt,dtype=torch.float32).to(device)
y_batch = torch.tensor(target,dtype=torch.float32).to(device)
mv_net.init_hidden(x_batch.size(0))
output = mv_net(x_batch)
acc += mean_squared_error(y_batch.cpu().detach().numpy(), output.view(-1).cpu().detach().numpy())
print('step:' , t , 'train loss:' , round(loss.item(),3),'eval acc:',round(acc/len(x_test),3))
mv_net.init_hidden(len(x_test))
val = torch.tensor(x_test,dtype=torch.float32).to(device)
otp = mv_net(val)
print(mean_squared_error(y_test, otp.view(-1).cpu().detach().numpy()))
Results
Keras produces test MSE almost 0, but PyTorch about 6000, which is way too different
I have tried a couple of tweaks in the PyTorch code, but none got me anywhere close to the Keras results, even with identical optimizer params.
I can't see what is wrong with the (fairly tutorial-like) PyTorch code.
I know it is almost one year too late, but I came across the same problem and I think the cause is the following. The Keras documentation says:
return_sequences: Boolean. Whether to return the last output in the
output sequence, or the full sequence.
This basically means that, because the Keras LSTM only returns the last output (return_sequences defaults to False), your self.l_linear needs to be torch.nn.Linear(self.n_hidden, 512), i.e. Linear(1024, 512), instead of Linear(self.n_hidden * self.seq_len, 512).
Now you also need to do the same as keras does and only use the last output in your forward pass:
def forward(self, x):
    batch_size, seq_len, _ = x.size()
    lstm_out, self.hidden = self.l_lstm(x, self.hidden)
    x = lstm_out[:, -1]
    x = torch.nn.functional.relu(x)
    x = torch.nn.functional.relu(self.l_linear(x))
    x = self.l_linear2(x)
    return x
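For that forward pass to work, the first linear layer has to be changed as described above; a one-line sketch of that change (assuming you keep the 512-unit hidden layer from the question):
# in MV_LSTM.__init__: only the last hidden state (1024 values) feeds the dense layers now
self.l_linear = torch.nn.Linear(self.n_hidden, 512)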
When I run your example (which I needed to tweak a bit to get it running), I get very similar training losses.
Keras:
38/38 [==============================] - 0s 6ms/step - loss: 67.6081 - val_loss: 325.9259
PyTorch:
step: 199 train loss: 41.043 eval acc: 1142.688
I hope this helps others having a similar problem.
PS: also note that Keras resets the hidden state between batches (stateful=False) by default.
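If you want the PyTorch side to mirror that default exactly, you can simply not pass a hidden state at all: torch.nn.LSTM zero-initialises it on every call. A small sketch of the same forward pass under that assumption (it presumes self.l_linear was changed to Linear(self.n_hidden, 512) as above):
def forward(self, x):
    # no explicit (h_0, c_0): PyTorch uses zeros on each call, like Keras' stateful=False
    lstm_out, _ = self.l_lstm(x)
    x = torch.nn.functional.relu(lstm_out[:, -1])  # last timestep only, like return_sequences=False
    x = torch.nn.functional.relu(self.l_linear(x))
    return self.l_linear2(x)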

XOR neural network does not learn

I am trying to solve a very simple non-linear problem: the XOR gate.
From my school knowledge, XOR can be solved by using 2 input nodes, 2 hidden-layer nodes, and 1 output. It is a binary classification problem.
I generate 1000 random integers that are 0 or 1 and then do backpropagation. But for some unknown reason my network has not learned anything; the training accuracy is constant at 50%.
# coding: utf-8
import matplotlib
import torch
import torch.nn as nn
from torch.autograd import Variable
matplotlib.use('TkAgg') # My buggy OSX 10.13.6 requires this
import matplotlib.pyplot as plt
from torch.utils.data import Dataset
from tqdm import tqdm
import random
N = 1000
batch_size = 10
epochs = 40
hidden_size = 2
output_size = 1
lr = 0.1
def return_xor(N):
tmp_x = []
tmp_y = []
for i in range(N):
a = (random.randint(0, 1) == 1)
b = (random.randint(0, 1) == 1)
if (a and not b) or (not a and b):
q = True
else:
q = False
input_features = (a, b)
output_class = q
tmp_x.append(input_features)
tmp_y.append(output_class)
return tmp_x, tmp_y
# In[495]:
# Training set
x, y = return_xor(N)
x = torch.tensor(x, dtype=torch.float, requires_grad=True)
y = torch.tensor(y, dtype=torch.float, requires_grad=True)
# Test dataset
x_test, y_test = return_xor(100)
x_test = torch.tensor(x_test)
y_test = torch.tensor(y_test)
class MyDataset(Dataset):
"""Define my own `Dataset` in order to use `Variable` with `autograd`"""
def __init__(self, x, y):
self.x = x
self.y = y
def __getitem__(self, index):
return self.x[index], self.y[index]
def __len__(self):
return len(self.x)
dataset = MyDataset(x, y)
test_dataset = MyDataset(x_test, y_test)
print(dataset.x.shape)
print(dataset.y.shape)
# Make data iterable by loading to a loader. Shuffle, batch_size kwargs put them here in order to remind I myself
train_loader = torch.utils.data.DataLoader(dataset=dataset, batch_size=batch_size, shuffle=True)
test_loader = torch.utils.data.DataLoader(dataset=test_dataset, batch_size=batch_size, shuffle=False)
print(f"They are {len(train_loader)} batches in the dataset")
shown = 0
for (x, y) in train_loader:
if shown == 1:
break
print(f"{x.shape} {x.dtype}")
print(f"{y.shape} {y.dtype}")
shown += 1
class MyModel(nn.Module):
"""
Binary classification
2 input nodes
2 hidden nodes
1 output node
"""
def __init__(self, input_size, hidden_size, output_size):
super().__init__()
self.fc1 = torch.nn.Linear(input_size, hidden_size)
self.fc2 = torch.nn.Linear(hidden_size, output_size)
self.sigmoid = torch.nn.Sigmoid()
def forward(self, out):
out = self.fc1(out)
out = self.fc2(out)
out = self.sigmoid(out)
return out
# Create my network
net = MyModel(dataset.x.shape[1], hidden_size, output_size)
CUDA = torch.cuda.is_available()
if CUDA:
net = net.cuda()
criterion = torch.nn.BCELoss(reduction='elementwise_mean')
optimizer = torch.optim.SGD(net.parameters(), lr=lr)
# Train the network
correct_train = 0
total_train = 0
for epoch in range(epochs):
for i, (batches, labels) in enumerate(train_loader):
batcesh = Variable(batches.float())
labels = Variable(labels.float())
output = net(batches) # Forward pass
optimizer.zero_grad()
loss = criterion(output, labels.view(10, 1))
loss.backward()
optimizer.step()
total_train += labels.size(0)
correct_train += (predicted == labels.long()).sum()
if (i + 1) % 10 == 0:
print(f"""
Epoch {epoch+1}/{epochs},
Iteration {i+1}/{len(dataset)//batch_size},
Training Loss: {loss.item()},
Training Accuracy: {100*correct_train/total_train}
""")
Solution:
I initialized the weights and used an adaptive learning rate:
https://github.com/elcolie/nnbootcamp/blob/master/Study-XOR.ipynb
I am not sure what results you are getting, as the code you have posted in the question doesn't work (it gives errors with PyTorch 0.4.1, like predicted not being defined, etc.). But syntax issues apart, there are other problems.
Your model is not actually two-layer, as it does not use a non-linearity after the first layer's output. Effectively it is a one-layer network; to fix that, you can modify your model's forward as follows:
def forward(self, out):
    out = torch.nn.functional.relu(self.fc1(out))
    out = self.fc2(out)
    out = self.sigmoid(out)
    return out
You can try sigmoid or tanh non-linearity as well... but the non-linearity is a must. This should fix the problem.
I also see that you are using only 2 hidden units. This might be restrictive and you might want to increase that to something like 5 or 10.
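For reference, a minimal end-to-end sketch along those lines (a non-linearity after the first layer and a slightly wider hidden layer); it is not taken from the linked notebook, just an illustration that XOR becomes learnable once the hidden non-linearity is there:
import torch
import torch.nn as nn

torch.manual_seed(0)

# the four XOR patterns and their labels
X = torch.tensor([[0., 0.], [0., 1.], [1., 0.], [1., 1.]])
y = torch.tensor([[0.], [1.], [1.], [0.]])

model = nn.Sequential(nn.Linear(2, 10), nn.ReLU(), nn.Linear(10, 1), nn.Sigmoid())
criterion = nn.BCELoss()
optimizer = torch.optim.Adam(model.parameters(), lr=0.01)

for epoch in range(2000):
    optimizer.zero_grad()
    loss = criterion(model(X), y)
    loss.backward()
    optimizer.step()

print(model(X).detach().round().view(-1))  # typically tensor([0., 1., 1., 0.])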

CNN with Tensorflow, low accuracy on CIFAR-10 and not improving

On running the first training epoch of a 3-layer convnet on CIFAR-10, I am neither able to achieve a high enough validation accuracy nor minimize the objective function.
Specifically, the accuracy varies on the first iteration, and then settles at 8.7% for the following iterations. What's peculiar is that I've also trained a 2-layer, fully-connected network which does substantially better, consistently getting around 43% accuracy on the validation set.
NOTE: The bulk of the code is from a Jupyter notebook designed as an introduction to barebones TensorFlow (and Keras), provided as part of an assignment for Stanford's CS231n Convolutional Neural Networks for Visual Recognition. Although I am neither a student of the course nor of the university, I am doing this purely for experiential purposes and out of my newfound interest in CV / deep learning.
My contribution is only implementations for the forward pass and the initialization of the network's parameters.
The author of the notebook left a comment stating that when correctly implemented this model should achieve above 40% accuracy after the first epoch without any hyperparameter tuning.
Implementation Notes
49,000 / 1000 : train/validation split, batch size = 64
Weights are initialized using Kaiming normalization, bias initialized with 0s
learning rate = 3e-3
Here are each of the layers of convnet in detail:
Convolutional layer (with bias) with 32 5x5 filters, with zero-padding 2
ReLU
Convolutional layer (with bias) with 16 3x3 filters, with zero-padding 1
ReLU
Fully-connected layer (with bias) to compute scores for 10 classes
Code
( mine is written between the 'TODO' comment blocks )
import tensorflow as tf
import numpy as np
def load_cifar10(num_training=49000, num_validation=1000, num_test=10000):
cifar10 = tf.keras.datasets.cifar10.load_data()
(X_train, y_train), (X_test, y_test) = cifar10
X_train = np.asarray(X_train, dtype=np.float32)
y_train = np.asarray(y_train, dtype=np.int32).flatten()
X_test = np.asarray(X_test, dtype=np.float32)
y_test = np.asarray(y_test, dtype=np.int32).flatten()
mask = range(num_training, num_training + num_validation)
X_val = X_train[mask]
y_val = y_train[mask]
mask = range(num_training)
X_train = X_train[mask]
y_train = y_train[mask]
mask = range(num_test)
X_test = X_test[mask]
y_test = y_test[mask]
mean_pixel = X_train.mean(axis=(0, 1, 2), keepdims=True)
std_pixel = X_train.std(axis=(0, 1, 2), keepdims=True)
X_train = (X_train - mean_pixel) / std_pixel
X_val = (X_val - mean_pixel) / std_pixel
X_test = (X_test - mean_pixel) / std_pixel
return X_train, y_train, X_val, y_val, X_test, y_test
class Dataset(object):
def __init__(self, X, y, batch_size, shuffle=False):
assert X.shape[0] == y.shape[0], 'Got different numbers of data and labels'
self.X, self.y = X, y
self.batch_size, self.shuffle = batch_size, shuffle
def __iter__(self):
N, B = self.X.shape[0], self.batch_size
idxs = np.arange(N)
if self.shuffle:
np.random.shuffle(idxs)
return iter((self.X[i:i+B], self.y[i:i+B]) for i in range(0, N, B))
def flatten(x):
N = tf.shape(x)[0]
return tf.reshape(x, (N, -1))
def three_layer_convnet(x, params):
conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b = params
scores = None
############################################################################
# TODO: Implement the forward pass for the three-layer ConvNet. #
############################################################################
h1_conv = tf.nn.conv2d(x,
conv_w1 + conv_b1,
strides=[1, 1, 1, 1],
padding='SAME'
)
h1 = tf.nn.relu(h1_conv)
h2_conv = tf.nn.conv2d(h1,
conv_w2 + conv_b2,
strides=[1, 1, 1, 1],
padding='SAME'
)
h2 = tf.nn.relu(h2_conv)
fc_params = flatten(fc_w + fc_b)
h2 = flatten(h2)
scores = tf.matmul(h2, fc_params)
############################################################################
# END OF YOUR CODE #
############################################################################
return scores
def training_step(scores, y, params, learning_rate):
losses = tf.nn.sparse_softmax_cross_entropy_with_logits(labels=y, logits=scores)
loss = tf.reduce_mean(losses)
grad_params = tf.gradients(loss, params)
new_weights = []
for w, grad_w in zip(params, grad_params):
new_w = tf.assign_sub(w, learning_rate * grad_w)
new_weights.append(new_w)
with tf.control_dependencies(new_weights):
return tf.identity(loss)
def check_accuracy(sess, dset, x, scores, is_training=None):
num_correct, num_samples = 0, 0
for x_batch, y_batch in dset:
feed_dict = {x: x_batch, is_training: 0}
scores_np = sess.run(scores, feed_dict=feed_dict)
y_pred = scores_np.argmax(axis=1)
num_samples += x_batch.shape[0]
num_correct += (y_pred == y_batch).sum()
acc = float(num_correct) / num_samples
print('Got %d / %d correct (%.2f%%)' % (num_correct, num_samples, 100 * acc))
def kaiming_normal(shape):
if len(shape) == 2:
fan_in, fan_out = shape[0], shape[1]
elif len(shape) == 4:
fan_in, fan_out = np.prod(shape[:3]), shape[3]
return tf.random_normal(shape) * np.sqrt(2.0 / fan_in)
def three_layer_convnet_init():
params = None
############################################################################
# TODO: Initialize the parameters of the three-layer network. #
############################################################################
conv_w1 = tf.Variable(kaiming_normal((5, 5, 3, 32)))
conv_b1 = tf.Variable(tf.zeros((32,)))
conv_w2 = tf.Variable(kaiming_normal((3, 3, 32, 16)))
conv_b2 = tf.Variable(tf.zeros((16,)))
fc_w = tf.Variable(kaiming_normal((32 * 32 * 16, 10)))
fc_b = tf.Variable(tf.zeros((10,)))
params = [conv_w1, conv_b1, conv_w2, conv_b2, fc_w, fc_b]
############################################################################
# END OF YOUR CODE #
############################################################################
return params
def main():
learning_rate = 3e-3
tf.reset_default_graph()
is_training = tf.placeholder(tf.bool, name='is_training')
X_train, y_train, X_val, y_val, X_test, y_test = load_cifar10()
train_dset = Dataset(X_train, y_train, batch_size=64, shuffle=True)
test_dset = Dataset(X_test, y_test, batch_size=64)
val_dset = Dataset(X_val, y_val, batch_size=64, shuffle=False)
print('Train data shape: ', X_train.shape)
print('Train labels shape: ', y_train.shape, y_train.dtype)
print('Validation data shape: ', X_val.shape)
print('Validation labels shape: ', y_val.shape)
print('Test data shape: ', X_test.shape)
print('Test labels shape: ', y_test.shape)
device = '/cpu:0'
with tf.device(device):
x = tf.placeholder(tf.float32, [None, 32, 32, 3])
y = tf.placeholder(tf.int32, [None])
params = three_layer_convnet_init()
scores = three_layer_convnet(x, params)
loss = training_step(scores, y, params, learning_rate)
with tf.Session() as sess:
sess.run(tf.global_variables_initializer())
for t, (x_np, y_np) in enumerate(train_dset):
feed_dict = {x: x_np, y: y_np}
loss_np = sess.run(loss, feed_dict=feed_dict)
if t % 100 == 0:
print('Iteration %d, loss = %.4f' % (t, loss_np))
check_accuracy(sess, val_dset, x, scores, is_training)
if __name__=="__main__":
main()
EDIT: removed unnecessary comments and code
The problem is here:
h1_conv = tf.nn.conv2d(x,
                       conv_w1 + conv_b1,
                       strides=[1, 1, 1, 1],
                       padding='SAME')
This is wrong, as here you are adding the bias values (conv_b1) to the filter conv_w1, but the bias has to be added to the output of the conv layer. The right way would be something like this:
h1_conv = tf.nn.conv2d(x,
                       conv_w1,
                       strides=[1, 1, 1, 1],
                       padding='SAME')
h1_bias = tf.nn.bias_add(h1_conv, conv_b1)
h1 = tf.nn.relu(h1_bias)
And correct it for h2 too.
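For completeness, the same correction applied to the second convolution (and, although the answer does not mention it, the fully-connected bias should likewise be added to the matmul output rather than to fc_w) would look roughly like this:
h2_conv = tf.nn.conv2d(h1,
                       conv_w2,
                       strides=[1, 1, 1, 1],
                       padding='SAME')
h2_bias = tf.nn.bias_add(h2_conv, conv_b2)
h2 = tf.nn.relu(h2_bias)
scores = tf.matmul(flatten(h2), fc_w) + fc_b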
