I am developing a shallow fully connected ANN from scratch that learns with the gradient descent with momentum algorithm. This is the code:
import numpy as np
from scipy.special import expit, xlog1py
def softmax(y):
    e_y = np.exp(y - np.max(y))
    return e_y / e_y.sum()

def cross_entropy(y, t, derivative=False, post_process=True):
    if post_process:
        if derivative:
            return y - t
        return -np.sum(np.sum(xlog1py(t, softmax(y)), axis=0))

def sigmoid(a, derivative=False):
    f_a = expit(-a)
    df_a = np.multiply(f_a, (1 - f_a))  # element-wise
    if derivative:
        return df_a
    return f_a

def identity(a, derivative=False):
    f_a = a
    df_a = np.ones(np.shape(a))
    if derivative:
        return df_a
    return f_a

def generate_data(n_items, n_features, n_classes):
    X = np.asmatrix(np.random.normal(size=(n_items, n_features)))
    targets = np.asarray(np.random.randint(n_classes, size=n_items))
    targets = one_hot(targets)
    return X, targets

def one_hot(targets):
    return np.asmatrix(np.eye(np.max(targets) + 1)[targets]).T
class NeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def build(self):
        for i, layer in enumerate(self.layers):
            if i == 0:
                layer.type = "input"
            else:
                layer.type = "output" if i == len(self.layers) - 1 else "hidden"
                layer.configure(self.layers[i - 1].neurons)

    def fit(self, X, targets):
        MAX_EPOCHS = 200
        epoch_loss = []
        # batch mode
        for epoch in range(MAX_EPOCHS):
            predictions = self.predict(X)
            self.back_prop(targets, cross_entropy)
            self.learning_rule(l_rate=0.01, momentum=0.01)
            loss = cross_entropy(predictions, targets)
            epoch_loss.append(loss)
            print("E(%d) on TrS is:" % epoch, loss)

    # Columns of predictions
    def predict(self, dataset):
        z = dataset.T
        for layer in self.layers:
            z = layer.forward_prop_step(z)
        return z

    def back_prop(self, target, loss):
        for i, layer in enumerate(self.layers[:0:-1]):
            next_layer = self.layers[-i]
            prev_layer = self.layers[-i - 2]
            layer.back_prop_step(next_layer, prev_layer, target, loss)

    def learning_rule(self, l_rate, momentum):
        # Momentum GD
        for layer in [layer for layer in self.layers if layer.type != "input"]:
            layer.update_weights(l_rate, momentum)
            layer.update_bias(l_rate, momentum)
class Layer:
    def __init__(self, neurons, type=None, activation=None):
        self.dE_dW = 0
        self.dE_db = 0
        self.dEn_db = None
        self.dEn_dW = None
        self.dact_a = None
        self.out = None
        self.weights = None
        self.bias = None
        self.w_sum = None
        self.neurons = neurons
        self.type = type
        self.activation = activation
        self.deltas = None

    def configure(self, prev_layer_neurons):
        self.weights = np.asmatrix(np.random.normal(-1, 1, (self.neurons, prev_layer_neurons)))
        self.bias = np.asmatrix(np.random.normal(-1, 1, self.neurons)).T  # column vector
        if self.activation is None:
            # universal approximation theorem
            if self.type == "hidden":
                self.activation = sigmoid
            elif self.type == "output":
                self.activation = identity

    def forward_prop_step(self, z):
        if self.type == "input":
            self.out = z
        else:
            self.w_sum = np.dot(self.weights, z) + self.bias
            self.out = self.activation(self.w_sum)
        return self.out

    def back_prop_step(self, next_layer, prev_layer, target, local_loss):
        if self.type == "output":
            self.dact_a = self.activation(self.w_sum, derivative=True)
            self.deltas = np.multiply(self.dact_a,
                                      local_loss(self.out, target, derivative=True))  # (c, batch_size)
        else:
            self.dact_a = self.activation(self.w_sum, derivative=True)  # (m, batch_size)
            debug = np.dot(next_layer.weights.T, next_layer.deltas)  # <<<< problem here
            self.deltas = np.multiply(self.dact_a, debug)
        self.dEn_dW = self.deltas * prev_layer.out.T
        self.dEn_db = self.deltas
        self.dE_dW = self.dEn_dW
        self.dE_db = self.dEn_db

    def update_weights(self, l_rate, momentum):
        # Momentum GD
        self.weights = self.weights - l_rate * self.dE_dW
        self.weights = -l_rate * self.dE_dW + momentum * self.weights

    def update_bias(self, l_rate, momentum):
        # Momentum GD
        self.bias = self.bias - l_rate * self.dE_db
        self.bias = -l_rate * self.dE_db + momentum * self.bias
if __name__ == '__main__':
    # Dog:   0 -> 100
    # Cat:   1 -> 010
    # Mouse: 2 -> 001
    net = NeuralNetwork()
    d = 4  # n_features
    c = 3  # n_classes
    n_items = 10  # increasing this gives NaN in the EBP formula, in the debug variable
    for m in (d, 4, c):
        layer = Layer(m)
        net.add_layer(layer)
    net.build()
    X, targets = generate_data(n_items=n_items, n_features=d, n_classes=c)
    net.fit(X, targets)
If the n_items value is low, such as 10 or 100, learning works properly:
E(0) on TrS is: -0.27547576455869305
E(1) on TrS is: -0.33774479466660445
E(2) on TrS is: -0.3295771015279694
...
E(199) on TrS is: -0.33026951829371987
Unfortunately, when n_items gets bigger, such as 1000, I get this error:
RuntimeWarning: invalid value encountered in multiply
self.deltas = np.multiply(self.dact_a, debug)
and:
E(0) on TrS is: -0.3337489007828587
E(1) on TrS is: -0.01614463421285259
E(2) on TrS is: -0.33594156066981384
E(3) on TrS is: -0.11378512597000995
E(4) on TrS is: -0.33508867936192843
E(5) on TrS is: -0.33276323614435077
E(6) on TrS is: -0.33310949105565746
E(7) on TrS is: -0.224661060748479
E(8) on TrS is: -0.3321560115270673
E(9) on TrS is: -0.22289014654421438
...
E(138) on TrS is: nan
E(139) on TrS is: nan
E(140) on TrS is: nan
E(141) on TrS is: nan
...
E(199) on TrS is: nan
I think this is caused by the debug variable, which grows until it reaches sys.float_info.max, which is 1.7976931348623157e+308.
How can I solve this problem?
It looks like you have exploding gradients. Maybe try some regularisation.
A couple of general things, though. Your weight initialisation is going to give very large starting values, which can lead to exploding gradients by itself.
Instead, consider something more like:
np.random.normal(-0.1,0.02,...
The following point doesn't affect the weights, but it is also worth looking at the logic of some of your methods. For instance, sigmoid always calculates the derivative whether it is used or not. Perhaps use two methods instead (one job per method), or at least calculate the derivative inside the if:
def sigmoid(a, derivative=False):
    f_a = expit(-a)
    if derivative:
        df_a = np.multiply(f_a, (1 - f_a))  # element-wise
        return df_a
    return f_a
For more about exploding gradients and weight initialisation, see https://medium.com/usf-msds/deep-learning-best-practices-1-weight-initialization-14e5c0295b94 .
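As a concrete sketch of the initialisation point above (the fan-in scaling is a common Xavier-style choice; the exact numbers are my assumption, not something from your code):

def configure(self, prev_layer_neurons):
    # hypothetical variant: zero-mean weights, std shrinking with fan-in
    scale = 1.0 / np.sqrt(prev_layer_neurons)
    self.weights = np.asmatrix(np.random.normal(0, scale, (self.neurons, prev_layer_neurons)))
    self.bias = np.asmatrix(np.zeros(self.neurons)).T  # biases can simply start at zero
    # ... activation selection as in your original configure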
Related
I am developing my first feed-forward fully connected ANN from scratch, trained in batch mode on a toy training set. I am using back-propagation to calculate the gradient of the loss function with respect to the weights and biases, and gradient descent as the learning rule. But when I print the training loss, it gets bigger as the epochs increase:
E(0) on TrS is: [[7.83898769]]
E(1) on TrS is: [[10.00738465]]
E(2) on TrS is: [[10.76653098]]
E(3) on TrS is: [[15.94001008]]
E(4) on TrS is: [[23.80650667]]
E(5) on TrS is: [[28.65805023]]
E(6) on TrS is: [[29.56550719]]
E(7) on TrS is: [[30.5424694]]
E(8) on TrS is: [[34.26980112]]
E(9) on TrS is: [[39.9948856]]
This is my loss_functions.py file:
import numpy as np

def sum_of_squares(c, t, y, derivative=False):
    ret = 0
    for k in range(c):
        ret += np.square(y - t)
    ret = 1 / 2 * ret
    if derivative:
        return y - t
    return ret
And this is my activation_functions.py file:
import numpy as np

def sigmoid(a, derivative=False):
    f_a = 1 / (1 + np.exp(-a))
    df_a = np.multiply(f_a, (1 - f_a))
    if derivative:
        return df_a
    return f_a

def identity(a, derivative=False):
    f = a
    df = np.ones(np.shape(a))
    if derivative:
        return df
    return f
And this is the main.py file:
from activation_functions import *
from loss_functions import *

class NeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def create(self):
        for i, layer in enumerate(self.layers):
            if i == 0:
                layer.type = "input"
            else:
                if i == len(self.layers) - 1:
                    layer.type = "output"
                else:
                    layer.type = "hidden"
                layer.configure(self.layers[i - 1].neurons)

    def train(self, X, targets):
        MAX_EPOCHS = 10
        loss_function = sum_of_squares
        E = 0  # error over the whole dataset
        for epoch in range(MAX_EPOCHS):
            for i, x in enumerate(X):
                target = targets[i]
                prediction = self.forward_prop(x.T)
                E_n = loss_function(c, target, prediction)
                E += E_n
                self.back_prop(target, local_loss=sum_of_squares)
            print("E(%d) on TrS is:" % epoch, E)  # increasing!!!
            self.learning_rule(l_rate=0.05)

    def forward_prop(self, z):
        for layer in self.layers:
            z = layer.forward_prop_step(z)
        return z

    def back_prop(self, target, local_loss):
        for i, layer in enumerate(self.layers[:0:-1]):
            next_layer = self.layers[-i]
            prev_layer = self.layers[-i - 2]
            layer.back_prop_step(next_layer, prev_layer, target, local_loss)

    def learning_rule(self, l_rate):
        # GD
        for layer in self.layers:
            if layer.type != "input":
                layer.weight -= l_rate * layer.dE_dW
                layer.bias -= l_rate * layer.dE_db
class Layer:
    def __init__(self, neurons, type=None, activation=None):
        self.dE_dW = 0
        self.dE_db = 0
        self.dEn_db = None  # based on the n-th item
        self.dEn_dW = None  # based on the n-th item
        self.dact_a = None
        self.out = None
        self.weight = None
        self.bias = None
        self.w_sum = None
        self.neurons = neurons
        self.type = type
        self.activation = activation
        self.deltas = None

    def configure(self, prev_layer_neurons):
        self.weight = np.asmatrix(np.random.normal(0, 0.5, (self.neurons, prev_layer_neurons)))
        self.bias = np.asmatrix(np.random.normal(0, 0.5, self.neurons)).T
        if self.activation is None:
            if self.type == "hidden":
                self.activation = sigmoid
            elif self.type == "output":
                self.activation = identity

    def forward_prop_step(self, z):
        if self.type == "input":
            self.out = z
        else:
            self.w_sum = np.dot(self.weight, z) + self.bias
            self.out = self.activation(self.w_sum)
        return self.out

    def back_prop_step(self, next_layer, prev_layer, target, local_loss):
        if self.type == "input":
            pass
        elif self.type == "output":
            self.dact_a = self.activation(self.w_sum, derivative=True)
            self.deltas = np.multiply(self.dact_a, local_loss(c, target, self.out, derivative=True))
        else:
            self.dact_a = self.activation(self.w_sum, derivative=True)
            self.deltas = np.multiply(self.dact_a, np.dot(next_layer.weight.T, next_layer.deltas))
        self.dEn_dW = np.dot(self.deltas, prev_layer.out.T)
        self.dEn_db = self.deltas
        self.dE_dW += self.dEn_dW
        self.dE_db += self.dEn_db
if __name__ == '__main__':
    net = NeuralNetwork()
    for m in (2, 4, 4, 1):
        net.add_layer(Layer(m))
    net.create()
    X = np.asmatrix([
        [1, 0],
        [1, 1],
        [0, 1],
        [0, 0]
    ])
    targets = np.asarray([1, 0, 0, 0])
    net.train(X, targets)
What I did to try to fix the problem:
Check for any bugs
Decrease the learning rate (l_rate)
Increase the MAX_EPOCHS value
Replace the - symbol with + in the GD formula
Unfortunately, none of these worked. There must be a hidden bug somewhere in the code...
How can I resolve the issue?
Your task is classification, but you have chosen the sum-of-squares loss function, which is appropriate for regression tasks. You should use the cross-entropy loss function instead.
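A minimal sketch of what that could look like (one common formulation composing softmax with cross-entropy; the epsilon guarding log(0) and the exact signature are my choices, so adapt them to your local_loss convention):

import numpy as np

def softmax(y):
    # column-wise, numerically stable softmax for (c, batch_size) ndarrays
    e_y = np.exp(y - np.max(y, axis=0, keepdims=True))
    return e_y / np.sum(e_y, axis=0, keepdims=True)

def cross_entropy(y, t, derivative=False):
    # y: raw network outputs, t: one-hot targets, both (c, batch_size)
    if derivative:
        # gradient of cross-entropy composed with softmax w.r.t. y
        return softmax(y) - t
    return -np.sum(np.multiply(t, np.log(softmax(y) + 1e-12)))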
I am implementing a Neural Network's forward propagation.
This is my train method:
def train(self, x, y):
    for layer in self.layers:
        prediction = layer.forward_prop(x.T)  # column vector
And this is the forward_prop method:
def forward_prop(self, x):
    if self.is_input:
        self.out = x
    else:
        # bug here: x has to be updated with the previous layer's output!
        w_sum = np.dot(self.weight, x) + self.bias
        self.out = self.activation(w_sum)
    return self.out
There is an error, since layer 2 is trying to multiply self.weight by x instead of by the previous layer's output:
ValueError: shapes (4,4) and (2,1) not aligned: 4 (dim 1) != 2 (dim 0)
That is, I want:
Layer 0 (input layer): returns z^(0) = x
Layer 1 (first hidden layer): returns z^(1) = sig(W^(1) z^(0) + b^(1))
Layer 2 (second hidden layer): returns z^(2) = sig(W^(2) z^(1) + b^(2))
Layer 3 (output): returns z^(3) = sig(W^(3) z^(2) + b^(3))
...
Layer i: returns z^(i) = sig(W^(i) z^(i-1) + b^(i))
...
How can I store the previous layer's output?
This is my replicable example:
import numpy as np

def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class NeuralNetwork:
    def __init__(self):
        self.layers = []

    def add_layer(self, layer):
        self.layers.append(layer)

    def create(self):
        for i, layer in enumerate(self.layers):
            if i == 0:
                layer.is_input = True
            else:
                layer.init_parameters(self.layers[i - 1].neurons)

    def summary(self):
        for i, layer in enumerate(self.layers):
            print("Layer", i)
            print("neurons:", layer.neurons)
            print("is_input:", layer.is_input)
            print("act:", layer.activation)
            print("weight:", np.shape(layer.weight))
            print(layer.weight)
            print("bias:", np.shape(layer.bias))
            print(layer.bias)
            print("")

    def train(self, x, y):
        for layer in self.layers:
            prediction = layer.forward_prop(x.T)  # column vector

class Layer:
    def __init__(self, neurons, is_input=False, activation=None):
        self.out = None
        self.weight = None
        self.bias = None
        self.neurons = neurons
        self.is_input = is_input
        self.activation = activation

    def init_parameters(self, prev_layer_neurons):
        self.weight = np.asmatrix(np.random.normal(0, 0.5, (self.neurons, prev_layer_neurons)))
        self.bias = np.asmatrix(np.random.normal(0, 0.5, self.neurons)).T  # column vector
        if self.activation is None:
            self.activation = sigmoid

    def forward_prop(self, x):
        if self.is_input:
            self.out = x
        else:
            w_sum = np.dot(self.weight, x) + self.bias  # Bug
            self.out = self.activation(w_sum)
        return self.out

if __name__ == '__main__':
    net = NeuralNetwork()
    d = 2  # input dimension
    c = 1  # output dimension
    for m in (d, 4, 4, c):
        layer = Layer(m)
        net.add_layer(layer)
    net.create()
    # net.summary()  # dbg
    # Training set
    X = np.asmatrix([
        [0, 0],
        [0, 1],
        [1, 0],
        [1, 1]
    ])
    y = np.asarray([0, 0, 1, 0])
    net.train(X[2], y[2])
I simply solved it this way:
def train(self, x, y):
    z = x.T
    for layer in self.layers:
        z = layer.forward_prop(z)
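With that change each layer receives the previous layer's output. As a quick sanity check, this is the shape flow for the d=2, 4, 4, c=1 network in the example above (the shapes follow directly from the layer sizes):

# layer 0 (input):  z = x.T                 -> (2, 1)
# layer 1 (hidden): z = sigmoid(W1 z + b1)  -> (4, 1), W1 is (4, 2)
# layer 2 (hidden): z = sigmoid(W2 z + b2)  -> (4, 1), W2 is (4, 4)
# layer 3 (output): z = sigmoid(W3 z + b3)  -> (1, 1), W3 is (1, 4)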
I'm new to ML and have tried to use this GitHub repository to build an MNIST Machine Learning Model.
Since I have to import the dataset from my computer, I had to change things a bit. My imported dataset also doesn't include all 10 digits, but only 5.
The calculated accuracy is 96%; however, when I cross-check the .png files on my computer with the output txt, the labels make zero sense. It labels some 4's as 7's, some 2's as 5's, and so on.
This is what the folder structure looks like on my computer:
2
-->001.png
-->002.png
-->003.png
-->...
3
-->001.png
-->002.png
-->003.png
-->...
4
-->001.png
-->002.png
-->003.png
-->...
5
-->001.png
-->002.png
-->003.png
-->...
7
-->001.png
-->002.png
-->003.png
-->...
Question 1:
I previously had an error that it expected 8 different categories, since 7 is the highest digit label. I didn't know how to fix this, so I renamed the folders to 0 through 4. Any idea how to fix this without having to rename all the folders?
Question 2:
Do you know why the outcome doesn't make any sense? It doesn't seem to be an overfitting issue; I've tried adjusting the training-test split, which didn't have any impact.
from sklearn.datasets import fetch_openml
from keras.utils.np_utils import to_categorical
import numpy as np
from sklearn.model_selection import train_test_split
import time
# x, y = fetch_openml('mnist_784', version=1, return_X_y=True)
import os
from os import listdir
from os.path import isfile, join
import cv2

label_folder_training = []
label_files_training = []
total_size_training = 0
total_size_testing = 0
data_path_training = r"Training_data"
data_path_testing = r"Testing_data"

for root, dirs, files in os.walk(data_path_training):
    for dir in dirs:
        label_folder_training.append(dir)
    total_size_training += len(files)
    for file in files:
        label_files_training.append(file)

for root, dirs, files in os.walk(data_path_testing):
    total_size_testing += len(files)

# to ignore the .DS_Store file
total_size_training = total_size_training - 1
total_size_testing = total_size_testing

print("found", total_size_training, "training files and", total_size_testing, "testing files.")
print("folder Training:", label_folder_training)

# Print returns the following:
# found 20000 training files and 5000 testing files.
# folder Training: ['0', '1', '4', '3', '2']

x = []
y = []
for i in range(len(label_folder_training)):
    labelPath_training = os.path.join(data_path_training, label_folder_training[i])
    FileName = [f for f in listdir(labelPath_training) if isfile(join(labelPath_training, f))]
    for j in range(len(FileName)):
        path = os.path.join(labelPath_training, FileName[j])
        img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
        x.append(img)
        y.append(label_folder_training[i])

x = np.array(x)
x = np.reshape(x, (20000, 784))
x = (x / 255).astype('float32')
y = to_categorical(y)
X_train, X_test, y_train, y_test = train_test_split(x, y, test_size=0.3, random_state=42)
import pandas as pd

class Module:
    def __init__(self):
        self._train = True

    def forward(self, input):
        raise NotImplementedError

    def backward(self, input, grad_output):
        raise NotImplementedError

    def parameters(self):
        """
        Returns a list of its parameters
        """
        return []

    def grad_parameters(self):
        """
        Returns a list of the gradients of its parameters
        """
        return []

    def train(self):
        self._train = True

    def eval(self):
        self._train = False

class Criterion:
    def forward(self, input, target):
        raise NotImplementedError

    def backward(self, input, target):
        raise NotImplementedError
class Linear(Module):
    def __init__(self, dim_in, dim_out):
        super().__init__()
        self.W = np.random.randn(dim_in, dim_out)
        self.b = np.random.randn(1, dim_out)

    def forward(self, input):
        self.output = np.dot(input, self.W) + self.b
        return self.output

    def backward(self, input, grad_output):
        self.grad_b = np.mean(grad_output, axis=0)
        self.grad_W = np.dot(input.T, grad_output)
        self.grad_W /= input.shape[0]
        grad_input = np.dot(grad_output, self.W.T)
        return grad_input

    def parameters(self):
        return [self.W, self.b]

    def grad_parameters(self):
        return [self.grad_W, self.grad_b]

def softmax(xs):
    xs = np.subtract(xs, xs.max(axis=1, keepdims=True))
    xs = np.exp(xs) / np.sum(np.exp(xs), axis=1, keepdims=True)
    return xs
class CrossEntropy(Criterion):
    def __init__(self):
        super().__init__()

    def forward(self, input, target):
        eps = 1e-9
        predictions = np.clip(input, eps, 1. - eps)
        N = predictions.shape[0]
        ce = -np.sum(target * np.log(predictions))
        return ce / N

    def backward(self, input, target):
        eps = 1e-9
        input_clamp = np.clip(input, eps, 1 - eps)
        return softmax(input_clamp) - target
class Sequential(Module):
    def __init__(self, *layers):
        super().__init__()
        self.layers = layers

    def forward(self, input):
        for layer in self.layers:
            input = layer.forward(input)
        self.output = input
        return self.output

    def backward(self, input, grad_output):
        for i in range(len(self.layers) - 1, 0, -1):
            grad_output = self.layers[i].backward(self.layers[i - 1].output, grad_output)
        grad_output = self.layers[0].backward(input, grad_output)
        return grad_output

    def parameters(self):
        res = []
        for l in self.layers:
            res += l.parameters()
        return res

    def grad_parameters(self):
        res = []
        for l in self.layers:
            res += l.grad_parameters()
        return res

    def train(self):
        for layer in self.layers:
            layer.train()

    def eval(self):
        for layer in self.layers:
            layer.eval()
def sigmoid(x):
    return 1 / (1 + np.exp(-x))

class Sigmoid(Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        self.output = sigmoid(input)
        return self.output

    def backward(self, input, grad_output):
        grad_input = sigmoid(input) * (1 - sigmoid(input)) * grad_output
        return grad_input

class SoftMax(Module):
    def __init__(self):
        super().__init__()

    def forward(self, input):
        self.output = np.subtract(input, input.max(axis=1, keepdims=True))
        self.output = np.exp(self.output) / np.sum(np.exp(self.output), axis=1, keepdims=True)
        return self.output

    def backward(self, input, grad_output):
        return grad_output
def DataLoader(X, Y, batch_size=32):
    n = X.shape[0]
    indices = np.arange(n)
    np.random.shuffle(indices)
    for start in range(0, n, batch_size):
        end = min(start + batch_size, n)
        batch_idx = indices[start:end]
        yield X[batch_idx], Y[batch_idx]

def accuracy_score(y_true, y_pred):
    a = np.argmax(y_true, axis=1)
    b = np.argmax(y_pred, axis=1)
    return np.count_nonzero(a == b) / y_true.shape[0]
class Adam:
    def __init__(self, model):
        self.prev_m = None
        self.prev_v = None
        self.model = model
        self.t = 1

    def step(self, lr, beta1, beta2):
        prev_m_tmp = []
        prev_v_tmp = []
        eps = 1e-7
        for i, (weights, gradient) in enumerate(zip(self.model.parameters(), self.model.grad_parameters())):
            if self.prev_m and self.prev_v:
                m = beta1 * self.prev_m[i] + (1 - beta1) * gradient
                v = beta2 * self.prev_v[i] + (1 - beta2) * gradient ** 2
                m_hat = m / (1 - beta1 ** self.t)
                v_hat = v / (1 - beta2 ** self.t)
            else:
                m = beta1 * 0 + (1 - beta1) * gradient
                v = beta2 * 0 + (1 - beta2) * gradient ** 2
                m_hat = m / (1 - beta1 ** self.t)
                v_hat = v / (1 - beta2 ** self.t)
            weights -= lr * m_hat / (np.sqrt(v_hat) + eps)
            prev_m_tmp.append(m)
            prev_v_tmp.append(v)
        self.prev_m = prev_m_tmp
        self.prev_v = prev_v_tmp
        self.t += 1
model = Sequential(
    Linear(784, 512),
    Sigmoid(),
    Linear(512, 256),
    Sigmoid(),
    Linear(256, 128),
    Sigmoid(),
    Linear(128, 64),
    Sigmoid(),
    Linear(64, 5),
    SoftMax(),
)
epochs = 20
eval_every = 1
batch_size = 1024
criterion = CrossEntropy()
optimizer = Adam(model)

for epoch in range(epochs):
    for x, y in DataLoader(X_train, y_train, batch_size=batch_size):
        model.train()
        y_pred = model.forward(x)
        grad = criterion.backward(y_pred, y)
        model.backward(x, grad)
        optimizer.step(lr=0.003, beta1=0.9, beta2=0.999)
    if (epoch + 1) % eval_every == 0:
        model.eval()
        y_train_pred = model.forward(X_train)
        y_test_pred = model.forward(X_test)
        loss_train = criterion.forward(y_train_pred, y_train)
        loss_test = criterion.forward(y_test_pred, y_test)
        print(f'Epoch: {epoch + 1}/{epochs}')
        print(f'Train Loss: {loss_train} Train Accuracy: {accuracy_score(y_train, y_train_pred)}')
        print(f'Test Loss: {loss_test} Test Accuracy: {accuracy_score(y_test, y_test_pred)} \n')

# Returns the following in epoch 20/20:
# Epoch: 20/20
# Train Loss: 0.151567557756849 Train Accuracy: 0.9905
# Test Loss: 0.706321046620394 Test Accuracy: 0.9563333333333334
test_x = []
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
    path = os.path.join(data_path_testing, FileName[j])
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    test_x.append(img)

x_val = np.array(test_x)
x_val = np.reshape(x_val, (5000, 784))
x_val = (x_val / 255).astype('float32')
df_test = pd.DataFrame(x_val, columns=range(784)).add_prefix('pixels_')
output = model.forward(df_test)
output_arg = np.argmax(output, axis=1)
ImageId = df_test.index + 1
submission = pd.DataFrame({'ImageId': ImageId, 'Label': output_arg})
submission['ImageId'] = submission['ImageId'].apply('{:0>4}'.format)
submission.to_csv('export.txt', sep=' ', index=False, header=False)
Found the answer to my problem. The output didn't make any sense because Python was importing the testing files in a random order. All I had to do was sort FileName before running the model.
I changed this:
test_x = []
FileName = [f for f in listdir(data_path_testing) if isfile(join(data_path_testing, f))]
for j in range(len(FileName)):
    path = os.path.join(data_path_testing, FileName[j])
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    test_x.append(img)
to this:
test_x = []
FileName = sorted(filter(lambda x: os.path.isfile(os.path.join(data_path_testing, x)),
                         os.listdir(data_path_testing)))
for j in range(len(FileName)):
    path = os.path.join(data_path_testing, FileName[j])
    img = cv2.imread(path, cv2.IMREAD_GRAYSCALE)
    test_x.append(img)
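The same caveat applies to the training loop: os.walk and os.listdir return entries in arbitrary, filesystem-dependent order. For what it's worth, here is an equivalent, more compact sketch (assuming the test images are all .png files, as the folder listing above suggests):

import glob
import os

# glob returns full paths, so they can be passed to cv2.imread directly
paths = sorted(glob.glob(os.path.join(data_path_testing, "*.png")))
test_x = [cv2.imread(p, cv2.IMREAD_GRAYSCALE) for p in paths]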
Perceptron
I want to compare the training accuracy with and without shuffling the training dataset each epoch. I have done the part without shuffling, but I have no idea how to add shuffling to the existing code so that I can plot a graph visualizing the difference in training accuracy between the two.
class Perceptron():
    def __init__(self, num_epochs, num_features, averaged):
        super().__init__()
        self.num_epochs = num_epochs
        self.averaged = averaged
        self.num_features = num_features
        self.weights = None
        self.bias = None

    def init_parameters(self):
        self.weights = np.zeros(self.num_features)
        self.bias = 0

    def train(self, train_X, train_y, dev_X, dev_y):
        self.init_parameters()
        train_acc = []
        dev_acc = []
        for epoch in range(self.num_epochs):
            preds = []
            for i in range(51775):
                if (safe_sparse_dot(self.weights, train_X[i].T, dense_output=True) + self.bias == 0):
                    a = np.array(sample([-1, 1], 1)).astype('float64')
                else:
                    a = safe_sparse_dot(self.weights, train_X[i].T, dense_output=True) + self.bias
                y_hat = np.sign(a)
                preds.append(y_hat)
                if (train_y[i] * a) <= 0:
                    self.weights = self.weights + train_y[i] * train_X[i]
                    self.bias = self.bias + train_y[i]
            arr_ravel = np.array(preds).ravel()
            training_acc = np.mean(arr_ravel == train_y)
            train_acc.append(training_acc)
            development_acc = np.mean(self.predict(dev_X) == dev_y)
            dev_acc.append(development_acc)

    def predict(self, X):
        predicted_labels = []
        for j in X:
            if ((safe_sparse_dot(self.weights, j.T, dense_output=True) + self.bias) == 0):
                a = np.array(sample([-1, 1], 1)).astype('float64')
            else:
                a = safe_sparse_dot(self.weights, j.T, dense_output=True) + self.bias
            y_hat = np.sign(a)
            predicted_labels.append(y_hat)
        array_ravel = np.array(predicted_labels).ravel()
        return array_ravel
Instead of shuffling the data, create an index array and shuffle that every epoch. This way you keep the original order.
idx = np.arange(train_X.shape[0])
np.random.shuffle(idx)
train_X_shuffled = train_X[idx]
train_y_shuffled = train_y[idx]
Adding this to your code (making copies of the original data, so that the code changes as little as possible):
class Perceptron():
    def __init__(self, num_epochs, num_features, averaged):
        super().__init__()
        self.num_epochs = num_epochs
        self.averaged = averaged
        self.num_features = num_features
        self.weights = None
        self.bias = None

    def init_parameters(self):
        self.weights = np.zeros(self.num_features)
        self.bias = 0

    def train(self, train_X, train_y, dev_X, dev_y):
        self.init_parameters()
        train_acc = []
        dev_acc = []
        # Make copies of the original data
        train_X_unshuffled = train_X.copy()
        train_y_unshuffled = train_y.copy()
        idx = np.arange(train_X.shape[0])
        for epoch in range(self.num_epochs):
            # Get a shuffled dataset
            np.random.shuffle(idx)
            train_X = train_X_unshuffled[idx]
            train_y = train_y_unshuffled[idx]
            preds = []
            for i in range(51775):
                if (safe_sparse_dot(self.weights, train_X[i].T, dense_output=True) + self.bias == 0):
                    a = np.array(sample([-1, 1], 1)).astype('float64')
                else:
                    a = safe_sparse_dot(self.weights, train_X[i].T, dense_output=True) + self.bias
                y_hat = np.sign(a)
                preds.append(y_hat)
                if (train_y[i] * a) <= 0:
                    self.weights = self.weights + train_y[i] * train_X[i]
                    self.bias = self.bias + train_y[i]
            arr_ravel = np.array(preds).ravel()
            training_acc = np.mean(arr_ravel == train_y)
            train_acc.append(training_acc)
            development_acc = np.mean(self.predict(dev_X) == dev_y)
            dev_acc.append(development_acc)

    def predict(self, X):
        predicted_labels = []
        for j in X:
            if ((safe_sparse_dot(self.weights, j.T, dense_output=True) + self.bias) == 0):
                a = np.array(sample([-1, 1], 1)).astype('float64')
            else:
                a = safe_sparse_dot(self.weights, j.T, dense_output=True) + self.bias
            y_hat = np.sign(a)
            predicted_labels.append(y_hat)
        array_ravel = np.array(predicted_labels).ravel()
        return array_ravel
I have created a custom class to be an ML model, and it is working fine, but I would like to normalize the inputs as they have a wide range of values (e.g. 0, 20000, 500, 10, 8). Currently, as a way of normalizing the inputs, I'm applying lambda x: np.log(x + 1) to each input (the +1 is so it doesn't error out when 0 is passed in). Would a normalization layer be better than my current approach? If so, how would I go about implementing it? My code for the model is below:
class FollowModel:
    def __init__(self, input_shape, output_shape, hidden_layers, input_labels, learning_rate=0.001):
        tf.reset_default_graph()
        assert len(input_labels) == input_shape[1], 'Incorrect number of input labels!'

        # Placeholders for input and output data
        self.input_labels = input_labels
        self.input_shape = input_shape
        self.output_shape = output_shape
        self.X = tf.placeholder(shape=input_shape, dtype=tf.float64, name='X')
        self.y = tf.placeholder(shape=output_shape, dtype=tf.float64, name='y')
        self.hidden_layers = hidden_layers
        self.learning_rate = learning_rate

        # Variables for the two groups of weights between the three layers of the network
        self.W1 = tf.Variable(np.random.rand(input_shape[1], hidden_layers), dtype=tf.float64)
        self.W2 = tf.Variable(np.random.rand(hidden_layers, output_shape[1]), dtype=tf.float64)

        # Create the neural net graph
        self.A1 = tf.sigmoid(tf.matmul(self.X, self.W1))
        self.y_est = tf.sigmoid(tf.matmul(self.A1, self.W2))

        # Define a loss function
        self.deltas = tf.square(self.y_est - self.y)  # want this to be 0
        self.loss = tf.reduce_sum(self.deltas)

        # Define a train operation to minimize the loss
        self.optimizer = tf.train.AdamOptimizer(learning_rate).minimize(self.loss)

        # Initialize
        self.model_init = tf.global_variables_initializer()
        self.trained = False
    def train(self, Xtrain, ytrain, Xtest, ytest, training_steps, batch_size, print_progress=True):
        # Initialize session
        self.trained = True
        self.training_steps = training_steps
        self.batch_size = batch_size
        self.sess = tf.Session()
        self.sess.run(self.model_init)
        self.losses = []
        self.accs = []
        self.testing_accuracies = []

        for i in range(training_steps * batch_size):
            self.sess.run(self.optimizer, feed_dict={self.X: Xtrain, self.y: ytrain})
            local_loss = self.sess.run(self.loss, feed_dict={self.X: Xtrain.values, self.y: ytrain.values})
            self.losses.append(local_loss)
            self.weights1 = self.sess.run(self.W1)
            self.weights2 = self.sess.run(self.W2)
            y_est_np = self.sess.run(self.y_est, feed_dict={self.X: Xtrain.values, self.y: ytrain.values})
            correct = [estimate.argmax(axis=0) == target.argmax(axis=0)
                       for estimate, target in zip(y_est_np, ytrain.values)]
            acc = 100 * sum(correct) / len(correct)
            self.accs.append(acc)

            if i % batch_size == 0:
                batch_num = i / batch_size
                if batch_num % 5 == 0:
                    self.testing_accuracies.append(self.test_accuracy(Xtest, ytest, False, True))
                temp_table = pd.concat([Xtrain, ytrain], axis=1).sample(frac=1)
                column_names = list(temp_table.columns.values)
                X_columns, y_columns = column_names[0:len(column_names) - 2], column_names[len(column_names) - 2:]
                Xtrain = temp_table[X_columns]
                ytrain = temp_table[y_columns]
                if print_progress: print('Step: %d, Accuracy: %.2f, Loss: %.2f' % (int(i / batch_size), acc, local_loss))

        if print_progress: print("Training complete!\nloss: {}, hidden nodes: {}, steps: {}, epoch size: {}, total steps: {}".format(int(self.losses[-1] * 100) / 100, self.hidden_layers, training_steps, batch_size, training_steps * batch_size))
        self.follow_accuracy = acc
        return acc
    def test_accuracy(self, Xtest, ytest, print_progress=True, return_accuracy=False):
        if self.trained:
            X = tf.placeholder(shape=Xtest.shape, dtype=tf.float64, name='X')
            y = tf.placeholder(shape=ytest.shape, dtype=tf.float64, name='y')
            W1 = tf.Variable(self.weights1)
            W2 = tf.Variable(self.weights2)
            A1 = tf.sigmoid(tf.matmul(X, W1))
            y_est = tf.sigmoid(tf.matmul(A1, W2))

            # Calculate the predicted outputs
            init = tf.global_variables_initializer()
            with tf.Session() as sess:
                sess.run(init)
                y_est_np = sess.run(y_est, feed_dict={X: Xtest, y: ytest})

            correctly_followed = 0
            incorrectly_followed = 0
            missed_follows = 0
            correctly_skipped = 0

            for estimate, actual in zip(y_est_np, ytest.values):
                est = estimate.argmax(axis=0)
                # print(estimate)
                actual = actual.argmax(axis=0)
                if est == 1 and actual == 0: incorrectly_followed += 1
                elif est == 1 and actual == 1: correctly_followed += 1
                elif est == 0 and actual == 1: missed_follows += 1
                else: correctly_skipped += 1

            # correct = [estimate.argmax(axis=0) == target.argmax(axis=0) for estimate, target in zip(y_est_np, ytest.values)]
            total_followed = incorrectly_followed + correctly_followed
            total_correct = correctly_followed + correctly_skipped
            total_incorrect = incorrectly_followed + missed_follows
            try: total_accuracy = int(total_correct * 10000 / (total_correct + total_incorrect)) / 100
            except: total_accuracy = 0
            total_skipped = correctly_skipped + missed_follows
            try: follow_accuracy = int(correctly_followed * 10000 / total_followed) / 100
            except: follow_accuracy = 0
            try: skip_accuracy = int(correctly_skipped * 10000 / total_skipped) / 100
            except: skip_accuracy = 0

            if print_progress: print('Correctly followed {} / {} ({}%), correctly skipped {} / {} ({}%)'.format(
                correctly_followed, total_followed, follow_accuracy, correctly_skipped, total_skipped, skip_accuracy))
            self.follow_accuracy = follow_accuracy
            if return_accuracy:
                return total_accuracy
        else:
            print('The model is not trained!')
    def make_prediction_on_normal_data(self, input_list):
        assert len(input_list) == len(self.input_labels), 'Incorrect number of inputs (had {} should have {})'.format(len(input_list), len(self.input_labels))
        # from ProcessData import normalize_list
        # normalize_list(input_list)
        input_array = np.array([input_list])
        X = tf.placeholder(shape=(1, len(input_list)), dtype=tf.float64, name='X')
        y = tf.placeholder(shape=(1, 2), dtype=tf.float64, name='y')
        W1 = tf.Variable(self.weights1)
        W2 = tf.Variable(self.weights2)
        A1 = tf.sigmoid(tf.matmul(X, W1))
        y_est = tf.sigmoid(tf.matmul(A1, W2))
        with tf.Session() as sess:
            sess.run(tf.global_variables_initializer())
            y_est_np = sess.run(y_est, feed_dict={X: input_array, y: self.create_blank_outputs()})
            predicted_value = y_est_np[0].argmax(axis=0)
        return predicted_value

    def make_prediction_on_abnormal_data(self, input_list):
        from ProcessData import normalize_list
        normalize_list(input_list)
        return self.make_prediction_on_normal_data(input_list)

    def create_blank_outputs(self):
        blank_outputs = np.zeros(shape=(1, 2), dtype=np.int)
        for i in range(len(blank_outputs[0])):
            blank_outputs[0][i] = float(blank_outputs[0][i])
        return blank_outputs
I don't see why you would want to create a layer that does that. The common practice is to preprocess your inputs the way you are currently doing it.
Using the log operator is quite common for skewed data, but there are other preprocessing solutions, such as sklearn's MinMaxScaler and StandardScaler:
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.MinMaxScaler.html
https://scikit-learn.org/stable/modules/generated/sklearn.preprocessing.StandardScaler.html
Those are just examples of two other ways to scale your data.
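For instance, a minimal sketch with StandardScaler (fit on the training split only, then reuse the same statistics on the test split; the variable names are illustrative):

from sklearn.preprocessing import StandardScaler

scaler = StandardScaler()
Xtrain_scaled = scaler.fit_transform(Xtrain)  # learns per-feature mean and std
Xtest_scaled = scaler.transform(Xtest)        # applies the training statistics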
There is such a thing as a BatchNorm layer, but it is not recommended as the first layer of the network, since the distribution of the input data is fixed and doesn't vary during training.