I've been working on a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay, and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05.
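In code form, the intended decay schedule reads as follows (a sketch with illustrative names; the actual implementation is in the full code below):

alpha0, k = 0.05, 0.95
alpha = lambda epoch: alpha0 * k ** (epoch // 50)  # alpha shrinks by a factor of 0.95 every 50 epochs
print([round(alpha(e), 5) for e in (0, 50, 100, 500)])
# [0.05, 0.0475, 0.04513, 0.02994]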
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as requested, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special

# Data preprocessing (the same for the dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train', 'cats_train']
epsilon = 1e-8

for category in categories:
    path_animal = os.path.join(path, category)
    for img in os.listdir(path_animal):
        try:
            img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
            new_img_array = cv2.resize(img_array, (img_size, img_size))
            flattened_img_array = new_img_array.reshape(img_size*img_size)
            train_set.append([flattened_img_array, categories.index(category)])
        except:
            continue

import random
random.shuffle(train_set)

X_train = []
Y_train = []
for sample in train_set:
    X_train.append(sample[0])
    Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))

def create_mini_batches(X, Y, mini_batch_size):
    m = X.shape[1]
    mini_batches = []
    num_mini_batches = m // mini_batch_size
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    for i in range(num_mini_batches):
        select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
        select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
        mini_batch = (select_X, select_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
        last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
        last_mini_batch = (last_X, last_Y)
        mini_batches.append(last_mini_batch)
    return mini_batches

def initialize_parameters(layers_dims):
    L = len(layers_dims)  # number of layers (including the input layer); in this case L = 4
    parameters = {}
    for l in range(1, L):  # range(1, 4)
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])  # He initialization
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

def sigmoid(Z):
    A = special.expit(Z)
    return A, Z

def relu(Z):
    A = np.maximum(0.01*Z, Z)  # leaky ReLU with slope 0.01 for Z < 0
    return A, Z

def forward_propagation(X, parameters):
    caches = []  # list containing the cache for every layer
    A = X
    L = len(parameters) // 2
    for l in range(1, L):
        A_prev = A
        W = parameters['W'+str(l)]
        b = parameters['b'+str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z)  # activation_cache contains Z[l]
        linear_cache = (A_prev, W, b)  # linear_cache contains A[l-1], W[l], b[l]
        cache = (linear_cache, activation_cache)
        caches.append(cache)
    W = parameters['W'+str(L)]
    b = parameters['b'+str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    linear_cache = (A, W, b)
    cache = (linear_cache, activation_cache)
    caches.append(cache)
    return AL, caches

def compute_cost(AL, Y, parameters, lambd):
    m = Y.shape[1]  # number of examples
    L = len(parameters) // 2  # e.g. for layers_dims = [6400, 50, 10, 1], L = 3
    reg_cost = 0
    for l in range(L):
        W = parameters['W' + str(l+1)]
        reg_cost += np.sum(np.square(W))
    J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
    J = np.squeeze(J)
    return J

def linear_backward(dZ, linear_cache, lambd):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T) + (lambd/m)*W
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

def relu_gradient(Z):
    dZ = np.where(Z > 0, 1, 0.01)
    return dZ

def sigmoid_gradient(Z):
    dZ = special.expit(Z)*(1-special.expit(Z))
    return dZ

def linear_activation_backward(dA, cache, lambd, A, Y, activation):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dA * relu_gradient(activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    elif activation == 'sigmoid':
        dZ = A - Y  # dA is unused in this branch; dZ comes directly from AL and Y
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    return dA_prev, dW, db

def L_model_backward(AL, Y, caches, lambd):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    cache_final_layer = caches[L-1]
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(None, cache_final_layer, lambd, AL, Y, activation='sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, None, None, activation='relu')
    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters

def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size)  # [(X{1},Y{1}), (X{2},Y{2}), ..., (X{n},Y{n})]
    costs_train = []
    costs_dev = []
    parameters = initialize_parameters(layers_dims)
    AL_dev, caches_dev = forward_propagation(X_dev, parameters)
    J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
    costs_dev.append(J_dev)
    for i in range(num_epoch):
        for mini_batch in mini_batches:
            (minibatch_X, minibatch_Y) = mini_batch
            AL, caches = forward_propagation(minibatch_X, parameters)
            J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = L_model_backward(AL, minibatch_Y, caches, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)
        if i % 10 == 0:
            costs_train.append(J_train)
            AL_dev, caches_dev = forward_propagation(X_dev, parameters)
            J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
            costs_dev.append(J_dev)
        if i % 100 == 0:
            print("Cost after epoch %i: %f" % (i, J_train))
        learning_rate = learning_rate * (k**(i/50))
    plt.plot(np.squeeze(costs_train), 'r')
    plt.plot(np.squeeze(costs_dev), 'b')
    plt.ylabel('cost')
    plt.xlabel('epochs (per tens)')
    plt.show()
    return parameters, costs_train, costs_dev

parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful to anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice as to how to address it? I'm at a loss here, because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When the validation loss starts to increase from the very beginning, as in the images you added, it means that there is something wrong in the model.
It's not clear what it is, as you didn't show your model.
You could check the following links, which will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code.
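One generic sanity check that may help before tuning any more hyperparameters (a sketch reusing the functions from the question; the subset size and iteration count are arbitrary): take a tiny subset of the training data and verify the network can drive its cost to nearly zero. If it cannot overfit even a handful of samples, the rising validation loss points to a bug rather than to overfitting.

X_tiny, Y_tiny = X_train[:, :16], Y_train[:, :16]
params = initialize_parameters([6400, 50, 10, 1])
for step in range(500):
    AL, caches = forward_propagation(X_tiny, params)
    grads = L_model_backward(AL, Y_tiny, caches, 0)   # no regularization for this test
    params = update_parameters(params, grads, 0.05)
print(compute_cost(AL, Y_tiny, params, 0))  # should approach 0 on the tiny subset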
Related
I am trying to create a neural network using TensorFlow, without the Keras API. I have some parameter estimation to do (weights, biases, and some other parameters). The code is working, but the parameter estimates are really bad and the error percentage is very high, even though the loss function value is small. What is the problem here? I have tried so many things and still see no improvement.
I also tried creating my own optimizer, but the process is slow and the error is large. Is there a proper way to apply an optimizer to the parameters?
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from pyDOE import lhs
import math as ma

class PhysicsInformedNN:
    def __init__(self, X_n, v, layers, lb, ub):
        self.lb = lb
        self.ub = ub
        self.layers = layers
        self.dx_n = tf.convert_to_tensor(X_n[:, 0:1], dtype='float32')
        self.t_n = tf.convert_to_tensor(X_n[:, 1:2], dtype='float32')
        self.v_r = tf.convert_to_tensor(v, dtype='float32')
        self.lambda_1 = tf.Variable(0, dtype='float32')  # 1.5
        self.lambda_2 = tf.Variable(-6, dtype='float32')
        self.para = [self.lambda_1, self.lambda_2]
        self.weights, self.biases = self.initialize_NN(layers)

    def initialize_NN(self, layers):
        weights = []
        biases = []
        num_layers = len(layers)
        for l in range(0, num_layers - 1):
            W = self.xavier_init(size=[layers[l], layers[l+1]])
            b = tf.Variable(tf.zeros([1, layers[l+1]], dtype='float32'), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def xavier_init(self, size):
        in_dim = size[0]
        out_dim = size[1]
        xavier_stddev = np.sqrt(2 / (in_dim + out_dim))
        return tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype='float32')

    def neural_net(self, X, weights, biases):
        num_layers = len(weights) + 1
        H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
        for l in range(0, num_layers - 2):
            W = weights[l]
            b = biases[l]
            H = tf.math.tanh(tf.math.add(tf.linalg.matmul(H, W), b))
        W = weights[-1]
        b = biases[-1]
        Y = tf.math.add(tf.linalg.matmul(H, W), b)
        return Y

    def net_u(self, x, t):
        v = self.neural_net(tf.concat([x, t], 1), self.weights, self.biases)
        return v

    def net_f(self, x, t):
        lambda_1 = self.para[0]
        lambda_2 = tf.exp(self.para[1])
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(t)
            tape.watch(x)
            u = self.net_u(x, t)
            u_x = tape.gradient(u, x)
        u_t = tape.gradient(u, t)
        u_xx = tape.gradient(u_x, x)
        f = u_t + lambda_1*u*u_x - lambda_2*u_xx
        del tape
        return f

    def callback(self, loss, n):
        print('Loss:', loss, ' Epoch : ', n)

    def train(self, epoch):
        for i in range(epoch):
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(self.weights)
                tape.watch(self.biases)
                tape.watch(self.para)
                f_pred = self.net_f(self.dx_n, self.t_n)
                v_pred = self.net_u(self.dx_n, self.t_n)
                loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
            dw = tape.gradient(loss, self.weights)
            db = tape.gradient(loss, self.biases)
            dp = tape.gradient(loss, self.para)
            lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=1e-2,
                decay_steps=10000,
                decay_rate=0.9)
            optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)
            optimizer1.apply_gradients(zip(dw, self.weights))
            optimizer1.apply_gradients(zip(db, self.biases))
            optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)
            optimizer2.apply_gradients(zip(dp, self.para))
            del tape
            self.callback(loss, i)

    def predict(self, X_star):
        v_star = self.net_u(X_star[:, 0:1], X_star[:, 1:2])
        f_star = self.net_f(X_star[:, 0:1], X_star[:, 1:2])
        para_last = self.para
        return v_star, f_star, para_last

if __name__ == '__main__':
    # PARAMETERS for the problem
    np.random.seed(123)
    nu = 0.01/np.pi
    layers = [2, 20, 20, 20, 20, 1]
    N_u = 2000
    data = scipy.io.loadmat('burgers_shock.mat')
    t = data['t'].flatten()[:, None]
    x = data['x'].flatten()[:, None]
    Exact = np.real(data['usol']).T
    X, T = np.meshgrid(x, t)
    X_star = np.hstack((X.flatten()[:, None], T.flatten()[:, None]))
    u_star = Exact.flatten()[:, None]
    lb = X_star.min(0)
    ub = X_star.max(0)
    idx = np.random.choice(X_star.shape[0], N_u, replace=False)
    X_u_train = X_star[idx, :]
    u_train = u_star[idx, :]
    model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
    model.train(1000)
    X_star = tf.convert_to_tensor(X_star, dtype='float32')
    u_pred, f_pred, param = model.predict(X_star)
    error_lambda_1 = np.abs(param[0] - 1.0)*100
    error_lambda_2 = np.abs(np.exp(param[1]) - nu)/nu * 100
    print(error_lambda_1, error_lambda_2)
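On the optimizer question: tf.keras.optimizers.Adam keeps per-variable state (the first and second moment estimates), so constructing new optimizer objects inside the training loop discards that state at every step. A minimal sketch of the usual pattern, using the class above (my rearrangement, not a verified fix), creates the optimizers once and reuses them:

opt_net = tf.keras.optimizers.Adam(learning_rate=1e-4)   # for weights and biases
opt_para = tf.keras.optimizers.Adam(learning_rate=1e-3)  # for lambda_1, lambda_2

def train_step(model):
    with tf.GradientTape() as tape:  # tf.Variables are watched automatically
        f_pred = model.net_f(model.dx_n, model.t_n)
        v_pred = model.net_u(model.dx_n, model.t_n)
        loss = tf.reduce_mean(tf.square(model.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
    net_vars = model.weights + model.biases
    grads = tape.gradient(loss, net_vars + model.para)
    opt_net.apply_gradients(zip(grads[:len(net_vars)], net_vars))
    opt_para.apply_gradients(zip(grads[len(net_vars):], model.para))
    return loss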
I just started studying neural networks. In class, the teacher gave us code to experiment with in Google Colab. I tried changing the number of nodes in each hidden layer and the number of hidden layers, and printing out the test accuracy and train accuracy. I've tried many configurations, but the accuracies do not change. They stay at exactly 0.7857142857142857 (this is the actual number) unless I reshuffle the samples.
The teacher said that accuracy can't be changed that easily. But I don't believe her. I think there is something wrong with the code because there are too many similar digits.
Here are the parts of the code I think are necessary to post.
Model
class Model():
    def __init__(self):
        self.layers = []
        self.L = 0
        self.W = {}
        self.b = {}
        self.A = {}
        self.Z = {}
        self.dA = {}
        self.dZ = {}
        self.dW = {}
        self.db = {}
        self.cost = 0.
        self.m = 0
        self.lam = 0
        self.cost_history = []
        self.acc_history = []
        self.alpha_history = []
        self.alpha = 0.
        self.iterations = 0

    def add_layers(self, list_of_layers):
        self.layers = list_of_layers
        self.L = len(self.layers) - 1  # number of layers, excluding the input feature layer

    def init_params(self):
        for i in range(1, self.L + 1):
            self.W[str(i)] = np.random.randn(self.layers[i], self.layers[i - 1]) * np.sqrt(2. / self.layers[i - 1])
            self.b[str(i)] = np.zeros((self.layers[i], 1))

    def forward_prop(self, X):
        self.A['0'] = X
        for i in range(1, self.L + 1):
            self.Z[str(i)] = np.dot(self.W[str(i)], self.A[str(i - 1)]) + self.b[str(i)]
            if i == self.L:
                # Output layer, sigmoid activation
                self.A[str(i)] = sigmoid(self.Z[str(i)])
            else:
                # Hidden layer, ReLU activation
                self.A[str(i)] = relu(self.Z[str(i)])

    def compute_cost(self, Y):
        self.cost = -1 * np.sum(np.multiply(Y, np.log(self.A[str(self.L)])) +
                                np.multiply(1 - Y, np.log(1 - self.A[str(self.L)]))) / self.m
        if self.lam != 0:
            reg = (self.lam / (2 * self.m))
            for i in range(1, self.L + 1):
                reg += np.sum(np.dot(self.W[str(i)], self.W[str(i)].T))
            self.cost += reg
        self.cost_history.append(self.cost)

    def backward_prop(self, Y):
        # We need dA[str(L)] to start the backward prop computation
        self.dA[str(self.L)] = -1 * (np.divide(Y, self.A[str(self.L)]) - np.divide(1 - Y, 1 - self.A[str(self.L)]))
        self.dZ[str(self.L)] = np.multiply(self.dA[str(self.L)], sigmoid_derivative(self.Z[str(self.L)]))
        self.dW[str(self.L)] = np.dot(self.dZ[str(self.L)], self.A[str(self.L - 1)].T) / self.m + (self.lam/self.m) * self.W[str(self.L)]
        self.db[str(self.L)] = np.sum(self.dZ[str(self.L)], axis=1, keepdims=True) / self.m
        self.dA[str(self.L - 1)] = np.dot(self.W[str(self.L)].T, self.dZ[str(self.L)])
        for i in reversed(range(1, self.L)):
            self.dZ[str(i)] = np.multiply(self.dA[str(i)], relu_derivative(self.Z[str(i)]))
            self.dW[str(i)] = np.dot(self.dZ[str(i)], self.A[str(i - 1)].T) / self.m + (self.lam/self.m) * self.W[str(i)]
            self.db[str(i)] = np.sum(self.dZ[str(i)], axis=1, keepdims=True) / self.m
            self.dA[str(i - 1)] = np.dot(self.W[str(i)].T, self.dZ[str(i)])

    def update_params(self):
        for i in range(1, self.L + 1):
            self.W[str(i)] = self.W[str(i)] - self.alpha * self.dW[str(i)]
            self.b[str(i)] = self.b[str(i)] - self.alpha * self.db[str(i)]

    def train(self, X, Y, iterations=10,
              alpha=0.001, decay=True, decay_iter=5, decay_rate=0.9, stop_decay_counter=100,
              verbose=True, lam=0):
        self.m = Y.shape[1]
        self.alpha = alpha
        self.iterations = iterations
        self.lam = lam
        # initialize parameters
        self.init_params()
        for i in range(iterations):
            # forward prop
            self.forward_prop(X)
            # compute cost
            self.compute_cost(Y)
            # backward prop
            self.backward_prop(Y)
            # update params
            self.update_params()
            # evaluate
            self.acc_history.append(self.evaluate(X, Y, in_training=True))
            # save alpha
            self.alpha_history.append(self.alpha)
            # learning rate decay
            if decay and stop_decay_counter > 0 and i % decay_iter == 0:
                self.alpha = decay_rate * self.alpha
                stop_decay_counter -= 1
            # display cost per iteration
            if verbose:
                print('Cost after {} iterations: {}'.format(i, self.cost))

    def predict(self, X, in_training=False):
        if in_training == False:
            self.forward_prop(X)
        preds = self.A[str(self.L)] >= 0.5
        preds = np.squeeze(preds)
        return preds

    def evaluate(self, X, Y, in_training=False):
        examples = X.shape[1]
        pred = self.predict(X, in_training=in_training)
        pred = pred.reshape(1, examples)
        diff = np.sum(abs(pred - Y))
        acc = (examples - np.sum(diff)) / examples
        return acc
Dataset
import pandas as pd
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', sep = ',', header = None)
data.head()
X_train = data.iloc[:,:-1]
Y_train = data.iloc[:, -1]
X_train = np.array(X_train)
Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0], 1)
mean = np.mean(X_train, axis = 0)
variance = np.var(X_train, axis = 0)
X_train = np.divide((X_train - mean), variance)
Y_train = Y_train - 1  # changing label 1 to 0 and label 2 to 1
Split & Shuffle data
# Split the data into test and train sets
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train)
X_test = X_train[250:,:]
Y_test = Y_train[250:,:]
X_train_ = X_train[:250,:]
Y_train_ = Y_train[:250,:]
X_train_ = X_train_.reshape(3, 250)
Y_train_ = Y_train_.reshape(1, 250)
X_test = X_test.reshape(3, 56)
Y_test = Y_test.reshape(1, 56)
Creating a Model
m = Model()
m.add_layers([3, 16, 16, 1])
m.train(X_train_, Y_train_, iterations=5000, alpha=0.9,
        decay_iter=10, decay_rate=0.98, stop_decay_counter=100,
        verbose=False, lam=2)
Evaluate
print('Test set acc = ', m.evaluate(X_test, Y_test))
print('Train set acc = ', m.evaluate(X_train_, Y_train_))
What I did in the experiment:
Shuffle, train several models (different in number of nodes and hidden layers), and evaluate
# Model examples
m.add_layers([3, 16, 16, 1, 50, 3, 25, 7, 99, 1])
m.add_layers([3, 1, 55, 19, 2, 2, 1, 1, 2, 75, 80, 3, 12, 1])
Reshuffle, evaluate
Result: Every model has the exact same train and test accuracy unless the data is reshuffled.
The teacher told me that it's just my imagination, and that it's not true.
Could you please tell me what is wrong here that produces this result?
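One detail that may be worth double-checking in the split code above (an observation, not a confirmed diagnosis): X_train_.reshape(3, 250) reads the underlying buffer in row order, which is not the same as transposing a (250, 3) array, so features and samples end up interleaved. A tiny sketch of the difference:

import numpy as np
A = np.arange(12).reshape(4, 3)  # stand-in for a (samples, features) array
print(A.reshape(3, 4))           # reinterprets the buffer; rows mix samples and features
print(A.T)                       # true transpose; rows are features, columns are samples

If the intent is a features-by-examples layout, .T (and likewise for Y and the test set) preserves each sample intact.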
I am working on a research project that requires me to write a regularizer for a DNN.
import lasagne
from lasagne.nonlinearities import leaky_rectify, softmax
import theano, theano.tensor as T
import numpy as np
import sklearn.datasets, sklearn.preprocessing, sklearn.model_selection
import matplotlib.pyplot as plt
from tabulate import tabulate
import time
import math

# psi function that will be used in the penalty function
def psi(g, l):
    m = g.shape[1]
    C = (1/T.pow(2, m))*(1/T.pow(math.pi, ((m-1)/2))) / (T.gamma((m+1)/2))
    logDens = T.log(C) + m*T.log(l) - l*T.sqrt(T.sum(g**2))
    dens = T.exp(logDens)
    return dens

# pStar function that will be used in the penalty function
def pStar(g, lambda1, lambda0, theta):
    psi1 = psi(g, lambda1)
    psi0 = psi(g, lambda0)
    # if a coefficient is really large then both of these will numerically be zero
    if theta*psi1 == 0 and (1-theta)*psi0 == 0:
        p = 1
    else:
        p = (theta*psi1) / (theta*psi1 + (1 - theta)*psi0)
    return p

# Separable penalty
def pen_S(l):
    theta = 0.5
    lambda1 = 1
    lambda0 = 12
    for j in range(len(l)):
        t = l[j]
        m = t.shape[1]
        n = t.shape[0].eval()
        cost = T.zeros((1, 1))
        for i in range(n):
            g = t[i]
            temp = -lambda1*T.sum(g**2) + T.log(pStar(T.zeros((1, m)), lambda1, lambda0, theta)/pStar(g, lambda1, lambda0, theta))
            cost = cost + temp
    return cost

# Number of simulations
N_runs = 1
# Maximum number of epochs
max_epochs = 1500
# Define number of layers and number of neurons
H_layers = np.asarray([40, 20])
# Minibatch size
batch_size = 300
# Lasagne regularizers to be tested
regularizers = [pen_S]
# Define the regularization factors for each algorithm
reg_factors = [10**-3.5]
# Define the names (for display purposes)
names = ['SSGL_Sep']

# Load the dataset (DIGITS)
digits = sklearn.datasets.load_digits()
X = digits.data
y = digits.target

# MNIST
# mnist = sklearn.datasets.fetch_mldata('MNIST original', data_home='C:/Users/ISPAMM/Downloads')
# X = mnist.data
# y = mnist.target

# Preprocessing (input)
scaler = sklearn.preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Output structures
tr_errors = np.zeros((len(regularizers), N_runs))
tst_errors = np.zeros((len(regularizers), N_runs))
tr_times = np.zeros((len(regularizers), N_runs))
tr_obj = np.zeros((len(regularizers), N_runs, max_epochs))
sparsity_weights = np.zeros((len(regularizers), N_runs, len(H_layers)+1))
sparsity_neurons = np.zeros((len(regularizers), N_runs, len(H_layers)+1))

# Define the input and output symbolic variables
input_var = T.matrix(name='X')
target_var = T.ivector(name='y')

# Utility function for minibatches
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

for k in np.arange(0, N_runs):
    print("Run ", k+1, " of ", N_runs, "...\n", end="")
    # Split the data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.25)
    # Define the network structure
    network = lasagne.layers.InputLayer((None, X.shape[1]), input_var)
    for h in H_layers:
        network = lasagne.layers.DenseLayer(network, h, nonlinearity=leaky_rectify, W=lasagne.init.GlorotNormal())
    network = lasagne.layers.DenseLayer(network, len(np.unique(y)), nonlinearity=softmax, W=lasagne.init.GlorotNormal())
    params_original = lasagne.layers.get_all_param_values(network)
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Define the loss function
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    # Define the test function
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    test_fn = theano.function([input_var, target_var], test_acc, allow_input_downcast=True)
    for r in np.arange(0, len(regularizers)):
        # Set to original parameters
        lasagne.layers.set_all_param_values(network, params_original)
        # Define the regularized loss function
        loss_reg = loss.mean() + reg_factors[r] * lasagne.regularization.regularize_network_params(network, regularizers[r])
        # Update function
        # updates_reg = lasagne.updates.nesterov_momentum(loss_reg, params, learning_rate=0.01)
        updates_reg = lasagne.updates.adam(loss_reg, params)
        # Training function
        train_fn = theano.function([input_var, target_var], loss_reg, updates=updates_reg, allow_input_downcast=True)
        # Train network
        print("\tTraining with ", names[r], " regularization, epoch: ", end="")
        start = time.time()
        for epoch in range(max_epochs):
            loss_epoch = 0
            batches = 0
            if np.mod(epoch, 10) == 0:
                print(epoch, "... ", end="")
            for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
                input_batch, target_batch = batch
                loss_epoch += train_fn(input_batch, target_batch)
                batches += 1
            tr_obj[r, k, epoch] = loss_epoch/batches
        end = time.time()
        tr_times[r, k] = end - start
        print(epoch, ".")
        # Final test with accuracy
        print("\tTesting the network with ", names[r], " regularization...")
        tr_errors[r, k] = test_fn(X_train, y_train)
        tst_errors[r, k] = test_fn(X_test, y_test)
        # Check sparsity
        params_trained = lasagne.layers.get_all_param_values(network, trainable=True)
        sparsity_weights[r, k, :] = [1-(x.round(decimals=3).ravel().nonzero()[0].shape[0]/x.size) for x in params_trained[0::2]]
        sparsity_neurons[r, k, :] = [x.round(decimals=3).sum(axis=1).nonzero()[0].shape[0] for x in params_trained[0::2]]

tr_obj_mean = np.mean(tr_obj, axis=1)

# Plot the average loss
plt.figure()
plt.title('Training objective')
for r in np.arange(0, len(regularizers)):
    plt.semilogy(tr_obj_mean[r, :], label=names[r])
plt.legend()

# Print the results
print(tabulate([['Tr. accuracy [%]'] + np.mean(tr_errors, axis=1).round(decimals=4).tolist(),
                ['Test. accuracy [%]'] + np.mean(tst_errors, axis=1).round(decimals=4).tolist(),
                ['Tr. times [secs.]'] + np.mean(tr_times, axis=1).round(decimals=4).tolist(),
                ['Sparsity [%]'] + np.mean(sparsity_weights, axis=1).round(decimals=4).tolist(),
                ['Neurons'] + np.mean(sparsity_neurons, axis=1).round(decimals=4).tolist()],
               headers=['']+names))
Here is my defined regularizer, pen_S(l), but when I run the code to train the network, I am prompted with 'TypeError: cost must be a scalar.' I think the output of pen_S is already a scalar, though.
Can anyone help me with this?
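A guess at the cause, offered as an assumption rather than a certainty: T.zeros((1, 1)) is a 2-d tensor of shape (1, 1), so the accumulated cost stays 2-d, whereas Theano's check requires a true scalar, i.e. a 0-d variable. A tiny sketch of the distinction:

import theano.tensor as T
c = T.zeros((1, 1))
print(c.ndim)        # 2 -> a (1, 1) matrix, which trips 'cost must be a scalar'
print(c.sum().ndim)  # 0 -> a genuine Theano scalar

So either accumulating into a 0-d variable (e.g. starting from T.as_tensor_variable(0.0)) or returning cost.sum() from pen_S would hand the updates a scalar objective.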
I have been trying to create a simple standard neural network from scratch, but I can't seem to get it to work normally. Sometimes the cost skyrockets; other times the cost doesn't change at all. I'm not sure what the problem is, but it would be really helpful if someone could help me.
I have all of the information on GitHub. If any more information is needed, kindly reply and I will provide it.
https://github.com/enriquedellxps/Neural-Network
Function for generating batches:
def batcher(data, batch_size):
    # get the number of batches
    num_batches_norem = data.shape[1] // batch_size
    if data.shape[1] % batch_size == 0:
        remainder_quantity = 0
    else:
        remainder_size = data.shape[1] % batch_size
        remainder_quantity = 1
    num_batches = num_batches_norem + remainder_quantity
    changer = 0
    for mb in range(num_batches_norem):
        current_batch = data[:, changer:changer + batch_size]
        changer += batch_size
        yield current_batch
    for last_mb in range(remainder_quantity):
        last_batch = data[:, changer:changer + remainder_size]
        yield last_batch
Function for g(z):
def activationer(a, z):
    # ACTIVATION FUNCTIONS

    # Sigmoid activation function
    def sigmoid(z):
        g = scipy.special.expit(z)
        return g

    # Tanh (hyperbolic tangent) activation function
    def tanh(z):
        g = (np.exp(z) - np.exp(-1 * z)) / (np.exp(z) + np.exp(-1 * z))
        return g

    # ReLU (rectified linear unit) activation function
    def ReLU(z):
        g = np.maximum(0, z)
        return g

    # Leaky ReLU (leaky rectified linear unit) activation function
    def Leaky_ReLU(z):
        g = np.maximum(0.01 * z, z)
        return g

    # Softmax activation function
    def softmax(z):
        z_exp = np.exp(z)
        g = z_exp / np.sum(z_exp, axis=0, keepdims=True)
        return g

    if a == "sigmoid":
        res = sigmoid(z)
    elif a == "tanh":
        res = tanh(z)
    elif a == "relu":
        res = ReLU(z)
    elif a == "leaky relu":
        res = Leaky_ReLU(z)
    elif a == "softmax":
        res = softmax(z)
    return res
NN Class:
class DeepNeuralNetwork:
    def __init__(self, n_x, n_h, n_y, nl, activations, alpha):
        assert nl == len(activations), f"L: {nl}, Number of Activations: {len(activations)}"
        # Assign inputs to the self object
        self.n_x = n_x
        self.n_h = n_h
        self.n_y = n_y
        self.nl = nl
        self.activations = activations
        self.alpha = alpha

    # Initialize parameters
    def initialize_parameters(self):
        n_x = self.n_x
        n_h = self.n_h
        n_y = self.n_y
        activations = self.activations
        parameters = []
        for l in range(self.nl):
            np.random.seed(8)
            if l == 0:
                if activations[l] == "relu" or "leaky relu":
                    parameters.append([np.random.randn(n_h, n_x) * np.sqrt(2 / n_x), np.zeros((n_h, 1))])  # aka W1, b1 | He
                else:
                    parameters.append([np.random.randn(n_h, n_x) * np.sqrt(1 / n_x), np.zeros((n_h, 1))])  # aka W1, b1 | Xavier
            elif l == self.nl - 1:
                if activations[l] == "relu" or "leaky relu":
                    parameters.append([np.random.randn(n_y, n_h) * np.sqrt(2 / n_h), np.zeros((n_y, 1))])  # aka WL, bL | He
                else:
                    parameters.append([np.random.randn(n_y, n_h) * np.sqrt(1 / n_h), np.zeros((n_y, 1))])  # aka WL, bL | Xavier
            else:
                if activations[l] == "relu" or "leaky relu":
                    parameters.append([np.random.randn(n_h, n_h) * np.sqrt(2 / n_h), np.zeros((n_h, 1))])  # hidden params | He
                else:
                    parameters.append([np.random.randn(n_h, n_h) * np.sqrt(1 / n_h), np.zeros((n_h, 1))])  # hidden params | Xavier
        return parameters

    # Forward propagation
    def forward_propagation(self, parameters, input_data):
        batch_size = input_data.shape[1]  # get the number of examples in the batch
        caches = []
        self.caches = caches
        current_activation = input_data  # set first activation - A0 - as the input
        caches.append(current_activation)
        for l in range(self.nl):
            W, b = parameters[l][0], parameters[l][1]  # get weights and biases for current layer
            Z = W @ current_activation + b  # compute the linear activation
            current_activation = activationer(self.activations[l], Z)  # compute the full activation
            caches.append(current_activation)
        return current_activation

    # Compute cost
    def compute_cost(self, yhat, y):
        batch_size = y.shape[1]  # get the number of examples in the batch
        cost = (1 / batch_size) * np.sum(-1 * (y * np.log(yhat) + ((1 - y) * (np.log(1 - yhat)))))  # compute the cross-entropy cost
        cost = np.squeeze(cost)  # turn [[17]] into 17
        return cost

    # Backward propagation
    def backward_propagation(self, parameters, y):
        caches = self.caches
        batch_size = y.shape[1]
        grads = []
        for l in reversed(range(1, self.nl + 1)):
            if l == self.nl:
                dZ = caches[l] - y
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
            else:
                dA = parameters[l][0].T @ dZ
                dZ = dA * np.multiply(caches[l], (1 - caches[l]))
                dW = (1 / batch_size) * dZ @ caches[l - 1].T
                db = (1 / batch_size) * np.sum(dZ, axis=1, keepdims=True)
                grads.append([dW, db])
        return grads

    # Update parameters
    def update_parameters(self, parameters, gradients):
        for l in range(self.nl):
            parameters[l][0] = parameters[l][0] - self.alpha * gradients[self.nl - l - 1][0]
            parameters[l][1] = parameters[l][1] - self.alpha * gradients[self.nl - l - 1][1]
        return parameters
Running it:
dnn = DeepNeuralNetwork(12288, 20, 1, 4, ["relu", "relu", "relu", "sigmoid"], 0.001)
params = dnn.initialize_parameters()
epochs = 100
for e in range(epochs):
    for i, j in zip(train_x_batched, train_y_batched):
        yhat = dnn.forward_propagation(params, i)
        cost = dnn.compute_cost(yhat, j)
        grads = dnn.backward_propagation(params, j)
        params = dnn.update_parameters(params, grads)
    print(cost)  # This usually starts going down, then skyrockets, even if I lower the learning rate to 0.00001
Thanks :)
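One generic way to localize this kind of blow-up, independent of the class above, is a numerical gradient check: compare each analytic gradient against a centered finite difference on a tiny batch. A minimal sketch (the wiring to the class is hypothetical):

import numpy as np

def numerical_grad(f, W, eps=1e-6):
    # centered finite-difference gradient of scalar f() with respect to array W
    g = np.zeros_like(W)
    it = np.nditer(W, flags=["multi_index"])
    while not it.finished:
        idx = it.multi_index
        old = W[idx]
        W[idx] = old + eps
        f_plus = f()
        W[idx] = old - eps
        f_minus = f()
        W[idx] = old
        g[idx] = (f_plus - f_minus) / (2 * eps)
        it.iternext()
    return g

Calling it with f = lambda: dnn.compute_cost(dnn.forward_propagation(params, X_tiny), Y_tiny) for each weight array, then comparing against the matching entry of grads, usually pinpoints the layer whose gradient is wrong.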
I'm trying to implement a neural network from scratch in order to gain better insight into it, and I've run into a weird problem. When I use the ReLU function as the activation for the hidden layers, the model does not converge, whereas it does converge once the sigmoid function is used. Here is my vanilla code: when you change the first two layers' activation function from relu to sigmoid, you can see that it converges, though it may sometimes have a problem. Where could the problem be? It's been three days and I still couldn't find it, though I did find some little bugs. Thanks in advance.
Here is the toy dataset I've been using (just paste it to where this code is located).
Dataset
import numpy as np
import pandas as pd

class NeuralNetwork():
    def __init__(self, epoch=10000, alpha=0.075, algorithm="gradient_descent"):
        # hyperparameters
        self.epoch = epoch
        self.alpha = alpha
        self.algorithm = algorithm
        # parameters
        self.params = {}
        self.layer_no = 1
        # logs
        self.cost_vals = []

    def createLayer(self, size, activation_func, randomness=True):
        if randomness == True:
            self.params["W" + str(self.layer_no)] = np.random.randn(size[0], size[1]) * 0.01
        else:
            self.params["W" + str(self.layer_no)] = np.zeros(size)
        self.params["b" + str(self.layer_no)] = np.zeros((size[0], 1))
        self.params["func" + str(self.layer_no)] = activation_func
        self.layer_no += 1

    def sigmoid(self, X):
        return 1 / (1 + np.exp(-X))

    def relu(self, X):
        return np.maximum(X, 0) * 0.01

    def tanh(self, X):
        return (np.exp(X) - np.exp(-X)) / (np.exp(X) + np.exp(-X))

    def derivative_sigmoid(self, X):
        der_x = self.sigmoid(X)
        return der_x * (1 - der_x)

    def derivative_relu(self, X):
        X[X <= 0] = 0
        X[X > 0] = 1
        return X

    def derivative_tanh(self, X):
        tanhx = self.tanh(X)
        return 1 - np.power(tanhx, 2)

    def activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.sigmoid(Zl)
        elif act_func_name == "relu":
            return self.relu(Zl)
        elif act_func_name == "tanh":
            return self.tanh(Zl)

    def derivative_activation_function(self, Zl, act_func_name):
        if act_func_name == "sigmoid":
            return self.derivative_sigmoid(Zl)
        elif act_func_name == "relu":
            return self.derivative_relu(Zl)
        elif act_func_name == "tanh":
            return self.derivative_tanh(Zl)

    def train(self, X, Y):
        m = Y.shape[0]  # number of training examples
        self.params["A0"] = X
        self.params["Z0"] = None
        for i in range(self.epoch):
            # forward prop
            for l in range(1, self.layer_no):  # 1, 2, 3
                Zl = np.dot(self.params["W" + str(l)], self.params["A" + str(l - 1)]) + self.params["b" + str(l)]  # linear function of a layer with vectorization
                Al = self.activation_function(Zl, self.params["func" + str(l)])  # activated form of Zl
                self.params["Z" + str(l)] = Zl
                self.params["A" + str(l)] = Al
            # cost function
            cost_val = -1 / m * np.sum(np.multiply(Y, np.log(Al)) + np.multiply((1 - Y), np.log(1 - Al)))
            cost_val = np.squeeze(cost_val)
            if i % 500 == 0:
                print(cost_val)
                self.cost_vals.append(cost_val)
            # backward prop
            dAl = -(np.divide(Y, Al) - np.divide(1 - Y, 1 - Al))  # gradient of last layer of A
            for l in reversed(range(1, self.layer_no)):  # 3, 2, 1
                dZl = np.multiply(dAl,
                                  self.derivative_activation_function(self.params["Z" + str(l)], self.params["func" + str(l)]))  # gradient of layer l of Z
                dAl1 = np.dot(self.params["W" + str(l)].T, dZl)  # gradient of previous layer of A
                dWl = 1 / m * np.dot(dZl, self.params["A" + str(l - 1)].T)  # gradient of parameters W in layer l
                dbl = 1 / m * np.sum(dZl, axis=1, keepdims=True)  # gradient of parameters b in layer l
                # update parameters
                self.params["W" + str(l)] -= self.alpha * dWl
                self.params["b" + str(l)] -= self.alpha * dbl
                dAl = dAl1  # assign gradient of previous layer of A to the current one so as to use it while back-propagating

def iris_data():
    from sklearn.model_selection import train_test_split
    datas = pd.read_csv('iris_nn.data').to_numpy()
    X = datas[:, 0:4].astype(float)
    Y = datas[:, 4:5]
    Y = np.asarray([1 if (y == 'Iris-setosa') else 0 for y in Y]).reshape((Y.shape[0], 1))
    X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25, random_state=0)
    return X_train.T, Y_train.T

X, Y = iris_data()
model = NeuralNetwork()
model.createLayer((5, 4), "relu")
model.createLayer((7, 5), "relu")
model.createLayer((1, 7), "sigmoid")
model.train(X, Y)
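Two things in the ReLU pair above might deserve a second look (observations about the posted code, not a confirmed diagnosis): relu returns np.maximum(X, 0) * 0.01, which scales every positive activation down to 1% while derivative_relu still reports a slope of 1, so the gradient no longer matches the forward function; and derivative_relu modifies X in place, overwriting the cached Z arrays stored in self.params. A side-effect-free pair for plain ReLU (a sketch) would be:

def relu(self, X):
    return np.maximum(X, 0)  # plain ReLU, no extra scaling

def derivative_relu(self, X):
    return (X > 0).astype(float)  # slope 1 for X > 0, else 0; leaves X untouched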