Why is RPROP not updating the training accuracy?

I need help with my machine learning code. I am trying to train my network with RPROP, but the training accuracy never updates.
Here's the code:
import theano
import theano.tensor as T
from theano.ifelse import ifelse
import numpy as np
from sklearn import datasets
from sklearn.model_selection import train_test_split
import math

# GETTING TEST DATA
def get_iris_data():
    """ Read the iris data set and split it into training and test sets """
    iris = datasets.load_iris()
    data = iris.data
    target = iris.target
    # Prepend the column of 1s for bias
    N, M = data.shape
    all_X = np.ones((N, M + 1))
    all_X[:, 1:] = data
    # Convert into one-hot vectors
    num_labels = len(np.unique(target))
    all_Y = np.eye(num_labels)[target]  # One-liner trick!
    return train_test_split(all_X, all_Y, test_size=0.33)

# SETTING INITIAL WEIGHT VALUES
def init_weights(shape):
    """ Weight initialization """
    weights = np.asarray(np.random.randn(*shape) * 0.01, dtype=theano.config.floatX)
    return theano.shared(weights)

def feedforward(X, w1, w2):
    hidden = T.nnet.sigmoid(T.dot(X, w1))
    out = T.nnet.softmax(T.dot(hidden, w2))
    return out

def rprop(cost, params, learning_rate):
    """ Back-propagation """
    # RPROP variables
    updates = []
    gradients = T.grad(cost=cost, wrt=params)
    # Default values
    prevparams = params
    deltaMax = 50.
    deltaMin = math.exp(-6)
    deltas = -0.1 * np.ones(len(params))
    prevgradients = np.zeros(len(params))
    npos = 1.2
    nneg = 0.5
    # All values
    allvalues = zip(params, prevparams, gradients, deltas, prevgradients)
    for param, prevparam, gradient, delta, prevgradient in allvalues:
        polarity = T.sgn(gradient * prevgradient)
        prevdelta = delta
        if T.gt(polarity, 0):
            delta = T.minimum(prevdelta * npos, deltaMax)
            change = -T.sgn(gradient) * delta
            prevgradient = gradient
        elif T.lt(polarity, 0):
            delta = T.maximum(prevdelta * nneg, deltaMin)
            prevgradient = 0
            change = -prevgradient
        else:
            change = -T.sgn(gradient) * delta
            prevgradient = gradient
        updates.append((param, param - change * learning_rate))
    return updates

# MAIN FUNCTION
def main():
    # Initialization of variables and data
    train_X, test_X, train_Y, test_Y = get_iris_data()
    learning_rate = 0.01
    X = T.fmatrix()
    Y = T.fmatrix()
    # Set ANN network size
    in_size = train_X.shape[1]
    hid_size = 256
    out_size = train_Y.shape[1]
    # Set weights in between
    w1 = init_weights((in_size, hid_size))
    w2 = init_weights((hid_size, out_size))
    # Forward propagation function for neuron activation and transfer function
    yHat = feedforward(X, w1, w2)
    # Backpropagation for correction
    cost = T.mean(T.nnet.categorical_crossentropy(yHat, Y))
    params = [w1, w2]
    updates = rprop(cost, params, learning_rate)
    # Train and predict
    train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
    pred_Y = T.argmax(yHat, axis=1)
    predict = theano.function(inputs=[X], outputs=pred_Y, allow_input_downcast=True)
    # Run SGD
    for iter in range(2000):
        for i in range(len(train_X)):
            train(train_X[i: i + 1], train_Y[i: i + 1])
        train_accuracy = np.mean(np.argmax(train_Y, axis=1) == predict(train_X))
        test_accuracy = np.mean(np.argmax(test_Y, axis=1) == predict(test_X))
        print("Iteration = %d, train accuracy = %.2f%%, test accuracy = %.2f%%"
              % (iter + 1, 100 * train_accuracy, 100 * test_accuracy))

if __name__ == '__main__':
    main()
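A note on why the accuracy may never move: `T.gt(polarity, 0)` and `T.lt(polarity, 0)` build symbolic expressions, not Python booleans, so the `if`/`elif`/`else` inside `rprop` is resolved once at graph-construction time (a symbolic expression object is always truthy), and `delta`/`prevgradient` are plain Python locals that are never carried across calls to `train`. Below is a minimal sketch of how sign-based RPROP logic can be expressed symbolically, with the per-weight state held in shared variables and the branching done element-wise via `T.switch`. The helper name `rprop_updates` and the iRPROP−-style handling of sign flips are my own illustrative choices, not code from the question:

import numpy as np
import theano
import theano.tensor as T

def rprop_updates(cost, params, npos=1.2, nneg=0.5,
                  delta_init=0.1, delta_min=1e-6, delta_max=50.0):
    """Sketch of sign-based RPROP updates with symbolic, element-wise branching."""
    updates = []
    for param, grad in zip(params, T.grad(cost, params)):
        shape = param.get_value().shape
        # Persistent per-weight state, carried between calls to the train function
        prev_grad = theano.shared(np.zeros(shape, dtype=theano.config.floatX))
        delta = theano.shared(delta_init * np.ones(shape, dtype=theano.config.floatX))
        sign = grad * prev_grad  # > 0: same direction, < 0: gradient sign flipped
        new_delta = T.cast(T.clip(
            T.switch(T.gt(sign, 0), delta * npos,
                     T.switch(T.lt(sign, 0), delta * nneg, delta)),
            delta_min, delta_max), theano.config.floatX)
        # iRPROP-: on a sign flip, skip the step and zero the stored gradient
        step = T.switch(T.lt(sign, 0), T.zeros_like(grad), -T.sgn(grad) * new_delta)
        updates.append((param, param + step))
        updates.append((delta, new_delta))
        updates.append((prev_grad, T.switch(T.lt(sign, 0), T.zeros_like(grad), grad)))
    return updates

The returned list drops into `theano.function(..., updates=rprop_updates(cost, params))` just like the original `updates`; no learning rate is needed, since the step size is the per-weight `delta` itself.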

Related

for loop sending wrong data to list

Below is the code; I am running a for loop to train on different training sizes. The first iteration works correctly: when training begins, the training and validation accuracies are appended to lists, then put into a DataFrame, and finally written to a CSV. But on the subsequent iterations, a data generator is appended to the list instead. Can anyone see where the issue is? I can't find it.
Also if you have a better way of doing this (data compiling for analysis), I'm all ears.
The first block is the code snippet, the second block is the full code. The for loop starts about halfway down.
for i in range(1, 6):
    training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
    train_accuracy = []
    val_accuracy = []
    start_time = time.time()
    for epoch in tqdm(range(1, epochs + 1), total=epochs):
        train()
        train_acc = test(training_loader)
        train_accuracy.append(train_acc)
        val_acc = test(validation_loader)
        val_accuracy.append(val_acc)
    accuracy = pd.DataFrame()
    accuracy['train_acc'] = train_accuracy
    accuracy['val_acc'] = val_accuracy
    accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
import sys
sys.path.insert(0, 'C:\\Users\\user\\Desktop\\imbalance_exp\\imbalance_exp\\imbalance_exp')
import torch
from torch_geometric.loader import DataLoader
import imb_dataset as imb
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
from torch_geometric.nn import global_mean_pool
import neptune.new as neptune
import pandas as pd
from sklearn.metrics import confusion_matrix, matthews_corrcoef
import seaborn as sns
from neptune.new.types import File
from tqdm import tqdm
import time

known = imb.ImbalanceDataset(root='imb_50v2', set='known', split=0.5)
unknown = imb.ImbalanceDataset(root='imb_50v2', set='unknown', split=0.5)
all_data = imb.ImbalanceDataset(root='imb_50v2', set='All', split=None)

torch.manual_seed(12345)
known = known.shuffle()

lr = 0.001
training_perc = 0.9
N = len(known)
mini_batch_size = 32
epochs = 600
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
case = 2

class switcher:
    def sets(self, case):
        default = known
        return getattr(self, 'case_' + str(case), lambda: default)()

    def case_1(self):
        training_set = known[:int(training_perc*len(known))]
        validation_set = known[int(training_perc*len(known)):]
        training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
        validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
        training_ones = []
        training_zeros = []
        validation_ones = []
        validation_zeros = []
        for i in range(len(training_set)):
            if training_set[i].y == 1:
                training_ones.append(training_set[i])
            else:
                training_zeros.append(training_set[i])
        for i in range(len(validation_set)):
            if validation_set[i].y == 1:
                validation_ones.append(validation_set[i])
            else:
                validation_zeros.append(validation_set[i])
        return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros

    def case_2(self):
        one_index = round(len(known) * 0.25)
        known_ones = known[:one_index].copy()
        known_ones.shuffle()
        known_zeros = known[one_index:].copy()
        known_zeros.shuffle()
        training_ones = known_ones[:int(training_perc*len(known_ones))]
        training_zeros = known_zeros[:len(training_ones)]
        training_set = torch.utils.data.ConcatDataset([training_ones, training_zeros])
        validation_ones = known_ones[int(training_perc*len(known_ones)):]
        validation_zeros = known_zeros[len(training_ones):]
        validation_set = torch.utils.data.ConcatDataset([validation_ones, validation_zeros])
        training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
        validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
        training_ones = []
        training_zeros = []
        validation_ones = []
        validation_zeros = []
        for i in range(len(training_set)):
            if training_set[i].y == 1:
                training_ones.append(training_set[i])
            else:
                training_zeros.append(training_set[i])
        for i in range(len(validation_set)):
            if validation_set[i].y == 1:
                validation_ones.append(validation_set[i])
            else:
                validation_zeros.append(validation_set[i])
        return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros

class GCN(torch.nn.Module):
    def __init__(self, hidden_channels):
        super(GCN, self).__init__()
        torch.manual_seed(12345)
        self.conv1 = GraphConv(known.num_node_features, hidden_channels)
        self.conv2 = GraphConv(hidden_channels, hidden_channels)
        self.conv3 = GraphConv(hidden_channels, hidden_channels)
        self.lin = Linear(hidden_channels, known.num_classes)

    def forward(self, x, edge_index, batch):
        # 1. Obtain node embeddings
        x = self.conv1(x, edge_index)
        x = x.relu()
        x = self.conv2(x, edge_index)
        x = x.relu()
        x = self.conv3(x, edge_index)
        # 2. Readout layer
        x = global_mean_pool(x, batch)  # [batch_size, hidden_channels]
        # 3. Apply a final classifier
        x = F.dropout(x, p=0.5, training=self.training)
        x = self.lin(x)
        return x

model = GCN(hidden_channels=64).to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()

def train():
    model.train()
    total_loss = 0
    for data in training_loader:  # Iterate in batches over the training dataset.
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)  # Perform a single forward pass.
        loss = criterion(out, data.y)  # Compute the loss solely based on the training nodes.
        loss.backward()  # Derive gradients.
        optimizer.step()  # Update parameters based on gradients.
        optimizer.zero_grad()  # Clear gradients.

def test(loader):
    model.eval()
    correct = 0
    for data in loader:  # Iterate in batches over the training/test dataset.
        data = data.to(device)
        out = model(data.x, data.edge_index, data.batch)
        pred = out.argmax(dim=1)  # Use the class with highest probability.
        correct += int((pred == data.y).sum())  # Check against ground-truth labels.
    return correct / len(loader.dataset)  # Derive ratio of correct predictions.

output_frame = pd.DataFrame(columns=['epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known', 'precision_all', 'recall_all', 'f1_all', 'accuracy_all', 'mcc_all', 'precision_known', 'recall_known', 'f1_known', 'accuracy_known', 'mcc_known', 'time_elapsed'])

for i in range(1, 6):
    training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
    train_accuracy = []
    val_accuracy = []
    start_time = time.time()
    for epoch in tqdm(range(1, epochs + 1), total=epochs):
        train()
        train_acc = test(training_loader)
        train_accuracy.append(train_acc)
        val_acc = test(validation_loader)
        val_accuracy.append(val_acc)
    accuracy = pd.DataFrame()
    accuracy['train_acc'] = train_accuracy
    accuracy['val_acc'] = val_accuracy
    accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
    unknown_loader = DataLoader(unknown, batch_size=1, shuffle=False)
    predictions = []
    all_correct = 0
    known_correct = 0
    for test in unknown_loader:
        test = test.to(device)
        out = model(test.x, test.edge_index, test.batch)
        pred = out.argmax(dim=1)
        predictions.append(pred)
        all_correct += int((pred == test.y_all).sum())
        known_correct += int((pred == test.y_known).sum())
    pred_df = pd.DataFrame()
    pred_df['y_all_true'] = [i.item() for i in unknown.data.y_all]
    pred_df['y_known_true'] = [i.item() for i in unknown.data.y_known]
    pred_df['y_pred'] = [i.item() for i in predictions]
    pred_df.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\pred_df_{i}.csv')
    cf_matrix_all = confusion_matrix(pred_df['y_all_true'], pred_df['y_pred'])
    ax = sns.heatmap(cf_matrix_all, annot=True, fmt='g', cmap='Blues')
    ax.title.set_text('Confusion Matrix based on all data')
    tn_all, fp_all, fn_all, tp_all = cf_matrix_all.ravel()
    end_time = time.time()
    time_elapsed = end_time - start_time
    precision_all = tp_all / (tp_all + fp_all)
    recall_all = tp_all / (tp_all + fn_all)
    f1_all = 2 * (precision_all * recall_all) / (precision_all + recall_all)
    accuracy_all = (tp_all + tn_all) / (tp_all + tn_all + fp_all + fn_all)
    mcc_all = matthews_corrcoef(pred_df['y_all_true'], pred_df['y_pred'])
    cf_matrix_known = confusion_matrix(pred_df['y_known_true'], pred_df['y_pred'])
    ax = sns.heatmap(cf_matrix_known, annot=True, fmt='g', cmap='Blues')
    ax.title.set_text('Confusion Matrix based on known data')
    tn_known, fp_known, fn_known, tp_known = cf_matrix_known.ravel()
    precision_known = tp_known / (tp_known + fp_known)
    recall_known = tp_known / (tp_known + fn_known)
    f1_known = 2 * (precision_known * recall_known) / (precision_known + recall_known)
    accuracy_known = (tp_known + tn_known) / (tp_known + tn_known + fp_known + fn_known)
    mcc_known = matthews_corrcoef(pred_df['y_known_true'], pred_df['y_pred'])
    # 'epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known'
    output_frame.loc[i] = [epochs, lr, len(known), len(unknown), len(training_ones), len(training_zeros), len(validation_ones), len(validation_zeros), tn_all, fp_all, fn_all, tp_all, tn_known, fp_known, fn_known, tp_known, precision_all, recall_all, f1_all, accuracy_all, mcc_all, precision_known, recall_known, f1_known, accuracy_known, mcc_known, time_elapsed]
    output_frame.to_csv('C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\final_output.csv')
    training_perc -= 0.2
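A likely culprit is name shadowing in the evaluation loop near the bottom: `for test in unknown_loader:` rebinds the module-level name `test`, which until then referred to the `test(loader)` function. On the next pass of the outer `for i in range(1, 6)` loop, `train_acc = test(training_loader)` no longer calls that function; in the PyTorch Geometric versions I have seen, calling a `Data` object (`Data.__call__`) yields its key/value pairs, i.e. a generator, which would match the generator you see landing in the list. A minimal sketch of the fix is simply to rename the loop variable (`sample` here is an arbitrary name of my choosing):

# Rename the loop variable so the test() function is not shadowed.
for sample in unknown_loader:
    sample = sample.to(device)
    out = model(sample.x, sample.edge_index, sample.batch)
    pred = out.argmax(dim=1)
    predictions.append(pred)
    all_correct += int((pred == sample.y_all).sum())
    known_correct += int((pred == sample.y_known).sum())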

parameter estimation using tensorflow

I am trying to create a neural network using TensorFlow, without the Keras API. I have some parameters to estimate (weights, biases, and a few other model parameters). The code runs, but the parameter estimates are really bad and the error percentage is very high, even though the loss is small. What is the problem here? I have tried many approaches with no improvement.
I also tried writing my own optimizer, but the process is slow and the error remains large. Is there a proper way to apply an optimizer to these parameters?
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from pyDOE import lhs
import math as ma

class PhysicsInformedNN:
    def __init__(self, X_n, v, layers, lb, ub):
        self.lb = lb
        self.ub = ub
        self.layers = layers
        self.dx_n = tf.convert_to_tensor(X_n[:, 0:1], dtype='float32')
        self.t_n = tf.convert_to_tensor(X_n[:, 1:2], dtype='float32')
        self.v_r = tf.convert_to_tensor(v, dtype='float32')
        self.lambda_1 = tf.Variable(0, dtype='float32')  # 1.5
        self.lambda_2 = tf.Variable(-6, dtype='float32')
        self.para = [self.lambda_1, self.lambda_2]
        self.weights, self.biases = self.initialize_NN(layers)

    def initialize_NN(self, layers):
        weights = []
        biases = []
        num_layers = len(layers)
        for l in range(0, num_layers - 1):
            W = self.xavier_init(size=[layers[l], layers[l + 1]])
            b = tf.Variable(tf.zeros([1, layers[l + 1]], dtype='float32'), dtype='float32')
            weights.append(W)
            biases.append(b)
        return weights, biases

    def xavier_init(self, size):
        in_dim = size[0]
        out_dim = size[1]
        xavier_stddev = np.sqrt(2 / (in_dim + out_dim))
        return tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype='float32')

    def neural_net(self, X, weights, biases):
        num_layers = len(weights) + 1
        H = 2.0 * (X - self.lb) / (self.ub - self.lb) - 1.0
        for l in range(0, num_layers - 2):
            W = weights[l]
            b = biases[l]
            H = tf.math.tanh(tf.math.add(tf.linalg.matmul(H, W), b))
        W = weights[-1]
        b = biases[-1]
        Y = tf.math.add(tf.linalg.matmul(H, W), b)
        return Y

    def net_u(self, x, t):
        v = self.neural_net(tf.concat([x, t], 1), self.weights, self.biases)
        return v

    def net_f(self, x, t):
        lambda_1 = self.para[0]
        lambda_2 = tf.exp(self.para[1])
        with tf.GradientTape(persistent=True) as tape:
            tape.watch(t)
            tape.watch(x)
            u = self.net_u(x, t)
            u_x = tape.gradient(u, x)
        u_t = tape.gradient(u, t)
        u_xx = tape.gradient(u_x, x)
        f = u_t + lambda_1 * u * u_x - lambda_2 * u_xx
        del tape
        return f

    def callback(self, loss, n):
        print('Loss:', loss, ' Epoch : ', n)

    def train(self, epoch):
        for i in range(epoch):
            with tf.GradientTape(persistent=True) as tape:
                tape.watch(self.weights)
                tape.watch(self.biases)
                tape.watch(self.para)
                f_pred = self.net_f(self.dx_n, self.t_n)
                v_pred = self.net_u(self.dx_n, self.t_n)
                loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
            dw = tape.gradient(loss, self.weights)
            db = tape.gradient(loss, self.biases)
            dp = tape.gradient(loss, self.para)
            lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
                initial_learning_rate=1e-2,
                decay_steps=10000,
                decay_rate=0.9)
            optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)
            optimizer1.apply_gradients(zip(dw, self.weights))
            optimizer1.apply_gradients(zip(db, self.biases))
            optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)
            optimizer2.apply_gradients(zip(dp, self.para))
            del tape
            self.callback(loss, i)

    def predict(self, X_star):
        v_star = self.net_u(X_star[:, 0:1], X_star[:, 1:2])
        f_star = self.net_f(X_star[:, 0:1], X_star[:, 1:2])
        para_last = self.para
        return v_star, f_star, para_last

if __name__ == '__main__':
    # PARAMETERS for the problem
    np.random.seed(123)
    nu = 0.01 / np.pi
    layers = [2, 20, 20, 20, 20, 1]
    N_u = 2000
    data = scipy.io.loadmat('burgers_shock.mat')
    t = data['t'].flatten()[:, None]
    x = data['x'].flatten()[:, None]
    Exact = np.real(data['usol']).T
    X, T = np.meshgrid(x, t)
    X_star = np.hstack((X.flatten()[:, None], T.flatten()[:, None]))
    u_star = Exact.flatten()[:, None]
    lb = X_star.min(0)
    ub = X_star.max(0)
    idx = np.random.choice(X_star.shape[0], N_u, replace=False)
    X_u_train = X_star[idx, :]
    u_train = u_star[idx, :]
    model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
    model.train(1000)
    X_star = tf.convert_to_tensor(X_star, dtype='float32')
    u_pred, f_pred, param = model.predict(X_star)
    error_lambda_1 = np.abs(param[0] - 1.0) * 100
    error_lambda_2 = np.abs(np.exp(param[1]) - nu) / nu * 100
    print(error_lambda_1, error_lambda_2)
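One structural issue worth flagging: `train()` constructs two fresh `tf.keras.optimizers.Adam` objects (plus an unused learning-rate schedule) on every iteration, so Adam's running first- and second-moment estimates are reset at each step and the update degenerates into roughly rescaled gradient descent. Below is a sketch of the same loop with the optimizers hoisted out, reusing the names from the class above; treat it as an illustration of the change, not a verified fix for the estimation error:

# Sketch: create the optimizers once so Adam's moment state accumulates
# across iterations; re-creating them every step (as above) resets it.
def train(self, epoch):
    optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)  # weights and biases
    optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)   # lambda_1, lambda_2
    for i in range(epoch):
        with tf.GradientTape(persistent=True) as tape:
            # tf.Variable objects are watched automatically; no tape.watch needed
            f_pred = self.net_f(self.dx_n, self.t_n)
            v_pred = self.net_u(self.dx_n, self.t_n)
            loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
        dw = tape.gradient(loss, self.weights)
        db = tape.gradient(loss, self.biases)
        dp = tape.gradient(loss, self.para)
        del tape
        optimizer1.apply_gradients(zip(dw, self.weights))
        optimizer1.apply_gradients(zip(db, self.biases))
        optimizer2.apply_gradients(zip(dp, self.para))
        self.callback(loss, i)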

[Theano]TypeError: cost must be a scalar

I am working on a research project that requires me to write a custom regularizer for a DNN. Here is the code:
import lasagne
from lasagne.nonlinearities import leaky_rectify, softmax
import theano, theano.tensor as T
import numpy as np
import sklearn.datasets, sklearn.preprocessing, sklearn.model_selection
import matplotlib.pyplot as plt
from tabulate import tabulate
import time
import math

# psi function that will be used in the penalty function
def psi(g, l):
    m = g.shape[1]
    C = (1/T.pow(2, m)) * (1/T.pow(math.pi, ((m-1)/2))) / (T.gamma((m+1)/2))
    logDens = T.log(C) + m*T.log(l) - l*T.sqrt(T.sum(g**2))
    dens = T.exp(logDens)
    return dens

# pStar function that will be used in the penalty function
def pStar(g, lambda1, lambda0, theta):
    psi1 = psi(g, lambda1)
    psi0 = psi(g, lambda0)
    # if a coefficient is really large then both of these will numerically be zero
    if theta*psi1 == 0 and (1-theta)*psi0 == 0:
        p = 1
    else:
        p = (theta*psi1) / (theta*psi1 + (1 - theta)*psi0)
    return p

# Separable
def pen_S(l):
    theta = 0.5
    lambda1 = 1
    lambda0 = 12
    for j in range(len(l)):
        t = l[j]
        m = t.shape[1]
        n = t.shape[0].eval()
        cost = T.zeros((1, 1))
        for i in range(n):
            g = t[i]
            temp = -lambda1*T.sum(g**2) + T.log(pStar(T.zeros((1, m)), lambda1, lambda0, theta)/pStar(g, lambda1, lambda0, theta))
            cost = cost + temp
    return cost

# Number of simulations
N_runs = 1
# Maximum number of epochs
max_epochs = 1500
# Define number of layers and number of neurons
H_layers = np.asarray([40, 20])
# Minibatch size
batch_size = 300
# Lasagne regularizers to be tested
regularizers = [pen_S]
# Define the regularization factors for each algorithm
reg_factors = [10**-3.5]
# Define the names (for display purposes)
names = ['SSGL_Sep']

# Load the dataset (DIGITS)
digits = sklearn.datasets.load_digits()
X = digits.data
y = digits.target
# MNIST
# mnist = sklearn.datasets.fetch_mldata('MNIST original', data_home='C:/Users/ISPAMM/Downloads')
# X = mnist.data
# y = mnist.target

# Preprocessing (input)
scaler = sklearn.preprocessing.MinMaxScaler()
X = scaler.fit_transform(X)

# Output structures
tr_errors = np.zeros((len(regularizers), N_runs))
tst_errors = np.zeros((len(regularizers), N_runs))
tr_times = np.zeros((len(regularizers), N_runs))
tr_obj = np.zeros((len(regularizers), N_runs, max_epochs))
sparsity_weights = np.zeros((len(regularizers), N_runs, len(H_layers)+1))
sparsity_neurons = np.zeros((len(regularizers), N_runs, len(H_layers)+1))

# Define the input and output symbolic variables
input_var = T.matrix(name='X')
target_var = T.ivector(name='y')

# Utility function for minibatches
def iterate_minibatches(inputs, targets, batchsize, shuffle=False):
    assert len(inputs) == len(targets)
    if shuffle:
        indices = np.arange(len(inputs))
        np.random.shuffle(indices)
    for start_idx in range(0, len(inputs) - batchsize + 1, batchsize):
        if shuffle:
            excerpt = indices[start_idx:start_idx + batchsize]
        else:
            excerpt = slice(start_idx, start_idx + batchsize)
        yield inputs[excerpt], targets[excerpt]

for k in np.arange(0, N_runs):
    print("Run ", k+1, " of ", N_runs, "...\n", end="")
    # Split the data
    X_train, X_test, y_train, y_test = sklearn.model_selection.train_test_split(X, y, test_size=0.25)
    # Define the network structure
    network = lasagne.layers.InputLayer((None, X.shape[1]), input_var)
    for h in H_layers:
        network = lasagne.layers.DenseLayer(network, h, nonlinearity=leaky_rectify, W=lasagne.init.GlorotNormal())
    network = lasagne.layers.DenseLayer(network, len(np.unique(y)), nonlinearity=softmax, W=lasagne.init.GlorotNormal())
    params_original = lasagne.layers.get_all_param_values(network)
    params = lasagne.layers.get_all_params(network, trainable=True)
    # Define the loss function
    prediction = lasagne.layers.get_output(network)
    loss = lasagne.objectives.categorical_crossentropy(prediction, target_var)
    # Define the test function
    test_prediction = lasagne.layers.get_output(network, deterministic=True)
    test_acc = T.mean(T.eq(T.argmax(test_prediction, axis=1), target_var),
                      dtype=theano.config.floatX)
    test_fn = theano.function([input_var, target_var], test_acc, allow_input_downcast=True)
    for r in np.arange(0, len(regularizers)):
        # Set to original parameters
        lasagne.layers.set_all_param_values(network, params_original)
        # Define the regularized loss function
        loss_reg = loss.mean() + reg_factors[r] * lasagne.regularization.regularize_network_params(network, regularizers[r])
        # Update function
        # updates_reg = lasagne.updates.nesterov_momentum(loss_reg, params, learning_rate=0.01)
        updates_reg = lasagne.updates.adam(loss_reg, params)
        # Training function
        train_fn = theano.function([input_var, target_var], loss_reg, updates=updates_reg, allow_input_downcast=True)
        # Train network
        print("\tTraining with ", names[r], " regularization, epoch: ", end="")
        start = time.time()
        for epoch in range(max_epochs):
            loss_epoch = 0
            batches = 0
            if np.mod(epoch, 10) == 0:
                print(epoch, "... ", end="")
            for batch in iterate_minibatches(X_train, y_train, batch_size, shuffle=True):
                input_batch, target_batch = batch
                loss_epoch += train_fn(input_batch, target_batch)
                batches += 1
            tr_obj[r,k,epoch] = loss_epoch/batches
        end = time.time()
        tr_times[r,k] = end - start
        print(epoch, ".")
        # Final test with accuracy
        print("\tTesting the network with ", names[r], " regularization...")
        tr_errors[r,k] = test_fn(X_train, y_train)
        tst_errors[r,k] = test_fn(X_test, y_test)
        # Check sparsity
        params_trained = lasagne.layers.get_all_param_values(network, trainable=True)
        sparsity_weights[r,k,:] = [1-(x.round(decimals=3).ravel().nonzero()[0].shape[0]/x.size) for x in params_trained[0::2]]
        sparsity_neurons[r,k,:] = [x.round(decimals=3).sum(axis=1).nonzero()[0].shape[0] for x in params_trained[0::2]]

tr_obj_mean = np.mean(tr_obj, axis=1)

# Plot the average loss
plt.figure()
plt.title('Training objective')
for r in np.arange(0, len(regularizers)):
    plt.semilogy(tr_obj_mean[r, :], label=names[r])
plt.legend()

# Print the results
print(tabulate([['Tr. accuracy [%]'] + np.mean(tr_errors, axis=1).round(decimals=4).tolist(),
                ['Test. accuracy [%]'] + np.mean(tst_errors, axis=1).round(decimals=4).tolist(),
                ['Tr. times [secs.]'] + np.mean(tr_times, axis=1).round(decimals=4).tolist(),
                ['Sparsity [%]'] + np.mean(sparsity_weights, axis=1).round(decimals=4).tolist(),
                ['Neurons'] + np.mean(sparsity_neurons, axis=1).round(decimals=4).tolist()],
               headers=['']+names))
The regularizer is pen_S(l) above, but when I run the code to train the network, I am prompted with 'TypeError: cost must be a scalar.' I thought the output of pen_S was already a scalar.
Can anyone help me with this?
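Regarding the error itself: the cost handed to Theano's gradient machinery must be a 0-dimensional (scalar) variable, but pen_S accumulates into `cost = T.zeros((1, 1))`, which is a 1x1 matrix, so `loss_reg` ends up 2-d; that is exactly what the error message complains about. (Note also that the accumulator is re-created inside the `for j` loop, so contributions from all but the last tensor are discarded.) A minimal sketch of a scalar-returning version, keeping the rest of the setup above unchanged:

# Sketch: start from a 0-d scalar accumulator and accumulate across layers
# instead of resetting it inside the loop.
def pen_S(l):
    theta = 0.5
    lambda1 = 1
    lambda0 = 12
    cost = T.constant(0., dtype=theano.config.floatX)  # 0-d (scalar) accumulator
    for j in range(len(l)):
        t = l[j]
        m = t.shape[1]
        n = t.shape[0].eval()
        for i in range(n):
            g = t[i]
            cost = cost + (-lambda1 * T.sum(g ** 2)
                           + T.log(pStar(T.zeros((1, m)), lambda1, lambda0, theta)
                                   / pStar(g, lambda1, lambda0, theta)))
    return cost

Equivalently, you could keep the (1, 1) accumulator and end with `return cost.sum()`, which also yields a 0-d scalar.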

Increasing validation loss from the very beginning

I've been doing a very simple binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay, stochastic gradient descent, and a large training set, but the issue remains. Here is the learning graph from one of the trials (the horizontal axis is per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special

# Data Preprocessing (the same for the dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train', 'cats_train']
epsilon = 1e-8
for category in categories:
    path_animal = os.path.join(path, category)
    for img in os.listdir(path_animal):
        try:
            img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
            new_img_array = cv2.resize(img_array, (img_size, img_size))
            flattened_img_array = new_img_array.reshape(img_size*img_size)
            train_set.append([flattened_img_array, categories.index(category)])
        except:
            continue

import random
random.shuffle(train_set)

X_train = []
Y_train = []
for sample in train_set:
    X_train.append(sample[0])
    Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))

def create_mini_batches(X, Y, mini_batch_size):
    m = X.shape[1]
    mini_batches = []
    num_mini_batches = m // mini_batch_size
    permutation = list(np.random.permutation(m))
    shuffled_X = X[:, permutation]
    shuffled_Y = Y[:, permutation]
    for i in range(num_mini_batches):
        select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
        select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
        mini_batch = (select_X, select_Y)
        mini_batches.append(mini_batch)
    if m % mini_batch_size != 0:
        last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
        last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
        last_mini_batch = (last_X, last_Y)
        mini_batches.append(last_mini_batch)
    return mini_batches

def initialize_parameters(layers_dims):
    L = len(layers_dims)  # number of layers (including input layer), in this case L=4.
    parameters = {}
    for l in range(1, L):  # range(1,4).
        parameters['W' + str(l)] = np.random.randn(layers_dims[l], layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
        parameters['b' + str(l)] = np.zeros((layers_dims[l], 1))
    return parameters

def sigmoid(Z):
    A = special.expit(Z)
    return A, Z

def relu(Z):
    A = np.maximum(0.01*Z, Z)
    return A, Z

def forward_propagation(X, parameters):
    caches = []  # list containing Z for every node
    A = X
    L = int(len(parameters)/2)
    for l in range(1, L):
        A_prev = A
        W = parameters['W'+str(l)]
        b = parameters['b'+str(l)]
        Z = np.dot(W, A_prev) + b
        A, activation_cache = relu(Z)  # activation_cache contains z[l].
        linear_cache = (A_prev, W, b)  # linear_cache contains A[l-1], W[l], b[l].
        cache = (linear_cache, activation_cache)
        caches.append(cache)
    W = parameters['W'+str(L)]
    b = parameters['b'+str(L)]
    Z = np.dot(W, A) + b
    AL, activation_cache = sigmoid(Z)
    linear_cache = (A, W, b)
    cache = (linear_cache, activation_cache)
    caches.append(cache)
    return AL, caches

def compute_cost(AL, Y, parameters, lambd):
    m = Y.shape[1]  # number of examples
    L = int(len(parameters)/2)  # [6400,100,20,1] L=3 (0,1,2)
    reg_cost = 0
    for l in range(L):
        W = parameters['W' + str(l+1)]
        reg_cost += np.sum(np.square(W))
    J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
    J = np.squeeze(J)
    return J

def linear_backward(dZ, linear_cache, lambd):
    A_prev, W, b = linear_cache
    m = A_prev.shape[1]
    dW = (1/m) * np.dot(dZ, A_prev.T) + (lambd/m)*W
    db = (1/m) * np.sum(dZ, axis=1, keepdims=True)
    dA_prev = np.dot(W.T, dZ)
    return dA_prev, dW, db

def relu_gradient(Z):
    dZ = np.where(Z > 0, 1, 0.01)
    return dZ

def sigmoid_gradient(Z):
    dZ = special.expit(Z)*(1-special.expit(Z))
    return dZ

def linear_activation_backward(dA, cache, lambd, A, Y, activation):
    linear_cache, activation_cache = cache
    if activation == 'relu':
        dZ = dA * relu_gradient(activation_cache)
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    elif activation == 'sigmoid':
        dZ = A - Y
        dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
    return dA_prev, dW, db

def L_model_backward(AL, Y, caches, lambd):
    grads = {}
    L = len(caches)
    m = AL.shape[1]
    Y = Y.reshape(AL.shape)
    cache_final_layer = caches[L-1]
    # dA is unused in the sigmoid branch, so pass None rather than an undefined name
    grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(None, cache_final_layer, lambd, AL, Y, activation='sigmoid')
    for l in reversed(range(L-1)):
        current_cache = caches[l]
        grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, None, None, activation='relu')
    return grads

def update_parameters(parameters, grads, learning_rate):
    L = len(parameters) // 2
    for l in range(L):
        parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
        parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
    return parameters

def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size)  # [(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
    costs_train = []
    costs_dev = []
    parameters = initialize_parameters(layers_dims)
    AL_dev, caches_dev = forward_propagation(X_dev, parameters)
    J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
    costs_dev.append(J_dev)
    for i in range(num_epoch):
        for mini_batch in mini_batches:
            (minibatch_X, minibatch_Y) = mini_batch
            AL, caches = forward_propagation(minibatch_X, parameters)
            J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
            grads = L_model_backward(AL, minibatch_Y, caches, lambd)
            parameters = update_parameters(parameters, grads, learning_rate)
        if i % 10 == 0:
            costs_train.append(J_train)
            AL_dev, caches_dev = forward_propagation(X_dev, parameters)
            J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
            costs_dev.append(J_dev)
        if i % 100 == 0:
            print("Cost after epoch %i: %f" % (i, J_train))
        learning_rate = learning_rate * (k**(i/50))
    plt.plot(np.squeeze(costs_train), 'r')
    plt.plot(np.squeeze(costs_dev), 'b')
    plt.ylabel('cost')
    plt.xlabel('epochs (per tens)')
    plt.show()
    return parameters, costs_train, costs_dev

parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful to anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice on how to address it? I'm at a loss here, because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When the validation loss starts to increase from the very beginning, as in the images you added, it usually means there is something wrong with the model.
It's not clear what, exactly, since you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code
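One concrete thing to check in the posted code: `create_mini_batches` is called once, outside the epoch loop, so every epoch trains on the identical batch split, and the logged `J_train` is the cost of the last mini-batch only, which makes the red curve noisy. A minimal sketch of the inner loop of `Neural_Network_Model` (assuming the functions defined in the question) that reshuffles each epoch and records the epoch-average cost:

# Reshuffle mini-batches every epoch and log the mean training cost.
for i in range(num_epoch):
    mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size)
    epoch_cost = 0
    for minibatch_X, minibatch_Y in mini_batches:
        AL, caches = forward_propagation(minibatch_X, parameters)
        epoch_cost += compute_cost(AL, minibatch_Y, parameters, lambd)
        grads = L_model_backward(AL, minibatch_Y, caches, lambd)
        parameters = update_parameters(parameters, grads, learning_rate)
    if i % 10 == 0:
        costs_train.append(epoch_cost / len(mini_batches))

This won't by itself explain a validation loss that rises from epoch one, but it removes two confounders before debugging further.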

How does one use Hermite polynomials with Stochastic Gradient Descent (SGD)?

I was trying to train a simple polynomial linear model with PyTorch using Hermite polynomials, since they seem to have a better-conditioned Hessian.
To do that I decided to use hermvander, since it gives the Vandermonde matrix with each entry being a Hermite term. I just made my feature vectors the output of hermvander:
Kern_train = hermvander(X_train,Degree_mdl)
However, when I proceed to train, I get NaNs all the time. I suspected it could be a step-size issue, but I decided to use the step size suggested by this question, which already has my example working in R, so I thought there was no need to search for a step size. However, when I tried it, it did not work.
Does anyone have any idea what's going on?
The same error occurs in TensorFlow:
import pdb
import numpy as np
from numpy.polynomial.hermite import hermvander
import random
import tensorflow as tf

def get_batch(X, Y, M):
    N = len(Y)
    valid_indices = np.array(range(N))
    batch_indices = np.random.choice(valid_indices, size=M, replace=False)
    batch_xs = X[batch_indices, :]
    batch_ys = Y[batch_indices]
    return batch_xs, batch_ys

##
D0 = 1
logging_freq = 100
## SGD params
M = 5
eta = 0.1
#eta = lambda i: eta/(i**0.6)
nb_iter = 500*10
##
lb, ub = 0, 1
freq_sin = 4  # 2.3
f_target = lambda x: np.sin(2*np.pi*freq_sin*x)
N_train = 10
X_train = np.linspace(lb, ub, N_train)
Y_train = f_target(X_train).reshape(N_train, 1)
x_horizontal = np.linspace(lb, ub, 1000).reshape(1000, 1)
## degree of mdl
Degree_mdl = N_train-1
## Hermite
Kern_train = hermvander(X_train, Degree_mdl)
print(f'Kern_train.shape={Kern_train.shape}')
Kern_train = Kern_train.reshape(N_train, Kern_train.shape[1])
##
Kern_train_pinv = np.linalg.pinv(Kern_train)
c_pinv = np.dot(Kern_train_pinv, Y_train)
nb_terms = c_pinv.shape[0]
##
condition_number_hessian = np.linalg.cond(Kern_train)
##
graph = tf.Graph()
with graph.as_default():
    X = tf.placeholder(tf.float32, [None, nb_terms])
    Y = tf.placeholder(tf.float32, [None, 1])
    w = tf.Variable(tf.zeros([nb_terms, 1]))
    #w = tf.Variable( tf.truncated_normal([Degree_mdl,1],mean=0.0,stddev=1.0) )
    #w = tf.Variable( 1000*tf.ones([Degree_mdl,1]) )
    ##
    f = tf.matmul(X, w)  # [N,1] = [N,D] x [D,1]
    #loss = tf.reduce_sum(tf.square(Y - f))
    loss = tf.reduce_sum(tf.reduce_mean(tf.square(Y-f), 0))
    l2loss_tf = (1/N_train)*2*tf.nn.l2_loss(Y-f)
    ##
    learning_rate = eta
    #global_step = tf.Variable(0, trainable=False)
    #learning_rate = tf.train.exponential_decay(learning_rate=eta, global_step=global_step,decay_steps=nb_iter/2, decay_rate=1, staircase=True)
    train_step = tf.train.GradientDescentOptimizer(learning_rate=learning_rate).minimize(loss)

with tf.Session(graph=graph) as sess:
    Y_train = Y_train.reshape(N_train, 1)
    tf.global_variables_initializer().run()
    # Train
    for i in range(nb_iter):
        #if i % (nb_iter/10) == 0:
        if i % (nb_iter/10) == 0 or i == 0:
            current_loss = sess.run(fetches=loss, feed_dict={X: Kern_train, Y: Y_train})
            print(f'tf: i = {i}, current_loss = {current_loss}')
        ## train
        batch_xs, batch_ys = get_batch(Kern_train, Y_train, M)
        sess.run(train_step, feed_dict={X: batch_xs, Y: batch_ys})

print(f'condition_number_hessian = {condition_number_hessian}')
print('\a')
Totally self-contained code in PyTorch:
import numpy as np
from numpy.polynomial.hermite import hermvander
import random
import torch
from torch.autograd import Variable

def vectors_dims_dont_match(Y, Y_):
    '''
    Checks that vectors Y and Y_ have the same dimensions. If they don't,
    then there might be an error caused by wrong broadcasting.
    '''
    DY = tuple(Y.size())
    DY_ = tuple(Y_.size())
    if len(DY) != len(DY_):
        return True
    for i in range(len(DY)):
        if DY[i] != DY_[i]:
            return True
    return False

def index_batch(X, batch_indices, dtype):
    '''
    Returns the indexed/sliced batch.
    '''
    if len(X.shape) == 1:  # i.e. dimension (M,), just a vector
        batch_xs = torch.FloatTensor(X[batch_indices]).type(dtype)
    else:
        batch_xs = torch.FloatTensor(X[batch_indices, :]).type(dtype)
    return batch_xs

def get_batch2(X, Y, M, dtype):
    '''
    Gets a batch for a pytorch model.
    '''
    # TODO fix and make it nicer, there is a pytorch forum question
    X, Y = X.data.numpy(), Y.data.numpy()
    N = len(Y)
    valid_indices = np.array(range(N))
    batch_indices = np.random.choice(valid_indices, size=M, replace=False)
    batch_xs = index_batch(X, batch_indices, dtype)
    batch_ys = index_batch(Y, batch_indices, dtype)
    return Variable(batch_xs, requires_grad=False), Variable(batch_ys, requires_grad=False)

def get_sequential_lifted_mdl(nb_monomials, D_out, bias=False):
    return torch.nn.Sequential(torch.nn.Linear(nb_monomials, D_out, bias=bias))

def train_SGD(mdl, M, eta, nb_iter, logging_freq, dtype, X_train, Y_train):
    ##
    #pdb.set_trace()
    N_train, _ = tuple(X_train.size())
    #print(N_train)
    for i in range(1, nb_iter+1):
        # Forward pass: compute predicted Y using operations on Variables
        batch_xs, batch_ys = get_batch2(X_train, Y_train, M, dtype)  # [M, D], [M, 1]
        ## FORWARD PASS
        y_pred = mdl.forward(batch_xs)
        ## Check vectors have same dimension
        if vectors_dims_dont_match(batch_ys, y_pred):
            raise ValueError('Your vectors don\'t have matching dimensions. It will lead to errors.')
        ## LOSS + Regularization
        batch_loss = (1/M)*(y_pred - batch_ys).pow(2).sum()
        ## BACKWARD PASS
        batch_loss.backward()  # Use autograd to compute the backward pass. Now w will have gradients
        ## SGD update
        for W in mdl.parameters():
            delta = eta(i)*W.grad.data
            W.data.copy_(W.data - delta)
        ## train stats
        if i % (nb_iter/10) == 0 or i == 0:
            #X_train_, Y_train_ = Variable(X_train), Variable(Y_train)
            X_train_, Y_train_ = X_train, Y_train
            current_train_loss = (1/N_train)*(mdl.forward(X_train_) - Y_train_).pow(2).sum().data.numpy()
            print('\n-------------')
            print(f'i = {i}, current_train_loss = {current_train_loss}\n')
            print(f'eta(i)*W.grad.data = {eta(i)*W.grad.data}')
            print(f'W.grad.data = {W.grad.data}')
        ## Manually zero the gradients after updating weights
        mdl.zero_grad()
    final_sgd_error = current_train_loss
    return final_sgd_error

##
D0 = 1
logging_freq = 100
#dtype = torch.cuda.FloatTensor
dtype = torch.FloatTensor
## SGD params
M = 5
eta0 = 0.1
eta = lambda i: eta0/(i**0.6)  # decaying step-size schedule
nb_iter = 500*10
##
lb, ub = 0, 1
freq_sin = 4  # 2.3
f_target = lambda x: np.sin(2*np.pi*freq_sin*x)
N_train = 10
X_train = np.linspace(lb, ub, N_train)
Y_train = f_target(X_train).reshape(N_train, 1)
x_horizontal = np.linspace(lb, ub, 1000).reshape(1000, 1)
## degree of mdl
Degree_mdl = N_train-1
## Hermite
Kern_train = hermvander(X_train, Degree_mdl)
Kern_train = Kern_train.reshape(N_train, Kern_train.shape[1])
##
Kern_train_pinv = np.linalg.pinv(Kern_train)
c_pinv = np.dot(Kern_train_pinv, Y_train)
##
condition_number_hessian = np.linalg.cond(Kern_train)
## linear mdl to train with SGD
nb_terms = c_pinv.shape[0]
mdl_sgd = get_sequential_lifted_mdl(nb_monomials=nb_terms, D_out=1, bias=False)
mdl_sgd[0].weight.data.normal_(mean=0, std=0.001)
mdl_sgd[0].weight.data.fill_(0)
## Make polynomial kernel
Kern_train_pt, Y_train_pt = Variable(torch.FloatTensor(Kern_train).type(dtype), requires_grad=False), Variable(torch.FloatTensor(Y_train).type(dtype), requires_grad=False)
final_sgd_error = train_SGD(mdl_sgd, M, eta, nb_iter, logging_freq, dtype, Kern_train_pt, Y_train_pt)
## PRINT ERRORS
train_error_pinv = (1/N_train)*(np.linalg.norm(Y_train - np.dot(Kern_train, c_pinv))**2)
print('\n-----------------')
print(f'N_train={N_train}')
print(f'train_error_pinv = {train_error_pinv}')
print(f'final_sgd_error = {final_sgd_error}')
print(f'condition_number_hessian = {condition_number_hessian}')
print('\a')
Maybe it's a bit late, but you might have a look at this https://github.com/Orcuslc/OrthNet
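Aside from switching basis libraries, one thing worth trying with the code above (my own suggestion, not from the thread): the columns of a degree-9 Hermite Vandermonde matrix have wildly different magnitudes on [0, 1], which makes plain SGD steps explode and is a common source of NaNs. A minimal sketch that standardizes each feature column before training:

# Column-standardize the Hermite features so the per-weight gradient
# scales are comparable; guard the constant (degree-0) column.
col_mean = Kern_train.mean(axis=0, keepdims=True)
col_std = Kern_train.std(axis=0, keepdims=True)
col_std[col_std == 0] = 1.0
Kern_train = (Kern_train - col_mean) / col_std
print(f'condition number after standardizing = {np.linalg.cond(Kern_train)}')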
