I have converted my problem into something more tangible.
I have the following equations;
Where x is the input variables, and I want to find the betas, as they are useful in another setting. I know that in this simple code example one could just estimate y and then apply OLS to get the betas, however for my real-life usage this is not applicable.
I use Pytorch to keep track of gradients and SGD to optimize my parameters. A simple FFN is used as an approximation for the functions f.
My code is written below (I'm relatively new to Pytorch and tensors)
import pandas as pd
import numpy as np
import torch
import torch.nn as nn
import torch.optim as optim
import copy
import torch.nn.functional as F
import matplotlib.pyplot as plt
# Make data
df = pd.date_range('2020-12-01', '2022-05-31', freq='H')
df = pd.DataFrame(df[0:len(df)-1], columns={'DateTime'})
np.random.seed(1)
A = 10
X = pd.DataFrame(np.random.uniform(0, 1, len(df)), columns={'X1'})
X['X2'] = np.random.normal(0, 1, len(df))
X['X3'] = np.random.normal(0, 1, len(df))
Y = pd.DataFrame()
for i in range(0, len(df)):
Y[str(i)] = X['X1'][i] ** 2 + X['X1'][i] * X['X2'][i] * range(0, A) + X['X1'][i] **2 * X['X2'][i] * np.array(range(0, A))**2
Y = Y.T.reset_index(drop=True)
df = pd.concat([df, Y], axis=1)
# Setting
valid_start = '2021-07-01'
test_start = '2022-01-01'
Out = 2
X_array = X.copy()
Y_train = df[df['DateTime'].dt.date.astype(str) < valid_start].drop(columns='DateTime')
Y_valid = df[df['DateTime'].dt.date.astype(str) >= valid_start].drop(columns='DateTime')
Y_test = df[df['DateTime'].dt.date.astype(str) >= valid_start].drop(columns='DateTime')
Y_test_date = pd.DataFrame(df[df['DateTime'].dt.date.astype(str) >= valid_start]['DateTime'], columns={'DateTime'})
X_train = X_array[X_array.index.isin(Y_train.index)]
X_valid = X_array[X_array.index.isin(Y_valid.index)]
X_test = X_array[X_array.index.isin(Y_test.index)]
# Make cuda
torch.cuda.is_available()
torch.cuda.device_count()
device = torch.device("cuda:0" if torch.cuda.is_available() else "cpu")
print(device)
# make tensor
Y_train_tensor = torch.tensor(np.float32(Y_train)).to(device)
X_train_tensor = torch.tensor(np.float32(X_train)).to(device)
Y_valid_tensor = torch.tensor(np.float32(Y_valid)).to(device)
X_valid_tensor = torch.tensor(np.float32(X_valid)).to(device)
Y_test_tensor = torch.tensor(np.float32(Y_test)).to(device)
X_test_tensor = torch.tensor(np.float32(X_test)).to(device)
## Model
class Model(nn.Module):
def __init__(self):
# how many layers?
super().__init__()
self.fc1 = nn.Linear(X_train_tensor.shape[1], 4)
self.out = nn.Linear(4, int(Out) + 1)
def forward(self, x):
x = F.relu(self.fc1(x))
x = self.out(x)
return x
net = Model().to(device)
criterion = nn.MSELoss()
optimizer = optim.SGD(net.parameters(), lr=0.001)
Range_tensor = torch.tensor(np.float32(range(0, A))).to(device)
# Early stopping
Network_weight = [copy.deepcopy(net.out.weight)]
Epoch = 100
patience = 10
Valloss_Array = []
trigger_times = 0
last_loss = 1000000000000000
Valloss_Array.append(last_loss)
for epoch in range(Epoch): # loop over the dataset multiple times
x = X_train_tensor
y = Y_train_tensor
# zero the parameter gradients
optimizer.zero_grad()
# forward + backward + optimize
outputs = net(x)
y_hat = []
for e in range(0, len(y)):
y_hat_temp = outputs[e, 0] + outputs[e, 1] * Range_tensor + outputs[e, 2] * torch.pow(Range_tensor, 2)
y_hat.append(y_hat_temp)
y_hat = torch.stack(y_hat)
loss = criterion(y_hat, y)
loss.backward()
optimizer.step()
print('Running training loss is; ' + str(loss.item()))
Network_weight.append([copy.deepcopy(net.fc1.weight), copy.deepcopy(net.out.weight)])
################## Validation ##################
# break
x = X_valid_tensor
y = Y_valid_tensor
outputs = net(x)
y_hat = []
for e in range(0, len(y)):
y_hat_temp = outputs[e, 0] + outputs[e, 1] * Range_tensor + outputs[e, 2] * torch.pow(Range_tensor, 2)
y_hat.append(y_hat_temp)
y_hat = torch.stack(y_hat)
Valloss = criterion(y_hat, y)
if Valloss.item() >= min(Valloss_Array):
trigger_times += 1
if trigger_times >= patience:
print('Early stopping!')
break
else:
trigger_times = 0
Best = [copy.deepcopy(net.fc1.weight), copy.deepcopy(net.out.weight)]
Valloss_Array.append(Valloss.item())
# With best weights
net.fc1.weight = Best[0]
net.out.weight = Best[1]
outputs = net(X_test_tensor)
y_hat = []
for e in range(0, len(Y_test_tensor)):
y_hat_temp = outputs[e, 0] + outputs[e, 1] * Range_tensor + outputs[e, 2] * torch.pow(Range_tensor, 2)
y_hat.append(y_hat_temp)
y_hat = torch.stack(y_hat)
BestModelLoss = criterion(y_hat, Y_test_tensor)
Y_Prediction = pd.DataFrame(y_hat.cpu().detach().numpy(), index=Y_test.index, columns={0, 1, 2, 3, 4, 5, 6, 7, 8, 9})
Y_test_date['SE'] = ((Y_Prediction - Y_test) ** 2).sum(axis=1)
Y_Prediction['DateTime'] = Y_test_date['DateTime']
Y_test['DateTime'] = Y_test_date['DateTime']
### Plot
MinDate = Y_test_date.loc[Y_test_date['SE'] == Y_test_date['SE'].min(), 'DateTime']
YMinForecast = np.array(Y_Prediction[(Y_Prediction['DateTime'] == MinDate.values[0])].drop(columns='DateTime'))
YMin = np.array(Y_test[(Y_test['DateTime'] == MinDate.values[0])].drop(columns='DateTime'))
MaxDate = Y_test_date.loc[Y_test_date['SE'] == Y_test_date['SE'].max(), 'DateTime']
YMaxForecast = np.array(Y_Prediction[(Y_Prediction['DateTime'] == MaxDate.values[0])].drop(columns='DateTime'))
YMax = np.array(Y_test[(Y_test['DateTime'] == MaxDate.values[0])].drop(columns='DateTime'))
MedianValue = Y_test_date['SE'].iloc[(Y_test_date['SE']-Y_test_date['SE'].median()).abs().argsort()[:2]].values[0]
MedDate = Y_test_date.loc[Y_test_date['SE'] == MedianValue, 'DateTime']
YMedForecast = np.array(Y_Prediction[(Y_Prediction['DateTime'] == MedDate.values[0])].drop(columns='DateTime'))
YMed = np.array(Y_test[(Y_test['DateTime'] == MedDate.values[0])].drop(columns='DateTime'))
# plot
plt.plot(range(0, y_hat.shape[1]), YMinForecast.reshape(-1), label='Forecast')
plt.plot(range(0, y_hat.shape[1]), YMin.reshape(-1), label='Actual')
plt.grid()
plt.title("Best performance")
plt.legend()
plt.show()
plt.plot(range(0, y_hat.shape[1]), YMaxForecast.reshape(-1), label='Forecast')
plt.plot(range(0, y_hat.shape[1]), YMax.reshape(-1), label='Actual')
plt.grid()
plt.title("Poor performance")
plt.legend()
plt.show()
plt.plot(range(0, y_hat.shape[1]), YMedForecast.reshape(-1), label='Forecast')
plt.plot(range(0, y_hat.shape[1]), YMed.reshape(-1), label='Actual')
plt.grid()
plt.title("Median performance")
plt.legend()
plt.show()
Increasing the hidden layers and the number of neurons solved the problem (both the simplification and the real problem).
Related
Below is the code, i am running a for loop to train on different training sizes. The first loop works correctly, where when training begins, the training and validation accuracy are sent to a list, then a frame then finally a csv. But on the subsequent loops, a data generator is sent to the list. Can anyone see where the issue is, because I cant find it.
Also if you have a better way of doing this (data compiling for analysis), I'm all ears.
The first block is the code snippet, the second block is the full code. The for loop starts about halfway down.
for i in range(1,6):
training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
train_accuracy = []
val_accuracy = []
start_time = time.time()
for epoch in tqdm(range(1, epochs + 1), total=epochs):
train()
train_acc = test(training_loader)
train_accuracy.append(train_acc)
val_acc = test(validation_loader)
val_accuracy.append(val_acc)
accuracy = pd.DataFrame()
accuracy['train_acc'] = train_accuracy
accuracy['val_acc'] = val_accuracy
accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
import sys
sys.path.insert(0, 'C:\\Users\\user\\Desktop\\imbalance_exp\\imbalance_exp\\imbalance_exp')
import torch
from torch_geometric.loader import DataLoader
import imb_dataset as imb
from torch.nn import Linear
import torch.nn.functional as F
from torch_geometric.nn import GraphConv
from torch_geometric.nn import global_mean_pool
import neptune.new as neptune
import pandas as pd
from sklearn.metrics import confusion_matrix, matthews_corrcoef
import seaborn as sns
from neptune.new.types import File
from tqdm import tqdm
import time
known = imb.ImbalanceDataset(root='imb_50v2', set='known', split=0.5)
unknown = imb.ImbalanceDataset(root='imb_50v2', set='unknown', split=0.5)
all_data = imb.ImbalanceDataset(root='imb_50v2', set='All', split=None)
torch.manual_seed(12345)
known = known.shuffle()
lr = 0.001
training_perc = 0.9
N = len(known)
mini_batch_size = 32
epochs = 600
device = torch.device('cuda' if torch.cuda.is_available() else 'cpu')
case = 2
class switcher:
def sets(self, case):
default = known
return getattr(self, 'case_' + str(case), lambda: default)()
def case_1(self):
training_set = known[:int(training_perc*len(known))]
validation_set = known[int(training_perc*len(known)):]
training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
training_ones = []
training_zeros = []
validation_ones = []
validation_zeros = []
for i in range(len(training_set)):
if training_set[i].y == 1:
training_ones.append(training_set[i])
else:
training_zeros.append(training_set[i])
for i in range(len(validation_set)):
if validation_set[i].y == 1:
validation_ones.append(validation_set[i])
else:
validation_zeros.append(validation_set[i])
return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros
def case_2(self):
one_index = round(len(known) * 0.25)
known_ones = known[:one_index].copy()
known_ones.shuffle()
known_zeros = known[one_index:].copy()
known_zeros.shuffle()
training_ones = known_ones[:int(training_perc*len(known_ones))]
training_zeros = known_zeros[:len(training_ones)]
training_set = torch.utils.data.ConcatDataset([training_ones, training_zeros])
validation_ones = known_ones[int(training_perc*len(known_ones)):]
validation_zeros = known_zeros[len(training_ones):]
validation_set = torch.utils.data.ConcatDataset([validation_ones, validation_zeros])
training_loader = DataLoader(training_set, batch_size=mini_batch_size, shuffle=True)
validation_loader = DataLoader(validation_set, batch_size=mini_batch_size, shuffle=False)
training_ones = []
training_zeros = []
validation_ones = []
validation_zeros = []
for i in range(len(training_set)):
if training_set[i].y == 1:
training_ones.append(training_set[i])
else:
training_zeros.append(training_set[i])
for i in range(len(validation_set)):
if validation_set[i].y == 1:
validation_ones.append(validation_set[i])
else:
validation_zeros.append(validation_set[i])
return training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros
class GCN(torch.nn.Module):
def __init__(self, hidden_channels):
super(GCN, self).__init__()
torch.manual_seed(12345)
self.conv1 = GraphConv(known.num_node_features, hidden_channels)
self.conv2 = GraphConv(hidden_channels, hidden_channels)
self.conv3 = GraphConv(hidden_channels, hidden_channels)
self.lin = Linear(hidden_channels, known.num_classes)
def forward(self, x, edge_index, batch):
# 1. Obtain node embeddings
x = self.conv1(x, edge_index)
x = x.relu()
x = self.conv2(x, edge_index)
x = x.relu()
x = self.conv3(x, edge_index)
# 2. Readout layer
x = global_mean_pool(x, batch) # [batch_size, hidden_channels]
# 3. Apply a final classifier
x = F.dropout(x, p=0.5, training=self.training)
x = self.lin(x)
return x
model = GCN(hidden_channels=64).to(device)
print(model)
optimizer = torch.optim.Adam(model.parameters(), lr=lr)
criterion = torch.nn.CrossEntropyLoss()
def train():
model.train()
total_loss = 0
for data in training_loader: # Iterate in batches over the training dataset.
data = data.to(device)
out = model(data.x, data.edge_index, data.batch) # Perform a single forward pass.
loss = criterion(out, data.y) # Compute the loss solely based on the training nodes.
loss.backward() # Derive gradients.
optimizer.step() # Update parameters based on gradients.
optimizer.zero_grad() # Clear gradients.
def test(loader):
model.eval()
correct = 0
for data in loader: # Iterate in batches over the training/test dataset.
data = data.to(device)
out = model(data.x, data.edge_index, data.batch)
pred = out.argmax(dim=1) # Use the class with highest probability.
correct += int((pred == data.y).sum()) # Check against ground-truth labels.
return correct / len(loader.dataset) # Derive ratio of correct predictions.
output_frame = pd.DataFrame(columns=['epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known', 'precision_all', 'recall_all', 'f1_all', 'accuracy_all', 'mcc_all', 'precision_known', 'recall_known', 'f1_known', 'accuracy_known', 'mcc_known', 'time_elapsed'])
for i in range(1,6):
training_loader, validation_loader, training_ones, training_zeros, validation_ones, validation_zeros = switcher().sets(case)
train_accuracy = []
val_accuracy = []
start_time = time.time()
for epoch in tqdm(range(1, epochs + 1), total=epochs):
train()
train_acc = test(training_loader)
train_accuracy.append(train_acc)
val_acc = test(validation_loader)
val_accuracy.append(val_acc)
accuracy = pd.DataFrame()
accuracy['train_acc'] = train_accuracy
accuracy['val_acc'] = val_accuracy
accuracy.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\accuracy_{i}.csv')
unknown_loader = DataLoader(unknown, batch_size=1, shuffle=False)
predictions = []
all_correct = 0
known_correct = 0
for test in unknown_loader:
test = test.to(device)
out = model(test.x, test.edge_index, test.batch)
pred = out.argmax(dim=1)
predictions.append(pred)
all_correct += int((pred == test.y_all).sum())
known_correct += int((pred == test.y_known).sum())
pred_df = pd.DataFrame()
pred_df['y_all_true'] = [i.item() for i in unknown.data.y_all]
pred_df['y_known_true'] = [i.item() for i in unknown.data.y_known]
pred_df['y_pred'] = [i.item() for i in predictions]
pred_df.to_csv(f'C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\pred_df_{i}.csv')
cf_matrix_all = confusion_matrix(pred_df['y_all_true'], pred_df['y_pred'])
ax = sns.heatmap(cf_matrix_all, annot=True, fmt='g', cmap='Blues')
ax.title.set_text('Confusion Matrix based on all data')
tn_all, fp_all, fn_all, tp_all = cf_matrix_all.ravel()
end_time = time.time()
time_elapsed = end_time - start_time
precision_all = tp_all / (tp_all + fp_all)
recall_all = tp_all / (tp_all + fn_all)
f1_all = 2 * (precision_all * recall_all) / (precision_all + recall_all)
accuracy_all = (tp_all + tn_all) / (tp_all + tn_all + fp_all + fn_all)
mcc_all = matthews_corrcoef(pred_df['y_all_true'], pred_df['y_pred'])
cf_matrix_known = confusion_matrix(pred_df['y_known_true'], pred_df['y_pred'])
ax = sns.heatmap(cf_matrix_known, annot=True, fmt='g', cmap='Blues')
ax.title.set_text('Confusion Matrix based on known data')
tn_known, fp_known, fn_known, tp_known = cf_matrix_known.ravel()
precision_known = tp_known / (tp_known + fp_known)
recall_known = tp_known / (tp_known + fn_known)
f1_known = 2 * (precision_known * recall_known) / (precision_known + recall_known)
accuracy_known = (tp_known + tn_known) / (tp_known + tn_known + fp_known + fn_known)
mcc_known = matthews_corrcoef(pred_df['y_known_true'], pred_df['y_pred'])
#'epoch', 'lr', 'known', 'unknown', 'train_ones', 'train_zeros', 'val_ones', 'val_zeros', 'tn_all', 'fp_all', 'fn_all', 'tp_all', 'tn_known', 'fp_known', 'fn_known', 'tp_known
output_frame.loc[i] = [epochs, lr, len(known), len(unknown), len(training_ones), len(training_zeros), len(validation_ones), len(validation_zeros), tn_all, fp_all, fn_all, tp_all, tn_known, fp_known, fn_known, tp_known, precision_all, recall_all, f1_all, accuracy_all, mcc_all, precision_known, recall_known, f1_known, accuracy_known, mcc_known, time_elapsed]
output_frame.to_csv('C:\\Users\\Anthony Sirico\\Documents\\GitHub\\PyGeo_Circuit_exp\\PyGeo_Circuit_exp\\imbalance_exp\\csv files\\final_output.csv')
training_perc -= 0.2
I am trying to create a nueral network using tensor flow. I am not using keras api. I have some parameter estimation(weight,bias and some other parameters) to do. The code is working but the parameter estimation is really bad and error percentage is very high what is the problem here? I tried so many ways still no improvement. the loss fn is less.
I tried creating my own optimizer but the process is slow and the error is large. Is there any way to apply optimizers parameter.
import tensorflow as tf
import numpy as np
import matplotlib.pyplot as plt
import scipy.io
from scipy.interpolate import griddata
from pyDOE import lhs
import math as ma
class PhysicsInformedNN:
def __init__(self,X_n,v,layers,lb,ub):
self.lb = lb
self.ub = ub
self.layers = layers
self.dx_n = tf.convert_to_tensor(X_n[:,0:1],dtype = 'float32')
self.t_n = tf.convert_to_tensor(X_n[:,1:2],dtype = 'float32')
self.v_r = tf.convert_to_tensor(v,dtype = 'float32')
self.lambda_1 = tf.Variable(0,dtype = 'float32')#1.5
self.lambda_2 = tf.Variable(-6,dtype = 'float32')
self.para =[self.lambda_1,self.lambda_2]
self.weights, self.biases = self.initialize_NN(layers)
def initialize_NN(self,layers):
weights = []
biases = []
num_layers = len(layers)
for l in range(0,num_layers-1):
W = self.xavier_init(size=[layers[l], layers[l+1]])
b = tf.Variable(tf.zeros([1,layers[l+1]], dtype='float32'), dtype='float32')
weights.append(W)
biases.append(b)
return weights, biases
def xavier_init(self, size):
in_dim = size[0]
out_dim = size[1]
xavier_stddev = np.sqrt(2/(in_dim + out_dim))
return tf.Variable(tf.random.truncated_normal([in_dim, out_dim], stddev=xavier_stddev), dtype='float32')
def neural_net(self, X, weights, biases):
num_layers = len(weights) + 1
H = 2.0*(X - self.lb)/(self.ub - self.lb) - 1.0
for l in range(0,num_layers-2):
W = weights[l]
b = biases[l]
H = tf.math.tanh(tf.math.add(tf.linalg.matmul(H, W), b))
W = weights[-1]
b = biases[-1]
Y = tf.math.add(tf.linalg.matmul(H, W), b)
return Y
def net_u(self, x, t):
v = self.neural_net(tf.concat([x,t],1), self.weights, self.biases)
return v
def net_f(self, x, t):
lambda_1 = self.para[0]
lambda_2 = tf.exp(self.para[1])
with tf.GradientTape(persistent=True) as tape :
tape.watch(t)
tape.watch(x)
u = self.net_u(x,t)
u_x = tape.gradient(u,x)
u_t = tape.gradient(u,t)
u_xx = tape.gradient(u_x,x)
f = u_t + lambda_1*u*u_x - lambda_2*u_xx
del tape
return f
def callback(self, loss,n):
print('Loss:', loss, ' Epoch : ', n)
def train(self,epoch):
for i in range(epoch):
with tf.GradientTape(persistent=True) as tape :
tape.watch(self.weights)
tape.watch(self.biases)
tape.watch(self.para)
f_pred = self.net_f(self.dx_n, self.t_n)
v_pred = self.net_u(self.dx_n, self.t_n)
loss = tf.reduce_mean(tf.square(self.v_r - v_pred)) + tf.reduce_mean(tf.square(f_pred))
dw = tape.gradient(loss,self.weights)
db = tape.gradient(loss,self.biases)
dp = tape.gradient(loss,self.para)
lr_schedule = tf.keras.optimizers.schedules.ExponentialDecay(
initial_learning_rate=1e-2,
decay_steps=10000,
decay_rate=0.9)
optimizer1 = tf.keras.optimizers.Adam(learning_rate=0.0001)
optimizer1.apply_gradients(zip(dw, self.weights))
optimizer1.apply_gradients(zip(db, self.biases))
optimizer2 = tf.keras.optimizers.Adam(learning_rate=0.001)
optimizer2.apply_gradients(zip(dp, self.para))
del tape
self.callback(loss,i)
def predict(self, X_star):
v_star = self.net_u(X_star[:,0:1], X_star[:,1:2])
f_star = f_pred = self.net_f(X_star[:,0:1], X_star[:,1:2])
para_last = self.para
return v_star, f_star, para_last
if __name__ == '__main__':
#PARAMETERS for the problem
np.random.seed(123)
nu =0.01/np.pi
layers = [2, 20, 20, 20, 20, 1]
N_u = 2000
data = scipy.io.loadmat('burgers_shock.mat')
t = data['t'].flatten()[:,None]
x = data['x'].flatten()[:,None]
Exact = np.real(data['usol']).T
X, T = np.meshgrid(x,t)
X_star = np.hstack((X.flatten()[:,None], T.flatten()[:,None]))
u_star = Exact.flatten()[:,None]
lb = X_star.min(0)
ub = X_star.max(0)
idx = np.random.choice(X_star.shape[0], N_u, replace=False)
X_u_train = X_star[idx,:]
u_train = u_star[idx,:]
model = PhysicsInformedNN(X_u_train, u_train, layers, lb, ub)
model.train(1000)
X_star = tf.convert_to_tensor(X_star,dtype='float32')
u_pred, f_pred, param = model.predict(X_star)
error_lambda_1 = np.abs(param[0] - 1.0)*100
error_lambda_2 = np.abs( np.exp(param[1])- nu)/nu * 100
print(error_lambda_1,error_lambda_2)
I just started studying NN. In the class, the teacher gave us a code to experiment with, in google colab. I tried changing the number of nodes in each hidden layer and the number of hidden layers, and print out test accuracy and train accuracy. I've tried many configurations but the accuracies did not change. Like, it will stay exactly at 0.7857142857142857 (this is the actual number) unless I reshuffle the samples.
The teacher said that accuracy can't be changed that easily. But I don't believe her. I think there is something wrong with the code because there are too many similar digits.
Here are the codes I think are necessary to post.
Model
class Model():
def __init__(self):
self.layers = []
self.L = 0
self.W = {}
self.b = {}
self.A = {}
self.Z = {}
self.dA = {}
self.dZ = {}
self.dW = {}
self.db = {}
self.cost = 0.
self.m = 0
self.lam = 0
self.cost_history = []
self.acc_history = []
self.alpha_history = []
self.alpha = 0.
self.iterations = 0
def add_layers(self, list_of_layers):
self.layers = list_of_layers
self.L = len(self.layers) - 1 # Number of layers excluding the input feature layer
def init_params(self):
for i in range(1, self.L + 1):
self.W[str(i)] = np.random.randn(self.layers[i], self.layers[i - 1]) * np.sqrt(2. / self.layers[i - 1])
self.b[str(i)] = np.zeros((self.layers[i], 1))
def forward_prop(self, X):
self.A['0'] = X
for i in range(1, self.L + 1):
self.Z[str(i)] = np.dot(self.W[str(i)], self.A[str(i - 1)]) + self.b[str(i)]
if i == self.L:
# Output layer, Sigmoid activation
self.A[str(i)] = sigmoid(self.Z[str(i)])
else:
# Hidden layer, Relu activataion
self.A[str(i)] = relu(self.Z[str(i)])
def compute_cost(self, Y):
self.cost = -1 * np.sum(np.multiply(Y, np.log(self.A[str(self.L)])) +
np.multiply(1 - Y, np.log(1 - self.A[str(self.L)]))) / self.m
if self.lam != 0:
reg = (self.lam / (2 * self.m))
for i in range(1, self.L + 1):
reg += np.sum(np.dot(self.W[str(i)], self.W[str(i)].T))
self.cost += reg
self.cost_history.append(self.cost)
def backward_prop(self, Y):
# We need dA[str(L)] to start the backward prop computation
self.dA[str(self.L)] = -1 * (np.divide(Y, self.A[str(self.L)]) - np.divide(1 - Y, 1 - self.A[str(self.L)]))
self.dZ[str(self.L)] = np.multiply(self.dA[str(self.L)], sigmoid_derivative(self.Z[str(self.L)]))
self.dW[str(self.L)] = np.dot(self.dZ[str(self.L)], self.A[str(self.L - 1)].T) / self.m + (self.lam/self.m) * self.W[str(self.L)]
self.db[str(self.L)] = np.sum(self.dZ[str(self.L)], axis = 1, keepdims = True) / self.m
self.dA[str(self.L - 1)] = np.dot(self.W[str(self.L)].T, self.dZ[str(self.L)])
for i in reversed(range(1, self.L)):
self.dZ[str(i)] = np.multiply(self.dA[str(i)], relu_derivative(self.Z[str(i)]))
self.dW[str(i)] = np.dot(self.dZ[str(i)], self.A[str(i - 1)].T) / self.m + (self.lam/self.m) * self.W[str(i)]
self.db[str(i)] = np.sum(self.dZ[str(i)], axis = 1, keepdims = True) / self.m
self.dA[str(i - 1)] = np.dot(self.W[str(i)].T, self.dZ[str(i)])
def update_params(self):
for i in range(1, self.L + 1):
self.W[str(i)] = self.W[str(i)] - self.alpha * self.dW[str(i)]
self.b[str(i)] = self.b[str(i)] - self.alpha * self.db[str(i)]
def train(self, X, Y, iterations = 10,
alpha = 0.001, decay = True, decay_iter = 5, decay_rate = 0.9, stop_decay_counter = 100,
verbose = True, lam = 0):
self.m = Y.shape[1]
self.alpha = alpha
self.iterations = iterations
self.lam = lam
# initialize parameters
self.init_params()
for i in range(iterations):
# forward prop
self.forward_prop(X)
# compute cost
self.compute_cost(Y)
# backward prop
self.backward_prop(Y)
# update params
self.update_params()
# evaluate
self.acc_history.append(self.evaluate(X, Y, in_training = True))
# save alpha
self.alpha_history.append(self.alpha)
# learning rate decay
if decay and stop_decay_counter > 0 and i % decay_iter == 0:
self.alpha = decay_rate * self.alpha
stop_decay_counter -= 1
# display cost per iteration
if verbose:
print('Cost after {} iterations: {}'.format(i, self.cost))
def predict(self, X, in_training = False):
if in_training == False:
self.forward_prop(X)
preds = self.A[str(self.L)] >= 0.5
preds = np.squeeze(preds)
return preds
def evaluate(self, X, Y, in_training = False):
examples = X.shape[1]
pred = self.predict(X, in_training = in_training)
pred = pred.reshape(1, examples)
diff = np.sum(abs(pred - Y))
acc = (examples - np.sum(diff)) / examples
return acc
Dataset
import pandas as pd
data = pd.read_csv('https://archive.ics.uci.edu/ml/machine-learning-databases/haberman/haberman.data', sep = ',', header = None)
data.head()
X_train = data.iloc[:,:-1]
Y_train = data.iloc[:, -1]
X_train = np.array(X_train)
Y_train = np.array(Y_train)
Y_train = Y_train.reshape(Y_train.shape[0], 1)
mean = np.mean(X_train, axis = 0)
variance = np.var(X_train, axis = 0)
X_train = np.divide((X_train - mean), variance)
Y_train = Y_train - 1
# Changing label 1 to 0 and label 2 to 1
Split & Shuffle data
# Split the data into test and train sets
from sklearn.utils import shuffle
X_train, Y_train = shuffle(X_train, Y_train)
X_test = X_train[250:,:]
Y_test = Y_train[250:,:]
X_train_ = X_train[:250,:]
Y_train_ = Y_train[:250,:]
X_train_ = X_train_.reshape(3, 250)
Y_train_ = Y_train_.reshape(1, 250)
X_test = X_test.reshape(3, 56)
Y_test = Y_test.reshape(1, 56)
Creating a Model
m = Model()
m.add_layers([3, 16, 16, 1])
m.train(X_train_, Y_train_, iterations = 5000, alpha = 0.9
, decay_iter = 10, decay_rate = 0.98, stop_decay_counter = 100
, verbose = False, lam = 2)
Evaluate
print('Test set acc = ', m.evaluate(X_test, Y_test))
print('Train set acc = ', m.evaluate(X_train_, Y_train_))
What I did in the experiment.
Shuffle, train several models (different in number of nodes and hidden layers), and evaluate
# Model examples
m.add_layers([3, 16, 16, 1, 50, 3, 25, 7, 99, 1])
m.add_layers([3, 1, 55, 19, 2, 2, 1, 1, 2, 75, 80, 3, 12, 1])
Reshuffle, evaluate
Result: Every model has the exact same train and test accuracy unless the data is reshuffled.
The teacher told me that it's just my thought, and it's not true.
Could you please tell me what is wrong to get this result?
I've been doing a very simply binary cat/dog classification project with machine learning. I understand the problem of overfitting, but what's strange in my case is that the validation loss begins to rise from the very beginning. I've tried many different sets of hyperparameters, with L2 regularization, learning rate decay and stochastic gradient descent, and a large training set, but the issue remained. Here is the learning graph from one of the trials (the horizontal axis should be per 10 epochs):
The hyperparameters are: two hidden layers with 50 and 10 units, initial alpha = 0.05, alpha decay rate = 0.95 per 50 epochs, mini-batch size = 64, lambda = 0.05
Here are other sample learning graphs:
I developed my model on the basis of what's provided in Andrew Ng's Deep Learning Specialization, so I didn't expect many bugs. My full code, as required, is attached below:
import numpy as np
import matplotlib.pyplot as plt
import os
import cv2
from scipy import special
#Data Preprocessing (the same for dev set, which I omit here)
path = '/Users/bobby/Downloads/kagglecatsanddogs_3367a/PetImages'
train_set = []
img_size = 80
categories = ['dogs_train','cats_train']
epsilon = 1e-8
for category in categories:
path_animal = os.path.join(path, category)
for img in os.listdir(path_animal):
try:
img_array = cv2.imread(os.path.join(path_animal, img), cv2.IMREAD_GRAYSCALE)
new_img_array = cv2.resize(img_array, (img_size, img_size))
flattened_img_array = new_img_array.reshape(img_size*img_size)
train_set.append([flattened_img_array, categories.index(category)])
except:
continue
import random
random.shuffle(train_set)
X_train = []
Y_train = []
for sample in train_set:
X_train.append(sample[0])
Y_train.append(sample[1])
X_train = (np.array(X_train).T)/255
Y_train = np.array(Y_train).reshape((1, np.array(Y_train).shape[0]))
def create_mini_batches(X, Y, mini_batch_size):
m = X.shape[1]
mini_batches = []
num_mini_batches = m // mini_batch_size
permutation = list(np.random.permutation(m))
shuffled_X = X[:, permutation]
shuffled_Y = Y[:, permutation]
for i in range(num_mini_batches):
select_X = shuffled_X[:, mini_batch_size*i : mini_batch_size*(i+1)]
select_Y = shuffled_Y[:, mini_batch_size*i : mini_batch_size*(i+1)]
mini_batch = (select_X, select_Y)
mini_batches.append(mini_batch)
if m % mini_batch_size != 0:
last_X = shuffled_X[:, mini_batch_size*num_mini_batches:m]
last_Y = shuffled_Y[:, mini_batch_size*num_mini_batches:m]
last_mini_batch = (last_X, last_Y)
mini_batches.append(last_mini_batch)
return mini_batches
def initialize_parameters(layers_dims):
L = len(layers_dims) # number of layers (including input layer), in this case L=4.
parameters = {}
for l in range(1,L): # range(1,4).
parameters['W' + str(l)] = np.random.randn(layers_dims[l],layers_dims[l-1]) * np.sqrt(2/layers_dims[l-1])
parameters['b' + str(l)] = np.zeros((layers_dims[l],1))
return parameters
def sigmoid(Z):
A = special.expit(Z)
return A,Z
def relu(Z):
A = np.maximum(0.01*Z, Z)
return A,Z
def forward_propagation(X, parameters):
caches = [] #list containing Z for every node
A = X
L = int(len(parameters)/2)
for l in range(1,L):
A_prev = A
W = parameters['W'+str(l)]
b = parameters['b'+str(l)]
Z = np.dot(W, A_prev) + b
A, activation_cache = relu(Z) #activation_cache contains z[l].
linear_cache = (A_prev, W, b) #linear_cache contains A[l-1], W[l], b[l].
cache = (linear_cache, activation_cache)
caches.append(cache)
W = parameters['W'+str(L)]
b = parameters['b'+str(L)]
Z = np.dot(W, A) + b
AL, activation_cache = sigmoid(Z)
linear_cache = (A, W, b)
cache = (linear_cache, activation_cache)
caches.append(cache)
return AL, caches
def compute_cost(AL, Y, parameters, lambd):
m = Y.shape[1] # number of examples
L = int(len(parameters)/2) #[6400,100,20,1] L=3 (0,1,2)
reg_cost = 0
for l in range(L):
W = parameters['W' + str(l+1)]
reg_cost += np.sum(np.square(W))
J = (-1/m)*(np.sum(Y*np.log(AL+epsilon)+(1-Y)*np.log(1-AL+epsilon))) + (1/m) * (lambd/2) * reg_cost
J = np.squeeze(J)
return J
def linear_backward(dZ, linear_cache, lambd):
A_prev, W, b = linear_cache
m = A_prev.shape[1]
dW = (1/m) * np.dot(dZ,A_prev.T) + (lambd/m)*W
db = (1/m) * np.sum(dZ,axis=1,keepdims=True)
dA_prev = np.dot(W.T,dZ)
return dA_prev, dW, db
def relu_gradient(Z):
dZ = np.where(Z > 0, 1, 0.01)
return dZ
def sigmoid_gradient(Z):
dZ = special.expit(Z)*(1-special.expit(Z))
return dZ
def linear_activation_backward(dA, cache, lambd, A, Y, activation):
linear_cache, activation_cache = cache
if activation == 'relu':
dZ = dA * relu_gradient(activation_cache)
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
elif activation == 'sigmoid':
dZ = A - Y
dA_prev, dW, db = linear_backward(dZ, linear_cache, lambd)
return dA_prev, dW, db
def L_model_backward(AL, Y, caches, lambd):
grads = {}
L = len(caches)
m = AL.shape[1]
Y = Y.reshape(AL.shape)
cache_final_layer = caches[L-1]
grads["dA" + str(L-1)], grads["dW" + str(L)], grads["db" + str(L)] = linear_activation_backward(_, cache_final_layer, lambd, AL, Y, activation='sigmoid')
for l in reversed(range(L-1)):
current_cache = caches[l]
grads["dA" + str(l)], grads["dW" + str(l+1)], grads["db" + str(l+1)] = linear_activation_backward(grads['dA' + str(l+1)], current_cache, lambd, _, _, activation='relu')
return grads
def update_parameters(parameters, grads, learning_rate):
L = len(parameters) // 2
for l in range(L):
parameters["W" + str(l+1)] = parameters["W" + str(l+1)] - learning_rate * grads["dW" + str(l+1)]
parameters["b" + str(l+1)] = parameters["b" + str(l+1)] - learning_rate * grads["db" + str(l+1)]
return parameters
def Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, layers_dims, learning_rate, num_epoch, mini_batch_size, lambd, k):
mini_batches = create_mini_batches(X_train, Y_train, mini_batch_size) #[(X{1},Y{1}),(X{2},Y{2}),...,(X{n},Y{n})]
costs_train = []
costs_dev = []
parameters = initialize_parameters(layers_dims)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
for i in range(num_epoch):
for mini_batch in mini_batches:
(minibatch_X, minibatch_Y) = mini_batch
AL, caches = forward_propagation(minibatch_X, parameters)
J_train = compute_cost(AL, minibatch_Y, parameters, lambd)
grads = L_model_backward(AL, minibatch_Y, caches, lambd)
parameters = update_parameters(parameters, grads, learning_rate)
if i % 10 == 0:
costs_train.append(J_train)
AL_dev, caches_dev = forward_propagation(X_dev, parameters)
J_dev = compute_cost(AL_dev, Y_dev, parameters, 0)
costs_dev.append(J_dev)
if i % 100 == 0:
print ("Cost after epoch %i: %f" %(i, J_train))
learning_rate = learning_rate * (k**(i/50))
plt.plot(np.squeeze(costs_train),'r')
plt.plot(np.squeeze(costs_dev),'b')
plt.ylabel('cost')
plt.xlabel('epochs (per thirties)')
plt.show()
return parameters, costs_train, costs_dev
parameters_updated, costs_train, costs_dev = Neural_Network_Model(X_train, Y_train, X_dev, Y_dev, [6400, 50, 10, 1], 0.05, 1000, 64, 0.05, 0.95)
I would really be grateful for anyone who is patient enough to read through my code. If the problem is still overfitting, could you offer some advice as to how to address this issue? I'm at a loss here because the validation loss goes up at a very early stage, so early stopping would cause underfitting by preventing the model from learning more deeply. Any advice would be appreciated.
When Validation Loss starts to increase in early beginning like images you added, it means that there's there is something wrong in the model.
It's not clear what's it as you didn't show your model.
You could check the following links that will help you:
Basic Cats vs Dogs Detailed Example in Colab
Detailed explanation for Over-fitting in TF Tutorial
or add your full code
I need help with my machine learning code. I am trying to train my network using RPROP, and the train accuracy has not been updating.
Here's the code:
import theano
from theano import *
import theano.tensor as T
from theano.ifelse import ifelse
import numpy as np
from random import random
from sklearn import datasets
from sklearn.model_selection import train_test_split
import math
#GETTING TEST DATA
def get_iris_data():
""" Read the iris data set and split them into training and test sets """
iris = datasets.load_iris()
data = iris.data
target = iris.target
# Prepend the column of 1s for bias
N, M = data.shape
all_X = np.ones((N, M + 1))
all_X[:, 1:] = data
# Convert into one-hot vectors
num_labels = len(np.unique(target))
all_Y = np.eye(num_labels)[target] # One liner trick!
return train_test_split(all_X, all_Y, test_size=0.33)
#SETTING INITIAL WEIGHT VALUES
def init_weights(shape):
""" Weight initialization """
weights = np.asarray(np.random.randn(*shape) * 0.01, dtype=theano.config.floatX)
return theano.shared(weights)
def feedforward(X, w1, w2):
hidden = T.nnet.sigmoid(T.dot(X,w1))
out = T.nnet.softmax(T.dot(hidden,w2))
return out
def rprop(cost, params, learning_rate):
""" Back-propagation """
#RPROP Variables
updates = []
gradients = T.grad(cost = cost, wrt = params)
#Default Values
prevparams = params
deltaMax = 50.
deltaMin = math.exp(-6)
deltas = -0.1 * numpy.ones(len(params))
prevgradients = numpy.zeros(len(params))
npos = 1.2
nneg = 0.5
#All Values
allvalues = zip(params, prevparams, gradients, deltas, prevgradients)
for param, prevparam, gradient, delta, prevgradient in allvalues:
polarity = T.sgn(gradient * prevgradient)
prevdelta = delta
if T.gt(polarity, 0):
delta = T.minimum(prevdelta * npos, deltaMax)
change = - T.sgn(gradient) * delta
prevgradient = gradient
elif T.lt(polarity,0):
delta = T.maximu(prevdelta * nneg, deltaMin)
prevgradient = 0
change = -prevgradient
else:
change = - T.sign(gradient) * delta
prevgradient = gradient
updates.append((param, param - change * learning_rate))
return updates
#MAIN FUNCTION
def main():
#Initialization of Variables and data
train_X, test_X, train_Y, test_Y = get_iris_data()
learning_rate = 0.01
X = T.fmatrix()
Y = T.fmatrix()
#Set ANN Network Size
in_size = train_X.shape[1]
hid_size = 256
out_size = train_Y.shape[1]
#Set weights inbetween
w1 = init_weights((in_size, hid_size))
w2 = init_weights((hid_size, out_size))
#Forward Propagation Function for Neuron activation and transfer funtion
yHat = feedforward(X,w1,w2)
#Backpropagation for correction
cost = T.mean(T.nnet.categorical_crossentropy(yHat, Y))
params = [w1,w2]
updates = rprop(cost, params, learning_rate)
# Train and predict
train = theano.function(inputs=[X, Y], outputs=cost, updates=updates, allow_input_downcast=True)
pred_Y = T.argmax(yHat, axis=1)
predict = theano.function(inputs=[X], outputs=pred_Y, allow_input_downcast=True)
# Run SGD
for iter in range(2000):
for i in range(len(train_X)):
train(train_X[i: i + 1], train_Y[i: i + 1])
train_accuracy = np.mean(np.argmax(train_Y, axis=1) == predict(train_X))
test_accuracy = np.mean(np.argmax(test_Y, axis=1) == predict(test_X))
print("Iteration = %d, train accuracy = %.2f%%, test accuracy = %.2f%%"
% (iter + 1, 100 * train_accuracy, 100 * test_accuracy))
if __name__ == '__main__':
main()