I'm trying to implement a neural network with aleatoric uncertainty estimation for regression with pytorch according to
Kendall et al.: "What Uncertainties Do We Need in Bayesian Deep
Learning for Computer Vision?" (Link).
However, while the predicted regression values fit the desired ground truth values quite well, the predicted variance looks weird and the loss gets negative during training.
The paper suggests having two outputs, the mean and the variance, instead of only predicting the regression value. More precisely, it suggests predicting the mean and log(variance) for numerical stability. My network therefore looks as follows:
class ReferenceResNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fcl1 = nn.Linear(1, 32)
        self.fcl2 = nn.Linear(32, 64)
        self.fcl3 = nn.Linear(64, 128)
        self.fcl_mean = nn.Linear(128,1)
        self.fcl_var = nn.Linear(128,1)

    def forward(self, x):
        x = torch.tanh(self.fcl1(x))
        x = torch.tanh(self.fcl2(x))
        x = torch.tanh(self.fcl3(x))
        mean = self.fcl_mean(x)
        log_var = self.fcl_var(x)
        return mean, log_var
According to the paper, given these outputs, the corresponding loss function consists of a residual regression part and a regularization term:
L = 1/N * sum_i ( 0.5 * exp(-s_i) * ||y_i - f(x_i)||^2 + 0.5 * s_i )
where s_i is the log(variance) predicted by the network.
I implemented this loss-function accordingly:
def loss_function(pred_mean, pred_log_var, y):
    return 1/len(pred_mean)*(0.5 * torch.exp(-pred_log_var)*torch.sqrt(torch.pow(y-pred_mean, 2))+0.5*pred_log_var).sum()
I tried this code on a self-generated toy dataset (see the image with results). However, the loss becomes negative during training, and when I plot the variance over the dataset after training it does not really make sense to me, even though the corresponding mean values fit the ground truth quite well:
I already figured out that the negative loss comes from the regularization term, as logarithms are negative for values between 0 and 1. However, I don't believe the absolute value of the regularization term is supposed to grow larger than the regression part. Does anyone know the reason for this and how I can prevent it from happening? And why does my variance look so weird?
For reproduction, my full code looks as follows:
import torch.nn as nn
import torch
import torch.nn.functional as F
import torch.optim as optim
from torch.optim import lr_scheduler
from torch.utils.data.dataset import TensorDataset
from torchvision import datasets, transforms
import math
import numpy as np
import matplotlib.pyplot as plt
from tqdm import tqdm
DEVICE = torch.device("cuda" if torch.cuda.is_available() else "cpu")
class ReferenceRegNet(nn.Module):
    def __init__(self):
        super().__init__()
        self.fcl1 = nn.Linear(1, 32)
        self.fcl2 = nn.Linear(32, 64)
        self.fcl3 = nn.Linear(64, 128)
        self.fcl_mean = nn.Linear(128,1)
        self.fcl_var = nn.Linear(128,1)

    def forward(self, x):
        x = torch.tanh(self.fcl1(x))
        x = torch.tanh(self.fcl2(x))
        x = torch.tanh(self.fcl3(x))
        mean = self.fcl_mean(x)
        log_var = self.fcl_var(x)
        return mean, log_var

def toy_function(x):
    return math.sin(x/15-4)+2 + math.sin(x/10-5)

def loss_function(x_mean, x_log_var, y):
    return 1/len(x_mean)*(0.5 * torch.exp(-x_log_var)*torch.sqrt(torch.pow(y-x_mean, 2))+0.5*x_log_var).sum()
BATCH_SIZE = 10
EVAL_BATCH_SIZE = 10
CLASSES = 1
TRAIN_EPOCHS = 50
# generate toy dataset: A train-set in form of a complex sin-curve
x_train_data = np.array([])
y_train_data = np.array([])
for repeat in range(2):
    for i in range(50, 150):
        for j in range(100):
            sampled_x = i+np.random.randint(101)/100
            sampled_y = toy_function(sampled_x)+np.random.normal(0,0.2)
            x_train_data = np.append(x_train_data, sampled_x)
            y_train_data = np.append(y_train_data, sampled_y)
x_eval_data = list(np.arange(50.0, 150.0, 0.1))
y_eval_data = [toy_function(x) for x in x_eval_data]
LOADER_KWARGS = {'num_workers': 0, 'pin_memory': False} if torch.cuda.is_available() else {}
train_set = TensorDataset(torch.Tensor(x_train_data),torch.Tensor(y_train_data))
eval_set = TensorDataset(torch.Tensor(x_eval_data), torch.Tensor(y_eval_data))
train_loader = torch.utils.data.DataLoader(train_set, batch_size=BATCH_SIZE, shuffle=True, **LOADER_KWARGS)
eval_loader = torch.utils.data.DataLoader(eval_set, batch_size=EVAL_BATCH_SIZE, shuffle=False, **LOADER_KWARGS)
TRAIN_SIZE = len(train_loader.dataset)
EVAL_SIZE = len(eval_loader.dataset)
assert (TRAIN_SIZE % BATCH_SIZE) == 0
assert (EVAL_SIZE % EVAL_BATCH_SIZE) == 0
net = ReferenceRegNet().to(DEVICE)
optimizer = optim.Adam(net.parameters(), lr=1e-3)
losses = {}
# train network
for epoch in range(1,TRAIN_EPOCHS+1):
    net.train()
    mean_epoch_loss = 0
    mean_epoch_mse = 0
    # train batches
    for batch_idx, (data, target) in enumerate(tqdm(train_loader), start=1):
        data, target = (data.to(DEVICE)).unsqueeze(dim=1), (target.to(DEVICE)).unsqueeze(dim=1)
        optimizer.zero_grad()
        output_means, output_log_var = net(data)
        target_np = target.detach().cpu().numpy()
        output_means_np = output_means.detach().cpu().numpy()
        loss = loss_function(output_means, output_log_var, target)
        loss_value = loss.item() # get raw float-value out of loss-tensor
        mean_epoch_loss += loss_value
        # optimize network
        loss.backward()
        optimizer.step()
    mean_epoch_loss = mean_epoch_loss / len(train_loader)
    losses.update({epoch:mean_epoch_loss})
    print("Epoch " + str(epoch) + ": Train-Loss = " + str(mean_epoch_loss))
    net.eval()
    with torch.no_grad():
        mean_loss = 0
        mean_mse = 0
        for data, target in eval_loader:
            data, target = (data.to(DEVICE)).unsqueeze(dim=1), (target.to(DEVICE)).unsqueeze(dim=1)
            output_means, output_log_var = net(data) # perform prediction
            target_np = target.detach().cpu().numpy()
            output_means_np = output_means.detach().cpu().numpy()
            mean_loss += loss_function(output_means, output_log_var, target).item()
        mean_loss = mean_loss/len(eval_loader)
        #print("Epoch " + str(epoch) + ": Eval-loss = " + str(mean_loss))
fig = plt.figure(figsize=(40,12)) # create a 40x12 inch figure
ax = fig.add_subplot(1,3,1)
ax.set_title("regression value")
ax.set_xlabel("x")
ax.set_ylabel("regression mean")
ax.plot(x_train_data, y_train_data, 'x', color='black')
ax.plot(x_eval_data, y_eval_data, color='red')
pred_means_list = []
output_vars_list_train = []
output_vars_list_test = []
for x_test in sorted(x_train_data):
    x_test = (torch.Tensor([x_test]).to(DEVICE))
    pred_means, output_log_vars = net.forward(x_test)
    pred_means_list.append(pred_means.detach().cpu())
    output_vars_list_train.append(torch.exp(output_log_vars).detach().cpu())
ax.plot(sorted(x_train_data), pred_means_list, color='blue', label = 'training_perform')
pred_means_list = []
for x_test in x_eval_data:
    x_test = (torch.Tensor([x_test]).to(DEVICE))
    pred_means, output_log_vars = net.forward(x_test)
    pred_means_list.append(pred_means.detach().cpu())
    output_vars_list_test.append(torch.exp(output_log_vars).detach().cpu())
ax.plot(sorted(x_eval_data), pred_means_list, color='green', label = 'eval_perform')
plt.tight_layout()
plt.legend()
ax = fig.add_subplot(1,3,2)
ax.set_title("variance")
ax.set_xlabel("x")
ax.set_ylabel("regression var")
ax.plot(sorted(x_train_data), output_vars_list_train, label = 'training data')
ax.plot(x_eval_data, output_vars_list_test, label = 'test data')
plt.tight_layout()
plt.legend()
ax = fig.add_subplot(1,3,3)
ax.set_title("training loss")
ax.set_xlabel("Epoch")
ax.set_ylabel("Loss")
lists = sorted(losses.items())
epoch, loss = zip(*lists)
ax.plot(epoch, loss, label = 'loss')
plt.tight_layout()
plt.legend()
plt.savefig('ref_test.png')
TLDR: The optimization drives the loss to a minimum where the gradient
becomes zero, regardless of what the nominal loss value is.
A comprehensive explanation by K.Frank:
A smaller loss – algebraically less positive or algebraically more
negative – means (or should mean) better predictions. The
optimization step uses some version of gradient descent to make
your loss smaller. The overall level of the loss doesn’t matter as
far as the optimization goes. The gradient tells the optimizer how
to change the model parameters to reduce the loss, and it doesn’t
care about the overall level of the loss.
An example from the same source:
Consider, for example, optimizing with lossA = MSELoss. Now
imagine optimizing with lossB = lossA - 17.2. The 17.2 doesn’t
really change anything at all. It is true that “perfect” predictions
will yield lossB = -17.2 rather than zero. (lossA will, of course,
be zero for “perfect” predictions.) But who cares?
In your example: you are right, the negative loss value comes from the logarithmic term. This is completely OK, and it means that your training is dominated by contributions of high-confidence loss terms. Regarding the high variance values, I can't comment much, but it should be fine since the loss curve drops as expected.
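As a quick illustration of the constant-offset example quoted above (a minimal sketch of my own, not part of the original answer): subtracting a constant from a loss leaves its gradients, and therefore the optimization, unchanged.
import torch

x = torch.randn(8, 1)
y = torch.randn(8, 1)
w = torch.randn(1, 1, requires_grad=True)

loss_a = torch.nn.functional.mse_loss(x @ w, y)
loss_b = loss_a - 17.2                        # shifted loss, may well be negative

grad_a, = torch.autograd.grad(loss_a, w, retain_graph=True)
grad_b, = torch.autograd.grad(loss_b, w)
print(torch.allclose(grad_a, grad_b))         # True: identical gradients
So a negative loss value by itself says nothing about whether training works; only the gradients and the trend of the loss matter.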
I'm currently trying to build a "simple" LSTM model that takes historical Bitcoin data, learns from it, and then tries to predict the future X steps in advance.
I've built it on the idea that A + B + C = D, so B + C + D should be E. (I think that's a very simple idea behind an LSTM model. I might be wrong, however; I'm pretty new to this.)
I managed to build the basics in Python (I'm fairly new to Python), but something seems off with the predictions. For some reason many of the predictions I test/make end up flatlining. I have a theory on why, but no idea whether it's correct, and even less idea how to solve it.
My theory is that within a sequence the model learns to put more importance/weight on the last value in the sequence, because with Bitcoin prices the future price (in 1 minute) is probably pretty close to the current price. That's why the predicted values keep getting closer to the real value, eventually becoming equal and thus flatlining in the graph. (I don't know if that makes sense, but that's what I thought anyway.)
I've also added a screenshot of my graph from a few days ago. Almost all predictions, however, end up similar to this graph; this is just a more extreme example for demonstration.
Here is my code. Can someone please explain why it flatlines and what I did wrong?
import numpy as np
from matplotlib import pyplot
import pandas as pd
from sklearn.preprocessing import MinMaxScaler
from keras.models import Sequential
from keras.layers import Dense
from keras.layers import LSTM
from keras.layers import Dropout
import yfinance as yf
from math import sqrt
from sklearn.metrics import mean_squared_error
# Create output sets X + Y from a given input set
# with inputset : a 1-dimensional list of floats
# with N : the number of lookback values to use for X
# with Gap : the number of points skipped between X and Y
# Y: is equal to the input (although the first N values are missing)
# X: for each y of Y a corresponding set of size N is created,
# composed of the N values preceding y.
def create_lookback(inputset, n=1, gap=0):
    print("create_lookback with n=%d gap=%d" % (n,gap))
    print(" - length of inputset = %d" % len(inputset))
    dataX, dataY = [], []
    for i in range(len(inputset) - (n+gap)):
        a = inputset[i:(i + n), 0]
        dataX.append(a)
        dataY.append(inputset[i + n+gap, 0])
    print(" - length of dataY = %d" % len(dataY))
    data_x = np.array(dataX)
    xret = data_x.reshape(data_x.shape[0], 1, data_x.shape[1])
    return xret, np.array(dataY)
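# Quick illustration (added for clarity, not in the original post): with n=3 and gap=0,
# each X row holds three consecutive values and y is the value that follows them.
demo = np.arange(6, dtype=float).reshape(-1, 1)   # column vector 0..5
demo_x, demo_y = create_lookback(demo, n=3, gap=0)
print(demo_x.shape)   # (3, 1, 3): samples, one timestep, the n lookback values as features
print(demo_y)         # [3. 4. 5.]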
# Train model based on given training-set + Test-set
def create_model(trainX,trainY,testX,testY):
    model = Sequential()
    model.add(LSTM(units = 100, input_shape=(trainX.shape[1], trainX.shape[2], )))
    model.add(Dropout(0.2))
    #model.add(LSTM(30, return_sequences=True))
    #model.add(Dropout(0.1))
    model.add(Dense(1))
    model.compile(loss='mae', optimizer='adam')
    history = model.fit(trainX, trainY, epochs=100, batch_size=5, validation_data=(testX, testY), verbose=1, shuffle=False)
    return model
# Evaluate given X / Y set.
# - Calculate RMSE
# - Generate visual line-plot to screen
def show_result(scaler,yhat,setY,txt):
    print("Show %s result" % txt)
    yhat_inverse = scaler.inverse_transform(yhat.reshape(-1, 1))
    testY_inverse = scaler.inverse_transform(setY.reshape(-1, 1))
    if len(testY_inverse) == len(yhat_inverse):
        rmse = sqrt(mean_squared_error(testY_inverse, yhat_inverse))
        print(' RMSE %s : %.3f' % (txt,rmse))
    pyplot.plot(yhat_inverse, label='predict '+txt)
    pyplot.plot(testY_inverse, label='actual '+txt, alpha=0.5)
    pyplot.legend()
    pyplot.show()
# Extrapoleer is dutch for Extrapolate
def extrapoleer(i,model,tup,toekomst):
    if(i == 0):
        return
    setX = np.array([[tup]])
    y = model.predict(setX)
    y_float = y[0][0]
    tup_new = np.append(tup[1:], y_float)
    toekomst.append(y_float)
    extrapoleer(i-1, model, tup_new,toekomst)
# --- end of defined functions
# -- start of main flow
data_grid_1 = yf.download('BTC-USD', start="2021-04-14",end="2021-04-15", interval="1m");
data_grid_2 = yf.download('BTC-USD', period="12h", interval="1m");
dataset_1 = data_grid_1.iloc[:, 1:2].values
dataset_2 = data_grid_2.iloc[:, 1:2].values
scaler = MinMaxScaler(feature_range = (0, 1))
scaled = scaler.fit_transform(dataset_1)
# 70% of dataset_1 is used to train ; 30% to test
train_size = int(len(scaled) * 0.7)
test_size = len(scaled) - train_size
train, test = scaled[0:train_size,:], scaled[train_size:len(scaled),:]
print("train: %d test: %d" % (len(train), len(test)))
scaled_2 = scaler.fit_transform(dataset_2)
look_back_n = 3
look_back_gap = 0
trainX, trainY = create_lookback(train, look_back_n, look_back_gap)
testX, testY = create_lookback(test, look_back_n, look_back_gap)
testX_2, testY_2 = create_lookback(scaled_2, look_back_n, look_back_gap)
model = create_model(trainX,trainY,testX,testY)
yhat_1 = model.predict(testX)
yhat_2 = model.predict(testX_2)
show_result(scaler,yhat_1,testY,"test")
show_result(scaler,yhat_2,testY_2,"test2")
last_n = testY_2[-look_back_n:]
#toekomst = "future" in Dutch
toekomst = []
#aantal = "amount" in Dutch; this is the number of steps you want to predict into the future
aantal = 30
extrapoleer(aantal, model, last_n, toekomst)
print("Resultaat van %d voorspelde punten in de toekomst: " % aantal)  # "Result of %d predicted points in the future"
print(toekomst)
yhat_2_plus = np.append(yhat_2,toekomst)
show_result(scaler,yhat_2_plus,testY_2,"test2-plus")
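# Added sanity check (not in the original post): compare the LSTM against a naive
# persistence baseline that simply repeats the previous value. If the LSTM's RMSE is
# not clearly lower, the network has effectively learned to copy its last input,
# which is exactly the flatlining / lagging behaviour described above.
persist = testY_2[:-1]          # "prediction" = previous true (scaled) value
actual = testY_2[1:]
rmse_persist = sqrt(mean_squared_error(
    scaler.inverse_transform(actual.reshape(-1, 1)),
    scaler.inverse_transform(persist.reshape(-1, 1))))
rmse_lstm = sqrt(mean_squared_error(
    scaler.inverse_transform(testY_2.reshape(-1, 1)),
    scaler.inverse_transform(yhat_2.reshape(-1, 1))))
print("RMSE persistence baseline: %.3f  RMSE LSTM: %.3f" % (rmse_persist, rmse_lstm))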
I managed to successfully implement multilinear regression using only NumPy for the Iris dataset. I wanted to do the same for the
Boston housing dataset, but my model won't learn and I have no idea why.
import numpy as np
import pandas as pd
# read data and split into test and training sets
data = pd.read_csv('train.csv')
data = (data - data.mean()) / data.std() # normalize data
split_data = np.random.rand(len(data)) < 0.8
train_data = data[split_data].round(5)
test_data = data[~split_data]
# create matrices
input_features_train = train_data.drop(['ID', 'medv'], 1).values
output_feature_train = train_data.medv.values.reshape(-1, 1)
ones = np.ones([input_features_train.shape[0], 1])
input_features_train = np.concatenate((ones, input_features_train), 1)
weight = np.zeros([1, 14])
def computeCost(X, y, theta):
    summed = np.power(((X @ theta.T) - y), 2)
    return np.sum(summed) / (2 * len(X))

def gradientDescent(X, y, theta, iters, alpha):
    costs = np.zeros(iters)
    for i in range(iters):
        theta = theta - (alpha / len(X)) * np.sum(X * (X @ theta.T - y), 0)
        costs[i] = computeCost(X, y, theta)
    return theta, costs
learning_rate = 0.01
iterations = 100000
weights, cost = gradientDescent(input_features_train, output_feature_train, weight, iterations, learning_rate)
print("Weights: ", weights)
finalCost = computeCost(input_features_train, output_feature_train, weights)
# test
input_features_test = test_data.drop(['ID', 'medv'], 1).values
output_feature_test = test_data.medv.values.reshape(-1, 1)
ones = np.ones([input_features_test.shape[0], 1])
input_features_test = np.concatenate((ones, input_features_test), 1)
def test_data(input_features, output_feature, weights):
    predictions = np.round(np.dot(input_features, weights.T))
    for i in range(len(output_feature)):
        predicted = predictions[i]
        success = predictions[i] == output_feature[i]
        print('For features: ', input_features[i], ' housing price should be ', output_feature[i])
        print("Predicted: %f" % predicted)
        print("Is success? ", success)
        print()
test_data(input_features_test, output_feature_test, weights)
predictions = np.round(np.dot(input_features_test, weights.T))
accuracy = (sum(predictions == output_feature_test) / float(len(output_feature_test)) * 100)[0]
print("Accuracy of the model is ", accuracy, "% after ", iterations, "iterations")
Example output goes as follows:
Weights: [[ 0.01465871 -0.11583742 0.17729105 0.01249782 0.09822299 -0.31249182
0.25208063 -0.00937766 -0.48751822 0.46772537 -0.27637035 -0.1590125
0.12926108 -0.48910136]]
For features: [ 1. -0.44852959 -0.47141352 0.09095532 -0.25240023 0.13793157
0.46506236 0.03105118 -0.62153314 -0.98758424 -0.79769195 1.18594974
0.37563165 -0.40259248] housing price should be [-0.04019949]
Predicted: 0.000000
Is success? [False]
I tried even 10,000,000 iterations and it still fails all tests and has 0% accuracy. On the Iris dataset I managed to get 100% with this model, so I don't understand why it won't work here.
I suspect it might be something with the data normalization, as without it I get RuntimeWarning: overflow encountered in power at
summed = np.power(((X @ theta.T) - y), 2)
and I don't know why that happens either.
Could you please point me in the right direction? Thanks!
I really suggest you use scikit-learn for this. You can use SGDRegressor, or the CatBoost regressor, both of which have the built-in support needed for approaches like this.
The main motive behind this suggestion is that implementing gradient descent manually may lead to logical errors that go undetected.
Try to solve it with scikit-learn. That might help.
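For illustration, a rough sketch of that suggestion using SGDRegressor (this assumes the same train.csv with ID and medv columns as in the question; the split and hyperparameters are just placeholders):
import pandas as pd
from sklearn.linear_model import SGDRegressor
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler

data = pd.read_csv('train.csv')                  # same file as in the question
X = data.drop(['ID', 'medv'], axis=1).values
y = data.medv.values
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

scaler = StandardScaler().fit(X_train)           # scikit-learn handles the normalization too
reg = SGDRegressor(max_iter=10000)
reg.fit(scaler.transform(X_train), y_train)
print("R^2 on held-out data:", reg.score(scaler.transform(X_test), y_test))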
I am currently taking the Deep Learning specialization by Deeplearning.ai on Coursera and am on the first assignment, which requires implementing a neural network with a logistic regression mindset. The assignment implements the neural network as a logistic regression function for UNSTRUCTURED DATA (IMAGES). I have successfully completed the assignment, getting all the expected outputs. However, I am now trying to use the coded neural network for STRUCTURED DATA, but come across a broadcast error. Part of the code is as below:
The dataset code
path_train = r'C:\Users\Ahmed Ismail Khalid\Desktop\Research Paper\Research Paper Feature Sets\Balanced Feature Sets\Balanced Train combined scores.csv'
path_test = r'C:\Users\Ahmed Ismail Khalid\Desktop\Research Paper\Research Paper Feature Sets\Balanced Feature Sets\Balanced Test combined scores.csv'
df_train = pd.read_csv(path_train)
#df_train = df_train.to_numpy()
df_test = pd.read_csv(path_test)
#df_test = df_test.to_numpy()
x_train = df_train.iloc[:,1:19]
x_train = x_train.to_numpy()
x_train = x_train.T
y_train = df_train.iloc[:,19]
y_train = y_train.to_numpy()
y_train = y_train.reshape(y_train.shape[0],1)
y_train = y_train.T
x_test = df_test.iloc[:,1:19]
x_test = x_test.to_numpy()
x_test = x_test.T
y_test = df_test.iloc[:,19]
y_test = y_test.to_numpy()
y_test = y_test.reshape(y_test.shape[0],1)
y_test = y_test.T
print ("Number of training examples: m_train = " + str(m_train))
print ("Number of testing examples: m_test = " + str(m_test))
print ("train_set_x shape: " + str(x_train.shape))
print ("train_set_y shape: " + str(y_train.shape))
print ("test_set_x shape: " + str(x_test.shape))
print ("test_set_y shape: " + str(y_test.shape))
Output of Dataset Code
Number of training examples: df_train = 713
Number of testing examples: df_test = 237
x_train shape: (18, 713)
y_train shape: (1, 713)
x_test shape: (18, 237)
y_test shape: (1, 237)
The propagate function code
def propagate(w,b,X,Y) :
    m = X.shape[1]
    A = sigmoid((w.T * X) + b)
    cost = (- 1 / m) * np.sum(np.dot(Y,np.log(A)) + np.dot((1 - Y), np.log(1 - A)))
    dw = (1 / m) * np.dot((X,(A - Y)).T)
    db = (1 / m) * np.sum(A - Y)
    assert(dw.shape == w.shape)
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())
    grads = {"dw": dw,
             "db": db}
    return grads, cost
The optimize and model functions
def optimize(w,b,X,Y,num_iterations,learning_rate,print_cost) :
    costs = []
    for i in range(num_iterations) :
        # Cost and gradient calculation
        grads, cost = propagate(w,b,X,Y)
        # Retrieve derivatives from gradients
        dw = grads['dw']
        db = grads['db']
        # Update w and b
        w = w - learning_rate * dw
        b = b - learning_rate * db
        if i % 100 == 0:
            costs.append(cost)
        # Print the cost every 100 training iterations
        if print_cost and i % 100 == 0:
            print ("Cost after iteration %i: %f" %(i, cost))
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs
def model(X_train, Y_train, X_test, Y_test, num_iterations = 2000, learning_rate = 0.5, print_cost = False) :
    # initialize parameters with zeros
    w, b = initialize_with_zeros(X_train.shape[0])
    # Gradient descent (≈ 1 line of code)
    parameters, grads, costs = optimize(w,b,X_train,Y_train,num_iterations,learning_rate,print_cost)
    # Retrieve parameters w and b from dictionary "parameters"
    w = parameters["w"]
    b = parameters["b"]
    # Predict train/test set examples (≈ 2 lines of code)
    Y_prediction_train = predict(w,b,X_train)
    Y_prediction_test = predict(w,b,X_test)
    # Print train/test Errors
    print("train accuracy: {} %".format(100 - np.mean(abs(Y_prediction_train - Y_train)) * 100))
    print("test accuracy: {} %".format(100 - np.mean(abs(Y_prediction_test - Y_test)) * 100))
    d = {"costs": costs,
         "Y_prediction_test": Y_prediction_test,
         "Y_prediction_train" : Y_prediction_train,
         "w" : w,
         "b" : b,
         "learning_rate" : learning_rate,
         "num_iterations": num_iterations}
    return d
Model Function output
Cost after iteration 0: 0.693147
train accuracy: -0.1402524544179613 %
test accuracy: 0.4219409282700326 %
When I run the code, I get ValueError: operands could not be broadcast together with shapes (1,713) (713,18) at A = sigmoid((w.T * X) + b). I am pretty new to neural networks and usage of numpy, so I can't figure out the problem. Any and all help would be really appreciated. The entire .ipynb file containing the entire code can be downloaded from here
Thanks
The * operator is elementwise multiplication, and your arrays have incompatible shapes. You want matrix multiplication, which you can do with np.matmul() or with the @ operator:
A = sigmoid(w.T @ X + b)
A lot of ML, especially neural nets, is about keeping the shapes of things straight. Check the shapes of your w, X, and Y — they should be: (features, 1), (features, m), (1, m) respectively, where features is 18 for you, and m is 713.
You should also then be able to make sure that the shape of A matches Y.
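For illustration, a minimal sketch with the sizes from your printout (18 features, 713 training examples) showing how the matrix product lines the shapes up; dw then follows the usual np.dot(X, (A - Y).T) pattern. The random arrays are just stand-ins for your data:
import numpy as np

def sigmoid(z):
    return 1 / (1 + np.exp(-z))

features, m = 18, 713
w = np.zeros((features, 1))
b = 0.0
X = np.random.rand(features, m)                 # stand-in for x_train
Y = np.random.randint(0, 2, (1, m))             # stand-in for y_train

A = sigmoid(w.T @ X + b)                        # matrix product, not elementwise *
print(A.shape, Y.shape)                         # (1, 713) (1, 713) -- they line up
dw = (1 / m) * np.dot(X, (A - Y).T)             # (18, 1), same shape as w
db = (1 / m) * np.sum(A - Y)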
This is my custom extension of one of Andrew Ng's neural networks from the deep learning course, where instead of producing 0 or 1 for binary classification I'm attempting to classify multiple examples.
Both the inputs and outputs are one hot encoded.
With not much training I receive an accuracy of 'train accuracy: 67.51658067499625 %'
How can I classify a single training example instead of classifying all training examples?
I think a bug exists in my implementation, as an issue with this network is that the training examples (train_set_x) and output values (train_set_y) both need to have the same dimensions, or an error related to the dimensionality of the matrices is raised.
For example using :
train_set_x = np.array([
[1,1,1,1],[0,1,1,1],[0,0,1,1]
])
train_set_y = np.array([
[1,1,1],[1,1,0],[1,1,1]
])
returns error :
ValueError Traceback (most recent call last)
<ipython-input-11-0d356e8d66f3> in <module>()
27 print(A)
28
---> 29 np.multiply(train_set_y,A)
30
31 def initialize_with_zeros(numberOfTrainingExamples):
ValueError: operands could not be broadcast together with shapes (3,3) (1,4)
network code :
import numpy as np
import matplotlib.pyplot as plt
import h5py
import scipy
from scipy import ndimage
import pandas as pd
%matplotlib inline
train_set_x = np.array([
[1,1,1,1],[0,1,1,1],[0,0,1,1]
])
train_set_y = np.array([
[1,1,1,0],[1,1,0,0],[1,1,1,1]
])
numberOfFeatures = 4
numberOfTrainingExamples = 3
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s
w = np.zeros((numberOfTrainingExamples , 1))
b = 0
A = sigmoid(np.dot(w.T , train_set_x))
print(A)
np.multiply(train_set_y,A)
def initialize_with_zeros(numberOfTrainingExamples):
    w = np.zeros((numberOfTrainingExamples , 1))
    b = 0
    return w, b

def propagate(w, b, X, Y):
    m = X.shape[1]
    A = sigmoid(np.dot(w.T , X) + b)
    cost = -(1/m)*np.sum(np.multiply(Y,np.log(A)) + np.multiply((1-Y),np.log(1-A)), axis=1)
    dw = ( 1 / m ) * np.dot( X, ( A - Y ).T )   # consumes ( A - Y )
    db = ( 1 / m ) * np.sum( A - Y )            # consumes ( A - Y ) again
    # cost = np.squeeze(cost)
    grads = {"dw": dw,
             "db": db}
    return grads, cost
def optimize(w, b, X, Y, num_iterations, learning_rate, print_cost = True):
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(w, b, X, Y)
        dw = grads["dw"]
        db = grads["db"]
        w = w - (learning_rate * dw)
        b = b - (learning_rate * db)
        if i % 100 == 0:
            costs.append(cost)
        if print_cost and i % 10000 == 0:
            print(cost)
    params = {"w": w,
              "b": b}
    grads = {"dw": dw,
             "db": db}
    return params, grads, costs
def model(X_train, Y_train, num_iterations, learning_rate = 0.5, print_cost = False):
    w, b = initialize_with_zeros(numberOfTrainingExamples)
    parameters, grads, costs = optimize(w, b, X_train, Y_train, num_iterations, learning_rate, print_cost = True)
    w = parameters["w"]
    b = parameters["b"]
    Y_prediction_train = sigmoid(np.dot(w.T , X_train) + b)
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - Y_train)) * 100))
model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.0001, print_cost = True)
Update: a bug exists in this implementation in that the training example pairs (train_set_x, train_set_y) must have the same dimensions. Can someone point me in the direction of how the linear algebra should be modified?
Update 2:
I modified @Paul Panzer's answer so that the learning rate is 0.001 and the train_set_x, train_set_y pairs are unique:
train_set_x = np.array([
[1,1,1,1,1],[0,1,1,1,1],[0,0,1,1,0],[0,0,1,0,1]
])
train_set_y = np.array([
[1,0,0],[0,0,1],[0,1,0],[1,0,1]
])
grads = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True)
# To classify single training example :
print(sigmoid(dw @ [0,0,1,1,0] + db))
This update produces following output :
-2.09657359028
-3.94918577439
[[ 0.74043089 0.32851512 0.14776077 0.77970162]
[ 0.04810012 0.08033521 0.72846174 0.1063849 ]
[ 0.25956911 0.67148488 0.22029838 0.85223923]]
[[1 0 0 1]
[0 0 1 0]
[0 1 0 1]]
train accuracy: 79.84462279013312 %
[[ 0.51309252 0.48853845 0.50945862]
[ 0.5110232 0.48646923 0.50738869]
[ 0.51354109 0.48898712 0.50990734]]
Should print(sigmoid(dw @ [0,0,1,1,0] + db)) produce a vector that, once rounded, matches the corresponding train_set_y value, [0,1,0]?
Modifying it to produce a column vector (by wrapping [0,0,1,1,0] in a numpy array and taking the transpose):
print(sigmoid(dw @ np.array([[0,0,1,1,0]]).T + db))
returns :
array([[ 0.51309252],
[ 0.48646923],
[ 0.50990734]])
Again, rounding these values to the nearest whole number produces the vector [1,0,1] when [0,1,0] is expected.
Are these the wrong operations for producing a prediction for a single training example?
Your difficulties come from mismatched dimensions, so let's walk through the problem and try and get them straight.
Your network has a number of inputs, the features; let's call their number N_in (numberOfFeatures in your code). And it has a number of outputs which correspond to different classes; let's call their number N_out. Inputs and outputs are connected by the weights w.
Now here is the problem. Connections are all-to-all, so we need a weight for each of the N_out x N_in pairs of outputs and inputs. Therefore in your code the shape of w must be changed to (N_out, N_in). You probably also want an offset b for each output, so b should be a vector of size (N_out,) or rather (N_out, 1) so it plays well with the 2d terms.
I've fixed that in the modified code below and I tried to make it very explicit. I've also thrown a mock data creator into the bargain.
Regarding the one-hot encoded categorical output: I'm not an expert on neural networks, but I think most people understand it so that classes are mutually exclusive, so each sample in your mock output should have a single one and the rest zeros.
Side note:
At one point a competing answer advised you to get rid of the 1-... terms in the cost function. While that looks like an interesting idea to me, my gut feeling (Edit: now confirmed using a gradient-free minimizer; use activation="hybrid" in the code below. The solver will simply maximize all outputs which are active in at least one training example.) is that it won't work just like that, because the cost will then fail to penalise false positives (see below for a detailed explanation). To make it work you'd have to add some kind of regularization. One method that appears to work is using the softmax instead of the sigmoid. The softmax is to one-hot what the sigmoid is to binary; it makes sure the output is "fuzzy one-hot".
Therefore my recommendation is:
If you want to stick with sigmoid and not explicitly enforce one-hot predictions, keep the 1-... term.
If you want to use the shorter cost function, enforce one-hot predictions, for example by using softmax instead of sigmoid.
I've added an activation="sigmoid"|"softmax"|"hybrid" parameter to the code that switches between models. I've also made the scipy general purpose minimizer available, which may be useful when the gradient of the cost is not at hand.
Recap on how the cost function works:
The cost is a sum over all classes and all training samples of the term
-y log (y') - (1-y) log (1-y')
where y is the expected response, i.e. the one given by the "y" training sample for the input (the "x" training sample). y' is the prediction, the response the network with its current weights and biases generates. Now, because the expected response is either 0 or 1 the cost for a single category and a single training sample can be written
-log (y') if y = 1
-log(1-y') if y = 0
because in the first case (1-y) is zero, so the second term vanishes, and in the second case y is zero, so the first term vanishes.
One can now convince oneself that the cost is high if
the expected response y is 1 and the network prediction y' is close to zero
the expected response y is 0 and the network prediction y' is close to one
In other words the cost does its job in punishing wrong predictions. Now, if we drop the second term (1-y) log (1-y') half of this mechanism is gone. If the expected response is 1, a low prediction will still incur a cost, but if the expected response is 0, the cost will be zero, regardless of the prediction, in particular, a high prediction (or false positive) will go unpunished.
Now, because the total cost is a sum over all training samples, there are three possibilities.
all training samples prescribe that the class be zero:
then the cost will be completely independent of the predictions for this class and no learning can take place
some training samples put the class at zero, some at one:
then, because "false negatives" or "misses" are still punished but false positives aren't, the net will find the easiest way to minimize the cost, which is to indiscriminately increase the prediction of the class for all samples
all training samples prescribe that the class be one:
essentially the same as in the second scenario will happen, only here it's no problem, because that is the correct behavior
And finally, why does it work if we use softmax instead of sigmoid? False positives will still be invisible. But it is easy to see that the sum over all classes of the softmax is one, so I can only increase the prediction for one class if at least one other class is reduced to compensate. In particular, there can be no false positive without a false negative, and the cost will detect the false negative.
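A tiny numeric illustration of this point (my own addition, not from the original answer): with a one-hot target and sigmoid outputs that fire for every class, the full cost is large while the shortened cost barely notices the two false positives.
import numpy as np

y = np.array([1.0, 0.0, 0.0])            # one-hot target for three classes
y_pred = np.array([0.99, 0.99, 0.99])    # sigmoid outputs with two false positives

full = -(y * np.log(y_pred) + (1 - y) * np.log(1 - y_pred)).sum()
short = -(y * np.log(y_pred)).sum()
print(full)    # ~9.22 -- the (1-y) terms punish the false positives
print(short)   # ~0.01 -- tiny, despite two confidently wrong classes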
On how to get a binary prediction:
For binary expected responses rounding is indeed the appropriate procedure. For one-hot I'd rather find the largest value, set that to one and all others to zero. I've added a convenience function, predict, implementing that.
import numpy as np
from scipy import optimize as opt
from collections import namedtuple
# First, a few structures to keep ourselves organized
Problem_Size = namedtuple('Problem_Size', 'Out In Samples')
Data = namedtuple('Data', 'Out In')
Network = namedtuple('Network', 'w b activation cost gradient most_likely')
def get_dims(Out, In, transpose=False):
    """extract dimensions and ensure everything is 2d
    return Data, Dims"""
    # gracefully accept lists etc.
    Out, In = np.asanyarray(Out), np.asanyarray(In)
    if transpose:
        Out, In = Out.T, In.T
    # if it's a single sample make sure it's n x 1
    Out = Out[:, None] if len(Out.shape) == 1 else Out
    In = In[:, None] if len(In.shape) == 1 else In
    Dims = Problem_Size(Out.shape[0], *In.shape)
    if Dims.Samples != Out.shape[1]:
        raise ValueError("number of samples must be the same for Out and In")
    return Data(Out, In), Dims
def sigmoid(z):
    s = 1 / (1 + np.exp(-z))
    return s

def sig_cost(Net, data):
    A = process(data.In, Net)
    logA = np.log(A)
    return -(data.Out * logA + (1 - data.Out) * np.log(1 - A)).sum(axis=0).mean()

def sig_grad(Net, Dims, data):
    A = process(data.In, Net)
    return dict(dw = (A - data.Out) @ data.In.T / Dims.Samples,
                db = (A - data.Out).mean(axis=1, keepdims=True))
def sig_ml(z):
    return np.round(z).astype(int)

def sof_ml(z):
    hot = np.argmax(z, axis=0)
    z = np.zeros(z.shape, dtype=int)
    z[hot, np.arange(len(hot))] = 1
    return z

def softmax(z):
    z = z - z.max(axis=0, keepdims=True)
    z = np.exp(z)
    return z / z.sum(axis=0, keepdims=True)

def sof_cost(Net, data):
    A = process(data.In, Net)
    logA = np.log(A)
    return -(data.Out * logA).sum(axis=0).mean()
sof_grad = sig_grad
def get_net(Dims, activation='softmax'):
    activation, cost, gradient, ml = {
        'sigmoid': (sigmoid, sig_cost, sig_grad, sig_ml),
        'softmax': (softmax, sof_cost, sof_grad, sof_ml),
        'hybrid': (sigmoid, sof_cost, None, sig_ml)}[activation]
    return Network(w=np.zeros((Dims.Out, Dims.In)),
                   b=np.zeros((Dims.Out, 1)),
                   activation=activation, cost=cost, gradient=gradient,
                   most_likely=ml)
def process(In, Net):
    return Net.activation(Net.w @ In + Net.b)

def propagate(data, Dims, Net):
    return Net.gradient(Net, Dims, data), Net.cost(Net, data)

def optimize_no_grad(Net, Dims, data):
    def f(x):
        Net.w[...] = x[:Net.w.size].reshape(Net.w.shape)
        Net.b[...] = x[Net.w.size:].reshape(Net.b.shape)
        return Net.cost(Net, data)
    x = np.r_[Net.w.ravel(), Net.b.ravel()]
    res = opt.minimize(f, x, options=dict(maxiter=10000)).x
    Net.w[...] = res[:Net.w.size].reshape(Net.w.shape)
    Net.b[...] = res[Net.w.size:].reshape(Net.b.shape)
def optimize(Net, Dims, data, num_iterations, learning_rate, print_cost = True):
    w, b = Net.w, Net.b
    costs = []
    for i in range(num_iterations):
        grads, cost = propagate(data, Dims, Net)
        dw = grads["dw"]
        db = grads["db"]
        w -= learning_rate * dw
        b -= learning_rate * db
        if i % 100 == 0:
            costs.append(cost)
        if print_cost and i % 10000 == 0:
            print(cost)
    return grads, costs
def model(X_train, Y_train, num_iterations, learning_rate = 0.5, print_cost = False, activation='sigmoid'):
    data, Dims = get_dims(Y_train, X_train, transpose=True)
    Net = get_net(Dims, activation)
    if Net.gradient is None:
        optimize_no_grad(Net, Dims, data)
    else:
        grads, costs = optimize(Net, Dims, data, num_iterations, learning_rate, print_cost = True)
    Y_prediction_train = process(data.In, Net)
    print(Y_prediction_train)
    print(data.Out)
    print(Y_prediction_train.sum(axis=0))
    print("train accuracy: {} %".format(100 - np.mean(np.abs(Y_prediction_train - data.Out)) * 100))
    return Net
def predict(In, Net, probability=False):
    In = np.asanyarray(In)
    is1d = In.ndim == 1
    if is1d:
        In = In.reshape(-1, 1)
    Out = process(In, Net)
    if not probability:
        Out = Net.most_likely(Out)
    if is1d:
        Out = Out.reshape(-1)
    return Out

def create_data(Dims):
    Out = np.zeros((Dims.Out, Dims.Samples), dtype=int)
    Out[np.random.randint(0, Dims.Out, (Dims.Samples,)), np.arange(Dims.Samples)] = 1
    In = np.random.randint(0, 2, (Dims.In, Dims.Samples))
    return Data(Out, In)
train_set_x = np.array([
[1,1,1,1,1],[0,1,1,1,1],[0,0,1,1,0],[0,0,1,0,1]
])
train_set_y = np.array([
[1,0,0],[1,0,0],[0,0,1],[0,0,1]
])
Net1 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='sigmoid')
Net2 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='softmax')
Net3 = model(train_set_x, train_set_y, num_iterations = 20000, learning_rate = 0.001, print_cost = True, activation='hybrid')
Dims = Problem_Size(8, 100, 50)
data = create_data(Dims)
model(data.In.T, data.Out.T, num_iterations = 40000, learning_rate = 0.001, print_cost = True, activation='softmax')
model(data.In.T, data.Out.T, num_iterations = 40000, learning_rate = 0.001, print_cost = True, activation='sigmoid')
Both the question of how to fix the bug and how you can extend the implementation to classify between more classes can be answered with some dimensionality analysis.
I am assuming that by classifying multiple examples you mean multiple classes and not multiple samples, as we need multiple samples to train even for two classes.
With N = number of samples, D = number of features, and K = number of categories (K=2 being a special case that can be reduced to one dimension, i.e. K=1, with y=0 signifying one class and y=1 the other), the data should have the following dimensions:
X: N * D #input
y: N * K #output
W: D * K #weights, also dW has same dimensions
b: 1 * K #bias, also db has same dimensions
#A should have same dimensions as y
The order of the dimensions can be switched around, as long as the dot products are done correctly.
First, dealing with your bug: you are initializing W as N * K instead of D * K, i.e. in the binary case:
w = np.zeros((numberOfTrainingExamples , 1))
#instead of
w = np.zeros((numberOfFeatures , 1))
This means that the only time you are initializing W with the correct dimensions is when y and X (coincidentally) have the same dimensions.
This will mess with your dot products as well:
np.dot(X, w) # or np.dot(w.T,X.T) if you define y as [K * N] dimensions
#instead of
np.dot(w.T , X)
and
np.dot( X.T, ( A - Y ) ) #np.dot( X.T, ( A - Y ).T ) if y:[K * N]
#instead of
np.dot( X, ( A - Y ).T )
Also make sure that the cost function returns one number (ie. not an array).
Secondly, going on to K>2, you need to make some changes: b is no longer a single number but a vector (1D array), and y and W go from being 1D arrays to 2D arrays. To avoid confusion and hard-to-find bugs it can be good to set K, N and D to different values, as in the sketch below.
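For example, a minimal sketch (my own, with deliberately different sizes N=5, D=4, K=3) showing the shapes from the table above and one gradient-descent step:
import numpy as np

N, D, K = 5, 4, 3                           # samples, features, categories

X = np.random.randint(0, 2, (N, D))         # N x D input
y = np.eye(K)[np.random.randint(0, K, N)]   # N x K one-hot output
W = np.zeros((D, K))                        # D x K weights
b = np.zeros((1, K))                        # 1 x K bias

A = 1 / (1 + np.exp(-(X @ W + b)))          # N x K, same shape as y
dW = X.T @ (A - y) / N                      # D x K, same shape as W
db = (A - y).sum(axis=0, keepdims=True) / N # 1 x K, same shape as b
W -= 0.5 * dW
b -= 0.5 * db
print(A.shape, dW.shape, db.shape)          # (5, 3) (4, 3) (1, 3)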