How to get accurate predictions from Neural Network? - python

I'm doing a project on water quality prediction using Artificial Neural Network. I implemented this using python. I have completed my prediction model but the generated predictions are not much accurate.
What I'm doing is I have collected data from a river for past 4 and half years on daily basis and I'm predicting a pattern for a specific parameter by inputting data from past records. Simply what I need to do is to predict "Turbidity level" of water on 2015 by feeding data on turbidity from 2012-2014.
From the model which I have created it is not much accurate when I compare to the real data I have gathered for 2015. Please help me to solve this. I tried this by changing hidden layer sizes and the Lambda value.
//This is my code
import xlrd
import numpy as np
from numpy import zeros
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import optimize
#Neural Network
class Neural_Network(object):
def __init__(self,Lambda):
#Define Hyperparameters
self.inputLayerSize = 2
self.outputLayerSize = 1
self.hiddenLayerSize = 10
#Weights (parameters)
self.W1 = np.random.randn(self.inputLayerSize,self.hiddenLayerSize)
self.W2 = np.random.randn(self.hiddenLayerSize,self.outputLayerSize)
#Regularization Parameter:
self.Lambda = Lambda
def forward(self, arrayInput):
#Propogate inputs though network
self.z2 = np.dot(arrayInput, self.W1)
self.a2 = self.sigmoid(self.z2)
self.z3 = np.dot(self.a2, self.W2)
yHat = self.sigmoid(self.z3)
return yHat
def sigmoid(self, z):
#Apply sigmoid activation function to scalar, vector, or matrix
return 1/(1+np.exp(-z))
def sigmoidPrime(self,z):
#Gradient of sigmoid
return np.exp(-z)/((1+np.exp(-z))**2)
def costFunction(self, arrayInput, arrayOutput):
#Compute cost for given input,output use weights already stored in class.
self.yHat = self.forward(arrayInput)
#J = 0.5*sum((arrayOutput-self.yHat)**2)
#J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)
J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)*sum(sum(self.W1**2),sum(self.W2**2))
#J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)*(sum(self.W1**2)+sum(self.W2**2))
return J
def costFunctionPrime(self, arrayInput, arrayOutput):
#Compute derivative with respect to W and W2 for a given X and y:
self.yHat = self.forward(arrayInput)
delta3 = np.multiply(-(arrayOutput-self.yHat), self.sigmoidPrime(self.z3))
#Add gradient of regularization term:
#dJdW2 = np.dot(self.a2.T, delta3) + self.Lambda*self.W2
dJdW2 = np.dot(self.a2.T, delta3)
delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
#Add gradient of regularization term:
#dJdW1 = np.dot(arrayInput.T, delta2)+ self.Lambda*self.W1
dJdW1 = np.dot(arrayInput.T, delta2)
return dJdW1, dJdW2
#Helper Functions for interacting with other classes:
def getParams(self):
#Get W1 and W2 unrolled into vector:
params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
return params
def setParams(self, params):
#Set W1 and W2 using single paramater vector.
W1_start = 0
W1_end = self.hiddenLayerSize * self.inputLayerSize
self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize , self.hiddenLayerSize))
W2_end = W1_end + self.hiddenLayerSize*self.outputLayerSize
self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))
def computeGradients(self, arrayInput, arrayOutput):
dJdW1, dJdW2 = self.costFunctionPrime(arrayInput, arrayOutput)
return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))
def computeNumericalGradient(self,N, X, y):
paramsInitial = N.getParams()
numgrad = np.zeros(paramsInitial.shape)
perturb = np.zeros(paramsInitial.shape)
e = 1e-4
for p in range(len(paramsInitial)):
#Set perturbation vector
perturb[p] = e
N.setParams(paramsInitial + perturb)
loss2 = N.costFunction(X, y)
N.setParams(paramsInitial - perturb)
loss1 = N.costFunction(X, y)
#Compute Numerical Gradient
numgrad[p] = (loss2 - loss1) / (2*e)
#Return the value we changed to zero:
perturb[p] = 0
#Return Params to original value:
N.setParams(paramsInitial)
return numgrad
#Trainer class
class trainer(object):
def __init__(self, N):
self.N = N
def costFunctionWrapper(self, params, arrayInput, arrayOutput):
self.N.setParams(params)
cost = self.N.costFunction(arrayInput, arrayOutput)
#grad = self.N.computeGradients(arrayInput, arrayOutput)
grad = self.N.computeNumericalGradient(self.N,arrayInput, arrayOutput)
return cost, grad
def callbackF(self, params):
self.N.setParams(params)
self.J.append(self.N.costFunction(self.arrayInput, self.arrayOutput))
self.testJ.append(self.N.costFunction(self.TestInput, self.TestOutput))
def train(self, arrayInput, arrayOutput,TestInput,TestOutput):
#Make an internal variable for the callback function:
self.arrayInput = arrayInput
self.arrayOutput = arrayOutput
self.TestInput = TestInput
self.TestOutput = TestOutput
#Make empty list to store costs:
self.J = []
self.testJ= []
params0 = self.N.getParams()
options = {'maxiter': 200, 'disp' : True}
_res = optimize.minimize(self.costFunctionWrapper, params0, jac=True, method='BFGS', \
args=(arrayInput, arrayOutput), options=options, callback=self.callbackF)
self.N.setParams(_res.x)
self.optimizationResults = _res
#Main Program
path = "F:\prototype\\newdata\\tody\\turbidity\\c.xlsx"
book = xlrd.open_workbook(path)
input1=[]
output=[]
testinput=[]
testoutput=[]
#training data set
first_sheet = book.sheet_by_index(1)
for row in range(first_sheet.ncols-1):
input1.append(first_sheet.col_values(row))
for row in range((first_sheet.ncols-1),first_sheet.ncols ):
output.append(first_sheet.col_values(row))
arrayInput = np.asarray(input1)
arrayInput = arrayInput.T
arrayOutput = np.asarray(output)
arrayOutput = arrayOutput.T
#testing data set
first_sheet1 = book.sheet_by_index(0)
for row in range(first_sheet1.ncols-1):
testinput.append(first_sheet1.col_values(row))
for row in range((first_sheet1.ncols-1),first_sheet1.ncols ):
testoutput.append(first_sheet1.col_values(row))
TestInput = np.asarray(testinput)
TestInput = TestInput.T
TestOutput = np.asarray(testoutput)
TestOutput = TestOutput.T
#2016
input2016=[]
first_sheet2 = book.sheet_by_index(2)
for row in range(first_sheet2.ncols):
input2016.append(first_sheet2.col_values(row))
Input = np.asarray(input2016)
Input = Input.T
# Scaling
arrayInput = arrayInput / np.amax(arrayInput, axis=0)
arrayOutput = arrayOutput / np.amax(arrayOutput, axis=0)
TestInput = TestInput / np.amax(TestInput, axis=0)
Input = Input / np.amax(Input, axis=0)
TestOutput = TestOutput / np.amax(TestOutput, axis=0)
NN=Neural_Network(Lambda=0.00000000000001)
T = trainer(NN)
T.train(arrayInput,arrayOutput,TestInput,TestOutput)
print NN.costFunctionPrime(arrayInput,arrayOutput)
Output = NN.forward(Input)
print Output
print '----------'
#print TestOutput
#plt.plot(T.J)
plt.plot(Output)
plt.grid(1)
plt.xlabel('Iterations')
plt.ylabel('cost')
plt.show()
//Turbidity means 2015 real data and prediction means data predicted using this code

Some of the comments suggest scaling the output sigmoidal layer to match the correct data. If you look at your predictions, you will see that with some scaling they are pretty accurate. I advise against scaling a sigmoidal function, however.
A sigmoidal output is meant to be interpreted as a probability (given certain constraints are followed), so scaling it would be breaking that contract and could give undefined results. What happens if you scale from 0-100, but then start receiving training targets larger than 100? (assuming you are training an online system, otherwise perhaps that example is not relevant)
I would change your code to use a linear output layer. This would not require any manipulation of the data after training the network. Also given that your cost function is least squares, the linear output layer will be convex (which reduces the number of local optima that your algorithm can get stuck in).

Related

How to implement self daptive weight in neural network in Pytorch

I want to develop a Physics Informed Neural Network model in Pytorch. My network should be trained based on two losses: boundary condition (BC) and partial derivative equation (PDE). I am adding these two losses but the problem is that the BC is controlling the main loss, like the following figure:
This way I make asimple finite difference calculation for my 1D heat conduction:
import torch
import torch.nn as nn
import matplotlib.pyplot as plt
import numpy as np
from pyDOE import lhs
######### Finite difference solution
# geometry:
L = 1 # length of the rod
# mesh:
dx = 0.01
nx = int(L/dx) + 1
x = np.linspace(0, L, nx)
# temporal grid:
t_sim = 1
dt = 0.01
nt = int (t_sim/dt)
# parametrization
alpha = 0.14340344168260039
# IC
t_ic = 4
# BC
t_left = 5 # left side with 6 °C temperature
t_right = 3 # right side with 4 °C temperature
# Results
T = np.ones(nx) * t_ic
all_T = []
for i in range (0, nt):
Tn = T.copy()
T[1:-1] = Tn[1:-1] + dt/(dx++2) * alpha * (Tn[2:] - 2*Tn[1:-1] + Tn[0:-2])
T[0] = t_left
T[-1] = t_right
all_T.append(Tn)
Then,data is prepared for the PINN model through the next block of code:
x = torch.linspace(0, L, nx, dtype=torch.float32)
t = torch.linspace(0, t_sim, nt, dtype=torch.float32)
T, X = torch.meshgrid(t,x)
Temps = np.concatenate (all_T).reshape(nt,nx)
x_test = torch.hstack((X.transpose(1,0).flatten()[:,None], T.transpose(1,0).flatten()[:,None]))
y_test = torch.from_numpy(Temps) # I suppose it is the ground truth
lb = x_test[0] # lower boundary
ub = x_test[-1] # upper boundary
left_x = torch.hstack((X[:,0][:,None], T[:,0][:,None])) # x and t of left boundary
left_y = torch.ones(left_x.shape[0], 1) * t_left # Temperature of left boundary
left_y[0,0] = t_ic
right_x = torch.hstack((X[:,-1][:,None], T[:,0][:,None])) # x and t of right boundary
right_y = torch.ones(right_x.shape[0], 1) * t_right # Temperature of right boundary
right_y[0,0] = t_ic
bottom_x = torch.hstack((X[0,1:-1][:,None], T[0,1:-1][:,None])) # x and t of IC
bottom_y = torch.ones(bottom_x.shape[0], 1) * t_ic # Temperature of IC
No_BC = 1 # 50 percent of the BC data are used from training
No_IC = 1 # 75 percent of the IC data are used from training
idx_l = np.random.choice(left_x.shape[0], int (left_x.shape[0]*No_BC), replace=False)
idx_r = np.random.choice(right_x.shape[0], int (right_x.shape[0]*No_BC), replace=False)
idx_b = np.random.choice(bottom_x.shape[0], int (bottom_x.shape[0]*No_IC), replace=False)
X_train_No = torch.vstack([left_x[idx_l,:], right_x[idx_r,:], bottom_x[idx_b,:]])
Y_train_No = torch.vstack([left_y[idx_l,:], right_y[idx_r,:], bottom_y[idx_b,:]])
N_f = 5000
X_train_Nf = lb + (ub-lb)*lhs(2,N_f)
f_hat = torch.zeros(X_train_Nf.shape[0], 1, dtype=torch.float32) # zero array for loss of PDE
This is my script for PINN and I very much appreciate your help:
class FCN(nn.Module):
##Neural Network
def __init__(self,layers):
super().__init__() #call __init__ from parent class
self.activation = nn.Tanh()
self.loss_function = nn.MSELoss(reduction ='mean')
'Initialise neural network as a list using nn.Modulelist'
self.linears = nn.ModuleList([nn.Linear(layers[i], layers[i+1]) for i in range(len(layers)-1)])
self.iter = 0
'Xavier Normal Initialization'
for i in range(len(layers)-1):
nn.init.xavier_normal_(self.linears[i].weight.data, gain=1.0)
nn.init.zeros_(self.linears[i].bias.data)
'foward pass'
def forward(self,x):
if torch.is_tensor(x) != True:
x = torch.from_numpy(x)
a = x.float()
for i in range(len(layers)-2):
z = self.linears[i](a)
a = self.activation(z)
a = self.linears[-1](a)
return a
'Loss Functions'
#Loss BC
def lossBC(self, x_BC, y_BC):
loss_BC = self.loss_function(self.forward(x_BC),y_BC)
return loss_BC.float()
#Loss PDE
def lossPDE(self,x_PDE):
g = x_PDE.clone()
g.requires_grad = True # Enable differentiation
f = self.forward(g)
f_x_t = torch.autograd.grad(f,g,torch.ones([g.shape[0],1]).to(device),retain_graph=True, create_graph=True)[0] #first derivative
f_xx_tt = torch.autograd.grad(f_x_t,g,torch.ones(g.shape).to(device), create_graph=True)[0]#second derivative
f_t = f_x_t[:,[1]]
f_xx = f_xx_tt[:,[0]]
f = f_t - alpha * f_xx
return self.loss_function(f,f_hat).float()
def loss(self,x_BC,y_BC,x_PDE):
loss_bc = self.lossBC(x_BC.float(),y_BC.float())
loss_pde = self.lossPDE(x_PDE.float())
return loss_bc.float() + loss_pde.float()
And this is how I make the model, arrays representing losses and finally the plot:
layers = np.array([2, 50, 50, 50, 50, 50, 1])
PINN = FCN(layers)
optimizer = torch.optim.Adam(PINN.parameters(), lr=0.001)
def closure():
optimizer.zero_grad()
loss_p = PINN.lossPDE(X_train_Nf)
loss_p.backward()
loss_b = PINN.lossBC(X_train_No, Y_train_No)
loss_b.backward()
return loss_b + loss_p
total_l = np.array([])
BC_l = np.array([])
PDE_l = np.array([])
test_BC_l = np.array([])
for i in range(10000):
loss = optimizer.step(closure)
total_l = np.append(total_l, loss.cpu().detach().numpy())
PDE_l = np.append (PDE_l, PINN.lossPDE(X_train_Nf).cpu().detach().numpy())
BC_l = np.append(BC_l, PINN.lossBC(X_train_No, Y_train_No).cpu().detach().numpy())
with torch.no_grad():
test_loss = PINN.lossBC(X_test, Y_test.flatten().view(-1,1))
test_BC_l = np.append(test_BC_l, test_loss.cpu().detach().numpy())
import matplotlib.pyplot as plt
fig,ax=plt.subplots(1,1, figsize=(9,9))
ax.plot(PDE_l, c = 'g', lw=2, label='PDE loss in train')
ax.plot(BC_l, c = 'k', lw=2, label='BC loss in train')
ax.plot(test_BC_l, c = 'r', lw=2, label='BC loss in test')
ax.plot(total_l, c = 'b', lw=2, label='total loss in train')
ax.set_xlabel('Epoch')
ax.set_ylabel('Loss')
plt.legend()
plt.show()
You should not add the boundary and PDE based loss while performing the backpropagation. Backpropagate iteratively on the PDE and the number of different boundary conditions used (Dirichlet or Neumann). When you add both of them, the network is not learning any thing about the BC, as the majority of the loss is being generated from the PDE. So, the network learns more about the PDE based loss and none about the BC, as it is clearly evident from your graph.
The loss function should be something like this :
for _ in different_loss_types: 1) PDE loss (backprop) on PDE 2) BC loss (backprop on BC)

2d laplacian computation in tensorflow

I want to do a toy code computing the laplacian of the function f(x,y) = sin(pi (x+1)/2)*sin(pi (y+1)/2) for (x,y) in [-1,1]^2.
I have tried multiple methods:
The first one works
the second one has a bug when calculating u_xx
the third one gives wrong results. I dont even know what is in the hess variable
the fourth one is a try of the tf.hessians function that doesnt work.
If someone can shed some light, I would be indebted.
import tensorflow as tf
import tensorflow as tf
from tensorflow import keras
import matplotlib.pyplot as plt
pi = np.pi
# function that will be passed to autmoatic differentiation
#tf.function
def sol(X):
x,y = X[:,0] , X[:,1]
return tf.sin(pi*(x+1)/2)*tf.sin(pi*(y+1)/2)
#tf.function
def sol2(X,Y):
return tf.sin(pi*(X+1)/2)*tf.sin(pi*(Y+1)/2)
# true laplacian for comparison
def f(X):
x , y = X[:,0] , X[:,1]
return -pi**2/2*tf.sin(pi*(x+1)/2)*tf.sin(pi*(y+1)/2)
def fgrid(X,Y):
return -pi**2/2*tf.sin(pi*(X+1)/2)*tf.sin(pi*(Y+1)/2)
#######"MESH OVER THE SQUARE"
n = 500
x1 , x2 = -1 , 1
vec = tf.linspace(x1,x2,n)
xgrid,ygrid = tf.meshgrid(vec,vec)
xrow,yrow = tf.reshape(xgrid,(-1,1)),tf.reshape(ygrid,(-1,1))
Xdata = tf.Variable(tf.concat((xrow,yrow),axis=1))
#######COMPUTING LAPLACIAN
# FIRST WORKING METHOD
"""
with tf.GradientTape(persistent=True) as tape:
xx = tf.reshape(Xdata[:,0],(-1,1))
yy = tf.reshape(Xdata[:,1],(-1,1))
tape.watch(xx)
tape.watch(yy)
u = sol(tf.concat([xx,yy],axis=1))
u_x = tape.gradient(u,xx)
u_xx = tape.gradient(u_x,xx)
u_y = tape.gradient(u,yy)
u_yy = tape.gradient(u_y,yy)
lapl = (u_xx+u_yy)
del(tape)
#displaying results
plt.contourf(xgrid,ygrid,lapl.numpy().reshape(n,n))
#"""
# SECOND METHOD NOT WORKING : error in u_xx computation
"""
with tf.GradientTape(persistent=True) as tape:
xx = tf.Variable(Xdata[:,0])
yy = tf.Variable(Xdata[:,1])
u = sol(Xdata)
u_x = tape.gradient(u,xx)
u_xx = tape.gradient(u_x,xx)
u_y = tape.gradient(u,yy)
u_yy = tape.gradient(u_y,yy)
lapl = u_xx + u_yy
del(tape)
#"""
#plt.contourf(xgrid,ygrid,lapl.numpy().reshape(n,n))
# THIRD METHOD NOT WORKING : wrong results
"""
with tf.GradientTape(persistent=True) as tape:
u = sol(Xdata)
grads = tape.gradient(u,Xdata)
hess = tape.gradient(grads,Xdata) # shape (-1,2)
lapl = hess[:,0] + hess[:,1]
del(tape)
plt.contourf(xgrid,ygrid,lapl.numpy().reshape(n,n))
#"""
# FOURTH METHOD NOT WORKING : tf.hessians (use Gradient.tape instead) or 'GradientTape' object has no attribute 'hessians'
"""
with tf.GradientTape() as tape:
u = sol(Xdata)
hess = tape.hessians(u,Xdata)
lapl = hess[:,0] + hess[:,1]
plt.contourf(xgrid,ygrid,lapl.numpy().reshape(n,n))
#"""
# second try with tf.hessians
#"""
hess = tf.hessians(sol(Xdata),Xdata) #tf.gradients is not supported when eager execution is enabled. Use tf.GradientTape instead.
#""" ```

Pytorch: multiplication between parameters is inplace for LBFGS optimizer?

I am trying to solve a kind of inverse problem by backward propagation with pytorch. I am trying to recover the parameters (r, theta) that generate a vector field U(r,theta).
As I intended to use the LBFGS optimizer from pytorch, I realize that the operation
r*theta
is detected as inplace and thus not supported for the backward computation of the gradient, whereas
r+theta is not.
How can I overcome this ? I actually need to recover fields that use transformations of the form r*theta.
Here is an example of a code that reproduces the error: it is running fine if you change
field = Wrong_U_param(r, theta, positions)
by
field = U_param(r, theta, positions)
in the loop. Is also works if you replace the r*theta operation by r.item()*theta (but is does not optimize over r since there is no more gradient depending on r.
I tried to use torch.mul() to run the product but it also fails.
The error message is the following
RuntimeError: one of the variables needed for gradient computation has been modified by an inplace operation
and the automatic detection points towards this very product.
Thank you for your help !
import numpy as np
import torch
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
import torch.optim as optim
from geomloss import SamplesLoss
torch.autograd.set_detect_anomaly(True)
def model(field):
return field
def U_param(r, theta, pos):
result = r + theta + 0. * pos
return result
def Wrong_U_param(r, theta, pos):
result = r * theta + 0. * pos
return result
def learn_U_param(Zobs, ngrad, params, r_guess=0., theta_guess=0., lambd=1.):
Npts = params[0]
positions = torch.tensor(np.arange(0, 1, 1 / Npts) + 1 / 2 / Npts).reshape((Npts, 1))
lab = torch.tensor(np.arange(0, Npts))
r = torch.tensor(float(r_guess)).to(device)
r.requires_grad = True
theta = torch.tensor(float(theta_guess)).to(device)
theta.requires_grad = True
r_hist = [r.item()]
theta_hist = [theta.item()]
loss_hist = []
optimizer = optim.LBFGS([r, theta])
for i in range(ngrad):
field = Wrong_U_param(r, theta, positions)
Z = model(field)
Loss = SamplesLoss(loss="sinkhorn", p=2, blur=.05)
Wass = Loss(lab, Z, positions, lab, Zobs, positions)
def closure():
optimizer.zero_grad()
Wass.backward(retain_graph=True)
return Wass
optimizer.step(closure)
optimizer.zero_grad()
r_hist.append(r.item())
theta_hist.append(theta.item())
loss_hist.append(Wass.item())
return r_hist, theta_hist, loss_hist
N=100
r = 2
theta = 2
params = [N]
positions = torch.tensor(np.arange(0, 1, 1 / N) + 1 / 2 / N).reshape((N, 1))
Zobs = U_param(r, theta, positions)
ngrad = 10
print(learn_U_param(Zobs, ngrad, params, r_guess=0.1, theta_guess=0.1, lambd=1.))

Computing the partial derivatives of a deep neural network with respect to Inputs

I am trying to compute the derivative of a neural network with 2 or more hidden layers with respect to its inputs. So not "standard backpropagation" since I am not interested in how the output varies with respect to the weights. And I am not looking to train my network using it (if this warrants removing the backpropagation tag, let me know, but I suspect that what I need is not too different)
The reason for my interest in the derivative here, is that I have a test set which sometimes provides me with a matching [x1, x2] : [y] pair, and sometimes a [x1, x2] : [d(y)/dx1] or [x1, x2] : [d(y)/dx2]. I then use a particle swarm algorithm to train my network.
I like diagrams, so to save a few words here is my network:
and what I would like is for the compute_derivativemethod to return a numpy array of the form below:
This is my attempt so far, but I can't seem to get an array matching my number of inputs at the end. I can't figure what I am doing wrong.
def compute_derivative(self):
"""Computes the network derivative and returns an array with the change in output with respect to each input"""
self.compute_layer_derivative(0)
for l in np.arange(1,self.size):
dl = self.compute_layer_derivative(l)
dprev = self.layers[l-1].derivatives
self.output_derivatives = dl.T.dot(dprev)
return self.output_derivatives
def compute_layer_derivative(self, l_id):
wL = self.layers[l_id].w
zL = self.layers[l_id].output
daL = self.layers[l_id].f(zL, div=1)
daLM = np.repeat(daL,wL.shape[0], axis=0)
self.layers[l_id].derivatives = np.multiply(daLM,wL)
return self.layers[l_id].derivatives
If you want to run the entire code I have made a cut down, commented version, which will work with a copy paste (see below). Thank you for your help !
# -*- coding: utf-8 -*-
import numpy as np
def sigmoid(x, div = 0):
if div == 1: #first derivative f'
return np.exp(-x) / (1. + np.exp(-x))**2.
if div == 2: # second derivative f''
return - np.exp(x) * (np.exp(x) - 1) / (1. + np.exp(x))**3.
return 1. / (1. + np.exp(-x)) # f
def linear(x, div = 0):
if div == 1: #first derivative f'
return np.full(x.shape,1)
if div > 2: # second derivative f''
return np.zeros(x.shape)
return x # f
class Layer():
def __init__(self, in_n, h_n, activation, bias = True, debug = False):
self.w = 2*np.random.random((in_n, h_n)) - 1 # synaptic weights with 0 mean
self.f = activation
self.output = None
self.activation = None
self.derivatives = np.array([[None for i in range(in_n+1)]]) #+1 for global dev
if bias:
self.b = 2*np.random.random((1, h_n)) - 1
else:
self.b = None
if debug:
self.w = np.full((in_n, h_n), 1.)
if self.b is not None: self.b = np.full((1, h_n), 1.)
def compute(self, inputs):
if self.w.shape[0] != inputs.shape[1]:
raise ValueError("Inputs dimensions do not match test data dim.")
if self.b is None:
self.output = np.dot(inputs, self.w)
else:
self.output = np.dot(inputs, self.w) + self.b
self.activation = self.f(self.output)
class NeuralNetwork():
def __init__(self, nb_layers, in_NN, h_density, out_NN, debug = False):
self.debug = debug
self.layers = []
self.size = nb_layers+1
self.output_derivatives = None
self.output = None
self.in_N = in_NN
self.out_N = out_NN
if debug:
print("Input Layer with {} inputs.".format(in_NN))
#create hidden layers
current_inputs = in_NN
for l in range(self.size - 1):
self.layers.append(Layer(current_inputs, h_density, sigmoid, debug = debug))
current_inputs = h_density
if debug:
print("Hidden Layer {} with {} inputs and {} neurons.".format(l+1, self.layers[l].w.shape[0], self.layers[l].w.shape[1]))
#creat output layer
self.layers.append(Layer(current_inputs, out_NN, linear, bias=False, debug = debug))
if debug:
print("Output Layer with {} inputs and {} outputs.".format(self.layers[-1].w.shape[0], self.layers[-1].w.shape[1]))
#print("with w: {}".format(self.layers[l].w))
print("ANN size = {}, with {} Layers\n\n".format( self.size, len(self.layers)))
def compute(self, point):
curr_inputs = point
for l in range(self.size):
self.layers[l].compute(curr_inputs)
curr_inputs = self.layers[l].activation
self.output = curr_inputs
if self.debug: print("ANN output: ",curr_inputs)
return self.output
def compute_derivative(self, order, point):
""" If the network has not been computed, compute it before getting
the derivative. This might be a bit expensive..."""
if self.layers[self.size-1].output is None:
self.compute(point)
#Compute output layer total derivative
self.compute_layer_derivative(self.size-1, order)
self.output_derivatives = self.get_partial_derivatives_to_outputs(self.size-1)
print(self.output_derivatives)
for l in np.arange(1,self.size):
l = self.size-1 - l
self.compute_layer_derivative(l, order)
if l > 0: #if we are not at first hidden layer compute the total derivative
self.output_derivatives *= self.get_total_derivative_to_inputs(l)
else:# get the each output derivative with respect to each input
backprop_dev_to_outs = np.repeat(np.matrix(self.output_derivatives),self.in_N, axis=0).T
dev_to_inputs = np.repeat(np.matrix(self.get_partial_derivatives_to_inputs(l)).T,self.out_N, axis=1).T
self.output_derivatives = np.multiply(backprop_dev_to_outs, dev_to_inputs)
if self.debug: print("output derivatives: ",self.output_derivatives)
return self.output_derivatives
def get_total_derivative(self,l_id):
return np.sum(self.get_partial_derivatives_to_inputs(l_id))
def get_total_derivative_to_inputs(self,l_id):
return np.sum(self.get_partial_derivatives_to_inputs(l_id))
def get_partial_derivatives_to_inputs(self,l_id):
return np.sum(self.layers[l_id].derivatives, axis=1)
def get_partial_derivatives_to_outputs(self,l_id):
return np.sum(self.layers[l_id].derivatives, axis=0)
def compute_layer_derivative(self, l_id, order):
if self.debug: print("\n\ncurrent layer is ", l_id)
wL = self.layers[l_id].w
zL = self.layers[l_id].output
daL = self.layers[l_id].f(zL, order)
daLM = np.repeat(daL,wL.shape[0], axis=0)
self.layers[l_id].derivatives = np.multiply(daLM,wL)
if self.debug:
print("L_id: {}, a_f: {}".format(l_id, self.layers[l_id].f))
print("L_id: {}, dev: {}".format(l_id, self.get_total_derivative_to_inputs(l_id)))
return self.layers[l_id].derivatives
#nb_layers, in_NN, h_density, out_NN, debug = False
nn = NeuralNetwork(1,2,2,1, debug= True)
nn.compute(np.array([[1,1]]))# head value
nn.compute_derivative(1,np.array([[1,1]])) #first derivative
EDITED ANSWER BASED ON SIRGUY's REPLY:
# Here we assume that the layer has sigmoid activation
def Jacobian(x = np.array([[1,1]]), w = np.array([[1,1],[1,1]]), b = np.array([[1,1]])):
return sigmoid_d(x.dot(w) + b) * w # J(S, x)
In the case of a network with 2 hidden layers with sigmoid activation and one output layer with sigmoid activation (so that we can just use the same function as above) we have:
J_L1 = Jacobian(x = np.array([[1,1]])) # where [1,1] are the inputs of to the network (i.e. values of the neuron in the input layer)
J_L2 = Jacobian(x = np.array([[3,3]])) # where [3,3] are the neuron values of layer 1 before activation
# in the output layer the weights and biases are adjusted as there is 1 neuron rather than 2
J_Lout = Jacobian(x = np.array([[2.90514825, 2.90514825]]), w = np.array([[1],[1]]), b = np.array([[1]]))# where [2.905,2.905] are the neuron values of layer 2 before activation
J_out_to_in = J_Lout.T.dot(J_L2).dot(J_L1)
Here's how I derived what your example should give:
# i'th component of vector-valued function S(x) (sigmoid-weighted layer)
S_i(x) = 1 / 1 + exp(-w_i . x + b_i) # . for matrix multiplication here
# i'th component of vector-valued function L(x) (linear-weighted layer)
L_i(x) = w_i . x # different weights than S.
# as it happens our L(x) output 1 value, so is in fact a scalar function
F(x) = L(S(x)) # final output value
#derivative of F, denoted as J(F, x) to mean the Jacobian of the function F, evaluated at x.
J(F, x) = J(L(S(x)), x) = J(L, S(x)) . J(S, x) # chain rule for multivariable, vector-valued functions
#First, what's the derivative of L?
J(L, S(x)) = L
This is usually a surprising result, but you can verify this yourself by computing partial derivatives of M . x for some random matrix M. If you compute all the derivatives and put them into the Jacobian you will get back M.
#Now what's the derivative of S? Compute via formula
d(S_i(x)/dx_j) = w_ij * exp(-w_i.x+b_i) / (1 + exp(-w_i.x+b_i))**2 #w_ij, is the j'th component of the vector w_i
#For the gradient of a S_i (which is just one component of S), we get
J(S_i, x) = (exp(-w_i . x + b_i) / (1 + exp(-w_i . x + b_i))**2) * w_i # remember this is a vector because w_i is a vector
Now to take your debug example of 1's everywhere.
w_i = b = x = [1, 1]
#define a to make this less cluttered
a = exp(-w_i . x + b) = exp(-3)
J(S_i, x) = a / (1 + a)^2 * [1, 1]
J(S, x) = a / (1 + a)^2 * [[1, 1], [1, 1]]
J(L, S(x)) = [1, 1] #Doesn't depend on S(x)
J(F, x) = J(L, S(x)) . J(S, x) = (a / (1 + a)**2) * [1, 1] . [[1, 1], [1, 1]]
J(F, x) = (a / (1 + a)**2) * [2, 2] = (2 * a / (1 + a)**2) * [1, 1]
J(F, x) = [0.0903533, 0.0903533]
Hopefully this will help you reorganise your code a bit. You can't evaluate the derivatives here with just the value of w_i . x, you will need w_i and x separately to properly compute everything.
EDIT
Because I find this stuff interesting, here is my python script for
computing the value and first derivative of a neural network:
import numpy as np
class Layer:
def __init__(self, weights_matrix, bias_vector, sigmoid_activation = True):
self.weights_matrix = weights_matrix
self.bias_vector = bias_vector
self.sigmoid_activation = sigmoid_activation
def compute_value(self, x_vector):
result = np.add(np.dot(self.weights_matrix, x_vector), self.bias_vector)
if self.sigmoid_activation:
result = np.exp(-result)
result = 1 / (1 + result)
return result
def compute_value_and_derivative(self, x_vector):
if not self.sigmoid_activation:
return (self.compute_value(x_vector), self.weights_matrix)
temp = np.add(np.dot(self.weights_matrix, x_vector), self.bias_vector)
temp = np.exp(-temp)
value = 1.0 / (1 + temp)
temp = temp / (1 + temp)**2
#pre-multiplying by a diagonal matrix multiplies each row by
#the corresponding diagonal element
#(1st row with 1st value, 2nd row with 2nd value, etc...)
jacobian = np.dot(np.diag(temp), self.weights_matrix)
return (value, jacobian)
class Network:
def __init__(self, layers):
self.layers = layers
def compute_value(self, x_vector):
for l in self.layers:
x_vector = l.compute_value(x_vector)
return x_vector
def compute_value_and_derivative(self, x_vector):
x_vector, jacobian = self.layers[0].compute_value_and_derivative(x_vector)
for l in self.layers[1:]:
x_vector, j = l.compute_value_and_derivative(x_vector)
jacobian = np.dot(j, jacobian)
return x_vector, jacobian
#first weights
l1w = np.array([[1,1],[1,1]])
l1b = np.array([1,1])
l2w = np.array([[1,1],[1,1]])
l2b = np.array([1,1])
l3w = np.array([1, 1])
l3b = np.array([0])
nn = Network([Layer(l1w, l1b),
Layer(l2w, l2b),
Layer(l3w, l3b, False)])
r = nn.compute_value_and_derivative(np.array([1,1]))
print r

Neural Network in python: Decision/Classification always gives 0.5

First of all I wanna say that I am a python beginner and also completely new to neural networks. When I read about it I was very excited and thought I set up a little code from scratch (see code below).
But somehow my code is not working properly. I guess there are some major bugs (in the algorithm and the programming?). But I cannot find them at the moment.
So, in the handwritten notes you can see my system (and some formulas). I wanna solve a decision problem where I have data in the form of X=(x1,x2) and y (which is 0 or 1).
My network has one hidden layer consisting of 3 neurons and one output layer.
As an activation function I use sigmoid and for the loss I use cross entropy (sth like log likelihood for bernoulli, I guess?)
The neurons take the weighted input W.X + bias and return a scalar between 0,1.
For the learning process I tried to use backward propagation. So I just computed the derivative dLoss/dparams and applied the chain rule several times. In order not to make everything in index notation I tried to use numpy to handle matrices, etc.
Maybe someone sees directly the things I did wrong? (apart from the bad programming :D)
Handwritten notes 1/2
Handwritten notes 2/2
#!/usr/bin/python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
## create random data set for decision problem
np.random.seed(0) #fixed seed to reproduce results
X, y = datasets.make_moons(20, noise=0.20) # lists containing the Data
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral) # plot it
plt.show() # show plot; proceeds when plot is closed
## initialize model parameters
W1 = np.random.uniform(-0.5,0.5,[3,2]) # hidden layer weights (3 x 2) matrix
b1 = np.random.uniform(-1,1,[3]) # bias for neurons in hidden layer
W2 = np.random.uniform(-0.5,0.5,[1,3]) # weights for output layer (1 x 3)
b2 = np.random.uniform(-1,1,[1]) # bias for output neuron
# collecting parameters in model dict
model = {"W1" : W1, "W2" : W2, "b1" : b1, "b2" : b2}
## the activation function
# can also return the derivative
def sigmoid(x,derivative = False):
if derivative == True:
# derivative; np.multiply multiplies element-wise
# needed if x is tensor-like object
return np.multiply(sigmoid(x), (1 - sigmoid(x)))
else:
return 1.0/(1.0 + np.exp(-x))
## moving forward in the network for a single data point
# and returns a dict with necessary information
def move_forward(model, DataX):
W1 = model["W1"] # extract model params from dict to make it better readable
W2 = model["W2"]
b1 = model["b1"]
b2 = model["b2"]
t1 = np.dot(W1,DataX) + b1 # weighted input for hidden layer (here 3-dim object)
phi = sigmoid(t1) # evaluate activation function
phiP = sigmoid(t1, True) # derivative (needed for moving backward "learning")
t2 = np.dot(W2,phi) + b2 # weighted input for output layer (1-dim object)
sig = sigmoid(t2) # evaluate final output
sigP = sigmoid(t2, True) # derivative
forward = {"phi" : phi,"phiP" : phiP, # dict collecting the output
"sig" : sig, "sigP" : sigP}
return forward
## moving backward for a single data point
def move_backward(forward, model, DataX):
W1 = model["W1"]
W2 = model["W2"]
b1 = model["b1"]
b2 = model["b2"]
phi = forward["phi"]
phiP = forward["phiP"]
sig = forward["sig"]
sigP = forward["sigP"]
#not the full deltaWs / deltabs; multiplied by the rest in "update_model"
dW2 = sigP * phi # part from "derivative chain" roughly: dsig/dt2 dt2 / dW2
db2 = sigP # analogue
temp = np.multiply(W2,phiP) # multiplied element wise
dW1 = sigP * np.outer(temp, DataX) # outer product since: (W2 * phi)_j x_i
db1 = sigP * np.outer(temp, [1]) # analogue
backward = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
return backward
## part of the loss function; here for one data point
# returns also the derivative for the learning process
def loss(DataY, PredictionY, derivative = False):
if derivative == True:
return DataY / PredictionY - (1.0 - DataY) / (1.0 - PredictionY)
log_likelihood = DataY * np.log(PredictionY) + (1.0 - DataY) * np.log(1.0 - PredictionY)
return log_likelihood
## updating model parameters
## epsilon is a small parameter regulating the learning
def update_model(DataSet,model, epsilon):
DataX = DataSet[0]
DataY = DataSet[1]
total_loss = 0
dW1_total = 0
dW2_total = 0
db1_total = 0
db2_total = 0
beta = 0
W1 = model["W1"]
W2 = model["W2"]
b1 = model["b1"]
b2 = model["b2"]
# iterating over full data set
for i in range(len(DataX)):
forward = move_forward(model, DataX[i])
backward = move_backward(forward, model, DataX[i])
sig = forward["sig"]
total_loss += loss(DataY[i],sig)
beta += loss(DataY[i],sig, True)
dW1_total += backward["dW1"]
dW2_total += backward["dW2"]
db1_total += backward["db1"]
db2_total += backward["db2"]
total_loss *= -1.0/len(DataX) # the total loss
beta *= -1.0/len(DataX) # the derivative of dloss/dsig
## setting updated model params
W1_new = W1 - epsilon * beta * dW1_total
W2_new = W2 - epsilon * beta * dW2_total
b1_new = b1 - epsilon * beta * np.squeeze(np.asarray(db1_total))
b2_new = b2 - epsilon * beta * db2_total
model_updated = {"W1": W1_new, "W2": W2_new, "b1": b1_new,
"b2": b2_new, "loss": total_loss}
return model_updated
## train the model with a given data set N times
def train_model(DataSet,model, epsilon, N, print_state = False):
for i in range(N):
model = update_model(DataSet,model, epsilon)
if print_state == True:
if i % 100 == 0:
print(model)
print("loss = " , model["loss"])
print(model)
return model
## call the training function and store the output
model_new = train_model([X,y],model, 0.01, 1000, True)
## check result with data point in the training set
move_forward(model_new,X[0])
# Note: Hm, somehow I always get sig = 0.5 (roughly). And the loss
# does not get smaller than 0.68
# I guess there must be several mistakes

Categories

Resources