First of all, I want to say that I am a Python beginner and also completely new to neural networks. When I read about them I was very excited and thought I'd set up a little bit of code from scratch (see below).
But somehow my code is not working properly. I guess there are some major bugs (in the algorithm and in the programming?), but I cannot find them at the moment.
In the handwritten notes you can see my system (and some formulas). I want to solve a decision problem where I have data in the form of X = (x1, x2) and y (which is 0 or 1).
My network has one hidden layer consisting of 3 neurons and one output layer.
As the activation function I use the sigmoid, and for the loss I use cross entropy (something like the log likelihood for a Bernoulli variable, I guess?).
The neurons take the weighted input W.X + bias and return a scalar between 0 and 1.
For the learning process I tried to use backpropagation, so I just computed the derivative dLoss/dparams and applied the chain rule several times. To avoid writing everything in index notation, I use numpy to handle the matrices, etc.
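(For reference, here is a condensed sketch of the per-data-point gradients I mean by dLoss/dparams, written in plain numpy with made-up example values. It only illustrates the chain rule I am trying to follow, it is not my actual code below:)

import numpy as np

def sigmoid(x):
    return 1.0 / (1.0 + np.exp(-x))

# made-up single data point and parameters, shapes as in my network
x = np.array([0.5, -1.0])              # input (x1, x2)
y = 1.0                                # label
W1, b1 = 0.1 * np.ones((3, 2)), np.zeros(3)
W2, b2 = 0.1 * np.ones((1, 3)), np.zeros(1)

t1 = W1 @ x + b1                       # hidden-layer weighted input, shape (3,)
phi = sigmoid(t1)                      # hidden activation
t2 = W2 @ phi + b2                     # output-layer weighted input, shape (1,)
sig = sigmoid(t2)                      # network output

# chain rule: for a sigmoid output with cross-entropy loss, dLoss/dt2 reduces to (sig - y)
delta2 = sig - y                       # shape (1,)
dW2 = np.outer(delta2, phi)            # dLoss/dW2, shape (1, 3)
db2 = delta2                           # dLoss/db2
delta1 = (W2.T @ delta2) * phi * (1 - phi)   # back through the hidden layer, shape (3,)
dW1 = np.outer(delta1, x)              # dLoss/dW1, shape (3, 2)
db1 = delta1                           # dLoss/db1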
Maybe someone will directly see what I did wrong? (Apart from the bad programming :D)
Handwritten notes 1/2
Handwritten notes 2/2
#!/usr/bin/python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
## create random data set for decision problem
np.random.seed(0) #fixed seed to reproduce results
X, y = datasets.make_moons(20, noise=0.20) # lists containing the Data
plt.scatter(X[:,0], X[:,1], s=40, c=y, cmap=plt.cm.Spectral) # plot it
plt.show() # show plot; proceeds when plot is closed
## initialize model parameters
W1 = np.random.uniform(-0.5,0.5,[3,2]) # hidden layer weights (3 x 2) matrix
b1 = np.random.uniform(-1,1,[3]) # bias for neurons in hidden layer
W2 = np.random.uniform(-0.5,0.5,[1,3]) # weights for output layer (1 x 3)
b2 = np.random.uniform(-1,1,[1]) # bias for output neuron
# collecting parameters in model dict
model = {"W1" : W1, "W2" : W2, "b1" : b1, "b2" : b2}
## the activation function
# can also return the derivative
def sigmoid(x, derivative=False):
    if derivative == True:
        # derivative; np.multiply multiplies element-wise
        # needed if x is tensor-like object
        return np.multiply(sigmoid(x), (1 - sigmoid(x)))
    else:
        return 1.0/(1.0 + np.exp(-x))
## moving forward in the network for a single data point
# and returns a dict with necessary information
def move_forward(model, DataX):
    W1 = model["W1"]  # extract model params from dict to make it better readable
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    t1 = np.dot(W1, DataX) + b1  # weighted input for hidden layer (here 3-dim object)
    phi = sigmoid(t1)            # evaluate activation function
    phiP = sigmoid(t1, True)     # derivative (needed for moving backward "learning")
    t2 = np.dot(W2, phi) + b2    # weighted input for output layer (1-dim object)
    sig = sigmoid(t2)            # evaluate final output
    sigP = sigmoid(t2, True)     # derivative
    forward = {"phi": phi, "phiP": phiP,  # dict collecting the output
               "sig": sig, "sigP": sigP}
    return forward
## moving backward for a single data point
def move_backward(forward, model, DataX):
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    phi = forward["phi"]
    phiP = forward["phiP"]
    sig = forward["sig"]
    sigP = forward["sigP"]
    # not the full deltaWs / deltabs; multiplied by the rest in "update_model"
    dW2 = sigP * phi  # part from "derivative chain", roughly: dsig/dt2 dt2/dW2
    db2 = sigP        # analogue
    temp = np.multiply(W2, phiP)        # multiplied element-wise
    dW1 = sigP * np.outer(temp, DataX)  # outer product since: (W2 * phi)_j x_i
    db1 = sigP * np.outer(temp, [1])    # analogue
    backward = {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
    return backward
## part of the loss function; here for one data point
# returns also the derivative for the learning process
def loss(DataY, PredictionY, derivative=False):
    if derivative == True:
        return DataY / PredictionY - (1.0 - DataY) / (1.0 - PredictionY)
    log_likelihood = DataY * np.log(PredictionY) + (1.0 - DataY) * np.log(1.0 - PredictionY)
    return log_likelihood
## updating model parameters
## epsilon is a small parameter regulating the learning
def update_model(DataSet, model, epsilon):
    DataX = DataSet[0]
    DataY = DataSet[1]
    total_loss = 0
    dW1_total = 0
    dW2_total = 0
    db1_total = 0
    db2_total = 0
    beta = 0
    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]
    # iterating over full data set
    for i in range(len(DataX)):
        forward = move_forward(model, DataX[i])
        backward = move_backward(forward, model, DataX[i])
        sig = forward["sig"]
        total_loss += loss(DataY[i], sig)
        beta += loss(DataY[i], sig, True)
        dW1_total += backward["dW1"]
        dW2_total += backward["dW2"]
        db1_total += backward["db1"]
        db2_total += backward["db2"]
    total_loss *= -1.0/len(DataX)  # the total loss
    beta *= -1.0/len(DataX)        # the derivative of dloss/dsig
    ## setting updated model params
    W1_new = W1 - epsilon * beta * dW1_total
    W2_new = W2 - epsilon * beta * dW2_total
    b1_new = b1 - epsilon * beta * np.squeeze(np.asarray(db1_total))
    b2_new = b2 - epsilon * beta * db2_total
    model_updated = {"W1": W1_new, "W2": W2_new, "b1": b1_new,
                     "b2": b2_new, "loss": total_loss}
    return model_updated
## train the model with a given data set N times
def train_model(DataSet, model, epsilon, N, print_state=False):
    for i in range(N):
        model = update_model(DataSet, model, epsilon)
        if print_state == True:
            if i % 100 == 0:
                print(model)
                print("loss = ", model["loss"])
    print(model)
    return model
## call the training function and store the output
model_new = train_model([X,y],model, 0.01, 1000, True)
## check result with data point in the training set
move_forward(model_new,X[0])
# Note: Hm, somehow I always get sig = 0.5 (roughly). And the loss
# does not get smaller than 0.68
# I guess there must be several mistakes
Formula to find the cost function: J = -(1/m) * sum( Y*log(A) + (1-Y)*log(1-A) )
Formula to calculate the gradients for w, b: dw = (1/m) * X.(A-Y)^T and db = (1/m) * sum(A-Y)
Arguments:
w -- weights, a numpy array of size (num_px * num_px * 3, 1)
b -- bias, a scalar
X -- data of size (num_px * num_px * 3, number of examples)
Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)
Return:
cost -- negative log-likelihood cost for logistic regression
dw -- gradient of the loss with respect to w, thus same shape as w
db -- gradient of the loss with respect to b, thus same shape as b
My Code:
import numpy as np
def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """
    ### START CODE HERE ### (≈ 1 line of code)
    s = 1 / (1 + np.exp(-z))
    ### END CODE HERE ###
    return s
# GRADED FUNCTION: propagate
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """
    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    k = w * X + b
    A = sigmoid(k)                                                 # compute activation
    cost = (-Y * np.log(A) - (1 - Y) * np.log(1 - A)).mean() / m   # compute cost
    ### END CODE HERE ###

    # BACKWARD PROPAGATION (TO FIND GRAD)
    ### START CODE HERE ### (≈ 2 lines of code)
    db = np.subtract(A, Y)
    dw = np.dot(X, db.T) / m
    db = np.sum(db) / m
    ### END CODE HERE ###

    # assert(dw.shape == w.shape)
    # assert(db.dtype == float)
    # cost = np.squeeze(cost)
    # assert(cost.shape == ())

    grads = {"dw": dw,
             "db": db}

    return grads, cost
w, b, X, Y = np.array([[1.],[2.]]), 2., np.array([[1.,2.,-1.],[3.,4.,-3.2]]), np.array([[1,0,1]])
grads, cost = propagate(w, b, X, Y)
print ("dw = " + str(grads["dw"]))
print ("db = " + str(grads["db"]))
print ("cost = " + str(cost))
My Output:
dw = [[ 0.72851438 0.99581514]
[ 1.5487967 2.38666712]]
db = 0.225798060825
cost = 1.04403235316
Expected Output:
dw = [[ 0.99845601] [ 2.39507239]]
db = 0.00145557813678
cost = 5.801545319394553
Could anybody tell me why my dw dimensions do not match the expected output, and help me fix the cost function?
There are some small mistakes: you should use np.sum(Y*np.log(A) + (1-Y)*np.log(1-A)) / m instead of using .mean(), and the next mistake, I think, is to replace np.subtract(A, Y) with simply A - Y, because there is no need for a numpy call there. It's working for me.
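(A tiny illustration of the .mean() issue, with made-up numbers: .mean() already divides by the number of examples, so dividing by m again shrinks the cost by an extra factor of m.)

import numpy as np

per_example_loss = np.array([1.0, 2.0, 3.0])
m = per_example_loss.size
print(per_example_loss.mean())       # 2.0   -> the intended average cost
print(per_example_loss.mean() / m)   # 0.666... -> divided by m twice
print(np.sum(per_example_loss) / m)  # 2.0   -> what the corrected line computes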
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Return:
    cost -- negative log-likelihood cost for logistic regression
    dw -- gradient of the loss with respect to w, thus same shape as w
    db -- gradient of the loss with respect to b, thus same shape as b

    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """
    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    A = sigmoid(np.dot(w.T, X) + b)                         # compute activation
    cost = -np.sum(Y*np.log(A) + (1-Y)*np.log(1-A)) / m     # compute cost
    ### END CODE HERE ###

    # BACKWARD PROPAGATION (TO FIND GRAD)
    ### START CODE HERE ### (≈ 2 lines of code)
    dw = np.dot(X, (A-Y).T) / m
    db = np.sum(A-Y, axis=1) / m
    ### END CODE HERE ###

    assert(dw.shape == w.shape)
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())

    grads = {"dw": dw,
             "db": db}

    return grads, cost
dw = np.dot(X,db.T)/m
is wrong.
Instead of db, it should be multiplied by the derivative of the activation function here, i.e. the sigmoid:
A = sigmoid(k)
dA = np.dot((1-A)*A,dloss.T) # This is the derivative of a sigmoid function
dw = np.dot(X,dA.T)
The code is not tested, but the solution would be along these lines.
See here to calculate dloss.
First, a quick disclaimer: I posted this question first on Reddit, in the Deep Learning and Learning Machine Learning subreddits, but I thought I might also request your expertise here. Without further ado:
I am currently challenging myself on this year's Deep Unsupervised Learning course from Berkeley University and, although I have just started the warmup exercise of week 1, I am already having 'technical' difficulties.
The exercise in question is "1. Warmup" in the following document: Week 1 Exercises. (My apologies, as I am not familiar enough with Reddit formatting to seamlessly include images.)
In my understanding, we have a variable x which can take values from 1..100, each with a specific probability of being sampled (defined in the sample_data() function).
The task is therefore to fit a vector of parameters theta which is passed to a softmax function and is supposed to give the likelihood of a specific element x_i being sampled. Namely, theta_1 should be the parameter which "bumps up" the softmax value corresponding to the variable x = 1, and so on.
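(For concreteness, a minimal numpy illustration of that parameterization; this is only a side sketch, not part of my actual TensorFlow code below:)

import numpy as np

theta = np.random.randn(100)                 # one parameter per possible value of x
probs = np.exp(theta) / np.exp(theta).sum()  # softmax turns theta into probabilities
print(probs.sum())                           # sums to (numerically) 1: a categorical distribution
print(probs[0])                              # the model's P(x = 1), "bumped up" by theta_1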
Using TensorFlow, I think I was able to create such a model, but when it comes to training, I believe I am missing a crucial point, as the program cannot compute gradients with respect to the theta parameters.
I would like to know if I am misunderstanding the task, and whether there is a better method to achieve the result of the exercise.
Here is the code, where the failing part is located after the # Computing gradients comment.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
if __name__ == "__main__":

    # Sampling function of the x variable provided in the exercise
    def sample_data():
        count = 10000
        rand = np.random.RandomState(0)
        a = 0.3 + 0.1 * rand.randn(count)
        b = 0.8 + 0.05 * rand.randn(count)
        mask = rand.rand(count) < 0.5
        samples = np.clip(a * mask + b * (1 - mask), 0.0, 1.0)
        return np.digitize(samples, np.linspace(0.0, 1.0, 100))

    full_data = sample_data()
    train_ds = full_data[:int(.8*len(full_data))]
    val_ds = full_data[int(.8*len(full_data)):]

    # Declaring parameters theta
    w_init = tf.zeros_initializer()
    params = tf.Variable(
        initial_value=w_init(shape=(1, 100), dtype='float32'),
        trainable=True, name='params')

    softmax = tf.squeeze(tf.nn.softmax(params, axis=1))

    # Should materialize the loss of the model
    def get_neg_log_likelihood(inputs):
        return -tf.math.log(softmax)

    neg_log_likelihoods = get_neg_log_likelihood(softmax)

    dist = tfp.distributions.Categorical(probs=softmax, dtype=tf.int32)

    optimizer = tf.keras.optimizers.Adam()

    for epoch in range(100):
        minibatch_size = 200
        n_minibatches = len(train_ds) // minibatch_size

        # Running over minibatches of the data
        for minibatch in range(n_minibatches):
            # Minibatching
            start_index = (minibatch*minibatch_size)
            end_index = (minibatch_size*minibatch + minibatch_size)
            x = train_ds[start_index:end_index]

            with tf.GradientTape() as tape:
                tape.watch(params)
                loss = tf.reduce_mean(-dist.log_prob(x))

            # Computing gradients
            grads = tape.gradient(loss, params)
            print(grads)  # Result: None
            # input()
            optimizer.apply_gradients(zip(grads, params))
Thank you in advance for your time.
PS: I mainly have a background in Deep Reinforcement Learning, therefore I can understand the various models used there ( policy, value functions ...), but I am trying to refine my grasp over the internals of the models themselves, namely in generative probabilistic models (GAN, VAE) and other unsupervised learning models in general ( RealNVP, Norm Flows, ...)
Pretty sure nobody is going to see this, but I thought I might as well bring some closure to it.
First of all, I calculated the gradients by directly deriving their expression from the negative log likelihood of the softmax value, dropping the TensorFlow framework in the process.
Although the results are a little below my expectations, the program was able to fit the model to a distribution somewhat similar to the empirical distribution of the sampled data. I guess this is due to the fact that a 1-dimensional theta parameter vector is not enough to fully model the real data distribution, as well as to the finite amount of sampled data.
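(For anyone following along, the gradient I derived is the standard one for the negative log likelihood of a softmax; a small self-contained sketch of it, with hypothetical names, is below. This is what the jacobian rows in the updated code implement.)

import numpy as np

def softmax(theta):
    e = np.exp(theta - theta.max())
    return e / e.sum()

def neg_log_likelihood_grad(theta, k):
    # d/d theta_j of -log softmax(theta)_k  =  softmax(theta)_j - 1[j == k]
    g = softmax(theta)
    g[k] -= 1.0
    return g

print(neg_log_likelihood_grad(np.zeros(5), 2))  # [ 0.2  0.2 -0.8  0.2  0.2]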
An updated version of the code:
import numpy as np
from matplotlib import pyplot as plt

np.random.seed(42)

def softmax(X, theta=1.0, axis=None):
    # Shameful copy-paste from SO
    y = np.atleast_2d(X)
    if axis is None:
        axis = next(j[0] for j in enumerate(y.shape) if j[1] > 1)
    y = y * float(theta)
    y = y - np.expand_dims(np.max(y, axis=axis), axis)
    y = np.exp(y)
    ax_sum = np.expand_dims(np.sum(y, axis=axis), axis)
    p = y / ax_sum
    if len(X.shape) == 1:
        p = p.flatten()
    return p

if __name__ == "__main__":

    def sample_data():
        count = 10000
        rand = np.random.RandomState(0)
        a = 0.3 + 0.1 * rand.randn(count)
        b = 0.8 + 0.05 * rand.randn(count)
        mask = rand.rand(count) < 0.5
        samples = np.clip(a * mask + b * (1 - mask), 0.0, 1.0)
        return np.digitize(samples, np.linspace(0.0, 1.0, 100))

    full_data = sample_data()
    train_ds = full_data[:int(.8*len(full_data))]
    val_ds = full_data[int(.8*len(full_data)):]

    # Declaring parameters
    params = np.zeros(100)

    # Used for loss computation
    def get_neg_log_likelihood(softmax):
        return -np.log(softmax)

    def get_loss(params, x):
        return np.mean([get_neg_log_likelihood(softmax(params))[i-1] for i in x])

    lr = .0005

    for epoch in range(1000):
        # Shuffling training data
        np.random.shuffle(train_ds)

        minibatch_size = 100
        n_minibatches = len(train_ds) // minibatch_size

        # Running over minibatches of the data
        for minibatch in range(n_minibatches):
            smax = softmax(params)
            # Jacobian of the negative log likelihood
            jacobian = [[smax[j] - 1 if i == j else smax[j]
                         for j in range(100)] for i in range(100)]

            # Minibatching
            start_index = (minibatch*minibatch_size)
            end_index = (minibatch_size*minibatch + minibatch_size)
            x = train_ds[start_index:end_index]

            # Compute the gradient matrix for each sample data and mean over it
            grad_matrix = np.vstack([jacobian[i] for i in x])
            grads = np.sum(grad_matrix, axis=0)

            params -= lr * grads

        print("Epoch %d -- Train loss: %.4f , Val loss: %.4f"
              % (epoch, get_loss(params, train_ds), get_loss(params, val_ds)))

        # Plotting every ~100 epochs
        if epoch % 100 == 0:
            counters = {i+1: 0 for i in range(100)}
            for x in full_data:
                counters[x] += 1
            histogram = np.array([counters[i+1] / len(full_data) for i in range(100)])

            fsmax = softmax(params)

            fig, ax = plt.subplots()
            ax.set_title('Dist. Comp. after %d epochs of training (from scratch)' % epoch)
            x = np.arange(1, 101)
            width = 0.35
            rects1 = ax.bar(x - width/2, fsmax, width, label='Model')
            rects2 = ax.bar(x + width/2, histogram, width, label='Empirical')
            ax.set_ylabel('Likelihood')
            ax.set_xlabel("Variable x's values")
            ax.legend()

            def autolabel(rects):
                for rect in rects:
                    height = rect.get_height()

            autolabel(rects1)
            autolabel(rects2)

            fig.tight_layout()
            plt.savefig('plots/results_after_%d_epochs.png' % epoch)
Picture of the final model distribution included for completeness. Modeled vs Empirical Distribution
I have a simple cost function, which I want to optimize using scipy.optimize.minimize function.
opt_solution = scipy.optimize.minimize(costFunction, theta, args=(training_data,),
                                       method='L-BFGS-B', jac=True, options={'maxiter': 100})
where costFunction is the function to be optimized and theta are the parameters to be optimized. Inside costFunction, I printed the value of the cost function. But the parameter maxiter seems to have no effect: whether I set it to 10 or to 100000, the time it takes is the same, and I was expecting the number of printed cost values to be equal to maxiter. So I feel maxiter has no effect. What might be the problem?
The cost function is:
def costFunction(self, theta, input):

    """ Extract weights and biases from 'theta' input """
    W1 = theta[self.limit0 : self.limit1].reshape(self.hidden_size, self.visible_size)
    W2 = theta[self.limit1 : self.limit2].reshape(self.visible_size, self.hidden_size)
    b1 = theta[self.limit2 : self.limit3].reshape(self.hidden_size, 1)
    b2 = theta[self.limit3 : self.limit4].reshape(self.visible_size, 1)

    """ Compute output layers by performing a feedforward pass
        Computation is done for all the training inputs simultaneously """
    hidden_layer = self.sigmoid(numpy.dot(W1, input) + b1)
    output_layer = self.sigmoid(numpy.dot(W2, hidden_layer) + b2)

    """ Compute intermediate difference values using Backpropagation algorithm """
    diff = output_layer - input
    sum_of_squares_error = 0.5 * numpy.sum(numpy.multiply(diff, diff)) / input.shape[1]
    weight_decay = 0.5 * self.lamda * (numpy.sum(numpy.multiply(W1, W1)) + numpy.sum(numpy.multiply(W2, W2)))
    cost = sum_of_squares_error + weight_decay

    """ Compute the gradient values by averaging partial derivatives
        Partial derivatives are averaged over all training examples """
    W1_grad = numpy.dot(del_hid, numpy.transpose(input))
    W2_grad = numpy.dot(del_out, numpy.transpose(hidden_layer))
    b1_grad = numpy.sum(del_hid, axis=1)
    b2_grad = numpy.sum(del_out, axis=1)

    W1_grad = W1_grad / input.shape[1] + self.lamda * W1
    W2_grad = W2_grad / input.shape[1] + self.lamda * W2
    b1_grad = b1_grad / input.shape[1]
    b2_grad = b2_grad / input.shape[1]

    """ Transform numpy matrices into arrays """
    W1_grad = numpy.array(W1_grad)
    W2_grad = numpy.array(W2_grad)
    b1_grad = numpy.array(b1_grad)
    b2_grad = numpy.array(b2_grad)

    """ Unroll the gradient values and return as 'theta' gradient """
    theta_grad = numpy.concatenate((W1_grad.flatten(), W2_grad.flatten(),
                                    b1_grad.flatten(), b2_grad.flatten()))

    # Update counter value
    self.counter += 1

    print "Index ", self.counter, "cost ", cost

    return [cost, theta_grad]
maxiter gives the maximum number of iterations that scipy will try before giving up on improving the solution. But it may very well be satisfied with a solution and stop earlier.
If you look at the docs for minimize when using the 'l-bfgs-b' method, notice there are three parameters you can pass as options (factr, ftol and gtol) that can also cause the iteration to stop.
In simple cases like yours, especially if your cost function also provides the gradient (as indicated by jac=True in your call), convergence typically happens in the first few iterations, hence way before the maxiter limit is reached.
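For illustration, a hedged sketch of tightening those tolerances so the optimizer keeps iterating longer (the option values here are made up, and costFunction, theta, and training_data are the objects from the question):

import scipy.optimize

opt_solution = scipy.optimize.minimize(
    costFunction, theta, args=(training_data,),
    method='L-BFGS-B', jac=True,
    options={'maxiter': 100,   # hard cap on iterations
             'ftol': 1e-12,    # stop when the relative decrease in cost is this small
             'gtol': 1e-10})   # stop when the projected gradient is this small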
I'm doing a project on water quality prediction using an Artificial Neural Network. I implemented it in Python. I have completed my prediction model, but the generated predictions are not very accurate.
What I'm doing: I have collected data from a river on a daily basis for the past four and a half years, and I'm predicting a pattern for a specific parameter by feeding in data from past records. Simply put, I need to predict the "Turbidity level" of the water in 2015 by feeding in the turbidity data from 2012-2014.
The model I have created is not very accurate when I compare it to the real data I gathered for 2015. Please help me solve this. I have tried changing the hidden layer sizes and the Lambda value.
This is my code:
import xlrd
import numpy as np
from numpy import zeros
from scipy.optimize import minimize
import matplotlib.pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
from scipy import optimize
#Neural Network
class Neural_Network(object):
    def __init__(self, Lambda):
        #Define Hyperparameters
        self.inputLayerSize = 2
        self.outputLayerSize = 1
        self.hiddenLayerSize = 10
        #Weights (parameters)
        self.W1 = np.random.randn(self.inputLayerSize, self.hiddenLayerSize)
        self.W2 = np.random.randn(self.hiddenLayerSize, self.outputLayerSize)
        #Regularization Parameter:
        self.Lambda = Lambda

    def forward(self, arrayInput):
        #Propagate inputs through network
        self.z2 = np.dot(arrayInput, self.W1)
        self.a2 = self.sigmoid(self.z2)
        self.z3 = np.dot(self.a2, self.W2)
        yHat = self.sigmoid(self.z3)
        return yHat

    def sigmoid(self, z):
        #Apply sigmoid activation function to scalar, vector, or matrix
        return 1/(1+np.exp(-z))

    def sigmoidPrime(self, z):
        #Gradient of sigmoid
        return np.exp(-z)/((1+np.exp(-z))**2)

    def costFunction(self, arrayInput, arrayOutput):
        #Compute cost for given input, output; use weights already stored in class.
        self.yHat = self.forward(arrayInput)
        #J = 0.5*sum((arrayOutput-self.yHat)**2)
        #J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)
        J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)*sum(sum(self.W1**2), sum(self.W2**2))
        #J = 0.5*sum((arrayOutput-self.yHat)**2)/arrayInput.shape[0] + (self.Lambda/2)*(sum(self.W1**2)+sum(self.W2**2))
        return J

    def costFunctionPrime(self, arrayInput, arrayOutput):
        #Compute derivative with respect to W1 and W2 for a given X and y:
        self.yHat = self.forward(arrayInput)
        delta3 = np.multiply(-(arrayOutput-self.yHat), self.sigmoidPrime(self.z3))
        #Add gradient of regularization term:
        #dJdW2 = np.dot(self.a2.T, delta3) + self.Lambda*self.W2
        dJdW2 = np.dot(self.a2.T, delta3)
        delta2 = np.dot(delta3, self.W2.T)*self.sigmoidPrime(self.z2)
        #Add gradient of regularization term:
        #dJdW1 = np.dot(arrayInput.T, delta2) + self.Lambda*self.W1
        dJdW1 = np.dot(arrayInput.T, delta2)
        return dJdW1, dJdW2

    #Helper Functions for interacting with other classes:
    def getParams(self):
        #Get W1 and W2 unrolled into vector:
        params = np.concatenate((self.W1.ravel(), self.W2.ravel()))
        return params

    def setParams(self, params):
        #Set W1 and W2 using single parameter vector.
        W1_start = 0
        W1_end = self.hiddenLayerSize * self.inputLayerSize
        self.W1 = np.reshape(params[W1_start:W1_end], (self.inputLayerSize, self.hiddenLayerSize))
        W2_end = W1_end + self.hiddenLayerSize*self.outputLayerSize
        self.W2 = np.reshape(params[W1_end:W2_end], (self.hiddenLayerSize, self.outputLayerSize))

    def computeGradients(self, arrayInput, arrayOutput):
        dJdW1, dJdW2 = self.costFunctionPrime(arrayInput, arrayOutput)
        return np.concatenate((dJdW1.ravel(), dJdW2.ravel()))

    def computeNumericalGradient(self, N, X, y):
        paramsInitial = N.getParams()
        numgrad = np.zeros(paramsInitial.shape)
        perturb = np.zeros(paramsInitial.shape)
        e = 1e-4

        for p in range(len(paramsInitial)):
            #Set perturbation vector
            perturb[p] = e
            N.setParams(paramsInitial + perturb)
            loss2 = N.costFunction(X, y)

            N.setParams(paramsInitial - perturb)
            loss1 = N.costFunction(X, y)

            #Compute Numerical Gradient
            numgrad[p] = (loss2 - loss1) / (2*e)

            #Return the value we changed to zero:
            perturb[p] = 0

        #Return Params to original value:
        N.setParams(paramsInitial)

        return numgrad
#Trainer class
class trainer(object):
    def __init__(self, N):
        self.N = N

    def costFunctionWrapper(self, params, arrayInput, arrayOutput):
        self.N.setParams(params)
        cost = self.N.costFunction(arrayInput, arrayOutput)
        #grad = self.N.computeGradients(arrayInput, arrayOutput)
        grad = self.N.computeNumericalGradient(self.N, arrayInput, arrayOutput)
        return cost, grad

    def callbackF(self, params):
        self.N.setParams(params)
        self.J.append(self.N.costFunction(self.arrayInput, self.arrayOutput))
        self.testJ.append(self.N.costFunction(self.TestInput, self.TestOutput))

    def train(self, arrayInput, arrayOutput, TestInput, TestOutput):
        #Make an internal variable for the callback function:
        self.arrayInput = arrayInput
        self.arrayOutput = arrayOutput
        self.TestInput = TestInput
        self.TestOutput = TestOutput

        #Make empty list to store costs:
        self.J = []
        self.testJ = []

        params0 = self.N.getParams()

        options = {'maxiter': 200, 'disp': True}
        _res = optimize.minimize(self.costFunctionWrapper, params0, jac=True, method='BFGS',
                                 args=(arrayInput, arrayOutput), options=options, callback=self.callbackF)

        self.N.setParams(_res.x)
        self.optimizationResults = _res
#Main Program
path = "F:\prototype\\newdata\\tody\\turbidity\\c.xlsx"
book = xlrd.open_workbook(path)
input1=[]
output=[]
testinput=[]
testoutput=[]
#training data set
first_sheet = book.sheet_by_index(1)
for row in range(first_sheet.ncols-1):
    input1.append(first_sheet.col_values(row))
for row in range((first_sheet.ncols-1), first_sheet.ncols):
    output.append(first_sheet.col_values(row))
arrayInput = np.asarray(input1)
arrayInput = arrayInput.T
arrayOutput = np.asarray(output)
arrayOutput = arrayOutput.T
#testing data set
first_sheet1 = book.sheet_by_index(0)
for row in range(first_sheet1.ncols-1):
    testinput.append(first_sheet1.col_values(row))
for row in range((first_sheet1.ncols-1), first_sheet1.ncols):
    testoutput.append(first_sheet1.col_values(row))
TestInput = np.asarray(testinput)
TestInput = TestInput.T
TestOutput = np.asarray(testoutput)
TestOutput = TestOutput.T
#2016
input2016=[]
first_sheet2 = book.sheet_by_index(2)
for row in range(first_sheet2.ncols):
    input2016.append(first_sheet2.col_values(row))
Input = np.asarray(input2016)
Input = Input.T
# Scaling
arrayInput = arrayInput / np.amax(arrayInput, axis=0)
arrayOutput = arrayOutput / np.amax(arrayOutput, axis=0)
TestInput = TestInput / np.amax(TestInput, axis=0)
Input = Input / np.amax(Input, axis=0)
TestOutput = TestOutput / np.amax(TestOutput, axis=0)
NN=Neural_Network(Lambda=0.00000000000001)
T = trainer(NN)
T.train(arrayInput,arrayOutput,TestInput,TestOutput)
print NN.costFunctionPrime(arrayInput,arrayOutput)
Output = NN.forward(Input)
print Output
print '----------'
#print TestOutput
#plt.plot(T.J)
plt.plot(Output)
plt.grid(1)
plt.xlabel('Iterations')
plt.ylabel('cost')
plt.show()
(In the plot, "Turbidity" means the real 2015 data and "prediction" means the data predicted using this code.)
Some of the comments suggest scaling the output sigmoidal layer to match the correct data. If you look at your predictions, you will see that with some scaling they are pretty accurate. I advise against scaling a sigmoidal function, however.
A sigmoidal output is meant to be interpreted as a probability (given certain constraints are followed), so scaling it would be breaking that contract and could give undefined results. What happens if you scale from 0-100, but then start receiving training targets larger than 100? (assuming you are training an online system, otherwise perhaps that example is not relevant)
I would change your code to use a linear output layer. This would not require any manipulation of the data after training the network. Also given that your cost function is least squares, the linear output layer will be convex (which reduces the number of local optima that your algorithm can get stuck in).
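A minimal sketch of that change, using only the two affected methods from your class (untested, and assuming the least-squares cost stays as it is):

def forward(self, arrayInput):
    # hidden layer keeps the sigmoid, output layer becomes linear
    self.z2 = np.dot(arrayInput, self.W1)
    self.a2 = self.sigmoid(self.z2)
    self.z3 = np.dot(self.a2, self.W2)
    yHat = self.z3                     # no squashing on the output
    return yHat

def costFunctionPrime(self, arrayInput, arrayOutput):
    self.yHat = self.forward(arrayInput)
    # with a linear output the sigmoidPrime(self.z3) factor disappears
    delta3 = -(arrayOutput - self.yHat)
    dJdW2 = np.dot(self.a2.T, delta3)
    delta2 = np.dot(delta3, self.W2.T) * self.sigmoidPrime(self.z2)
    dJdW1 = np.dot(arrayInput.T, delta2)
    return dJdW1, dJdW2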
Thanks for reading this.
I'm trying to implement a multi-label logistic regression using theano:
import numpy
import theano
import theano.tensor as T
rng = numpy.random
examples = 5
features = 10
labels = 2
D = (rng.randn(examples, labels, features), rng.randint(size=(labels, examples), low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.vector("y")
w = theano.shared(rng.randn(1 , labels ,features), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost
# (we shall return to this in a
# following section of this tutorial)
# Compile
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)),
    name='train')
predict = theano.function(inputs=[x], outputs=prediction, name='predict')
# Train
for i in range(training_steps):
    pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
but the -T.dot(x, w) product fails with this error:
TypeError: ('Bad input argument to theano function with name "train" at index 0(0-based)', 'Wrong number of dimensions: expected 2, got 3 with shape (5, 10, 2).')
x has shape (5, 2, 10) and w has shape (1, 2, 10). I would expect the dot product to have shape (5, 2).
My questions are:
Is there anyway to do this inner product?
Do you think there is a better way to achieve a multi-label logistic regression?
thanks!
---- EDIT -----
So here is an implementation of what I would like to do using numpy.
x = rng.randn(examples, labels, features)
w = rng.randn(labels, features)
dot = numpy.zeros((examples, labels))
for example in range(examples):
    for label in range(labels):
        dot[example, label] = x[example, label, :].dot(w[label, :])
print dot
output:
[[-1.70321498 2.51088139]
[-5.73608956 0.1066286 ]
[ 2.31334531 3.31892284]
[ 1.56301872 -0.56150922]
[-1.98815855 -2.98866706]]
But I don't know how to do this symbolically using Theano.
After some hours of fighting, this seems to produce the right results:
My error was having the input as rng.randn(examples, labels, features) instead of rng.randn(examples, features). This means that, besides having more labels, the inputs should stay the same size.
And the right way of computing the dot product was using the theano.scan method, like:
results, updates = theano.scan(lambda label: T.dot(x, w[label,:]) - b[label], sequences=T.arange(labels))
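(As an aside, the same batched product can be sanity-checked in plain numpy with einsum; this is just a cross-check, not Theano code:)

import numpy as np

rng = np.random
examples, labels, features = 5, 2, 10
x = rng.randn(examples, labels, features)
w = rng.randn(labels, features)

# dot[example, label] = x[example, label, :] . w[label, :]
dot = np.einsum('elf,lf->el', x, w)
print(dot.shape)   # (5, 2)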
Thanks everybody for your help!
import numpy as np
import theano
import theano.tensor as T
rng = np.random
examples = 5
features = 10
labels = 2
D = (rng.randn(examples,features), rng.randint(size=(labels, examples), low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
x = T.matrix("x")
y = T.matrix("y")
w = theano.shared(rng.randn(labels ,features), name="w")
b = theano.shared(np.zeros(labels), name="b")
print "Initial model:"
print w.get_value(), b.get_value()
results, updates = theano.scan(lambda label: T.dot(x, w[label,:]) - b[label], sequences=T.arange(labels))
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(- results)) # Probability that target = 1
prediction = p_1 > .5 # The prediction thresholded
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost
# (we shall return to this in a
# following section of this tutorial)
# Compile
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)),
    name='train')
predict = theano.function(inputs=[x], outputs=prediction, name='predict')
# Train
for i in range(training_steps):
    pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])