thanks for reading this.
I'm trying to implement a multi-label logistic regression using theano:
import numpy
import theano
import theano.tensor as T
# First attempt at a multi-label logistic regression in Theano.
# This version is the one that raises the TypeError quoted after the code.
rng = numpy.random
examples = 5
features = 10
labels = 2
# D[0]: random inputs of shape (examples, labels, features), i.e. 3-D.
# D[1]: random 0/1 targets of shape (labels, examples).
D = (rng.randn(examples, labels, features), rng.randint(size=(labels, examples), low=0, high=2))
training_steps = 10000
# Declare Theano symbolic variables
# NOTE(review): x is declared as a 2-D matrix while D[0] is 3-D; this rank
# mismatch is what the TypeError about "expected 2, got 3" reports.
x = T.matrix("x")
# NOTE(review): y is declared as a 1-D vector while D[1] is 2-D.
y = T.vector("y")
w = theano.shared(rng.randn(1 , labels ,features), name="w")
b = theano.shared(0., name="b")
print "Initial model:"
print w.get_value(), b.get_value()
# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-T.dot(x, w) - b)) # Probability that target = 1
prediction = p_1 > 0.5 # The prediction thresholded
xent = -y * T.log(p_1) - (1-y) * T.log(1-p_1) # Cross-entropy loss function
# Mean cross-entropy plus an L2 penalty on the weights.
cost = xent.mean() + 0.01 * (w ** 2).sum()# The cost to minimize
gw, gb = T.grad(cost, [w, b]) # Compute the gradient of the cost
# (we shall return to this in a
# following section of this tutorial)
# Compile
train = theano.function(
inputs=[x,y],
outputs=[prediction, xent],
# One gradient-descent step with a fixed step size of 0.1 per call.
updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)),
name='train')
predict = theano.function(inputs=[x], outputs=prediction , name='predict')
# Train
for i in range(training_steps):
pred, err = train(D[0], D[1])
print "Final model:"
print w.get_value(), b.get_value()
print "target values for D:", D[1]
print "prediction on D:", predict(D[0])
but -T.dot(x, w) product fails with this error:
TypeError: ('Bad input argument to theano function with name "train" at index 0(0-based)', 'Wrong number of dimensions: expected 2, got 3 with shape (5, 10, 2).')
x has shape (5, 2, 10) and w has shape (1, 2, 10). I would expect the dot product to have shape (5, 2).
My questions are:
Is there any way to compute this inner product?
Do you think there is a better way to achieve a multi-label logistic regression?
thanks!
---- EDIT -----
So here is an implementation of what I would like to do using numpy.
# Reference implementation of the desired per-label inner product:
#     dot[e, l] = sum_f x[e, l, f] * w[l, f]
# The explicit loops spell out the indexing; the same result is given by
# numpy.einsum('elf,lf->el', x, w).
x = rng.randn(examples, labels, features)
w = rng.randn(labels, features)
dot = numpy.zeros((examples, labels))
for example in range(examples):
    for label in range(labels):
        dot[example, label] = x[example, label, :].dot(w[label, :])
print(dot)
output:
[[-1.70321498 2.51088139]
[-5.73608956 0.1066286 ]
[ 2.31334531 3.31892284]
[ 1.56301872 -0.56150922]
[-1.98815855 -2.98866706]]
But I don't know how to do this symbolically using Theano.
After some hours of fighting this seems to produce the right results:
I had made a mistake: the input was created as rng.randn(examples, features, labels) instead of rng.randn(examples, features). In other words, even with several labels the inputs keep the same shape; only the weights and the targets gain a label dimension.
The correct way to compute the dot product was to use theano.scan, like this:
# One scan step per label index: each step evaluates T.dot(x, w[label, :]) - b[label].
# NOTE(review): with x of shape (examples, features) the stacked result is
# presumably (labels, examples) -- confirm against the Theano scan docs.
results, updates = theano.scan(lambda label: T.dot(x, w[label,:]) - b[label], sequences=T.arange(labels))
thanks everybody for their help!
import numpy as np
import theano
import theano.tensor as T
# Multi-label logistic regression: one independent logistic unit per label,
# all sharing the same input features.
rng = np.random
examples = 5
features = 10
labels = 2
# D[0]: inputs, shape (examples, features).
# D[1]: binary targets, shape (labels, examples).
D = (rng.randn(examples, features),
     rng.randint(size=(labels, examples), low=0, high=2))
training_steps = 10000

# Declare Theano symbolic variables
x = T.matrix("x")
y = T.matrix("y")
w = theano.shared(rng.randn(labels, features), name="w")
b = theano.shared(np.zeros(labels), name="b")
# str.format keeps the printed text identical on Python 2 and Python 3.
print("Initial model:")
print("{} {}".format(w.get_value(), b.get_value()))

# One scan step per label: the pre-activation of every example for that label.
# Stacking the steps gives one row per label, matching the layout of y.
# (The bias enters with a minus sign here; it is learned, so only its sign
# convention differs from the usual "+ b".)
results, updates = theano.scan(
    lambda label: T.dot(x, w[label, :]) - b[label],
    sequences=T.arange(labels))

# Construct Theano expression graph
p_1 = 1 / (1 + T.exp(-results))                      # Probability that target = 1
prediction = p_1 > .5                                # The prediction thresholded
xent = -y * T.log(p_1) - (1 - y) * T.log(1 - p_1)    # Cross-entropy loss function
cost = xent.mean() + 0.01 * (w ** 2).sum()           # The cost to minimize (L2-regularised)
gw, gb = T.grad(cost, [w, b])                        # Gradient of the cost

# Compile
train = theano.function(
    inputs=[x, y],
    outputs=[prediction, xent],
    updates=((w, w - 0.1 * gw), (b, b - 0.1 * gb)),
    name='train')
predict = theano.function(inputs=[x], outputs=prediction, name='predict')

# Train
for i in range(training_steps):
    pred, err = train(D[0], D[1])

print("Final model:")
print("{} {}".format(w.get_value(), b.get_value()))
print("target values for D: {}".format(D[1]))
print("prediction on D: {}".format(predict(D[0])))
Related
I'm facing some issues trying to find the linear regression line using gradient descent, and I am getting weird results.
Here is the function:
def gradient_descent(m_k, c_k, learning_rate, points):
    """Perform one batch gradient-descent step for the line ``y = m * x + c``.

    The loss being minimised is the mean squared error between
    ``points['total']`` and the prediction made from ``points['alcohol']``.

    Args:
        m_k: Current slope.
        c_k: Current intercept.
        learning_rate: Step size applied to both partial derivatives.
        points: pandas DataFrame with ``'alcohol'`` and ``'total'`` columns.

    Returns:
        Tuple ``(m, c)`` holding the updated slope and intercept.  When
        ``points`` is empty the inputs are returned unchanged.
    """
    n = len(points)
    if n == 0:
        # No data means no gradient; also avoids dividing by zero below.
        return m_k, c_k

    x = points['alcohol']
    y = points['total']
    residual = y - (m_k * x + c_k)

    # 2.0 (not 2) so the scale factor is a true division on Python 2 as well.
    scale = 2.0 / n
    dm = -scale * (x * residual).sum()  # Partial derivative w.r.t. m
    dc = -scale * residual.sum()        # Partial derivative w.r.t. c

    return m_k - dm * learning_rate, c_k - dc * learning_rate
And combined with a for loop
# Run batch gradient descent from (0, 0) and plot the fitted line over the data.
# NOTE(review): with such a small step size, 1000 epochs may not be enough for
# the intercept to converge -- worth checking before comparing with a
# closed-form fit.
l_rate = 0.0001
m, c = 0, 0
epochs = 1000
for _ in range(epochs):
    m, c = gradient_descent(m, c, l_rate, dataset)

plt.scatter(dataset.alcohol, dataset.total)
line_x = list(range(2, 10))
plt.plot(line_x, [m * x + c for x in line_x], color='red')
plt.show()
Gives this result:
Slope: 2.8061974241244196
Y intercept: 0.5712221080810446
The problem is though that taking advantage of sklearn to compute the slope and intercept, i.e.
# Fit scikit-learn's LinearRegression (with an intercept) for comparison.
# The single feature column is reshaped to (n_samples, 1) via reshape(-1, 1);
# the target stays one-dimensional.
model = LinearRegression(fit_intercept=True).fit(np.array(dataset['alcohol']).copy().reshape(-1, 1),
np.array(dataset['total']).copy())
I get something completely different:
Slope: 2.0325063
Intercept: 5.8577761548263005
Any idea why? Looking on SO I've found out that a possible problem could be a too high learning rate, but as stated above I'm currently using 0.0001
Sklearn's LinearRegression doesn't use gradient descent - it uses Ordinary Least Squares (OLS) Regression which is a non-iterative method.
For your model, you might consider randomly initialising m, c rather than starting with 0,0. You could also consider adjusting the learning rate or using an adaptive learning rate.
I am trying to use PYMC3 for a Bayesian model where I would like to repeatedly train my model on new unseen data. I am thinking I would need to update the priors with the posterior of the previously trained model every time I see the data, similar to how is achieved here https://docs.pymc.io/notebooks/updating_priors.html. They use the following function that finds the KDE from the samples and replacing each of the original definitions of the parameters in the model with a call to from_posterior.
def from_posterior(param, samples):
    """Build an ``Interpolated`` prior named *param* from posterior samples.

    A Gaussian kernel density estimate is fitted to *samples* and evaluated
    on 100 evenly spaced points between the smallest and largest sample.
    The support is then extended by three sample-widths on each side, with
    the density pinned to zero at the new end points.

    Args:
        param: Name passed through to ``Interpolated``.
        samples: Draws of ONE scalar parameter (1-D, or 2-D with one row).

    Returns:
        The ``Interpolated`` object built from the extended grid and density.

    Raises:
        ValueError: If *samples* holds more than one component.  The density
            is evaluated on a 1-D grid, so a vector-valued parameter has to
            be handled one component at a time.
    """
    as_rows = np.atleast_2d(samples)
    if as_rows.ndim != 2 or as_rows.shape[0] != 1:
        raise ValueError(
            "from_posterior expects samples of a single scalar parameter, "
            "got shape %s; call it once per component for vector-valued "
            "parameters" % (np.shape(samples),))

    smin, smax = np.min(samples), np.max(samples)
    width = smax - smin
    x = np.linspace(smin, smax, 100)
    y = stats.gaussian_kde(samples)(x)

    # what was never sampled should have a small probability but not 0,
    # so we'll extend the domain and use linear approximation of density on it
    x = np.concatenate([[x[0] - 3 * width], x, [x[-1] + 3 * width]])
    y = np.concatenate([[0], y, [0]])
    return Interpolated(param, x, y)
And here is my original model.
def create_model(batsmen, bowlers, id1, id2, X):
    """Build the PyMC3 model with nine ordered-logistic observation terms.

    Args:
        batsmen: Sequence whose length sets the size of ``mu_1``.
        bowlers: Sequence whose length sets the size of ``mu_2``.
        id1: For each of the nine terms, the index (or indices) into ``mu_1``.
        id2: For each of the nine terms, the index (or indices) into ``mu_2``.
        X: Observed outcomes per term; ``X[i] - 1`` is what the likelihood
            sees (presumably the raw outcomes are 1-based -- confirm).

    Returns:
        The populated ``pm.Model``.
    """
    n_terms = 9  # number of eta_i / X_i terms built below
    # Initial (ordered) cutpoint values, one row per term.
    testval = [[-5, 0, 1, 2, 3.5, 5] for _ in range(n_terms)]
    positions = list(range(n_terms))

    model = pm.Model()
    with model:
        delta_1 = pm.Uniform("delta_1", lower=0, upper=1)
        delta_2 = pm.Uniform("delta_2", lower=0, upper=1)
        inv_sigma_sqr = pm.Gamma("sigma^-2", alpha=1.0, beta=1.0)
        inv_tau_sqr = pm.Gamma("tau^-2", alpha=1.0, beta=1.0)
        mu_1 = pm.Normal("mu_1", mu=0, sigma=1 / pm.math.sqrt(inv_tau_sqr),
                         shape=len(batsmen))
        mu_2 = pm.Normal("mu_2", mu=0, sigma=1 / pm.math.sqrt(inv_tau_sqr),
                         shape=len(bowlers))

        # Step function over the term position: delta_1 is added from
        # position 3 onwards and delta_2 additionally from position 6 onwards.
        delta = (pm.math.ge(positions, 3) * delta_1
                 + pm.math.ge(positions, 6) * delta_2)
        eta = [
            pm.Deterministic("eta_" + str(i),
                             delta[i] + mu_1[id1[i]] - mu_2[id2[i]])
            for i in range(n_terms)
        ]
        cutpoints = pm.Normal(
            "cutpoints", mu=0, sigma=1 / pm.math.sqrt(inv_sigma_sqr),
            transform=pm.distributions.transforms.ordered,
            shape=(n_terms, 6), testval=testval)

        # Creating each variable inside the model context is what matters
        # here, so a plain loop is used and no list is kept.
        for i in range(n_terms):
            pm.OrderedLogistic("X_" + str(i), cutpoints=cutpoints[i],
                               eta=eta[i], observed=X[i] - 1)
    return model
Here, the problem is that some of my parameters such as mu_1, are multidimensional. This is why I get the following error:
ValueError: points have dimension 1, dataset has dimension 1500
because of the line y = stats.gaussian_kde(samples)(x).
Can someone please help me make this work for multi-dimensional parameters? I don't properly understand what KDE is and how the code computes it.
Thank you in advance!!
Formula to Find the Cost Function:
Formula to Calculate Gradient Loss for w,b:
Arguments:
w -- weights, a numpy array of size (num_px * num_px * 3, 1)
b -- bias, a scalar
X -- data of size (num_px * num_px * 3, number of examples)
Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)
Return:
cost -- negative log-likelihood cost for logistic regression
dw -- gradient of the loss with respect to w, thus same shape as w
db -- gradient of the loss with respect to b, thus same shape as b
My Code:
import numpy as np
def sigmoid(z):
    """
    Compute the sigmoid of z

    Arguments:
    z -- A scalar or numpy array of any size.

    Return:
    s -- sigmoid(z)
    """
    ### START CODE HERE ### (≈ 1 line of code)
    s = 1 / (1 + np.exp(-z))
    ### END CODE HERE ###
    return s
# GRADED FUNCTION: propagate
# Asker's version of propagate(); it produces the "My Output" values quoted
# after the code. Reviewer notes below mark the lines that differ from the
# expected computation.
def propagate(w, b, X, Y):
"""
Implement the cost function and its gradient for the propagation explained above
Tips:
- Write your code step by step for the propagation. np.log(), np.dot()
"""
# m is the number of examples (second axis of X).
m = X.shape[1]
# FORWARD PROPAGATION (FROM X TO COST)
### START CODE HERE ### (≈ 2 lines of code)
A = None # compute activation
cost = None # compute cost
# NOTE(review): `*` is an element-wise product. With w of shape (2, 1) and X
# of shape (2, 3), as in the sample call after this function, it broadcasts
# to (2, 3) instead of giving the (1, 3) row that np.dot(w.T, X) would.
k = w * X + b
A = sigmoid(k)
# NOTE(review): .mean() already divides by the number of elements, so the
# additional "/ m" scales the cost down a second time.
cost = (-Y * np.log(A) - (1 - Y) * np.log(1 - A)).mean() / m
### END CODE HERE ###
# BACKWARD PROPAGATION (TO FIND GRAD)
### START CODE HERE ### (≈ 2 lines of code)
dw = None
db = None
# db temporarily holds A - Y; it is reduced to a scalar two lines below.
db = np.subtract(A , Y)
# NOTE(review): because A is (2, 3) for the sample inputs, dw comes out as
# (2, 2) rather than matching the shape of w.
dw = np.dot(X,db.T)/m
db = np.sum(db)/m
### END CODE HERE ###
# assert(dw.shape == w.shape)
# assert(db.dtype == float)
# cost = np.squeeze(cost)
# assert(cost.shape == ())
grads = {"dw": dw,
"db": db}
return grads, cost
# Sample inputs: two features (rows of X) and three examples (columns of X).
w = np.array([[1.], [2.]])
b = 2.
X = np.array([[1., 2., -1.],
              [3., 4., -3.2]])
Y = np.array([[1, 0, 1]])

grads, cost = propagate(w, b, X, Y)

# Report both gradients and the cost.
dw_text = str(grads["dw"])
db_text = str(grads["db"])
cost_text = str(cost)
print("dw = " + dw_text)
print("db = " + db_text)
print("cost = " + cost_text)
My Output:
dw = [[ 0.72851438 0.99581514]
[ 1.5487967 2.38666712]]
db = 0.225798060825
cost = 1.04403235316
Expected Output:
dw = [[ 0.99845601] [ 2.39507239]]
db = 0.00145557813678
cost = 5.801545319394553
Could anybody tell me why my dw dimension is not same with expected output and help to find the cost function?
There are a couple of small mistakes. First, compute the cost with -np.sum(Y*np.log(A) + (1-Y)*np.log(1-A)) / m instead of using .mean(). Second, I think np.subtract(A, Y) can be replaced with a plain A - Y, because there is no need for a NumPy call here. Note also that the activation has to be computed with np.dot(w.T, X) + b, as in the code below. This version works for me.
def propagate(w, b, X, Y):
    """
    Implement the cost function and its gradient for the propagation explained above

    Arguments:
    w -- weights, a numpy array of size (num_px * num_px * 3, 1)
    b -- bias, a scalar
    X -- data of size (num_px * num_px * 3, number of examples)
    Y -- true "label" vector (containing 0 if non-cat, 1 if cat) of size (1, number of examples)

    Return:
    grads -- dictionary with
        "dw": gradient of the loss with respect to w, thus same shape as w
        "db": gradient of the loss with respect to b, a scalar like b
    cost -- negative log-likelihood cost for logistic regression

    Tips:
    - Write your code step by step for the propagation. np.log(), np.dot()
    """
    m = X.shape[1]

    # FORWARD PROPAGATION (FROM X TO COST)
    ### START CODE HERE ### (≈ 2 lines of code)
    A = sigmoid(np.dot(w.T, X) + b)                                   # compute activation
    cost = -np.sum(Y * np.log(A) + (1 - Y) * np.log(1 - A)) / m       # compute cost
    ### END CODE HERE ###

    # BACKWARD PROPAGATION (TO FIND GRAD)
    ### START CODE HERE ### (≈ 2 lines of code)
    dZ = A - Y
    dw = np.dot(X, dZ.T) / m
    # Sum over every entry so db is a scalar like b (summing with axis=1
    # would leave a one-element array instead).
    db = np.sum(dZ) / m
    ### END CODE HERE ###

    assert(dw.shape == w.shape)
    assert(db.dtype == float)
    cost = np.squeeze(cost)
    assert(cost.shape == ())

    grads = {"dw": dw,
             "db": db}
    return grads, cost
dw = np.dot(X,db.T)/m
is wrong.
Instead of db, it should be multiplied with the derivative of the activation function here, i.e sigmoid,
A = sigmoid(k)
dA = np.dot((1-A)*A,dloss.T) # This is the derivative of a sigmoid function
dw = np.dot(X,dA.T)
The code is not tested, but the solution would be along this line.
See here to calculate dloss.
First a quick disclaimer would be that I posted this question on Reddit, in the Deep Learning and Learning Machine Learning first, but I thought I might also request your expertise here too. Without further ado:
I am currently challenging myself on this year Deep Unsupervised Learning Course of Berkeley University and although I just started the warmup exercise of week 1, I am already having 'technical' difficulties.
The exercise in question is "1. Warmup" in the following document: Week 1 Exercises. (My apologies, as I am not familiar enough with Reddit formatting to seamlessly include images.)
As I understand it, we have a variable x that takes values in 1..100, each with a specific probability of being sampled (defined in the sample_data() function).
The task is therefore to fit a vector of parameters theta that is passed to a softmax function, whose output is supposed to give the likelihood of a specific element x_i being sampled. Namely, theta_1 should be the parameter that "bumps up" the softmax value corresponding to x = 1, and so on.
Using Tensorflow, I think I was able to create such a model, but when it comes to training, I believe I am missing a crucial point as the program cannot compute gradients with respect to the theta parameters.
I would like to know whether I am misunderstanding the task, and whether there is a better method to achieve the result of the exercise.
Here is the code; the failing part starts at the # Computing gradients comment.
import numpy as np
import tensorflow as tf
import tensorflow_probability as tfp
# Asker's TensorFlow attempt at fitting softmax(params) to the sampled data.
# Reviewer notes mark the spots that look related to the None gradient.
if __name__ == "__main__":
# Sampling function of the x variable provided in the exercise
def sample_data():
count = 10000
rand = np.random.RandomState(0)
a = 0.3 + 0.1 * rand.randn(count)
b = 0.8 + 0.05 * rand.randn(count)
mask = rand.rand(count) < 0.5
samples = np.clip(a * mask + b * (1 - mask), 0.0, 1.0)
# NOTE(review): np.digitize against these bin edges returns bin indices that
# start at 1 -- check that they line up with the 0-based classes presumably
# expected by the Categorical distribution used below.
return np.digitize(samples, np.linspace(0.0, 1.0, 100))
full_data = sample_data()
# 80% / 20% split into training and validation data.
train_ds = full_data[:int(.8*len( full_data))]
val_ds = full_data[int(.8*len( full_data)):]
# Declaring parameters theta
w_init = tf.zeros_initializer()
params = tf.Variable(
initial_value=w_init(shape=(1, 100),
dtype='float32'), trainable=True, name='params')
# NOTE(review): softmax (and dist, a few lines below) are computed here, once,
# before any tf.GradientTape has been opened.
softmax = tf.squeeze( tf.nn.softmax( params, axis=1))
#Should materialize the loss of the model
# NOTE(review): the `inputs` argument is ignored; the function reads the
# module-level `softmax` instead.
def get_neg_log_likelihood( inputs):
return - tf.math.log( softmax)
neg_log_likelihoods = get_neg_log_likelihood( softmax)
dist = tfp.distributions.Categorical( probs=softmax, dtype=tf.int32)
optimizer = tf.keras.optimizers.Adam()
for epoch in range( 100):
minibatch_size = 200
n_minibatches = len( train_ds) // minibatch_size
# Running over minibatches of the data
for minibatch in range( n_minibatches):
# Minibatching
start_index = (minibatch*minibatch_size)
end_index = (minibatch_size*minibatch + minibatch_size)
x = train_ds[start_index:end_index]
# NOTE(review): only dist.log_prob runs inside the tape; the softmax of params
# was evaluated outside it, which is presumably why the gradient below is
# None -- confirm by building softmax/dist inside this `with` block.
with tf.GradientTape() as tape:
tape.watch( params)
loss = tf.reduce_mean( - dist.log_prob( x))
# Computing gradients
grads = tape.gradient( loss, params)
print( grads) # Result: None
# input()
# NOTE(review): zip(grads, params) iterates over the two tensors themselves;
# apply_gradients presumably expects (gradient, variable) pairs such as
# zip([grads], [params]) -- confirm against the Keras optimizer docs.
optimizer.apply_gradients( zip( grads, params))
Thank you in advance for your time.
PS: I mainly have a background in Deep Reinforcement Learning, therefore I can understand the various models used there ( policy, value functions ...), but I am trying to refine my grasp over the internals of the models themselves, namely in generative probabilistic models (GAN, VAE) and other unsupervised learning models in general ( RealNVP, Norm Flows, ...)
Pretty sure nobody is gonna see this, but I thought I might as well bring some closure to this.
First of all, I calculated the gradients by directly deriving its expression from the negative log likelihood of the soft-max value, thus dropping the Tensorflow framework by the same occasion.
Although the results are a little bit under my expectations, the program was able to fit the model to a distribution somewhat similar to the empirical distribution of the sampled data. I guess this is due to the fact that just a 1 dimensional theta parameter vector is not enough to fully model the real data distribution, as well as the finite amount of sampled data.
An updated version of the code:
import numpy as np
from matplotlib import pyplot as plt
np.random.seed( 42)
def softmax(X, theta=1.0, axis=None):
    """Compute the softmax of *X* along one axis.

    Args:
        X: Array-like of scores (scalar, list or numpy array).
        theta: Multiplier applied to the scores before exponentiation.
        axis: Axis to normalise over.  Defaults to the first axis of the
            (at least 2-D) input whose length is greater than one, or to the
            last axis when every dimension has length one.

    Returns:
        Array of probabilities summing to one along *axis*.  A 1-D input
        gives a 1-D result; anything else keeps the at-least-2-D shape.
    """
    y = np.atleast_2d(X)

    if axis is None:
        # Fall back to the last axis for all-singleton shapes (e.g. a single
        # score) instead of letting next() raise StopIteration.
        axis = next((i for i, size in enumerate(y.shape) if size > 1),
                    y.ndim - 1)

    y = y * float(theta)
    # Subtract the per-slice maximum so the exponentials cannot overflow.
    y = y - np.max(y, axis=axis, keepdims=True)
    y = np.exp(y)
    p = y / np.sum(y, axis=axis, keepdims=True)

    # np.ndim also works for plain lists, which have no .shape attribute.
    if np.ndim(X) == 1:
        p = p.flatten()
    return p
if __name__ == "__main__":
    import os

    def sample_data():
        """Draw 10000 samples from a two-component mixture, binned into 1..100."""
        count = 10000
        rand = np.random.RandomState(0)
        a = 0.3 + 0.1 * rand.randn(count)
        b = 0.8 + 0.05 * rand.randn(count)
        mask = rand.rand(count) < 0.5
        samples = np.clip(a * mask + b * (1 - mask), 0.0, 1.0)
        return np.digitize(samples, np.linspace(0.0, 1.0, 100))

    n_values = 100  # x takes the values 1..n_values; params[k] belongs to x = k + 1

    full_data = sample_data()
    split = int(.8 * len(full_data))
    train_ds = full_data[:split]
    val_ds = full_data[split:]

    # Empirical frequency of every value 1..n_values (slot 0 of bincount is
    # the unused value 0).  Shuffling the training slice later only reorders
    # the data, so this can be computed once.
    histogram = np.bincount(full_data, minlength=n_values + 1)[1:] / len(full_data)

    # Declaring parameters
    params = np.zeros(n_values)

    # Use for loss computation
    def get_neg_log_likelihood(probs):
        return -np.log(probs)

    def get_loss(params, x):
        # Evaluate the softmax once, then look up the entry of every sample.
        nll = get_neg_log_likelihood(softmax(params))
        return np.mean(nll[np.asarray(x) - 1])

    lr = .0005
    minibatch_size = 100
    n_minibatches = len(train_ds) // minibatch_size
    os.makedirs('plots', exist_ok=True)  # savefig does not create directories

    for epoch in range(1000):
        # Shuffling training data
        np.random.shuffle(train_ds)

        # Running over minibatches of the data
        for minibatch in range(n_minibatches):
            start_index = minibatch * minibatch_size
            batch = train_ds[start_index:start_index + minibatch_size]

            smax = softmax(params)
            # Gradient of -log(smax[k]) w.r.t. params[j] is smax[j] - [j == k],
            # where k = value - 1 is the same 0-based index get_loss uses.
            # Summed over the batch this is len(batch) * smax minus the count
            # of each value in the batch.
            grads = len(batch) * smax - np.bincount(batch - 1, minlength=n_values)
            params -= lr * grads

        print("Epoch %d -- Train loss: %.4f , Val loss: %.4f" % (epoch, get_loss(params, train_ds), get_loss(params, val_ds)))

        # Plotting each ~100 epochs
        if epoch % 100 == 0:
            fsmax = softmax(params)
            fig, ax = plt.subplots()
            ax.set_title('Dist. Comp. after %d epochs of training (from scratch)' % epoch)
            positions = np.arange(1, n_values + 1)
            width = 0.35
            ax.bar(positions - width / 2, fsmax, width, label='Model')
            ax.bar(positions + width / 2, histogram, width, label='Empirical')
            ax.set_ylabel('Likelihood')
            ax.set_xlabel("Variable x's values")
            ax.legend()
            fig.tight_layout()
            fig.savefig('plots/results_after_%d_epochs.png' % epoch)
            plt.close(fig)  # release the figure; ten of them are created in total
Picture of the final model distribution included for completeness. Modeled vs Empirical Distribution
First of all I wanna say that I am a python beginner and also completely new to neural networks. When I read about it I was very excited and thought I set up a little code from scratch (see code below).
But somehow my code is not working properly. I guess there are some major bugs (in the algorithm and the programming?). But I cannot find them at the moment.
So, in the handwritten notes you can see my system (and some formulas). I wanna solve a decision problem where I have data in the form of X=(x1,x2) and y (which is 0 or 1).
My network has one hidden layer consisting of 3 neurons and one output layer.
As an activation function I use sigmoid and for the loss I use cross entropy (sth like log likelihood for bernoulli, I guess?)
The neurons take the weighted input W.X + bias and return a scalar between 0,1.
For the learning process I tried to use backward propagation. So I just computed the derivative dLoss/dparams and applied the chain rule several times. In order not to make everything in index notation I tried to use numpy to handle matrices, etc.
Maybe someone sees directly the things I did wrong? (apart from the bad programming :D)
Handwritten notes 1/2
Handwritten notes 2/2
#!/usr/bin/python
import numpy as np
from sklearn import datasets
import matplotlib.pyplot as plt
## create random data set for decision problem
np.random.seed(0)  # fixed seed to reproduce results
# X holds the 2-D points, y the matching class labels
X, y = datasets.make_moons(20, noise=0.20)
plt.scatter(X[:, 0], X[:, 1], s=40, c=y, cmap=plt.cm.Spectral)
plt.show()  # blocks until the plot window is closed

## initialize model parameters
# (drawn in the order W1, b1, W2, b2 so the seeded values stay the same)
W1 = np.random.uniform(low=-0.5, high=0.5, size=(3, 2))  # hidden layer weights, 3 x 2
b1 = np.random.uniform(low=-1, high=1, size=(3,))        # hidden layer biases
W2 = np.random.uniform(low=-0.5, high=0.5, size=(1, 3))  # output layer weights, 1 x 3
b2 = np.random.uniform(low=-1, high=1, size=(1,))        # output neuron bias

# all parameters gathered in one dict
model = dict(W1=W1, W2=W2, b1=b1, b2=b2)
## the activation function
# can also return the derivative
def sigmoid(x, derivative=False):
    """Evaluate the logistic sigmoid, or its derivative, element-wise.

    Args:
        x: Scalar or numpy array.
        derivative: When true, return sigmoid(x) * (1 - sigmoid(x)) instead
            of sigmoid(x).

    Returns:
        Value(s) with the same shape as *x*.
    """
    s = 1.0 / (1.0 + np.exp(-x))
    if derivative:
        # Reuse s rather than evaluating the sigmoid two more times.
        return s * (1 - s)
    return s
## moving forward in the network for a single data point
# and returns a dict with necessary information
def move_forward(model, DataX):
    """Run one data point through the network.

    Args:
        model: Dict with the parameters "W1", "b1", "W2" and "b2".
        DataX: Input vector of a single data point.

    Returns:
        Dict with the hidden activation "phi", the output "sig", and their
        derivatives with respect to the weighted inputs, "phiP" and "sigP".
    """
    # weighted input and activation of the hidden layer
    t1 = np.dot(model["W1"], DataX) + model["b1"]
    phi = sigmoid(t1)
    # derivative of the sigmoid expressed through its own value
    phiP = phi * (1 - phi)

    # weighted input and activation of the output neuron
    t2 = np.dot(model["W2"], phi) + model["b2"]
    sig = sigmoid(t2)
    sigP = sig * (1 - sig)

    return {"phi": phi, "phiP": phiP,
            "sig": sig, "sigP": sigP}
## moving backward for a single data point
def move_backward(forward, model, DataX):
    """Compute d(sig)/d(parameter) for a single data point.

    These are the derivatives of the network OUTPUT, not yet of the loss;
    the caller multiplies them by dLoss/dsig.

    Args:
        forward: Dict returned by move_forward ("phi", "phiP", "sigP" are read).
        model: Parameter dict ("W2" is read).
        DataX: Input vector of the data point.

    Returns:
        Dict with "dW1" (same shape as W1), "dW2" (one entry per hidden
        neuron), "db1" (column, one row per hidden neuron) and "db2".
    """
    W2 = model["W2"]
    phi = forward["phi"]
    phiP = forward["phiP"]
    sigP = forward["sigP"]

    # output layer: dsig/dt2 * dt2/dW2 and dsig/dt2 * dt2/db2
    dW2 = sigP * phi
    db2 = sigP

    # hidden layer: chain through dt2/dphi = W2 and dphi/dt1 = phiP
    through_hidden = np.multiply(W2, phiP)  # element-wise
    dW1 = sigP * np.outer(through_hidden, DataX)
    db1 = sigP * np.outer(through_hidden, [1])

    return {"dW1": dW1, "dW2": dW2, "db1": db1, "db2": db2}
## part of the loss function; here for one data point
# returns also the derivative for the learning process
def loss(DataY, PredictionY, derivative=False):
    """Bernoulli log-likelihood of one data point, or its derivative.

    Note that the value is NOT negated here; the caller applies the sign.

    Args:
        DataY: True label (0 or 1).
        PredictionY: Predicted probability of label 1.
        derivative: When true, return the derivative of the log-likelihood
            with respect to PredictionY instead of the log-likelihood.

    Returns:
        The log-likelihood, or its derivative.
    """
    if derivative:
        return DataY / PredictionY - (1.0 - DataY) / (1.0 - PredictionY)
    return DataY * np.log(PredictionY) + (1.0 - DataY) * np.log(1.0 - PredictionY)
## updating model parameters
## epsilon is a small parameter regulating the learning
def update_model(DataSet, model, epsilon):
    """Perform one full-batch gradient-descent step.

    The loss is the mean negative log-likelihood over the data set.  For
    every data point the derivative dLoss/dsig is multiplied with that same
    point's dsig/dparameter before accumulating (chain rule per sample).
    Multiplying an averaged dLoss/dsig with summed dsig/dparameter instead
    does not give the gradient of the loss.

    Args:
        DataSet: Pair ``[inputs, labels]``.
        model: Dict with the parameters "W1", "W2", "b1" and "b2".
        epsilon: Learning rate.

    Returns:
        New dict with the updated "W1", "W2", "b1", "b2" and the mean
        "loss" evaluated with the parameters passed in.
    """
    DataX = DataSet[0]
    DataY = DataSet[1]
    n_points = len(DataX)

    W1 = model["W1"]
    W2 = model["W2"]
    b1 = model["b1"]
    b2 = model["b2"]

    total_loglik = 0
    grad_W1 = 0
    grad_W2 = 0
    grad_b1 = 0
    grad_b2 = 0

    # iterating over full data set
    for point, label in zip(DataX, DataY):
        forward = move_forward(model, point)
        backward = move_backward(forward, model, point)
        sig = forward["sig"]

        total_loglik += loss(label, sig)
        # dLoss/dsig for THIS point (the loss is minus the log-likelihood)
        beta = -loss(label, sig, True)

        grad_W1 += beta * backward["dW1"]
        grad_W2 += beta * backward["dW2"]
        grad_b1 += beta * backward["db1"]
        grad_b2 += beta * backward["db2"]

    total_loss = total_loglik * (-1.0 / n_points)  # the total loss
    step = epsilon / n_points                      # learning rate on the mean gradient

    ## setting updated model params
    W1_new = W1 - step * grad_W1
    W2_new = W2 - step * grad_W2
    # db1 is accumulated as a column; flatten it to match b1
    b1_new = b1 - step * np.squeeze(np.asarray(grad_b1))
    b2_new = b2 - step * grad_b2

    return {"W1": W1_new, "W2": W2_new, "b1": b1_new,
            "b2": b2_new, "loss": total_loss}
## train the model with a given data set N times
def train_model(DataSet, model, epsilon, N, print_state=False):
    """Apply update_model N times and return the final model.

    Args:
        DataSet: Pair ``[inputs, labels]`` handed to update_model.
        model: Initial parameter dict.
        epsilon: Learning rate.
        N: Number of update steps.
        print_state: When true, print the model and its loss every 100 steps.

    Returns:
        The model dict produced by the last update (unchanged input when
        N is 0).
    """
    for i in range(N):
        model = update_model(DataSet, model, epsilon)
        if print_state and i % 100 == 0:
            print(model)
            print("loss = ", model["loss"])
    # final state after the last update
    print(model)
    return model
## call the training function and store the output
# Train for 1000 steps with learning rate 0.01, printing progress on the way.
model_new = train_model(
    DataSet=[X, y],
    model=model,
    epsilon=0.01,
    N=1000,
    print_state=True,
)
# Evaluate the trained network on the first training point.
move_forward(model=model_new, DataX=X[0])
# Note: Hm, somehow I always get sig = 0.5 (roughly). And the loss
# does not get smaller than 0.68
# I guess there must be several mistakes