I am currently writing an implementation of gradient descent and I'm running into an issue where my predicted value (y_hat) only ever decreases. It never increases, even though it should increase in cases where the training label is 1 and not 0. My train function code is below:
def sigma(self, a):
    ans = 1/(1+np.exp(-a))
    return ans

def get_loss(self, y_i, y_hat):
    loss = -(y_i * np.log(y_hat) + (1 - y_i) * np.log(1 - y_hat))
    return loss

def train(self, X, y, step_size, num_iterations):
    b_0 = 0
    rows = X.shape[0]
    columns = X.shape[1]
    weights = np.zeros(columns)
    losses = []
    for iteration in range(num_iterations):
        # Step 1: calculate y hat for row
        summation = 0
        summation_k = np.zeros(columns)
        total_loss = 0
        for i in range(rows):
            row_total = np.sum(np.multiply(X[i], weights))
            y_hat = self.sigma(b_0 + row_total)
            y_i = y[i]
            # print('y_i: ', y_i)
            # print('y_hat: ', y_hat)
            # print()
            total_loss += self.get_loss(y_i, y_hat)
            diff = y_i - y_hat
            summation += diff
            # summation_k_i = summation_k_i + X[i] * diff
            summation_k = np.add(summation_k, np.multiply(diff, X[i]))
        # Compute change for each weight based on errors, then update the weights
        # Update b_0
        b_0 = b_0 + step_size * ((1/rows) * (-summation))
        # Update b_k
        # for j in range(columns):
        #     weights[j] = weights[j] + step_size * ((1/rows) * (-summation_k[j]))
        weights = np.add(weights, np.multiply(summation_k, (-step_size/rows)))
        # Keeping track of average loss for each iteration.
        losses.append(total_loss/rows)
    self.weights = np.insert(weights, 0, b_0)
    return np.array(losses)
When I run this the y_hat values always decrease for every row and in every iteration. I can't find the bug that's causing this.
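For comparison, here is a minimal vectorized sketch of the standard batch update for the log loss (the name train_reference and the vectorized form are mine, not the class above). The key point is the sign of the update: both the bias and the weights add a multiple of (y - y_hat), which pushes y_hat upward whenever the label is 1.

import numpy as np

# Reference sketch only: plain batch gradient descent for logistic regression.
def train_reference(X, y, step_size, num_iterations):
    rows, columns = X.shape
    b_0 = 0.0
    weights = np.zeros(columns)
    for _ in range(num_iterations):
        y_hat = 1 / (1 + np.exp(-(X @ weights + b_0)))  # sigma(b_0 + x.w) for every row
        diff = y - y_hat                                # positive when y_hat is too low
        b_0 += step_size * np.mean(diff)
        weights += step_size * (X.T @ diff) / rows
    return b_0, weights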
Related
I am trying to implement my own gradient descent function in Python, but my MSE loss is suspiciously high. How does my implementation look? I based my function on the formula below.
def gradient_descent(w, X):
    for i in range(len(X)):
        L = 0.001
        total = 0
        row_vec = X.iloc[i].to_numpy()
        y = Y.iloc[i]
        y_hat = np.dot(w, row_vec)
        inner = y_hat - y
        total += inner * row_vec
    print(total)
    return w - L * ((1 / len(X)) * total)
The function above represents one iteration of gradient descent. The w parameter is a weights vector that I initialize to np.array([[1,1,1,...]]) and X is a DataFrame where each column represents a feature with an added column of all 1s for bias.
def optimize(w, X):
    loss = 999999
    iter = 0
    loss_arr = []
    while True:
        vec = gradient_descent(w, X)      # new weights vector
        tmp_loss = loss_function(vec, X)  # test new weights
        if tmp_loss < loss:
            loss = tmp_loss
            w = vec
            loss_arr.append(loss)
            iter += 1
        else:
            break
    return (loss_arr, w, iter)
In the function above, I call the gradient_descent function and check whether the new loss is better than the previous one. If it's not, then I stop and I have my final weights.
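For reference, a minimal fully vectorized sketch of one batch MSE step under the same conventions (X carries the bias column of 1s, Y is the label Series); the helper name gradient_descent_step and the assumption of a 1-D weight vector are mine, not part of the code above. The gradient here is accumulated over every row rather than only the last one.

import numpy as np

# Hypothetical reference step: full-batch gradient of MSE/2 for a 1-D weight vector.
def gradient_descent_step(w, X, Y, L=0.001):
    Xm = X.to_numpy()                      # (n, d) features, incl. the bias column
    yv = Y.to_numpy()                      # (n,) targets
    y_hat = Xm @ w                         # predictions for all rows at once
    grad = Xm.T @ (y_hat - yv) / len(X)    # average gradient over the batch
    return w - L * grad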
How do I avoid getting the error on the highlighted line? It keeps telling me that the float64 type is not iterable. Essentially, I'm trying to calculate the cost gradient for SGD in the first function and then run SGD in the second function.
def calculate_cost_gradient(W, X_batch, Y_batch):
    # if only one example is passed (eg. in case of SGD)
    if type(Y_batch) == np.float64:
        Y_batch = np.array([Y_batch])
        X_batch = np.array([X_batch])  # gives multidimensional array
    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))
    ######## Error is here ########
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di
    dw = dw/len(Y_batch)  # average
    return dw
def sgd(features, outputs):
    max_epochs = 5000
    weights = np.zeros(features.shape[1])
    nth = 0
    prev_cost = float("inf")
    cost_threshold = 0.01  # in percent
    # stochastic gradient descent
    for epoch in range(1, max_epochs):
        # shuffle to prevent repeating update cycles
        X, Y = shuffle(features, outputs)
        for ind, x in enumerate(X):
            ascent = calculate_cost_gradient(weights, x, Y[ind])
            weights = weights - (learning_rate * ascent)
        # convergence check on 2^nth epoch
        if epoch == 2 ** nth or epoch == max_epochs - 1:
            cost = compute_cost(weights, features, outputs)
            print("Epoch is:{} and Cost is: {}".format(epoch, cost))
            # stoppage criterion
            if abs(prev_cost - cost) < cost_threshold * prev_cost:
                return weights
            prev_cost = cost
            nth += 1
    return weights
It's very likely that distance is a single NumPy scalar rather than an array, and the code should be:
for ind, d in enumerate(range(distance)):
I also faced the same issue while implementing an SVM from a blog post: https://towardsdatascience.com/svm-implementation-from-scratch-python-2db2fc52e5c2
The solution is just to remove the if type(Y_batch) == np.float64 check so that the conversion to arrays always runs.
It happens because Y_batch is of type int64 and is not converted into a NumPy array, which results in distance being a float64 scalar. (We can apply enumerate only to collections of items.)
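A more defensive alternative (a sketch under the same setup, not from the linked post) is to normalize the batch shape instead of checking the exact dtype, for example with np.atleast_1d/np.atleast_2d. Here regularization_strength is passed as a parameter rather than the global used in the original code.

import numpy as np

# Sketch: promote a single (scalar label, 1-D row) example to array form so
# that the enumerate() below always has something to iterate over.
def calculate_cost_gradient(W, X_batch, Y_batch, regularization_strength=10000):
    Y_batch = np.atleast_1d(Y_batch)   # scalar label -> 1-element array
    X_batch = np.atleast_2d(X_batch)   # single row   -> (1, n_features) array
    distance = 1 - (Y_batch * np.dot(X_batch, W))
    dw = np.zeros(len(W))
    for ind, d in enumerate(distance):
        if max(0, d) == 0:
            di = W
        else:
            di = W - (regularization_strength * Y_batch[ind] * X_batch[ind])
        dw += di
    return dw / len(Y_batch)  # average over the (mini-)batch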
I am trying to implement a Logistic Regression model with regularisation. I got stuck computing the gradient: when I run my gradient descent algorithm, the cost function increases rather than decreases.
def sigmoid(z):
    return 1 / (1 + np.exp(-z))

def Probability(theta, X):
    return sigmoid(np.dot(X, theta))

def cost_function_regression(theta, x, y, Lambda):
    # Computes the cost function for all the training samples
    m = x.shape[0]
    total_cost = (-(1 / m) * np.sum(
        np.dot(y.T, np.log(Probability(theta, x))) + np.dot((1 - y).T, np.log(
            1 - Probability(theta, x))))) + (Lambda / 2) * np.sum(np.dot(theta, theta.T))
    return total_cost

def Gradient_regression(theta, X, y, Lambda):
    m = X.shape[0]
    grad = ((1 / m) * np.dot(X.T, Probability(theta, X) - y)) + np.sum((Lambda / m) * theta)
    return grad
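For orientation, the regularized gradient is usually written with the (Lambda/m) * theta term added element-wise rather than summed; a minimal sketch (the function name is hypothetical, not the asker's code):

import numpy as np

# Sketch only: element-wise regularization term instead of np.sum().
def gradient_regression_sketch(theta, X, y, Lambda):
    m = X.shape[0]
    probs = 1 / (1 + np.exp(-np.dot(X, theta)))
    return (1 / m) * np.dot(X.T, probs - y) + (Lambda / m) * theta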
We will start by establishing the theory, follow with a working example, and end with some comments.
Problem statement
The steps in fitting/training a logistic regression model (as with any supervised ML model) using the gradient descent method are as below (a generic code sketch of this loop follows the list):
Identify a hypothesis function [h(X)] with parameters [w,b]
Identify a loss function [J(w,b)]
Forward propagation: Make predictions using the hypothesis function [y_hat = h(X)]
Calculate the error between the actual label [y] and the predicted label [y_hat] using the loss function.
Backward propagation: Adjust the parameters in the hypothesis function based on the error (by calculating the gradients), using the update rule
Go to step 3 if the gradients are still high, else end
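In code, steps 3-6 amount to a loop like the following (a generic sketch only; h, grad_J, alpha and tol are placeholders for the concrete choices made in the toy example further below, and params is assumed to be a NumPy array):

import numpy as np

# Generic sketch of the training loop described in steps 3-6.
def fit(X, y, params, h, grad_J, alpha=0.01, tol=1e-5):
    while True:
        y_hat = h(X, params)                  # step 3: forward propagation
        grads = grad_J(X, y, y_hat, params)   # steps 4-5: error and gradients
        params = params - alpha * grads       # step 5: update rule
        if np.abs(grads).sum() < tol:         # step 6: stop when gradients are small
            break
    return params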
Calculating gradients
Hypothesis function for logistic regression:
h(X) = sigmoid(b + sum_i(w_i * X^i)) = 1 / (1 + exp(-(b + sum_i(w_i * X^i))))
where X is a vector, X^i is the ith element of the vector, and [w, b] are the parameters.
The commonly used loss function for logistic regression is log loss. The log loss with l2 regularization is:
J(w, b) = -(1/n) * sum_j [ y_j * log(y_hat_j) + (1 - y_j) * log(1 - y_hat_j) ] + lambda * (sum_k w_k^2 + b^2)
Let's calculate the gradients:
dJ/dw_k = -(1/n) * sum_j (y_j - y_hat_j) * X_j^k + 2 * lambda * w_k
Similarly,
dJ/db = -(1/n) * sum_j (y_j - y_hat_j) + 2 * lambda * b
(the toy example below uses lambda = 1).
Now that we know the gradients, let's code the gradient descent algorithm to fit the parameters of our logistic regression model.
Toy Example
import math
import numpy as np
from sklearn import datasets

# load data
iris = datasets.load_iris()
# Let's take only two classes
y = iris.target
X = iris.data[y != 2]
y = y[y != 2]

# Normalize data to 0 mean and 1 std
X[:, 0] = (X[:, 0] - np.mean(X[:, 0]))/np.std(X[:, 0])
X[:, 1] = (X[:, 1] - np.mean(X[:, 1]))/np.std(X[:, 1])
X[:, 2] = (X[:, 2] - np.mean(X[:, 2]))/np.std(X[:, 2])
X[:, 3] = (X[:, 3] - np.mean(X[:, 3]))/np.std(X[:, 3])

def sigmoid(x):
    return 1 / (1 + math.exp(-x))

# initialize weights
w0, w1, w2, w3, b = 0.01, 0.01, 0.01, 0.01, 0.01
n = len(X)
# Learning rate
alpha = 0.01

# The gradient descent loop
while True:
    y_hat = [sigmoid(w0*x[0] + w1*x[1] + w2*x[2] + w3*x[3] + b) for x in X]
    delta_w0 = -np.sum([(y[j] - y_hat[j])*X[j, 0] for j in range(n)])/n + 2*w0
    delta_w1 = -np.sum([(y[j] - y_hat[j])*X[j, 1] for j in range(n)])/n + 2*w1
    delta_w2 = -np.sum([(y[j] - y_hat[j])*X[j, 2] for j in range(n)])/n + 2*w2
    delta_w3 = -np.sum([(y[j] - y_hat[j])*X[j, 3] for j in range(n)])/n + 2*w3
    delta_b = -np.sum([(y[j] - y_hat[j]) for j in range(n)])/n + 2*b
    w0 = w0 - alpha*delta_w0
    w1 = w1 - alpha*delta_w1
    w2 = w2 - alpha*delta_w2
    w3 = w3 - alpha*delta_w3
    b = b - alpha*delta_b
    if np.sum(np.abs([delta_w0, delta_w1, delta_w2, delta_w3, delta_b])) < 1e-5:
        break

# Make predictions
pred = [1 if i > 0.5 else 0 for i in y_hat]
# Count the number of correct predictions
correct = np.sum([1 if pred[i] == y[i] else 0 for i in range(n)])
print(correct)
Comments
The above toy example is coded in the most inefficient way possible. The intention was to show the steps clearly rather than to be efficient. That said, we would have to vectorize the operations (using np arrays and matrix operations) for efficiency; a vectorized sketch follows these comments.
Data normalization is important
The models are trained on train data and the performance is measured on test/validation data.
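For reference, a minimal vectorized sketch of the same loop (the function name fit_logistic_l2 and its signature are placeholders, not part of the example above): same log loss with the l2 term, same stopping rule, but operating on all rows at once.

import numpy as np

# Vectorized sketch of the toy loop above.
def fit_logistic_l2(X, y, alpha=0.01, tol=1e-5):
    n, d = X.shape
    w = np.full(d, 0.01)
    b = 0.01
    while True:
        y_hat = 1 / (1 + np.exp(-(X @ w + b)))       # predictions for every row
        delta_w = -(X.T @ (y - y_hat)) / n + 2 * w   # gradient incl. the l2 term
        delta_b = -np.sum(y - y_hat) / n + 2 * b
        w -= alpha * delta_w
        b -= alpha * delta_b
        if np.sum(np.abs(delta_w)) + abs(delta_b) < tol:
            break
    return w, b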
I am doing a very simple logistic regression problem, but it doesn't converge. The dataset is linearly separable, so there is no reason the loss can't converge to 0.
The loss converges very slowly and seems to be converging to a constant. The gradient also doesn't converge to 0. I have checked the function that computes the gradient (by gradient checking), and it is correct. The loss function should also be correct, and changing the learning rate doesn't help.
import random
import numpy as np
import matplotlib.pyplot as plt

def sigmoid(z):
    """ Sigmoid function """
    s = 1 / (1 + np.exp(-z))
    return s

def cost_function(theta, X, y):
    output = sigmoid(np.dot(X, theta))
    cost = 0
    m_samples = len(y)
    for i in range(m_samples):
        if y[i] == 0:
            cost += -(1 - y[i]) * np.log(1 - output[i])
        elif y[i] == 1:
            cost += -y[i] * np.log(output[i])
    cost /= m_samples
    return cost

def gradient_update(theta, X, y):
    output = sigmoid(np.dot(X, theta))
    grad = np.dot((output - y).T, X)
    grad = grad / m_samples
    return grad

def gradient_descent(theta, X, y, alpha, max_iterations, print_iterations):
    m_samples = len(y)
    iteration = 0
    X_train = X / np.max(X)
    while (iteration < max_iterations):
        iteration += 1
        gradient = gradient_update(theta, X_train, y)
        theta = theta - alpha * gradient
        if iteration % print_iterations == 0 or iteration == 1:
            cost = cost_function(theta, X_train, y)
            print("[ Iteration", iteration, "]", "cost =", cost)
        # print(gradient)

num_features = train_X.shape[1]
initial_theta = np.random.randn(num_features)
max_iter = 200
print_iter = 25
alpha_test = 0.1
learned_theta = gradient_descent(initial_theta, train_X, train_y, alpha_test, max_iter, print_iter)
I don't think it should converge that slowly; it should converge very fast.
This is the output.
[ Iteration 1 ] cost = 0.6321735730663283
[ Iteration 25 ] cost = 0.6307985058882454
[ Iteration 50 ] cost = 0.6302278288232466
[ Iteration 75 ] cost = 0.6300077925064239
[ Iteration 100 ] cost = 0.6299228901862299
[ Iteration 125 ] cost = 0.6298894960439918
[ Iteration 150 ] cost = 0.6298756287152963
[ Iteration 175 ] cost = 0.6298691634248297
[ Iteration 200 ] cost = 0.6298655267069331
I don't know what's going on.
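For reference, the gradient checking mentioned above can be sketched with a central finite difference; numerical_gradient is a hypothetical helper, written against the cost_function(theta, X, y) signature in the code above.

import numpy as np

# Central-difference estimate of d(cost)/d(theta), for gradient checking.
def numerical_gradient(cost_fn, theta, X, y, epsilon=1e-5):
    grad = np.zeros_like(theta, dtype=float)
    for i in range(len(theta)):
        e = np.zeros_like(theta, dtype=float)
        e[i] = epsilon
        grad[i] = (cost_fn(theta + e, X, y) - cost_fn(theta - e, X, y)) / (2 * epsilon)
    return grad

# Compare this against the analytic gradient, e.g.:
#   np.max(np.abs(numerical_gradient(cost_function, theta, X, y) - gradient_update(theta, X, y)))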
I've got a toy example set up of a linear regression model with one input variable and one output variable. The problem that I'm encountering is that the output for the bias is far off from the generated data. If I manually set the bias then it will produce a weight and bias which is close enough to the original.
I've written two pieces of code: gen_data, which generates the data, and gradientDescent2, which performs the gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points, 1))
    y = np.zeros(shape=(num_points, 1))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp) + np.random.normal(scale=3.0)
    return (x, y)

# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        loss = (theta * x + bias) - y
        cost = np.mean(loss**2) / 2
        # print('Iteration {} | Cost: {}'.format(i, cost))
        grad_b = np.mean(loss)
        grad_t = np.mean(loss*x)
        # updates
        bias -= learning_rate * grad_b
        theta -= learning_rate * grad_t
    return (theta, bias)
1. If you want to use batch updates, don't set your batch_size equal to your sample size. (I also believe that batch update is not very suitable for this case.)
2. Your gradient calculation and parameter update are incorrect; the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always be trying to minimize the loss, so it should be:
if loss > 0:
    bias -= learning_rate * grad_b
    theta -= learning_rate * grad_t
elif loss < 0:
    bias += learning_rate * grad_b
    theta += learning_rate * grad_t
Finally, here is the modified code, which works well.
import numpy as np
import sys

def gen_data(num_points=500, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points))
    y = np.zeros(shape=(num_points))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp)  # + np.random.normal(scale=3.0)
        # print('x:', x[i], ' y:', y[i])
    return (x, y)

def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        for j in range(len(x)):
            loss = (theta * x[j] + bias) - y[j]
            cost = np.mean(loss**2) / 2
            # print('Iteration {} | Cost: {}'.format(i, cost))
            grad_b = 1
            grad_t = x[j]
            if loss > 0:
                bias -= learning_rate * grad_b
                theta -= learning_rate * grad_t
            elif loss < 0:
                bias += learning_rate * grad_b
                theta += learning_rate * grad_t
    return (theta, bias)

def main():
    x, y = gen_data()
    ta, bias = gradientDescent2(x, y)
    print('theta:', ta)
    print('bias:', bias)

if __name__ == '__main__':
    sys.exit(int(main() or 0))