I want to implement gradient descent with numpy for linear regression, but I have an error in this code:
import numpy as np

# Code Example
rng = np.random.RandomState(10)
X = 10*rng.rand(1000, 5) # feature matrix
y = 0.9 + np.dot(X, [2.2, 4, -4, 1, 2]) # target vector

# GD implementation for linear regression
def GD(X, y, eta=0.1, n_iter=20):
    theta = np.zeros((X.shape[0], X.shape[1]))
    for i in range(n_iter):
        grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
        theta = theta - eta * grad
    return theta

# SGD implementation for linear regression
def SGD(X, y, eta=0.1, n_iter=20):
    theta = np.zeros(1, X.shape[1])
    for i in range(n_iter):
        for j in range(X.shape[0]):
            grad = 2 * np.mean((np.dot(theta.T, X[j,:]) - y[j]) * X[j,:])
            theta = theta - eta * grad
    return theta

# MSE loss for linear regression with numpy
def MSE(X, y, theta):
    return np.mean((X.dot(theta.T) - y)**2)

# linear regression with GD and MSE with numpy
theta_gd = GD(X, y)
theta_sgd = SGD(X, y)
print('MSE with GD: ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
The error is
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
ValueError: operands could not be broadcast together with shapes (5,5) (1000,)
and I can't solve it.
Minor changes to your code that resolve the dimensionality issues during matrix multiplication make it run successfully. In particular, note that a linear regression on a design matrix X of dimension N x k has a parameter vector theta of length k, not an N x k matrix.
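To make the shape bookkeeping concrete, here is a minimal fully vectorized batch-gradient sketch under those assumptions (the function name, learning rate, and iteration count are only illustrative):

import numpy as np

def GD_vectorized(X, y, eta=0.001, n_iter=1000):
    theta = np.zeros(X.shape[1])              # one parameter per feature, length k
    for _ in range(n_iter):
        residuals = X @ theta - y             # shape (N,)
        grad = 2 * X.T @ residuals / len(y)   # shape (k,)
        theta -= eta * grad
    return theta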
In addition, I'd suggest some changes to SGD() that make it a proper stochastic gradient descent: evaluate the gradient over random subsets of the data, obtained by shuffling the index set of the training data with np.random.shuffle() and looping through it in chunks. The batch_size argument determines the size of each subset after which the parameter estimate is updated, and the seed argument ensures reproducibility.
# GD implementation for linear regression
def GD(X, y, eta=0.001, n_iter=100):
    theta = np.zeros(X.shape[1])
    for i in range(n_iter):
        for j in range(X.shape[0]):
            grad = 2 * np.mean(X[j,:] @ theta - y[j]) * X[j,:] # changed line
            theta -= eta * grad
    return theta

# SGD implementation for linear regression
def SGD(X, y, eta=0.001, n_iter=1000, batch_size=25, seed=7678):
    theta = np.zeros(X.shape[1])
    indexSet = list(range(len(X)))
    np.random.seed(seed)
    for i in range(n_iter):
        np.random.shuffle(indexSet) # random shuffle of index set
        for j in range(round(len(X) / batch_size)+1):
            X_sub = X[indexSet[j*batch_size:(j+1)*batch_size],:]
            y_sub = y[indexSet[j*batch_size:(j+1)*batch_size]]
            if len(X_sub) > 0:
                grad = 2 * np.mean(X_sub @ theta - y_sub) * X_sub # changed line
                theta -= eta * np.mean(grad, axis=0)
    return theta
Running the code, I get
print('MSE with GD : ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
> MSE with GD : 0.07602
> MSE with SGD: 0.05762
Each observation has 5 features, and X contains 1000 observations:
X = rng.rand(1000, 5) * 10 # X.shape == (1000, 5)
Create y, which is a perfect linear function of X (no noise):
real_weights = np.array([2.2, 4, -4, 1, 2]).reshape(-1, 1)
real_bias = 0.9
y = X @ real_weights + real_bias # y.shape == (1000, 1)
GD implementation for linear regression:
Note:
w (weights) is your theta variable.
I have also added the calculation of b (bias).
def GD(X, y, eta=0.1, n_iter=20):
    # Initialize weights and a bias (all zeros):
    w = np.zeros((X.shape[1], 1))  # w.shape == (5, 1)
    b = 0
    # Gradient descent
    for i in range(n_iter):
        errors = X @ w + b - y  # errors.shape == (1000, 1)
        dw = 2 * np.mean(errors * X, axis=0).reshape(5, 1)
        db = 2 * np.mean(errors)
        w -= eta * dw
        b -= eta * db
    return w, b
Testing:
w, b = GD(X, y, eta=0.003, n_iter=5000)
print(w, b)
[[ 2.20464905]
[ 4.00510139]
[-3.99569374]
[ 1.00444026]
[ 2.00407476]] 0.7805448262466914
Notes:
Your SGD function also contains some errors.
I'm using the @ operator because I simply prefer it over np.dot.
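Since the note above doesn't show a corrected SGD, here is a minimal mini-batch sketch in the same style as the GD function above (weights plus a bias, @ for matrix products); the function name, learning rate, batch size, and iteration count are illustrative choices only:

def SGD_minibatch(X, y, eta=0.003, n_iter=200, batch_size=32, seed=0):
    rng = np.random.RandomState(seed)
    w = np.zeros((X.shape[1], 1))
    b = 0.0
    for _ in range(n_iter):
        idx = rng.permutation(len(X))               # reshuffle the sample indices each epoch
        for start in range(0, len(X), batch_size):
            batch = idx[start:start + batch_size]
            errors = X[batch] @ w + b - y[batch]    # shape (batch_size, 1)
            w -= eta * 2 * np.mean(errors * X[batch], axis=0).reshape(-1, 1)
            b -= eta * 2 * np.mean(errors)
    return w, b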
Related
I am trying to implement multivariate linear regression (gradient descent with an MSE cost function), but the loss value keeps increasing exponentially on every iteration of gradient descent and I'm unable to figure out why.
import numpy as np
from sklearn.datasets import load_boston

class LinearRegression:
    def __init__(self):
        self.X = None        # The feature vectors [shape = (m, n)]
        self.y = None        # The regression outputs [shape = (m, 1)]
        self.W = None        # The parameter vector `W` [shape = (n, 1)]
        self.bias = None     # The bias value `b`
        self.lr = None       # Learning Rate `alpha`
        self.m = None
        self.n = None
        self.epochs = None

    def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100, lr: float = 0.001):
        self.X = X  # shape (m, n)
        self.m, self.n = X.shape
        assert y.size == self.m and y.shape[0] == self.m
        self.y = np.reshape(y, (-1, 1))  # shape (m, ) or (m, 1)
        assert self.y.shape == (self.m, 1)
        self.W = np.random.random((self.n, 1)) * 1e-3  # shape (n, 1)
        self.bias = 0.0
        self.epochs = epochs
        self.lr = lr
        self.minimize()

    def minimize(self, verbose: bool = True):
        for num_epoch in range(self.epochs):
            predictions = np.dot(self.X, self.W)
            assert predictions.shape == (self.m, 1)
            grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
            self.W = self.W - self.lr * grad_w
            assert self.W.shape == grad_w.shape
            loss = (1 / 2 * self.m) * np.sum(np.square(predictions - self.y))
            if verbose:
                print(f'Epoch : {num_epoch+1}/{self.epochs} \t Loss : {loss.item()}')

linear_regression = LinearRegression()
x_train, y_train = load_boston(return_X_y=True)
linear_regression.fit(x_train, y_train, 10)
I'm using the boston housing dataset from sklearn.
PS: I'd like to know what's causing this issue, how to fix it, and whether or not my implementation is correct.
Thanks
The error is in the gradient. A divergence like that is not something you should see, even with an iterative shrinkage-thresholding algorithm (ISTA) style solver.
For your gradient computation: X has shape (m, n) and W has shape (n, 1), so (predictions - y) has shape (m, 1). You then multiply it element-wise by X, i.e. (m, 1) by (m, n)? I'm not sure what numpy ends up computing there, but it is not what you want to compute:
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
Here the code should be a bit different: for the derivation to be correct, you want an (n, m) matrix multiplied by an (m, 1) vector, giving an (n, 1) result with the same shape as W:
grad_w = (1/self.m) * np.dot(self.X.T, predictions - self.y)
I am also not sure why you use np.dot (which is a good idea) for the prediction but not for the gradient.
You also do not need so many reshapes:
import numpy as np
from sklearn.datasets import load_boston

A, b = load_boston(return_X_y=True)
n_samples = A.shape[0]
n_features = A.shape[1]

def grad_linreg(x):
    """Least-squares gradient"""
    grad = (1. / n_samples) * np.dot(A.T, np.dot(A, x) - b)
    return grad

def loss_linreg(x):
    """Least-squares loss"""
    f = (1. / (2. * n_samples)) * np.sum((b - np.dot(A, x)) ** 2)
    return f
And then you check that your gradient is good (run it at a few random points; the returned value should be close to zero):

from scipy.optimize import check_grad
from numpy.random import randn

check_grad(loss_linreg, grad_linreg, randn(n_features))
check_grad(loss_linreg, grad_linreg, randn(n_features))
check_grad(loss_linreg, grad_linreg, randn(n_features))
check_grad(loss_linreg, grad_linreg, randn(n_features))
You can then build the model on that.
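For instance, a bare-bones gradient-descent loop on top of these two helpers might look like this (the step size and iteration count are arbitrary placeholders, not tuned for the Boston data):

x = np.zeros(n_features)            # start from the zero vector
step_size = 1e-6                    # tiny step, since the features are not normalized
for _ in range(1000):
    x = x - step_size * grad_linreg(x)
print("loss after 1000 steps:", loss_linreg(x))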
If you want to test this with ISTA/FISTA, logistic/linear regression, and LASSO/ridge, here is a Jupyter notebook with the theory and a working example.
I'm doing a hands-on exercise for learning and have created a model in Python using numpy that is trained on the breast cancer dataset from the sklearn library. The model runs without any error and gives me train and test accuracies of 92.48826291079813% and 90.9090909090909% respectively. However, I'm not able to complete the hands-on, probably because my result is different from what is expected. I don't know where the problem is, because I don't know the right answer, and I don't see any error.
I would appreciate some help with this. The code is given below.
#Import numpy as np and pandas as pd
import numpy as np
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
**Define method initialiseNetwork() to initialise weights with zeros of shape (num_features, 1) and also bias b to zero
parameters: num_features (number of input features)
returns: dictionary of weight vector and bias**
def initialiseNetwork(num_features):
    W = np.zeros((num_features,1))
    b = 0
    parameters = {"W": W, "b": b}
    return parameters
** define function sigmoid for the input z.
parameters: z
returns: $1/(1+e^{(-z)})$ **
def sigmoid(z):
    a = 1/(1 + np.exp(-z))
    return a
** Define method forwardPropagation() which implements forward propagation, defined as Z = (W.T dot_product X) + b, A = sigmoid(Z)
parameters: X, parameters
returns: A **
def forwardPropagation(X, parameters):
    W = parameters["W"]
    b = parameters["b"]
    Z = np.dot(W.T,X) + b
    A = sigmoid(Z)
    return A
** Define function cost() which calculates the cost given by −(sum(Y*log(A)+(1−Y)*log(1−A)))/num_samples, where * is the elementwise product
parameters: A, Y, num_samples (number of samples)
returns: cost **
def cost(A, Y, num_samples):
    cost = -1/num_samples * np.sum(Y*np.log(A) + (1-Y)*(np.log(1-A)))
    #cost = Y*np.log(A) + (1-Y)*(np.log(1-A))
    return cost
** Define method backPropagration() to get the derivatives of weights and bias
parameters: X, Y, A, num_samples
returns: dW, db **
def backPropagration(X, Y, A, num_samples):
    dZ = A - Y
    dW = (np.dot(X,dZ.T))/num_samples  #(X dot_product dZ.T)/num_samples
    db = np.sum(dZ)/num_samples        #sum(dZ)/num_samples
    return dW, db
** Define function updateParameters() to update current parameters with their derivatives
w = w - learning_rate * dw
b = b - learning_rate * db
parameters: parameters, dW, db, learning_rate
returns: dictionary of updated parameters **
def updateParameters(parameters, dW, db, learning_rate):
    W = parameters["W"] - (learning_rate * dW)
    b = parameters["b"] - (learning_rate * db)
    return {"W": W, "b": b}
** Define the model for forward propagation
parameters: X,Y, num_iter(number of iterations), learning_rate
returns: parameters(dictionary of updated weights and bias) **
def model(X, Y, num_iter, learning_rate):
    num_features = X.shape[0]
    num_samples = X.shape[1]
    parameters = initialiseNetwork(num_features)  #call initialiseNetwork()
    for i in range(num_iter):
        #A = forwardPropagation(X, Y, parameters) # calculate final output A from forwardPropagation()
        A = forwardPropagation(X, parameters)
        if(i%100 == 0):
            print("cost after {} iteration: {}".format(i, cost(A, Y, num_samples)))
        dW, db = backPropagration(X, Y, A, num_samples) # calculate derivatives from backpropagation
        parameters = updateParameters(parameters, dW, db, learning_rate) # update parameters
    return parameters
** Run the below cell to define the function to predict the output. It takes updated parameters and input data as function parameters and returns the predicted output **
def predict(X, parameters):
    W = parameters["W"]
    b = parameters["b"]
    b = b.reshape(b.shape[0],1)
    Z = np.dot(W.T,X) + b
    Y = np.array([1 if y > 0.5 else 0 for y in sigmoid(Z[0])]).reshape(1,len(Z[0]))
    return Y
** The code in the below cell loads the breast cancer dataset from sklearn.
The input variable (X_cancer) describes the dimensions of tumor cells and the target variable (y_cancer) classifies a tumor as malignant (0) or benign (1) **
(X_cancer, y_cancer) = load_breast_cancer(return_X_y = True)
** Split the data into train and test set using train_test_split(). Set the random state to 25. Refer the code snippet in topic 4 **
X_train, X_test, y_train, y_test = train_test_split(X_cancer, y_cancer,
                                                    random_state = 25)
** Since the dimensions of tumor is not uniform you need to normalize the data before feeding to the network
The below function is used to normalize the input data. **
def normalize(data):
    col_max = np.max(data, axis = 0)
    col_min = np.min(data, axis = 0)
    return np.divide(data - col_min, col_max - col_min)
** Normalize X_train and X_test and assign it to X_train_n and X_test_n respectively **
X_train_n = normalize(X_train)
X_test_n = normalize(X_test)
** Transpose X_train_n and X_test_n so that rows represent features and columns represent the samples.
Reshape y_train and y_test into row vectors whose length is equal to the number of samples. Use np.reshape() **
X_trainT = X_train_n.T
#print(X_trainT.shape)
X_testT = X_test_n.T
#print(X_testT.shape)
y_trainT = y_train.reshape(1,X_trainT.shape[1])
y_testT = y_test.reshape(1,X_testT.shape[1])
** Train the network using X_trainT,y_trainT with number of iterations 4000 and learning rate 0.75 **
parameters = model(X_trainT, y_trainT, 4000, 0.75) #call the model() function with parametrs mentioned in the above cell
** Predict the output of the train and test data using X_trainT and X_testT with the predict() method. Use the parameters returned from the trained model **
yPredTrain = predict(X_trainT, parameters) # pass weights and bias from the parameters dictionary and X_trainT as input to the function
yPredTest = predict(X_testT, parameters) # pass the same parameters but X_testT as input data
** Run the below cell to print the accuracy of the model on train and test data. **
accuracy_train = 100 - np.mean(np.abs(yPredTrain - y_trainT)) * 100
accuracy_test = 100 - np.mean(np.abs(yPredTest - y_testT)) * 100
print("train accuracy: {} %".format(accuracy_train))
print("test accuracy: {} %".format(accuracy_test))
My Output:
train accuracy: 92.48826291079813 %
test accuracy: 90.9090909090909 %
I figured out where the problem was. It was the third line in the predict function, where I was reshaping the bias, which was not necessary at all.
def predict(X, parameters):
    W = parameters["W"]
    b = parameters["b"]
    # b = b.reshape(b.shape[0],1)   <-- this reshape was unnecessary and is removed
    Z = np.dot(W.T,X) + b
    Y = np.array([1 if y > 0.5 else 0 for y in sigmoid(Z[0])]).reshape(1,len(Z[0]))
    return Y
The third line in the back-propagation function also needed to be corrected to np.sum(dZ)/num_samples.
def backPropagration(X, Y, A, num_samples):
    dZ = A - Y
    dW = (np.dot(X,dZ.T))/num_samples
    db = np.sum(dZ)/num_samples   # <-- corrected line
    return dW, db
After I corrected both functions, the model gave me train accuracy as 98.59154929577464% and test accuracy as 93.00699300699301%.
I've trained a simple machine learning model, a polynomial regression. The pseudocode of the prediction function is as follows:
def f(x):
    """
    x is a np.ndarray of shape (m, )
    """
    # X is stacked of x ** 0, x ** 1, x ** 2, ..., x ** (n - 1) by rows
    # X is of shape of (m, n)
    # m is the number of training examples
    X = generate(x)
    Y = np.dot(X, W)
    return Y
W is the trained parameter vector. Here the shape of Y is (m, 1), but if I return Y.squeeze() instead, with shape (m,), I get a very different standard deviation on the test set: roughly 70 for the former and 8 for the latter.
I use random initialisation, but I've trained and tested many times, and the std of the squeezed version is always much smaller. So I just wonder why.
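For reference, here is a small self-contained sketch of the numpy broadcasting difference between the (m, 1) and (m,) shapes (toy arrays, not my real data); it may be relevant, since my evaluation subtracts the predictions from a 1-D target array:

import numpy as np

ys = np.arange(5.0)                     # shape (5,), like 1-D targets
ys_pred_col = ys.reshape(-1, 1) + 0.1   # shape (5, 1), like Y without squeeze
ys_pred_flat = ys_pred_col.squeeze()    # shape (5,), like Y.squeeze()

print((ys - ys_pred_col).shape)         # (5, 5): broadcasting gives all pairwise differences
print((ys - ys_pred_flat).shape)        # (5,): element-wise residuals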
I show the complete code below, so you can test it yourself. My question concerns the two return statements at the end of pred() (lines 90 and 91 in my script): return Y.T versus return Y.squeeze().
# python: 3.5.2
# encoding: utf-8
# numpy: 1.14.1
import numpy as np
import matplotlib.pyplot as plt


def load_data(filename):
    xys = []
    with open(filename, 'r') as f:
        for line in f:
            xys.append(map(float, line.strip().split()))
        xs, ys = zip(*xys)
        return np.asarray(xs), np.asarray(ys)


def evaluate(ys, ys_pred):
    std = np.sqrt(np.mean(np.abs(ys - ys_pred) ** 2))
    return std


def linear_regression(x_train, y_train, n=2, learning_rate=0.0005, epochs=1000, l2=0, Print=False):
    """
    This target function is: y = b + w1 * x^1 + w2 * x^2 + ...
    also y = b + np.dot(w.T, x)
    :param x_train: np.ndarray
    :param y_train: np.ndarray
    :return: a trained model (as a function), trained by x_train and y_train
    """
    # get the number of training examples
    m = x_train.shape[0]
    # set and initialize parameters here
    # intercept
    b = np.float64(-10)
    # weights
    w = np.float64(np.random.randn(n, 1))
    # convert the x_train matrix to a design matrix
    X = np.zeros((n, m), dtype=np.float64)
    for i in range(n):
        X[i, :] = x_train ** (i + 1)
    X = np.float64(X)
    Y = np.float64(np.reshape(y_train, newshape=(1, m)))
    # if a plot of the training process is needed
    costs = []
    # train on the dataset
    for epoch in range(epochs):
        # compute the gradient of the cost on w
        Z = b + np.dot(w.T, X)
        dZ = Z - Y
        dw = 1./m * np.dot(X, dZ.T)
        db = 1./m * np.squeeze(np.sum(dZ))
        # update the parameters; for w, I also set "weight decay"
        w -= learning_rate * dw + l2 * w
        b -= learning_rate * db
        cost = np.squeeze(0.5/m * np.dot(dZ, dZ.T))
        costs.append(cost)
        if Print == True and epoch % 25 == 0:
            print("Cost after " + str(epoch) + " iterations " + ": " + str(cost))
    # plot the costs
    if Print == True:
        plt.plot(costs)
        plt.show()

    def pred(x):
        assert type(x) is np.ndarray
        m = x.shape[0]
        # convert the x matrix to a design matrix
        X = np.zeros((n, m))
        for i in range(n):
            X[i, :] = x ** (i + 1)
        # to predict
        Y = b + np.dot(w.T, X)
        return Y.T
        # return Y.squeeze()

    return pred


if __name__ == '__main__':
    train_file = 'train.txt'
    test_file = 'test.txt'
    # load data
    x_train, y_train = load_data(train_file)
    x_test, y_test = load_data(test_file)
    print(x_train.shape)
    print(x_test.shape)
    # use a trained linear-regression model
    f = linear_regression(x_train, y_train, n=2, epochs=10000, Print=False, learning_rate=1e-8, l2=5e-2)
    # compute the predictions
    y_test_pred = f(x_test)
    # use the test set to evaluate the model
    std = evaluate(y_test, y_test_pred)
    print('the standard deviation:{:.1f}'.format(std))
    # show the result
    plt.plot(x_train, y_train, 'ro', markersize=3)
    plt.plot(x_test, y_test, 'k')
    plt.plot(x_test, y_test_pred)
    plt.xlabel('x')
    plt.ylabel('y')
    plt.title('Linear Regression')
    plt.legend(['train', 'test', 'pred'])
    plt.show()
I am trying to build an ANN in Python, and I've gotten as far as the forward pass, but I run into a problem when I try to do backward propagation. In my function nnCostFunction, the gradient grad is defined as:
grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
But this is a problem because I am using scipy.optimize.fmin_cg to calculate nn_params and cost, and fmin_cg accepts only a single return value (the J value from my forward pass) and cannot accept grad...
nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
Is there a way to fix this so I can include backward propagation in my network? I know there is a scipy.optimize.minimize function, but I am having some difficulty understanding how to use it and get the results I need. Does anyone know what needs to be done?
Your help is greatly appreciated, thanks.
def nnCostFunction(nn_params, input_layer_size, hidden_layer_size, num_labels, X, y, lam):
    '''
    Given NN parameters, layer sizes, number of labels, data, and learning rate, returns the cost of traversing NN.
    '''
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
    m = X.shape[0]
    n = X.shape[1]
    #forward pass
    y_eye = eye(num_labels)
    y_new = np.zeros((y.shape[0],num_labels))
    for z in range(y.shape[0]):
        y_new[z,:] = y_eye[int(y[z])-1]
    y = y_new
    a_1 = c_[ones((m,1)),X]
    z_2 = tr(Theta1.dot(tr(a_1)))
    a_2 = tr(sigmoid(Theta1.dot(tr(a_1))))
    a_2 = c_[ones((a_2.shape[0],1)), a_2]
    a_3 = tr(sigmoid(Theta2.dot(tr(a_2))))
    J_reg = lam/(2.*m) * (sum(sum(Theta1[:,1:]**2)) + sum(sum(Theta2[:,1:]**2)))
    J = (1./m) * sum(sum(-y*log(a_3) - (1-y)*log(1-a_3))) + J_reg
    #Backprop
    d_3 = a_3 - y
    d_2 = d_3.dot(Theta2[:,1:])*sigmoidGradient(z_2)
    Theta1_grad = 1./m * tr(d_2).dot(a_1)
    Theta2_grad = 1./m * tr(d_3).dot(a_2)
    #Add regularization
    Theta1_grad[:,1:] = Theta1_grad[:,1:] + lam*1.0/m*Theta1[:,1:]
    Theta2_grad[:,1:] = Theta2_grad[:,1:] + lam*1.0/m*Theta2[:,1:]
    #Unroll gradients
    grad = tr(c_[Theta1_grad.swapaxes(1,0).reshape(1,-1), Theta2_grad.swapaxes(1,0).reshape(1,-1)])
    return J, grad
def nn_train(X, y, lam = 1.0, hidden_layer_size = 10):
    '''
    Train neural network given the features and class arrays, learning rate, and size of the hidden layer.
    Return parameters Theta1, Theta2.
    '''
    # NN input and output layer sizes
    input_layer_size = X.shape[1]
    num_labels = unique(y).shape[0] #output layer
    # Initialize NN parameters
    initial_Theta1 = randInitializeWeights(input_layer_size, hidden_layer_size)
    initial_Theta2 = randInitializeWeights(hidden_layer_size, num_labels)
    # Unroll parameters
    initial_nn_params = np.append(initial_Theta1.flatten(1), initial_Theta2.flatten(1))
    initial_nn_params = reshape(initial_nn_params,(len(initial_nn_params),)) #flatten into 1-d array
    # Find and print initial cost:
    J_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[0]
    grad_init = nnCostFunction(initial_nn_params,input_layer_size,hidden_layer_size,num_labels,X,y,lam)[1]
    print 'Initial J cost: ' + str(J_init)
    print 'Initial grad cost: ' + str(grad_init)
    # Implement backprop and train network, run fmin
    print 'Training Neural Network...'
    print 'fmin results:'
    nn_params, cost = op.fmin_cg(lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size, num_labels, X, y, lam), initial_nn_params, gtol = 0.001, maxiter = 40, full_output=1)[0, 1]
    Theta1 = (reshape(nn_params[:(hidden_layer_size*(input_layer_size+1))],(hidden_layer_size,(input_layer_size+1))))
    Theta2 = (reshape(nn_params[((hidden_layer_size*(input_layer_size+1))):],(num_labels, (hidden_layer_size+1))))
    return Theta1, Theta2
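For reference, this is the pattern I'm looking at for scipy.optimize.minimize, where jac=True tells it that the objective returns (cost, gradient). It is only a sketch based on the documentation, not yet verified on my network, and the gradient may also need to be flattened to a 1-D array (e.g. grad.ravel()) to match the shape of initial_nn_params:

from scipy import optimize as op

res = op.minimize(
    fun=lambda t: nnCostFunction(t, input_layer_size, hidden_layer_size,
                                 num_labels, X, y, lam),
    x0=initial_nn_params,
    method='CG',
    jac=True,                                  # objective returns (J, grad)
    options={'gtol': 0.001, 'maxiter': 40},
)
nn_params, cost = res.x, res.fun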
I've got a toy example set up of a linear regression model with one input variable and one output variable. The problem I'm encountering is that the output for the bias is far off from the generated data. If I manually set the bias, it produces a weight and bias which are close enough to the originals.
I've written two pieces of code: gen_data, which generates data, and gradientDescent2, which performs the gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points, 1))
    y = np.zeros(shape=(num_points, 1))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp) + np.random.normal(scale=3.0)
    return (x, y)

# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        loss = (theta * x + bias) - y
        cost = np.mean(loss**2) / 2
        # print('Iteration {} | Cost: {}'.format(i, cost))
        grad_b = np.mean(loss)
        grad_t = np.mean(loss*x)
        # updates
        bias -= learning_rate * grad_b
        theta -= learning_rate * grad_t
    return (theta, bias)
1. If you want to use batch updates, don't set your batch_size equal to your sample size. (I also believe that batch updates are not very suitable for this case.)
2. Your gradient calculation and parameter update are incorrect; the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always try to minimize the loss, so it should be:
if loss > 0:
    bias -= learning_rate * grad_b
    theta -= learning_rate * grad_t
elif loss < 0:
    bias += learning_rate * grad_b
    theta += learning_rate * grad_t
Finally, below is the modified code, which works well.
import numpy as np
import sys

def gen_data(num_points=500, slope=1, bias=10, x_max=50):
    f = lambda z: slope * z + bias
    x = np.zeros(shape=(num_points))
    y = np.zeros(shape=(num_points))
    for i in range(num_points):
        x_temp = np.random.uniform()*x_max
        x[i] = x_temp
        y[i] = f(x_temp) #+ np.random.normal(scale=3.0)
        #print('x:',x[i],' y:',y[i])
    return (x, y)

def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
    theta = np.random.rand()
    bias = np.random.rand()
    for i in range(0, epochs):
        for j in range(len(x)):
            loss = (theta * x[j] + bias) - y[j]
            cost = np.mean(loss**2) / 2
            # print('Iteration {} | Cost: {}'.format(i, cost))
            grad_b = 1
            grad_t = x[j]
            if loss > 0:
                bias -= learning_rate * grad_b
                theta -= learning_rate * grad_t
            elif loss < 0:
                bias += learning_rate * grad_b
                theta += learning_rate * grad_t
    return (theta, bias)

def main():
    x, y = gen_data()
    ta, bias = gradientDescent2(x, y)
    print('theta:', ta)
    print('bias:', bias)

if __name__ == '__main__':
    sys.exit(int(main() or 0))