I'm trying to implement regularized logistic regression using python for the coursera ML class but I'm having a lot of trouble vectorizing it. Using this repository:
I've tried many different ways but never get the correct gradient or cost. Here's my current implementation:
# Hypothesis: sigmoid applied to the linear scores np.dot(X, theta).
# NOTE(review): `utils`, X, y, theta, m and lambda_ are defined outside this snippet.
h = utils.sigmoid( np.dot(X, theta) )
# Cross-entropy cost plus an L2 penalty over theta[1:] (theta[0] is excluded).
J = (-1/m) * ( y.T.dot( np.log(h) ) + (1 - y.T).dot( np.log( 1 - h ) ) ) + ( lambda_/(2*m) ) * np.sum( np.square(theta[1:]) )
# Data-term gradient plus a regularization term.
# NOTE(review): grad_theta_reg is not defined anywhere in this snippet; confirm how it is built.
grad = ((1/m) * (h - y).T.dot( X )).T + grad_theta_reg
Here are the results:
Cost : 0.693147
Expected
cost: 2.534819
Gradients:
[-0.100000, -0.030000, -0.080000, -0.130000]
Expected gradients:
[0.146561, -0.548558, 0.724722, 1.398003]
Any help from someone who knows what's going on would be much appreciated.
Below is a working snippet of a vectorized version of logistic regression. You can see more here: https://github.com/hzitoun/coursera_machine_learning_matlab_python
Main
# Fixed test case: 5 samples, a bias column of ones plus 3 features.
theta_t = np.array([[-2], [-1], [1], [2]])
data = np.arange(1, 16).reshape(3, 5).T
X_t = np.c_[np.ones((5, 1)), data / 10]
y_t = (np.array([[1], [0], [1], [0], [1]]) >= 0.5) * 1
lambda_t = 3

J = lrCostFunction(theta_t, X_t, y_t, lambda_t)
grad = lrGradient(theta_t, X_t, y_t, lambda_t, flattenResult=False)

# The original printed a literal "f" (a leftover "%f" placeholder) next to
# the values; format the numbers explicitly instead.
print('\nCost: {:f}\n'.format(float(J)))
print('Expected cost: 2.534819\n')
print('Gradients:\n')
for g in np.ravel(grad):
    print(' {:f} '.format(g))
print('Expected gradients:\n')
print(' 0.146561\n -0.548558\n 0.724722\n 1.398003\n')
lrCostFunction
from sigmoid import sigmoid
import numpy as np
def lrCostFunction(theta, X, y, reg_lambda):
    """Return the regularized logistic-regression cost for ``theta``.

    ``theta`` is reshaped to a column with one row per column of ``X``.
    The result is the mean cross-entropy of ``sigmoid(X.dot(theta))``
    against ``y`` plus ``reg_lambda / (2 * m)`` times the sum of squared
    ``theta[1:]`` (the first parameter is not penalized). Only the cost is
    returned.
    """
    n_samples, n_params = X.shape
    theta = theta.reshape((n_params, 1))
    prob = sigmoid(X.dot(theta))

    # Contribution of the y == 1 and y == 0 terms of the cross-entropy.
    positive_term = -1 * y * np.log(prob)
    negative_term = (1 - y) * np.log(1 - prob)
    data_cost = (1.0 / n_samples) * np.sum(positive_term - negative_term)

    penalty = (reg_lambda / (2.0 * n_samples)) * np.sum(np.power(theta[1:], 2))
    return data_cost + penalty
lrGradient
from sigmoid import sigmoid
import numpy as np
def lrGradient(theta, X,y, reg_lambda, flattenResult=True):
    """Return the gradient of the regularized logistic-regression cost.

    ``theta`` is reshaped to a column with one row per column of ``X``.
    The first gradient row is left unregularized; every other row gets
    ``(reg_lambda / m) * theta`` added. The result is flattened to 1-D
    when ``flattenResult`` is true, otherwise returned as a column.
    """
    n_samples, n_params = X.shape
    theta = theta.reshape((n_params, 1))

    residual = np.subtract(sigmoid(np.dot(X, theta)), y)
    data_grad = (1.0 / n_samples) * np.dot(X.T, residual)

    bias_row = data_grad[0, :].reshape((1, 1))
    penalized_rows = data_grad[1:] + (reg_lambda / n_samples) * theta[1:]
    full_grad = np.r_[bias_row, penalized_rows]

    return full_grad.flatten() if flattenResult else full_grad
Hope that helped!
Related
import numpy as np
import pandas as pd
import numpy as np
from matplotlib import pyplot as pt
def computeCost(X,y,theta):
    """Return ``1 / (2 * m)`` times the sum of squared entries of ``X * theta - y``.

    ``m`` is ``len(y)``. The product uses the ``*`` operator, so its meaning
    follows the operand types (a matrix product for ``np.matrix`` operands).
    """
    n_samples = len(y)
    residual = X * theta - y
    total = np.sum(np.power(residual, 2))
    return 1 / (2 * n_samples) * total
def gradientDescent(X, y, theta, alpha, num_iters):
    """Run batch gradient descent and return ``(theta, final_cost)``.

    Each of the ``num_iters`` steps subtracts ``(alpha / m) * (s.T * X).T``
    from ``theta``, where ``s = X * theta - y`` and ``m = len(y)``. The
    products use ``*``, so ``X``, ``y`` and ``theta`` are expected to be
    ``np.matrix`` objects for it to act as a matrix product.

    The second return value is ``computeCost`` evaluated at the final
    ``theta``. It is computed once after the loop, so it is also defined
    when ``num_iters`` is 0 (the original left it unbound in that case and
    allocated a history array it never filled).
    """
    m = len(y)
    for _ in range(num_iters):
        residual = X * theta - y
        theta = theta - (alpha / m) * (residual.T * X).T
    return theta, computeCost(X, y, theta)
# Driver script: load the data, check computeCost at two theta values, then run gradient descent.
# NOTE(review): this file handle is never read or closed in the visible code; pd.read_csv below opens the file itself.
data = open(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt')
# Read the CSV (no header row) into an ndarray.
data1=np.array(pd.read_csv(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt',header=None))
# Column 1 is the target, stored as an (m, 1) matrix.
y =np.array(data1[:,1])
m=len(y)
y=np.asmatrix(y.reshape(m,1))
# Column 0 is the single feature; a column of ones is inserted in front of it.
X = np.array([data1[:,0]]).reshape(m,1)
X = np.asmatrix(np.insert(X,0,1,axis=1))
theta=np.zeros((2,1))
iterations = 1500
alpha = 0.01;
print('Testing the cost function ...')
J = computeCost(X, y, theta)
print('With theta = [0 , 0]\nCost computed = ', J)
print('Expected cost value (approx) 32.07')
# NOTE(review): the message below says theta = [-1, 2], but this builds a 2x2 matrix,
# so X*theta has two columns. A 2x1 column vector is presumably what was intended.
theta=np.asmatrix([[-1,0],[1,2]])
J = computeCost(X, y, theta)
print('With theta = [-1 , 2]\nCost computed =', J)
print('Expected cost value (approx) 54.24')
# theta is not reset here, so gradient descent starts from the 2x2 matrix assigned above.
theta,JJ = gradientDescent(X, y, theta, alpha, iterations)
print('Theta found by gradient descent:')
print(theta)
print('Expected theta values (approx)')
print(' -3.6303\n 1.1664\n')
# Prediction for the input row [1, 3.5], scaled by 10000 for printing.
predict1 = [1, 3.5] *theta
print(predict1*10000)
Result:
Testing the cost function ...
With theta = [0 , 0]
Cost computed = 32.072733877455676
Expected cost value (approx) 32.07
With theta = [-1 , 2]
Cost computed = 69.84811062494227
Expected cost value (approx) 54.24
Theta found by gradient descent:
[[-3.70304726 -3.64357517]
[ 1.17367146 1.16769684]]
Expected theta values (approx)
-3.6303
1.1664
[[4048.02858742 4433.63790186]]
There are two problems: the first computed cost is right, but the second one is wrong, and the theta returned by gradient descent has 4 elements (it is supposed to have two).
When you mention "With theta = [-1 , 2]"
and you enter
theta=np.asmatrix([[-1,0],[1,2]])
I think this is incorrect. Assuming that you have single feature and you added a column of 1, and you are trying to do simple linear regression
The correct way should be
np.array([-1,2])
Also, where you have
predictions= X*theta-y
It would be better if you did
np.dot(X,theta)-y
With plain NumPy arrays, `*` multiplies element-wise, which is not the same operation as the matrix product that `np.dot` computes.
I am going through Andrew Ng's ML course, and I am trying to implement the programs in Python. For the second exercise, on logistic regression, I am trying to use scipy.optimize.minimize to optimize the cost function. My code is as follows.
import os
import numpy as np
from scipy.special import expit
from scipy import optimize
# Load the comma-separated exam data: two score columns, then the label.
datafile1 = os.path.join('data', 'ex2data1.txt')
data1 = np.loadtxt(datafile1, delimiter=',')
exam_scores = data1[:, :2]
results = data1[:, 2]
m, n = exam_scores.shape
# Prepend a column of ones to the score columns.
exam_scores = np.concatenate((np.ones([m, 1]), exam_scores), axis=1)
def cost_function(x, y, theta):
    """Return the mean cross-entropy of ``expit(np.dot(x, theta))`` against ``y``.

    The mean is taken over ``len(y)`` samples. Note the parameter order:
    ``theta`` is the last argument.
    """
    n_samples = len(y)
    prob = expit(np.dot(x, theta))
    positive_part = -np.dot(y.T, np.log(prob)) / n_samples
    negative_part = -np.dot((1 - y).T, np.log(1 - prob)) / n_samples
    return positive_part + negative_part
def gradient(x, y, theta):
    """Return ``np.dot(h - y, x) / len(y)`` where ``h = expit(np.dot(x, theta))``.

    Note the parameter order: ``theta`` is the last argument.
    """
    n_samples = len(y)
    residual = expit(np.dot(x, theta)) - y
    return np.dot(residual, x) / n_samples
def minimize_cost(x, y, theta):
    """Minimize ``cost_function`` over theta, starting from ``theta``.

    Returns ``(output.fun, output.x)``: the final cost first, then the
    optimized parameter vector.

    ``optimize.minimize`` calls its objective and ``jac`` as
    ``f(params, *args)``, with the parameters being optimized first.
    ``cost_function`` and ``gradient`` take theta last, so passing them
    directly with ``args=(x, y)`` binds theta to ``x`` and causes a
    shape-mismatch error. The lambdas below put theta in the right slot.
    """
    output = optimize.minimize(
        lambda params: cost_function(x, y, params),
        theta,
        jac=lambda params: gradient(x, y, params),
        options={'maxiter': 400},
    )
    return output.fun, output.x
# Start from all-zero parameters (one per column, including the bias).
theta = np.zeros(n + 1)
# minimize_cost returns (final cost, optimized theta), in that order.
cost, theta = minimize_cost(exam_scores, results, theta)
This gives me
<ipython-input-42-e2ba65cce1d8> in gradient(x, y, theta)
9 def gradient(x, y, theta):
10 m = len(y)
---> 11 hypothesis = expit(np.dot(x, theta))
12 return np.dot(hypothesis - y, x) / m
ValueError: shapes (3,) and (100,) not aligned: 3 (dim 0) != 100 (dim 0).
However the shape of theta and the output of the gradient function is the same, i.e. theta.shape == gradient(exam_scores, results, theta).shape gives me True.
I do not understand why is the gradient function raising a ValueError when called from minimize since by itself it is giving the expected output.
Any pointers would be appreciated.
P.S. Here is a part of the data.
exam_scores[:5, :]
array([[34.62365962, 78.02469282],
[30.28671077, 43.89499752],
[35.84740877, 72.90219803],
[60.18259939, 86.3085521 ],
[79.03273605, 75.34437644]])
results.reshape(m, 1)[:5, :]
array([[0.],
[0.],
[0.],
[1.],
[1.]])
Edit: Added part of the data.
I'm looking to use multivariate regression with least squares as my cost function to find a, b, c for ax^2 + bx + c that best fits cos(x) on (-2, 2). My cost won't decrease and is ridiculously high. What am I doing wrong?
# Fit a*x^2 + b*x + c to cos(x) on [-2, 2] by gradient descent with a fixed step of 0.0001.
x = np.linspace(-2,2,100)
y = np.cos(x)
# theta holds (a, b, c) as a (3, 1) array, so theta[k] is a length-1 array, not a scalar.
theta = np.random.random((3,1))
m = len(y)
# NOTE(review): indentation was lost; the statements up to the theta updates are presumably
# the loop body, and print(theta) presumably runs once after the loop. Confirm.
for i in range(10000):
#Calculate my y_hat
# NOTE(review): each list element is a length-1 array, so y_hat has shape (100, 1) and
# y_hat - y below broadcasts against the (100,) y to (100, 100).
y_hat = np.array([(theta[0]*(a**2) + theta[1]*a + theta[2]) for a in x])
#Calculate my cost based off y_hat and y
cost = np.sum((y_hat - y) ** 2) * (1/m)
#Calculate my derivatives based off y_hat and x
da = (2 / m) * np.sum((y_hat - y) * (x**2))
db = (2 / m) * np.sum((y_hat - y) * (x))
dc = (2 / m) * np.sum((y_hat - y))
#update step
theta[0] = theta[0] - 0.0001*(da)
theta[1] = theta[1] - 0.0001*(db)
theta[2] = theta[2] - 0.0001*(dc)
print("Epoch Num: {} Cost: {}".format(i, cost))
print(theta)
Your calculation of y_hat is slightly incorrect. It's currently a 2D array of shape (100, 1), so `y_hat - y` broadcasts against the 1D `y` instead of subtracting element by element.
This should help. It pulls the zeroth element from each of the rows:
# Evaluate the quadratic at every sample; each entry is a length-1 array.
theta_ = [theta[0] * a ** 2 + theta[1] * a + theta[2] for a in x]
# Take element 0 of each entry so y_hat is one-dimensional.
y_hat = np.array([entry[0] for entry in theta_])
I am trying to code logistic regression from scratch. In this code I have, I thought my cost derivative was my regularization, but I've been tasked with adding L1norm regularization. How do you add this in python? Should this be added where I have defined the cost derivative? Any help in the right direction is appreciated.
def Sigmoid(z):
    """Return the logistic function ``1 / (1 + exp(-z))`` of ``z``."""
    denominator = 1 + np.exp(-z)
    return 1 / denominator
def Hypothesis(theta, X):
    """Return ``Sigmoid`` applied to the matrix product ``X @ theta``."""
    # The source had ``X # theta``: the ``@`` operator was garbled into a
    # comment marker, which swallowed the closing parenthesis.
    return Sigmoid(X @ theta)
def Cost_Function(X,Y,theta,m):
    """Return the cross-entropy of ``Hypothesis(theta, X)`` against ``Y``, divided by ``m``.

    ``Y`` is reshaped to a single column before the comparison.
    """
    prob = Hypothesis(theta, X)
    target = Y.reshape(-1, 1)
    per_sample = -target * np.log(prob) - (1 - target) * np.log(1 - prob)
    return 1 / float(m) * np.sum(per_sample)
def Cost_Function_Derivative(X,Y,theta,m,alpha):
    """Return the scaled gradient step ``alpha / m * X.T @ (h - y)``.

    ``h`` is ``Hypothesis(theta, X)`` and ``y`` is ``Y`` reshaped to a
    single column. The learning rate ``alpha`` is already folded into the
    returned value.
    """
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    # The source had ``X.T # (hi - _y)``: the ``@`` operator was garbled into
    # a comment marker, which silently dropped the residual from the product.
    return alpha / float(m) * X.T @ (hi - _y)
def Gradient_Descent(X,Y,theta,m,alpha):
    """Return ``theta`` minus the value of ``Cost_Function_Derivative``."""
    step = Cost_Function_Derivative(X, Y, theta, m, alpha)
    return theta - step
def Accuracy(theta):
    """Print the percentage of ``X_test`` rows classified correctly.

    A row is predicted positive when ``Hypothesis(theta, X_test)`` exceeds
    0.5. Predictions are compared with the module-level ``Y_test`` reshaped
    to one column. Nothing is returned.
    """
    n_test = len(X_test)
    predicted_positive = Hypothesis(theta, X_test) > 0.5
    labels = Y_test.reshape(-1, 1)
    hits = predicted_positive == labels
    percent = (np.sum(hits) / n_test) * 100
    print('LR Accuracy: ', percent, "%")
# Train by calling Gradient_Descent num_iters times, then report accuracy via Accuracy(theta).
# NOTE(review): indentation was lost in this listing; Accuracy(theta) presumably runs once
# after the loop rather than inside it. Confirm against the original.
def Logistic_Regression(X,Y,alpha,theta,num_iters):
m = len(Y)
for x in range(num_iters):
new_theta = Gradient_Descent(X,Y,theta,m,alpha)
theta = new_theta
if x % 100 == 0:
# The two lines below are a bare `print` followed by a commented-out argument list,
# so the theta and cost values are not printed.
print #('theta: ', theta)
print #('cost: ', Cost_Function(X,Y,theta,m))
Accuracy(theta)
# Hyperparameters for the training run.
alpha = 0.5
iterations = 10000

# Initial weights drawn uniformly from [-ep, ep), one per column of X_train.
ep = .012
initial_theta = np.random.rand(X_train.shape[1], 1) * 2 * ep - ep

Logistic_Regression(X_train, Y_train, alpha, initial_theta, iterations)
Regularization adds a term to the cost function so that there is a compromise between minimizing the cost and minimizing the model parameters, which reduces overfitting. You control how strong that compromise is by scaling the regularization term with a scalar e.
So just add the L1 norm of theta to the original cost function:
# Add the L1 penalty: e times the sum of absolute values of theta.
J = J + e * np.sum(np.abs(theta))
Since this term is added to the cost function, then it should be considered when computing the gradient of the cost function.
This is simple, since the derivative of a sum is the sum of the derivatives. So we just need to figure out the derivative of the term sum(abs(theta)). Since each abs(theta_i) is piecewise linear, its derivative is piecewise constant: it is 1 if theta_i >= 0 and -1 if theta_i < 0 (strictly speaking the derivative is undefined at 0, but we don't care about that here).
So in the function Cost_Function_Derivative we add:
# Gradient of e * sum(abs(theta)): +1 where theta >= 0 and -1 where theta < 0.
# A plain boolean mask would give 0 (no shrinkage) for negative weights.
J = J + alpha * e * np.where(theta >= 0, 1.0, -1.0)
Last night I wrote a simple binary logistic regression python code.
It seems to be working correctly (likelihood increases with each iteration, and I get good classification results).
My problem is that I can only initialize my weights with W = np.random.randn(n+1, 1) normal distribution.
But I don't want normal distribution, I want uniform distribution. But when I do that, I get the error
"RuntimeWarning: divide by zero encountered in log
return np.dot(Y.T, np.log(predictions)) + np.dot((onesVector - Y).T, np.log(onesVector - predictions))"
this is my code
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(x):
    """Return the logistic function ``1 / (1 + exp(-x))`` of ``x``."""
    denominator = 1 + np.exp(-x)
    return 1 / denominator
def predict(X, W):
    """Return ``sigmoid`` applied to the product ``np.dot(X, W)``."""
    scores = np.dot(X, W)
    return sigmoid(scores)
def logLikelihood(X, Y, W):
    """Return the Bernoulli log-likelihood of labels ``Y`` under weights ``W``.

    Computes ``Y.T . log(p) + (1 - Y).T . log(1 - p)`` with
    ``p = sigmoid(X . W)``. ``Y`` is expected to be an ``(m, 1)`` column of
    0/1 labels, in which case the result has shape ``(1, 1)``.

    The logs are evaluated directly from the scores ``z = X . W`` using
    ``log(sigmoid(z)) = -logaddexp(0, -z)`` and
    ``log(1 - sigmoid(z)) = -logaddexp(0, z)``. Forming ``p`` first lets it
    round to exactly 0 or 1 for large ``|z|`` (for example with all-ones or
    uniform initial weights on unscaled inputs), which makes ``np.log``
    return ``-inf`` with a divide-by-zero warning and the dot product
    return ``nan``.
    """
    scores = np.dot(X, W)
    log_p = -np.logaddexp(0, -scores)
    log_not_p = -np.logaddexp(0, scores)
    return np.dot(Y.T, log_p) + np.dot((1 - Y).T, log_not_p)
def gradient(X, Y, W):
    """Return ``np.dot(X.T, Y - predict(X, W))``."""
    residual = Y - predict(X, W)
    return np.dot(X.T, residual)
def successRate(X, Y, W):
    """Return the percentage of entries where ``predict(X, W) > 0.5`` equals ``Y``."""
    predicted_positive = predict(X, W) > 0.5
    hits = (Y == predicted_positive)
    n_rows = float(hits.shape[0])
    return 100 * np.sum(hits) / n_rows
# Load the train/test splits from .npy files.
trX = np.load("binaryMnistTrainX.npy")
trY = np.load("binaryMnistTrainY.npy")
teX = np.load("binaryMnistTestX.npy")
teY = np.load("binaryMnistTestY.npy")

m, n = trX.shape

# Append a column of ones to both feature matrices.
trX = np.concatenate((trX, np.ones((m, 1))), axis=1)
teX = np.concatenate((teX, np.ones((teX.shape[0], 1))), axis=1)

# Standard-normal initial weights, one per column including the ones column.
W = np.random.randn(n + 1, 1)
learningRate = 0.00001
numIter = 500
likelihoodArray = np.zeros((numIter, 1))

# Step W along gradient(...) each iteration and record the log-likelihood.
for i in range(numIter):
    W = W + learningRate * gradient(trX, trY, W)
    likelihoodArray[i, 0] = logLikelihood(trX, trY, W)

print("train success rate is %lf" % (successRate(trX, trY, W)))
print("test success rate is %lf" % (successRate(teX, teY, W)))

plt.plot(likelihoodArray)
plt.show()
If i initialize my W to be zeros or randn then it works.
If I initialize it to random (not normal) or ones, then I get the division by zero thing.
Why does this happen and how can I fix it?