Last night I wrote a simple binary logistic regression python code.
It seems to be working correctly (likelihood increases with each iteration, and I get good classification results).
My problem is that I can only initialize my weights with W = np.random.randn(n+1, 1) normal distribution.
But I don't want normal distribution, I want uniform distribution. But when I do that, I get the error
"RuntimeWarning: divide by zero encountered in log
return np.dot(Y.T, np.log(predictions)) + np.dot((onesVector - Y).T, np.log(onesVector - predictions))"
this is my code
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(x):
return 1/(1+np.exp(-x))
def predict(X, W):
return sigmoid(np.dot(X, W))
def logLikelihood(X, Y, W):
m = X.shape[0]
predictions = predict(X, W)
onesVector = np.ones((m, 1))
return np.dot(Y.T, np.log(predictions)) + np.dot((onesVector - Y).T, np.log(onesVector - predictions))
def gradient(X, Y, W):
return np.dot(X.T, Y - predict(X, W))
def successRate(X, Y, W):
m = Y.shape[0]
predictions = predict(X, W) > 0.5
correct = (Y == predictions)
return 100 * np.sum(correct)/float(correct.shape[0])
trX = np.load("binaryMnistTrainX.npy")
trY = np.load("binaryMnistTrainY.npy")
teX = np.load("binaryMnistTestX.npy")
teY = np.load("binaryMnistTestY.npy")
m, n = trX.shape
trX = np.concatenate((trX, np.ones((m, 1))),axis=1)
teX = np.concatenate((teX, np.ones((teX.shape[0], 1))),axis=1)
W = np.random.randn(n+1, 1)
learningRate = 0.00001
numIter = 500
likelihoodArray = np.zeros((numIter, 1))
for i in range(0, numIter):
W = W + learningRate * gradient(trX, trY, W)
likelihoodArray[i, 0] = logLikelihood(trX, trY, W)
print("train success rate is %lf" %(successRate(trX, trY, W)))
print("test success rate is %lf" %(successRate(teX, teY, W)))
plt.plot(likelihoodArray)
plt.show()
If i initialize my W to be zeros or randn then it works.
If I initialize it to random (not normal) or ones, then I get the division by zero thing.
Why does this happen and how can I fix it?
Related
I'm currently working on Andrew Ng's gradient descent exercise using python but keeps getting me the wrong optimal theta. I followed this vectorization cheatsheet for gradient descent --- https://medium.com/ml-ai-study-group/vectorized-implementation-of-cost-functions-and-gradient-vectors-linear-regression-and-logistic-31c17bca9181.
Here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def cost_func(X, Y, theta):
m = len(X)
H = X.dot(theta)
J = 1/(2*m) * (H - Y).T.dot(H - Y)
return J
def gradient_descent(X, Y, alpha=0.01, iterations=1500):
#initializing theta as a zero vector
theta = np.zeros(X.shape[1])
#initializing the a list of cost function value
J_list = [cost_func(X, Y, theta)]
m = len(X)
while iterations > 0:
H = X.dot(theta)
delta = (1/m)*X.T.dot(H - Y)
theta = theta - alpha * delta
iterations -= 1
J_list.append(cost_func(X, Y, theta))
return theta, J_list
def check_convergence(J_list):
plt.plot(range(len(J_list)), J_list)
plt.xlabel('Iterations')
plt.ylabel('Cost J')
plt.show()
file_name_1 = 'https://raw.githubusercontent.com/kaleko/CourseraML/master/ex1/data/ex1data1.txt'
df1 = pd.read_csv(file_name_1, header=None)
X = df1.values[:, 0]
Y = df1.values[:, 1]
m = len(X)
X = np.column_stack((np.ones(m), X))
theta_optimal, J_list = gradient_descent(X, Y, 0.01, 1500)
print(theta_optimal)
check_convergence(J_list)
My theta output is [-3.63029144 1.16636235], which is incorrect.
Here is my cost function graph. As you see, it converges way too quickly.
The correct graph should look like.
Thank you.
I'm trying to implement regularized logistic regression using python for the coursera ML class but I'm having a lot of trouble vectorizing it. Using this repository:
I've tried many different ways but never get the correct gradient or cost heres my current implementation:
h = utils.sigmoid( np.dot(X, theta) )
J = (-1/m) * ( y.T.dot( np.log(h) ) + (1 - y.T).dot( np.log( 1 - h ) ) ) + ( lambda_/(2*m) ) * np.sum( np.square(theta[1:]) )
grad = ((1/m) * (h - y).T.dot( X )).T + grad_theta_reg
Here are the results:
Cost : 0.693147
Expected
cost: 2.534819
Gradients:
[-0.100000, -0.030000, -0.080000, -0.130000]
Expected gradients:
[0.146561, -0.548558, 0.724722, 1.398003]
Any help from someone who knows whats going on would be much appreciated.
Bellow a working snippet of a vectorized version of Logistic Regression. You can see more here https://github.com/hzitoun/coursera_machine_learning_matlab_python
Main
theta_t = np.array([[-2], [-1], [1], [2]])
data = np.arange(1, 16).reshape(3, 5).T
X_t = np.c_[np.ones((5,1)), data/10]
y_t = (np.array([[1], [0], [1], [0], [1]]) >= 0.5) * 1
lambda_t = 3
J, grad = lrCostFunction(theta_t, X_t, y_t, lambda_t), lrGradient(theta_t, X_t, y_t, lambda_t, flattenResult=False)
print('\nCost: f\n', J)
print('Expected cost: 2.534819\n')
print('Gradients:\n')
print(' f \n', grad)
print('Expected gradients:\n')
print(' 0.146561\n -0.548558\n 0.724722\n 1.398003\n')
lrCostFunction
from sigmoid import sigmoid
import numpy as np
def lrCostFunction(theta, X, y, reg_lambda):
"""LRCOSTFUNCTION Compute cost and gradient for logistic regression with
regularization
J = LRCOSTFUNCTION(theta, X, y, lambda) computes the cost of using
theta as the parameter for regularized logistic regression and the
gradient of the cost w.r.t. to the parameters.
"""
m, n = X.shape #number of training examples
theta = theta.reshape((n,1))
prediction = sigmoid(X.dot(theta))
cost_y_1 = (1 - y) * np.log(1 - prediction)
cost_y_0 = -1 * y * np.log(prediction)
J = (1.0/m) * np.sum(cost_y_0 - cost_y_1) + (reg_lambda/(2.0 * m)) * np.sum(np.power(theta[1:], 2))
return J
lrGradient
from sigmoid import sigmoid
import numpy as np
def lrGradient(theta, X,y, reg_lambda, flattenResult=True):
m,n = X.shape
theta = theta.reshape((n,1))
prediction = sigmoid(np.dot(X, theta))
errors = np.subtract(prediction, y)
grad = (1.0/m) * np.dot(X.T, errors)
grad_with_regul = grad[1:] + (reg_lambda/m) * theta[1:]
firstRow = grad[0, :].reshape((1,1))
grad = np.r_[firstRow, grad_with_regul]
if flattenResult:
return grad.flatten()
return grad
Hope that helped!
I implemented a simple linear regression and I want to try it out by fitting a non linear model
specifically I am trying to fit a model for the function y = x^3 + 5 for example
this is my code
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
def predict(X,W):
return np.dot(X,W)
def gradient(X, Y, W, regTerm=0):
return (-np.dot(X.T, Y) + np.dot(np.dot(X.T,X),W))/(m*k) + regTerm * W /(n*k)
def cost(X, Y, W, regTerm=0):
m, k = Y.shape
n, k = W.shape
Yhat = predict(X, W)
return np.trace(np.dot(Y-Yhat,(Y-Yhat).T))/(2*m*k) + regTerm * np.trace(np.dot(W,W.T)) / (2*n*k)
def Rsquared(X, Y, W):
m, k = Y.shape
SSres = cost(X, Y, W)
Ybar = np.mean(Y,axis=0)
Ybar = np.matlib.repmat(Ybar, m, 1)
SStot = np.trace(np.dot(Y-Ybar,(Y-Ybar).T))
return 1-SSres/SStot
m = 10
n = 200
k = 1
trX = np.random.rand(m, n)
trX[:, 0] = 1
for i in range(2, n):
trX[:, i] = trX[:, 1] ** i
trY = trX[:, 1] ** 3 + 5
trY = np.reshape(trY, (m, k))
W = np.random.rand(n, k)
numIter = 10000
learningRate = 0.5
for i in range(0, numIter):
W = W - learningRate * gradient(trX, trY, W)
domain = np.linspace(0,1,100000)
powerDomain = np.copy(domain)
m = powerDomain.shape[0]
powerDomain = np.reshape(powerDomain, (m, 1))
powerDomain = np.matlib.repmat(powerDomain, 1, n)
for i in range(1, n):
powerDomain[:, i] = powerDomain[:, 0] ** i
print(Rsquared(trX, trY, W))
plt.plot(trX[:, 1],trY,'o', domain, predict(powerDomain, W),'r')
plt.show()
the R^2 I'm getting is very close to 1, meaning I found a very good fit to the training data, but it isn't shown on the plots. When I plot the data, it usually looks like this:
it looks as if I'm underfitting the data, but with such a complex hypothesis, with 200 features (meaning i allow polynomials up to x^200) and only 10 training examples, I should very clearly be overfitting data, so I expect the red line to pass through all the blue points and go wild between them.
This isn't what I'm getting which is confusing to me.
What's wrong?
You forgot to set powerDomain[:,0]=1, that's why your plot goes wrong at 0. And yes you are over fitting: look how quickly your plot fires up as soon as you get out of your training domain.
I have implemented logistic regression in Python. I think there is some bug in the code. I am not able to get the correct accuracy for the testset.
Here is the code:
from __future__ import division
import numpy as np
from math import *
import os, sys
class LogisticRegressionModel:
def __init__(self, n):
self.n = n
self.theta = np.zeros((n+1, 1))
print(self.theta)
def SGD(self, trainingSet, epochs, minibatchsize, eta):
m = len(trainingSet)
for epoch in range(epochs):
derSum = np.zeros(self.theta.shape)
for xi, yi in trainingSet:
xi = np.concatenate(([[1]], xi), axis=0)
#print(xi)
hi = self.sigmoid(np.dot(np.transpose(self.theta), xi))
derSum = derSum + (hi-yi)*xi
self.theta = self.theta - eta/m*derSum
print(self.cost(trainingSet))
def cost(self, dataset):
totCost=0
for xi, yi in dataset:
xi = np.concatenate(([[1]], xi), axis=0)
hi = self.sigmoid(np.dot(np.transpose(self.theta), xi))
totCost += -1*(yi*log(hi)+(1-yi)*log(1-hi))
return totCost/len(dataset)
def sigmoid(self, z):
return 1.0/(1.0+np.exp(-1*z))
def evaluate(self, testSet):
mtest = len(testSet)
count=0
for xi, yi in testSet:
xi = np.concatenate(([[1]], xi), axis=0)
hi = self.sigmoid(np.dot(self.theta.transpose(), xi))
#print(str(hi[0, 0])+" "+str(yi))
if hi>=0.5:
hi=1
else:
hi=0
if yi==hi:
count+=1
print(count/mtest*100)
The LR is a two-class classifier. The data set is having a linear decision boundary and I tested it using Octave which is giving an accuracy of more that 95%. But the above implementation stucks around 60%. I also tried changing the learning rate and other things. But that doesn't help.
Assuming that your training data is a list containing pairs like ([feature1,...,featuren], label), the following code seems to work fine for me. It's a modification of your code, except that I put things in array form where it was appropriate:
from __future__ import division
import numpy as np
def sigmoid(z):
return 1/(1+np.exp(-z))
def log_loss(y,ypred):
return -(y*np.log(ypred) + (1-y)*np.log(1-ypred)).mean()
class LogisticRegressionModel:
def __init__(self, n):
self.n = n
self.theta = np.zeros((1,n+1))
print(self.theta)
def SGD(self, trainingSet, epochs, minibatchsize, eta):
m = len(trainingSet)
X = np.ones((self.n+1,m))
Y = np.zeros((1,m))
for i, (xi, yi) in enumerate(trainingSet):
X[1:,i] = xi
Y[:,i] = yi
for epoch in xrange(epochs):
H = sigmoid(self.theta.dot(X))
derSum = (H-Y).dot(X.T)
self.theta -= eta * derSum/m
print(log_loss(Y,H))
def evaluate(self, testSet):
mtest = len(testSet)
X = np.ones((self.n+1,mtest))
Y = np.zeros((1,mtest))
for i, (xi, yi) in enumerate(testSet):
X[1:,i] = xi
Y[:,i] = yi
H = sigmoid(self.theta.dot(X))
H = (H >= 0.5)
print((H == Y).mean() * 100)
I'm not sure what's broken in your code, since this should work identically to yours (save for places where reloading the data is redundant).
I've been reading Bishop's book on machine learning, and I'm trying to implement the backpropagation algorithm for a neural network, but it's not finding a solution. The code is below. I've broken it down into the network code and the testing code.
import numpy as np
from collections import namedtuple
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Network code
def tanh(x):
return np.tanh(x)
def dtanh(x):
return 1 - np.tan(x)**2
def identity(x):
return x
def unpack_weights(w, D, M, K):
"""
len(w) = (D + 1)*M + (M + 1)*K, where
D = number of inputs, excluding bias
M = number of hidden units, excluding bias
K = number of output units
"""
UnpackedWeights = namedtuple("UpackedWeights", ["wHidden", "wOutput"])
cutoff = M*(D + 1)
wHidden = w[:cutoff].reshape(M, D + 1)
wOutput = w[cutoff:].reshape(K, M + 1)
return UnpackedWeights(wHidden=wHidden, wOutput=wOutput)
def compute_output(x, weights, fcnHidden=tanh, fcnOutput=identity):
NetworkResults = namedtuple("NetworkResults", ["hiddenAct", "hiddenOut", "outputAct", "outputOut"])
xBias = np.vstack((1., x))
hiddenAct = weights.wHidden.dot(xBias)
hiddenOut = np.vstack((1., fcnHidden(hiddenAct)))
outputAct = weights.wOutput.dot(hiddenOut)
outputOut = fcnOutput(outputAct)
return NetworkResults(hiddenAct=hiddenAct, hiddenOut=hiddenOut, outputAct=outputAct,
outputOut=outputOut)
def backprop(t, x, M, fcnHidden=tanh, fcnOutput=identity, dFcnHidden=dtanh):
maxIter = 10000
learningRate = 0.2
N, K = t.shape
N, D = x.shape
nParams = (D + 1)*M + (M + 1)*K
w0 = np.random.uniform(-0.1, 0.1, nParams)
for _ in xrange(maxIter):
sse = 0.
for n in xrange(N):
weights = unpack_weights(w0, D, M, K)
# Compute net output
netResults = compute_output(x=x[n].reshape(-1, 1), weights=weights,
fcnHidden=fcnHidden, fcnOutput=fcnOutput)
# Compute derivatives of error function wrt wOutput
outputDelta = netResults.outputOut - t[n].reshape(K, 1)
outputDerivs = outputDelta.dot(netResults.hiddenOut.T)
# Compute derivateives of error function wrt wHidden
hiddenDelta = dFcnHidden(netResults.hiddenAct)*(weights.wOutput[:, 1:].T.dot(outputDelta))
xBias = np.vstack((1., x[n].reshape(-1, 1)))
hiddenDerivs = hiddenDelta.dot(xBias.T)
delErr = np.hstack((np.ravel(hiddenDerivs), np.ravel(outputDerivs)))
w1 = w0 - learningRate*delErr
w0 = w1
sse += np.sum(outputDelta**2)
return w0
# Testing code
def generate_test_data():
D, M, K, N = 1, 3, 1, 25
x = np.sort(np.random.uniform(-1., 1., (N, D)), axis=0)
t = 1.0 + x**2
return D, M, K, N, x, t
def test_backprop():
D, M, K, N, x, t = generate_test_data()
return backprop(t, x, M)
def scipy_solution(t, x, D, M, K, N, method="BFGS"):
def obj_fn(w):
weights = unpack_weights(w, D, M, K)
err = 0
for n in xrange(N):
netOut = compute_output(x[n], weights=weights)
err += (netOut.outputOut[0, 0] - t[n])**2
return err
w0 = np.random.uniform(-1, 1, (D + 1)*M + (M + 1)*K)
return opt.minimize(obj_fn, w0, method=method)
When I use the optimize module in scipy (i.e., the scipy_solution() function) to find the network weights, the sum of squared errors gets very close to zero, and the output of the network looks like the data I generated. When I use my backpropagation function, the sum of squared errors gets stuck between 2.0 and 3.0, and the network output looks almost linear. Moreover, when I feed the scipy solution for the weights to my backprop function as the starting value, my backprop function still doesn't find the right solution.
I've been stuck on this for a couple of days, so I'd really appreciate any tips anyone has. Thanks.
def dtanh(x):
return 1 - np.tan(x)**2
should be
def dtanh(x):
return 1 - np.tanh(x)**2