I implemented a simple linear regression and I want to try it out by fitting a non linear model
specifically I am trying to fit a model for the function y = x^3 + 5 for example
this is my code
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
def predict(X,W):
return np.dot(X,W)
def gradient(X, Y, W, regTerm=0):
return (-np.dot(X.T, Y) + np.dot(np.dot(X.T,X),W))/(m*k) + regTerm * W /(n*k)
def cost(X, Y, W, regTerm=0):
m, k = Y.shape
n, k = W.shape
Yhat = predict(X, W)
return np.trace(np.dot(Y-Yhat,(Y-Yhat).T))/(2*m*k) + regTerm * np.trace(np.dot(W,W.T)) / (2*n*k)
def Rsquared(X, Y, W):
m, k = Y.shape
SSres = cost(X, Y, W)
Ybar = np.mean(Y,axis=0)
Ybar = np.matlib.repmat(Ybar, m, 1)
SStot = np.trace(np.dot(Y-Ybar,(Y-Ybar).T))
return 1-SSres/SStot
m = 10
n = 200
k = 1
trX = np.random.rand(m, n)
trX[:, 0] = 1
for i in range(2, n):
trX[:, i] = trX[:, 1] ** i
trY = trX[:, 1] ** 3 + 5
trY = np.reshape(trY, (m, k))
W = np.random.rand(n, k)
numIter = 10000
learningRate = 0.5
for i in range(0, numIter):
W = W - learningRate * gradient(trX, trY, W)
domain = np.linspace(0,1,100000)
powerDomain = np.copy(domain)
m = powerDomain.shape[0]
powerDomain = np.reshape(powerDomain, (m, 1))
powerDomain = np.matlib.repmat(powerDomain, 1, n)
for i in range(1, n):
powerDomain[:, i] = powerDomain[:, 0] ** i
print(Rsquared(trX, trY, W))
plt.plot(trX[:, 1],trY,'o', domain, predict(powerDomain, W),'r')
plt.show()
the R^2 I'm getting is very close to 1, meaning I found a very good fit to the training data, but it isn't shown on the plots. When I plot the data, it usually looks like this:
it looks as if I'm underfitting the data, but with such a complex hypothesis, with 200 features (meaning i allow polynomials up to x^200) and only 10 training examples, I should very clearly be overfitting data, so I expect the red line to pass through all the blue points and go wild between them.
This isn't what I'm getting which is confusing to me.
What's wrong?
You forgot to set powerDomain[:,0]=1, that's why your plot goes wrong at 0. And yes you are over fitting: look how quickly your plot fires up as soon as you get out of your training domain.
Related
I am trying to implement gradient descent in python. Though my code is returning result by I think results I am getting are completely wrong.
Here is the code I have written:
import numpy as np
import pandas
dataset = pandas.read_csv('D:\ML Data\house-prices-advanced-regression-techniques\\train.csv')
X = np.empty((0, 1),int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
X = np.append(X, dataset.at[i, 'LotArea'])
Y = np.append(Y, dataset.at[i, 'SalePrice'])
X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)
def gradient_descent(X, Y, theta, iterations=100, learningRate=0.000001):
m = len(X)
for i in range(iterations):
prediction = np.dot(X, theta)
theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
return theta
theta = np.random.randn(2,1)
theta = gradient_descent(X, Y, theta)
print('theta',theta)
The result I get after running this program is:
theta [[-5.23237458e+228]
[-1.04560188e+233]]
Which are very high values. Can someone point out the mistake I have made in implementation.
Also, 2nd problem is I have to set value of learning rate very low (in this case i have set to 0.000001) to work other wise program throws an error.
Please help me in diagnosis the problem.
try to reduce the learning rate with iteration otherwise it wont be able to reach the optimal lowest.try this
import numpy as np
import pandas
dataset = pandas.read_csv('start.csv')
X = np.empty((0, 1),int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
X = np.append(X, dataset.at[i, 'R&D Spend'])
Y = np.append(Y, dataset.at[i, 'Profit'])
X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)
def gradient_descent(X, Y, theta, iterations=50, learningRate=0.01):
m = len(X)
for i in range(iterations):
prediction = np.dot(X, theta)
theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
learningRate/=10;
return theta
theta = np.random.randn(2,1)
theta = gradient_descent(X, Y, theta)
print('theta',theta)
I'm currently working on Andrew Ng's gradient descent exercise using python but keeps getting me the wrong optimal theta. I followed this vectorization cheatsheet for gradient descent --- https://medium.com/ml-ai-study-group/vectorized-implementation-of-cost-functions-and-gradient-vectors-linear-regression-and-logistic-31c17bca9181.
Here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def cost_func(X, Y, theta):
m = len(X)
H = X.dot(theta)
J = 1/(2*m) * (H - Y).T.dot(H - Y)
return J
def gradient_descent(X, Y, alpha=0.01, iterations=1500):
#initializing theta as a zero vector
theta = np.zeros(X.shape[1])
#initializing the a list of cost function value
J_list = [cost_func(X, Y, theta)]
m = len(X)
while iterations > 0:
H = X.dot(theta)
delta = (1/m)*X.T.dot(H - Y)
theta = theta - alpha * delta
iterations -= 1
J_list.append(cost_func(X, Y, theta))
return theta, J_list
def check_convergence(J_list):
plt.plot(range(len(J_list)), J_list)
plt.xlabel('Iterations')
plt.ylabel('Cost J')
plt.show()
file_name_1 = 'https://raw.githubusercontent.com/kaleko/CourseraML/master/ex1/data/ex1data1.txt'
df1 = pd.read_csv(file_name_1, header=None)
X = df1.values[:, 0]
Y = df1.values[:, 1]
m = len(X)
X = np.column_stack((np.ones(m), X))
theta_optimal, J_list = gradient_descent(X, Y, 0.01, 1500)
print(theta_optimal)
check_convergence(J_list)
My theta output is [-3.63029144 1.16636235], which is incorrect.
Here is my cost function graph. As you see, it converges way too quickly.
The correct graph should look like.
Thank you.
Last night I wrote a simple binary logistic regression python code.
It seems to be working correctly (likelihood increases with each iteration, and I get good classification results).
My problem is that I can only initialize my weights with W = np.random.randn(n+1, 1) normal distribution.
But I don't want normal distribution, I want uniform distribution. But when I do that, I get the error
"RuntimeWarning: divide by zero encountered in log
return np.dot(Y.T, np.log(predictions)) + np.dot((onesVector - Y).T, np.log(onesVector - predictions))"
this is my code
import numpy as np
import matplotlib.pyplot as plt
def sigmoid(x):
return 1/(1+np.exp(-x))
def predict(X, W):
return sigmoid(np.dot(X, W))
def logLikelihood(X, Y, W):
m = X.shape[0]
predictions = predict(X, W)
onesVector = np.ones((m, 1))
return np.dot(Y.T, np.log(predictions)) + np.dot((onesVector - Y).T, np.log(onesVector - predictions))
def gradient(X, Y, W):
return np.dot(X.T, Y - predict(X, W))
def successRate(X, Y, W):
m = Y.shape[0]
predictions = predict(X, W) > 0.5
correct = (Y == predictions)
return 100 * np.sum(correct)/float(correct.shape[0])
trX = np.load("binaryMnistTrainX.npy")
trY = np.load("binaryMnistTrainY.npy")
teX = np.load("binaryMnistTestX.npy")
teY = np.load("binaryMnistTestY.npy")
m, n = trX.shape
trX = np.concatenate((trX, np.ones((m, 1))),axis=1)
teX = np.concatenate((teX, np.ones((teX.shape[0], 1))),axis=1)
W = np.random.randn(n+1, 1)
learningRate = 0.00001
numIter = 500
likelihoodArray = np.zeros((numIter, 1))
for i in range(0, numIter):
W = W + learningRate * gradient(trX, trY, W)
likelihoodArray[i, 0] = logLikelihood(trX, trY, W)
print("train success rate is %lf" %(successRate(trX, trY, W)))
print("test success rate is %lf" %(successRate(teX, teY, W)))
plt.plot(likelihoodArray)
plt.show()
If i initialize my W to be zeros or randn then it works.
If I initialize it to random (not normal) or ones, then I get the division by zero thing.
Why does this happen and how can I fix it?
I implemented logistic regression and use it on a data set. (This is an exercise in Coursera's ML course Week #3 (which normally uses matlab and octave) using python (so this isn't cheating)).
I started with the implementation in sklearn to classify the data set used in week three of this course (http://pastie.org/10872959). Here is a small, reproducible example for anyone to try out what I used (it relies only on numpy and sklearn):
It takes the data set, splits it into the feature matrix and the output matrix, and then constructs 26 more features from the original 2 (i.e from
). I then use logistic regression in sklearn, but this does not give the contour plot desired (please see below).
from sklearn.linear_model import LogisticRegression as expit
import numpy as np
def thetaFunc(y, theta, x):
deg = 6
spot = 0
sum = 0
for i in range(1, deg + 1):
for j in range(i + 1):
sum += theta[spot] * x**(i - j) * y**(j)
spot += 1
return sum
def constructVariations(X, deg):
features = np.zeros((len(X), 27))
spot = 0
for i in range(1, deg + 1):
for j in range(i + 1):
features[:, spot] = X[:,0]**(i - j) * X[:,1]**(j)
spot += 1
return features
if __name__ == '__main__':
data = np.loadtxt("ex2points.txt", delimiter = ",")
X,Y = np.split(data, [len(data[0,:]) - 1], 1)
X = reg.constructVariations(X, 6)
oneArray = np.ones((len(X),1))
X = np.hstack((oneArray, X))
trial = expit(solver = 'sag')
trial = trial.fit(X = X,y = np.ravel(Y))
print(trial.coef_)
# everything below has been edited in
from matplotlib import pyplot as plt
txt = open("RegLogTheta", "r").read()
txt = txt.split()
theta = np.array(txt, float)
x = np.linspace(-1, 1.5, 100)
y = np.linspace(-1,1.5,100)
z = np.empty((100,100))
xx,yy = np.meshgrid(x,y)
for i in range(len(x)):
for j in range(len(y)):
z[i][j] = thetaFunc(yy[i][j], theta, xx[i][j])
plt.contour(xx,yy,z, levels = [0])
plt.show()
Here are the coefficients of the generic feature terms.
http://pastie.org/10872957 (i.e the coefficients to terms
and the contour it generates:
One potential source of error is that I'm misinterpreting the 7 X 4 matrix coefficient matrix stored in trial._coeff. I believe that these 28 values are the coefficients to the 28 "variations" above, and I've mapped the coefficients to the variations both column-wise and row-wise. By column-wise, I mean that [:][0] get mapped to the first 7 variations, [:][1] to the next 7 and so on, and my function constructVariations explains how the variations are systematically created. Now the API maintains than an array of shape (n_classes, n_features) is stored in trial._coeff, so should I infer that fit classified the data into four classes? Or have I run through this problem poorly in another way?
Update
My interpretation (and/or use) of the weights must be at fault:
Instead of relying on the prediction built into sklearn, I myself tried to calculate the values that set the following to 1/2
The values of theta are those found from printing trial._coeff and x and y are scalars. Those x,y are then plotted to give the contour.
The code I used (but did not originally add in) attempts to do this. What is wrong with the math behind it?
One potential source of error is that I'm misinterpreting the 7 X 4 matrix coefficient matrix stored in trial._coeff
This matrix is not 7x4, it is 1x28 (check print(trial.coef_.shape)). One coefficient for each of your 28 features (27 returned by constructVariations and 1 added manually).
so should I infer that fit classified the data into four classes?
No, you missinterpreted the array, it has a single row (for binary classificaation there is no point in having two).
Or have I run through this problem poorly in another way?
Code is fine, interpretation not. In particular, see actual decision boundary from your model (plotted by calling "predict" and plotting contour)
from sklearn.linear_model import LogisticRegression as expit
import numpy as np
def constructVariations(X, deg):
features = np.zeros((len(X), 27))
spot = 0
for i in range(1, deg + 1):
for j in range(i + 1):
features[:, spot] = X[:,0]**(i - j) * X[:,1]**(j)
spot += 1
return features
if __name__ == '__main__':
data = np.loadtxt("ex2points.txt", delimiter = ",")
X,Y = np.split(data, [len(data[0,:]) - 1], 1)
rawX = np.copy(X)
X = constructVariations(X, 6)
oneArray = np.ones((len(X),1))
X = np.hstack((oneArray, X))
trial = expit(solver = 'sag')
trial = trial.fit(X = X,y = np.ravel(Y))
print(trial.coef_)
from matplotlib import pyplot as plt
h = 0.01
x_min, x_max = rawX[:, 0].min() - 1, rawX[:, 0].max() + 1
y_min, y_max = rawX[:, 1].min() - 1, rawX[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
data = constructVariations(np.c_[xx.ravel(), yy.ravel()], 6)
oneArray = np.ones((len(data),1))
data = np.hstack((oneArray, data))
Z = trial.predict(data)
Z = Z.reshape(xx.shape)
plt.figure()
plt.scatter(rawX[:, 0], rawX[:, 1], c=Y, linewidth=0, s=50)
plt.contourf(xx, yy, Z, cmap=plt.cm.Paired, alpha=0.8)
plt.show()
Update
In the code provided you forgot (in visualization) that you added column of "1"s to your data representation, thus your thetas are one "off", as theta[0] is a bias, theta1 is related to your 0'th variable etc.
def thetaFunc(y, theta, x):
deg = 6
spot = 0
sum = theta[spot]
spot += 1
for i in range(1, deg + 1):
for j in range(i + 1):
sum += theta[spot] * x**(i - j) * y**(j)
spot += 1
return sum
you also forgot about intercept term from logisticregression itself, thus
xx,yy = np.meshgrid(x,y)
for i in range(len(x)):
for j in range(len(y)):
z[i][j] = thetaFunc(yy[i][j], theta, xx[i][j])
z -= trial.intercept_
(image generated using fixed code of yours)
import numpy as np
from sklearn.linear_model import LogisticRegression as expit
def thetaFunc(y, theta, x):
deg = 6
spot = 0
sum = theta[spot]
spot += 1
for i in range(1, deg + 1):
for j in range(i + 1):
sum += theta[spot] * x**(i - j) * y**(j)
spot += 1
return np.exp(-sum)
def constructVariations(X, deg):
features = np.zeros((len(X), 27))
spot = 0
for i in range(1, deg + 1):
for j in range(i + 1):
features[:, spot] = X[:,0]**(i - j) * X[:,1]**(j)
spot += 1
return features
if __name__ == '__main__':
data = np.loadtxt("ex2points.txt", delimiter = ",")
X,Y = np.split(data, [len(data[0,:]) - 1], 1)
X = constructVariations(X, 6)
rawX = np.copy(X)
oneArray = np.ones((len(X),1))
X = np.hstack((oneArray, X))
trial = expit(solver = 'sag')
trial = trial.fit(X = X,y = np.ravel(Y))
from matplotlib import pyplot as plt
theta = trial.coef_.ravel()
x = np.linspace(-1, 1.5, 100)
y = np.linspace(-1,1.5,100)
z = np.empty((100,100))
xx,yy = np.meshgrid(x,y)
for i in range(len(x)):
for j in range(len(y)):
z[i][j] = thetaFunc(yy[i][j], theta, xx[i][j])
z -= trial.intercept_
plt.contour(xx,yy,z > 1,cmap=plt.cm.Paired, alpha=0.8)
plt.scatter(rawX[:, 0], rawX[:, 1], c=Y, linewidth=0, s=50)
plt.show()
I've been reading Bishop's book on machine learning, and I'm trying to implement the backpropagation algorithm for a neural network, but it's not finding a solution. The code is below. I've broken it down into the network code and the testing code.
import numpy as np
from collections import namedtuple
import matplotlib.pyplot as plt
import scipy.optimize as opt
# Network code
def tanh(x):
return np.tanh(x)
def dtanh(x):
return 1 - np.tan(x)**2
def identity(x):
return x
def unpack_weights(w, D, M, K):
"""
len(w) = (D + 1)*M + (M + 1)*K, where
D = number of inputs, excluding bias
M = number of hidden units, excluding bias
K = number of output units
"""
UnpackedWeights = namedtuple("UpackedWeights", ["wHidden", "wOutput"])
cutoff = M*(D + 1)
wHidden = w[:cutoff].reshape(M, D + 1)
wOutput = w[cutoff:].reshape(K, M + 1)
return UnpackedWeights(wHidden=wHidden, wOutput=wOutput)
def compute_output(x, weights, fcnHidden=tanh, fcnOutput=identity):
NetworkResults = namedtuple("NetworkResults", ["hiddenAct", "hiddenOut", "outputAct", "outputOut"])
xBias = np.vstack((1., x))
hiddenAct = weights.wHidden.dot(xBias)
hiddenOut = np.vstack((1., fcnHidden(hiddenAct)))
outputAct = weights.wOutput.dot(hiddenOut)
outputOut = fcnOutput(outputAct)
return NetworkResults(hiddenAct=hiddenAct, hiddenOut=hiddenOut, outputAct=outputAct,
outputOut=outputOut)
def backprop(t, x, M, fcnHidden=tanh, fcnOutput=identity, dFcnHidden=dtanh):
maxIter = 10000
learningRate = 0.2
N, K = t.shape
N, D = x.shape
nParams = (D + 1)*M + (M + 1)*K
w0 = np.random.uniform(-0.1, 0.1, nParams)
for _ in xrange(maxIter):
sse = 0.
for n in xrange(N):
weights = unpack_weights(w0, D, M, K)
# Compute net output
netResults = compute_output(x=x[n].reshape(-1, 1), weights=weights,
fcnHidden=fcnHidden, fcnOutput=fcnOutput)
# Compute derivatives of error function wrt wOutput
outputDelta = netResults.outputOut - t[n].reshape(K, 1)
outputDerivs = outputDelta.dot(netResults.hiddenOut.T)
# Compute derivateives of error function wrt wHidden
hiddenDelta = dFcnHidden(netResults.hiddenAct)*(weights.wOutput[:, 1:].T.dot(outputDelta))
xBias = np.vstack((1., x[n].reshape(-1, 1)))
hiddenDerivs = hiddenDelta.dot(xBias.T)
delErr = np.hstack((np.ravel(hiddenDerivs), np.ravel(outputDerivs)))
w1 = w0 - learningRate*delErr
w0 = w1
sse += np.sum(outputDelta**2)
return w0
# Testing code
def generate_test_data():
D, M, K, N = 1, 3, 1, 25
x = np.sort(np.random.uniform(-1., 1., (N, D)), axis=0)
t = 1.0 + x**2
return D, M, K, N, x, t
def test_backprop():
D, M, K, N, x, t = generate_test_data()
return backprop(t, x, M)
def scipy_solution(t, x, D, M, K, N, method="BFGS"):
def obj_fn(w):
weights = unpack_weights(w, D, M, K)
err = 0
for n in xrange(N):
netOut = compute_output(x[n], weights=weights)
err += (netOut.outputOut[0, 0] - t[n])**2
return err
w0 = np.random.uniform(-1, 1, (D + 1)*M + (M + 1)*K)
return opt.minimize(obj_fn, w0, method=method)
When I use the optimize module in scipy (i.e., the scipy_solution() function) to find the network weights, the sum of squared errors gets very close to zero, and the output of the network looks like the data I generated. When I use my backpropagation function, the sum of squared errors gets stuck between 2.0 and 3.0, and the network output looks almost linear. Moreover, when I feed the scipy solution for the weights to my backprop function as the starting value, my backprop function still doesn't find the right solution.
I've been stuck on this for a couple of days, so I'd really appreciate any tips anyone has. Thanks.
def dtanh(x):
return 1 - np.tan(x)**2
should be
def dtanh(x):
return 1 - np.tanh(x)**2