X = df.drop(columns="Math")
y = df.iloc[:, 4]
theta = np.array([0]*len(X.columns))
def hypothesis(theta, X):
return theta*X
def computeCost(X, y, theta):
y1 = hypothesis(theta, X)
y1=np.sum(y1, axis=1)
return sum(np.sqrt((y1-y)**2))/(2*47)
def gradientDescent(X, y, theta, alpha, i):
J = [] #cost function in each iterations
k = 0
while k < i:
y1 = hypothesis(theta, X)
y1 = np.sum(y1, axis=1)
for c in range(0, len(X.columns)):
theta[c] = theta[c] - alpha*(sum((y1-y)*X.iloc[:,c])/len(X))
j = computeCost(X, y, theta)
J.append(j)
k += 1
return J, j, theta
J, j, theta = gradientDescent(X, y, theta, 0.05, 10000)
The dataset is consists of five columns. The first is the column of ones for the bias term. The second until the last are int64 consisting numerical value from 1-100. The second field represents the Physics scores, the third represents the Science scores, the fourth represents the Statistics scores, while the last one represents the Math scores. I am trying to use the 1st until the 4th column to predict the 5th column (Math)
The error will appear as follows:
OverflowError Traceback (most recent call last)
<ipython-input-26-d17a8fb83984> in <module>()
----> 1 J, j, theta = gradientDescent(X, y, theta, 0.05, 10000)
<ipython-input-25-bfec0d0edcfa> in gradientDescent(X, y, theta, alpha, i)
6 y1 = np.sum(y1, axis=1)
7 for c in range(0, len(X.columns)):
----> 8 theta[c] = theta[c] - alpha*(sum((y1-y)*X.iloc[:,c])/len(X))
9 j = computeCost(X, y, theta)
10 J.append(j)
OverflowError: Python int too large to convert to C
You ran into the error most likely due to a combination of:
Setting theta to be integer with theta = np.array([0]*len(X.columns)). You can do something like np.zeros(np.shape(X)[1])
Setting a learning rate that is too high, you can check your cost or J, it might be increasing indicating the learning rate is too high
Not very sure about your bias terms as 1, this might depend on your range of values.
So if I test your code with a simple example:
import pandas as pd
import numpy as np
np.random.seed(111)
df = pd.DataFrame(np.random.randint(0,100,(50,4)),
columns=['const','Physics','Science','Stats'])
df['const'] = 1
df['Math'] = 0.2*df['Physics'] + 0.4*df['Science'] + 0.5*df['Stats']
Then initialize:
X = df.drop(columns="Math")
y = df.iloc[:, 4]
theta = np.ones(X.shape[1])
Then run with a smaller learning rate:
J, j, theta = gradientDescent(X, y, theta, 0.0001,100)
theta
array([0.98851902, 0.1950524 , 0.39639991, 0.49143374])
Related
import numpy as np
import pandas as pd
import numpy as np
from matplotlib import pyplot as pt
def computeCost(X,y,theta):
m=len(y)
predictions= X*theta-y
sqrerror=np.power(predictions,2)
return 1/(2*m)*np.sum(sqrerror)
def gradientDescent(X, y, theta, alpha, num_iters):
m = len(y)
jhistory = np.zeros((num_iters,1))
for i in range(num_iters):
h = X * theta
s = h - y
theta = theta - (alpha / m) * (s.T*X).T
jhistory_iter = computeCost(X, y, theta)
return theta,jhistory_iter
data = open(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt')
data1=np.array(pd.read_csv(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt',header=None))
y =np.array(data1[:,1])
m=len(y)
y=np.asmatrix(y.reshape(m,1))
X = np.array([data1[:,0]]).reshape(m,1)
X = np.asmatrix(np.insert(X,0,1,axis=1))
theta=np.zeros((2,1))
iterations = 1500
alpha = 0.01;
print('Testing the cost function ...')
J = computeCost(X, y, theta)
print('With theta = [0 , 0]\nCost computed = ', J)
print('Expected cost value (approx) 32.07')
theta=np.asmatrix([[-1,0],[1,2]])
J = computeCost(X, y, theta)
print('With theta = [-1 , 2]\nCost computed =', J)
print('Expected cost value (approx) 54.24')
theta,JJ = gradientDescent(X, y, theta, alpha, iterations)
print('Theta found by gradient descent:')
print(theta)
print('Expected theta values (approx)')
print(' -3.6303\n 1.1664\n')
predict1 = [1, 3.5] *theta
print(predict1*10000)
Result:
Testing the cost function ...
With theta = [0 , 0]
Cost computed = 32.072733877455676
Expected cost value (approx) 32.07
With theta = [-1 , 2]
Cost computed = 69.84811062494227
Expected cost value (approx) 54.24
Theta found by gradient descent:
[[-3.70304726 -3.64357517]
[ 1.17367146 1.16769684]]
Expected theta values (approx)
-3.6303
1.1664
[[4048.02858742 4433.63790186]]
There are two problems, the first Cost computed was right, but the second one was wrong. And there are 4 element in my gradient descent(suppose to be two)
When you mention "With theta = [-1 , 2]"
and you enter
theta=np.asmatrix([[-1,0],[1,2]])
I think this is incorrect. Assuming that you have single feature and you added a column of 1, and you are trying to do simple linear regression
The correct way should be
np.array([-1,2])
Also where have
predictions= X*theta-y
It would be better if you did
np.dot(X,theta)-y
When you multiply, it's not doing the same thing.
I'm currently working on Andrew Ng's gradient descent exercise using python but keeps getting me the wrong optimal theta. I followed this vectorization cheatsheet for gradient descent --- https://medium.com/ml-ai-study-group/vectorized-implementation-of-cost-functions-and-gradient-vectors-linear-regression-and-logistic-31c17bca9181.
Here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def cost_func(X, Y, theta):
m = len(X)
H = X.dot(theta)
J = 1/(2*m) * (H - Y).T.dot(H - Y)
return J
def gradient_descent(X, Y, alpha=0.01, iterations=1500):
#initializing theta as a zero vector
theta = np.zeros(X.shape[1])
#initializing the a list of cost function value
J_list = [cost_func(X, Y, theta)]
m = len(X)
while iterations > 0:
H = X.dot(theta)
delta = (1/m)*X.T.dot(H - Y)
theta = theta - alpha * delta
iterations -= 1
J_list.append(cost_func(X, Y, theta))
return theta, J_list
def check_convergence(J_list):
plt.plot(range(len(J_list)), J_list)
plt.xlabel('Iterations')
plt.ylabel('Cost J')
plt.show()
file_name_1 = 'https://raw.githubusercontent.com/kaleko/CourseraML/master/ex1/data/ex1data1.txt'
df1 = pd.read_csv(file_name_1, header=None)
X = df1.values[:, 0]
Y = df1.values[:, 1]
m = len(X)
X = np.column_stack((np.ones(m), X))
theta_optimal, J_list = gradient_descent(X, Y, 0.01, 1500)
print(theta_optimal)
check_convergence(J_list)
My theta output is [-3.63029144 1.16636235], which is incorrect.
Here is my cost function graph. As you see, it converges way too quickly.
The correct graph should look like.
Thank you.
I am trying to write a program to calculate the slope and the intercept of a linear regression model but when I am running more than 10 iterations, the gradient descent function gives the np.nan value for both intercept as well as slope.
Below is my implementation
def get_gradient_at_b(x, y, b, m):
N = len(x)
diff = 0
for i in range(N):
x_val = x[i]
y_val = y[i]
diff += (y_val - ((m * x_val) + b))
b_gradient = -(2/N) * diff
return b_gradient
def get_gradient_at_m(x, y, b, m):
N = len(x)
diff = 0
for i in range(N):
x_val = x[i]
y_val = y[i]
diff += x_val * (y_val - ((m * x_val) + b))
m_gradient = -(2/N) * diff
return m_gradient
def step_gradient(b_current, m_current, x, y, learning_rate):
b_gradient = get_gradient_at_b(x, y, b_current, m_current)
m_gradient = get_gradient_at_m(x, y, b_current, m_current)
b = b_current - (learning_rate * b_gradient)
m = m_current - (learning_rate * m_gradient)
return [b, m]
def gradient_descent(x, y, learning_rate, num_iterations):
b = 0
m = 0
for i in range(num_iterations):
b, m = step_gradient(b, m, x, y, learning_rate)
return [b,m]
I am running it on the following data:
a=[3.87656018e+11, 4.10320300e+11, 4.15730874e+11, 4.52699998e+11,
4.62146799e+11, 4.78965491e+11, 5.08068952e+11, 5.99592902e+11,
6.99688853e+11, 8.08901077e+11, 9.20316530e+11, 1.20111177e+12,
1.18695276e+12, 1.32394030e+12, 1.65661707e+12, 1.82304993e+12,
1.82763786e+12, 1.85672212e+12, 2.03912745e+12, 2.10239081e+12,
2.27422971e+12, 2.60081824e+12]
b=[3.3469950e+10, 3.4784980e+10, 3.3218720e+10, 3.6822490e+10,
4.4560290e+10, 4.3826720e+10, 5.2719430e+10, 6.3842550e+10,
8.3535940e+10, 1.0309053e+11, 1.2641405e+11, 1.6313218e+11,
1.8529536e+11, 1.7875143e+11, 2.4981555e+11, 3.0596392e+11,
3.0040058e+11, 3.1440530e+11, 3.1033848e+11, 2.6229109e+11,
2.7585243e+11, 3.0352616e+11]
print(gradient_descent(a, b, 0.01, 100))
#result --> [nan, nan]
When I run the gradient_descent function on a dataset with smaller values, it gives the correct answers. Also I was able to obtain the intercept and slope for the above data with from sklearn.linear_model import LinearRegression
Any help will be appreciated in figuring out why the result is [nan, nan] instead of giving me the correct intercept and slope.
You need to reduce the learning rate. Since the values in a and b are so large (>= 1e11), the learning rate needs be approximately 1e-25 for this to even do the gradient descent, else it will randomly overshoot because of large gradients of a and b.
b, m = gradient_descent(a, b, 5e-25, 100)
print(b, m)
Out: -3.7387067636195266e-13 0.13854551291084335
I'm trying to implement gradient descent in Simple linear regression. Whenever I run the code, I get the error
The code i am using is this:
def get_data(df, feature, predict):
X = df[feature]
Y = df[predict]
X = np.float64(X)
Y = np.float64(Y)
return X, Y
def average(X, Y, b, m, length):
temp1 = 0
temp2 = 0
for i in range(length):
temp1 += (b + m * X[i]) - Y[i]
temp2 += ((b + m * X[i]) - Y[i]) * X[i]
return temp1 / float(length), temp2 / float(length)
def gradient_descent(b, m, alpha, length, num_iterations, X, Y):
for i in range(num_iterations):
temp1, temp2 = average(X, Y, b, m, length)
b_temp = b - alpha * temp1
m_temp = m - alpha * temp2
b = b_temp
m = m_temp
return b, m
def run(b, m, alpha, feature, predict, df, num_iterations):
X, Y = get_data(df, feature, predict)
length = np.alen(X)
final_b, final_m = gradient(b, m, alpha, length, num_iterations, X, Y)
return final_b, final_m
b, m = run(0, 0, 0.05, 'sqft_living', 'price', df, 1000)
The error it gives my is this:
/Users/*****/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:15:
RuntimeWarning: overflow encountered in double_scalars from
ipykernel import kernelapp as app
/Users/*****/anaconda2/lib/python2.7/site-packages/ipykernel_launcher.py:25:
RuntimeWarning: invalid value encountered in double_scalars.
I'm not able to identify which part of the code is causing the error. I tried to convert numpy array to float64 also my code is not running into Divide by Zero Error. Can someone identify the error? Also, How can it be rectified?
I implemented a simple linear regression and I want to try it out by fitting a non linear model
specifically I am trying to fit a model for the function y = x^3 + 5 for example
this is my code
import numpy as np
import numpy.matlib
import matplotlib.pyplot as plt
def predict(X,W):
return np.dot(X,W)
def gradient(X, Y, W, regTerm=0):
return (-np.dot(X.T, Y) + np.dot(np.dot(X.T,X),W))/(m*k) + regTerm * W /(n*k)
def cost(X, Y, W, regTerm=0):
m, k = Y.shape
n, k = W.shape
Yhat = predict(X, W)
return np.trace(np.dot(Y-Yhat,(Y-Yhat).T))/(2*m*k) + regTerm * np.trace(np.dot(W,W.T)) / (2*n*k)
def Rsquared(X, Y, W):
m, k = Y.shape
SSres = cost(X, Y, W)
Ybar = np.mean(Y,axis=0)
Ybar = np.matlib.repmat(Ybar, m, 1)
SStot = np.trace(np.dot(Y-Ybar,(Y-Ybar).T))
return 1-SSres/SStot
m = 10
n = 200
k = 1
trX = np.random.rand(m, n)
trX[:, 0] = 1
for i in range(2, n):
trX[:, i] = trX[:, 1] ** i
trY = trX[:, 1] ** 3 + 5
trY = np.reshape(trY, (m, k))
W = np.random.rand(n, k)
numIter = 10000
learningRate = 0.5
for i in range(0, numIter):
W = W - learningRate * gradient(trX, trY, W)
domain = np.linspace(0,1,100000)
powerDomain = np.copy(domain)
m = powerDomain.shape[0]
powerDomain = np.reshape(powerDomain, (m, 1))
powerDomain = np.matlib.repmat(powerDomain, 1, n)
for i in range(1, n):
powerDomain[:, i] = powerDomain[:, 0] ** i
print(Rsquared(trX, trY, W))
plt.plot(trX[:, 1],trY,'o', domain, predict(powerDomain, W),'r')
plt.show()
the R^2 I'm getting is very close to 1, meaning I found a very good fit to the training data, but it isn't shown on the plots. When I plot the data, it usually looks like this:
it looks as if I'm underfitting the data, but with such a complex hypothesis, with 200 features (meaning i allow polynomials up to x^200) and only 10 training examples, I should very clearly be overfitting data, so I expect the red line to pass through all the blue points and go wild between them.
This isn't what I'm getting which is confusing to me.
What's wrong?
You forgot to set powerDomain[:,0]=1, that's why your plot goes wrong at 0. And yes you are over fitting: look how quickly your plot fires up as soon as you get out of your training domain.