I am trying to implement gradient descent in Python. My code runs and returns a result, but I think the values I am getting are completely wrong.
Here is the code I have written:
import numpy as np
import pandas
dataset = pandas.read_csv(r'D:\ML Data\house-prices-advanced-regression-techniques\train.csv')
X = np.empty((0, 1),int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
    X = np.append(X, dataset.at[i, 'LotArea'])
    Y = np.append(Y, dataset.at[i, 'SalePrice'])
X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)
def gradient_descent(X, Y, theta, iterations=100, learningRate=0.000001):
    m = len(X)
    for i in range(iterations):
        prediction = np.dot(X, theta)
        theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
    return theta
theta = np.random.randn(2,1)
theta = gradient_descent(X, Y, theta)
print('theta',theta)
The result I get after running this program is:
theta [[-5.23237458e+228]
[-1.04560188e+233]]
These are extremely large values. Can someone point out the mistake I have made in the implementation?
Also, a second problem: I have to set the learning rate very low (in this case 0.000001) for it to work; otherwise the program throws an error.
Please help me diagnose the problem.
Try reducing the learning rate as the iterations progress; otherwise it won't be able to reach the optimal minimum. Try this:
import numpy as np
import pandas
dataset = pandas.read_csv('start.csv')
X = np.empty((0, 1),int)
Y = np.empty((0, 1), int)
for i in range(dataset.shape[0]):
    X = np.append(X, dataset.at[i, 'R&D Spend'])
    Y = np.append(Y, dataset.at[i, 'Profit'])
X = np.c_[np.ones(len(X)), X]
Y = Y.reshape(len(Y), 1)
def gradient_descent(X, Y, theta, iterations=50, learningRate=0.01):
    m = len(X)
    for i in range(iterations):
        prediction = np.dot(X, theta)
        theta = theta - (1/m) * learningRate * (X.T.dot(prediction - Y))
        learningRate /= 10
    return theta
theta = np.random.randn(2,1)
theta = gradient_descent(X, Y, theta)
print('theta',theta)
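A complementary fix for the original question's diverging theta is to scale the feature before running gradient descent: LotArea is in the tens of thousands, so the gradient explodes unless the learning rate is tiny. A minimal sketch, assuming the same CSV path and column names as in the question:
import numpy as np
import pandas

dataset = pandas.read_csv(r'D:\ML Data\house-prices-advanced-regression-techniques\train.csv')

# Standardize the feature so its values are O(1); a conventional
# learning rate such as 0.01 then converges instead of overflowing.
x = dataset['LotArea'].to_numpy(dtype=float)
x = (x - x.mean()) / x.std()

X = np.c_[np.ones(len(x)), x]
Y = dataset['SalePrice'].to_numpy(dtype=float).reshape(-1, 1)

def gradient_descent(X, Y, theta, iterations=1000, learningRate=0.01):
    m = len(X)
    for _ in range(iterations):
        prediction = X.dot(theta)
        theta = theta - (learningRate / m) * X.T.dot(prediction - Y)
    return theta

print('theta', gradient_descent(X, Y, np.zeros((2, 1))))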
import numpy as np
import pandas as pd
from matplotlib import pyplot as pt
def computeCost(X, y, theta):
    m = len(y)
    predictions = X*theta - y
    sqrerror = np.power(predictions, 2)
    return 1/(2*m)*np.sum(sqrerror)
def gradientDescent(X, y, theta, alpha, num_iters):
    m = len(y)
    jhistory = np.zeros((num_iters, 1))
    for i in range(num_iters):
        h = X * theta
        s = h - y
        theta = theta - (alpha / m) * (s.T*X).T
        jhistory_iter = computeCost(X, y, theta)
    return theta, jhistory_iter
data = open(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt')
data1 = np.array(pd.read_csv(r'C:\Users\Coding\Desktop\machine-learning-ex1\ex1\ex1data1.txt', header=None))
y = np.array(data1[:, 1])
m = len(y)
y = np.asmatrix(y.reshape(m, 1))
X = np.array([data1[:, 0]]).reshape(m, 1)
X = np.asmatrix(np.insert(X, 0, 1, axis=1))
theta = np.zeros((2, 1))
iterations = 1500
alpha = 0.01
print('Testing the cost function ...')
J = computeCost(X, y, theta)
print('With theta = [0 , 0]\nCost computed = ', J)
print('Expected cost value (approx) 32.07')
theta=np.asmatrix([[-1,0],[1,2]])
J = computeCost(X, y, theta)
print('With theta = [-1 , 2]\nCost computed =', J)
print('Expected cost value (approx) 54.24')
theta,JJ = gradientDescent(X, y, theta, alpha, iterations)
print('Theta found by gradient descent:')
print(theta)
print('Expected theta values (approx)')
print(' -3.6303\n 1.1664\n')
predict1 = [1, 3.5] *theta
print(predict1*10000)
Result:
Testing the cost function ...
With theta = [0 , 0]
Cost computed = 32.072733877455676
Expected cost value (approx) 32.07
With theta = [-1 , 2]
Cost computed = 69.84811062494227
Expected cost value (approx) 54.24
Theta found by gradient descent:
[[-3.70304726 -3.64357517]
[ 1.17367146 1.16769684]]
Expected theta values (approx)
-3.6303
1.1664
[[4048.02858742 4433.63790186]]
There are two problems: the first cost computed was right, but the second one was wrong. Also, there are 4 elements in my gradient descent result (there are supposed to be two).
When you mention "With theta = [-1 , 2]"
and you enter
theta=np.asmatrix([[-1,0],[1,2]])
I think this is incorrect. Assuming that you have a single feature, added a column of ones, and are trying to do simple linear regression, the correct way is a (2, 1) column vector:
theta = np.asmatrix([[-1], [2]])
Also, where you have
predictions = X*theta - y
it would be better to write
predictions = np.dot(X, theta) - y
When you use *, it's not doing the same thing: on plain arrays it multiplies elementwise rather than taking the matrix product.
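A quick sketch of the difference on plain NumPy arrays:
import numpy as np

X = np.array([[1.0, 2.0],
              [1.0, 3.0]])   # column of ones plus one feature, shape (2, 2)
theta = np.array([[-1.0],
                  [2.0]])    # shape (2, 1)

print(np.dot(X, theta))      # matrix product -> shape (2, 1)
print(X * theta.T)           # elementwise with broadcasting -> shape (2, 2)
On np.matrix objects * does mean the matrix product, which is one reason mixing np.matrix and plain arrays in the same script invites shape bugs.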
I'm currently working on Andrew Ng's gradient descent exercise using Python, but I keep getting the wrong optimal theta. I followed this vectorization cheat sheet for gradient descent: https://medium.com/ml-ai-study-group/vectorized-implementation-of-cost-functions-and-gradient-vectors-linear-regression-and-logistic-31c17bca9181.
Here is my code:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
def cost_func(X, Y, theta):
    m = len(X)
    H = X.dot(theta)
    J = 1/(2*m) * (H - Y).T.dot(H - Y)
    return J
def gradient_descent(X, Y, alpha=0.01, iterations=1500):
    # initializing theta as a zero vector
    theta = np.zeros(X.shape[1])
    # initializing the list of cost function values
    J_list = [cost_func(X, Y, theta)]
    m = len(X)
    while iterations > 0:
        H = X.dot(theta)
        delta = (1/m)*X.T.dot(H - Y)
        theta = theta - alpha * delta
        iterations -= 1
        J_list.append(cost_func(X, Y, theta))
    return theta, J_list
def check_convergence(J_list):
    plt.plot(range(len(J_list)), J_list)
    plt.xlabel('Iterations')
    plt.ylabel('Cost J')
    plt.show()
file_name_1 = 'https://raw.githubusercontent.com/kaleko/CourseraML/master/ex1/data/ex1data1.txt'
df1 = pd.read_csv(file_name_1, header=None)
X = df1.values[:, 0]
Y = df1.values[:, 1]
m = len(X)
X = np.column_stack((np.ones(m), X))
theta_optimal, J_list = gradient_descent(X, Y, 0.01, 1500)
print(theta_optimal)
check_convergence(J_list)
My theta output is [-3.63029144 1.16636235], which is incorrect.
Here is my cost function graph; as you see, it converges way too quickly compared with what the correct graph should look like (both plots omitted here).
Thank you.
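(A side note: one way to check whether a gradient-descent result is actually wrong is to compare it against the closed-form least-squares solution. A minimal sketch, reusing the X and Y built above:
import numpy as np

# theta_exact minimizes ||X.dot(theta) - Y||^2 in closed form;
# gradient descent should approach this vector as it converges.
theta_exact, *_ = np.linalg.lstsq(X, Y, rcond=None)
print(theta_exact)
If the two vectors agree, the implementation is fine.)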
I want to implement coordinate descent in Python and compare the result with that of gradient descent. I wrote the code, but it does not work well: GD seems OK, but CD is not good.
This is the reference to Coordinate Descent. --- LINK
And I expected this result: a plot of Gradient Descent vs Coordinate Descent (image omitted).
import pandas as pd
import matplotlib.pyplot as plt
import numpy as np
x = np.linspace(10, 100, 1000)
y = np.linspace(10, 100, 1000)
def func(x, y, param):
    return param[0] * x + param[1] * y
def costf(X, y, param):
    return np.sum((X.dot(param) - y) ** 2)/(2 * len(y))
z = func(x, y, [5, 8]) + np.random.normal(0., 10.)
z = z.reshape(-1, 1)
interc = np.ones(1000)
X = np.concatenate([interc.reshape(-1, 1), x.reshape(-1, 1), y.reshape(-1, 1)], axis=1)
#param = np.random.randn(3).T
param = np.array([2, 2, 2])
def gradient_descent(X, y, param, eta=0.01, iter=100):
    cost_history = [0] * iter
    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration] = cost
    return param, cost_history
def coordinate_descent(X, y, param, iter=100):
    cost_history = [0] * iter
    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration] = cost
    return param, cost_history
ret, xret = gradient_descent(X, z, param)
cret, cxret = coordinate_descent(X, z, param)
plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()
plt.show()
Thank you in advance.
I'm not an expert in optimization; here's just some advice.
Your code looks pretty good to me, except that:
1. The following two lines seem wrong:
loss = h - y
param[i] = np.dot(X[:,i].T, (y - dele))/np.sum(np.square(X[:,i]))
You reshaped y to (n_samples, 1) with z = z.reshape(-1, 1), while h and dele have shape (n_samples,). Broadcasting then produces an (n_samples, n_samples) matrix, which I think is not what you wanted.
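A quick sketch of that broadcasting pitfall:
import numpy as np

h = np.zeros(5)        # shape (5,), as produced by X.dot(param)
y = np.zeros((5, 1))   # shape (5, 1), after z.reshape(-1, 1)

# Broadcasting expands (5,) against (5, 1) into a full matrix:
print((h - y).shape)   # (5, 5), not the (5,) residual vector you wanted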
2. Your data might not be that good for test purposes. That doesn't mean it won't work, but you may need some extra effort to tune the learning rate / number of iterations.
I tried your code on the Boston dataset; the output looks like this (plot omitted).
The code:
from sklearn.datasets import load_boston
from sklearn.preprocessing import StandardScaler
# boston house-prices dataset
data = load_boston()
X, y = data.data, data.target
X = StandardScaler().fit_transform(X) # for easy convergence
X = np.hstack([X, np.ones((X.shape[0], 1))])
param = np.zeros(X.shape[1])
def gradient_descent(X, y, param, eta=0.01, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)  # you may want to save the initial cost
    for iteration in range(iter):
        h = X.dot(param)
        loss = h - y.ravel()
        gradient = X.T.dot(loss)/(2 * len(y))
        param = param - eta * gradient
        cost = costf(X, y, param)
        #print(cost)
        cost_history[iteration+1] = cost
    return param, cost_history
def coordinate_descent(X, y, param, iter=300):
    cost_history = [0] * (iter+1)
    cost_history[0] = costf(X, y, param)
    for iteration in range(iter):
        for i in range(len(param)):
            dele = np.dot(np.delete(X, i, axis=1), np.delete(param, i, axis=0))
            param[i] = np.dot(X[:,i].T, (y.ravel() - dele))/np.sum(np.square(X[:,i]))
        cost = costf(X, y, param)
        cost_history[iteration+1] = cost
    return param, cost_history
ret, xret = gradient_descent(X, y, param)
cret, cxret = coordinate_descent(X, y, param.copy())
plt.plot(range(len(xret)), xret, label="GD")
plt.plot(range(len(cxret)), cxret, label="CD")
plt.legend()
If this is what you want, you can then try testing on some other data. Just be careful: even if the implementation is right, you still need to tune the hyperparameters to get good convergence.
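One simple way to do that tuning is to sweep a few learning rates and compare the cost curves; a sketch building on the functions and data defined just above:
# A curve that drops quickly and flattens without oscillating
# indicates a reasonable learning rate for this data.
for eta in (0.001, 0.01, 0.1):
    _, history = gradient_descent(X, y, param.copy(), eta=eta)
    plt.plot(history, label="eta=%g" % eta)
plt.xlabel("iteration")
plt.ylabel("cost")
plt.legend()
plt.show()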
Hi, I am a beginner in Python and machine learning, and I am trying to learn what goes on under the hood of logistic regression by making it run in Python from scratch. I have been tasked with plotting and ranking the weights/coefficients of the logistic regression below in order to remove the features with the least impact. But while I've added a basic plot, it doesn't help me rank the coefficients/thetas. I was initially going to try seaborn's sns.coefplot(), but that has been deprecated. Any help pointing me in the right direction would be appreciated.
This also uses the Wisconsin breast cancer dataset (https://www.kaggle.com/uciml/breast-cancer-wisconsin-data).
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn.preprocessing import MinMaxScaler
from sklearn.model_selection import train_test_split

df = pd.read_csv("cancerdata.csv")
X = df.values[:, 2:-1].astype('float64')
X = (X - np.mean(X, axis=0)) / np.std(X, axis=0)
X = np.hstack([np.ones((X.shape[0], 1)), X])
X = MinMaxScaler().fit_transform(X)
Y = df["diagnosis"].map({'M': 1, 'B': 0})
Y = np.array(Y)
X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=0.25)

def Sigmoid(z):
    return 1/(1 + np.exp(-z))

def Hypothesis(theta, x):
    return Sigmoid(x @ theta)

def Cost_Function(X, Y, theta, m):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = 1/float(m) * np.sum(-_y * np.log(hi) - (1-_y) * np.log(1-hi))
    return J

def Cost_Function_Derivative(X, Y, theta, m, alpha):
    hi = Hypothesis(theta, X)
    _y = Y.reshape(-1, 1)
    J = alpha/float(m) * X.T @ (hi - _y)
    return J

def Gradient_Descent(X, Y, theta, m, alpha):
    new_theta = theta - Cost_Function_Derivative(X, Y, theta, m, alpha)
    return new_theta

def Accuracy(theta):
    length = len(X_test)
    prediction = (Hypothesis(theta, X_test) > 0.5)
    _y = Y_test.reshape(-1, 1)
    correct = prediction == _y
    my_accuracy = (np.sum(correct) / length)*100
    print('LR Accuracy %: ', my_accuracy)

def Logistic_Regression(X, Y, alpha, theta, num_iters):
    m = len(Y)
    for x in range(num_iters):
        new_theta = Gradient_Descent(X, Y, theta, m, alpha)
        theta = new_theta
        if x % 100 == 0:
            #print('theta: ', theta)
            Accuracy(theta)
    x = np.linspace(-6, 6, 50)
    y = -(theta[0] + theta[1]*x)/theta[2]
    plt.plot(x, y)
    plt.plot(theta)
    plt.show()

ep = .012
initial_theta = np.random.rand(X_train.shape[1], 1) * 2 * ep - ep
alpha = 0.5
iterations = 2000
Logistic_Regression(X_train, Y_train, alpha, initial_theta, iterations)
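On the ranking itself: Logistic_Regression above never returns theta, so one option is to have it end with return theta and then sort the absolute weights against the column names. A minimal sketch, under that assumption and using the question's df (the 'intercept' label is made up here for illustration):
# Assumes Logistic_Regression was modified to end with `return theta`.
theta = Logistic_Regression(X_train, Y_train, alpha, initial_theta, iterations)

feature_names = ['intercept'] + list(df.columns[2:-1])  # matches how X was built
weights = theta.ravel()

# Sort features by |weight|; a small magnitude suggests a
# candidate for removal.
order = np.argsort(np.abs(weights))
for i in order:
    print('%-25s %8.4f' % (feature_names[i], weights[i]))

plt.barh([feature_names[i] for i in order], np.abs(weights)[order])
plt.xlabel('|coefficient|')
plt.show()
Since the features were standardized (and then min-max scaled) on roughly the same footing, the weight magnitudes are roughly comparable, but treat the ranking as a heuristic.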
I am trying to write a function for a Gibbs sampler in the Bayesian framework. I got the code from this [website][1], which implements a straightforward regression model. However, I am tackling a more complicated model: y = beta0 + beta1*x + x^gamma * sigma * epsilon, where sigma is the variance of the model. That means I need to estimate p(beta0 | y, x, beta1, sigma, gamma) and so on in the Gibbs sampler method. My question is how I should modify the code to sample beta0, beta1, and the other variables, given that there are extra variables to condition on.
My codes are:
import numpy as np
import pymc as pm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd

plt.rcParams['figure.figsize'] = (10, 5)
# installed beforehand with: conda install -c conda-forge pymc3=3.0
def sample_beta_0(y, x, beta_1, sigma, gamma, mu_0, tau_0):
    N = len(y)
    assert len(x) == N
    tau_i = 1/((x**gamma)*sigma)**2
    precision = tau_0 + sum(tau_i)
    mean = tau_0 * mu_0 + np.sum((y - beta_1 * x)*tau_i)
    mean /= precision
    return np.random.normal(mean, 1 / np.sqrt(precision))

def sample_beta_1(y, x, beta_0, sigma, mu_1, sigma_1):
    N = len(y)
    assert len(x) == N
    precision = sigma_1 + sigma * np.sum(x * x)
    mean = sigma_1 * mu_1 + sigma * np.sum((y - beta_0) * x)
    mean /= precision
    return np.random.normal(mean, 1 / np.sqrt(precision))

def sample_sigma(y, x, beta_0, beta_1, alpha, beta):
    N = len(y)
    alpha_new = alpha + N / 2
    resid = y - beta_0 - beta_1 * x
    beta_new = beta + np.sum(resid * resid) / 2
    return np.random.gamma(alpha_new, 1 / beta_new)
beta_0_true = -1
beta_1_true = 2
sigma_true = 1
N = 50
x = np.random.uniform(low=0, high=4, size=N)
y = np.random.normal(beta_0_true + beta_1_true * x, 1 / np.sqrt(sigma_true))
synth_plot = plt.plot(x, y, "o")
plt.xlabel("x")
plt.ylabel("y")
# print('Y are', y)
# print('X are', x)
plt.show()
"""GIBBS Sampler"""
# specify initial values
init = {"beta_0": 0,
"beta_1": 0,
"sigma": 2}
# specify hyper parameters
hypers = {"mu_0": 0,
"sigma_0": 1,
"mu_1": 0,
"sigma_1": 1,
"alpha": 2,
"beta": 1}
def gibbs(y, x, iters, init, hypers):
    assert len(y) == len(x)
    beta_0 = init["beta_0"]
    beta_1 = init["beta_1"]
    sigma = init["sigma"]
    trace = np.zeros((iters, 3))  # trace to store values of beta_0, beta_1, sigma
    for it in range(iters):
        beta_0 = sample_beta_0(y, x, beta_1, sigma, hypers["mu_0"], hypers["sigma_0"])
        beta_1 = sample_beta_1(y, x, beta_0, sigma, hypers["mu_1"], hypers["sigma_1"])
        sigma = sample_sigma(y, x, beta_0, beta_1, hypers["alpha"], hypers["beta"])
        trace[it, :] = np.array((beta_0, beta_1, sigma))
    trace = pd.DataFrame(trace)
    trace.columns = ['beta_0', 'beta_1', 'sigma']
    print(trace)
    return trace
iters = 1000
trace = gibbs(y, x, iters, init, hypers)
traceplot = trace.plot()
traceplot.set_xlabel("Iteration")
traceplot.set_ylabel("Parameter value")
trace_burnt = trace[500:999]
hist_plot = trace_burnt.hist(bins = 30, layout = (1,3))
print(trace_burnt.median())
print(trace_burnt.std())
I know this is really long, but please help!
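Without attempting a full derivation, here is a sketch of how sample_beta_1 could be extended to condition on gamma and sigma under the stated model y = beta0 + beta1*x + x^gamma * sigma * epsilon, mirroring the precision-weighted form already used in sample_beta_0. Treat it as an assumption-laden sketch (prior N(mu_1, 1/sigma_1), with sigma_1 read as a precision exactly as the existing code does), not a verified answer:
import numpy as np

def sample_beta_1(y, x, beta_0, sigma, gamma, mu_1, sigma_1):
    # Each observation i has noise scale x_i**gamma * sigma, so it
    # enters the full conditional with precision tau_i.
    tau_i = 1 / ((x**gamma) * sigma)**2
    precision = sigma_1 + np.sum(tau_i * x * x)
    mean = (sigma_1 * mu_1 + np.sum(tau_i * (y - beta_0) * x)) / precision
    return np.random.normal(mean, 1 / np.sqrt(precision))
sigma's full conditional changes in the same precision-weighted way, while gamma generally has no conjugate full conditional, so it would typically need a Metropolis-within-Gibbs step.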