I'm teaching myself Logistic Regression and I run into a problem while implementing it from scratch. The script kinda works but it starts outputting nans and giving me "divide by zero encountered in log" warnings when I initialize it with weights bigger than around 10.
This is my code
`
import numpy as np
import matplotlib.pyplot as plt
import scipy
def robust_sigmoid(x):
return np.exp(-np.logaddexp(0, -x))
def predict(x, w, train = True):
z = robust_sigmoid(np.dot(w, x))
if train:
return z
else:
return int(z>0.5)
def NLLloss(Y_hat, Y):
total_loss = - np.sum(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
return total_loss
def grad(X, Y_hat, Y):
return np.dot(X.T, (Y_hat - Y))
def GradientDescent(X, Y, alpha, iterations, threshold, weight):
Y_hat = np.array([predict(x, weight) for x in X])
loss = []
for i in range(iterations):
gradient = grad(X, Y_hat, Y)
weight = weight - alpha*gradient
Y_hat = np.array([predict(x, weight) for x in X])
new_loss = NLLloss(Y_hat, Y)
loss.append(new_loss)
if i%10 == 0:
print("Iteration {0}, Cost: {1}".format(i, new_loss))
if i > 3 and abs(loss[-2] - loss[-1]) < threshold:
break
return weight
toy_dataset = np.array([[2.7810836,2.550537003,1],
[1.465489372,2.362125076,1],
[3.396561688,4.400293529,1],
[1.38807019,1.850220317,1],
[3.06407232,3.005305973,1],
[7.627531214,2.759262235,1],
[5.332441248,2.088626775,1],
[6.922596716,1.77106367,1],
[8.675418651,-0.242068655,1],
[7.673756466,3.508563011,1]])
for col in range(toy_dataset.shape[1]-1):
toy_dataset[:,col] -= np.average(toy_dataset[:,col])
toy_dataset[:,col] /= np.std(toy_dataset[:,col])
print(toy_dataset)
label = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
weight = GradientDescent(toy_dataset, label, 0.1, 100, 0.0001, [10,1.5,10])
print(weight)
prediction = []
for data in toy_dataset:
prediction.append(predict(weight, data, False))
print(prediction)
`
I thought it was a a problem with how I implemented the sigmoid function so I went and looked for a better implementation but it didn't fix anything. I also tried to normalize my dataset but it only made the program accept slightly bigger weights.
Is there anyway to fix this?
Related
I have a class Linear Regression and want to check how does it work with dataset load_boston. I calculated the Mean absolute percentage error (MAPE) and the result is nan.
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
warnings.filterwarnings('ignore')
class LinearRegressionSGD(BaseEstimator):
def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
'''
epsilon: difference for the rate of change of weights
max_steps: maximum number of steps in gradient descent
w0: np.array (d,) - initial weights
alpha: learning step
'''
self.epsilon = epsilon
self.max_steps = max_steps
self.w0 = w0
self.alpha = alpha
self.w = None
self.w_history = []
def fit(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: self
"""
l, d = X.shape
if self.w0 is None:
self.w0 = np.zeros(d)
self.w = self.w0
for step in range(self.max_steps):
self.w_history.append(self.w)
w_new = self.w - self.alpha * self.calc_gradient(X, y)
if (np.linalg.norm(w_new - self.w) < self.epsilon):
break
self.w = w_new
return self
def predict(self, X):
"""
X: np.array (l, d)
---
output: np.array (l)
"""
if self.w is None:
raise Exception('Not trained yet')
l, d = X.shape
y_pred = []
for i in range(l):
y_pred.append(np.dot(X[i], self.w))
return np.array(y_pred)
def calc_gradient(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: np.array (d)
"""
l, d = X.shape
gradient = []
for j in range(d):
dQ = 0
for i in range(l):
dQ += (2 / l) * X[i][j] * (np.dot(X[i], self.w) - y[i])
gradient.append(dQ)
return np.array(gradient)
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.3, random_state=10)
def MAPE(y_true, y_pred):
"""
y_true: np.array (l)
y_pred: np.array (l)
---
output: float [0, +inf)
"""
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Task 2
sgd = LinearRegressionSGD()
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
print(MAPE(y_test, y_pred_sgd))
# Task 3
a, b = X_test.shape
w_0 = np.random.uniform(-2, 2, (b))
lr = LinearRegressionSGD(w0=w_0)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(MAPE(y_test, y_pred_lr))
But when I create X, y like below, the code works properly and MAPE gives float value
n_features = 2
n_objects = 300
num_steps = 100
np.random.seed(1)
w_true = np.random.normal(0, 0.1, size=(n_features, ))
w_0 = np.random.uniform(-2, 2, (n_features))
X = np.random.uniform(-5, 5, (n_objects, n_features))
y = np.dot(X, w_true) + np.random.normal(0, 1, (n_objects))
What is the problem with my code? and how to fix it to get the float value?
(Sorry for my bad English, its not my native language)
I tried to implement logistic regression only with numpy in Python, but the result is not satisfying. The predictions seems incorrect and loss is not improving so it is probably something wrong with the code. Does anyone know what could fix it? Thank you very much!
Here is algorithm:
import numpy as np
# training data and labels
X = np.concatenate((np.random.normal(0.25, 0.1, 50), np.random.normal(0.75, 0.1, 50)), axis=None)
Y = np.concatenate((np.zeros((50,), dtype=np.int32), np.ones((50,), dtype=np.int32)), axis=None)
def logistic_sigmoid(a):
return 1 / (1 + np.exp(-a))
# forward pass
def forward_pass(w, x):
return logistic_sigmoid(w * x)
# gradient computation
def backward_pass(x, y, y_real):
return np.sum((y - y_real) * x)
# computing loss
def loss(y, y_real):
return -np.sum(y_real * np.log(y) + (1 - y_real) * np.log(1 - y))
# training
def train():
w = 0.0
learning_rate = 0.01
i = 200
test_number = 0.3
for epoch in range(i):
y = forward_pass(w, X)
gradient = backward_pass(X, y, Y)
w = w - learning_rate * gradient
print(f'epoch {epoch + 1}, x = {test_number}, y = {forward_pass(w, test_number):.3f}, loss = {loss(y, Y):.3f}')
train()
At first glance you are missing you intercept term (typically called b_0, or bias) and its gradient update. Also in the backward_pass and loss calculations you are not dividing by the amount of data samples.
You can see two examples of how to implement it from scratch here:
1: Example based on Andrew Ng explanations in the Machine Learning course in Coursera
2: Implementation of Jason Brownlee from Machine Learning mastery website
I am trying to implement multivariate linear regression(gradient descent and mse cost function) but the loss value keeps exponentially increasing for every iteration of gradient descent and I'm unable to figure out why?
from sklearn.datasets import load_boston
class LinearRegression:
def __init__(self):
self.X = None # The feature vectors [shape = (m, n)]
self.y = None # The regression outputs [shape = (m, 1)]
self.W = None # The parameter vector `W` [shape = (n, 1)]
self.bias = None # The bias value `b`
self.lr = None # Learning Rate `alpha`
self.m = None
self.n = None
self.epochs = None
def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100, lr: float = 0.001):
self.X = X # shape (m, n)
self.m, self.n = X.shape
assert y.size == self.m and y.shape[0] == self.m
self.y = np.reshape(y, (-1, 1)) # shape (m, ) or (m, 1)
assert self.y.shape == (self.m, 1)
self.W = np.random.random((self.n, 1)) * 1e-3 # shape (n, 1)
self.bias = 0.0
self.epochs = epochs
self.lr = lr
self.minimize()
def minimize(self, verbose: bool = True):
for num_epoch in range(self.epochs):
predictions = np.dot(self.X, self.W)
assert predictions.shape == (self.m, 1)
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
self.W = self.W - self.lr * grad_w
assert self.W.shape == grad_w.shape
loss = (1 / 2 * self.m) * np.sum(np.square(predictions - self.y))
if verbose:
print(f'Epoch : {num_epoch+1}/{self.epochs} \t Loss : {loss.item()}')
linear_regression = LinearRegression()
x_train, y_train = load_boston(return_X_y=True)
linear_regression.fit(x_train, y_train, 10)
I'm using the boston housing dataset from sklearn.
PS. I'd like to know what's causing this issue and how to fix it and whether or not my implementation is correct.
Thanks
The error is in the gradient. A divergence like that for an iterative shrinkage-thresholding algorithms (ISTA) solver is not something you should see.
For your gradient computation: X is of shape (m,n) and W of shape(n,1) so (prediction - y) is of shape (m,1) then you multiply by X on the left? (m,1) by (m,n)? Not sure what numpy is computing but it is not what you want to compute:
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
here the code should be a bit different to have a (n,m) multiply by a (m,1) in order to get a (n,1), same shape as W.
(1/self.m) * np.sum(self.X.T*(predictions-self.y) , axis=0)[:, np.newaxis]
For the derivation to be correct.
I am also not sure of why you use the dot (which is a good idea) for the prediction but not for the gradient.
You Also do not need so many reshapes:
from sklearn.datasets import load_boston
A,b = load_boston(return_X_y=True)
n_samples = A.shape[0]
n_features = A.shape[1]
def grad_linreg(x):
"""Least-squares gradient"""
grad = (1. / n_samples) * np.dot(A.T, np.dot(A, x) - b)
return grad
def loss_linreg(x):
"""Least-squares loss"""
f = (1. / (2. * n_samples)) * sum((b - np.dot(A, x)) ** 2)
return f
And then you check that your gradient is good:
from scipy.optimize import check_grad
from numpy.random import randn
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
You can then build the Model on that.
If you want to test that with ISTA/FISTA and Logistic/Linear Regression and LASSO/RIDGE, here is a jupyter notebook with the theory and a working example
I managed to successfully implement multilinear regression using only numpy for Iris dataset. I wanted to do the same for
boston houses data set but my model won't learn and I have no idea why.
import pandas as pd
# read data and split into test and training sets
data = pd.read_csv('train.csv')
data = (data - data.mean()) / data.std() # normalize data
split_data = np.random.rand(len(data)) < 0.8
train_data = data[split_data].round(5)
test_data = data[~split_data]
# create matrices
input_features_train = train_data.drop(['ID', 'medv'], 1).values
output_feature_train = train_data.medv.values.reshape(-1, 1)
ones = np.ones([input_features_train.shape[0], 1])
input_features_train = np.concatenate((ones, input_features_train), 1)
weight = np.zeros([1, 14])
def computeCost(X, y, theta):
summed = np.power(((X # theta.T) - y), 2)
return np.sum(summed) / (2 * len(X))
def gradientDescent(X, y, theta, iters, alpha):
costs = np.zeros(iters)
for i in range(iters):
theta = theta - (alpha / len(X)) * np.sum(X * (X # theta.T - y), 0)
costs[i] = computeCost(X, y, theta)
return theta, costs
learning_rate = 0.01
iterations = 100000
weights, cost = gradientDescent(input_features_train, output_feature_train, weight, iterations, learning_rate)
print("Weights: ", weights)
finalCost = computeCost(input_features_train, output_feature_train, weights)
# test
input_features_test = test_data.drop(['ID', 'medv'], 1).values
output_feature_test = test_data.medv.values.reshape(-1, 1)
ones = np.ones([input_features_test.shape[0], 1])
input_features_test = np.concatenate((ones, input_features_test), 1)
def test_data(input_features, output_feature, weights):
predictions = np.round(np.dot(input_features, weights.T))
for i in range(len(output_feature)):
predicted = predictions[i]
success = predictions[i] == output_feature[i]
print('For features: ', input_features[i], ' housing price should be ', output_feature[i])
print("Predicted: %f" % predicted)
print("Is success? ", success)
print()
test_data(input_features_test, output_feature_test, weights)
predictions = np.round(np.dot(input_features_test, weights.T))
accuracy = (sum(predictions == output_feature_test) / float(len(output_feature_test)) * 100)[0]
print("Accuracy of the model is ", accuracy, "% after ", iterations, "iterations")
example output goes as follow
Weights: [[ 0.01465871 -0.11583742 0.17729105 0.01249782 0.09822299 -0.31249182
0.25208063 -0.00937766 -0.48751822 0.46772537 -0.27637035 -0.1590125
0.12926108 -0.48910136]]
For features: [ 1. -0.44852959 -0.47141352 0.09095532 -0.25240023 0.13793157
0.46506236 0.03105118 -0.62153314 -0.98758424 -0.79769195 1.18594974
0.37563165 -0.40259248] housing price should be [-0.04019949]
Predicted: 0.000000
Is success? [False]
I tried even 10000000 iterations and still it fails all tests and has 0% accuracy. On iris data set I managed to get 100% with this model so I don't understand why it won't work.
I suspect it might be something with data normalization as without it I get RuntimeWarning: overflow encountered in power
summed = np.power(((X # theta.T) - y), 2) error which I also don't know why is happening.
Could you please point me in the right direction ? Thanks!
I really suggest you to use scikit learn for this. You can use SGD Regressor,or Cat Boost Regressor which have inbuilt support required for approaches like these.
The main motive behind this suggestion is using gradient descent manually may lead to some logical error which may go undetected.
Try to solve with scikit learn.That might help.
I tried to implement multivariate linear regression from scratch and it works pretty well actually. When I was testing it with a toy dataset, I run into sometimes the predictions were 'NaN'. I know what are the possible NaN reasons though, I couldn't understand which one causes it in my script.
Note: with 0.0001 learning rate and 1.000.000 iterations, I got a really good line for the dataset though, when learning rate is 0.001 and the number of iterations is 1.000.000, the predictions were NaN.
Here is the script:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
class MultivariateLinearRegression():
#constructor
def __init__(self, learning_rate, learning_algorithm, epoch_num):
self.learning_rate = learning_rate
self.learning_algoritm = learning_algorithm
self.epoch_num = epoch_num
self.theta = 0
self.training_sample = 0
def train(self, X, Y):
Y = Y.reshape((Y.size, 1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
bias = np.ones([X.shape[0], 1])
X = np.concatenate((X, bias), 1)
self.theta = np.zeros([X.shape[1], 1])
self.training_sample = X.shape[0]
cost_history = []
for i in range (self.epoch_num):
hypothesis = X.dot(self.theta)
cost_func = np.transpose(X).dot(np.subtract(hypothesis, Y))
gradient = (self.learning_rate / self.training_sample) * cost_func
self.theta = np.subtract(self.theta, gradient)
cost_history.append(self.theta)
return cost_history
def predict(self, X):
X = np.array(X)
bias = np.ones([1]).reshape((1,1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
X = np.concatenate((X, bias))
return np.transpose(X).dot(self.theta)[0] # [63,1]
datas = pd.read_csv('pattern_recognition_data.txt').to_numpy()
X = datas[0:25,0]
Y = datas[0:25:,1]
X_test = datas[25:29, 0]
Y_test = datas[25:29, 1]
mlr = MultivariateLinearRegression(0.001, 'gradient descent', 1000000) # 0.0001 ve 1.000.000
mlr.train(X, Y)
Y_pred = []
for x in X_test:
print('X : ', x)
Y_pred.append(mlr.predict([x]))
plt.plot(X, Y, 'bs')
plt.plot(X_test, Y_pred, 'r')
Thanks in advance
The dataset:
39,144
47,220
45,138
47,145
65,162
46,142
67,170
42,124
67,158
56,154
64,162
56,150
59,140
34,110
42,128
48,130
45,135
17,114
20,116
19,124
36,136
50,142
39,120
21,120
44,160
53,158
63,144
29,130
25,125
69,175