Multivariate linear regression ends up with 'NaN' sometimes - python

I tried to implement multivariate linear regression from scratch and it works pretty well actually. When I was testing it with a toy dataset, I run into sometimes the predictions were 'NaN'. I know what are the possible NaN reasons though, I couldn't understand which one causes it in my script.
Note: with 0.0001 learning rate and 1.000.000 iterations, I got a really good line for the dataset though, when learning rate is 0.001 and the number of iterations is 1.000.000, the predictions were NaN.
Here is the script:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
class MultivariateLinearRegression():
def __init__(self, learning_rate, learning_algorithm, epoch_num):
self.learning_rate = learning_rate
self.learning_algoritm = learning_algorithm
self.epoch_num = epoch_num
self.theta = 0
self.training_sample = 0
def train(self, X, Y):
Y = Y.reshape((Y.size, 1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
bias = np.ones([X.shape[0], 1])
X = np.concatenate((X, bias), 1)
self.theta = np.zeros([X.shape[1], 1])
self.training_sample = X.shape[0]
cost_history = []
for i in range (self.epoch_num):
hypothesis =
cost_func = np.transpose(X).dot(np.subtract(hypothesis, Y))
gradient = (self.learning_rate / self.training_sample) * cost_func
self.theta = np.subtract(self.theta, gradient)
return cost_history
def predict(self, X):
X = np.array(X)
bias = np.ones([1]).reshape((1,1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
X = np.concatenate((X, bias))
return np.transpose(X).dot(self.theta)[0] # [63,1]
datas = pd.read_csv('pattern_recognition_data.txt').to_numpy()
X = datas[0:25,0]
Y = datas[0:25:,1]
X_test = datas[25:29, 0]
Y_test = datas[25:29, 1]
mlr = MultivariateLinearRegression(0.001, 'gradient descent', 1000000) # 0.0001 ve 1.000.000
mlr.train(X, Y)
Y_pred = []
for x in X_test:
print('X : ', x)
plt.plot(X, Y, 'bs')
plt.plot(X_test, Y_pred, 'r')
Thanks in advance
The dataset:


Error: mat1 and mat2 shapes cannot be multiplied (1000x10 and 1x1)

I am trying to implement Ridge Regression in pytorch, defining the loss function and plotting said function over different iterations. The only issue is, I keep getting an error code: mat1 and mat2 shapes cannot be multiplied (1000x10 and 1x1). I would like to convert the second matrix to a 1x10 in order to complete the code but I can't seem to get it to work.
import numpy as np
import matplotlib.pyplot as plt
%matplotlib inline
n = 1000
p = 10
mean = np.zeros((p))
val = 0.8
cov = np.ones((p,p))*val
cov = cov + np.eye(p)*(1-val)
X = np.random.multivariate_normal(mean, cov, n)
theta_true = np.concatenate((np.ones((5,1)), np.zeros((5,1))),axis=0)
Sigma = np.eye(n,n,k=-1)*0.4 + np.eye(n,n)*1 + np.eye(n,n,k=1)*0.4
mean = np.zeros(n)
e = np.random.multivariate_normal(mean, Sigma, 1)
y=X#theta_true + delta*e.T
import torch
X_t = torch.from_numpy(X).float()
y_t = torch.from_numpy(y).float()
Sigma_t = torch.from_numpy(Sigma).float()
import torch.nn as nn
import torch.nn.functional as F
class MyLinear(nn.Module):
def __init__(self):
super(MyLinear, self).__init__()
self.linear = nn.Linear(1, 1)
def forward(self, x):
out = self.linear(x)
return out
def L2_norm(model):
return torch.sum(list(model.parameters())[0]**2)
def L1_norm(model):
return torch.sum(torch.abs(list(model.parameters())[0]))
def ridge_loss(y_pred, y_true, model, lambda_):
mse = F.mse_loss(y_pred, y_true)
regularization = lambda_ * L2_norm(model)
return mse + regularization
import matplotlib.pyplot as plt
model = MyLinear()
optimizer = torch.optim.SGD(model.parameters(), lr=0.01)
lambda_ = 0.1
num_epochs = 1000
loss_values = []
for epoch in range(num_epochs):
y_pred = model(X_t)
loss = ridge_loss(y_pred, y_t, model, lambda_)
plt.title('Ridge Regression Loss over Iterations')
I tried changing the theta_true definition to transform the matrix but the same error occurred.
theta_true = np.concatenate((np.ones((5,1)), np.zeros((5,1)))).reshape(10, 1)
Your Linear layer in MyLinear (line 37) is what is causing the issue.
self.linear = nn.Linear(1, 1)
means 1 input channel, one output channel, but x, as you have it here has shape (1000, 10), meaning it has 10 channels. So you will need to change that line to
self.linear = nn.Linear(10, 1)
that will do the trick, here is the image I get with that change:

Logistic regression numerical problem with gradient descent

I'm teaching myself Logistic Regression and I run into a problem while implementing it from scratch. The script kinda works but it starts outputting nans and giving me "divide by zero encountered in log" warnings when I initialize it with weights bigger than around 10.
This is my code
import numpy as np
import matplotlib.pyplot as plt
import scipy
def robust_sigmoid(x):
return np.exp(-np.logaddexp(0, -x))
def predict(x, w, train = True):
z = robust_sigmoid(, x))
if train:
return z
return int(z>0.5)
def NLLloss(Y_hat, Y):
total_loss = - np.sum(np.multiply(Y, np.log(Y_hat)) + np.multiply(1 - Y, np.log(1 - Y_hat)))
return total_loss
def grad(X, Y_hat, Y):
return, (Y_hat - Y))
def GradientDescent(X, Y, alpha, iterations, threshold, weight):
Y_hat = np.array([predict(x, weight) for x in X])
loss = []
for i in range(iterations):
gradient = grad(X, Y_hat, Y)
weight = weight - alpha*gradient
Y_hat = np.array([predict(x, weight) for x in X])
new_loss = NLLloss(Y_hat, Y)
if i%10 == 0:
print("Iteration {0}, Cost: {1}".format(i, new_loss))
if i > 3 and abs(loss[-2] - loss[-1]) < threshold:
return weight
toy_dataset = np.array([[2.7810836,2.550537003,1],
for col in range(toy_dataset.shape[1]-1):
toy_dataset[:,col] -= np.average(toy_dataset[:,col])
toy_dataset[:,col] /= np.std(toy_dataset[:,col])
label = np.array([0, 0, 0, 0, 0, 1, 1, 1, 1, 1])
weight = GradientDescent(toy_dataset, label, 0.1, 100, 0.0001, [10,1.5,10])
prediction = []
for data in toy_dataset:
prediction.append(predict(weight, data, False))
I thought it was a a problem with how I implemented the sigmoid function so I went and looked for a better implementation but it didn't fix anything. I also tried to normalize my dataset but it only made the program accept slightly bigger weights.
Is there anyway to fix this?

Getting nan while working with class Linear Regression

I have a class Linear Regression and want to check how does it work with dataset load_boston. I calculated the Mean absolute percentage error (MAPE) and the result is nan.
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
class LinearRegressionSGD(BaseEstimator):
def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
epsilon: difference for the rate of change of weights
max_steps: maximum number of steps in gradient descent
w0: np.array (d,) - initial weights
alpha: learning step
self.epsilon = epsilon
self.max_steps = max_steps
self.w0 = w0
self.alpha = alpha
self.w = None
self.w_history = []
def fit(self, X, y):
X: np.array (l, d)
y: np.array (l)
output: self
l, d = X.shape
if self.w0 is None:
self.w0 = np.zeros(d)
self.w = self.w0
for step in range(self.max_steps):
w_new = self.w - self.alpha * self.calc_gradient(X, y)
if (np.linalg.norm(w_new - self.w) < self.epsilon):
self.w = w_new
return self
def predict(self, X):
X: np.array (l, d)
output: np.array (l)
if self.w is None:
raise Exception('Not trained yet')
l, d = X.shape
y_pred = []
for i in range(l):
y_pred.append([i], self.w))
return np.array(y_pred)
def calc_gradient(self, X, y):
X: np.array (l, d)
y: np.array (l)
output: np.array (d)
l, d = X.shape
gradient = []
for j in range(d):
dQ = 0
for i in range(l):
dQ += (2 / l) * X[i][j] * ([i], self.w) - y[i])
return np.array(gradient)
data = load_boston()
X = pd.DataFrame(, columns=data.feature_names)
y =
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.3, random_state=10)
def MAPE(y_true, y_pred):
y_true: np.array (l)
y_pred: np.array (l)
output: float [0, +inf)
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Task 2
sgd = LinearRegressionSGD(), y_train)
y_pred_sgd = sgd.predict(X_test)
print(MAPE(y_test, y_pred_sgd))
# Task 3
a, b = X_test.shape
w_0 = np.random.uniform(-2, 2, (b))
lr = LinearRegressionSGD(w0=w_0), y_train)
y_pred_lr = lr.predict(X_test)
print(MAPE(y_test, y_pred_lr))
But when I create X, y like below, the code works properly and MAPE gives float value
n_features = 2
n_objects = 300
num_steps = 100
w_true = np.random.normal(0, 0.1, size=(n_features, ))
w_0 = np.random.uniform(-2, 2, (n_features))
X = np.random.uniform(-5, 5, (n_objects, n_features))
y =, w_true) + np.random.normal(0, 1, (n_objects))
What is the problem with my code? and how to fix it to get the float value?
(Sorry for my bad English, its not my native language)

Linear Regression loss value increasing after each iteration of gradient descent

I am trying to implement multivariate linear regression(gradient descent and mse cost function) but the loss value keeps exponentially increasing for every iteration of gradient descent and I'm unable to figure out why?
from sklearn.datasets import load_boston
class LinearRegression:
def __init__(self):
self.X = None # The feature vectors [shape = (m, n)]
self.y = None # The regression outputs [shape = (m, 1)]
self.W = None # The parameter vector `W` [shape = (n, 1)]
self.bias = None # The bias value `b` = None # Learning Rate `alpha`
self.m = None
self.n = None
self.epochs = None
def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100, lr: float = 0.001):
self.X = X # shape (m, n)
self.m, self.n = X.shape
assert y.size == self.m and y.shape[0] == self.m
self.y = np.reshape(y, (-1, 1)) # shape (m, ) or (m, 1)
assert self.y.shape == (self.m, 1)
self.W = np.random.random((self.n, 1)) * 1e-3 # shape (n, 1)
self.bias = 0.0
self.epochs = epochs = lr
def minimize(self, verbose: bool = True):
for num_epoch in range(self.epochs):
predictions =, self.W)
assert predictions.shape == (self.m, 1)
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
self.W = self.W - * grad_w
assert self.W.shape == grad_w.shape
loss = (1 / 2 * self.m) * np.sum(np.square(predictions - self.y))
if verbose:
print(f'Epoch : {num_epoch+1}/{self.epochs} \t Loss : {loss.item()}')
linear_regression = LinearRegression()
x_train, y_train = load_boston(return_X_y=True), y_train, 10)
I'm using the boston housing dataset from sklearn.
PS. I'd like to know what's causing this issue and how to fix it and whether or not my implementation is correct.
The error is in the gradient. A divergence like that for an iterative shrinkage-thresholding algorithms (ISTA) solver is not something you should see.
For your gradient computation: X is of shape (m,n) and W of shape(n,1) so (prediction - y) is of shape (m,1) then you multiply by X on the left? (m,1) by (m,n)? Not sure what numpy is computing but it is not what you want to compute:
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
here the code should be a bit different to have a (n,m) multiply by a (m,1) in order to get a (n,1), same shape as W.
(1/self.m) * np.sum(self.X.T*(predictions-self.y) , axis=0)[:, np.newaxis]
For the derivation to be correct.
I am also not sure of why you use the dot (which is a good idea) for the prediction but not for the gradient.
You Also do not need so many reshapes:
from sklearn.datasets import load_boston
A,b = load_boston(return_X_y=True)
n_samples = A.shape[0]
n_features = A.shape[1]
def grad_linreg(x):
"""Least-squares gradient"""
grad = (1. / n_samples) *,, x) - b)
return grad
def loss_linreg(x):
"""Least-squares loss"""
f = (1. / (2. * n_samples)) * sum((b -, x)) ** 2)
return f
And then you check that your gradient is good:
from scipy.optimize import check_grad
from numpy.random import randn
You can then build the Model on that.
If you want to test that with ISTA/FISTA and Logistic/Linear Regression and LASSO/RIDGE, here is a jupyter notebook with the theory and a working example

numpy squeeze side effects

I've trained a simple machine learning model, a polynomial regression. The pseudocode of prediction function is as follows:
def f(x):
x is a np.ndarray of shape (m, )
# X is stacked of x ** 0, x ** 1, x ** 2, ..., x ** (n - 1) by rows
# X is of shape of (m, n)
# m is the number of training examples
X = generate(x)
Y =, W)
return Y
W is trained parameters. Here the shape of Y is (m, 1), but if I return Y.squeeze(), say of shape (m,), I get a very different standard deviation on the test set, say 70 for the former and 8 for the latter.
I use random initialisation, but I've trained and tested many times, the std of the squeezed version is much smaller. So I just wonder why.
I just show the complete codes below, and you can test by yourself. My questions are in line 90 and line 91
# python: 3.5.2
# encoding: utf-8
# numpy: 1.14.1
import numpy as np
import matplotlib.pyplot as plt
def load_data(filename):
xys = []
with open(filename, 'r') as f:
for line in f:
xys.append(map(float, line.strip().split()))
xs, ys = zip(*xys)
return np.asarray(xs), np.asarray(ys)
def evaluate(ys, ys_pred):
std = np.sqrt(np.mean(np.abs(ys - ys_pred) ** 2))
return std
def linear_regression(x_train, y_train, n=2, learning_rate=0.0005, epochs=1000, l2=0, Print=False):
This target function is: y = b + w1 * x^1 + w2 * x^2 + ...
also y = b +, x)
:param x_train: np.ndarray
:param y_train: np.ndarray
:return: a trained model (as a function), trained by x_train and y_train
# get the number of train e.g.
m = x_train.shape[0]
# set and initialize parameters here
# intercept
b = np.float64(-10)
# weights
w = np.float64(np.random.randn(n, 1))
# convert the x_train matrix to a design matrix
X = np.zeros((n, m), dtype=np.float64)
for i in range(n):
X[i, :] = x_train ** (i + 1)
X = np.float64(X)
Y = np.float64(np.reshape(y_train, newshape=(1, m)))
# if plot of the training process is needed
costs = []
# train on the dataset
for epoch in range(epochs):
# compute the gradient of cost on w
Z = b +, X)
dZ = Z - Y
dw = 1./m *, dZ.T)
db = 1./m * np.squeeze(np.sum(dZ))
# update the parameters, for w, I also set "weight decay"
w -= learning_rate * dw + l2 * w
b -= learning_rate * db
cost = np.squeeze(0.5/m *, dZ.T))
if Print == True and epoch % 25 == 0:
print("Cost after " + str(epoch) + " iterations " + ": " + str(cost))
# plot the costs
if Print == True:
def pred(x):
assert type(x) is np.ndarray
m = x.shape[0]
# convert the x_train matrix to a design matrix
X = np.zeros((n, m))
for i in range(n):
X[i, :] = x ** (i + 1)
# to predict
Y = b +, X)
return Y.T
# return Y.squeeze()
return pred
if __name__ == '__main__':
train_file = 'train.txt'
test_file = 'test.txt'
# load data
x_train, y_train = load_data(train_file)
x_test, y_test = load_data(test_file)
# use a trained linear-regression model
f = linear_regression(x_train, y_train, n=2, epochs=10000, Print=False, learning_rate=1e-8, l2=5e-2)
# compute the predictions
y_test_pred = f(x_test)
# use the test set to evaluate the model
std = evaluate(y_test, y_test_pred)
print('the standard deviation:{:.1f}'.format(std))
# show the result
plt.plot(x_train, y_train, 'ro', markersize=3)
plt.plot(x_test, y_test, 'k')
plt.plot(x_test, y_test_pred)
plt.title('Linear Regression')
plt.legend(['train', 'test', 'pred'])

