I wrote a linear regression from scratch, but the loss is increasing. My data are the areas and the prices (as labels) from the Houston housing dataset. I tried multiple learning rates (from 10 down to 0.00000000001), but it's still not working. With every epoch, my fitted line moves further away from the data points. I guess there must be something wrong with the functions, but I can't figure out what.
Here is an example of the loss:
loss: 0.5977188541860982
loss: 0.6003449724263221
loss: 0.6029841845821928
loss: 0.6056365560589673
loss: 0.6083021525886172
loss: 0.6109810402314608
loss: 0.6136732853778034
loss: 0.6163789547495854
loss: 0.6190981154020385
loss: 0.6218308347253524
loss: 0.6245771804463445
And here the code:
from preprocessing import load_csv
import pandas as pd
import numpy as np
import random
import matplotlib.pyplot as plt
# mean squared error
def MSE(y_prediction, y_true, deriv=(False, 1)):
    """Return the mean squared error, or a gradient-style term when requested.

    Args:
        y_prediction: Predicted values.
        y_true: Target values.
        deriv: A ``(flag, inner)`` pair. When ``flag`` is truthy, ``inner`` is
            the derivative of the fit function with respect to the parameter
            of interest, and the gradient-style term is returned instead of
            the loss.

    Returns:
        The mean of the squared residuals, or ``2 * mean(residual * inner)``.
    """
    residual = np.subtract(y_true, y_prediction)
    if not deriv[0]:
        return np.mean(np.square(residual))
    # NOTE(review): the residual is y_true - y_prediction, so this value is the
    # negative of d(MSE)/d(parameter). A caller that subtracts lr * value from
    # a parameter therefore moves uphill on the loss.
    return 2 * np.mean(residual * deriv[1])
# linear function
def fit_function(theta_0, theta_1, x):
    """Evaluate the straight line with intercept ``theta_0`` and slope ``theta_1`` at ``x``."""
    slope_term = theta_1 * x
    return theta_0 + slope_term
# train model
def train(dataset, epochs=10, lr=0.01):
    """Fit a line to living area vs. sale price and plot the result.

    Args:
        dataset: Object indexable by the column names "GrLivArea" and
            "SalePrice", each yielding something with a ``tolist()`` method.
        epochs: Number of parameter updates to run.
        lr: Step size applied to each parameter update.
    """
    # Load the first 100 rows and scale each column by its own maximum.
    areas = np.array(dataset["GrLivArea"].tolist()[:100])
    prices = np.array(dataset["SalePrice"].tolist()[:100])
    x = areas / max(areas)
    y = prices / max(prices)

    # Random starting point: intercept within the label range, slope in [-1, 1].
    theta_0 = random.uniform(min(y), max(y))
    theta_1 = random.uniform(-1, 1)

    for epoch in range(epochs):
        predictions = fit_function(theta_0, theta_1, x)
        loss = MSE(predictions, y)

        # Both steps use the same predictions, so the parameters are updated
        # simultaneously rather than one after the other.
        step_theta_0 = lr * MSE(predictions, y, deriv=(True, 1))
        step_theta_1 = lr * MSE(predictions, y, deriv=(True, x))
        theta_0 -= step_theta_0
        theta_1 -= step_theta_1

        print("\nloss:", loss)

    # Plot the data and the last computed fit; the points are sorted by x so
    # the dashed line is drawn left to right.
    plt.style.use("ggplot")
    plt.scatter(x, y)
    ordered = sorted(zip(x, predictions))
    x, predictions = map(list, zip(*ordered))
    plt.plot(x, predictions, "b--")
    plt.show()
train(load_csv("dataset/houston_housing/single_variable_dataset/train.csv"), epochs=500, lr=0.001)
Here is the plot after 500 epochs.
Thanks for your help :)
Quite an old post, but I thought I'd give an answer anyway.
You flipped the sign on the MSE derivative:
def MSE(y_prediction, y_true, deriv=(False, 1)):
    """Return the mean squared error, or its gradient when requested.

    Args:
        y_prediction: Predicted values.
        y_true: Target values.
        deriv: A ``(flag, inner)`` pair. When ``flag`` is truthy, ``inner`` is
            the derivative of the prediction with respect to the parameter of
            interest, and the gradient of the loss is returned.

    Returns:
        The mean of the squared residuals, or
        ``2 * mean((y_prediction - y_true) * inner)``.
    """
    if not deriv[0]:
        return np.mean(np.square(np.subtract(y_true, y_prediction)))
    # d/dp (pred - true)^2 = 2 * (pred - true) * d(pred)/dp
    error = np.subtract(y_prediction, y_true)
    return 2 * np.mean(error * deriv[1])
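The partial derivatives w.r.t. your parameters are:
$$\frac{\partial\,\mathrm{MSE}}{\partial \theta_0} = \frac{2}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right), \qquad \frac{\partial\,\mathrm{MSE}}{\partial \theta_1} = \frac{2}{n}\sum_{i=1}^{n}\left(\hat{y}_i - y_i\right)x_i$$
where $\hat{y}_i = \theta_0 + \theta_1 x_i$ is the prediction for sample $i$.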
For conciseness:
def MSE(y_prediction, y_true, deriv=None):
    """Return the mean squared error, or its gradient when ``deriv`` is given.

    Args:
        y_prediction: Predicted values.
        y_true: Target values.
        deriv: Derivative of the prediction with respect to the parameter of
            interest, or ``None`` to get the loss itself.

    Returns:
        ``mean((y_prediction - y_true) ** 2)`` when ``deriv`` is ``None``,
        otherwise ``2 * mean((y_prediction - y_true) * deriv)``.
    """
    error = y_prediction - y_true
    if deriv is None:
        return np.mean(error**2)
    return 2 * np.mean(error * deriv)
Which allows you to get the derivatives without passing in a tuple with a flag:
delta_theta_0 = MSE(predictions, y, deriv=1)
delta_theta_1 = MSE(predictions, y, deriv=x)
Here's an example using sklearn.datasets.load_boston, with LSTAT (lower status of the population) as the input feature and MEDV (median value of owner-occupied homes in $1000's) as the target — the last two columns of the dataset.
Trained with epochs=10000 and lr=0.001:
Related
Say that, for some reason, I want to fit a linear regression using PyTorch, as illustrated below.
How could I compute the Hessian matrix of the model to, ultimately, compute the standard error for my parameter estimates?
import torch
import torch.nn as nn
# set seed
torch.manual_seed(42)
# define the model
class OLS_pytorch(nn.Module):
    """Ordinary least squares ``Y = X @ beta + intercept`` fitted by gradient descent.

    The training data are stored on the module, so ``forward`` takes no
    arguments and always predicts on ``self.X``.
    """

    def __init__(self, X, Y):
        """Store the data and create the parameters, all initialised to one.

        Args:
            X: Design matrix; ``X.shape[1]`` is used as the number of slopes.
            Y: Targets compared against ``forward()`` by ``nn.MSELoss``.
        """
        super(OLS_pytorch, self).__init__()
        self.X = X
        self.Y = Y
        # nn.Parameter enables gradients by default, so the data tensors do
        # not need requires_grad=True themselves.
        self.beta = nn.Parameter(torch.ones(X.shape[1], 1))
        self.intercept = nn.Parameter(torch.ones(1))
        self.loss = nn.MSELoss()

    def forward(self):
        """Return the model's predictions for the stored design matrix."""
        # Matrix product of the data with the slopes; the intercept broadcasts
        # over the rows.
        return self.X @ self.beta + self.intercept

    def fit(self, lr=0.01, epochs=1000):
        """Minimise the MSE with Adam and return ``self``.

        Args:
            lr: Learning rate passed to ``torch.optim.Adam``.
            epochs: Number of full-batch optimisation steps.

        Returns:
            The fitted model, to allow call chaining.
        """
        optimizer = torch.optim.Adam(self.parameters(), lr=lr)
        for epoch in range(epochs):
            optimizer.zero_grad()
            loss = self.loss(self.forward(), self.Y)
            loss.backward()
            optimizer.step()
            # Report progress on every tenth epoch only.
            if epoch % 10 == 0:
                print(f"Epoch {epoch} loss: {loss.item()}")
        return self
Generating some data and using the model
# Generate some data
X = torch.randn(100, 1)
Y = 2 * X + 3 + torch.randn(100, 1)
# fit the model
model = OLS_pytorch(X, Y)
model.fit()
#extract parameters
model.beta, model.intercept
#Epoch 980 loss: 0.7803605794906616
#Epoch 990 loss: 0.7803605794906616
#(Parameter containing:
# tensor([[2.0118]], requires_grad=True),
# Parameter containing:
# tensor([3.0357], requires_grad=True))
For instance, in R, using the same data and the lm() function, I recover the same parameters, but I am also able to recover the Hessian matrix, and then I am able to compute standard errors.
ols <- lm(Y ~ X, data = xy)
ols$coefficients
#(Intercept) X
# 3.035674 2.011811
vcov(ols)
# (Intercept) X
# (Intercept) 0.0079923921 -0.0004940884
# X -0.0004940884 0.0082671053
summary(ols)
# Coefficients:
# Estimate Std. Error t value Pr(>|t|)
# (Intercept) 3.03567 0.08940 33.96 <2e-16 ***
# X 2.01181 0.09092 22.13 <2e-16 ***
# ---
# Signif. codes: 0 ‘***’ 0.001 ‘**’ 0.01 ‘*’ 0.05 ‘.’ 0.1 ‘ ’ 1
UPDATE: Using the answer from @cherrywoods,
here is how you would match the standard errors produced by lm() in R:
from torch.autograd.functional import hessian

# Number of observations and of estimated parameters (slopes + intercept).
N = model.X.shape[0]
n_slopes = model.beta.numel()
n_params = n_slopes + model.intercept.numel()

# predict
with torch.no_grad():
    y_pred = model.X @ model.beta + model.intercept
    # Residual variance estimate: residual sum of squares over the residual
    # degrees of freedom.
    sigma_hat = torch.sum((y_pred - model.Y) ** 2) / (N - n_params)


def loss(beta, intercept):
    """MSE of the linear model as an explicit function of its parameters."""
    y_pred = model.X @ beta + intercept
    return model.loss(y_pred, model.Y)


# With a tuple of inputs, hessian() returns a nested tuple of blocks; block
# [i][j] has shape inputs[i].shape + inputs[j].shape. Assemble them into one
# (n_params x n_params) matrix ordered as (beta, intercept).
blocks = hessian(loss, (model.beta, model.intercept))
H = torch.cat(
    [
        torch.cat(
            [
                blocks[0][0].reshape(n_slopes, n_slopes),
                blocks[0][1].reshape(n_slopes, 1),
            ],
            dim=1,
        ),
        torch.cat(
            [
                blocks[1][0].reshape(1, n_slopes),
                blocks[1][1].reshape(1, 1),
            ],
            dim=1,
        ),
    ]
)

# The loss is the *mean* squared error, so H = (2 / N) * Z'Z with Z = [X, 1].
# The OLS covariance sigma^2 * (Z'Z)^-1 therefore needs the factor N as well
# as the 1/2; without it the standard errors come out sqrt(N) times too large.
vcov = sigma_hat * torch.inverse(N * H / 2)
std_err = torch.sqrt(torch.diag(vcov))
print(std_err)
# Expected for the data above (order: X, intercept), matching summary(ols):
#tensor([0.0909, 0.0894])
You can compute the Hessian using torch.autograd.functional.hessian.
from torch.autograd.functional import hessian
def loss(beta, intercept):
    """MSE of the linear model as an explicit function of its parameters.

    hessian() differentiates with respect to the function's arguments, so the
    parameters are passed in rather than read from the model.
    """
    y_pred = model.X @ beta + intercept
    return model.loss(y_pred, model.Y)


# With a tuple of inputs the result is a nested tuple: H[i][j] holds the second
# derivatives with respect to (input i, input j).
H = hessian(loss, (model.beta, model.intercept))
I created a model class which is a subclass of keras.Model. While training the model, I want to change the weights of the loss functions after some epochs. In order to do that I created boolean variables to my model indicating that the model should start training with additional loss function. I add a pseudo code that mainly shows what I am trying to achieve.
class MyModel(keras.Model):
self.start_loss_2 = False
def train_step(self):
# Check if training with loss_2 started
weight_loss_2 = 0.0
if self.start_loss_2:
weight_loss_2 = 0.5
# Pass the data through model
# Calculate two loss values
total_loss = loss_1 + weight_loss_2 * loss_2
# Calculate gradients with tf.Tape
# Update variables
# This is called via Callback after each epoch
def epoch_finised(epoch_num):
if epoch_num > START_LOSS_2:
self.start_loss_2 = True
My question is:
Is it valid to use an if-else statement whose condition changes after some time? If it is not, how can I achieve this?
Yes. You can create a tf.Variable and then assign a new value to it based on some training criteria.
Example:
import numpy as np
import tensorflow as tf
# simple toy network
x_in = tf.keras.Input((10))
x = tf.keras.layers.Dense(25)(x_in)
x_out = tf.keras.layers.Dense(1)(x)
# model
m = tf.keras.Model(x_in, x_out)
# fake data
X = tf.random.normal((100, 10))
y0 = tf.random.normal((100, ))
y1 = tf.random.normal((100, ))
# optimizer
m_opt = tf.keras.optimizers.Adam(1e-2)
# prep data
ds = tf.data.Dataset.from_tensor_slices((X, y0, y1))
ds = ds.repeat().batch(5)
train_iter = iter(ds)
# toy loss function that uses a weight
def loss_fn(y_true0, y_true1, y_pred, weight):
mse = tf.keras.losses.MSE
mse_0 = tf.math.reduce_mean(mse(y_true0, y_pred))
mse_1 = tf.math.reduce_mean(mse(y_true1, y_pred))
return mse_0 + weight * mse_1
NUM_EPOCHS = 4
NUM_BATCHES_PER_EPOCH = 10
START_NEW_LOSS_AT_GLOBAL_STEP = 20
# the weight variable set to 0 initially and then
# will be changed after a certain number of steps
# (or some other training criteria)
w = tf.Variable(0.0, trainable=False)
for epoch in range(NUM_EPOCHS):
losses = []
for batch in range(NUM_BATCHES_PER_EPOCH):
X_train, y0_train, y1_train = next(train_iter)
with tf.GradientTape() as tape:
y_hat = m(X_train)
loss = loss_fn(y0_train, y1_train, y_hat, w)
losses.append(loss)
m_vars = m.trainable_variables
m_grads = tape.gradient(loss, m_vars)
m_opt.apply_gradients(zip(m_grads, m_vars))
print(f"epoch: {epoch}\tloss: {np.mean(losses):.4f}")
losses = []
# if the criteria is met assign a huge number to see if the
# loss spikes up
if (epoch + 1) * (batch + 1) >= START_NEW_LOSS_AT_GLOBAL_STEP:
w.assign(10000.0)
# epoch: 0 loss: 1.8226
# epoch: 1 loss: 1.1143
# epoch: 2 loss: 8788.2227 <= looks like assign worked
# epoch: 3 loss: 10999.5449
I tried to implement logistic regression with only NumPy in Python, but the result is not satisfying. The predictions seem incorrect and the loss is not improving, so there is probably something wrong with the code. Does anyone know what could fix it? Thank you very much!
Here is algorithm:
import numpy as np
# training data and labels
X = np.concatenate((np.random.normal(0.25, 0.1, 50), np.random.normal(0.75, 0.1, 50)), axis=None)
Y = np.concatenate((np.zeros((50,), dtype=np.int32), np.ones((50,), dtype=np.int32)), axis=None)
def logistic_sigmoid(a):
    """Map ``a`` to the interval (0, 1) with the logistic function 1 / (1 + e^-a)."""
    denominator = 1 + np.exp(-a)
    return 1 / denominator
# forward pass
def forward_pass(w, x):
    """Return the model output: the logistic sigmoid of the weighted input ``w * x``.

    There is no intercept term; the weight ``w`` is the only parameter.
    """
    activation = w * x
    return logistic_sigmoid(activation)
# gradient computation
def backward_pass(x, y, y_real):
    """Return the gradient of the summed cross-entropy loss with respect to the weight.

    Args:
        x: Inputs.
        y: Model outputs for ``x``.
        y_real: Target labels.

    Returns:
        The sum over samples of ``(y - y_real) * x``; it is not divided by the
        number of samples.
    """
    residual = y - y_real
    return np.sum(residual * x)
# computing loss
def loss(y, y_real):
    """Return the binary cross-entropy between outputs ``y`` and labels ``y_real``.

    The per-sample terms are summed, not averaged.
    """
    positive_term = y_real * np.log(y)
    negative_term = (1 - y_real) * np.log(1 - y)
    return -np.sum(positive_term + negative_term)
# training
def train():
    """Run 200 full-batch gradient-descent steps on the module-level ``X`` and ``Y``.

    After each step, prints the prediction for a fixed probe input together
    with the loss of the outputs computed before that step's update.
    """
    w = 0.0
    learning_rate = 0.01
    num_epochs = 200
    test_number = 0.3  # probe input used only for the progress message

    for epoch in range(num_epochs):
        y = forward_pass(w, X)
        w -= learning_rate * backward_pass(X, y, Y)
        print(f'epoch {epoch + 1}, x = {test_number}, y = {forward_pass(w, test_number):.3f}, loss = {loss(y, Y):.3f}')
train()
At first glance, you are missing your intercept term (typically called b_0, or the bias) and its gradient update. Also, in the backward_pass and loss calculations you are not dividing by the number of data samples.
You can see two examples of how to implement it from scratch here:
1: Example based on Andrew Ng explanations in the Machine Learning course in Coursera
2: Implementation of Jason Brownlee from Machine Learning mastery website
I'm trying to build a neural network on the Mnist dataset for a HW assignment. I'm not asking anyone to DO the assignment for me, I'm just having trouble figuring out why the Training accuracy and Test Accuracy seem to be static for every epoch?
It's as if my way of updating weights is not working.
Epoch: 0, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 1, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 2, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
Epoch: 3, Train Accuracy: 10.22%, Train Cost: 3.86, Test Accuracy: 10.1%
.
.
.
However, when I run the actual forward and backprop lines in a loop without any 'fluff' of classes or methods the cost goes down. I just can't seem to get it working in the current class setup.
I've tried building my own methods that pass the weights and biases between the backprop and feed-forward methods explicitly, however, those changes haven't done anything to fix this gradient descent issue.
I'm pretty sure it has to do with the definition of the backprop method in the NeuralNetwork class below. I've been struggling to find a way to update the weights by accessing the weight and bias variables in the main training loop.
def backward(self, Y_hat, Y):
'''
Backward pass through network. Update parameters
INPUT
Y_hat: Network predicted
shape: (?, 10)
Y: Correct target
shape: (?, 10)
RETURN
cost: calculate J for errors
type: (float)
'''
#Naked Backprop
dJ_dZ2 = Y_hat - Y
dJ_dW2 = np.matmul(np.transpose(X2), dJ_dZ2)
dJ_db2 = Y_hat - Y
dJ_dX2 = np.matmul(dJ_db2, np.transpose(NeuralNetwork.W2))
dJ_dZ1 = dJ_dX2 * d_sigmoid(Z1)
inner_mat = np.matmul(Y-Y_hat,np.transpose(NeuralNetwork.W2))
dJ_dW1 = np.matmul(np.transpose(X),inner_mat) * d_sigmoid(Z1)
dJ_db1 = np.matmul(Y - Y_hat, np.transpose(NeuralNetwork.W2)) * d_sigmoid(Z1)
lr = 0.1
# weight updates here
#just line 'em up and do lr * the dJ_.. vars you found above
NeuralNetwork.W2 = NeuralNetwork.W2 - lr * dJ_dW2
NeuralNetwork.b2 = NeuralNetwork.b2 - lr * dJ_db2
NeuralNetwork.W1 = NeuralNetwork.W1 - lr * dJ_dW1
NeuralNetwork.b1 = NeuralNetwork.b1 - lr * dJ_db1
# calculate the cost
cost = -1 * np.sum(Y * np.log(Y_hat))
# calc gradients
# weight updates
return cost#, W1, W2, b1, b2
I'm really at a loss here, any help is appreciated!
Full code is shown here...
import keras
import numpy as np
import matplotlib.pyplot as plt
from keras.datasets import mnist
np.random.seed(0)
"""### Load MNIST Dataset"""
(x_train, y_train), (x_test, y_test) = mnist.load_data()
X = x_train[0].reshape(1,-1)/255.; Y = y_train[0]
zeros = np.zeros(10); zeros[Y] = 1
Y = zeros
#Here we implement the forward pass for the network using the single example, $X$, from above
### Initialize weights and Biases
num_hidden_nodes = 200
num_classes = 10
# init weights
#first set of weights (these are what the input matrix is multiplied by)
W1 = np.random.uniform(-1e-3,1e-3,size=(784,num_hidden_nodes))
#this is the first bias layer and i think it's a 200 dimensional vector of the biases that go into each neuron before the sigmoid function.
b1 = np.zeros((1,num_hidden_nodes))
#again this are the weights for the 2nd layer that are multiplied by the activation output of the 1st layer
W2 = np.random.uniform(-1e-3,1e-3,size=(num_hidden_nodes,num_classes))
#these are the biases that are added to each neuron before the final softmax activation.
b2 = np.zeros((1,num_classes))
# multiply input with weights
Z1 = np.add(np.matmul(X,W1), b1)
def sigmoid(z):
return 1 / (1 + np.exp(- z))
def d_sigmoid(g):
return sigmoid(g) * (1. - sigmoid(g))
# activation function of Z1
X2 = sigmoid(Z1)
Z2 = np.add(np.matmul(X2,W2), b2)
# softmax
def softmax(z):
# subracting the max adds numerical stability
shiftx = z - np.max(z)
exps = np.exp(shiftx)
return exps / np.sum(exps)
def d_softmax(Y_hat, Y):
return Y_hat - Y
# the hypothesis,
Y_hat = softmax(Z2)
"""Initially the network guesses all categories equally. As we perform backprop the network will get better at discerning images and their categories."""
"""### Calculate Cost"""
cost = -1 * np.sum(Y * np.log(Y_hat))
#so i think the main thing here is like a nested chain rule thing, where we find the change in the cost with respec to each
# set of matrix weights and biases?
#here is probably the order of how we do things based on whats in math below...
'''
1. find the partial deriv of the cost function with respect to the output of the second layer, without the softmax it looks like for some reason?
2. find the partial deriv of the cost function with respect to the weights of the second layer, which is dope cause we can re-use the partial deriv from step 1
3. this one I know intuitively we're looking for the parial deriv of cost with respect to the bias term of the second layer, but how TF does that math translate into
numpy? is that the same y_hat - Y from the first step? where is there anyother Y_hat - y?
4. This is also confusing cause I know where to get the weights for layer 2 from and how to transpose them, but again, where is the Y_hat - Y?
5. Here we take the missing partial deriv from step 4 and multiply it by the d_sigmoid function of the first layer outputs before activations.
6. In this step we multiply the first layer weights (transposed) by the var from 5
7. And this is weird too, this just seems like the same step as number 5 repeated for some reason but with y-y_hat instead of y_hat-y
'''
#look at tutorials like this https://www.youtube.com/watch?v=7qYtIveJ6hU
#I think the most backprop layer steps are fine without biases but how do we find the bias derivatives
#maybe just the hypothesis matrix minus the actual y matrix?
dJ_dZ2 = Y_hat - Y
#find partial deriv of cost w respect to 2nd layer weights
dJ_dW2 = np.matmul(np.transpose(X2), dJ_dZ2)
#finding the partial deriv of cost with respect to the 2nd layer biases
#I'm still not 100% sure why this is here and why it works out to Y_hat - Y
dJ_db2 = Y_hat - Y
#finding the partial deriv of cost with respect to 2nd layer inputs
dJ_dX2 = np.matmul(dJ_db2, np.transpose(W2))
#finding the partial deriv of cost with respect to Activation of layer 1
dJ_dZ1 = dJ_dX2 * d_sigmoid(Z1)
#y-yhat matmul 2nd layer weights
#I added the transpose to the W2 var because the matrices were not compaible sizes without it
inner_mat = np.matmul(Y-Y_hat,np.transpose(W2))
dJ_dW1 = np.matmul(np.transpose(X),inner_mat) * d_sigmoid(Z1)
class NeuralNetwork:
    """Two-layer classifier: a sigmoid hidden layer followed by a softmax output.

    The weights and biases are class attributes, created once when the class
    body runs from the module-level ``num_hidden_nodes`` and ``num_classes``.
    """

    # learning rate (class-level value; backward() uses its own local rate)
    lr = 0.01

    # weights and biases, shared by every instance
    W1 = np.random.uniform(-1e-3, 1e-3, size=(784, num_hidden_nodes))
    b1 = np.zeros((1, num_hidden_nodes))
    W2 = np.random.uniform(-1e-3, 1e-3, size=(num_hidden_nodes, num_classes))
    b2 = np.zeros((1, num_classes))

    def __init__(self, num_hidden_nodes, num_classes, lr=0.01):
        '''
        Accept the layer sizes and learning rate without storing them.

        The constructor performs no initialisation; the parameters used by
        backward() are the class attributes defined above.
        '''

    def forward(self, X1):
        '''
        Forward pass through the network

        INPUT
            X1: input to network
                shape: (?, 784)

        RETURN
            Y_hat: prediction from output of network
                shape: (?, 10)
        '''
        # NOTE(review): the X1 argument is not read. X, W1, b1, W2 and b2 are
        # unqualified names, so they resolve at module level rather than to
        # the NeuralNetwork.* attributes that backward() updates.
        hidden_input = np.add(np.matmul(X, W1), b1)
        hidden_output = sigmoid(hidden_input)
        logits = np.add(np.matmul(hidden_output, W2), b2)
        # return the hypothesis
        return softmax(logits)

    def backward(self, Y_hat, Y):
        '''
        Backward pass through network. Update parameters

        INPUT
            Y_hat: Network predicted
                shape: (?, 10)

            Y: Correct target
                shape: (?, 10)

        RETURN
            cost: calculate J for errors
                type: (float)
        '''
        # NOTE(review): forward() stores nothing for this method, so X, X2 and
        # Z1 below are looked up at module level.
        output_error = Y_hat - Y
        grad_W2 = np.matmul(np.transpose(X2), output_error)
        grad_b2 = Y_hat - Y
        grad_X2 = np.matmul(grad_b2, np.transpose(NeuralNetwork.W2))
        grad_Z1 = grad_X2 * d_sigmoid(Z1)  # computed but not used below

        # The first-layer terms are built from Y - Y_hat, the opposite sign to
        # the second-layer terms above.
        back_projected = np.matmul(Y - Y_hat, np.transpose(NeuralNetwork.W2))
        grad_W1 = np.matmul(np.transpose(X), back_projected) * d_sigmoid(Z1)
        grad_b1 = np.matmul(Y - Y_hat, np.transpose(NeuralNetwork.W2)) * d_sigmoid(Z1)

        step = 0.1  # local rate; the class-level lr is not consulted

        # Every gradient was computed from the old weights before any update.
        NeuralNetwork.W2 = NeuralNetwork.W2 - step * grad_W2
        NeuralNetwork.b2 = NeuralNetwork.b2 - step * grad_b2
        NeuralNetwork.W1 = NeuralNetwork.W1 - step * grad_W1
        NeuralNetwork.b1 = NeuralNetwork.b1 - step * grad_b1

        # cross-entropy cost of the prediction that was passed in
        cost = -1 * np.sum(Y * np.log(Y_hat))
        return cost
nn = NeuralNetwork(200,10,lr=.01)
num_train = float(len(x_train))
num_test = float(len(x_test))
for epoch in range(10):
train_correct = 0; train_cost = 0
# training loop
for i in range(len(x_train)):
x = x_train[i]; y = y_train[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass through network
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was accurate
if pred_num == y:
train_correct += 1
# make a one hot categorical vector; same as keras.utils.to_categorical()
zeros = np.zeros(10); zeros[y] = 1
Y = zeros
# compute gradients and update weights
train_cost += nn.backward(Y_hat, Y)
test_correct = 0
# validation loop
for i in range(len(x_test)):
x = x_test[i]; y = y_test[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was correct
if pred_num == y:
test_correct += 1
# no backward pass here!
# compute average metrics for train and test
train_correct = round(100*(train_correct/num_train), 2)
test_correct = round(100*(test_correct/num_test ), 2)
train_cost = round( train_cost/num_train, 2)
# print status message every epoch
log_message = 'Epoch: {epoch}, Train Accuracy: {train_acc}%, Train Cost: {train_cost}, Test Accuracy: {test_acc}%'.format(
epoch=epoch,
train_acc=train_correct,
train_cost=train_cost,
test_acc=test_correct
)
print (log_message)
also, The project is in this colab & ipynb notebook
I believe this is pretty clear, in this part of your loop:
for epoch in range(10):
train_correct = 0; train_cost = 0
# training loop
for i in range(len(x_train)):
x = x_train[i]; y = y_train[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass through network
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was accurate
if pred_num == y:
train_correct += 1
# make a one hot categorical vector; same as keras.utils.to_categorical()
zeros = np.zeros(10); zeros[y] = 1
Y = zeros
# compute gradients and update weights
train_cost += nn.backward(Y_hat, Y)
test_correct = 0
# validation loop
for i in range(len(x_test)):
x = x_test[i]; y = y_test[i]
# standardizing input to range 0 to 1
X = x.reshape(1,784) /255.
# forward pass
Y_hat = nn.forward(X)
# get pred number
pred_num = np.argmax(Y_hat)
# check if prediction was correct
if pred_num == y:
test_correct += 1
# no backward pass here!
# compute average metrics for train and test
train_correct = round(100*(train_correct/num_train), 2)
test_correct = round(100*(test_correct/num_test ), 2)
train_cost = round( train_cost/num_train, 2)
# print status message every epoch
log_message = 'Epoch: {epoch}, Train Accuracy: {train_acc}%, Train Cost: {train_cost}, Test Accuracy: {test_acc}%'.format(
epoch=epoch,
train_acc=train_correct,
train_cost=train_cost,
test_acc=test_correct
)
print (log_message)
For every epoch of the 10 epochs in your loop, you are setting your train_correct and train_cost to 0, hence there is no updating after each epoch
I am trying to start learning ML.
I wrote a simple example:
import numpy as np
# Prepare the data
input = np.array(list(range(100)))
output = np.array([x**2 + 2 for x in list(range(100))])
# Visualize Data
import matplotlib.pyplot as plt
plt.plot(input, output, 'ro')
plt.show()
# Define your Model
a = 1
b = 1
# y = ax + b # we put a bias in the model based on our knowledge
# Train your model == Optimize the parameters so that they give very less loss
for e in range(10):
for x, y in zip(input, output):
y_hat = a*x + b
loss = 0.5*(y_hat-y)**2
# Now that we have loss, we want gradient of the parameters a and b
# derivative of loss wrt a = (-x)(y-ax+b)
# so gradient descent: a = a - (learning_rate)*(derivative wrt a)
a = a - 0.1*(-x)*(y_hat-y)
b = b - 0.1*(-1)*(y_hat-y)
print("Epoch {0} Training loss = {1}".format(e, loss))
# Make Prections on new data
test_input = np.array(list(range(101,150)))
test_output = np.array([x**2.0 + 2 for x in list(range(101,150))])
model_predictions = np.array([a*x + b for x in list(range(101,150))])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
plt.show()
Now when I run the code:
ml_zero.py:22: RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y_hat-y)**2
Epoch 0 Training loss = inf
ml_zero.py:21: RuntimeWarning: overflow encountered in double_scalars
y_hat = a*x + b
Epoch 1 Training loss = inf
ml_zero.py:21: RuntimeWarning: invalid value encountered in double_scalars
y_hat = a*x + b
Epoch 2 Training loss = nan
Epoch 3 Training loss = nan
Epoch 4 Training loss = nan
Epoch 5 Training loss = nan
Epoch 6 Training loss = nan
Epoch 7 Training loss = nan
Epoch 8 Training loss = nan
Epoch 9 Training loss = nan
Why is the error nan? I wrote the simplest model, but with python I was getting:
Traceback (most recent call last):
File "ml_zero.py", line 20, in <module>
loss = (y_hat-y)**2
OverflowError: (34, 'Result too large')
Then I converted all the Python lists to NumPy arrays. Now I get NaN instead, and I just don't understand why such small values are giving these errors.
With Daniele's answer to replace the loss with mean squared loss, i.e. dividing the loss by total number of inputs, I get this output:
Epoch 0 Training loss = 1.7942781420994678e+36
Epoch 1 Training loss = 9.232837400842652e+70
Epoch 2 Training loss = 4.751367833814119e+105
Epoch 3 Training loss = 2.4455835946216386e+140
Epoch 4 Training loss = 1.2585275201812707e+175
Epoch 5 Training loss = 6.4767849625200624e+209
Epoch 6 Training loss = 3.331617554363007e+244
Epoch 7 Training loss = 1.714758503849272e+279
ml_zero.py:22: RuntimeWarning: overflow encountered in double_scalars
loss = 0.5*(y-y_hat)**2
Epoch 8 Training loss = inf
Epoch 9 Training loss = inf
At least it runs, but I am trying to learn the linear function using Stochastic gradient descent, which updates parameters after each point's loss.
Still not getting how people work with these models, loss should decrease why is it increasing with gradient descent?
You got the math wrong. When you compute the gradient update for GD you have to divide by the number of samples in your dataset: that's why it is called mean squared error and not just squared error.
Also, you might want to use smaller inputs since you're trying to work with an exponential, as it tends to grow... well, exponentially with x.
Look at this post for a good intro to LR and GD.
I took the liberty of rewriting your code a bit, this should work:
import numpy as np
import matplotlib.pyplot as plt
# Prepare the data
input_ = np.linspace(0, 10, 100) # Don't assign user data to Python's input builtin
output = np.array([x**2 + 2 for x in input_])
# Define model
a = 1
b = 1
# Train model
N = input_.shape[0] # Number of samples
for e in range(10):
loss = 0.
for x, y in zip(input_, output):
y_hat = a * x + b
a = a - 0.1 * (2. / N) * (-x) * (y - y_hat)
b = b - 0.1 * (2. / N) * (-1) * (y - y_hat)
loss += 0.5 * ((y - y_hat) ** 2)
loss /= N
print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))
# Predict on test data
test_input = np.linspace(0, 15, 150) # Training data [0-10] + test data [10 - 15]
test_output = np.array([x**2.0 + 2 for x in test_input])
model_predictions = np.array([a*x + b for x in test_input])
plt.plot(test_input, test_output, 'ro')
plt.plot(test_input, model_predictions, '-')
plt.show()
This should give you as output something along these lines:
Epoch 0 Loss: 33.117127
Epoch 1 Loss: 42.949756
Epoch 2 Loss: 40.733332
Epoch 3 Loss: 38.657764
Epoch 4 Loss: 36.774646
Epoch 5 Loss: 35.067299
Epoch 6 Loss: 33.520409
Epoch 7 Loss: 32.119958
Epoch 8 Loss: 30.853112
Epoch 9 Loss: 29.708126
And this is the output plot:
Cheers
EDIT: OP was asking about SGD. The above answer is still valid code, but it's for standard GD (where you iterate on the whole dataset at the same time).
For SGD, the main loop must be slightly changed:
for e in range(10):
for x, y in zip(input_, output):
y_hat = a * x + b
loss = 0.5 * ((y - y_hat) ** 2)
a = a - 0.01 * (2.) * (-x) * (y - y_hat)
b = b - 0.01 * (2.) * (-1) * (y - y_hat)
print("Epoch {:2d}\tLoss: {:4f}".format(e, loss))
Note that I had to lower the learning rate to avoid divergence. When you train with a batch size of 1 it becomes really important to avoid this kind of gradient explosions, because a single sample may substantially mess up your descent towards the optimum.
Example output:
Epoch 0 Loss: 0.130379
Epoch 1 Loss: 0.123007
Epoch 2 Loss: 0.117352
Epoch 3 Loss: 0.112991
Epoch 4 Loss: 0.109615
Epoch 5 Loss: 0.106992
Epoch 6 Loss: 0.104948
Epoch 7 Loss: 0.103353
Epoch 8 Loss: 0.102105
Epoch 9 Loss: 0.101127