I am trying to implement multivariate linear regression(gradient descent and mse cost function) but the loss value keeps exponentially increasing for every iteration of gradient descent and I'm unable to figure out why?
from sklearn.datasets import load_boston
class LinearRegression:
def __init__(self):
self.X = None # The feature vectors [shape = (m, n)]
self.y = None # The regression outputs [shape = (m, 1)]
self.W = None # The parameter vector `W` [shape = (n, 1)]
self.bias = None # The bias value `b`
self.lr = None # Learning Rate `alpha`
self.m = None
self.n = None
self.epochs = None
def fit(self, X: np.ndarray, y: np.ndarray, epochs: int = 100, lr: float = 0.001):
self.X = X # shape (m, n)
self.m, self.n = X.shape
assert y.size == self.m and y.shape[0] == self.m
self.y = np.reshape(y, (-1, 1)) # shape (m, ) or (m, 1)
assert self.y.shape == (self.m, 1)
self.W = np.random.random((self.n, 1)) * 1e-3 # shape (n, 1)
self.bias = 0.0
self.epochs = epochs
self.lr = lr
self.minimize()
def minimize(self, verbose: bool = True):
for num_epoch in range(self.epochs):
predictions = np.dot(self.X, self.W)
assert predictions.shape == (self.m, 1)
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
self.W = self.W - self.lr * grad_w
assert self.W.shape == grad_w.shape
loss = (1 / 2 * self.m) * np.sum(np.square(predictions - self.y))
if verbose:
print(f'Epoch : {num_epoch+1}/{self.epochs} \t Loss : {loss.item()}')
linear_regression = LinearRegression()
x_train, y_train = load_boston(return_X_y=True)
linear_regression.fit(x_train, y_train, 10)
I'm using the boston housing dataset from sklearn.
PS. I'd like to know what's causing this issue and how to fix it and whether or not my implementation is correct.
Thanks
The error is in the gradient. A divergence like that for an iterative shrinkage-thresholding algorithms (ISTA) solver is not something you should see.
For your gradient computation: X is of shape (m,n) and W of shape(n,1) so (prediction - y) is of shape (m,1) then you multiply by X on the left? (m,1) by (m,n)? Not sure what numpy is computing but it is not what you want to compute:
grad_w = (1/self.m) * np.sum((predictions-self.y) * self.X, axis=0)[:, np.newaxis]
here the code should be a bit different to have a (n,m) multiply by a (m,1) in order to get a (n,1), same shape as W.
(1/self.m) * np.sum(self.X.T*(predictions-self.y) , axis=0)[:, np.newaxis]
For the derivation to be correct.
I am also not sure of why you use the dot (which is a good idea) for the prediction but not for the gradient.
You Also do not need so many reshapes:
from sklearn.datasets import load_boston
A,b = load_boston(return_X_y=True)
n_samples = A.shape[0]
n_features = A.shape[1]
def grad_linreg(x):
"""Least-squares gradient"""
grad = (1. / n_samples) * np.dot(A.T, np.dot(A, x) - b)
return grad
def loss_linreg(x):
"""Least-squares loss"""
f = (1. / (2. * n_samples)) * sum((b - np.dot(A, x)) ** 2)
return f
And then you check that your gradient is good:
from scipy.optimize import check_grad
from numpy.random import randn
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
check_grad(loss_linreg,grad_linreg,randn(n_features))
You can then build the Model on that.
If you want to test that with ISTA/FISTA and Logistic/Linear Regression and LASSO/RIDGE, here is a jupyter notebook with the theory and a working example
Related
I want to implement gradient descent with numpy for linear regression but I have some error in this code:
import numpy as np
# Code Example
rng = np.random.RandomState(10)
X = 10*rng.rand(1000, 5) # feature matrix
y = 0.9 + np.dot(X, [2.2, 4, -4, 1, 2]) # target vector
# GD implementation for linear regression
def GD(X, y, eta=0.1, n_iter=20):
theta = np.zeros((X.shape[0], X.shape[1]))
for i in range(n_iter):
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
theta = theta - eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.1, n_iter=20):
theta = np.zeros(1, X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = 2 * np.mean((np.dot(theta.T, X[j,:]) - y[j]) * X[j,:])
theta = theta - eta * grad
return theta
# MSE loss for linear regression with numpy
def MSE(X, y, theta):
return np.mean((X.dot(theta.T) - y)**2)
# linear regression with GD and MSE with numpy
theta_gd = GD(X, y)
theta_sgd = SGD(X, y)
print('MSE with GD: ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
The error is
grad = 2 * np.mean((np.dot(theta.T, X) - y) * X)
ValueError: operands could not be broadcast together with shapes (5,5) (1000,)
and I can't solve it.
Minor changes in your code that resolve dimensionality issues during matrix multiplication make the code run successfully. In particular, note that a linear regression on a design matrix X of dimension Nxk has a parameter vector theta of size k.
In addition, I'd suggest some changes in SGD() that make it a proper stochastic gradient descent. Namely, evaluating the gradient over random subsets of the data realized as realized by randomly partitioning the index set of the train data with np.random.shuffle() and looping through it. The batch_size determines the size of each subset after which the parameter estimate is updated. The argument seed ensures reproducibility.
# GD implementation for linear regression
def GD(X, y, eta=0.001, n_iter=100):
theta = np.zeros(X.shape[1])
for i in range(n_iter):
for j in range(X.shape[0]):
grad = (2 * np.mean(X[j,:] # theta - y[j]) * X[j,:]) # changed line
theta -= eta * grad
return theta
# SGD implementation for linear regression
def SGD(X, y, eta=0.001, n_iter=1000, batch_size=25, seed=7678):
theta = np.zeros(X.shape[1])
indexSet = list(range(len(X)))
np.random.seed(seed)
for i in range(n_iter):
np.random.shuffle(indexSet) # random shuffle of index set
for j in range(round(len(X) / batch_size)+1):
X_sub = X[indexSet[j*batch_size:(j+1)*batch_size],:]
y_sub = y[indexSet[j*batch_size:(j+1)*batch_size]]
if(len(X_sub) > 0):
grad = (2 * np.mean(X_sub # theta - y_sub) * X_sub) # changed line
theta -= eta * np.mean(grad, axis=0)
return theta
Running the code, I get
print('MSE with GD : ', MSE(X, y, theta_gd))
print('MSE with SGD: ', MSE(X, y, theta_sgd))
> MSE with GD : 0.07602
MSE with SGD: 0.05762
Each observation has 5 features, and X contains 1000 observations:
X = rng.rand(1000, 5) * 10 # X.shape == (1000, 5)
Create y which is perfectly linearly correlated with X (with no distortions):
real_weights = np.array([2.2, 4, -4, 1, 2]).reshape(-1, 1)
real_bias = 0.9
y = X # real_weights + real_bias # y.shape == (1000, 1)
G.D. implementation for linear regression:
Note:
w (weights) is your theta variable.
I have also added the calculation of b (bias).
def GD(X, y, eta=0.1, n_iter=20):
# Initialize weights and a bias (all zeros):
w = np.zeros((X.shape[1], 1)) # w.shape == (5, 1)
b = 0
# Gradient descent
for i in range(n_iter):
errors = X # w + b - y # errors.shape == (1000, 1)
dw = 2 * np.mean(errors * X, axis=0).reshape(5, 1)
db = 2 * np.mean(errors)
w -= eta * dw
b -= eta * db
return w, b
Testing:
w, b = GD(X, y, eta=0.003, n_iter=5000)
print(w, b)
[[ 2.20464905]
[ 4.00510139]
[-3.99569374]
[ 1.00444026]
[ 2.00407476]] 0.7805448262466914
Notes:
Your function SGD also contains some error..
I'm using the # operator because it's just my preference over np.dot.
I have a class Linear Regression and want to check how does it work with dataset load_boston. I calculated the Mean absolute percentage error (MAPE) and the result is nan.
import numpy as np
import warnings
from sklearn.base import BaseEstimator
from sklearn.datasets import load_boston
from sklearn.model_selection import train_test_split
import pandas as pd
warnings.filterwarnings('ignore')
class LinearRegressionSGD(BaseEstimator):
def __init__(self, epsilon=1e-4, max_steps=1000, w0=None, alpha=1e-2):
'''
epsilon: difference for the rate of change of weights
max_steps: maximum number of steps in gradient descent
w0: np.array (d,) - initial weights
alpha: learning step
'''
self.epsilon = epsilon
self.max_steps = max_steps
self.w0 = w0
self.alpha = alpha
self.w = None
self.w_history = []
def fit(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: self
"""
l, d = X.shape
if self.w0 is None:
self.w0 = np.zeros(d)
self.w = self.w0
for step in range(self.max_steps):
self.w_history.append(self.w)
w_new = self.w - self.alpha * self.calc_gradient(X, y)
if (np.linalg.norm(w_new - self.w) < self.epsilon):
break
self.w = w_new
return self
def predict(self, X):
"""
X: np.array (l, d)
---
output: np.array (l)
"""
if self.w is None:
raise Exception('Not trained yet')
l, d = X.shape
y_pred = []
for i in range(l):
y_pred.append(np.dot(X[i], self.w))
return np.array(y_pred)
def calc_gradient(self, X, y):
"""
X: np.array (l, d)
y: np.array (l)
---
output: np.array (d)
"""
l, d = X.shape
gradient = []
for j in range(d):
dQ = 0
for i in range(l):
dQ += (2 / l) * X[i][j] * (np.dot(X[i], self.w) - y[i])
gradient.append(dQ)
return np.array(gradient)
data = load_boston()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
X_train, X_test, y_train, y_test = train_test_split(np.array(X), y, test_size=0.3, random_state=10)
def MAPE(y_true, y_pred):
"""
y_true: np.array (l)
y_pred: np.array (l)
---
output: float [0, +inf)
"""
y_true, y_pred = np.array(y_true), np.array(y_pred)
return np.mean(np.abs((y_true - y_pred) / y_true)) * 100
# Task 2
sgd = LinearRegressionSGD()
sgd.fit(X_train, y_train)
y_pred_sgd = sgd.predict(X_test)
print(MAPE(y_test, y_pred_sgd))
# Task 3
a, b = X_test.shape
w_0 = np.random.uniform(-2, 2, (b))
lr = LinearRegressionSGD(w0=w_0)
lr.fit(X_train, y_train)
y_pred_lr = lr.predict(X_test)
print(MAPE(y_test, y_pred_lr))
But when I create X, y like below, the code works properly and MAPE gives float value
n_features = 2
n_objects = 300
num_steps = 100
np.random.seed(1)
w_true = np.random.normal(0, 0.1, size=(n_features, ))
w_0 = np.random.uniform(-2, 2, (n_features))
X = np.random.uniform(-5, 5, (n_objects, n_features))
y = np.dot(X, w_true) + np.random.normal(0, 1, (n_objects))
What is the problem with my code? and how to fix it to get the float value?
(Sorry for my bad English, its not my native language)
I tried to implement multivariate linear regression from scratch and it works pretty well actually. When I was testing it with a toy dataset, I run into sometimes the predictions were 'NaN'. I know what are the possible NaN reasons though, I couldn't understand which one causes it in my script.
Note: with 0.0001 learning rate and 1.000.000 iterations, I got a really good line for the dataset though, when learning rate is 0.001 and the number of iterations is 1.000.000, the predictions were NaN.
Here is the script:
import pandas as pd
import matplotlib.pyplot as plt
import sys
import numpy as np
class MultivariateLinearRegression():
#constructor
def __init__(self, learning_rate, learning_algorithm, epoch_num):
self.learning_rate = learning_rate
self.learning_algoritm = learning_algorithm
self.epoch_num = epoch_num
self.theta = 0
self.training_sample = 0
def train(self, X, Y):
Y = Y.reshape((Y.size, 1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
bias = np.ones([X.shape[0], 1])
X = np.concatenate((X, bias), 1)
self.theta = np.zeros([X.shape[1], 1])
self.training_sample = X.shape[0]
cost_history = []
for i in range (self.epoch_num):
hypothesis = X.dot(self.theta)
cost_func = np.transpose(X).dot(np.subtract(hypothesis, Y))
gradient = (self.learning_rate / self.training_sample) * cost_func
self.theta = np.subtract(self.theta, gradient)
cost_history.append(self.theta)
return cost_history
def predict(self, X):
X = np.array(X)
bias = np.ones([1]).reshape((1,1))
if len(X.shape) == 1:
X = X.reshape((X.size, 1))
X = np.concatenate((X, bias))
return np.transpose(X).dot(self.theta)[0] # [63,1]
datas = pd.read_csv('pattern_recognition_data.txt').to_numpy()
X = datas[0:25,0]
Y = datas[0:25:,1]
X_test = datas[25:29, 0]
Y_test = datas[25:29, 1]
mlr = MultivariateLinearRegression(0.001, 'gradient descent', 1000000) # 0.0001 ve 1.000.000
mlr.train(X, Y)
Y_pred = []
for x in X_test:
print('X : ', x)
Y_pred.append(mlr.predict([x]))
plt.plot(X, Y, 'bs')
plt.plot(X_test, Y_pred, 'r')
Thanks in advance
The dataset:
39,144
47,220
45,138
47,145
65,162
46,142
67,170
42,124
67,158
56,154
64,162
56,150
59,140
34,110
42,128
48,130
45,135
17,114
20,116
19,124
36,136
50,142
39,120
21,120
44,160
53,158
63,144
29,130
25,125
69,175
I've got a toy example set up of a linear regression model with one input variable and one output variable. The problem that I'm encountering is that the output for the bias is far off from the generated data. If I manually set the bias then it will produce a weight and bias which is close enough to the original.
I've written two pieces of code gen_data which generates data and GradientDescent which performs that gradient descent algorithm to find the weight and bias.
def gen_data(num_points=50, slope=1, bias=10, x_max=50):
f = lambda z: slope * z + bias
x = np.zeros(shape=(num_points, 1))
y = np.zeros(shape=(num_points, 1))
for i in range(num_points):
x_temp = np.random.uniform()*x_max
x[i] = x_temp
y[i] = f(x_temp) + np.random.normal(scale=3.0)
return (x, y)
# \mathbb{R}^1 with no regularization
def gradientDescent2(x, y, learning_rate=0.0001, epochs=100):
theta = np.random.rand()
bias = np.random.rand()
for i in range(0, epochs):
loss = (theta * x + bias) - y
cost = np.mean(loss**2) / 2
# print('Iteration {} | Cost: {}'.format(i, cost))
grad_b = np.mean(loss)
grad_t = np.mean(loss*x)
# updates
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
return (theta, bias)
If you want to use batch update, don't set your batch_size equals to your simple size. (I also believe that batch_update is not very suitable for this case.)
2.Your gradient calculation and parameter update are incorrect, the gradient should be:
grad_b = 1
grad_t = x
For the parameter update, you should always trying to minimize the loss, so it should be
if loss>0:
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
elif loss< 0:
bias += learning_rate * grad_b
theta += learning_rate * grad_t
After all, below is the modified code works well.
import numpy as np
import sys
def gen_data(num_points=500, slope=1, bias=10, x_max=50):
f = lambda z: slope * z + bias
x = np.zeros(shape=(num_points))
y = np.zeros(shape=(num_points))
for i in range(num_points):
x_temp = np.random.uniform()*x_max
x[i] = x_temp
y[i] = f(x_temp) #+ np.random.normal(scale=3.0)
#print('x:',x[i],' y:',y[i])
return (x, y)
def gradientDescent2(x, y, learning_rate=0.001, epochs=100):
theta = np.random.rand()
bias = np.random.rand()
for i in range(0, epochs):
for j in range(len(x)):
loss = (theta * x[j] + bias) - y[j]
cost = np.mean(loss**2) / 2
# print('Iteration {} | Cost: {}'.format(i, cost))
grad_b = 1
grad_t = x[j]
if loss>0:
bias -= learning_rate * grad_b
theta -= learning_rate * grad_t
elif loss< 0:
bias += learning_rate * grad_b
theta += learning_rate * grad_t
return (theta, bias)
def main():
x,y =gen_data()
ta,bias = gradientDescent2(x,y)
print('theta:',ta)
print('bias:',bias)
if __name__ == '__main__':
sys.exit(int(main() or 0))
Trying to use Backpropagation Neural Network for multiclass classification. I have found this code and try to adapt it. It is based on the lections of Machine Learning in Coursera from Andrew Ng.
I don't understand exactly the implementation of scipy.optimize.minimize function here. It is used just once in the code. Is it iteratively updating the weights of the network? How can I visualize (plot) it's performance to see when it converges?
Using this function what parameters I can adjust to achieve better performance? I found here a list common parameters:
Number of neurons in the hidden layer: this is hidden_layer_size=25 in my code
Learning rate: can I still adjust that using built-in minimization function?
Momentum: is that reg_lambda=0 in my case? Regularization parameter to avoid overfitting, right?
Epoch: maxiter=500
Here is my training data (target class is in the last column):
65535, 3670, 65535, 3885, -0.73, 1
65535, 3962, 65535, 3556, -0.72, 1
65535, 3573, 65535, 3529, -0.61, 1
3758, 3123, 4117, 3173, -0.21, 0
3906, 3119, 4288, 3135, -0.28, 0
3750, 3073, 4080, 3212, -0.26, 0
65535, 3458, 65535, 3330, -0.85, 2
65535, 3315, 65535, 3306, -0.87, 2
65535, 3950, 65535, 3613, -0.84, 2
65535, 32576, 65535, 19613, -0.35, 3
65535, 16657, 65535, 16618, -0.37, 3
65535, 16657, 65535, 16618, -0.32, 3
The dependencies are so obvious, I think it should be so easy to classify it...
But results are terrible. I get accuracy of 0.6 to 0.8. This is absolutely inappropriate for my application. I know I need more data normally, but I would be already happy when I could at least fit the training data (without taking into account potential overfitting)
Here is the code:
import numpy as np
from scipy import optimize
from sklearn import cross_validation
from sklearn.metrics import accuracy_score
import math
class NN_1HL(object):
def __init__(self, reg_lambda=0, epsilon_init=0.12, hidden_layer_size=25, opti_method='TNC', maxiter=500):
self.reg_lambda = reg_lambda
self.epsilon_init = epsilon_init
self.hidden_layer_size = hidden_layer_size
self.activation_func = self.sigmoid
self.activation_func_prime = self.sigmoid_prime
self.method = opti_method
self.maxiter = maxiter
def sigmoid(self, z):
return 1 / (1 + np.exp(-z))
def sigmoid_prime(self, z):
sig = self.sigmoid(z)
return sig * (1 - sig)
def sumsqr(self, a):
return np.sum(a ** 2)
def rand_init(self, l_in, l_out):
self.epsilon_init = (math.sqrt(6))/(math.sqrt(l_in + l_out))
return np.random.rand(l_out, l_in + 1) * 2 * self.epsilon_init - self.epsilon_init
def pack_thetas(self, t1, t2):
return np.concatenate((t1.reshape(-1), t2.reshape(-1)))
def unpack_thetas(self, thetas, input_layer_size, hidden_layer_size, num_labels):
t1_start = 0
t1_end = hidden_layer_size * (input_layer_size + 1)
t1 = thetas[t1_start:t1_end].reshape((hidden_layer_size, input_layer_size + 1))
t2 = thetas[t1_end:].reshape((num_labels, hidden_layer_size + 1))
return t1, t2
def _forward(self, X, t1, t2):
m = X.shape[0]
ones = None
if len(X.shape) == 1:
ones = np.array(1).reshape(1,)
else:
ones = np.ones(m).reshape(m,1)
# Input layer
a1 = np.hstack((ones, X))
# Hidden Layer
z2 = np.dot(t1, a1.T)
a2 = self.activation_func(z2)
a2 = np.hstack((ones, a2.T))
# Output layer
z3 = np.dot(t2, a2.T)
a3 = self.activation_func(z3)
return a1, z2, a2, z3, a3
def function(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
m = X.shape[0]
Y = np.eye(num_labels)[y]
_, _, _, _, h = self._forward(X, t1, t2)
costPositive = -Y * np.log(h).T
costNegative = (1 - Y) * np.log(1 - h).T
cost = costPositive - costNegative
J = np.sum(cost) / m
if reg_lambda != 0:
t1f = t1[:, 1:]
t2f = t2[:, 1:]
reg = (self.reg_lambda / (2 * m)) * (self.sumsqr(t1f) + self.sumsqr(t2f))
J = J + reg
return J
def function_prime(self, thetas, input_layer_size, hidden_layer_size, num_labels, X, y, reg_lambda):
t1, t2 = self.unpack_thetas(thetas, input_layer_size, hidden_layer_size, num_labels)
m = X.shape[0]
t1f = t1[:, 1:]
t2f = t2[:, 1:]
Y = np.eye(num_labels)[y]
Delta1, Delta2 = 0, 0
for i, row in enumerate(X):
a1, z2, a2, z3, a3 = self._forward(row, t1, t2)
# Backprop
d3 = a3 - Y[i, :].T
d2 = np.dot(t2f.T, d3) * self.activation_func_prime(z2)
Delta2 += np.dot(d3[np.newaxis].T, a2[np.newaxis])
Delta1 += np.dot(d2[np.newaxis].T, a1[np.newaxis])
Theta1_grad = (1 / m) * Delta1
Theta2_grad = (1 / m) * Delta2
if reg_lambda != 0:
Theta1_grad[:, 1:] = Theta1_grad[:, 1:] + (reg_lambda / m) * t1f
Theta2_grad[:, 1:] = Theta2_grad[:, 1:] + (reg_lambda / m) * t2f
return self.pack_thetas(Theta1_grad, Theta2_grad)
def fit(self, X, y):
num_features = X.shape[0]
input_layer_size = X.shape[1]
num_labels = len(set(y))
theta1_0 = self.rand_init(input_layer_size, self.hidden_layer_size)
theta2_0 = self.rand_init(self.hidden_layer_size, num_labels)
thetas0 = self.pack_thetas(theta1_0, theta2_0)
options = {'maxiter': self.maxiter}
_res = optimize.minimize(self.function, thetas0, jac=self.function_prime, method=self.method,
args=(input_layer_size, self.hidden_layer_size, num_labels, X, y, 0), options=options)
self.t1, self.t2 = self.unpack_thetas(_res.x, input_layer_size, self.hidden_layer_size, num_labels)
np.savetxt("weights_t1.txt", self.t1, newline="\n")
np.savetxt("weights_t2.txt", self.t2, newline="\n")
def predict(self, X):
return self.predict_proba(X).argmax(0)
def predict_proba(self, X):
_, _, _, _, h = self._forward(X, self.t1, self.t2)
return h
##################
# IR data #
##################
values = np.loadtxt('infrared_data.txt', delimiter=', ', usecols=[0,1,2,3,4])
targets = np.loadtxt('infrared_data.txt', delimiter=', ', dtype=(int), usecols=[5])
X_train, X_test, y_train, y_test = cross_validation.train_test_split(values, targets, test_size=0.4)
nn = NN_1HL()
nn.fit(values, targets)
print("Accuracy of classification: "+str(accuracy_score(y_test, nn.predict(X_test))))
In the given code scipy.optimize.minimize iteratively minimizes function given it's derivative (Jacobi's matrix). According to the documentation, use can specify callback argument to a function that will be called after each iteration — this will let you measure performance, though I'm not sure if it'll let you halt the optimization process.
All parameters you listed are hyperparameters, it's hard to optimize them directly:
Number of neurons in the hidden layer is a discrete valued parameters, and, thus, is not optimizable via gradient techniques. Moreover, it affects NeuralNet architecture, so you can't optimize it while training the net. What you can do, though, is to use some higher-level routine to search for possible options, like exhaustive grid search with cross-validation (for example look at GridSearchCV) or other tools for hyperparameter search (hyperopt, spearmint, MOE, etc).
Learning rate does not seem to be customizable for most of the optimization methods available. But, actually, learning rate in gradient descent is just a Newton's method with Hessian "approximated" by 1 / eta I — diagonal matrix with inverted learning rates on the major diagonal. So you can try hessian-based methods with this heuristic.
Momentum is completely unrelated to regularization. It's an optimization technique, and, since you use scipy for optimization, is unavailable for you.