I am trying to make this estimator scikit-learn-compatible so that I can search the parameter space with GridSearchCV.
EDIT:
I have modified the script as suggested (see below).
The fit signature is modified to fit(self, X, y)
All parameters are passed in __init__
There is still a compatibility issue with GridSearchCV, possibly because the estimator is a multilabel classifier.
ValueError: Can't handle mix of multilabel-indicator and continuous-multioutput
But that is beside the point; the attribute error is now gone, so we can safely conclude that the suggested modifications made the estimator scikit-learn-compatible.
Final code script:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin
class LogisticClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, basis=None, itrs=100, learn_rate=0.1, reg=0.1, momentum=0.5, proj_layer_size=10):
        # store the constructor arguments as attributes so get_params can return them
        self.basis = basis
        self.W = []
        self.A = None
        if basis == 'rectifier':
            self.basisfunc = self.rectifier_basis
        else:
            self.basisfunc = self.identity
        self.itrs = itrs
        self.learn_rate = learn_rate
        self.reg = reg
        self.momentum = momentum
        self.proj_layer_size = proj_layer_size
    def identity(self, x):
        return np.hstack((x, 1))

    def rectifier_basis(self, x):
        xn = np.dot(self.A, x)
        return self.identity(np.maximum(xn, 0))

    def basismap(self, X):
        new_dimensions = self.basisfunc(X[0, :]).shape[0]
        Xn = np.zeros((X.shape[0], new_dimensions))
        for i, xi in enumerate(X):
            Xn[i, :] = self.basisfunc(xi)
        return Xn
    def fit(self, X, Y):
        self.A = np.random.uniform(-1, 1, (self.proj_layer_size, X.shape[1]))
        Xn = self.basismap(X)
        self.W = np.array(np.random.uniform(-0.1, 0.1, (Y.shape[1], Xn.shape[1])))
        costs_train, costs_test = [], []
        previous_grad = np.zeros(self.W.shape)
        for i in range(self.itrs):
            grad = self.grad(Xn, Y)
            self.W = self.W - self.learn_rate * (grad + self.momentum * previous_grad)
            previous_grad = grad
            costs_train.append(self.loss(X, Y))
            #costs_test.append(self.loss(Xtest, Ytest))
        #return (costs_train, costs_test)
        return costs_train
    def softmax(self, Z):
        Z = np.maximum(Z, -1e3)
        Z = np.minimum(Z, 1e3)
        numerator = np.exp(Z)
        return numerator / np.sum(numerator, axis=1).reshape((-1, 1))

    def predict(self, X):
        Xn = self.basismap(X)
        return self.softmax(np.dot(Xn, self.W.T))

    def grad(self, Xn, Y):
        Yh = self.softmax(np.dot(Xn, self.W.T))
        return -np.dot(Y.T - Yh.T, Xn) / Xn.shape[0] + self.reg * self.W

    def loss(self, X, Y):
        Yh = self.predict(X)
        return -np.mean(np.mean(Y * np.log(Yh))) - self.reg * np.trace(np.dot(self.W, self.W.T)) / self.W.shape[0]
    def get_params(self, deep=True):
        # only return parameters that are actually set in __init__
        return {"basis": self.basis, "itrs": self.itrs, "learn_rate": self.learn_rate,
                "reg": self.reg, "momentum": self.momentum, "proj_layer_size": self.proj_layer_size}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)
        return self
#make data
X, Y = make_classification(n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3,
n_clusters_per_class=1, random_state=31)
lb = LabelBinarizer()
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25, random_state=5)
#model optimization
param_grid = {'learn_rate': [0.1, 0.01, 0.001],
'reg': [0.001, 0.01]
}
clf = LogisticClassifier(basis='rectifier')
gs_cv = GridSearchCV(clf, param_grid, scoring='accuracy').fit(Xtrain, Ytrain)
print('Best hyperparameters: %r' % gs_cv.best_params_)
In the get_params method you call self.itrs, but your object doesn't have such an attribute.
I also suggest that you change the fit signature to something like fit(self, X, y), pass all the parameters in __init__, and split X and y into train and test sets using sklearn.cross_validation.train_test_split.
That would make your code more sklearn-like and more compatible with library functions.
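As for the remaining ValueError (Can't handle mix of multilabel-indicator and continuous-multioutput): with scoring='accuracy', GridSearchCV compares the binarized Y (a label-indicator matrix) against the output of predict, which here returns softmax probabilities, i.e. continuous values. A minimal sketch of one way around it, assuming you keep the binarized targets, is to move the probabilities into a predict_proba method and have predict return a hard 0/1 label-indicator matrix (these would replace the current predict inside the class):
# Sketch only: hard labels for scoring, probabilities kept separately.
def predict_proba(self, X):
    Xn = self.basismap(X)
    return self.softmax(np.dot(Xn, self.W.T))

def predict(self, X):
    probs = self.predict_proba(X)
    labels = np.zeros_like(probs)
    labels[np.arange(probs.shape[0]), np.argmax(probs, axis=1)] = 1  # one-hot argmax
    return labels
Since loss() takes a log of its predictions, it would then call predict_proba instead of predict.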
I am working on an implementation of MAML (see https://arxiv.org/pdf/1703.03400.pdf) in JAX.
When training on a distribution of simple linear regression tasks it seems to perform fine (takes a while to converge but ultimately works).
However, when training on tasks distributed like A * sin(B + X), where A and B are random variables, all the weights in the network converge to 0.
This is clearly not right.
Thanks in advance for any help provided.
Full code here https://colab.research.google.com/drive/1YoOkwo5tI42LeIbBOxpImkN55Kg9wScl?usp=sharing or see below for minimal code.
Task Generation code:
# imports assumed from the linked notebook
from functools import partial
from typing import Sequence

import jax.numpy as jnp
import optax
from flax import linen as nn
from jax import grad, jit, lax, random, value_and_grad, vmap
from jax.tree_util import tree_map
from tqdm import tqdm

class MAMLDataLoader:
    def __init__(self, sample_task_fn, num_tasks, batch_size):
        self.sample_task_fn = sample_task_fn
        self.num_tasks = num_tasks
        self.batch_size = batch_size

    def sample_tasks(self, key):
        XS = jnp.empty((self.num_tasks, 2 * self.batch_size, 1))
        YS = jnp.empty((self.num_tasks, 2 * self.batch_size, 1))
        for i in range(self.num_tasks):
            key, subkey = random.split(key)
            xs, ys = self.sample_task_fn(self.batch_size * 2, subkey)
            XS = XS.at[i].set(xs)
            YS = YS.at[i].set(ys)
        x_train, x_test = XS[:, :self.batch_size], XS[:, self.batch_size:]
        y_train, y_test = YS[:, :self.batch_size], YS[:, self.batch_size:]
        return x_train, y_train, x_test, y_test

    def dummy_input(self):
        key = random.PRNGKey(0)
        x = self.sample_task_fn(1, key)[0][0]
        return x

def sample_sinusoidal_task(samples, key):
    # y = a * sin(b + x)
    xs_key, amplitude_key, phase_key = random.split(key, num=3)
    amplitude = random.uniform(amplitude_key, (1, 1))
    phase = random.uniform(phase_key, (1, 1)) * jnp.pi * 2
    xs = (random.uniform(xs_key, (samples, 1)) * 4 - 2) * jnp.pi
    ys = amplitude * jnp.sin(xs + phase)
    return xs, ys
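For what it's worth, a quick shape check of what sample_tasks returns with the settings used further down (num_tasks=2, batch_size=100); this is just a sketch to confirm the meta-batch layout:
key = random.PRNGKey(0)
loader = MAMLDataLoader(sample_sinusoidal_task, num_tasks=2, batch_size=100)
x_tr, y_tr, x_te, y_te = loader.sample_tasks(key)
print(x_tr.shape, y_tr.shape)  # (2, 100, 1) (2, 100, 1)
print(x_te.shape, y_te.shape)  # (2, 100, 1) (2, 100, 1)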
Here is the main MAML code:
class MAMLTrainer:
    def __init__(self, model, alpha, optimiser, inner_steps=1):
        self.model = model
        self.alpha = alpha
        self.optimiser = optimiser
        self.inner_steps = inner_steps
        self.jit_step = jit(self.step)

    def loss(self, params, x, y):
        preds = self.model.apply(params, x)
        return jnp.mean(jnp.inner(y - preds, y - preds) / 2.0)

    def update(self, params, x, y, inner_steps=None):
        if inner_steps is None:
            inner_steps = self.inner_steps
        loss_grad = grad(self.loss)

        def _update(i, params):
            grads = loss_grad(params, x, y)
            new_params = tree_map(lambda p, g: p - self.alpha * g, params, grads)
            return new_params

        return lax.fori_loop(0, inner_steps, _update, params)

    def meta_loss(self, params, x1, y1, x2, y2):
        return self.loss(self.update(params, x1, x2), x2, y2)

    def batch_meta_loss(self, params, x1, y1, x2, y2):
        return jnp.mean(vmap(partial(self.meta_loss, params))(x1, y1, x2, y2))

    def step(self, params, optimiser, x1, y1, x2, y2):
        loss, grads = value_and_grad(self.batch_meta_loss)(params, x1, y1, x2, y2)
        updates, opt_state = self.optimiser.update(grads, optimiser, params)
        params = optax.apply_updates(params, updates)
        return params, loss

    def train(self, dataloader, steps, key, params=None):
        if params is None:
            key, subkey = random.split(key)
            params = self.model.init(subkey, dataloader.dummy_input())
        optimiser = self.optimiser.init(params)
        pbar, losses = tqdm(range(steps), desc='Training'), []
        for epoch in pbar:
            key, subkey = random.split(key)
            params, loss = self.jit_step(params, optimiser, *dataloader.sample_tasks(subkey))
            losses.append(loss)
            if epoch % 100 == 0:
                avg_loss = jnp.mean(jnp.array(losses[-100:]))
                pbar.set_postfix_str(f'current_loss: {loss:.3f}, running_loss_100_epochs: {avg_loss:.3f}')
        return params, jnp.array(losses)

    def n_shot_learn(self, x_train, y_train, params, n):
        return self.update(params, x_train, y_train, n)
Training Code:
class SimpleMLP(nn.Module):
    features: Sequence[int]

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        for i, feat in enumerate(self.features[:-1]):
            x = nn.Dense(feat)(x)
            x = nn.relu(x)
        return nn.Dense(self.features[-1])(x)
model = SimpleMLP([64, 64, 1])
optimiser = optax.adam(1e-3)
trainer = MAMLTrainer(model, 0.1, optimiser, 1)
dataloader = MAMLDataLoader(sample_sinusoidal_task, 2, 100)
key = random.PRNGKey(0)
params, losses = trainer.train(dataloader, 10000, key)
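One hedged suggestion for narrowing this down (a sketch, not a fix): adapt the trained params to a single freshly sampled task with n_shot_learn and compare the loss before and after the inner update; if adaptation does not reduce the loss, the inner loop (or the arguments it receives) is the first place to look.
# Sketch: sanity-check the inner-loop adaptation on one sampled task.
check_key = random.PRNGKey(1)
xs, ys = sample_sinusoidal_task(20, check_key)
adapted = trainer.n_shot_learn(xs, ys, params, 5)
print('loss before adaptation:', trainer.loss(params, xs, ys))
print('loss after adaptation: ', trainer.loss(adapted, xs, ys))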
Hi, I've been working on a neural network to tackle the MNIST dataset, but when I run the code the accuracy begins to increase and then eventually settles at 0.098. I also encounter an overflow error in exp when calculating the softmax values. I have tried to debug my code but I don't understand where I'm going wrong. If anyone can point me in the right direction that would be great, and if you can't find an error, could you give me any tips on techniques to debug this? Thanks in advance.
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10, 784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10, 10)

    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in, 1))
        return weights, biases

    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b2
        y_pred = self.Softmax(self.z2)
        return y_pred

    def ReLu(self, Z):
        return np.maximum(Z, 0)

    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))

    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)

    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot

    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)

    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size

    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * e2
        #derB2 = derB2.reshape(10,1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * e1
        #derB1 = derB1.reshape(10,1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*np.sum(derB1, axis=1, keepdims=True)
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*np.sum(derB2, axis=1, keepdims=True)

    def train(self, X, Y, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2
NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
I found the following errors:
Your softmax implementation doesn't work because of the severe numeric errors you get when exponentiating potentially large numbers just to obtain something between 0 and 1. Besides, you forgot to specify the summation axis in the denominator. Here is a working implementation:
def Softmax(self, Z):
    e = np.exp(Z - Z.max(axis=0, keepdims=True))
    return e/e.sum(axis=0, keepdims=True)
(Here and below I skip coding-style remarks that are not essential in this context, such as whether this should be a class method or a stand-alone function.)
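To illustrate the point (a small sketch, not part of the original answer): subtracting the per-column maximum leaves the softmax unchanged but keeps every exponent non-positive, so np.exp can no longer overflow:
import numpy as np

Z = np.array([[1000.0, 3.0],
              [1001.0, 4.0]])

naive = np.exp(Z) / np.exp(Z).sum(axis=0, keepdims=True)  # overflow warning, nan in column 0
e = np.exp(Z - Z.max(axis=0, keepdims=True))
stable = e / e.sum(axis=0, keepdims=True)                 # valid probabilities, columns sum to 1
print(naive)
print(stable)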
Your ReLU derivative implementation doesn't work for me at all; NumPy has np.greater, but there is no np.greaterthan. This one works:
def ReLu_Derv(self, x):
    return (x > 0).astype(int)
You need to actually use this implementation in BackPropagation:
e1 = self.w2.T.dot(e2) * self.ReLu_Derv(self.a1)
With these amendments, I managed to achieve 91.0% accuracy after 100 iterations with LR=0.1. I loaded MNIST from Keras with this code:
import tensorflow as tf

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
X = train_images.reshape(-1, 28*28).T
Y = train_labels
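The question scales the pixel values to [0, 1] before training; presumably you would do the same with the Keras arrays (sketch):
X = X / 255.0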
Hi, I'm trying to train my own neural network design on the MNIST handwritten digit data set. Every time I run this code the accuracy starts to increase and then decreases, and I get an overflow warning. Can someone explain whether my code is just poor and messy, or whether I have missed some small detail? Thanks in advance.
import numpy as np
import pandas as pd
df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))
class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10, 784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10, 10)

    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in, 1))
        return weights, biases

    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b1
        y_pred = self.Softmax(self.z2)
        return y_pred

    def ReLu(self, Z):
        return np.maximum(Z, 0)

    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))

    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)

    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot

    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)

    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size

    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * np.sum(e2, axis=1)
        derB2 = derB2.reshape(10, 1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * np.sum(e1, axis=1)
        derB1 = derB1.reshape(10, 1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*derB1
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*derB2

    def train(self, X, Y, epochs=1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2
NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
You should use a different bias for the second layer
self.z2 = self.w2.dot(self.a1) + self.b1  # wrong: this reuses b1
self.z2 = self.w2.dot(self.a1) + self.b2  # right: use b2
When doing something like this
derB2 = (1/m) * np.sum(e2, axis=1)
you may want to pass keepdims=True to make sure that derB2.shape is (something, 1) rather than (something,). It makes your code more rigorous.
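A two-line illustration of the shape difference (a sketch; the array here is just a random stand-in for e2):
e2 = np.random.randn(10, 41000)
print(np.sum(e2, axis=1).shape)                 # (10,)
print(np.sum(e2, axis=1, keepdims=True).shape)  # (10, 1)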
I am trying to run linear regression on data with shapes (768, 8) and (768,) for X and Y respectively. I want to run the predictor so that I get y values for the last three rows of X. I have manually passed the contents of one row to the predictor.
import numpy as np
from sklearn.model_selection import train_test_split  # sklearn.cross_validation in older versions

class LinearRegression():
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, Y):
        self.m, self.n = X.shape
        # weight initialization
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        for i in range(self.iterations):
            self.update_weights()
        return self

    def update_weights(self):
        Y_pred = self.predict(self.X)
        # calculate gradients
        dW = -(2 * (self.X.T).dot(self.Y - Y_pred)) / self.m
        db = -2 * np.sum(self.Y - Y_pred) / self.m
        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self

    def predict(self, X):
        return X.dot(self.W) + self.b

def runLR():
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=1/3, random_state=0)
    model = LinearRegression(iterations=1000, learning_rate=0.01)
    model.fit(X_train, Y_train)
    see = model.predict(np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0,
                                   0.0, 0.0]]))
    print(see)

runLR()
However, the result of see that I get is [nan]. I believe I am passing my data into model.predict in the wrong format?
X=np.random.normal(size=(768,8))
Y=X[:,0]+2*X[:,1]+3*X[:,2]+4*X[:,3]+5*X[:,4]+6*X[:,5]+7*X[:,6]+8*X[:,7]
def runLR():
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=float(1)/3, random_state=0)
    model = LinearRegression(iterations=1000, learning_rate=0.01)
    model.fit(X_train, Y_train)
    print(model.b)
    print(model.W)
    see = model.predict(np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0,
                                   0.0, 0.0]]))
    print(see)

runLR()
This should work. Just change 1/3 to float(1)/3 in test_size, because in Python 2 the integer division 1/3 is just 0.
-1.2610015928318272e-07 #b is about 0
[0.99999993 1.99999988 3.00000005 3.99999998 5.00000002 5.99999999
7.00000003 7.99999991] # W is 1 through 8
[2399.97995156] # predicted y
To predict values for more rows, add them to the array like this: np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0, 0.0, 0.0], [1, 2, 3, 1, 2, 3, 1, 2], [5, 6, 7, 8, 1, 2, 4, 3]])
I was implementing L1 regularization with PyTorch for feature selection and found that I get different results compared to scikit-learn or cvxpy. Perhaps I am implementing nn.L1Loss incorrectly, or maybe there is a better way to optimize (I tried both Adam and SGD with a few different learning rates)?
import numpy as np
from tqdm import tqdm_notebook
import matplotlib.pyplot as plt
import cvxpy as cp
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.linear_model import Lasso
import torch
import torch.nn as nn
import torch.optim as optim
# generate data
X,y, coef_true = make_regression(n_samples=200, n_features=10000, n_informative=10,
coef = True, random_state = 123)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25)
print(np.where(coef_true != 0)[0])
# [ 893 4422 4428 5284 5632 5975 6388 7586 8270 9597]
Using sklearn, I get the correct answer:
# sklearn lasso
lasso_sklearn = Lasso(alpha = 0.2, warm_start = True)
lasso_sklearn.coef_ = np.zeros(X_train.shape[1])
lasso_sklearn.fit(X_train, y_train)
coef_sklearn = lasso_sklearn.coef_
print(np.where(lasso_sklearn.coef_ != 0)[0])
# [ 893 4422 4428 5284 5632 5975 6388 7586 8270 9597]
Using pytorch, I get this answer:
# pytorch lasso
class lasso(nn.Module):
    def __init__(self, in_dim):
        super(lasso, self).__init__()
        self.linear = nn.Linear(in_dim, 1)

    def forward(self, X):
        return self.linear(X)

def weights_init(m):
    if isinstance(m, nn.Linear):
        torch.nn.init.zeros_(m.weight)
lasso_pytorch = lasso(X_train.shape[1])
lasso_pytorch.apply(weights_init)
l1_loss = nn.L1Loss(reduction = 'sum')
mse_loss = nn.MSELoss()
optimizer = optim.Adam(lasso_pytorch.parameters(), lr = 0.0001)
alpha = 0.20
n_epoch = 5000
loss_history = []
lasso_pytorch.train()
for epoch in tqdm_notebook(range(n_epoch)):
    optimizer.zero_grad()
    outputs = lasso_pytorch(torch.from_numpy(X_train).float())
    loss = 0.5 * mse_loss(outputs, torch.from_numpy(y_train.reshape(-1, 1)).float())
    p = 0
    for param in lasso_pytorch.parameters():
        loss += alpha * l1_loss(param, torch.zeros_like(param))
    loss_history.append(loss)
    loss.backward()
    optimizer.step()
coef_pytorch = np.array(lasso_pytorch.linear.weight.data).squeeze()
print(np.argsort(-np.abs(coef_pytorch))[:10])
# [5141 2251 902 2848 5002 8925 9328 8084 1888 2208]
Using cvxpy, I also get the correct answer:
# cvxpy lasso
def loss_fn(X, Y, beta):
    return cp.norm2(cp.matmul(X, beta) - Y)**2

def regularizer(beta):
    return cp.norm1(beta)

def objective_fn(X, Y, beta, alpha):
    return 0.5/(len(X)) * loss_fn(X, Y, beta) + alpha * regularizer(beta)  # from sklearn lasso
coef = cp.Variable(X_train.shape[1])
coef.value = np.zeros(X_train.shape[1])
alpha = cp.Parameter(nonneg=True)
alpha.value = 0.2
problem = cp.Problem(cp.Minimize(objective_fn(X, y, coef, alpha,)))
problem.solve(solver = cp.ECOS, warm_start = True,)
coef_cvxpy = coef.value
print(np.where(coef.value > 0.01)[0])
# [ 893, 4422, 4428, 5284, 5632, 5975, 6388, 7586, 8270, 9597]
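One diagnostic that might help here (a sketch, not part of the code above): evaluate the same Lasso objective that objective_fn encodes, 0.5/n * ||y - Xw||^2 + alpha * ||w||_1, for each of the three coefficient vectors (ignoring intercepts). If the PyTorch weights give a noticeably larger objective value, the gradient-descent run simply has not reached the Lasso optimum, rather than nn.L1Loss itself being the problem.
# Sketch: compare the achieved Lasso objective of each solution on the training split.
def lasso_objective(w, X, y, alpha=0.2):
    resid = X.dot(w) - y
    return 0.5 / len(X) * resid.dot(resid) + alpha * np.abs(w).sum()

for name, w in [('sklearn', coef_sklearn), ('pytorch', coef_pytorch), ('cvxpy', coef_cvxpy)]:
    print(name, lasso_objective(w, X_train, y_train))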