Vanishing parameters in MAML JAX (Meta Learning) - python

I am working on an implementation of MAML (see https://arxiv.org/pdf/1703.03400.pdf) in JAX.
When training on a distribution of simple linear regression tasks, it seems to perform fine (it takes a while to converge but ultimately works).
However, when training on tasks distributed like A * sin(B + X), where A and B are random variables, all the weights in the network converge to 0 (see the attached training results plot).
This is clearly not right.
Thanks in advance for any help provided.
Full code here https://colab.research.google.com/drive/1YoOkwo5tI42LeIbBOxpImkN55Kg9wScl?usp=sharing or see below for minimal code.
Task Generation code:
class MAMLDataLoader:
    def __init__(self, sample_task_fn, num_tasks, batch_size):
        self.sample_task_fn = sample_task_fn
        self.num_tasks = num_tasks
        self.batch_size = batch_size

    def sample_tasks(self, key):
        XS = jnp.empty((self.num_tasks, 2 * self.batch_size, 1))
        YS = jnp.empty((self.num_tasks, 2 * self.batch_size, 1))
        for i in range(self.num_tasks):
            key, subkey = random.split(key)
            xs, ys = self.sample_task_fn(self.batch_size * 2, subkey)
            XS = XS.at[i].set(xs)
            YS = YS.at[i].set(ys)
        x_train, x_test = XS[:, :self.batch_size], XS[:, self.batch_size:]
        y_train, y_test = YS[:, :self.batch_size], YS[:, self.batch_size:]
        return x_train, y_train, x_test, y_test

    def dummy_input(self):
        key = random.PRNGKey(0)
        x = self.sample_task_fn(1, key)[0][0]
        return x
def sample_sinusoidal_task(samples, key):
    # y = a * sin(b + x)
    xs_key, amplitude_key, phase_key = random.split(key, num=3)
    amplitude = random.uniform(amplitude_key, (1, 1))
    phase = random.uniform(phase_key, (1, 1)) * jnp.pi * 2
    xs = (random.uniform(xs_key, (samples, 1)) * 4 - 2) * jnp.pi
    ys = amplitude * jnp.sin(xs + phase)
    return xs, ys
Here is the main MAML code:
class MAMLTrainer:
    def __init__(self, model, alpha, optimiser, inner_steps=1):
        self.model = model
        self.alpha = alpha
        self.optimiser = optimiser
        self.inner_steps = inner_steps
        self.jit_step = jit(self.step)

    def loss(self, params, x, y):
        preds = self.model.apply(params, x)
        return jnp.mean(jnp.inner(y - preds, y - preds) / 2.0)

    def update(self, params, x, y, inner_steps=None):
        if inner_steps is None:
            inner_steps = self.inner_steps
        loss_grad = grad(self.loss)

        def _update(i, params):
            grads = loss_grad(params, x, y)
            new_params = tree_map(lambda p, g: p - self.alpha * g, params, grads)
            return new_params

        return lax.fori_loop(0, inner_steps, _update, params)

    def meta_loss(self, params, x1, y1, x2, y2):
        return self.loss(self.update(params, x1, x2), x2, y2)

    def batch_meta_loss(self, params, x1, y1, x2, y2):
        return jnp.mean(vmap(partial(self.meta_loss, params))(x1, y1, x2, y2))

    def step(self, params, optimiser, x1, y1, x2, y2):
        loss, grads = value_and_grad(self.batch_meta_loss)(params, x1, y1, x2, y2)
        updates, opt_state = self.optimiser.update(grads, optimiser, params)
        params = optax.apply_updates(params, updates)
        return params, loss

    def train(self, dataloader, steps, key, params=None):
        if params is None:
            key, subkey = random.split(key)
            params = self.model.init(subkey, dataloader.dummy_input())
        optimiser = self.optimiser.init(params)
        pbar, losses = tqdm(range(steps), desc='Training'), []
        for epoch in pbar:
            key, subkey = random.split(key)
            params, loss = self.jit_step(params, optimiser, *dataloader.sample_tasks(subkey))
            losses.append(loss)
            if epoch % 100 == 0:
                avg_loss = jnp.mean(jnp.array(losses[-100:]))
                pbar.set_postfix_str(f'current_loss: {loss:.3f}, running_loss_100_epochs: {avg_loss:.3f}')
        return params, jnp.array(losses)

    def n_shot_learn(self, x_train, y_train, params, n):
        return self.update(params, x_train, y_train, n)
Training Code:
class SimpleMLP(nn.Module):
    features: Sequence[int]

    @nn.compact
    def __call__(self, inputs):
        x = inputs
        for i, feat in enumerate(self.features[:-1]):
            x = nn.Dense(feat)(x)
            x = nn.relu(x)
        return nn.Dense(self.features[-1])(x)

model = SimpleMLP([64, 64, 1])
optimiser = optax.adam(1e-3)
trainer = MAMLTrainer(model, 0.1, optimiser, 1)
dataloader = MAMLDataLoader(sample_sinusoidal_task, 2, 100)
key = random.PRNGKey(0)
params, losses = trainer.train(dataloader, 10000, key)
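For reference, here is a minimal sketch (using only the trainer and dataloader defined above) of how I adapt the meta-trained parameters to a fresh task with n_shot_learn and measure the post-adaptation loss:
eval_key = random.PRNGKey(1)
x_train, y_train, x_test, y_test = dataloader.sample_tasks(eval_key)
# adapt to the first sampled task with 5 inner gradient steps
adapted_params = trainer.n_shot_learn(x_train[0], y_train[0], params, 5)
test_loss = trainer.loss(adapted_params, x_test[0], y_test[0])
print(f'post-adaptation test loss: {test_loss:.3f}')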

Related

MNIST numpy neural network accuracy hovering at 10%

Hi, I've been working on a neural network to tackle the MNIST dataset, but when I run the code the accuracy begins to increase but eventually settles at 0.098, and I also encounter an overflow error in exp when calculating the SoftMax values. I have tried to debug my code but I don't understand where I'm going wrong. If anyone can point me in the right direction that would be great, and if you can't find an error, could you give me tips on techniques for debugging this? Thanks in advance.
import numpy as np
import pandas as pd

df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))

class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10,784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10,10)
    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in,1))
        return weights, biases
    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b2
        y_pred = self.Softmax(self.z2)
        return y_pred
    def ReLu(self, Z):
        return np.maximum(Z,0)
    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))
    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)
    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot
    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)
    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size
    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * e2
        #derB2 = derB2.reshape(10,1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * e1
        #derB1 = derB1.reshape(10,1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*np.sum(derB1, axis=1, keepdims=True)
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*np.sum(derB2, axis=1, keepdims=True)
    def train(self, X, Y, epochs = 1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2

NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
I found the following errors:
Your softmax implementation doesn't work because of the severe numerical errors you get when exponentiating potentially large numbers to obtain something between 0 and 1. Besides, you forgot to specify the summation axis in the denominator. Here is a working implementation:
def Softmax(self, Z):
    e = np.exp(Z - Z.max(axis=0, keepdims=True))
    return e/e.sum(axis=0, keepdims=True)
(Here and below I skip coding-style remarks that are not essential in this context, such as whether this should be a class method or a stand-alone function, etc.)
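To see why subtracting the column-wise maximum is safe: softmax is invariant to shifting its input, so the stable version returns the same probabilities, while the naive version overflows for large logits. A quick illustrative check (the example values are arbitrary):
import numpy as np

Z = np.array([[1.0, 1000.0],
              [2.0, 1001.0]])                      # second column has large logits
naive = np.exp(Z) / np.sum(np.exp(Z), axis=0)      # overflows: second column becomes nan
e = np.exp(Z - Z.max(axis=0, keepdims=True))
stable = e / e.sum(axis=0, keepdims=True)          # both columns are valid probability vectors
print(naive)
print(stable)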
Your ReLu derivative implementation doesn't work at all: NumPy has no np.greaterthan function (the element-wise comparison is np.greater). This one works:
def ReLu_Derv(self, x):
    return (x > 0).astype(int)
You need to actually use this implementation in BackPropagation:
e1 = self.w2.T.dot(e2) * self.ReLu_Derv(self.a1)
With these amendments, I managed to achieve 91.0% accuracy after 100 iterations with LR=0.1. I loaded MNIST from Keras with this code:
(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
X = train_images.reshape(-1, 28*28).T
Y = train_labels
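For completeness, a rough sketch of how that data can then be scaled and fed to the network above (this assumes the NeuralNetwork class with the fixes applied; the LR=0.1 I mentioned additionally needs to be passed through to BackPropagation):
import tensorflow as tf

(train_images, train_labels), (test_images, test_labels) = tf.keras.datasets.mnist.load_data()
X = train_images.reshape(-1, 28*28).T / 255.0   # shape (784, 60000), pixels scaled to [0, 1]
Y = train_labels
NN = NeuralNetwork(X, Y)                        # constructor arguments are unused internally
w1, b1, w2, b2 = NN.train(X, Y, epochs=100)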

Multi class neural network for MNIST data set not working

Hi, I'm trying to train a neural network of my own design on the MNIST handwritten digit data set, and every time I run this code the accuracy starts to increase and then decreases, and I get an overflow warning. Can someone explain whether my code is just poor and messy, or whether I have missed something small? Thanks in advance.
import numpy as np
import pandas as pd

df = pd.read_csv('../input/digit-recognizer/train.csv')
data = np.array(df.values)
data = data.T
data
Y = data[0,:]
X = data[1:,:]
Y_train = Y[:41000]
X_train = X[:,:41000]
X_train = X_train/255
Y_val = Y[41000:]
X_val = X[:,41000:]
X_val = X_val/255
print(np.max(X_train))

class NeuralNetwork:
    def __init__(self, n_in, n_out):
        self.w1, self.b1 = self.Generate_Weights_Biases(10,784)
        self.w2, self.b2 = self.Generate_Weights_Biases(10,10)
    def Generate_Weights_Biases(self, n_in, n_out):
        weights = 0.01*np.random.randn(n_in, n_out)
        biases = np.zeros((n_in,1))
        return weights, biases
    def forward(self, X):
        self.Z1 = self.w1.dot(X) + self.b1
        self.a1 = self.ReLu(self.Z1)
        self.z2 = self.w2.dot(self.a1) + self.b1
        y_pred = self.Softmax(self.z2)
        return y_pred
    def ReLu(self, Z):
        return np.maximum(Z,0)
    def Softmax(self, Z):
        #exponentials = np.exp(Z)
        #sumexp = np.sum(np.exp(Z), axis=0)
        #print(Z)
        return np.exp(Z)/np.sum(np.exp(Z))
    def ReLu_Derv(self, x):
        return np.greaterthan(x, 0).astype(int)
    def One_hot_encoding(self, Y):
        one_hot = np.zeros((Y.size, 10))
        rows = np.arange(Y.size)
        one_hot[rows, Y] = 1
        one_hot = one_hot.T
        return one_hot
    def Get_predictions(self, y_pred):
        return np.argmax(y_pred, 0)
    def accuracy(self, pred, Y):
        return np.sum(pred == Y)/Y.size
    def BackPropagation(self, X, Y, y_pred, lr=0.01):
        m = Y.size
        one_hot_y = self.One_hot_encoding(Y)
        e2 = y_pred - one_hot_y
        derW2 = (1/m) * e2.dot(self.a1.T)
        derB2 = (1/m) * np.sum(e2, axis=1)
        derB2 = derB2.reshape(10,1)
        e1 = self.w2.T.dot(e2) * self.ReLu(self.a1)
        derW1 = (1/m) * e1.dot(X.T)
        derB1 = (1/m) * np.sum(e1, axis=1)
        derB1 = derB1.reshape(10,1)
        self.w1 = self.w1 - lr*derW1
        self.b1 = self.b1 - lr*derB1
        self.w2 = self.w2 - lr*derW2
        self.b2 = self.b2 - lr*derB2
    def train(self, X, Y, epochs = 1000):
        for i in range(epochs):
            y_pred = self.forward(X)
            predict = self.Get_predictions(y_pred)
            accuracy = self.accuracy(predict, Y)
            print(accuracy)
            self.BackPropagation(X, Y, y_pred)
        return self.w1, self.b1, self.w2, self.b2

NN = NeuralNetwork(X_train, Y_train)
w1,b1,w2,b2 = NN.train(X_train,Y_train)
You should use a different bias for the second layer
self.z2 = self.w2.dot(self.a1) + self.b1 # not b1
self.z2 = self.w2.dot(self.a1) + self.b2 # but b2
When doing something like this
derB2 =(1/m) * np.sum(e2,axis=1)
you should pass keepdims=True to make sure that derB2.shape is (something, 1) rather than (something,). It makes your code more rigorous.
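Putting both remarks together, the affected lines would look roughly like this (a sketch against the class above, not a full rewrite):
# in forward(): use the second layer's own bias
self.z2 = self.w2.dot(self.a1) + self.b2

# in BackPropagation(): keep the trailing dimension so the bias updates stay (10, 1)
derB2 = (1/m) * np.sum(e2, axis=1, keepdims=True)
derB1 = (1/m) * np.sum(e1, axis=1, keepdims=True)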

linear regression on 8D X data returns nan

I am trying to run linear regression on data with shapes (768, 8) and (768,) for X and Y respectively. I want to run the predictor so that I get y values for the last three rows of X. For now I have manually passed the contents of one row to the predictor.
class LinearRegression():
    def __init__(self, learning_rate, iterations):
        self.learning_rate = learning_rate
        self.iterations = iterations

    def fit(self, X, Y):
        self.m, self.n = X.shape
        # weight initialization
        self.W = np.zeros(self.n)
        self.b = 0
        self.X = X
        self.Y = Y
        for i in range(self.iterations):
            self.update_weights()
        return self

    def update_weights(self):
        Y_pred = self.predict(self.X)
        # calculate gradients
        dW = -(2 * (self.X.T).dot(self.Y - Y_pred)) / self.m
        db = -2 * np.sum(self.Y - Y_pred) / self.m
        # update weights
        self.W = self.W - self.learning_rate * dW
        self.b = self.b - self.learning_rate * db
        return self

    def predict(self, X):
        return X.dot(self.W) + self.b

def runLR():
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=1/3, random_state=0)
    model = LinearRegression(iterations=1000, learning_rate=0.01)
    model.fit(X_train, Y_train)
    see = model.predict(np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0,
                                   0.0, 0.0]]))
    print(see)

runLR()
However, the result of see that I get is [nan]. Am I passing my data into model.predict in the wrong format?
X = np.random.normal(size=(768, 8))
Y = X[:,0] + 2*X[:,1] + 3*X[:,2] + 4*X[:,3] + 5*X[:,4] + 6*X[:,5] + 7*X[:,6] + 8*X[:,7]

def runLR():
    X_train, X_test, Y_train, Y_test = train_test_split(
        X, Y, test_size=float(1)/3, random_state=0)
    model = LinearRegression(iterations=1000, learning_rate=0.01)
    model.fit(X_train, Y_train)
    print(model.b)
    print(model.W)
    see = model.predict(np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0,
                                   0.0, 0.0]]))
    print(see)

runLR()
This should work. Just change 1/3 to float(1)/3 in test_size, because in Python 2 the integer division 1/3 is just 0.
-1.2610015928318272e-07 #b is about 0
[0.99999993 1.99999988 3.00000005 3.99999998 5.00000002 5.99999999
7.00000003 7.99999991] # W is 1 through 8
[2399.97995156] # predicted y
To add more predictors, add them to the array like this: np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0, 0.0, 0.0],[1,2,3,1,2,3,1,2],[5,6,7,8,1,2,4,3]])
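For example, a quick sketch of such a multi-row call (this assumes runLR is changed to return the fitted model):
model = runLR()   # requires adding `return model` at the end of runLR
rows = np.array([[0.98, 514.5, 294.0, 110.25, 7.0, 2.0, 0.0, 0.0],
                 [1, 2, 3, 1, 2, 3, 1, 2],
                 [5, 6, 7, 8, 1, 2, 4, 3]])
print(model.predict(rows))   # one predicted y per row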

Estimate Variables of a Sine Wave using Tensorflow

Given data in the form x, y such that y = A sin(B(x) + C) + D, identify A, B, C, and D using Tensorflow.
I have written the following code to do so, but unfortunately it does not learn. Note here the problem is not to predict the sine curve correctly, but to identify the variables. Bonus points if it is possible to change the function's form to y = A * X_2 * sin (B(X_1) + C) + D.
x = np.linspace(0, 100, 1000)
A = np.random.normal(1)
B = np.random.normal(.5)
C = np.random.normal(1)
D = np.random.normal(1)
y = A*np.sin((B*x) + C) + D

x = tf.constant([x.astype('float32')])
y = tf.constant([y.astype('float32')])

class Addition(tf.Module):
    def __init__(self, inputs, name=None):
        super().__init__(name=name)
        self.b_1 = tf.Variable(tf.random.normal([inputs]), name='b1')
        self.b_2 = tf.Variable(tf.random.normal([inputs]), name='b2')

    def __call__(self, x):
        out = tf.math.multiply(x, self.b_1) + self.b_2
        return out

class Sinusoid(tf.Module):
    def __init__(self, inputs, name=None):
        super().__init__(name=name)

    def __call__(self, x):
        sine = tf.math.sin(x)
        return sine

class Sine_Model(tf.Module):
    def __init__(self, name=None):
        super().__init__(name=name)
        self.add_1 = Addition(inputs=1)
        self.sin_1 = Sinusoid(inputs=1)
        self.add_2 = Addition(inputs=1)

    def __call__(self, x):
        x = self.add_1(x)
        x = self.sin_1(x)
        x = self.add_2(x)
        return x

model = Sine_Model(name='sine')
loss_object = tf.keras.losses.MeanSquaredError()
optimizer = tf.keras.optimizers.Adam(learning_rate=.1)
train_loss = tf.keras.metrics.Mean(name='train_loss')

@tf.function
def train_step(x, y):
    with tf.GradientTape() as tape:
        predictions = model(x)
        loss = loss_object(y, predictions)
    gradients = tape.gradient(loss, model.trainable_variables)
    optimizer.apply_gradients(zip(gradients, model.trainable_variables))
    train_loss(loss)

EPOCHS = 200
for epoch in range(EPOCHS):
    # Reset the metrics at the start of the next epoch
    train_loss.reset_states()
    train_step(x, y)
    template = 'Epoch {}, Loss: {}'
    #print(template.format(epoch + 1,
    #                      train_loss.result()))

y_predicted = model(x)
plt.scatter(x, y_predicted.numpy()[0])
plt.scatter(x, y, c='r')
I did see an answer to this question using scipy here. But I would like to see if it is possible to do it using TensorFlow specifically, as I am interested in modularity and would like to be able to solve the problem noted as a bonus above (y = A * X_2 * sin(B(X_1) + C) + D).
Thanks!

How to make this estimator scikit-learn-compatible?

I am trying to make this estimator scikit-learn-compatible so that I can search the parameter space with GridSearchCV.
EDIT:
I have modified the script as suggested (see below):
The fit signature is now fit(self, X, y).
All parameters are passed in __init__.
There is still a compatibility issue with GridSearchCV, possibly because the estimator is a multilabel classifier:
ValueError: Can't handle mix of multilabel-indicator and continuous-multioutput
But that is beside the point; the attribute error is now gone, so we can safely conclude that the suggested modifications made the estimator scikit-learn-compatible.
Final code script:
import numpy as np
from sklearn.grid_search import GridSearchCV
from sklearn.datasets import make_classification
from sklearn.preprocessing import LabelBinarizer
from sklearn.cross_validation import train_test_split
from sklearn.base import BaseEstimator, ClassifierMixin

class LogisticClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, basis=None, itrs=100, learn_rate=0.1, reg=0.1, momentum=0.5, proj_layer_size=10):
        self.W = []
        self.A = None
        if basis == 'rectifier':
            self.basisfunc = self.rectifier_basis
        else:
            self.basisfunc = self.identity
        self.itrs = itrs
        self.learn_rate = learn_rate
        self.reg = reg
        self.momentum = momentum
        self.proj_layer_size = proj_layer_size

    def identity(self, x):
        return np.hstack((x, 1))

    def rectifier_basis(self, x):
        xn = np.dot(self.A, x)
        return self.identity(np.maximum(xn, 0))

    def basismap(self, X):
        new_dimensions = self.basisfunc(X[0,:]).shape[0]
        Xn = np.zeros((X.shape[0], new_dimensions))
        for i, xi in enumerate(X):
            Xn[i,:] = self.basisfunc(xi)
        return Xn

    def fit(self, X, Y):
        self.A = np.random.uniform(-1, 1, (self.proj_layer_size, X.shape[1]))
        Xn = self.basismap(X)
        self.W = np.array(np.random.uniform(-0.1, 0.1, (Y.shape[1], Xn.shape[1])))
        costs_train, costs_test = [], []
        previous_grad = np.zeros(self.W.shape)
        for i in range(self.itrs):
            grad = self.grad(Xn, Y)
            self.W = self.W - self.learn_rate*(grad + self.momentum*previous_grad)
            previous_grad = grad
            costs_train.append(self.loss(X, Y))
            #costs_test.append(self.loss(Xtest, Ytest))
        #return (costs_train, costs_test)
        return costs_train

    def softmax(self, Z):
        Z = np.maximum(Z, -1e3)
        Z = np.minimum(Z, 1e3)
        numerator = np.exp(Z)
        return numerator/np.sum(numerator, axis=1).reshape((-1,1))

    def predict(self, X):
        Xn = self.basismap(X)
        return self.softmax(np.dot(Xn, self.W.T))

    def grad(self, Xn, Y):
        Yh = self.softmax(np.dot(Xn, self.W.T))
        return -np.dot(Y.T - Yh.T, Xn)/Xn.shape[0] + self.reg*self.W

    def loss(self, X, Y):
        Yh = self.predict(X)
        return -np.mean(np.mean(Y*np.log(Yh))) - self.reg*np.trace(np.dot(self.W, self.W.T))/self.W.shape[0]

    def get_params(self, deep=True):
        return {"itrs": self.itrs, "learn_rate": self.learn_rate, "reg": self.reg, "momentum": self.momentum,
                "report_cost": self.report_cost, "proj_layer_size": self.proj_layer_size, "iseed": self.iseed}

    def set_params(self, **parameters):
        for parameter, value in parameters.items():
            setattr(self, parameter, value)

#make data
X, Y = make_classification(n_features=2, n_informative=2, n_redundant=0, n_repeated=0, n_classes=3,
                           n_clusters_per_class=1, random_state=31)
lb = LabelBinarizer()
Y = lb.fit_transform(Y)
Xtrain, Xtest, Ytrain, Ytest = train_test_split(X, Y, test_size=0.25, random_state=5)

#model optimization
param_grid = {'learn_rate': [0.1, 0.01, 0.001],
              'reg': [0.001, 0.01]}
clf = LogisticClassifier(basis='rectifier')
gs_cv = GridSearchCV(clf, param_grid, scoring='accuracy').fit(Xtrain, Ytrain)
print('Best hyperparameters: %r' % gs_cv.best_params_)
In the get_params method you call self.itrs, but your object doesn't have such an attribute.
Also, I suggest you change the fit signature to something like fit(self, X, y), pass all the parameters in __init__, and split X and y into train and test sets using sklearn.cross_validation.train_test_split.
That would make your code more sklearn-like and more compatible with library functions; see the sketch below.
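A minimal sketch of those conventions (an illustration, not your classifier): hyperparameters are stored verbatim in __init__, fitted state is created in fit, and get_params/set_params are inherited from BaseEstimator so GridSearchCV can clone and reconfigure the estimator.
import numpy as np
from sklearn.base import BaseEstimator, ClassifierMixin

class SketchClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, learn_rate=0.1, reg=0.1, itrs=100):
        # store constructor arguments verbatim and do no other work here,
        # so BaseEstimator's get_params/set_params just work
        self.learn_rate = learn_rate
        self.reg = reg
        self.itrs = itrs

    def fit(self, X, y):
        # fitted state is created here, with a trailing underscore by convention
        self.classes_ = np.unique(y)
        self.W_ = np.zeros((len(self.classes_), X.shape[1]))
        return self  # fit must return self for GridSearchCV to work

    def predict(self, X):
        scores = X.dot(self.W_.T)
        return self.classes_[np.argmax(scores, axis=1)]
Note that the attribute error in your own get_params comes precisely from reporting parameters (report_cost, iseed) that __init__ never stores.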
