I am using Keras' sklearn wrapper for a regressor, namely tf.keras.wrappers.scikit_learn.KerasRegressor.
I want this regressor to work within sklearn's cross validation scheme, namely sklearn.model_selection.cross_validate.
The regressor generally works without CV.
However, the latter fails because my regressor's __init__ takes a required parameter that defines the batch input shape, and that parameter appears to go missing during cross-validation.
This seems to be the case because MyRegressor or KerasRegressor isn't correctly cloneable using clone(estimator). The specific error message is:
KeyError: 'batch_input_shape'
Is there a way to make MyRegressor work with cross_validate? Am I somehow violating sklearn's requirements?
Please see this condensed working example:
from sklearn.datasets import make_regression
from sklearn.model_selection import cross_validate
from tensorflow.keras.layers import Dense, LSTM
from tensorflow.keras.models import Sequential
from tensorflow.keras.wrappers.scikit_learn import KerasRegressor
class MyRegressor(KerasRegressor):
    def __init__(self, batch_input_shape, build_fn=None, **kwargs):
        self.batch_input_shape = batch_input_shape
        super().__init__(**kwargs)

    def __call__(self, *kwargs):
        model = Sequential([
            LSTM(16, stateful=True, batch_input_shape=self.batch_input_shape),
            Dense(1),
        ])
        model.compile(optimizer='adam', loss='mean_squared_error', metrics=['RootMeanSquaredError'])
        return model

    def reset_states(self):
        self.model.reset_states()
X, y = make_regression(6400, 5)
X = X.reshape(X.shape[0], 1, X.shape[1])
batch_size = 64
batch_input_shape = (batch_size, 1, X.shape[-1])
# Works fine
reg = MyRegressor(batch_input_shape)
for i in range(10):
    reg.fit(X, y, batch_size=batch_size)
    reg.reset_states()
# Doesn't work
reg = MyRegressor(batch_input_shape)
results = cross_validate(reg, X, y, scoring=['neg_mean_squared_error'])
Cloneability requires a proper get_params method. Most often this is obtained by inheriting from sklearn's BaseEstimator, but KerasRegressor instead implements its own directly (source). The way it does it is incompatible with your additional batch_input_shape; you can tweak it to make things work:
def get_params(self, deep=False):
    res = self.sk_params.copy()  # sk_params was set by KerasRegressor.__init__
    res.update({
        'build_fn': self.build_fn,
        'batch_input_shape': self.batch_input_shape,
    })
    return res
(After this update I still get an error in your example about input shapes, but I'm not familiar enough with batch sizes in Keras to answer that follow-up.)
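To check that the wrapper is now cloneable, a quick sketch (assuming the get_params method above has been added to MyRegressor):

from sklearn.base import clone

reg = MyRegressor(batch_input_shape)
reg_clone = clone(reg)  # no longer raises KeyError: 'batch_input_shape'
print(reg_clone.get_params()['batch_input_shape'])  # (64, 1, 5) in the example above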
Related
After I created my model in Keras, I want to get the gradients and apply them directly in TensorFlow with the tf.train.AdamOptimizer class. However, since I am using a Dropout layer, I don't know how to tell the model whether it is in training mode or not. The training keyword is not accepted. This is the code:
net_input = Input(shape=(1,))
net_1 = Dense(50)
net_2 = ReLU()
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input))))
#mycost = ...
optimizer = tf.train.AdamOptimizer()
gradients = optimizer.compute_gradients(mycost, var_list=[net.trainable_weights])
# perform some operations on the gradients
# gradients = ...
trainstep = optimizer.apply_gradients(gradients)
I get the same behavior with and without dropout layer, even with dropout rate=1. How to solve this?
As @Sharky already said, you can use the training argument when invoking the call() method of the Dropout class. However, if you want to train in TensorFlow graph mode, you need to pass a placeholder and feed it a boolean value during training. Here is an example of fitting Gaussian blobs, applicable to your case:
import tensorflow as tf
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.model_selection import train_test_split
from tensorflow.keras.layers import Dense
from tensorflow.keras.layers import Dropout
from tensorflow.keras.layers import ReLU
from tensorflow.keras.layers import Input
from tensorflow.keras import Model
x_train, y_train = make_blobs(n_samples=10,
                              n_features=2,
                              centers=[[1, 1], [-1, -1]],
                              cluster_std=1)
x_train, x_test, y_train, y_test = train_test_split(
    x_train, y_train, test_size=0.2)

# `istrain` indicates whether it is inference or training
istrain = tf.placeholder(tf.bool, shape=())
y = tf.placeholder(tf.int32, shape=(None))

net_input = Input(shape=(2,))
net_1 = Dense(2)
net_2 = Dense(2)
net_3 = Dropout(0.5)
net = Model(net_input, net_3(net_2(net_1(net_input)), training=istrain))

xentropy = tf.nn.sparse_softmax_cross_entropy_with_logits(
    labels=y, logits=net.output)
loss_fn = tf.reduce_mean(xentropy)
optimizer = tf.train.AdamOptimizer(0.01)
grads_and_vars = optimizer.compute_gradients(loss_fn,
                                             var_list=[net.trainable_variables])
trainstep = optimizer.apply_gradients(grads_and_vars)

with tf.Session() as sess:
    sess.run(tf.global_variables_initializer())
    l1 = loss_fn.eval({net_input: x_train,
                       y: y_train,
                       istrain: True})   # apply dropout
    print(l1)  # 1.6264652
    l2 = loss_fn.eval({net_input: x_train,
                       y: y_train,
                       istrain: False})  # no dropout
    print(l2)  # 1.5676715
    sess.run(trainstep, feed_dict={net_input: x_train,
                                   y: y_train,
                                   istrain: True})  # train with dropout
Keras layers inherit from the tf.keras.layers.Layer class, and the Keras API handles the training flag internally in model.fit. When Keras Dropout is used in a pure TensorFlow training loop, it supports a training argument in its call function.
So you can control it with
dropout = tf.keras.layers.Dropout(rate, noise_shape, seed)(prev_layer, training=is_training)
From the official TF docs:

Note: The following optional keyword arguments are reserved for specific uses:
* training: Boolean scalar tensor of Python boolean indicating whether the call is meant for training or inference.
* mask: Boolean input mask.
If the layer's call method takes a mask argument (as some Keras layers do), its default value will be set to the mask generated for inputs by the previous layer (if input did come from a layer that generated a corresponding mask, i.e. if it came from a Keras layer with masking support).
https://www.tensorflow.org/api_docs/python/tf/keras/layers/Dropout#call
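If eager execution is available, the effect of the training flag can also be checked directly on the layer. A minimal sketch:

import tensorflow as tf

drop = tf.keras.layers.Dropout(0.5)
x = tf.ones((1, 10))

print(drop(x, training=True))   # roughly half the entries zeroed, the rest scaled up
print(drop(x, training=False))  # identical to x: dropout is a no-op at inference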
This is the model I defined; it is a simple LSTM with two fully connected layers.
import copy
import numpy as np
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class mylstm(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, linear_dim):
        super(mylstm, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTMCell(input_dim, self.hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, linear_dim)
        self.linear2 = nn.Linear(linear_dim, output_dim)

    def forward(self, input):
        out, _ = self.lstm(input)
        out = nn.Dropout(p=0.3)(out)
        out = self.linear1(out)
        out = nn.Dropout(p=0.3)(out)
        out = self.linear2(out)
        return out
x_train and x_val are float DataFrames with shape (4478, 30), while y_train and y_val are float DataFrames with shape (4478, 10).
x_train.head()
Out[271]:
0 1 2 3 ... 26 27 28 29
0 1.6110 1.6100 1.6293 1.6370 ... 1.6870 1.6925 1.6950 1.6905
1 1.6100 1.6293 1.6370 1.6530 ... 1.6925 1.6950 1.6905 1.6960
2 1.6293 1.6370 1.6530 1.6537 ... 1.6950 1.6905 1.6960 1.6930
3 1.6370 1.6530 1.6537 1.6620 ... 1.6905 1.6960 1.6930 1.6955
4 1.6530 1.6537 1.6620 1.6568 ... 1.6960 1.6930 1.6955 1.7040
[5 rows x 30 columns]
x_train.shape
Out[272]: (4478, 30)
Define the variables and do one backpropagation step; I find that the validation loss is 1.4941:
model=mylstm(30,10,200,100).double()
from torch import optim
optimizer=optim.RMSprop(model.parameters(), lr=0.001, alpha=0.9)
criterion=nn.L1Loss()
input_=torch.autograd.Variable(torch.from_numpy(np.array(x_train)))
target=torch.autograd.Variable(torch.from_numpy(np.array(y_train)))
input2_=torch.autograd.Variable(torch.from_numpy(np.array(x_val)))
target2=torch.autograd.Variable(torch.from_numpy(np.array(y_val)))
optimizer.zero_grad()
output=model(input_)
loss=criterion(output,target)
loss.backward()
optimizer.step()
moniter=criterion(model(input2_),target2)
moniter
Out[274]: tensor(1.4941, dtype=torch.float64, grad_fn=<L1LossBackward>)
But when I call the forward function again, I get a different number due to the randomness of dropout:
moniter=criterion(model(input2_),target2)
moniter
Out[275]: tensor(1.4943, dtype=torch.float64, grad_fn=<L1LossBackward>)
What should I do to eliminate all the dropout in the prediction phase?
I tried eval():
moniter=criterion(model.eval()(input2_),target2)
moniter
Out[282]: tensor(1.4942, dtype=torch.float64, grad_fn=<L1LossBackward>)
moniter=criterion(model.eval()(input2_),target2)
moniter
Out[283]: tensor(1.4945, dtype=torch.float64, grad_fn=<L1LossBackward>)
And I tried passing an additional parameter p to control dropout:
import copy
import torch
import torch.nn as nn
import torch.nn.functional as F
import torch.optim as optim

class mylstm(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, linear_dim, p):
        super(mylstm, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTMCell(input_dim, self.hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, linear_dim)
        self.linear2 = nn.Linear(linear_dim, output_dim)

    def forward(self, input, p):
        out, _ = self.lstm(input)
        out = nn.Dropout(p=p)(out)
        out = self.linear1(out)
        out = nn.Dropout(p=p)(out)
        out = self.linear2(out)
        return out

model = mylstm(30, 10, 200, 100, 0.3).double()

output = model(input_)
loss = criterion(output, target)
loss.backward()
optimizer.step()
moniter = criterion(model(input2_, 0), target2)
Traceback (most recent call last):
File "<ipython-input-286-e49b6fac918b>", line 1, in <module>
output=model(input_)
File "D:\Users\shan xu\Anaconda3\lib\site-packages\torch\nn\modules\module.py", line 489, in __call__
result = self.forward(*input, **kwargs)
TypeError: forward() missing 1 required positional argument: 'p'
But neither of them worked.
You have to define your nn.Dropout layer in your __init__ and assign it to your model so that it responds to calls to eval().
So changing your model like this should work for you:
class mylstm(nn.Module):
    def __init__(self, input_dim, output_dim, hidden_dim, linear_dim, p):
        super(mylstm, self).__init__()
        self.hidden_dim = hidden_dim
        self.lstm = nn.LSTMCell(input_dim, self.hidden_dim)
        self.linear1 = nn.Linear(hidden_dim, linear_dim)
        self.linear2 = nn.Linear(linear_dim, output_dim)
        # define dropout layer in __init__
        self.drop_layer = nn.Dropout(p=p)

    def forward(self, input):
        out, _ = self.lstm(input)
        # apply model dropout, responsive to eval()
        out = self.drop_layer(out)
        out = self.linear1(out)
        # apply model dropout, responsive to eval()
        out = self.drop_layer(out)
        out = self.linear2(out)
        return out
If you change it like this dropout will be inactive as soon as you call eval().
NOTE: If you want to continue training afterwards you need to call train() on your model to leave evaluation mode.
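As a quick check (a sketch, assuming the modified model above and the variables from the question), repeated forward passes in evaluation mode give identical results:

model.eval()                      # deactivates the drop_layer
with torch.no_grad():             # gradients are not needed for monitoring
    m1 = criterion(model(input2_), target2)
    m2 = criterion(model(input2_), target2)
print(m1.item() == m2.item())     # True: dropout no longer adds randomness
model.train()                     # switch back before continuing training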
You can also find a small working example for dropout with eval() for evaluation mode here:
nn.Dropout vs. F.dropout pyTorch
I'm adding this answer just because I'm currently facing the same issue while trying to reproduce Deep Bayesian Active Learning through dropout disagreement.
If you need to keep dropout active (for example, to bootstrap a set of different predictions for the same test instances), you just need to leave the model in training mode; there is no need to define your own dropout layer.
Since in pytorch you need to define your own prediction function, you can just add a parameter to it like this:
def predict_class(model, test_instance, active_dropout=False):
    if active_dropout:
        model.train()   # keep dropout active (stochastic forward passes)
    else:
        model.eval()    # deterministic inference
    return model(test_instance)  # forward pass as usual
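For example (a sketch, assuming predict_class returns the model output as above), repeating the call with active_dropout=True gives a set of stochastic predictions for the same instance:

# hypothetical usage: several stochastic forward passes (MC dropout)
predictions = [predict_class(model, test_instance, active_dropout=True) for _ in range(10)]
# the predictions differ because dropout stays active; their spread measures the disagreement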
As the other answers said, the dropout layer should be defined in your model's __init__ method, so that your model keeps track of it like any other pre-defined layer. When the model's state changes, all registered layers are notified and behave accordingly; for instance, calling model.eval() deactivates the dropout layers and simply passes all activations through. In general, if you want to be able to deactivate your dropout layers, you should define them in the __init__ method using the nn.Dropout module.
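The linked comparison boils down to this contrast, shown in a minimal sketch: a dropout module registered in __init__ follows train()/eval() automatically, while the functional form needs the mode passed explicitly:

import torch
import torch.nn as nn
import torch.nn.functional as F

class Net(nn.Module):
    def __init__(self):
        super(Net, self).__init__()
        self.drop = nn.Dropout(p=0.5)   # registered module: follows train()/eval()

    def forward(self, x):
        a = self.drop(x)                                  # automatically disabled in eval mode
        b = F.dropout(x, p=0.5, training=self.training)   # functional form: pass the mode yourself
        return a, b

net = Net().eval()
x = torch.ones(1, 4)
print(net(x))  # both outputs equal x here, because both respect evaluation mode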
Maybe this is just a bug or I am really stupid: I wrapped (or rather, a colleague wrapped) a Keras model, together with some Keras transformations that are also wrapped, so we can use the Keras model with the sklearn library.
Now when I use fit on the Pipeline it works fine: it runs and returns a working model instance. However, when I use GridSearchCV, for some reason it fails to do the transforms (or so it would seem) and gives me the following error:
InvalidArgumentError (see above for traceback): indices[11,2] = 26048 is not in [0, 10001)
[[Node: embedding_4/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_4/embeddings/read, embedding_4/Cast)]]
The code looks something like this:
# imports assumed for completeness; the original snippet omitted them
import numpy as np
from keras.models import Sequential
from keras.layers import Embedding, LSTM, Dense
from keras.preprocessing.text import Tokenizer
from keras.preprocessing.sequence import pad_sequences
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import StratifiedKFold, GridSearchCV

vocab_size = 10001

class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit(self, X, y=None):
        print('fitting the text')
        print(self.document_count)
        self.fit_on_texts(X)
        return self

    def transform(self, X, y=None):
        print('transforming the text')
        r = np.array(self.texts_to_sequences(X))
        print(r)
        print(self.document_count)
        return r

class Padder(BaseEstimator, TransformerMixin):
    def __init__(self, maxlen=500):
        self.maxlen = maxlen
        self.max_index = None

    def fit(self, X, y=None):
        #self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
        return self

    def transform(self, X, y=None):
        print('pad the text')
        X = pad_sequences(X, maxlen=self.maxlen, padding='post')
        #X[X > self.max_index] = 0
        print(X)
        return X

maxlen = 15

def makeLstmModel():
    model = Sequential()
    model.add(Embedding(10001, 100, input_length=15))
    model.add(LSTM(35, dropout=0.2, recurrent_dropout=0.2))
    model.add(Dense(16, activation='sigmoid'))
    model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
    model.summary()
    return model

lstmmodel = KerasClassifier(build_fn=makeLstmModel, epochs=5, batch_size=1000, verbose=42)

pipeline = [
    ('seq', TextsToSequences(num_words=vocab_size)),
    ('pad', Padder(maxlen)),
    ('clf', lstmmodel)
]

textClassifier = Pipeline(pipeline)

# Setup parameters
parameters = {}  # Some params to use in gridsearch

skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True, random_state=1)
gscv = GridSearchCV(textClassifier, parameters, cv=skf, iid=False, n_jobs=1, verbose=50)
gscv.fit(x_train, y_train)
Now the above code fails with InvalidArgumentError, but when I run fit with the Pipeline directly it works.
Is there a difference between fit() in GridSearchCV and Pipeline? Am I really stupid or is this just a bug?
BTW, I am currently forced to use Sklearn 0.19.1.
After hours of thinking and debugging, I came to the following conclusion:
Pipeline.fit() is able to auto fill **kwargs arguments.
GridSearchCV.fit() is not able to auto fill **kwargs arguments.
I tested this on sklearn 0.19.1
My issue was that the bag of words created with the Keras Tokenizer needs to be limited with the num_words parameter, so that the number of words matches the input dimension of the Embedding layer in the LSTM model. Because num_words was never actually set, the bag was always bigger than the input dimension.
The num_words were passed to the Tokenizer as **kwargs arguments.
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)
For some reason GridSearchCV.fit() is not able to fill this automatically. The solution to this would be to use fixed arguments.
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
    def __init__(self, num_words=8000, **kwargs):
        super().__init__(num_words, **kwargs)
After this change GridSearchCV.fit() works.
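The effect can be reproduced outside GridSearchCV with sklearn.base.clone, which is what GridSearchCV uses to copy the estimator for each fit. A sketch, assuming the two TextsToSequences variants above:

from sklearn.base import clone

seq = TextsToSequences(num_words=vocab_size)
seq_clone = clone(seq)
# with the **kwargs-only __init__, num_words is not listed by get_params(),
# so the clone is built without it and falls back to the Tokenizer default (no limit);
# with num_words as an explicit __init__ argument, it survives the clone
print(seq_clone.num_words)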
I am new to PyTorch and I am trying out the Embedding Layer.
I wrote a naive classification task, where all the inputs are equal and all the labels are set to 1.0. I therefore expect the model to quickly learn to predict 1.0.
The input is always 0, which is fed into an nn.Embedding(1,32) layer, followed by nn.Linear(32,1) and nn.ReLU().
However, an unexpected and undesired behavior occurs: the training outcome differs between runs of the code.
For example,
setting the random seed to 10, model converges: loss decreases and model always predicts 1.0
setting the random seed to 1111, model doesn't converge: loss doesn't decrease and model always predicts 0.5. In those cases the parameters are not updated
Here is the minimal, replicable code:
from torch.nn import BCEWithLogitsLoss
import torch.nn as nn
import torch.optim as optim
from torch.utils.data import DataLoader
from torch.autograd import Variable
from torch.utils.data import Dataset
import torch

class MyModel(nn.Module):
    def __init__(self):
        super(MyModel, self).__init__()
        self.vgg_fc = nn.Linear(32, 1)
        self.relu = nn.ReLU()
        self.embeddings = nn.Embedding(1, 32)

    def forward(self, data):
        emb = self.embeddings(data['index'])
        return self.relu(self.vgg_fc(emb))

class MyDataset(Dataset):
    def __init__(self):
        pass

    def __len__(self):
        return 1000

    def __getitem__(self, idx):
        return {'label': 1.0, 'index': 0}

def train():
    model = MyModel()
    db = MyDataset()
    dataloader = DataLoader(db, batch_size=256, shuffle=True, num_workers=16)
    loss_function = BCEWithLogitsLoss()
    optimizer_rel = optim.SGD(model.parameters(), lr=0.1)
    for epoch in range(50):
        for i_batch, sample_batched in enumerate(dataloader):
            model.zero_grad()
            out = model({'index': Variable(sample_batched['index'])})
            labels = Variable(sample_batched['label'].type(torch.FloatTensor).view(sample_batched['label'].shape[0], 1))
            loss = loss_function(out, labels)
            loss.backward()
            optimizer_rel.step()
            print 'Epoch:', epoch, 'batch', i_batch, 'Tr_Loss:', loss.data[0]
    return model

if __name__ == '__main__':
    # please, try seed 10 (converges) and seed 1111 (fails)
    torch.manual_seed(10)
    train()
Without specifying the random seed, different runs have different outcomes.
Why is the model, in those cases, unable to learn such an easy task?
Is there any mistake in the way I use nn.Embedding layer?
Thank you
I found that the problem was the final ReLU layer, before the sigmoid.
As stated here, that layer will:
throw away information without adding any additional benefit
After removing the layer, the network learned as expected with any seed.
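A minimal sketch of the fix (everything else in MyModel unchanged) is to drop the ReLU and return the raw logit, since BCEWithLogitsLoss applies the sigmoid itself:

def forward(self, data):
    emb = self.embeddings(data['index'])
    # no ReLU here: the linear layer's output is the logit fed to BCEWithLogitsLoss
    return self.vgg_fc(emb)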
I am running a keras script (no direct call to theano in my script) and I get the following error:
TypeError: ('An update must have the same type as the original shared
variable (shared_var=<TensorType(float32, matrix)>,
shared_var.type=TensorType(float32, matrix),
update_val=Elemwise{add,no_inplace}.0,
update_val.type=TensorType(float64, matrix)).',
'If the difference is related to the broadcast pattern,
you can call the tensor.unbroadcast(var, axis_to_unbroadcast[, ...])
function to remove broadcastable dimensions.')
I have seen the error from folks running theano directly, but not through keras. Not sure what I should do, since I am not dealing with tensors directly.
The problem was that there is a change in Keras versions (I am currently using Keras 0.3.2 with Theano 0.8.0), and what used to be fine does not work well with the new Keras version.
The following was the original code, and see the fix below.
from keras.models import Sequential
import keras.optimizers
from keras.layers.core import Dense, Dropout
from keras.layers.normalization import BatchNormalization
from keras.layers.advanced_activations import PReLU
from keras.layers.core import Activation
from keras.optimizers import SGD, Adam
from sklearn.preprocessing import StandardScaler
from sklearn.base import BaseEstimator, RegressorMixin

class NnRegression(BaseEstimator, RegressorMixin):
    def __init__(self, apply_standart_scaling=True,
                 dropx=[0.2, 0.5, 0.5], nb_neuronx=[50, 30], nb_epoch=105, validation_split=0.,
                 verbose=1):
        self.apply_standart_scaling = apply_standart_scaling
        self.dropx = dropx
        self.nb_neuronx = nb_neuronx
        self.nb_epoch = nb_epoch
        self.validation_split = validation_split
        self.verbose = verbose

    def fit(self, X, y):
        nb_features = X.shape[1]
        self.standart_scaling = StandardScaler() if self.apply_standart_scaling else None
        if self.standart_scaling:
            X = self.standart_scaling.fit_transform(X)

        model = Sequential()
        model.add(Dropout(input_shape=(nb_features,), p=self.dropx[0]))
        model.add(Dense(output_dim=self.nb_neuronx[0], init='glorot_uniform'))
        model.add(PReLU())
        model.add(BatchNormalization(self.nb_neuronx[0],))
        model.add(Dropout(self.dropx[1]))
        model.add(Dense(self.nb_neuronx[1], init='glorot_uniform'))
        model.add(PReLU())
        model.add(BatchNormalization(self.nb_neuronx[0],))
        model.add(Dropout(self.dropx[2]))
        model.add(Dense(1, init='glorot_uniform'))

        nn_verbose = 1 if self.verbose > 0 else 0
        optz = keras.optimizers.Adam(lr=0.001, beta_1=0.9, beta_2=0.999, epsilon=1e-08)
        model.compile(optimizer=Adam(), loss='mse')
        model.fit(X, y, batch_size=16,
                  nb_epoch=self.nb_epoch, validation_split=self.validation_split, verbose=nn_verbose)
        self.model = model

    def predict(self, X):
        if self.standart_scaling:
            X = self.standart_scaling.transform(X)
        return self.model.predict_proba(X, verbose=0)
Well, it turns out that the problem was this single line of code:
model.add(BatchNormalization(self.nb_neuronx[0],))
It should actually be:
model.add(BatchNormalization())
because the number of neurons has no business in the normalization layer's arguments (however, this did not cause a problem in a previous Keras version).
This apparently causes Theano to generate new weights that are not float32 but float64, and that triggers the message above.
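The float64 creeping in is ordinary NumPy type promotion; a small illustration of the mechanism (not the actual Keras code):

import numpy as np

w = np.ones(3, dtype=np.float32)   # like the shared weights (float32)
delta = np.full(3, 0.1)            # NumPy's default dtype is float64
update = w + delta
print(update.dtype)                # float64: the update no longer matches the float32 variable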