I am trying to do some hyper-parameter tuning in my pipeline and have the following setup:
model = KerasClassifier(build_fn = create_model, epochs = 5)
pipeline = Pipeline(steps =[('Tokenizepadder', TokenizePadding()),
('NN', model)] )
Where I have a variable 'maxlen' in both the Tokenizepadder and my Neural Network (for the Neural Network it is called max_length, I was afraid naming them the same would cause errors later in the code). When I try to perform a grid search, I am struggling to have these values correspond. If I perform grid search for these values seperately, they won't mach and there will be a problem with the input data not matching the neural network.
In short I would like to do something like:
pipeline = Pipeline(steps =[('Tokenizepadder', TokenizePadding()),
('NN', KerasClassifier(build_fn = create_model, epochs = 5, max_length = pipeline.get_params()['Tokenizepadder__maxlen']))] )
So that when I am performing a grid search for the parameter 'Tokenizepadder__maxlen', it will change the value 'NN__max_length' to the same value.
May be you can change your classifier and tokenizer, to pass around max_len parameter. Then, only grid search with tokenizer max_len parameter.
Not the cleanest way, but might do.
from sklearn.base import BaseEstimator, TransformerMixin, EstimatorMixin
class TokeinizePadding(BaseEstimator, TransformerMixin):
def __init__(self, max_len, ...):
self.max_len = max_len
...
def fit(self, X, y=None):
...
return self
def transform(self, X, y=None):
data = ... # your stuff
return {"array": data, "max_len": self.max_len}
class KerasClassifier(...):
...
def fit(data, y):
self.max_len = data["max_len"]
self.build_model()
X = data["array"]
... # your stuff
Related
I am trying to modify PyTorch DataLoader class to:
Compute the Pearson's correlation coefficient for each batch
Select only the two features with the highest correlation before
Masking all the remaining features
Pass the resulting batch to an MLP
I tried to write an explanatory Python code for this:
import torch
import torch.nn as nn
from torch.utils.data import Dataset, DataLoader
import numpy as np
class CorrelationDataset(Dataset):
def __init__(self, data):
self.data = data
def __len__(self):
return len(self.data)
def __getitem__(self, idx):
return self.data[idx]
class CorrelationLoader(DataLoader):
def __init__(self, dataset, batch_size, shuffle=False, num_workers=0):
super(CorrelationLoader, self).__init__(dataset, batch_size, shuffle, num_workers)
def __iter__(self):
for batch in super(CorrelationLoader, self).__iter__():
# Compute the Pearson correlation coefficient for each batch
corr = np.corrcoef(batch, rowvar=False)
# Get the indices of the two features with the highest correlation
top_corr = np.unravel_index(np.argmax(corr, axis=None), corr.shape)
# Mask all remaining features
mask = torch.zeros(batch.shape[1])
mask[top_corr[0]] = 1
mask[top_corr[1]] = 1
mask = mask.byte()
# Pass the selected features to the NN
yield batch[:, mask]
data = torch.randn(100, 10)
dataset = CorrelationDataset(data)
dataloader = CorrelationLoader(dataset, batch_size=32)
mlp = nn.Sequential(
nn.Linear(2, 10),
nn.ReLU(),
nn.Linear(10, 1)
)
for batch in dataloader:
output = mlp(batch)
print(output)
Unfortunately the code does not work properly. Does anyone know how to fix it?
Maybe this is just a bug or I am really stupid, I wrapped (or better said a colleague wrapped) a Keras model using some Keras transformations also wrapped so we can use the Keras model with the sklearn library.
Now when I use fit on the Pipeline it works fine. It runs and it returns a working model instance. However when I use a GridSearchCV for some reason it fails to do the transforms (or so it would seem) and it gives me the following error:
InvalidArgumentError (see above for traceback): indices[11,2] = 26048 is not in [0, 10001)
[[Node: embedding_4/Gather = Gather[Tindices=DT_INT32, Tparams=DT_FLOAT, validate_indices=true, _device="/job:localhost/replica:0/task:0/cpu:0"](embedding_4/embeddings/read, embedding_4/Cast)]]
The code looks something like this:
vocab_size = 10001
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def fit(self, X, y=None):
print('fitting the text')
print(self.document_count)
self.fit_on_texts(X)
return self
def transform(self, X, y=None):
print('transforming the text')
r = np.array(self.texts_to_sequences(X))
print(r)
print(self.document_count)
return r
class Padder(BaseEstimator, TransformerMixin):
def __init__(self, maxlen=500):
self.maxlen = maxlen
self.max_index = None
def fit(self, X, y=None):
#self.max_index = pad_sequences(X, maxlen=self.maxlen).max()
return self
def transform(self, X, y=None):
print('pad the text')
X = pad_sequences(X, maxlen=self.maxlen, padding='post')
#X[X > self.max_index] = 0
print(X)
return X
maxlen = 15
def makeLstmModel():
model = Sequential()
model.add(Embedding(10001, 100, input_length=15))
model.add(LSTM(35, dropout=0.2, recurrent_dropout=0.2))
model.add(Dense(16, activation='sigmoid'))
model.compile(loss='categorical_crossentropy', optimizer='adam', metrics=['accuracy'])
model.summary()
return model
lstmmodel = KerasClassifier(build_fn=makeLstmModel, epochs=5, batch_size=1000, verbose=42)
pipeline = [
('seq', TextsToSequences(num_words=vocab_size)),
('pad', Padder(maxlen)),
('clf', lstmmodel)
]
textClassifier = Pipeline(pipeline)
#Setup parameters
parameters = {} #Some params to use in gridsearch
skf = StratifiedKFold(n_splits=numberOfFolds, shuffle=True, random_state=1)
gscv = GridSearchCV(textClassifier, parameters, cv=skf, iid=False, n_jobs=1, verbose=50)
gscv.fit(x_train, y_train)
Now the above code fails with InvalidArgumentError, but when I run fit with the Pipeline it works:
Is there a difference between fit() in GridSearchCV and Pipeline? Am I really stupid or is this just a bug?
BTW, I am currently forced to use Sklearn 0.19.1.
After hours of thinking and debugging, I came to the following conclusion:
Pipeline.fit() is able to auto fill **kwargs arguments.
GridSearchCV.fit() is not able to auto fill **kwargs arguments.
I tested this on sklearn 0.19.1
My issue was that the bag of words created with Keras Tokenizer was created using the num_words parameter which limits the bag to a maximum number of words. My colleague did a bad job at this hence the number of words matches to the number of input dimensions in the LSTM model. Because the num_words were never set, the bag was always bigger than the input dimension.
The num_words were passed to the Tokenizer as **kwargs arguments.
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
def __init__(self, **kwargs):
super().__init__(**kwargs)
For some reason GridSearchCV.fit() is not able to fill this automatically. The solution to this would be to use fixed arguments.
class TextsToSequences(Tokenizer, BaseEstimator, TransformerMixin):
def __init__(self, num_words=8000, **kwargs):
super().__init__(num_words, **kwargs)
After this change GridSearchCV.fit() works.
I am writing a linear regression class which fits a model to some data, similar to the scikit-learn implementation.
Once the model is fit, I want to be able to call a predict() method without having to pass the trained model weights as a parameter to the method. What I have so far is below
class LinReg:
""" Fit a linear model to data"""
def __init__(self):
....
def fit(self, x, y):
"""Fit a model to data x with targets y"""
...
# model weights w are calculated here
return w
def predict(self, x, w):
"""Predict the target variable of the data x using trained weights w"""
...
# predicted y values, y_pred, are calulated here
return y_pred
The trained weights w are returned from fit() so the user can store these as a variable to later pass to the predict() method.
lm = LinReg()
w = lm.fit(x,y)
y_pred = lm.predict(x_new, w) # don't want to pass w here
However, I do not want to return w from fit(); I want to somehow store w once it is calculated in fit() so that the user does not have to concern themselves with the weights, but also such that the weights can be easily used in the predict() method.
How do I do this? Is there a pythonic or standard OO way to do this?
I would store it as an instance-level property:
def __init__(self):
self.w = None # define the prop here...
....
def fit(self, x, y):
"""Fit a model to data x with targets y"""
...
# model weights w are calculated here
self.w = your_computed_value
def predict(self, x):
"""Predict the target variable of the data x using trained weights w"""
...
# predicted y values, y_pred, are calulated here
do_something_here(self.w)
return y_pred
I have a Scikit learn pipeline which includes a feature union as so
from sklearn.pipeline import Pipeline, FeatureUnion
pipeline = Pipeline([
('feats', FeatureUnion([
#
('Vec', Doc2vec()),
('Counter', I_counter()),
])),
('clf', LogisticRegression()) # classifier
])
Each of the two processes in the feature union are classes I've written myself. The first of these is a self written vectorizer based on the Gensim Doc2Vec model. Full code here
If I understand the feature union documentation correctly, it runs each step in parallel and concatenates the output vectors into a single vector passed to the next step (The CLF pipeline in this case).
I wrote each class to return a single numpy array, however the above code is triggering an error.
TypeError: All estimators should implement fit and transform. 'Pipeline(memory=None, steps=[('vec', Doc2vec())])' (type <class 'sklearn.pipeline.Pipeline'>) doesn't
If I understand the error correctly (?), it's stating that the Doc2vec class is not outputting a suitable feature?
The Doc2vec class outputs a single numpy array, containing a 100 vector array for each inputted text sequence. I naively assumed it would simply concatenate this to the I_counter output and all work happyly.
Might someone be able to highlight where my logic is wrong?
--
EDIT, more code
class Doc2vec(BaseEstimator, TransformerMixin):
def fit(self, x, y=None):
return self
def vec(data):
print('starting')
SentimentDocument = namedtuple('SentimentDocument', 'words tags split sentiment')
alldocs = []
for line_no, line in data.iterrows():
#tokens = gensim.utils.to_unicode(line).split()
words = gensim.utils.simple_preprocess(line['post'])
tags = [line_no] # 'tags = [tokens[0]]' would also work at extra memory cost
split = ['train', 'test'][line_no//1200] # 25k train, 25k test, 25k extra
if gensim.utils.simple_preprocess(line['type']) == ['depression']:
sentiment = (1.0)
else:
sentiment = (0.0)
alldocs.append(SentimentDocument(words, tags, split, sentiment))
train_docs = [doc for doc in alldocs if doc.split == 'train']
test_docs = [doc for doc in alldocs if doc.split == 'test']
#print('%d docs: %d train-sentiment, %d test-sentiment' % (len(alldocs), len(train_docs), len(test_docs)))
from random import shuffle
doc_list = alldocs[:]
shuffle(doc_list)
cores = multiprocessing.cpu_count()
assert gensim.models.doc2vec.FAST_VERSION > -1, "This will be painfully slow otherwise"
simple_models = [
# PV-DM w/ default averaging; a higher starting alpha may improve CBOW/PV-DM modes
Doc2Vec(dm=1, vector_size=100, window=10, negative=5, hs=0, min_count=2, sample=0,
epochs=20, workers=cores, alpha=0.05, comment='alpha=0.05')
]
for model in simple_models:
model.build_vocab(train_docs)
#print("%s vocabulary scanned & state initialized" % model)
models_by_name = OrderedDict((str(model), model) for model in simple_models)
model.train(train_docs, total_examples=len(train_docs), epochs=model.epochs)
train_targets, train_regressors = zip(*[(doc.words, doc.sentiment) for doc in train_docs])
import numpy as np
X = []
for i in range(len(train_targets)):
X.append(model.infer_vector(train_targets[i]))
train_x = np.asarray(X)
print(type(train_x))
return(train_x)
class I_counter(BaseEstimator, TransformerMixin):
def fit(self, x, y=None):
return self
def transform(self, data):
def i_count(name):
tokens = nltk.word_tokenize(name)
count = tokens.count("I")
count2 = tokens.count("i")
return(count+count2)
vecfunc = np.vectorize(i_count)
data = np.transpose(np.matrix(data['post']))
result = vecfunc(data)
return result
While preprocessing the labels for a machine learning classifying task, I need to one hot encode the labels which take string values. It happens that OneHotEncoder from sklearn.preprocessing or to_categorical from kera.np_utils require int inputs. This means that I need to precede the one hot encoder with a LabelEncoder. I have done it by hand with a custom class:
class LabelOneHotEncoder():
def __init__(self):
self.ohe = OneHotEncoder()
self.le = LabelEncoder()
def fit_transform(self, x):
features = self.le.fit_transform( x)
return self.ohe.fit_transform( features.reshape(-1,1))
def transform( self, x):
return self.ohe.transform( self.la.transform( x.reshape(-1,1)))
def inverse_tranform( self, x):
return self.le.inverse_transform( self.ohe.inverse_tranform( x))
def inverse_labels( self, x):
return self.le.inverse_transform( x)
I am confident there must a way of doing it within the sklearn API using a sklearn.pipeline, but when using:
LabelOneHotEncoder = Pipeline( [ ("le",LabelEncoder), ("ohe", OneHotEncoder)])
I get the error ValueError: bad input shape () from the OneHotEncoder. My guess is that the output of the LabelEncoder needs to be reshaped, by adding a trivial second axis. I am not sure how to add this feature though.
It's strange that they don't play together nicely... I'm surprised. I'd extend the class to return the reshaped data like you suggested.
class ModifiedLabelEncoder(LabelEncoder):
def fit_transform(self, y, *args, **kwargs):
return super().fit_transform(y).reshape(-1, 1)
def transform(self, y, *args, **kwargs):
return super().transform(y).reshape(-1, 1)
Then using the pipeline should work.
pipe = Pipeline([("le", ModifiedLabelEncoder()), ("ohe", OneHotEncoder())])
pipe.fit_transform(['dog', 'cat', 'dog'])
https://github.com/scikit-learn/scikit-learn/blob/a24c8b46/sklearn/preprocessing/label.py#L39
From scikit-learn 0.20, OneHotEncoder accepts strings, so you don't need a LabelEncoder before it anymore. And you can just use it in a pipeline.
I have used a customized class to wrap my label encoder function and it returns the whole updated dataset.
class CustomLabelEncode(BaseEstimator, TransformerMixin):
def fit(self, X, y=None):
return self
def transform(self, X ,y=None):
le=LabelEncoder()
for i in X[cat_cols]:
X[i]=le.fit_transform(X[i])
return X
cat_cols=['Family','Education','Securities Account','CDAccount','Online','CreditCard']
le_ct=make_column_transformer((CustomLabelEncode(),cat_cols),remainder='passthrough')
pd.DataFrame(ct3.fit_transform(X)) #This will show you your changes
Final_pipeline=make_pipeline(le_ct)
[I have implemented it you can see my github link]
[1]: https://github.com/Ayushmina-20/sklearn_pipeline
It is not for the asked question but for applying only LabelEncoder to all columns you can use the below format
df_non_numeric =df.select_dtypes(['object'])
non_numeric_cols = df_non_numeric.columns.values
from sklearn.preprocessing import LabelEncoder
for col in non_numeric_cols:
df[col] = LabelEncoder().fit_transform(df[col].values)
df.head()