How to test preprocessing combinations in nested pipeline using GridSearchCV? - python

I've been working on this classification problem using sklearn's Pipeline to combine the preprocessing step (scaling) and the cross validation step (GridSearchCV) using Logistic Regression.
Here is the simplified code:
# import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
# scaler and encoder options
scaler = StandardScaler() # there are 3 options that I want to try
encoder = OneHotEncoder() # only one option, no need to GridSearch it
# use ColumnTransformer to apply different preprocesses to numerical and categorical columns
preprocessor = ColumnTransformer(transformers = [('categorical', encoder, cat_columns),
('numerical', scaler, num_columns),
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())])
What I'm trying to do, is to try different scaling methods (e.g. standard scaling, robust scaling, etc.) and after trying all of those, pick the scaling method that yields the best metric (i.e. accuracy). However, I don't know how to do this using the GridSearchCV:
from sklearn.model_selection import GridSearchCV
# set params combination I want to try
scaler_options = {'numerical':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options, cv = 5)
# fit the data
grid_cv.fit(X_train, y_train)
I know that the code above won't work, particularly because of the scaler_options that I've set as param_grid. I realize that the scaler_options I set can't be processed by GridSearchCV. Why? Because it isn't a hyperparameter of the pipeline (unlike 'log_reg__C', a hyperparameter from LogisticRegression() than can be accessed by the GridSearchCV). But instead its a component of the ColumnTransformer which I have nested inside the full_pipeline.
So the main question is, how do I automate GridSearchCV to test all of my scaler options? Since the scaler is a component of a sub-pipeline (i.e. the previous ColumnTransformer).

As you suggested you could create a class that takes in its __init()__ parameters, the scaler you want to use.
Then you could specify in your grid search parameters the Scaler your class should use to initialize the class.
I wrote that i hope it helps :
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
Here you can find a full example that you can run to test :
# import dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import pandas as pd
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
data = load_breast_cancer()
features = data["data"]
target = data["target"]
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()
# scaler and encoder options
my_scaler = ScalerSelector()
preprocessor = ColumnTransformer(transformers = [('numerical', my_scaler, col_names)
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())
])
# set params combination I want to try
scaler_options = {'preprocessor__numerical__scaler':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options)
# fit the data
grid_cv.fit(data, target)
# best params :
grid_cv.best_params_

You can fulfill what you intend without creating a custom transformer. And you can even pass the 'passthrough' argument into param_grid to experiment with the scenario where you don't want to do any scaling in that step at all.
In this example, suppose we want to investigate whether it is better for the model to impose a Scaler transformer on numerical features, num_features.
cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))
cat_preprocessor = Pipeline(steps=[
('oh', OneHotEncoder(handle_unknown='ignore')),
('ss', StandardScaler())
])
num_preprocessor = Pipeline(steps=[
('pt', PowerTransformer(method='yeo-johnson')),
('ss', StandardScaler()) # Create a place holder for your test here !!!
])
preprocessor = ColumnTransformer(transformers=[
('cat', cat_preprocessor, cat_features),
('num', num_preprocessor, num_features)
])
model = Pipeline(steps=[
('prep', preprocessor),
('clf', RidgeClassifier())
])
X = train.drop('target', axis=1)
y = train['target']
param_grid = {
'prep__cat__ss': ['passthrough', StandardScaler(with_mean=False)] # 'passthrough',
}
gs = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='roc_auc',
n_jobs=-1,
cv=2
)
gs.fit(X, y)

Related

How to tune a custom classifier in sklearn with additional required parameters in fit and predict methods?

For the purposes of this question, I have created a dummy dataset below:
import numpy as np
import pandas as pd
cities = ['Berlin', 'Frankfurt', 'Hamburg',
'Nuremberg', 'Munich', 'Stuttgart',
'Hanover', 'Saarbruecken', 'Cologne',
'Constance', 'Freiburg', 'Karlsruhe'
]
n= len(cities)
data = pd.DataFrame({
'City':cities,
'Temperature': np.random.normal(24, 3, n),
'Humidity': np.random.normal(78, 2.5, n),
'Wind': np.random.normal(15, 4, n),
'Target': np.random.randint(2, size=n)
})
I have a (simplified) custom classifier below which maps the text feature to a continuous value before combining it with the remaining features for classification:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
class CustomClassifier(ClassifierMixin, BaseEstimator):
def __init__(self, n_estimators=100):
self.n_estimators=100
self.NLP = Pipeline(
[
('preprocessor', TfidfVectorizer()),
('regressor', LogisticRegression())
]
)
self.Classifier = GradientBoostingClassifier(n_estimators=self.n_estimators)
def fit(self, X, y, text_data, **kwargs):
self.NLP.fit(text_data, y)
text_feature = self.NLP.predict_proba(text_data)
new_X = np.concatenate(
(text_feature[:,1, np.newaxis], X),
axis=1
)
self.Classifier.fit(new_X, y)
return self
def predict(self, X, text_data):
text_feature = self.NLP.predict_proba(text_data)
new_X = np.concatenate(
(text_feature[:,1, np.newaxis],X),
axis=1
)
y_pred = self.Classifier.predict(new_X)
return y_pred
I can run fit and predict without issue, but if I try to tune the pipeline using cross validation as follows:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import RandomizedSearchCV
custom_model = CustomClassifier()
pipe = Pipeline([
('scaling', MaxAbsScaler()),
('classifier', custom_model)
])
params = {'classifier__n_estimators':[100,200]}
tuner = RandomizedSearchCV(
pipe,
param_distributions=params,
cv=3,
n_iter=2
)
tuner.fit(X=data.drop(['City', 'Target'], axis=1), y=data.loc[:,'Target'], classifier__text_data=data.loc[:,'City'])
I receive the following error:
predict() missing 1 required positional argument: 'text_data'
Which also prevents me from using something like cross_val_score.
Any ideas on how I can approach tuning my custom pipeline?
This might be a limitation of the RandomizedSearchCV class (which does not accept additional parameters in predict), so I tried to write a custom class with a custom predict method:
class RandomizedSearchCVPlus(RandomizedSearchCV):
def predict(self, X, **predict_params):
return self.best_estimator_.predict(X, **predict_params)
But this did not solve the issue.

Scaler fitted in a pipeline turns out to be not fitted yet

Please consider this code:
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
# data
train_X = pd.DataFrame(data=np.random.rand(20, 3), columns=["a", "b", "c"])
train_y = pd.Series(data=np.random.randint(0,2, 20), name="y")
test_X = pd.DataFrame(data=np.random.rand(10, 3), columns=["a", "b", "c"])
test_y = pd.Series(data=np.random.randint(0,2, 10), name="y")
# scaler
scaler = StandardScaler()
# feature selection
p = Pipeline(steps=[("scaler0", scaler),
("model", SVC(kernel="linear", C=1))])
rfe = RFE(p, n_features_to_select=2, step=1,
importance_getter="named_steps.model.coef_")
rfe.fit(train_X, train_y)
# apply the scaler to the test set
scaled_test = scaler.transform(test_X)
I get this message:
NotFittedError: This StandardScaler instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
Why is the scaler not fitted?
When passing a pipeline or an estimator to RFE, it essentially gets cloned by the RFE and fit until it finds the best fit with the reduced number of features.
To access this fit estimator you can use
fit_pipeline = rfe.estimator_
But note, this new pipeline uses the top n_features_to_select features.

Unable to execute RFECV over LogisticRegression

My ML project is about "Loan Eligibility prediction"
For that I used data below : https://www.kaggle.com/code/sazid28/home-loan-prediction/data?select=train.csv
and my code is as shown :
import random
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import \
SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import \
OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV
df_train_original = pd.read_csv("train.csv.xls")
df = df_train_original.drop(df_train_original.columns[0], axis=1)
# Remplace 'Credit_History' by random value (0 or 1)
random.seed(0)
df['Credit_History'] = \
df['Credit_History'].apply(
lambda x: np.random.choice(df['Credit_History'].dropna().values)
if np.isnan(x) else x)
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Data pre-processing
numerical_feature, categorical_feature = [], []
for i in X.columns:
if X[i].dtype == 'O':
categorical_feature.append(i)
else:
numerical_feature.append(i)
imputer = IterativeImputer(random_state=0)
scaler = StandardScaler()
encoder = OrdinalEncoder()
# Replace categorical features with the most frequent value of the column
# Gender (-13) , Married (-3), Self_Employed (-32)
# LoanAmount (-22) Loan_Amount_Term (-14) Credit_History (-50)
numerical_pipeline = make_pipeline(imputer, scaler)
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), encoder)
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
(categorical_pipeline, categorical_feature),
remainder='passthrough')
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X,
y,
test_size=0.2,
random_state=0)
params = {
'logisticregression__class_weight': [None, 'balanced'],
'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
'logisticregression__C': np.linspace(0.001, 0.1, 30),
}
model = make_pipeline(preprocessor, clf)
selector = RFECV(model, step=1, min_features_to_select=2, cv=5)
selector.fit(X_train, y_train)
when I run the code I get :
ValueError: could not convert string to float: 'Male'
I think that the data is not fitted and transformed before going through RFECV.
How to fix this?
RFECV does not work with a pipeline as the estimator, as it requires the estimator to expose either a coef_ or a feature_importances_. Pipelines do not, and even if they did, there would be no guarantee that the feature importances of the final estimator correspond to the features input to the pipeline with arbitrary transformations in the intermediate.
What you can do is make the RFECV transformer an element of your pipeline between the preprocessing and the final estimator, ie
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
(categorical_pipeline, categorical_feature),
remainder='passthrough')
clf_fs = LogisticRegression(random_state=0, max_iter=df.shape[0])
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
feature_selector = RFECV(clf_fs , step=1, min_features_to_select=2, cv=5)
model = make_pipeline(preprocessor, feature_selector, clf)

I can't 'passthrough' a step in sklearn pipeline

I wanted to test different scaler in my pipeline, so I created a class that can take as parameter the scaler I want (Standard, MinMax etc..)
It works fine. But i want to specifiy in my param_grid to test without scaler. And I added 'passthrough' in my pipeline estimators, but it doesn't work and i get the following error :
AttributeError: 'str' object has no attribute 'set_params'
I know it might have something to do with the construction of my class but i can't find how to fix it.
Here is the code you can run to test.
# import dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
data = load_breast_cancer()
features = data["data"]
target = data["target"]
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()
# scaler and encoder options
my_scaler = ScalerSelector()
preprocessor = ColumnTransformer(transformers = [('numerical', my_scaler, col_names)
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())
])
# set params combination I want to try
scaler_options = {'preprocessor': ['passthrough'],
'preprocessor__numerical__scaler':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# : ['passthrough', ScalerSelector()]
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options)
# fit the data
grid_cv.fit(data, target)
# best params :
grid_cv.best_params_

Optimising a meta estimator

I'm trying to use the GridSearchCV functions of scikit-learn to find the best parameters of some base models, which I then feed into a stacking estimator.
My code is based on this post (which I'm using to illustrate): https://stats.stackexchange.com/questions/139042/ensemble-of-different-kinds-of-regressors-using-scikit-learn-or-any-other-pytho/274147
I'd like to perform a grid search over the parameters of my estimators (mostly the ridge parameter, the number of neighbours in KNN, and the RF depth and spilt), but I can't get it working. I define the model, below:
from sklearn.base import TransformerMixin
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge
class RidgeTransformer(Ridge, TransformerMixin):
def transform(self, X, *_):
return self.predict(X)
class RandomForestTransformer(RandomForestRegressor, TransformerMixin):
def transform(self, X, *_):
return self.predict(X)
class KNeighborsTransformer(KNeighborsRegressor, TransformerMixin):
def transform(self, X, *_):
return self.predict(X)
def build_model():
ridge_transformer = Pipeline(steps=[
('scaler', StandardScaler()),
('poly_feats', PolynomialFeatures()),
('ridge', RidgeTransformer())
])
pred_union = FeatureUnion(
transformer_list=[
('ridge', ridge_transformer),
('rand_forest', RandomForestTransformer()),
('knn', KNeighborsTransformer())
],
n_jobs=2
)
model = Pipeline(steps=[
('pred_union', pred_union),
('lin_regr', LinearRegression())
])
return model
Now, I'd like to run CV on the parameters of the forest. I can get the parameters with:
print(model.get_params().keys())
But when I run the code below, I still get an error:
pipe = Pipeline(steps=[('reg', model)])
parameters = {'pred_union__rand_forest__n_estimators':[20, 50, 100, 200]}
g_search = GridSearchCV(pipe, parameters)
X, y = make_regression(n_features=10, n_targets=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
g_search.fit(X_train, y_train)
Invalid parameter pred_union for estimator Pipeline(memory=None,
steps=[('reg', Pipeline(memory=None,
steps=[('pred_union', FeatureUnion(n_jobs=2,
transformer_list=[('ridge', Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly_feats', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('ridge', RidgeTransformer(...=None)), ('lin_regr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]))]). Check the list of available parameters with `estimator.get_params().keys()`.
What am I doing wrong?
Your model is actually already a pipeline, so why are you wrapping it again in a pipeline? No need for pipe = Pipeline(steps=[('reg', model)]). Just use model inside the grid-search.
But if you want to wrap it inside a pipeline and then work, then you need to update the parameters by appending the 'reg' to each name.
parameters = {'reg__pred_union__rand_forest__n_estimators':[20, 50, 100, 200]}

Categories

Resources