I can't 'passthrough' a step in sklearn pipeline - python

I wanted to test different scaler in my pipeline, so I created a class that can take as parameter the scaler I want (Standard, MinMax etc..)
It works fine. But i want to specifiy in my param_grid to test without scaler. And I added 'passthrough' in my pipeline estimators, but it doesn't work and i get the following error :
AttributeError: 'str' object has no attribute 'set_params'
I know it might have something to do with the construction of my class but i can't find how to fix it.
Here is the code you can run to test.
# import dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
data = load_breast_cancer()
features = data["data"]
target = data["target"]
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()
# scaler and encoder options
my_scaler = ScalerSelector()
preprocessor = ColumnTransformer(transformers = [('numerical', my_scaler, col_names)
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())
])
# set params combination I want to try
scaler_options = {'preprocessor': ['passthrough'],
'preprocessor__numerical__scaler':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# : ['passthrough', ScalerSelector()]
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options)
# fit the data
grid_cv.fit(data, target)
# best params :
grid_cv.best_params_

Related

How to tune a custom classifier in sklearn with additional required parameters in fit and predict methods?

For the purposes of this question, I have created a dummy dataset below:
import numpy as np
import pandas as pd
cities = ['Berlin', 'Frankfurt', 'Hamburg',
'Nuremberg', 'Munich', 'Stuttgart',
'Hanover', 'Saarbruecken', 'Cologne',
'Constance', 'Freiburg', 'Karlsruhe'
]
n= len(cities)
data = pd.DataFrame({
'City':cities,
'Temperature': np.random.normal(24, 3, n),
'Humidity': np.random.normal(78, 2.5, n),
'Wind': np.random.normal(15, 4, n),
'Target': np.random.randint(2, size=n)
})
I have a (simplified) custom classifier below which maps the text feature to a continuous value before combining it with the remaining features for classification:
from sklearn.base import BaseEstimator, ClassifierMixin
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import GradientBoostingClassifier
class CustomClassifier(ClassifierMixin, BaseEstimator):
def __init__(self, n_estimators=100):
self.n_estimators=100
self.NLP = Pipeline(
[
('preprocessor', TfidfVectorizer()),
('regressor', LogisticRegression())
]
)
self.Classifier = GradientBoostingClassifier(n_estimators=self.n_estimators)
def fit(self, X, y, text_data, **kwargs):
self.NLP.fit(text_data, y)
text_feature = self.NLP.predict_proba(text_data)
new_X = np.concatenate(
(text_feature[:,1, np.newaxis], X),
axis=1
)
self.Classifier.fit(new_X, y)
return self
def predict(self, X, text_data):
text_feature = self.NLP.predict_proba(text_data)
new_X = np.concatenate(
(text_feature[:,1, np.newaxis],X),
axis=1
)
y_pred = self.Classifier.predict(new_X)
return y_pred
I can run fit and predict without issue, but if I try to tune the pipeline using cross validation as follows:
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import MaxAbsScaler
from sklearn.model_selection import RandomizedSearchCV
custom_model = CustomClassifier()
pipe = Pipeline([
('scaling', MaxAbsScaler()),
('classifier', custom_model)
])
params = {'classifier__n_estimators':[100,200]}
tuner = RandomizedSearchCV(
pipe,
param_distributions=params,
cv=3,
n_iter=2
)
tuner.fit(X=data.drop(['City', 'Target'], axis=1), y=data.loc[:,'Target'], classifier__text_data=data.loc[:,'City'])
I receive the following error:
predict() missing 1 required positional argument: 'text_data'
Which also prevents me from using something like cross_val_score.
Any ideas on how I can approach tuning my custom pipeline?
This might be a limitation of the RandomizedSearchCV class (which does not accept additional parameters in predict), so I tried to write a custom class with a custom predict method:
class RandomizedSearchCVPlus(RandomizedSearchCV):
def predict(self, X, **predict_params):
return self.best_estimator_.predict(X, **predict_params)
But this did not solve the issue.

Extract feature names after Pipeline usage with ColumnTransformer (sklearn)

I have the following toy code.
I use a pipeline to automatically normalize numerical variables and apply one-hot-encoding to the categorical ones.
I can get the coefficients of the linear regression model easily using pipe['logisticregression'].coef_ but how can I get all the feature names in the right order as this appearing in the coef matrix?
from sklearn.compose import ColumnTransformer
import numpy as np, pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# data from https://www.kaggle.com/datasets/uciml/adult-census-income
data = pd.read_csv("adult.csv")
data = data.iloc[0:3000,:]
target = "workclass"
y = data[target]
X = data.drop(columns=target)
numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
numerical_columns = numerical_columns_selector(X)
categorical_columns = categorical_columns_selector(X)
ct = ColumnTransformer([ ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns) ,
('std', StandardScaler(), numerical_columns)])
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)
data_train, data_test, target_train, target_test = train_test_split(
X, y, random_state=42)
pipe.fit(data_train, target_train)
pipe['logisticregression'].coef_.shape

Scikit-learn SequentialFeatureSelector Input contains NaN, infinity or a value too large for dtype('float64'). even with pipeline

I'm trying to use SequentialFeatureSelector and for estimator parameter I'm passing it a pipeline that includes a step that inputes the missing values:
model = Pipeline(steps=[('preprocessing',
ColumnTransformer(transformers=[('pipeline-1',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value=-1,
strategy='constant')),
('preprocessing',
StandardScaler())]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300013d0>),
('pipeline-2',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value='missing',
strategy='constant')),
('encoding',
OrdinalEncoder(handle_unknown='ignore'))]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300015b0>)])),
('model',
LGBMClassifier(class_weight='balanced', random_state=1,
reg_lambda=0.1))])
Nonetheless when passing this to selector it shows an error, what does not make any sense since I have already fit and evaluated my model and it runs ok
fselector = SequentialFeatureSelector(estimator = model, scoring= "roc_auc", cv = 3, n_jobs= -1, ).fit(X, target)
_assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
EDIT:
Reproducible example:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([("preprocessing", SimpleImputer(missing_values= np.NaN)),("model",LogisticRegression(random_state = 1))])
SequentialFeatureSelector(estimator = clf,
scoring= "accuracy",
cv = 3).fit(X, y)
It shows the same error, in spite of the clf can be fit without problems
ScikitLearn's documentation does not state that the SequentialFeatureSelector works with pipeline objects. It only states that the class accepts an unfitted estimator. In view of this, you could remove the classifier from your pipeline, preprocess X, and then pass it along with an unfitted classifier for feature selection as shown in the example below.
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
pipe = Pipeline([("preprocessing", SimpleImputer(missing_values= np.NaN)),
('scaler', MaxAbsScaler())])
# Preprocess your data
X = pipe.fit_transform(X)
# Run the SequentialFeatureSelector
sfs = SequentialFeatureSelector(estimator = LogisticRegression(),
scoring= "accuracy",
cv = 3).fit(X, y)
# Check which features are important and transform X
sfs.get_support()
X = sfs.transform(X)
You can use SequentialFeatureSelection from mlxtend package
https://rasbt.github.io/mlxtend/
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([
("preprocessing", SimpleImputer(missing_values= np.NaN)),
("model",LogisticRegression(random_state = 1))
])
sfs = SequentialFeatureSelector(estimator = clf,
forward = True,
k_features = 'best',
scoring = "accuracy",
cv = 3, n_jobs=-1).fit(X, y)
sfs.k_feature_idx_
>>> (0, 1, 2, 3)

How to test preprocessing combinations in nested pipeline using GridSearchCV?

I've been working on this classification problem using sklearn's Pipeline to combine the preprocessing step (scaling) and the cross validation step (GridSearchCV) using Logistic Regression.
Here is the simplified code:
# import dependencies
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder, StandardScaler, MinMaxScaler, RobustScaler
# scaler and encoder options
scaler = StandardScaler() # there are 3 options that I want to try
encoder = OneHotEncoder() # only one option, no need to GridSearch it
# use ColumnTransformer to apply different preprocesses to numerical and categorical columns
preprocessor = ColumnTransformer(transformers = [('categorical', encoder, cat_columns),
('numerical', scaler, num_columns),
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())])
What I'm trying to do, is to try different scaling methods (e.g. standard scaling, robust scaling, etc.) and after trying all of those, pick the scaling method that yields the best metric (i.e. accuracy). However, I don't know how to do this using the GridSearchCV:
from sklearn.model_selection import GridSearchCV
# set params combination I want to try
scaler_options = {'numerical':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options, cv = 5)
# fit the data
grid_cv.fit(X_train, y_train)
I know that the code above won't work, particularly because of the scaler_options that I've set as param_grid. I realize that the scaler_options I set can't be processed by GridSearchCV. Why? Because it isn't a hyperparameter of the pipeline (unlike 'log_reg__C', a hyperparameter from LogisticRegression() than can be accessed by the GridSearchCV). But instead its a component of the ColumnTransformer which I have nested inside the full_pipeline.
So the main question is, how do I automate GridSearchCV to test all of my scaler options? Since the scaler is a component of a sub-pipeline (i.e. the previous ColumnTransformer).
As you suggested you could create a class that takes in its __init()__ parameters, the scaler you want to use.
Then you could specify in your grid search parameters the Scaler your class should use to initialize the class.
I wrote that i hope it helps :
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
Here you can find a full example that you can run to test :
# import dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
import pandas as pd
class ScalerSelector(BaseEstimator, TransformerMixin):
def __init__(self, scaler=StandardScaler()):
super().__init__()
self.scaler = scaler
def fit(self, X, y=None):
return self.scaler.fit(X)
def transform(self, X, y=None):
return self.scaler.transform(X)
data = load_breast_cancer()
features = data["data"]
target = data["target"]
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()
# scaler and encoder options
my_scaler = ScalerSelector()
preprocessor = ColumnTransformer(transformers = [('numerical', my_scaler, col_names)
])
# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps = [('preprocessor', preprocessor),
('log_reg', LogisticRegression())
])
# set params combination I want to try
scaler_options = {'preprocessor__numerical__scaler':[StandardScaler(), RobustScaler(), MinMaxScaler()]}
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options)
# fit the data
grid_cv.fit(data, target)
# best params :
grid_cv.best_params_
You can fulfill what you intend without creating a custom transformer. And you can even pass the 'passthrough' argument into param_grid to experiment with the scenario where you don't want to do any scaling in that step at all.
In this example, suppose we want to investigate whether it is better for the model to impose a Scaler transformer on numerical features, num_features.
cat_features = selector(dtype_exclude='number')(train.drop('target', axis=1))
num_features = selector(dtype_include='number')(train.drop('target', axis=1))
cat_preprocessor = Pipeline(steps=[
('oh', OneHotEncoder(handle_unknown='ignore')),
('ss', StandardScaler())
])
num_preprocessor = Pipeline(steps=[
('pt', PowerTransformer(method='yeo-johnson')),
('ss', StandardScaler()) # Create a place holder for your test here !!!
])
preprocessor = ColumnTransformer(transformers=[
('cat', cat_preprocessor, cat_features),
('num', num_preprocessor, num_features)
])
model = Pipeline(steps=[
('prep', preprocessor),
('clf', RidgeClassifier())
])
X = train.drop('target', axis=1)
y = train['target']
param_grid = {
'prep__cat__ss': ['passthrough', StandardScaler(with_mean=False)] # 'passthrough',
}
gs = GridSearchCV(
estimator=model,
param_grid=param_grid,
scoring='roc_auc',
n_jobs=-1,
cv=2
)
gs.fit(X, y)

Chaining transformations in scikit pipeline

I'm using the scikit pipeline to create a preprocess on a dataset. I have a dataset with four variables: ['monetary', 'frequency1', 'frequency2', 'recency'] and I want to preprocess all but recency. To preprocess, I first want to get the log and then standardize. However, when I get the transformed data from the pipeline, I get 7 columns (3 log, 3 standardize, recency). Is there a way to chain the transformations and so I can get the log and after the log perform standardize and only get a 4 feature dataset?
def create_pipeline(df):
all_but_recency = ['monetary', 'frequency1','frequency2']
# Preprocess
preprocessor = ColumnTransformer(
transformers=[
( 'log', FunctionTransformer(np.log), all_but_recency ),
( 'standardize', preprocessing.StandardScaler(), all_but_recency ) ],
remainder='passthrough')
# Pipeline
estimators = [( 'preprocess', preprocessor )]
pipe = Pipeline(steps=estimators)
print(pipe.set_params().fit_transform(df).shape)
Thanks in advance
You have to apply the FunctionTransformer sequentially. Try this!
def create_pipeline(df):
all_but_recency = ['monetary', 'frequency1','frequency2']
# Preprocess
# Preprocess
preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_recency)],'passthrough')
preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), all_but_recency)],'passthrough' )
# Pipeline
estimators = [('preprocess1', preprocessor1),('standardize', preprocessor2)]
pipe = Pipeline(steps=estimators)
print(pipe.set_params().fit_transform(df).shape)
working example
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
iris = load_iris()
X, y = iris.data, iris.target
df= pd.DataFrame(X,columns = iris.feature_names)
all_but_one = [0,1,2]
# Preprocess
preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_one)],'passthrough')
preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), all_but_one)],'passthrough' )
# Pipeline
estimators = [('preprocess1', preprocessor1),('scalling', preprocessor2)]
pipe = Pipeline(steps=estimators,)
pipe.fit_transform(df)

Categories

Resources