Using StandardScaler as Preprocessor in Mlens Pipeline generates Classification Warning - python

I am trying to scale my data within the cross-validation folds of an MLens SuperLearner pipeline. When I use StandardScaler in the pipeline (as demonstrated below), I receive the following warning:
/miniconda3/envs/r_env/lib/python3.7/site-packages/mlens/parallel/_base_functions.py:226: MetricWarning: [pipeline-1.mlpclassifier.0.2] Could not score pipeline-1.mlpclassifier. Details:
ValueError("Classification metrics can't handle a mix of binary and continuous-multioutput targets")
(name, inst_name, exc), MetricWarning)
Of note, when I omit the StandardScaler() the warning disappears, but the data is not scaled.
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
from sklearn.base import BaseEstimator

breast_cancer_data = load_breast_cancer()
X = breast_cancer_data['data']
y = breast_cancer_data['target']
X, X_val, y, y_val = train_test_split(X, y, test_size=.3, random_state=0)

# RandomForestClassifier, SelectFromModel and RANDOM_STATE are imported/defined
# further down, before fit is called.
class RFBasedFeatureSelector(BaseEstimator):
    def __init__(self, n_estimators):
        self.n_estimators = n_estimators
        self.selector = None

    def fit(self, X, y):
        clf = RandomForestClassifier(n_estimators=self.n_estimators, random_state=RANDOM_STATE, class_weight='balanced')
        clf = clf.fit(X, y)
        self.selector = SelectFromModel(clf, prefit=True, threshold=0.001)

    def transform(self, X):
        if self.selector is None:
            raise AttributeError('The selector attribute has not been assigned. You cannot call transform before first calling fit or fit_transform.')
        return self.selector.transform(X)

    def fit_transform(self, X, y):
        self.fit(X, y)
        return self.transform(X)
N_FOLDS = 5
RF_ESTIMATORS = 1000
N_ESTIMATORS = 1000
RANDOM_STATE = 42

from mlens.metrics import make_scorer
from sklearn.metrics import roc_auc_score, balanced_accuracy_score
accuracy_scorer = make_scorer(balanced_accuracy_score, average='micro', greater_is_better=True)

from mlens.ensemble.super_learner import SuperLearner
from sklearn.linear_model import LogisticRegression
from sklearn.neural_network import MLPClassifier
from sklearn.ensemble import ExtraTreesClassifier, RandomForestClassifier
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectFromModel

ensemble = SuperLearner(folds=N_FOLDS, shuffle=True, random_state=RANDOM_STATE, n_jobs=10, scorer=balanced_accuracy_score, backend="multiprocessing")

preprocessing1 = {'pipeline-1': [StandardScaler()]}
preprocessing2 = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS)]}

estimators = {'pipeline-1': [RandomForestClassifier(RF_ESTIMATORS, random_state=RANDOM_STATE, class_weight='balanced'),
                             MLPClassifier(hidden_layer_sizes=(10, 10, 10), activation='relu', solver='sgd', max_iter=5000)]}

ensemble.add(estimators, preprocessing2, preprocessing1)
ensemble.add_meta(LogisticRegression(solver='liblinear', class_weight='balanced'))

ensemble.fit(X, y)
yhat = ensemble.predict(X_val)
balanced_accuracy_score(y_val, yhat)

You are currently passing your preprocessing steps as two separate arguments to the add method.
You can instead combine them into a single list so that both transformers run in sequence:
preprocessing = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS), StandardScaler()]}
Please refer to the documentation for the add method found here:
https://mlens.readthedocs.io/en/0.1.x/source/mlens.ensemble.super_learner/
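For illustration, here is a minimal sketch of the corrected setup, reusing the estimators dict and constants from the question (exact behaviour may vary with your mlens version):

# Rebuild the ensemble with the combined preprocessing list: the feature
# selector runs first, then the scaler, inside each cross-validation fold.
ensemble = SuperLearner(folds=N_FOLDS, shuffle=True, random_state=RANDOM_STATE,
                        n_jobs=10, scorer=balanced_accuracy_score, backend="multiprocessing")
preprocessing = {'pipeline-1': [RFBasedFeatureSelector(N_ESTIMATORS), StandardScaler()]}
ensemble.add(estimators, preprocessing)
ensemble.add_meta(LogisticRegression(solver='liblinear', class_weight='balanced'))
ensemble.fit(X, y)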

Related

Unable to execute RFECV over LogisticRegression

My ML project is about "Loan Eligibility Prediction".
For it I used the data here: https://www.kaggle.com/code/sazid28/home-loan-prediction/data?select=train.csv
and my code is shown below:
import random
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV

df_train_original = pd.read_csv("train.csv.xls")
df = df_train_original.drop(df_train_original.columns[0], axis=1)

# Replace missing 'Credit_History' entries with a random observed value (0 or 1)
random.seed(0)
df['Credit_History'] = df['Credit_History'].apply(
    lambda x: np.random.choice(df['Credit_History'].dropna().values)
    if np.isnan(x) else x)

X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Data pre-processing
numerical_feature, categorical_feature = [], []
for i in X.columns:
    if X[i].dtype == 'O':
        categorical_feature.append(i)
    else:
        numerical_feature.append(i)

imputer = IterativeImputer(random_state=0)
scaler = StandardScaler()
encoder = OrdinalEncoder()

# Replace categorical features with the most frequent value of the column
# Missing counts: Gender (-13), Married (-3), Self_Employed (-32),
# LoanAmount (-22), Loan_Amount_Term (-14), Credit_History (-50)
numerical_pipeline = make_pipeline(imputer, scaler)
categorical_pipeline = make_pipeline(SimpleImputer(strategy='most_frequent'), encoder)
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
                                       (categorical_pipeline, categorical_feature),
                                       remainder='passthrough')

clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

params = {
    'logisticregression__class_weight': [None, 'balanced'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logisticregression__C': np.linspace(0.001, 0.1, 30),
}

model = make_pipeline(preprocessor, clf)
selector = RFECV(model, step=1, min_features_to_select=2, cv=5)
selector.fit(X_train, y_train)
When I run the code I get:
ValueError: could not convert string to float: 'Male'
I think the data is not being fitted and transformed before going through RFECV.
How can I fix this?
RFECV does not work with a pipeline as its estimator, because it requires the estimator to expose either a coef_ or a feature_importances_ attribute. Pipelines do not, and even if they did, there would be no guarantee that the feature importances of the final estimator correspond to the features fed into the pipeline, given arbitrary transformations in the intermediate steps.
What you can do is make the RFECV transformer an element of your pipeline, between the preprocessing and the final estimator, i.e.
preprocessor = make_column_transformer((numerical_pipeline, numerical_feature),
                                       (categorical_pipeline, categorical_feature),
                                       remainder='passthrough')
clf_fs = LogisticRegression(random_state=0, max_iter=df.shape[0])
clf = LogisticRegression(random_state=0, max_iter=df.shape[0])
feature_selector = RFECV(clf_fs, step=1, min_features_to_select=2, cv=5)
model = make_pipeline(preprocessor, feature_selector, clf)
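As a quick sanity check, here is a sketch (assuming the question's X_train and y_train are in scope) showing the fit and how to inspect the selector afterwards:

# Fit the whole pipeline; RFECV now sees the imputed/encoded numeric features.
model.fit(X_train, y_train)

# make_pipeline names steps after their lowercased class name, so the
# fitted selector is available as 'rfecv'.
print(model.named_steps['rfecv'].n_features_)  # number of features retained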

How to use t-SNE inside the pipeline

How could I use t-SNE inside my pipeline?
Without a pipeline I have managed to run t-SNE, and then a classification algorithm on its output, successfully.
Do I need to write a custom class that can be called in the pipeline and returns a DataFrame, or how does it work?
# How I used t-SNE
%%time
from sklearn.manifold import TSNE

X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
ts = TSNE()
X_tsne = ts.fit_transform(X_std)
print(X_tsne.shape)

feature_list = []
for i in range(1, X_tsne.shape[1] + 1):
    feature_list.append("TSNE" + str(i))

df_new = pd.DataFrame(X_tsne, columns=feature_list)
df_new['label'] = y
#df_new.head()
X = df_new.drop(columns=['label'])
y = df_new['label']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
#X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1)

rfc = RandomForestClassifier()
# Train the random forest classifier
rfc = rfc.fit(X_train, y_train)
# Predict the response for the test dataset
y_pred = rfc.predict(X_test)
How I want to use it:
# How could I use TSNE() inside the pipeline?
%%time
steps = [('standardscaler', StandardScaler()),
         ('tsne', TSNE()),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

parameters = {'rfc__max_depth': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],
              'rfc__criterion': ['gini', 'entropy']}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)

print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))
print(grid.best_params_)

y_pred = grid.predict(X_test)
print("Accuracy:", metrics.accuracy_score(y_test, y_pred))
print("Precision:", metrics.precision_score(y_test, y_pred))
print("Recall:", metrics.recall_score(y_test, y_pred))
[OUT] TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'TSNE()' (type <class 'sklearn.manifold._t_sne.TSNE'>) doesn't
Should I build a custom class, and if so, what should it look like?
class TestTSNE(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass  # don't know
    def fit(self, X, y=None):
        X_std = StandardScaler().fit_transform(dfListingsFeature_classification)
        ts = TSNE()
        X_tsne = ts.fit_transform(X_std)
        return self
    def transform(self, X, y=None):
        feature_list = []
        for i in range(1, self.X_tsne.shape[1] + 1):
            feature_list.append("TSNE" + str(i))
        df_new = pd.DataFrame(X_tsne, columns=feature_list)
        df_new['label'] = y
        #df_new.head()
        X = df_new.drop(columns=['label'])
        y = df_new['label']
        return X, y
...
steps = [('standardscaler', StandardScaler()),
         ('testTSNE', TestTSNE()),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
I think you misunderstood the use of Pipeline. From the help page:
Pipeline of transforms with a final estimator.
Sequentially apply a list of transforms and a final estimator.
Intermediate steps of the pipeline must be ‘transforms’, that is, they
must implement fit and transform methods. The final estimator only
needs to implement fit
So this means that if your pipeline is:
steps = [('standardscaler', StandardScaler()),
         ('tsne', TSNE()),
         ('rfc', RandomForestClassifier())]
you are going to apply StandardScaler to your features first, then transform that result with t-SNE, before passing it to the classifier. I don't think it makes much sense to train on the t-SNE output.
If you really want to latch onto the pipeline, then you will need to store the result of t-SNE as an attribute and return the features untouched, so that the classifier can work on them.
Something like:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.manifold import TSNE
from sklearn.datasets import make_classification

class TestTSNE(BaseEstimator, TransformerMixin):
    def __init__(self, n_components, random_state=None, method='exact'):
        self.n_components = n_components
        self.method = method
        self.random_state = random_state

    def fit(self, X, y=None):
        ts = TSNE(n_components=self.n_components,
                  method=self.method, random_state=self.random_state)
        self.X_tsne = ts.fit_transform(X)
        return self

    def transform(self, X, y=None):
        # Pass the features through unchanged; the embedding is kept on
        # self.X_tsne for later inspection.
        return X
Then:
steps = [('standardscaler', StandardScaler()),
         ('testTSNE', TestTSNE(2)),
         ('rfc', RandomForestClassifier())]
pipeline = Pipeline(steps)
X, y = make_classification()
pipeline.fit(X, y)
You can retrieve your t-SNE embedding like this:
pd.DataFrame(pipeline.steps[1][1].X_tsne)
0 1
0 -38.756626 -4.693253
1 46.516308 53.633842
2 49.107910 16.482645
3 18.306377 9.432504
4 33.551056 -27.441383
.. ... ...
95 -31.337574 -16.913471
96 -57.918224 -39.959976
97 55.282658 37.582535
98 66.425125 19.717241
99 -50.692646 11.545088

How can I use yellowbrick on the output of non-scikit models

How can I use yellowbrick on the output of non-Scikit models?
I have a PyTorch multi-class classifier network and would like to use the ClassificationReport functionality on the results of applying this model to data. How can I do this?
If you use the skorch library, which makes PyTorch models scikit-learn compatible, you can use yellowbrick's third-party wrapper to make your model work. Here is some example code:
import numpy as np
from sklearn.datasets import make_classification
from torch import nn
from sklearn.model_selection import train_test_split
from skorch import NeuralNetClassifier

X, y = make_classification(1000, 20, n_informative=10, random_state=0)
X = X.astype(np.float32)
y = y.astype(np.int64)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

class MyModule(nn.Module):
    def __init__(self, num_units=10, nonlin=nn.ReLU()):
        super(MyModule, self).__init__()
        self.dense0 = nn.Linear(20, num_units)
        self.nonlin = nonlin
        self.dropout = nn.Dropout(0.5)
        self.dense1 = nn.Linear(num_units, num_units)
        self.output = nn.Linear(num_units, 2)
        self.softmax = nn.Softmax(dim=-1)

    def forward(self, X, **kwargs):
        X = self.nonlin(self.dense0(X))
        X = self.dropout(X)
        X = self.nonlin(self.dense1(X))
        X = self.softmax(self.output(X))
        return X

net = NeuralNetClassifier(
    MyModule,
    max_epochs=10,
    lr=0.1,
    # Shuffle training data on each epoch
    iterator_train__shuffle=True,
)

# Import the wrap function and a Yellowbrick visualizer
from yellowbrick.contrib.wrapper import wrap
from yellowbrick.classifier import classification_report

# Instantiate the third party estimator and wrap it, optionally fitting it
model = wrap(net)
model.fit(X_train, y_train)

# Use the visualizer
oz = classification_report(model, X_train, y_train, X_test=X_test, y_test=y_test, support=True, is_fitted=True)
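Depending on your yellowbrick version, the quick method may render the figure itself; if it does not, the returned visualizer's standard show() method will:

# Finalize and display the classification report figure;
# pass outpath to save it to disk instead.
oz.show()
# oz.show(outpath="classification_report.png")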

Got ValueError when calling cross_val_score

I am trying to build a project for machine learning and I wanted to perform an accuracy evaluation of multiple algorithms. I am using this CSV and I am loading only the Date, Time and CO columns (I manually renamed them in the CSV). After I prepare my training data, I try to perform the evaluations, but I get:
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.
The shapes of the vectors used for the evaluations (X_train and Y_train) are:
(9357, 2)
(9357,)
The class:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

class Models:
    test_size: float
    random_state: int

    def __init__(self, test_size: float = 0.20, random_state: int = 1) -> None:
        super().__init__()
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def init_models() -> []:
        return [
            ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
            ('LDA', LinearDiscriminantAnalysis()),
            ('KNN', KNeighborsClassifier()),
            ('CART', DecisionTreeClassifier()),
            ('NB', GaussianNB()),
            ('SVM', SVC(gamma='auto'))
        ]

    def train(self, x: [], y: []):
        x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=self.test_size,
                                                                        random_state=self.random_state)
        return x_train, x_validation, y_train, y_validation

    def evaluate(self, x_train: [], y_train: [], splits: int = 10, random_state: int = 1):
        results = []
        names = []
        models = self.init_models()
        for name, model in models:
            kfold = StratifiedKFold(n_splits=splits, random_state=random_state)
            cv_results = cross_val_score(estimator=model, X=x_train, y=y_train, cv=kfold, scoring='accuracy')
            results.append(cv_results)
            names.append(name)
            print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
And I am calling my class as:
models_helper = Models()
array = dataset.values
X = array[:, 1:3]
Y = array[:, 2]
prepared = models_helper.train(X, Y)
classification = models_helper.evaluate(prepared[0], prepared[2])
I avoided this problem by first calculating predicted values with cross_val_predict and then comparing those predictions against the true labels with metrics.accuracy_score.
# Imports implied by the names used below.
from sklearn import metrics, model_selection

# Function that runs the requested algorithm and returns the accuracy metrics.
# Passing the sklearn model as an argument along with cv values and training data.
def fit_ml_algo(algo, X_train, y_train, cv):
    # One pass
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train) * 100, 2)
    # Cross validation
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   n_jobs=-1)
    # Cross-validation accuracy metric
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, acc, acc_cv
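A hypothetical call, assuming classification targets and the X_train/y_train from the question's split:

from sklearn.linear_model import LogisticRegression

# Illustrative only: any scikit-learn classifier works here.
train_pred, acc, acc_cv = fit_ml_algo(LogisticRegression(solver='liblinear'),
                                      X_train, y_train, cv=10)
print('train accuracy: %.2f%%, cross-validated accuracy: %.2f%%' % (acc, acc_cv))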

Optimising a meta estimator

I'm trying to use the GridSearchCV functions of scikit-learn to find the best parameters of some base models, which I then feed into a stacking estimator.
My code is based on this post (which I'm using to illustrate): https://stats.stackexchange.com/questions/139042/ensemble-of-different-kinds-of-regressors-using-scikit-learn-or-any-other-pytho/274147
I'd like to perform a grid search over the parameters of my estimators (mostly the ridge parameter, the number of neighbours in KNN, and the RF depth and split), but I can't get it working. I define the model below:
from sklearn.base import TransformerMixin
from sklearn.datasets import make_regression
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler, PolynomialFeatures
from sklearn.linear_model import LinearRegression, Ridge

class RidgeTransformer(Ridge, TransformerMixin):
    def transform(self, X, *_):
        return self.predict(X)

class RandomForestTransformer(RandomForestRegressor, TransformerMixin):
    def transform(self, X, *_):
        return self.predict(X)

class KNeighborsTransformer(KNeighborsRegressor, TransformerMixin):
    def transform(self, X, *_):
        return self.predict(X)

def build_model():
    ridge_transformer = Pipeline(steps=[
        ('scaler', StandardScaler()),
        ('poly_feats', PolynomialFeatures()),
        ('ridge', RidgeTransformer())
    ])
    pred_union = FeatureUnion(
        transformer_list=[
            ('ridge', ridge_transformer),
            ('rand_forest', RandomForestTransformer()),
            ('knn', KNeighborsTransformer())
        ],
        n_jobs=2
    )
    model = Pipeline(steps=[
        ('pred_union', pred_union),
        ('lin_regr', LinearRegression())
    ])
    return model
Now, I'd like to run CV on the parameters of the forest. I can get the parameters with:
print(model.get_params().keys())
But when I run the code below, I still get an error:
from sklearn.model_selection import GridSearchCV

model = build_model()
pipe = Pipeline(steps=[('reg', model)])
parameters = {'pred_union__rand_forest__n_estimators': [20, 50, 100, 200]}
g_search = GridSearchCV(pipe, parameters)
X, y = make_regression(n_features=10, n_targets=2)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
g_search.fit(X_train, y_train)
Invalid parameter pred_union for estimator Pipeline(memory=None,
steps=[('reg', Pipeline(memory=None,
steps=[('pred_union', FeatureUnion(n_jobs=2,
transformer_list=[('ridge', Pipeline(memory=None,
steps=[('scaler', StandardScaler(copy=True, with_mean=True, with_std=True)), ('poly_feats', PolynomialFeatures(degree=2, include_bias=True, interaction_only=False)), ('ridge', RidgeTransformer(...=None)), ('lin_regr', LinearRegression(copy_X=True, fit_intercept=True, n_jobs=1, normalize=False))]))]). Check the list of available parameters with `estimator.get_params().keys()`.
What am I doing wrong?
Your model is actually already a pipeline, so why are you wrapping it in another pipeline? There is no need for pipe = Pipeline(steps=[('reg', model)]); just use model inside the grid search.
But if you do want to wrap it in an outer pipeline, then you need to update the parameter names by prepending 'reg__' to each one:
parameters = {'reg__pred_union__rand_forest__n_estimators': [20, 50, 100, 200]}
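For illustration, a minimal sketch of the simpler route, reusing build_model() and the data from the question:

from sklearn.model_selection import GridSearchCV

# Grid-search the stacking pipeline directly; without the outer wrapper
# the parameter names match model.get_params().keys().
model = build_model()
parameters = {'pred_union__rand_forest__n_estimators': [20, 50, 100, 200]}
g_search = GridSearchCV(model, parameters)
g_search.fit(X_train, y_train)
print(g_search.best_params_)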
