from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
# generate dataset
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.50], flip_y=0,
                           random_state=4)
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'base_estimator__max_depth': [i for i in range(2, 11, 2)],
              'base_estimator__min_samples_leaf': [1, 2],
              'n_estimators': [10, 50, 250, 1000],
              'learning_rate': [0.01, 0.1]}
clf = GridSearchCV(abc, parameters, verbose=3, scoring='f1', n_jobs=-1)
clf.fit(X, y)
Here I am using GridSearchCV to tune the AdaBoostClassifier as well as its base estimator, a DecisionTreeClassifier. Can I use GridSearchCV to tune the AdaBoostClassifier and more than one base estimator (DecisionTreeClassifier, SVC, ...) at the same time?
Specify a list of parameter grids, one for each base estimator:
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
parameters = [
    {'boost': [AdaBoostClassifier(base_estimator=DecisionTreeClassifier())],
     'boost__base_estimator__max_depth': (None, 10),
     'boost__learning_rate': (0.01, 0.1)},
    {'boost': [AdaBoostClassifier(base_estimator=SVC())],
     'boost__base_estimator__degree': (2, 3),
     'boost__learning_rate': (0.01, 0.1)},
]
boost_pipe = Pipeline([('boost', DummyClassifier())])
grid = GridSearchCV(boost_pipe, parameters)
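From there you fit the grid like any other estimator; here is a minimal sketch, assuming the X, y generated in the question above. One caveat, stated as an assumption about your scikit-learn version: boosting an SVC may require AdaBoostClassifier(..., algorithm='SAMME'), because the default SAMME.R algorithm needs predict_proba, which SVC only exposes with probability=True.
grid.fit(X, y)            # runs both parameter grids, i.e. both base estimators
print(grid.best_params_)  # includes the winning 'boost' estimator itself
print(grid.best_score_)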
I'm working through the load_boston() dataset for a scikit-learn tutorial and I'm running into this attribute error:
AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'
Does anyone know if there is a bug? I am using version 1.1.1 of scikit-learn.
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
print(sklearn.__version__)
X, y = load_boston(return_X_y=True)
mod = KNeighborsRegressor().fit(X, y)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=3))
])
print(pipe.get_params())
mod1 = GridSearchCV(estimator=pipe, param_grid={'model__n_neighbors': [1,2,3,4,5,6,7,8,9,10]}, cv=3)
pipe.fit(X, y)
pred = pipe.predict(X)
df = pd.DataFrame(mod1.cv_results_)
plt.scatter(pred, y) #pred instead of X
plt.title("Boston Housing Market")
plt.show()
The point is that cv_results_ is an attribute of a fitted GridSearchCV instance, while you have only fitted the pipeline (its base estimator). Therefore, you should fit mod1 to make it work:
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
X, y = load_boston(return_X_y=True)
mod = KNeighborsRegressor().fit(X,y)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=3))
])
print(pipe.get_params())
mod1 = GridSearchCV(estimator=pipe,
                    param_grid={'model__n_neighbors': [1,2,3,4,5,6,7,8,9,10]},
                    cv=3)
mod1.fit(X, y)
df = pd.DataFrame(mod1.cv_results_)
Be aware, though, that GridSearchCV fits clones of its base estimator rather than the original object, so the original pipe stays unfitted. Therefore, you won't be able to call pipe.predict(X) if you just substitute pipe.fit(X, y) with mod1.fit(X, y).
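If you do want predictions after the search, call them on the fitted GridSearchCV object, or on its best_estimator_, which with the default refit=True is a clone of the pipeline refit on the whole dataset. For example:
pred = mod1.predict(X)            # delegates to the refit best estimator
best_pipe = mod1.best_estimator_  # a fitted clone of the original pipe
pred = best_pipe.predict(X)       # equivalent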
I wrote this code:
from sklearn.model_selection import GridSearchCV, StratifiedKFold, train_test_split
from sklearn.feature_selection import SelectKBest, RFECV, mutual_info_classif
#from xgboost import XGBClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import (make_scorer, recall_score, accuracy_score, precision_score,
                             f1_score, roc_auc_score, roc_curve)
from sklearn import metrics
from sklearn.datasets import make_classification
from sklearn.utils import shuffle
from numpy import mean, std
import numpy as np
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import joblib
import shap
df = pd.read_csv('train.txt',sep='\t')
def create_model(X_train=full_X_train, y_train=full_y_train, model_name=SVC(kernel='linear'),
                 n_splits=5, file_name='random_forest_with_hpo_with_fs_all_features_class'):
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    f1 = []
    count = 0
    for train_index, test_index in k_fold.split(X_train, y_train):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)
        save_mod = file_name + '.' + str(count) + '.fold.json'
        pickle.dump(clf, open(save_mod, 'wb'))  # pickled binary, despite the .json extension
        count += 1  # increment so each fold gets its own file
        f1.append(f1_score(y_test_fold, y_pred))
    return f1
def run_model_with_grid_search(model_name=RandomForestClassifier(), X_train=full_X_train, y_train=full_y_train,
                               model_id='random_forest_with_hpo_with_fs_all_features_class', n_splits=5,
                               output_file='random_forest_with_hpo_with_fs_all_features_class.txt', param_grid={}):
    param_grid = [{'random_forest_with_hpo_with_fs_all_features_class__bootstrap': [True, False],
                   'random_forest_with_hpo_with_fs_all_features_class__max_depth': [10, 20, 30, 40],
                   'random_forest_with_hpo_with_fs_all_features_class__n_estimators': [200, 500, 700]}]
    pipe = Pipeline([('feature_selection', RFECV(estimator=RandomForestClassifier(), scoring='accuracy',
                                                 step=1, cv=StratifiedKFold(5))),
                     ('random_forest_with_hpo_with_fs_all_features_class', RandomForestClassifier())])
    search = GridSearchCV(
        pipe,
        cv=5,
        param_grid=param_grid,
        scoring='accuracy',
        refit=True
    )
    fit_model = search.fit(X_train, y_train)
    # the fitted RFECV step lives on the refit best estimator
    rfecv = fit_model.best_estimator_.named_steps['feature_selection']
    print('Optimal number of features: ' + str(rfecv.n_features_))
    return fit_model, fit_model.best_params_, fit_model.best_score_
fit_model, params, best_score = run_model_with_grid_search()
model = create_model(model_name=fit_model)
I get the warning 'X does not have valid feature names, but RFECV was fitted with feature names'. I can see this question has been asked elsewhere, e.g. here, but I can't understand how that answer would apply in my case - could someone point out where in the code above this warning is coming from (and, ideally, how to fix it)?
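For what it's worth, that warning generally fires when an estimator fitted on a pandas DataFrame (which records column names as feature names) is later handed a plain NumPy array, or vice versa; whether that is the exact trigger here depends on what full_X_train is, so treat this as a hedged, self-contained reproduction rather than a diagnosis of the code above:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
Xdf = pd.DataFrame({'a': [0, 1, 0, 1], 'b': [1, 1, 0, 0]})
yv = [0, 1, 0, 1]
m = RandomForestClassifier(n_estimators=10).fit(Xdf, yv)
m.predict(Xdf.values)  # array after fitting on a DataFrame -> same style of warning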
Can I ask: when I run this code, it produces an output without error:
import pandas as pd
import numpy as np
import pickle
from scipy.stats import spearmanr
from sklearn.preprocessing import StandardScaler, PolynomialFeatures, scale
from sklearn.decomposition import PCA
from sklearn.model_selection import (cross_val_score, cross_val_predict, cross_validate,
                                     KFold, GridSearchCV, train_test_split,
                                     RepeatedKFold, StratifiedKFold)
from sklearn.feature_selection import (chi2, f_regression, mutual_info_regression,
                                       mutual_info_classif, SelectKBest)
#from xgboost import XGBRegressor
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor, GradientBoostingRegressor
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from sklearn.svm import SVR, SVC
from sklearn.metrics import (roc_curve, precision_recall_curve, auc, make_scorer,
                             recall_score, accuracy_score, precision_score,
                             confusion_matrix, classification_report,
                             precision_recall_fscore_support)
from sklearn.datasets import make_classification
#Generate fake data
X, y = make_classification(n_samples=5000, n_classes=2, n_features=20, n_redundant=0,random_state=0) #fake data
X_train = X[:4500] #.iloc for df
y_train = y[:4500]
X_test = X[4500:]#.reset_index(drop=True,inplace=True)
y_test = y[4500:]
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}
def run_SVC(X_train, y_train, X_test, y_test, output_file, data_name, refit_score='precision_score'):
    '''
    run SVC algorithm, with CV and hyperparameter tuning.
    '''
    short_dataname = data_name.strip().split('/')
    file_model_name = output_file + '_svc_' + short_dataname[-1]
    clf = SVC()
    skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
    #fs = SelectKBest(score_func=mutual_info_classif)
    pipeline = Pipeline(steps=[('svc', clf)])  #,('sel',fs)
    print(pipeline.get_params().keys())
    search = GridSearchCV(
        pipeline,
        param_grid={
            'svc__C': [0.01, 0.1, 10, 1000],  ##Regularization
            'svc__gamma': [0.0001, 0.01, 1, 10],
            'svc__kernel': ['linear', 'rbf'],
        },
        return_train_score=True,
        verbose=3,
        refit=refit_score,
        scoring=scorers,
        cv=skf,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    # make the predictions
    y_pred = search.predict(X_test)
    print('Best params for {}'.format(refit_score))
    print(search.best_params_)
    print(classification_report(y_test, y_pred))  #labels=['neg','pos']
    return
print(run_SVC(X_train, y_train, X_test, y_test, 'test.txt', 'dataset'))
When I uncomment the only two pieces that are commented out (fs = SelectKBest(score_func=mutual_info_classif) and the ('sel', fs) step on the line after it), I get the error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SVC()' (type <class 'sklearn.svm._classes.SVC'>) doesn't
I can see that other people have addressed this on SO before, e.g. here, so I tried to follow that person's answer, but my SelectKBest is already defined before my pipeline - when I move the line with fs higher up in my code (which I thought was what the answer was saying), I get the same error.
Could someone show me where I'm going wrong here and what I'm meant to change to remove this error?
The order of the steps in a Pipeline matters, and only the last step can be a non-transformer like your svc. Uncommenting ,('sel',fs) as written puts the SelectKBest after the SVC, making the SVC an intermediate step, which is exactly what the error message is complaining about.
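A minimal sketch of the corrected construction, reusing the names from your function: the transformer goes first and the classifier stays last.
fs = SelectKBest(score_func=mutual_info_classif)
pipeline = Pipeline(steps=[('sel', fs), ('svc', clf)])  # transformer first, estimator last
You can then tune the selector too, e.g. by adding 'sel__k': [5, 10, 20] to the param_grid.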
I'm new to Python coding, and working on a project but stuck at the coding part.
I have one target variable and 23 relevant variables.
My dataset is 11 (samples) x 23 (descriptors), plus one target dataset of 11 (samples) x 1 (target variable).
How do I find the most relevant variables from these 23 descriptors with PCA dimensionality reduction?
pca = PCA()
pca.fit(df2)
transformed = pca.transform(df2)
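PCA itself won't hand you a ranked list of descriptors, but you can inspect how strongly each original variable loads on the leading components. A hedged sketch, assuming df2 holds your 11 x 23 descriptor matrix:
import numpy as np
print(pca.explained_variance_ratio_)   # share of variance captured by each component
loadings = np.abs(pca.components_[0])  # |loading| of each descriptor on the first component
top5 = np.argsort(loadings)[::-1][:5]  # indices of the 5 strongest descriptors
print(top5)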
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
steps = [('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
from sklearn.preprocessing import MinMaxScaler
steps = [('norm', MinMaxScaler()), ('pca', PCA()), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
print(X.shape, y.shape)
from numpy import mean
from numpy import std
from sklearn.datasets import make_classification
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedStratifiedKFold
from sklearn.pipeline import Pipeline
from sklearn.decomposition import PCA
from sklearn.linear_model import LogisticRegression
X, y = make_classification(n_samples=11, n_features=23, n_informative=5, n_redundant=18, random_state=7)
steps = [('pca', PCA(n_components=4)), ('m', LogisticRegression())]
model = Pipeline(steps=steps)
cv = RepeatedStratifiedKFold(n_splits=5, n_repeats=3, random_state=1)
n_scores = cross_val_score(model, X, y, scoring='accuracy', cv=cv, n_jobs=-1, error_score='raise')
# report performance
print('Accuracy: %.3f (%.3f)' % (mean(n_scores), std(n_scores)))
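Rather than hardcoding n_components=4, you can also let GridSearchCV pick it, in the same spirit as the rest of this thread. A sketch under the same fake-data assumptions (with only 11 samples, keep the candidate values small):
from sklearn.model_selection import GridSearchCV
search = GridSearchCV(model, {'pca__n_components': [2, 3, 4, 5]},
                      scoring='accuracy', cv=cv, n_jobs=-1)
search.fit(X, y)
print(search.best_params_)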
I'm supposed to get a particular output figure (attached as an image in the original post), but I don't know how to produce it.
I tried Random Forests regression.
The code is given below.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
pipe = make_pipeline(RFECV(estimator=regr, step=3, cv=kf,
                           scoring='neg_mean_squared_error', n_jobs=-1),
                     GridSearchCV(regr, param_grid={'n_estimators': [100, 300]},
                                  cv=kf, scoring='neg_mean_squared_error',
                                  n_jobs=-1))
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
mse = mean_squared_error(Y, ypredicts)  # note: this is the MSE, not the RMSE
print(mse)
However, I got the following error:
sklearn.exceptions.NotFittedError: Estimator not fitted, call fit before exploiting the model.
I also tried:
model = pipe.fit(X,Y)
ypredicts = cross_val_predict(model, X, Y, cv=kf, n_jobs=-1)
But got the same error.
Edit 1:
I also tried:
pipe.fit(X,Y)
But got the same error.
In Python 2.7 (Sklearn 0.20), the same code gave me a different error:
TerminatedWorkerError: A worker process managed by the executor was unexpectedly terminated. This could be caused by a segmentation fault while calling the function or by an excessive memory usage causing the Operating System to kill the worker.
In Python 2.7 (Sklearn 0.20.3):
NotFittedError: Estimator not fitted, call fit before exploiting the model.
It seems that you are trying to select the best parameters for your model by using grid search; there is another way to do so. You are using pipelines, but in the method below I am not using a pipeline; instead I am getting the best parameters through randomized search.
import numpy as np
from sklearn.preprocessing import StandardScaler
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import KFold, cross_val_predict
from sklearn.feature_selection import SelectKBest, f_regression
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_selection import RFECV
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.ensemble import RandomForestClassifier
np.random.seed(0)
d1 = np.random.randint(2, size=(50, 10))
d2 = np.random.randint(3, size=(50, 10))
d3 = np.random.randint(4, size=(50, 10))
Y = np.random.randint(7, size=(50,))
X = np.column_stack([d1, d2, d3])
n_smples, n_feats = X.shape
print (n_smples, n_feats)
kf = KFold(n_splits=5, shuffle=True, random_state=0)
regr = RandomForestRegressor(max_features=None,random_state=0)
n_iter_search = 20
random_search = RandomizedSearchCV(regr, param_distributions={'n_estimators': [100, 300]},
                                   n_iter=n_iter_search, cv=kf, verbose=1,
                                   return_train_score=True)
random_search.fit(X, Y)
ypredicts = random_search.predict(X)
mse = mean_squared_error(Y, ypredicts)  # MSE on the training data, not RMSE
print(mse)
print(random_search.best_params_)
print(random_search.cv_results_)
Try this piece of code; I hope it solves your problem.
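One version-dependent caveat (treat this as an assumption about your scikit-learn release): param_distributions above only contains two candidate settings, so n_iter=20 exceeds the size of the search space. Recent versions fall back to exhaustively trying both candidates (with a warning), while some older releases raised an error; sizing n_iter to the space sidesteps the issue.
# only 2 candidate settings exist, so cap n_iter accordingly
random_search = RandomizedSearchCV(regr,
                                   param_distributions={'n_estimators': [100, 300]},
                                   n_iter=2, cv=kf, verbose=1,
                                   return_train_score=True)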
Instead of
model = pipe.fit(X,Y)
have you tried
pipe.fit(X,Y)
by itself?
so that would be
pipe.fit(X,Y)
# change model to pipe
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
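As a side note on a scikit-learn behaviour worth knowing here: cross_val_predict clones and fits the estimator on each training fold internally, so the preceding pipe.fit(X, Y) only matters if you also want to call pipe.predict(...) on the whole dataset afterwards. A minimal sketch of the complete ending:
ypredicts = cross_val_predict(pipe, X, Y, cv=kf, n_jobs=-1)
mse = mean_squared_error(Y, ypredicts)  # MSE of the out-of-fold predictions
print(mse)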