I'm working through a scikit-learn tutorial using the load_boston() data and I'm running into this attribute error:
AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'
Does anyone know if this is a bug? I am using scikit-learn version 1.1.1.
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
print(sklearn.__version__)
X, y = load_boston(return_X_y=True)
mod = KNeighborsRegressor().fit(X, y)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=3))
])
print(pipe.get_params())
mod1 = GridSearchCV(estimator=pipe,
                    param_grid={'model__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                    cv=3)
pipe.fit(X, y)
pred = pipe.predict(X)
df = pd.DataFrame(mod1.cv_results_)
plt.scatter(pred, y) #pred instead of X
plt.title("Boston Housing Market")
plt.show()
The point is that cv_results_ is an attribute of a fitted GridSearchCV instance, while you have only fitted the pipeline (its base estimator). Therefore, you need to fit mod1 for it to work:
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
X, y = load_boston(return_X_y=True)
mod = KNeighborsRegressor().fit(X,y)
pipe = Pipeline([
    ("scale", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=3))
])
print(pipe.get_params())
mod1 = GridSearchCV(estimator=pipe,
                    param_grid={'model__n_neighbors': [1, 2, 3, 4, 5, 6, 7, 8, 9, 10]},
                    cv=3)
mod1.fit(X, y)
df = pd.DataFrame(mod1.cv_results_)
Be aware, though, that GridSearchCV fits clones of its base estimator and leaves the original untouched. Therefore, if you simply replace pipe.fit(X, y) with mod1.fit(X, y), you won't be able to call pipe.predict(X); predict through the search object instead.
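A minimal sketch (reusing the names above, and relying on GridSearchCV's default refit=True) of how to get predictions after mod1.fit(X, y):
# Predict through the search object; it delegates to the best pipeline,
# which was refitted on the full data because refit=True by default
pred = mod1.predict(X)
# or, equivalently, through the refitted best pipeline itself
pred = mod1.best_estimator_.predict(X)
plt.scatter(pred, y)
plt.title("Boston Housing Market")
plt.show()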
Related
I wrote this code:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import joblib
import shap
df = pd.read_csv('train.txt',sep='\t')
def create_model(X_train=full_X_train, y_train=full_y_train, model_name=SVC(kernel='linear'), n_splits=5, file_name='random_forest_with_hpo_with_fs_all_features_class'):
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    f1 = []
    count = 0
    for train_index, test_index in k_fold.split(X_train, y_train):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_train[train_index], y_train[test_index]
        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)
        save_mod = file_name + '.' + str(count) + '.fold.json'
        pickle.dump(clf, open(save_mod, 'wb'))
        f1.append(f1_score(y_test_fold, y_pred))
    return f1
def run_model_with_grid_search(model_name=RandomForestClassifier(), X_train=full_X_train, y_train=full_y_train, model_id='random_forest_with_hpo_with_fs_all_features_class', n_splits=5, output_file='random_forest_with_hpo_with_fs_all_features_class.txt', param_grid={}):
    param_grid = [{'random_forest_with_hpo_with_fs_all_features_class__bootstrap': [True, False],
                   'random_forest_with_hpo_with_fs_all_features_class__max_depth': [10, 20, 30, 40],
                   'random_forest_with_hpo_with_fs_all_features_class__n_estimators': [200, 500, 700]
                   }]
    pipe = Pipeline([('feature_selection', RFECV(estimator=RandomForestClassifier(), scoring='accuracy', step=1, cv=StratifiedKFold(5))),
                     ('random_forest_with_hpo_with_fs_all_features_class', RandomForestClassifier())])
    search = GridSearchCV(
        pipe,
        cv=5,
        param_grid=param_grid,
        scoring='accuracy',
        refit=True
    )
    fit_model = search.fit(X_train, y_train)
    print('Optimal number of features: ' + str(fit_model.best_estimator_.named_steps['feature_selection'].n_features_))
    return fit_model, fit_model.best_params_, fit_model.best_score_
fit_model,params,best_score = run_model_with_grid_search()
model = create_model(model_name=fit_model)
I get the warning 'X does not have valid feature names, but RFECV was fitted with feature names'. I can see this question asked elsewhere, e.g. here, but I can't understand how that answer applies here. Could someone point out which part of the code above leads to this warning (and, ideally, how to fix it)?
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import AdaBoostClassifier
from sklearn.datasets import make_classification
# generate dataset
X, y = make_classification(n_samples=100, n_features=2, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.50], flip_y=0, random_state=4)
abc = AdaBoostClassifier(base_estimator=DecisionTreeClassifier())
parameters = {'base_estimator__max_depth': [i for i in range(2, 11, 2)],
              'base_estimator__min_samples_leaf': [1, 2],
              'n_estimators': [10, 50, 250, 1000],
              'learning_rate': [0.01, 0.1]}
clf = GridSearchCV(abc, parameters,verbose=3,scoring='f1',n_jobs=-1)
clf.fit(X,y)
Here I am using GridSearchCV to tune AdaBoostClassifier as well as its base estimator, which is DecisionTreeClassifier. Can I use GridSearchCV to tune AdaBoostClassifier and more than one base estimator (DecisionTreeClassifier, SVC, ...) at the same time?
Specify a list of parameter grids, one per base estimator:
from sklearn.dummy import DummyClassifier
from sklearn.ensemble import AdaBoostClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier
parameters = [
    {'boost': [AdaBoostClassifier(base_estimator=DecisionTreeClassifier())],
     'boost__base_estimator__max_depth': (None, 10),
     'boost__learning_rate': (0.01, 0.1)},
    {'boost': [AdaBoostClassifier(base_estimator=SVC())],
     'boost__base_estimator__degree': (2, 3),
     'boost__learning_rate': (0.01, 0.1)},
]
boost_pipe = Pipeline([('boost', DummyClassifier())])
grid = GridSearchCV(boost_pipe, parameters)
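For completeness, a quick usage sketch on toy data (the make_classification call here is an assumption, not part of the original answer). Because the 'boost' step itself is listed as a parameter, each grid in the list swaps in a whole different base estimator:
from sklearn.datasets import make_classification

X, y = make_classification(n_samples=200, random_state=0)  # hypothetical toy data
grid.fit(X, y)
print(grid.best_params_)  # includes the winning 'boost' estimator and its settings
# Note: depending on your scikit-learn version, the SVC candidate may need
# SVC(probability=True), since AdaBoost's SAMME.R algorithm calls predict_proba.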
Can I ask: when I run this code, it produces output without error:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score, cross_val_predict,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBRegressor
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,classification_report
import pickle
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score
from sklearn.datasets import make_classification
#Generate fake data
X, y = make_classification(n_samples=5000, n_classes=2, n_features=20, n_redundant=0,random_state=0) #fake data
X_train = X[:4500] #.iloc for df
y_train = y[:4500]
X_test = X[4500:]#.reset_index(drop=True,inplace=True)
y_test = y[4500:]
scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}
def run_SVC(X_train, y_train, X_test, y_test, output_file, data_name, refit_score='precision_score'):
    '''
    run SVC algorithm, with CV and hyperparameter tuning.
    '''
    short_dataname = data_name.strip().split('/')
    file_model_name = output_file + '_svc_' + short_dataname[-1]
    clf = SVC()
    skf = StratifiedKFold(n_splits=2, random_state=42, shuffle=True)
    #fs = SelectKBest(score_func = mutual_info_classif)
    pipeline = Pipeline(steps=[('svc', clf)])  #,('sel',fs)
    print(pipeline.get_params().keys())
    search = GridSearchCV(
        pipeline,
        param_grid={
            'svc__C': [0.01, 0.1, 10, 1000],  ## Regularization
            'svc__gamma': [0.0001, 0.01, 1, 10],
            'svc__kernel': ['linear', 'rbf'],
        },
        return_train_score=True,
        verbose=3,
        refit=refit_score,
        scoring=scorers,
        cv=skf,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)
    # make the predictions
    y_pred = search.predict(X_test)
    print('Best params for {}'.format(refit_score))
    print(search.best_params_)
    print(classification_report(y_test, y_pred))  # labels=['neg','pos']
    return
print(run_SVC(X_train,y_train,X_test,y_test,'test.txt','dataset'))
When I uncomment the only two pieces that are commented out (the line fs = SelectKBest(score_func=mutual_info_classif) and the ('sel', fs) step on the line after it), I get the error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SVC()' (type <class 'sklearn.svm._classes.SVC'>) doesn't
I can see that other people have addressed this on SO before, e.g. here, so I tried to follow that answer, but my SelectKBest is already defined before my pipeline. When I move the line with fs higher in my code (which I thought was what the answer was saying), I get the same error.
Could someone show me where I'm going wrong and what I need to change to remove this error?
The order of the steps in a Pipeline matters: every intermediate step must be a transformer, and only the last step can be a non-transformer such as your svc. In your uncommented version the steps end up as [('svc', clf), ('sel', fs)], which puts the SVC in an intermediate slot and triggers exactly this error.
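A minimal sketch of the corrected order, keeping your step names so the param_grid keys still resolve:
fs = SelectKBest(score_func=mutual_info_classif)
clf = SVC()
# Transformer steps first, final estimator last
pipeline = Pipeline(steps=[('sel', fs), ('svc', clf)])
The grid keys 'svc__C', 'svc__gamma', and 'svc__kernel' keep working because the 'svc' step name is unchanged, and you could additionally tune 'sel__k' if you want.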
This is my code.
import pandas as pd
from sklearn.model_selection import train_test_split, cross_val_score
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.tree import DecisionTreeClassifier, plot_tree, export_graphviz, export_text
from sklearn.pipeline import Pipeline
from sklearn.metrics import confusion_matrix, classification_report, accuracy_score, roc_curve, auc, f1_score, roc_auc_score
import warnings; warnings.simplefilter('ignore')
data_files = 'dataset_for_learning_decision_tree.xlsx'
data = pd.read_excel(data_files)
train_data = data[['title','category','processed_title']]
categories=train_data['category']
labels=list(set(categories))
X_train, X_test, y_train, y_test = train_test_split(train_data['processed_title'],train_data['category'],test_size=0.2,random_state=57)
vectorizer = CountVectorizer()
X = vectorizer.fit_transform(X_train)
decisiontree=DecisionTreeClassifier()
model = Pipeline([('vect', vectorizer),
                  ('tfidf', TfidfTransformer()),
                  ('clf', decisiontree),
                  ])
model.fit(X_train,y_train)
predicted = model.predict(X_test)
confusion_matrix(y_test,predicted)
print('accuracy_score',accuracy_score(y_test,predicted))
print('Reporting...')
print(classification_report(y_test,predicted))
import numpy as np
from mlxtend.plotting import plot_decision_regions
X=np.array(X_train)
y=np.array(y_train)
plot_decision_regions(X=X,
                      y=y,
                      clf=model.named_steps['clf'])
I want to draw a decision-region plot with plot_decision_regions. However, when I execute this code, I get the error from the title. When I run it with y = y.astype(np.integer), I get errors such as ValueError: invalid literal for int() with base 10: 'depression'. How should I fix this?
Convert the class labels to integers first:
import numpy as np
from mlxtend.plotting import plot_decision_regions
X = np.array(X_train)
y = np.array(y_train)
d = {'addiction': 0, 'depression': 1, 'normal': 2}
y = np.array([d[label] for label in y])  # plot_decision_regions expects an integer array
plot_decision_regions(X=X,
                      y=y,
                      clf=model.named_steps['clf'])
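Alternatively (a sketch that is not part of the original answer), scikit-learn's LabelEncoder builds the string-to-integer mapping for you:
import numpy as np
from sklearn.preprocessing import LabelEncoder

le = LabelEncoder()
y = le.fit_transform(np.array(y_train))  # integer labels such as 0, 1, 2
# le.classes_ records which original string each integer stands for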
I'm trying to use SequentialFeatureSelector, and for the estimator parameter I'm passing it a pipeline that includes a step that imputes the missing values:
model = Pipeline(steps=[
    ('preprocessing', ColumnTransformer(transformers=[
        ('pipeline-1', Pipeline(steps=[
            ('imputing', SimpleImputer(fill_value=-1, strategy='constant')),
            ('preprocessing', StandardScaler())]),
         <sklearn.compose._column_transformer.make_column_selector object at 0x1300013d0>),
        ('pipeline-2', Pipeline(steps=[
            ('imputing', SimpleImputer(fill_value='missing', strategy='constant')),
            ('encoding', OrdinalEncoder(handle_unknown='ignore'))]),
         <sklearn.compose._column_transformer.make_column_selector object at 0x1300015b0>)])),
    ('model', LGBMClassifier(class_weight='balanced', random_state=1,
                             reg_lambda=0.1))])
Nonetheless, when I pass this to the selector it raises an error, which does not make sense to me, since I have already fit and evaluated this model and it runs fine:
fselector = SequentialFeatureSelector(estimator=model, scoring="roc_auc", cv=3, n_jobs=-1).fit(X, target)
_assert_all_finite(X, allow_nan, msg_dtype)
    101             not allow_nan and not np.isfinite(X).all()):
    102         type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103         raise ValueError(
    104                 msg_err.format(type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
EDIT:
Reproducible example:
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np

X, y = load_iris(return_X_y=True)
X[:10, 0] = np.NaN
clf = Pipeline([("preprocessing", SimpleImputer(missing_values=np.NaN)),
                ("model", LogisticRegression(random_state=1))])
SequentialFeatureSelector(estimator=clf,
                          scoring="accuracy",
                          cv=3).fit(X, y)
It raises the same error, even though clf on its own can be fitted without problems.
scikit-learn's documentation does not state that SequentialFeatureSelector works with pipeline objects; it only states that the class accepts an unfitted estimator. In view of this, you could remove the classifier from your pipeline, preprocess X, and then pass it along with an unfitted classifier for feature selection, as shown in the example below.
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
pipe = Pipeline([("preprocessing", SimpleImputer(missing_values= np.NaN)),
('scaler', MaxAbsScaler())])
# Preprocess your data
X = pipe.fit_transform(X)
# Run the SequentialFeatureSelector
sfs = SequentialFeatureSelector(estimator = LogisticRegression(),
scoring= "accuracy",
cv = 3).fit(X, y)
# Check which features are important and transform X
sfs.get_support()
X = sfs.transform(X)
You can use SequentialFeatureSelector from the mlxtend package, which does accept a pipeline as its estimator:
https://rasbt.github.io/mlxtend/
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([
    ("preprocessing", SimpleImputer(missing_values=np.NaN)),
    ("model", LogisticRegression(random_state=1))
])
sfs = SequentialFeatureSelector(estimator=clf,
                                forward=True,
                                k_features='best',
                                scoring="accuracy",
                                cv=3, n_jobs=-1).fit(X, y)
sfs.k_feature_idx_
>>> (0, 1, 2, 3)
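If you then want to reduce X to just the selected columns, mlxtend's selector also provides a transform() method (a short sketch):
X_selected = sfs.transform(X)  # keeps only the columns in sfs.k_feature_idx_
print(X_selected.shape)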