Related
I'm working through the load_boston() data for a scikit-learn tutorial, and I'm running into this attribute error:
AttributeError: 'GridSearchCV' object has no attribute 'cv_results_'
Does anyone know if this is a bug? I am using version 1.1.1 of scikit-learn.
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
# Report the installed scikit-learn version.
print(sklearn.__version__)
X, y = load_boston(return_X_y=True)
# Stand-alone KNN fit; `mod` is not used again in this script.
mod = KNeighborsRegressor().fit(X, y)
# Scale the features, then regress with k-nearest neighbours.
pipe = Pipeline([
("scale", StandardScaler()),
("model", KNeighborsRegressor(n_neighbors=3))
])
print(pipe.get_params())
# The grid search is only constructed here; it is never fitted in this script.
mod1 = GridSearchCV(estimator=pipe, param_grid={'model__n_neighbors': [1,2,3,4,5,6,7,8,9,10]},cv = 3)
# Only the pipeline itself is fitted and used for prediction.
pipe.fit(X, y)
pred = pipe.predict(X)
# This line raises the AttributeError quoted above: mod1.fit(...) was never called,
# so mod1 has no cv_results_ attribute yet.
df = pd.DataFrame(mod1.cv_results_)
plt.scatter(pred, y) #pred instead of X
plt.title("Boston Housing Market")
plt.show()
The point is that cv_results_ is an attribute of a fitted GridSearchCV instance, whereas you have only fitted the pipeline (its base estimator). You therefore need to fit mod1 for this to work.
import sklearn
from sklearn.datasets import load_boston
from sklearn.neighbors import KNeighborsRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.linear_model import LinearRegression
import matplotlib.pylab as plt
import pandas as pd
from sklearn.model_selection import cross_val_score
# Load the Boston housing features and targets.
X, y = load_boston(return_X_y=True)
mod = KNeighborsRegressor().fit(X, y)

# Scale first, then regress with k-nearest neighbours.
steps = [
    ("scale", StandardScaler()),
    ("model", KNeighborsRegressor(n_neighbors=3)),
]
pipe = Pipeline(steps)
print(pipe.get_params())

# Try k = 1..10 for the "model" step with 3-fold cross-validation.
neighbor_grid = {"model__n_neighbors": list(range(1, 11))}
mod1 = GridSearchCV(estimator=pipe, param_grid=neighbor_grid, cv=3)

# Fitting the search object (not the bare pipeline) is what creates cv_results_.
mod1.fit(X, y)
df = pd.DataFrame(mod1.cv_results_)
Be aware, though, that GridSearchCV.fit() does not fit the pipe object you passed in: it fits clones of it and returns the GridSearchCV instance itself. Therefore, you won't be able to call pipe.predict(X) if you simply replace pipe.fit(X, y) with mod1.fit(X, y); call mod1.predict(X) (or mod1.best_estimator_.predict(X)) instead.
I wrote this code:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.feature_selection import RFECV
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
import pickle
import pandas as pd
import joblib
import shap
# Load the tab-separated training table from the hard-coded path 'train.txt'.
# NOTE(review): df is not referenced again in this snippet, and full_X_train /
# full_y_train (used as defaults below) are not defined here; presumably they
# are derived from df. Confirm.
df = pd.read_csv('train.txt',sep='\t')
def create_model(X_train=full_X_train,y_train=full_y_train,model_name=SVC(kernel='linear'),n_splits=5,file_name='random_forest_with_hpo_with_fs_all_features_class'):
    """Cross-validate ``model_name`` and pickle the model fitted on each fold.

    Args:
        X_train: Feature table supporting positional ``.iloc`` row selection.
        y_train: Labels aligned row-for-row with ``X_train``.
        model_name: Estimator instance with ``fit``/``predict``; the same
            instance is re-fitted on every fold.
        n_splits: Number of stratified folds.
        file_name: Prefix for the per-fold files, written as
            ``<file_name>.<fold>.fold.json``.

    Returns:
        A list holding one F1 score per fold, in fold order.
    """
    clf = model_name
    k_fold = StratifiedKFold(n_splits=n_splits, random_state=42, shuffle=True)
    # split() yields positional indices, so select labels by position as well;
    # plain y_train[...] on a Series would look the indices up as labels.
    y_values = np.asarray(y_train)
    f1 = []
    # enumerate() numbers the folds so each one gets its own output file
    # instead of every fold overwriting '<file_name>.0.fold.json'.
    for count, (train_index, test_index) in enumerate(k_fold.split(X_train, y_train)):
        x_train_fold, x_test_fold = X_train.iloc[train_index], X_train.iloc[test_index]
        y_train_fold, y_test_fold = y_values[train_index], y_values[test_index]
        clf.fit(x_train_fold, y_train_fold)
        y_pred = clf.predict(x_test_fold)
        # NOTE: the payload is a pickle even though the name ends in .json.
        save_mod = file_name + '.' + str(count) + '.fold.json'
        with open(save_mod, 'wb') as handle:
            pickle.dump(clf, handle)
        f1.append(f1_score(y_test_fold, y_pred))
    return f1
def run_model_with_grid_search(model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_with_fs_all_features_class', n_splits=5, output_file='random_forest_with_hpo_with_fs_all_features_class.txt', param_grid={}):
    """Grid-search a feature-selection + random-forest pipeline.

    The pipeline runs RFECV (random-forest based, accuracy-scored, 5-fold
    stratified) followed by a RandomForestClassifier, and is tuned with a
    5-fold GridSearchCV scored on accuracy.

    Args:
        model_name: Currently unused; the pipeline always builds its own
            RandomForestClassifier instances.
        X_train: Training features.
        y_train: Training labels.
        model_id: Currently unused.
        n_splits: Currently unused; both CV levels use 5 folds.
        output_file: Currently unused.
        param_grid: Grid passed to GridSearchCV. When empty, a default grid
            over ``bootstrap``, ``max_depth`` and ``n_estimators`` of the
            final random-forest step is used.

    Returns:
        Tuple ``(fitted GridSearchCV, best_params_, best_score_)``.
    """
    # Honour a caller-supplied grid; fall back to the built-in one otherwise.
    if not param_grid:
        param_grid = [{
            'random_forest_with_hpo_with_fs_all_features_class__bootstrap': [True, False],
            'random_forest_with_hpo_with_fs_all_features_class__max_depth': [10, 20, 30, 40],
            'random_forest_with_hpo_with_fs_all_features_class__n_estimators': [200, 500, 700],
        }]
    pipe = Pipeline([
        ('feature_selection', RFECV(estimator=RandomForestClassifier(), scoring='accuracy', step=1, cv=StratifiedKFold(5))),
        ('random_forest_with_hpo_with_fs_all_features_class', RandomForestClassifier()),
    ])
    search = GridSearchCV(
        pipe,
        cv=5,
        param_grid=param_grid,
        scoring='accuracy',
        refit=True,
    )
    fit_model = search.fit(X_train, y_train)
    # The fitted selector lives inside the refitted best pipeline; n_features_
    # is an int, so it must be converted before string concatenation.
    selector = fit_model.best_estimator_.named_steps['feature_selection']
    print('Optimal number of features: ' + str(selector.n_features_))
    return fit_model, fit_model.best_params_, fit_model.best_score_
# Run the grid search with all defaults and unpack its three return values.
fit_model,params,best_score = run_model_with_grid_search()
# The fitted search object is passed on as the estimator to cross-validate;
# create_model returns the per-fold F1 list, so `model` holds scores, not an estimator.
model = create_model(model_name=fit_model)
I get the warning 'X does not have valid feature names, but RFECV was fitted with feature names'. I can see this question asked elsewhere (e.g. here), but I can't understand how that answer would apply here. Could someone point out which part of the code above leads to this warning (and, ideally, how to fix it)?
I have a data set like this, it's 343 columns of binary data, and it is sparsely encoded (i.e. there are many more 0s than 1s):
column1 ... column343
0 0 ... 0
1 0 ... 0
2 0 ... 0
3 0 ... 0
4 0 ... 0
.. ... ... ...
214 0 ... 0
215 0 ... 0
216 0 ... 0
217 0 ... 0
218 0 ... 0
[219 rows x 343 columns]
(219, 343)
Could someone please explain to me how to fix the issue where this script:
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
# Load the tab-separated training table; the path is hard-coded.
df = pd.read_csv('train.txt',sep='\t') #hard-coded
# 'Event' is the target column; every remaining column is used as a feature.
full_y_train = df['Event']
df = df.drop(['Event'],axis=1)
full_X_train = df
# Nested cross-validation: an outer 5-fold KFold supplies held-out rows, an inner
# 3-fold GridSearchCV (scored on roc_auc) tunes the model on the remaining rows, and
# SHAP values are collected for each outer test fold and summarised per column.
# NOTE(review): output_plt_file, model_id, n_splits and output_file are not used in
# the body shown; the fold counts are hard-coded as 5 and 3.
def run_model_with_grid_search(param_grid={}, output_plt_file = 'plt.png',model_name=RandomForestClassifier(),X_train=full_X_train,y_train=full_y_train,model_id='random_forest_with_hpo_no_fs_geno_class', n_splits=5, output_file='random_forest_with_hpo_no_fs_geno_class.txt'):
list_shap_values = list()
list_test_sets = list()
cv_outer = KFold(n_splits=5,shuffle=True,random_state=1)
for train_ix,test_ix in cv_outer.split(X_train):
# Rows are selected by position for both features and labels.
split_x_train, split_x_test = X_train.iloc[train_ix,:],X_train.iloc[test_ix,:]
split_y_train, split_y_test = y_train.iloc[train_ix],y_train.iloc[test_ix]
model = model_name
cv_inner = KFold(n_splits=3,shuffle=True,random_state=1)
search = GridSearchCV(model,param_grid=param_grid,scoring='roc_auc',cv=cv_inner,refit=True)
result = search.fit(split_x_train,split_y_train)
best_model = result.best_estimator_
# yhat is computed but not used afterwards.
yhat = best_model.predict(split_x_test)
explainer = shap.TreeExplainer(result.best_estimator_)
shap_values = explainer.shap_values(split_x_test,check_additivity=False)
# Keep the SHAP values together with the positional test indices they belong to.
list_shap_values.append(shap_values)
list_test_sets.append(test_ix)
# Stitch the per-fold test indices and SHAP arrays back together.
test_set = list_test_sets[0]
shap_values = np.array(list_shap_values[0])
for i in range(1,len(list_test_sets)):
test_set = np.concatenate((test_set,list_test_sets[i]),axis=0)
# NOTE(review): axis=1 assumes each array is laid out (class, sample, feature) - confirm
# for the installed shap version.
shap_values = np.concatenate((shap_values,np.array(list_shap_values[i])),axis=1)
# NOTE(review): test_set holds positional row indices, but DataFrame[...] with a list
# selects columns by label - this is the line the KeyError in the traceback below
# points at; positional row selection needs .iloc.
X_test_df = pd.DataFrame(full_X_train[test_set])
cols = X_test_df.columns
# Mean absolute SHAP value per column, taken from index 1 of the first axis.
shap_sum = np.abs(shap_values[1,:,:]).mean(0)
importance_df = pd.DataFrame({
'column_name':cols,
'shap_values':shap_sum
})
print(importance_df)
# Bare return: the function prints the table and returns None.
return
# Single-entry grid: only min_samples_leaf is tuned.
param_grid = [{
'min_samples_leaf':[1,3,5],
}]
# All other arguments fall back to the function defaults.
run_model_with_grid_search(param_grid=param_grid)
Generates the error:
Traceback (most recent call last):
File "/home/data/ml_models_genotypic_only_fortest.py", line 103, in <module>
run_model_with_grid_search(param_grid=param_grid)
File "/home/data/ml_models_genotypic_only_fortest.py", line 80, in run_model_with_grid_search
X_test_df = pd.DataFrame(full_X_train[test_set])
File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/frame.py", line 3464, in __getitem__
indexer = self.loc._get_listlike_indexer(key, axis=1)[1]
File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/indexing.py", line 1314, in _get_listlike_indexer
self._validate_read_indexer(keyarr, indexer, axis)
File "/home/apps/easybuild/software/SciPy-bundle/2021.10-foss-2021b/lib/python3.9/site-packages/pandas/core/indexing.py", line 1374, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Int64Index([ 0, 4, 11, 16, 18, 19, 28, 29, 31, 33,\n ...\n 156, 157, 175, 178, 192, 203, 204, 207, 211, 215],\n dtype='int64', length=219)] are in the [columns]"
I do not get this error if I remove check_additivity=False from the script; however, without the check_additivity parameter I get a different error:
shap.utils._exceptions.ExplainerError: Additivity check failed in TreeExplainer! Please ensure the data matrix you pass to the explainer is the same data shape that the model was trained on. If your data shape is correct, then please report this on GitHub.
Consider retrying with the feature perturbation=interventional option. This check failed because for one of the samples, the sum of the SHAP values is 0.908553, while the model output was 0.940000. If this difference is acceptable, you can set check_additivity=False to disable this check.
If I replace my data set with a fake data set:
full_X_train,full_y_train = make_classification(n_samples =500,n_features = 20, random_state=1, n_informative=10,n_redundant=10)
With this fake data set, I do not get the error.
So, for my real data, leaving check_additivity=False in the script gives me one error and taking it out gives me another, and I'm not sure how to get around this.
It's hard to debug your code because it's not reproducible, but you can follow this code snippet, which "just runs":
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold,KFold
from sklearn.feature_selection import SelectKBest
from sklearn.datasets import load_breast_cancer
#from xgboost import XGBClassifier
from sklearn.feature_selection import mutual_info_classif
from sklearn.feature_selection import SelectKBest, RFECV
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC
from sklearn.ensemble import RandomForestClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import make_scorer, recall_score, accuracy_score, precision_score
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score,f1_score,roc_auc_score
from sklearn import metrics
from sklearn.datasets import make_classification
from numpy import mean
from sklearn.model_selection import train_test_split
from numpy import std
from sklearn.utils import shuffle
import numpy as np
from sklearn.metrics import roc_curve
from sklearn.pipeline import Pipeline
import matplotlib.pyplot as plt
import pickle
#import neptune.new as neptune
import pandas as pd
import shap
# A public dataset stands in for the asker's file; as_frame=True requests pandas
# objects, which the .iloc / .columns accesses below rely on.
full_X_train, full_y_train = load_breast_cancer(return_X_y=True, as_frame=True)
def run_model_with_grid_search(
    param_grid={},
    output_plt_file="plt.png",
    model_name=RandomForestClassifier(),
    X_train=full_X_train,
    y_train=full_y_train,
    model_id="random_forest_with_hpo_no_fs_geno_class",
    n_splits=5,
    output_file="random_forest_with_hpo_no_fs_geno_class.txt",
):
    """Rank features by mean absolute SHAP value under nested cross-validation.

    For every outer fold, a GridSearchCV (3-fold, scored on ROC AUC) tunes
    ``model_name`` on the training rows; the refitted best model is then
    explained with ``shap.TreeExplainer`` on the held-out rows.

    Args:
        param_grid: Hyperparameter grid forwarded to GridSearchCV.
        output_plt_file: Currently unused.
        model_name: Tree-based estimator to tune; GridSearchCV clones it, so
            the passed instance itself is not fitted.
        X_train: Feature DataFrame.
        y_train: Label Series aligned row-for-row with ``X_train``.
        model_id: Currently unused.
        n_splits: Number of outer folds.
        output_file: Currently unused.

    Returns:
        DataFrame with columns ``column_name`` and ``shap_values`` (the mean
        absolute SHAP value of each feature over all held-out rows).
    """
    list_shap_values = []
    cv_outer = KFold(n_splits=n_splits, shuffle=True, random_state=1)
    for train_ix, test_ix in cv_outer.split(X_train):
        # split() yields positions, so both frames and labels are sliced with
        # .iloc; y_train[train_ix] would treat the positions as index labels.
        split_x_train = X_train.iloc[train_ix, :]
        split_x_test = X_train.iloc[test_ix, :]
        split_y_train = y_train.iloc[train_ix]

        cv_inner = KFold(n_splits=3, shuffle=True, random_state=1)
        search = GridSearchCV(
            model_name, param_grid=param_grid, scoring="roc_auc", cv=cv_inner, refit=True
        )
        result = search.fit(split_x_train, split_y_train)

        explainer = shap.TreeExplainer(result.best_estimator_)
        list_shap_values.append(
            explainer.shap_values(split_x_test, check_additivity=False)
        )

    # assumes shap_values() returns one (rows, features) array per class, with
    # index 1 being the positive class - TODO confirm for the installed shap.
    shap_values = np.vstack([sv[1] for sv in list_shap_values])
    # Take |.| before averaging: averaging first lets positive and negative
    # contributions cancel and understates every feature's importance.
    sv = np.abs(shap_values).mean(0)
    cols = X_train.columns
    importance_df = pd.DataFrame({"column_name": cols, "shap_values": sv})
    return importance_df
# Tune only min_samples_leaf; every other argument uses the function defaults.
param_grid = [{"min_samples_leaf": [1, 3, 5],}]
importance_df = run_model_with_grid_search(param_grid=param_grid)
print(importance_df)
column_name shap_values
0 mean radius 0.000202
1 mean texture 0.000585
2 mean perimeter 0.000728
3 mean area 0.000541
4 mean smoothness 0.000867
5 mean compactness 0.000098
6 mean concavity 0.000759
7 mean concave points 0.003325
8 mean symmetry 0.000033
9 mean fractal dimension 0.000349
...
Note: the above code runs on my machine with the check_additivity parameter set to either True or False.
When I run this code, it produces output without error:
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import PolynomialFeatures
from sklearn.decomposition import PCA
from sklearn.preprocessing import scale
from sklearn.model_selection import cross_val_score, cross_val_predict,cross_validate
from sklearn.model_selection import KFold
from sklearn.model_selection import GridSearchCV
from sklearn.feature_selection import chi2, f_regression
from sklearn.model_selection import train_test_split
from sklearn.model_selection import RepeatedKFold
from sklearn.model_selection import StratifiedKFold
from sklearn.feature_selection import SelectKBest
#from xgboost import XGBRegressor
from sklearn.feature_selection import f_regression
from sklearn.feature_selection import mutual_info_regression
from sklearn.feature_selection import mutual_info_classif
from sklearn import metrics
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import GradientBoostingRegressor
from sklearn.feature_selection import SelectKBest
from sklearn.neighbors import KNeighborsRegressor
from sklearn.linear_model import BayesianRidge
from sklearn.pipeline import Pipeline
from scipy.stats import spearmanr
from sklearn.svm import SVR
from sklearn.svm import SVC
from sklearn.metrics import roc_curve, precision_recall_curve, auc, make_scorer, recall_score, accuracy_score, precision_score, confusion_matrix,classification_report
import pickle
import numpy as np
from sklearn.metrics import precision_recall_fscore_support
from sklearn.metrics import make_scorer
from sklearn.metrics import precision_score,recall_score
from sklearn.datasets import make_classification
#Generate fake data
# Synthetic binary-classification data: 5000 rows, 20 features, none redundant.
X, y = make_classification(
    n_samples=5000, n_classes=2, n_features=20, n_redundant=0, random_state=0
)  # fake data

# First 4500 rows are for training, the rest for testing (use .iloc for a DataFrame).
X_train, X_test = X[:4500], X[4500:]
y_train, y_test = y[:4500], y[4500:]

# Metrics reported by the grid search; each key is also a valid `refit` name.
scorers = {
    name: make_scorer(metric)
    for name, metric in (
        ('precision_score', precision_score),
        ('recall_score', recall_score),
        ('accuracy_score', accuracy_score),
    )
}
def run_SVC(X_train, y_train, X_test, y_test,output_file,data_name,refit_score='precision_score'):
    '''
    Run the SVC algorithm with cross-validated hyperparameter tuning.

    A single-step pipeline wrapping SVC is tuned with GridSearchCV over C,
    gamma and kernel, using 2-fold stratified CV and the module-level
    ``scorers`` dict. The best model (chosen by ``refit_score``) predicts
    ``X_test``, and the best parameters and a classification report are
    printed.

    X_train, y_train: data the grid search is fitted on.
    X_test, y_test: held-out data used for the printed report.
    output_file, data_name: only used to build ``file_model_name``.
    refit_score: key of ``scorers`` used to pick and refit the best model.

    Returns None.
    '''
    short_dataname = data_name.strip().split('/')
    # Built from the arguments but not used further in this function.
    file_model_name = output_file + '_svc_' + short_dataname[-1]

    clf = SVC()
    skf = StratifiedKFold(n_splits=2,random_state=42,shuffle=True)
    #fs = SelectKBest(score_func = mutual_info_classif)
    pipeline = Pipeline(steps=[('svc',clf)]) #,('sel',fs)
    print(pipeline.get_params().keys())

    search = GridSearchCV(
        pipeline,
        param_grid={
            'svc__C': [0.01, 0.1, 10, 1000], ##Regularization
            'svc__gamma': [0.0001, 0.01, 1, 10],
            'svc__kernel':['linear','rbf'],
        },
        return_train_score=True,
        verbose=3,
        refit=refit_score,
        scoring=scorers,
        cv=skf,
        n_jobs=-1,
    )
    search.fit(X_train, y_train)

    # make the predictions
    y_pred = search.predict(X_test)
    print('Best params for {}'.format(refit_score))
    print(search.best_params_)
    print(classification_report(y_test,y_pred)) #labels=['neg','pos']
    return
# run_SVC ends with a bare `return`, so this outer print emits "None" after the report.
print(run_SVC(X_train,y_train,X_test,y_test,'test.txt','dataset'))
When I uncomment the only two commented-out pieces (#fs = SelectKBest(score_func = mutual_info_classif), and the ,('sel',fs) in the line after it), I get the error:
TypeError: All intermediate steps should be transformers and implement fit and transform or be the string 'passthrough' 'SVC()' (type <class 'sklearn.svm._classes.SVC'>) doesn't
I can see that other people have addressed this on SO before, e.g. here, so I tried to follow that person's answer, but my SelectKBest is already before my pipeline - when I move the line with 'fs' to be higher in my code (which I thought was what the answer was saying), I get the same error.
Could someone show me where I'm going wrong here and what I'm meant to change to remove this error?
The order of the steps in a Pipeline matters, and only the last step can be a non-transformer like your svc. Put the selector first and the classifier last, e.g. Pipeline(steps=[('sel', fs), ('svc', clf)]).
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn import linear_model
from sklearn.metrics import confusion_matrix
from sklearn import metrics
from sklearn.preprocessing import LabelEncoder
from google.colab import files
# Upload the file through Colab; the returned value is overwritten on the next line.
df = files.upload()
# df is rebound to the plain string 'Dataset.csv' here, so no DataFrame exists yet.
df='Dataset.csv'
# Raises the AttributeError shown below: df is a str, and str has no dropna().
df=df.dropna()
AttributeError Traceback (most recent call last)
in ()
----> 1 df=df.dropna()
AttributeError: 'str' object has no attribute 'dropna'
You are not loading the file as a DataFrame; you are just assigning the file name to df. Use this instead:
# Read the CSV into a DataFrame, then drop every row containing a missing value.
df = pd.read_csv('Dataset.csv').dropna()