GridSearchCV gives different results than my manual tuning procedure - python

I get different results when I run a grid search with sklearn's GridSearchCV versus tuning the same parameters manually.
The first code block is my GridSearchCV procedure:
from sklearn import ensemble
from sklearn import metrics
from sklearn import model_selection
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X = folded_train.drop(columns=["10_fold", "class_encoded"])
y = folded_train["class_encoded"]
ten_fold = folded_train["10_fold"]

logo = LeaveOneGroupOut()
cross_val_groups = logo.split(X, y, ten_fold)

classifier = Pipeline([("sampling", RandomUnderSampler()),
                       ("classifier", ensemble.RandomForestClassifier(n_jobs=-1))])

param_grid = {
    "classifier__n_estimators": [100, 200, 300, 400, 600],
    "classifier__max_depth": [1, 3, 5, 7],
    "classifier__criterion": ["gini", "entropy"]
}

model = model_selection.GridSearchCV(
    estimator=classifier,
    param_grid=param_grid,
    scoring="roc_auc",
    verbose=10,
    n_jobs=1,
    cv=cross_val_groups
)
model.fit(X, y)
And I am trying to do the same procedure manually. Here is my code:
from sklearn import ensemble
from sklearn import metrics
from sklearn.model_selection import LeaveOneGroupOut
from imblearn.pipeline import Pipeline
from imblearn.under_sampling import RandomUnderSampler

X = folded_train.drop(columns=["10_fold", "class_encoded"])
y = folded_train["class_encoded"]
ten_fold = folded_train["10_fold"]

number_of_estimators = [100, 200, 300]
maximum_depths = [1, 3, 5, 7]
criterions = ["gini", "entropy"]

logo = LeaveOneGroupOut()
for criterion in criterions:
    for max_depth in maximum_depths:
        for n_of_estimator in number_of_estimators:
            for train_index, val_index in logo.split(X, y, ten_fold):
                aPipeline = Pipeline(steps=[
                    ("sampling", RandomUnderSampler()),
                    ("classifier", ensemble.RandomForestClassifier(criterion=criterion,
                                                                   max_depth=max_depth,
                                                                   n_estimators=n_of_estimator,
                                                                   n_jobs=-1))])
                X_trn, X_vl = X.iloc[train_index], X.iloc[val_index]
                y_trn, y_vl = y.iloc[train_index], y.iloc[val_index]
                aPipeline.fit(X_trn, y_trn)
                predictions = aPipeline.predict(X_vl)
                print("Criterion", criterion, "Max Depth", max_depth,
                      "Number of estimators", n_of_estimator,
                      "score", metrics.roc_auc_score(y_vl, predictions))
For sklearn GridSearchCV, I obtained the following scores (roc_auc) for criterion = "gini", max_depth = 1 and n_estimators = 100:
[0.786, 0.799, 0.789, 0.796, 0.775, 0.776, 0.779, 0.788, 0.770, 0.769] for each cv iteration
And my manual execution with the same parameters gives:
[0.730, 0.749, 0.714, 0.710, 0.732, 0.724, 0.711, 0.724, 0.715, 0.734]
This discrepancy holds for the other parameter combinations too. What factors could lead to this situation?
Note: I found this, but it does not answer my problem: Why GridSearchCV model results are different than the model I manually tuned?
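One factor worth checking (an observation about the code above, not something from the linked post): with scoring = "roc_auc", GridSearchCV computes AUC from the continuous scores returned by predict_proba, while the manual loop passes the hard 0/1 labels from predict to metrics.roc_auc_score, which systematically yields lower AUC values. The unseeded RandomUnderSampler and RandomForestClassifier add further run-to-run variation. A minimal sketch of the manual scoring step adjusted to match the scorer (assuming the positive class is encoded as 1):
probabilities = aPipeline.predict_proba(X_vl)[:, 1]  # class-1 scores, as the "roc_auc" scorer uses internally
print("score", metrics.roc_auc_score(y_vl, probabilities))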

Related

How to get the support_ values from RFE pipeline?

I created a Pipeline with RFE and RandomForestClassifer in it and then applied RandomizedSearchCV to find the best hyperparameter values for both. This is what my code looks like -
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

steps = [
    ("rfe", RFE(estimator=RandomForestClassifier(random_state=42))),
    ("est", RandomForestClassifier())
]
rf_clf_pl = Pipeline(steps=steps)
params = {
    "rfe__n_features_to_select": range(2, smote_X_train.shape[1] + 1),
    "est__random_state": np.linspace(0, 42, 5).astype(int),
    "est__n_estimators": range(50, 201, 10),
    "est__max_depth": [None] + list(range(5, max_depth, 3)),
    "est__max_leaf_nodes": [None] + list(range(100, max_leaf_nodes, 20))
}
rs = RandomizedSearchCV(estimator=rf_clf_pl, cv=4, param_distributions=params,
                        n_jobs=-1, n_iter=100, random_state=42)
rs.fit(smote_X_train, smote_y_train)
I tried using the code below but got an error -
rf_clf_pl.named_steps["rfe"].support_
Error -
AttributeError Traceback (most recent call last)
<ipython-input-53-c73290f0e090> in <module>()
----> 1 rf_clf_pl.named_steps["rfe"].support_
AttributeError: 'RFE' object has no attribute 'support_'
How can I get the names of the retained features?
You can access the retained features of the best estimator as follows:
rs.best_estimator_.named_steps['rfe'].support_
Namely, you should access the best_estimator_ attribute of the fitted RandomizedSearchCV instance, i.e. the pipeline re-fitted with the best found hyperparameters (thanks to the default parameter refit=True of RandomizedSearchCV).
The way you were trying to access support_ does not work because you never explicitly fitted the pipeline instance itself, and a fitted RandomizedSearchCV does not expose the base estimators it fits during the search (despite calling .fit() on them), with the exception of best_estimator_ as described above.
Here's an example:
import numpy as np
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split

iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
steps = [
    ("rfe", RFE(estimator=RandomForestClassifier(random_state=42))),
    ("est", RandomForestClassifier())
]
rf_clf_pl = Pipeline(steps=steps)
params = {
    "rfe__n_features_to_select": range(2, X_train.shape[1] + 1),
    "est__random_state": np.linspace(0, 42, 5).astype(int),
    "est__n_estimators": range(50, 201, 10),
    "est__max_depth": [None] + list(range(5, 16, 3)),
    "est__max_leaf_nodes": [None] + list(range(100, 201, 20))
}
rs = RandomizedSearchCV(estimator=rf_clf_pl, cv=4, param_distributions=params,
                        n_jobs=-1, n_iter=100, random_state=42)
rs.fit(X_train, y_train)
rs.best_estimator_.named_steps['rfe'].support_
Finally, if you want the explicit names of the retained features, you can retrieve them via rs.feature_names_in_[np.where(rs.best_estimator_.named_steps['rfe'].support_)[0]].
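For instance, with the iris pipeline above (a small usage sketch; support_ is a boolean mask over the input columns):
import numpy as np

mask = rs.best_estimator_.named_steps["rfe"].support_
print(rs.feature_names_in_[np.where(mask)[0]])
# since X_train is a DataFrame, boolean indexing on its columns is equivalent:
print(X_train.columns[mask].tolist())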

Increasing SVM model accuracy using hyperparameters

I'm building a language detection model based on how letters appear within a single word rather than how words appear in a sentence, so the model is expected to predict the language of one word at a time. The languages in my training data are English, Hungarian and Latin. The data is arranged so that each word is on its own line.
I first tried logistic regression (the LR pipeline below) and the accuracy was around 80%, so I tried an SVM instead. After that, I did a grid search, and the results weren't significantly improved.
I need the accuracy to be around 96%. Should I consider a different model, or can this one be improved?
You can find the data files [Here][1].
[Google_Colab_notebook][2]
Here are the confusion matrices: first SVM [SVM][3], first logistic regression [LR][4], optimized SVM [Optimized SVM][5].
import string
import re
import codecs
import numpy as np
import pandas as pd
import seaborn as sns
from sklearn import metrics
from sklearn.metrics import classification_report
from sklearn import linear_model
from sklearn import pipeline
from sklearn import feature_extraction
from sklearn.model_selection import train_test_split
from google.colab import drive
import matplotlib.pyplot as plt
from sklearn.metrics import plot_confusion_matrix
import time
from sklearn.utils import resample
from sklearn import svm
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
#load the drive
drive.mount('/content/gdrive/',force_remount=True)
#load raw training files
hun_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/hun_words.txt", encoding="utf-8", header=None, names=["Hungarian"])
eng_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/eng_words.txt", encoding="utf-8", header=None, names=["English"])
lat_df = pd.read_csv("/content/gdrive/MyDrive/language_detection_files/lat_words.txt", encoding="utf-8", header=None, names=["Latin"])
#regular expression to clean the data
string_pattern = r'\W|\d+|[^\x00-\x7F\x80-\xFF\u0100-\u017F\u0180-\u024F\u1E00-\u1EFF]'
regex_pattern = re.compile(string_pattern) # compile string pattern to re.Pattern object
#natural language processing toolkit
!pip install nltk
import nltk
nltk.download('stopwords')
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize
stop_words = set(stopwords.words('hungarian'))
#cleaning and putting the words into a list
data_hun = []
lang_hun = []
for i, line in hun_df.iterrows():
    line = line['Hungarian']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)
    if line not in stop_words and 1 < len(line) < 14:  # trying to downsample to balance the classes
        data_hun.append(line)
        lang_hun.append("Hungarian")  # len(data_hun) is 254211

stop_words = set(stopwords.words('english'))
data_eng = []
lang_eng = []
for i, line in eng_df.iterrows():
    line = line['English']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)  # consider compiling the regex in a different cell
    if line not in stop_words and 1 < len(line) < 14:  # trying to downsample to balance the classes
        data_eng.append(line)
        lang_eng.append("English")  # len(data_eng) 336610

data_lat = []
lang_lat = []
for i, line in lat_df.iterrows():
    line = line['Latin']
    line = line.lower()
    line = re.sub(regex_pattern, '', line)
    if 1 < len(line) < 14:
        data_lat.append(line)
        lang_lat.append("Latin")  # len(data_lat) 185249
number_of_samples = 10000
#downsample and balance data
lat = resample(data_lat, replace=True, n_samples=number_of_samples, random_state=42)
hun = resample(data_hun, replace=True, n_samples=number_of_samples, random_state=42)
eng = resample(data_eng, replace=True, n_samples=number_of_samples, random_state=42)
lang_lat = lang_lat[0:number_of_samples]
lang_hun = lang_hun[0:number_of_samples]
lang_eng = lang_eng[0:number_of_samples]
df = pd.DataFrame({"Text": eng + hun + lat,
                   "Language": lang_eng + lang_hun + lang_lat})
print(df.shape)
# splitting the data into train and test sets
X, y = df.iloc[:, 0], df.iloc[:, 1]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
print(X_train.shape)
print(X_test.shape)
print(y_train.shape)
print(y_test.shape)
# vectorize the cleaned text; every character n-gram (1-5) becomes a feature
vectorizer = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
LR_pipeline = pipeline.Pipeline([
    ('vectorizer', vectorizer),
    ('clf', linear_model.LogisticRegression(solver='newton-cg', max_iter=1000))  # {'newton-cg', 'lbfgs', 'sag', 'saga'}
])
# logistic regression model: prepare and fit
start = time.time()
LR_pipeline.fit(X_train, y_train)
end = round(time.time() - start, 2)
end
y_predicted = LR_pipeline.predict(X_test)
acc = (metrics.accuracy_score(y_test, y_predicted)) * 100
print(acc, '%')
# logistic regression model metrics
print(classification_report(y_test, y_predicted))
#prepare data for the SVM model and fit it
Tfidf_vect = feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')
Tfidf_vect.fit(df.iloc[:, 0])  # note: fitting the vectorizer on all of df leaks test-set vocabulary/IDF statistics into training
X_train_Tfidf = Tfidf_vect.transform(X_train)
X_test_Tfidf = Tfidf_vect.transform(X_test)
SVM = svm.SVC(C=1.0, kernel='linear', degree=3, gamma='auto', verbose=True)
start = time.time()
SVM.fit(X_train_Tfidf, y_train)  # predict the labels on the validation dataset
end = round(time.time() - start, 2)
end
y_predicted = SVM.predict(X_test_Tfidf)
acc = (metrics.accuracy_score(y_test, y_predicted)) * 100
print(acc, '%')
#SVM classification metrics
print(classification_report(y_test, y_predicted))
# the two confusion matrices side by side
plot_confusion_matrix(SVM, X_test_Tfidf, y_test)
plot_confusion_matrix(LR_pipeline, X_test, y_test)
plt.show()
# defining parameter ranges to perform a grid search and optimize hyperparameters
# note: 'gamma' has no effect when kernel='linear', so those grid points are redundant
param_grid = {'C': [100, 1000],  # narrowed from [0.1, 1, 10, 100, 1000]
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
# fitting the model for grid search
grid.fit(X_train_Tfidf, y_train)
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['linear']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
# fitting the model for grid search
grid.fit(X_train_Tfidf, y_train)
# defining parameter range
param_grid = {'C': [0.1, 1, 10, 100, 1000],
              'gamma': [1, 0.1, 0.01, 0.001, 0.0001],
              'kernel': ['rbf']}
grid = GridSearchCV(SVC(), param_grid, refit=True, verbose=3)
# fitting the model for grid search
grid.fit(X_train_Tfidf, y_train)
# print the best parameters after tuning
print(grid.best_params_)
# print how our model looks after hyperparameter tuning
print(grid.best_estimator_)
'''
best parameters
{'C': 1, 'gamma': 1, 'kernel': 'linear'}
SVC(C=1, gamma=1, kernel='linear')
'''
SVM_opt = svm.SVC(C=1, kernel='linear', degree=3, gamma=1, verbose=True)
start = time.time()
SVM_opt.fit(X_train_Tfidf, y_train)
end = round(time.time() - start, 2)
end
y_predicted = SVM_opt.predict(X_test_Tfidf)
acc = (metrics.accuracy_score(y_test, y_predicted)) * 100
print(acc, '%')
# metrics and confusion matrix
print(classification_report(y_test, y_predicted))
plot_confusion_matrix(SVM_opt, X_test_Tfidf, y_test)
plt.show()
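Since every grid above settles on kernel='linear', one option worth trying on this kind of large, sparse char-n-gram data (a suggestion, not from the original post) is sklearn's LinearSVC, which fits the same family of linear models much faster and makes a wider C search practical. A minimal sketch, reusing X_train and y_train from above and keeping the vectorizer inside the pipeline so each CV fold computes IDF statistics from its own training split only:
from sklearn import feature_extraction, pipeline, svm
from sklearn.model_selection import GridSearchCV

svc_pipeline = pipeline.Pipeline([
    ('vectorizer', feature_extraction.text.TfidfVectorizer(ngram_range=(1, 5), analyzer='char')),
    ('clf', svm.LinearSVC())
])
param_grid = {'clf__C': [0.01, 0.1, 1, 10, 100]}
grid = GridSearchCV(svc_pipeline, param_grid, cv=3, verbose=3)
grid.fit(X_train, y_train)  # raw word strings, not pre-vectorized matrices
print(grid.best_params_, grid.best_score_)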
[1]: https://drive.google.com/drive/folders/1nKINXfDG4IbEw4YlUzGTaRpJh4CFwGlC?usp=sharing
[2]: https://colab.research.google.com/drive/1FJNd7vXcHnrNZYz9aM56U1TalhVBCvO7?usp=sharing
[3]: https://i.stack.imgur.com/SeDh7.png
[4]: https://i.stack.imgur.com/Ksb7m.png
[5]: https://i.stack.imgur.com/Iw35J.png

How can I get param names for TargetEncoder in a grid search?

I have the below scenario:
preprocess = make_column_transformer(
    (SimpleImputer(strategy='constant', fill_value=0), numeric_cols),
    (ce.TargetEncoder(), ['country'])
)
pipeline = make_pipeline(preprocess, XGBClassifier())
pipeline[0].get_params().keys()
dict_keys(['n_jobs', 'remainder', 'sparse_threshold', 'transformer_weights', 'transformers', 'verbose', 'simpleimputer', 'targetencoder', 'simpleimputer__add_indicator', 'simpleimputer__copy', 'simpleimputer__fill_value', 'simpleimputer__missing_values', 'simpleimputer__strategy', 'simpleimputer__verbose', 'targetencoder__cols', 'targetencoder__drop_invariant', 'targetencoder__handle_missing', 'targetencoder__handle_unknown', 'targetencoder__min_samples_leaf', 'targetencoder__return_df', 'targetencoder__smoothing', 'targetencoder__verbose'])
I then wish to do a grid search on the smoothing factor:
param_grid = {
    'xgbclassifier__learning_rate': [0.01, 0.005, 0.001],
    'targetencoder__smoothing': [1, 10, 30, 50]
}
pipeline = make_pipeline(preprocess, XGBClassifier())
# initialize the grid search model
clf = GridSearchCV(pipeline, param_grid=param_grid, scoring='neg_mean_squared_error',
                   verbose=1, iid=True, refit=True, cv=3)
clf.fit(X_train, y_train)
However, I get this error:
ValueError: Invalid parameter transformer_targetencoder for estimator Pipeline(steps=[('columntransformer',
ColumnTransformer(transformers...
How can I access the smoothing parameter?
Using your example, it will be columntransformer__targetencoder__smoothing. To reproduce the pipeline, first I use an example dataset and define the columns:
import numpy as np
import pandas as pd
from sklearn.compose import make_column_transformer
from sklearn.pipeline import make_pipeline
from sklearn.impute import SimpleImputer
import category_encoders as ce
from xgboost import XGBClassifier
from sklearn.model_selection import GridSearchCV

X_train = pd.DataFrame({'x1': np.random.normal(0, 1, 50),
                        'x2': np.random.normal(0, 1, 50),
                        'country': np.random.choice(['A', 'B', 'C'], 50)})
y_train = np.random.binomial(1, 0.5, 50)
numeric_cols = ['x1', 'x2']
preprocess = make_column_transformer(
    (SimpleImputer(strategy='constant', fill_value=0), numeric_cols),
    (ce.TargetEncoder(), ['country'])
)
pipeline = make_pipeline(preprocess, XGBClassifier())
You should look at the keys at a higher level:
pipeline.get_params().keys()
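Since that returns a long list of keys, a quick filter (a small sketch) narrows it down to the parameter you are after:
# list every parameter key of the full pipeline that mentions "smoothing"
print([k for k in pipeline.get_params().keys() if 'smoothing' in k])
# ['columntransformer__targetencoder__smoothing']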
Then set up the grid, making sure the smoothing values are floats (see this issue):
param_grid = {'columntransformer__targetencoder__smoothing': [1.0, 10.0],
              'xgbclassifier__learning_rate': [0.01, 0.001]}
pipeline = make_pipeline(preprocess, XGBClassifier())
clf = GridSearchCV(pipeline, param_grid=param_grid, scoring='neg_mean_squared_error',
                   verbose=1, refit=True, cv=3)
clf.fit(X_train, y_train)
And it should work.
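After the fit, the tuned value can be read back from the search object:
print(clf.best_params_)
# e.g. {'columntransformer__targetencoder__smoothing': 10.0, 'xgbclassifier__learning_rate': 0.01}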

AttributeError: 'GridSearchCV' object has no attribute 'cv_results_' on scikit-learn 0.19.2

I'm currently using Scikit-Learn version 0.19.2 and Python 3.6.3
For some reason, I can't access the cv_results_ attribute from my GridSearchCV.
This is the code I'm using:
df = pd.read_csv(input_file, sep=";", header=None)
numpy_array = df.as_matrix()
y = numpy_array[:, 1]
y[y == 'RR'] = 1
y[y == 'AIRR'] = 0
print(y)
y = y.astype('int')
vectorizer = TfidfVectorizer(sublinear_tf=True, max_df=0.5, stop_words=stopwords)
X = numpy_array[:, 0]
X = vectorizer.fit_transform(X)
param_grid = {"base_estimator__criterion": ["gini", "entropy"],
              "base_estimator__splitter": ["best", "random"],
              "n_estimators": [1, 2]
              }
DTC = DecisionTreeClassifier(random_state=11, max_features="auto", class_weight="balanced", max_depth=None)
# Create and fit an AdaBoosted decision tree
bdt = AdaBoostClassifier(base_estimator=DTC)
grid_search_ABC = GridSearchCV(bdt, param_grid=param_grid, scoring='roc_auc', cv=5, refit=True)
pred = grid_search_ABC.fit(X, y)
print(metrics.confusion_matrix(y, pred))
mean = grid_search_ABC.cv_results_['mean_test_score']
std = grid_search_ABC.cv_results_['std_test_score']
I read that this mostly happens when the GridSearchCV has not been fitted, but I can use it to predict new instances without a problem.
Any pointers, please?
The problem is probably with your dataset; this is why this site encourages you to post verifiable examples.
I just tried running your code on the iris dataset and it worked just fine:
from sklearn import datasets
from sklearn.model_selection import GridSearchCV
from sklearn.tree import DecisionTreeClassifier
from sklearn.ensemble import AdaBoostClassifier

iris = datasets.load_iris()
param_grid = {"base_estimator__criterion": ["gini", "entropy"],
              "base_estimator__splitter": ["best", "random"],
              "n_estimators": [1, 2]
              }
DTC = DecisionTreeClassifier(random_state=11, max_features="auto", class_weight="balanced", max_depth=None)
bdt = AdaBoostClassifier(base_estimator=DTC)
grid_search_ABC = GridSearchCV(bdt, param_grid=param_grid, scoring='roc_auc', cv=5, refit=True)
pred = grid_search_ABC.fit(iris.data, iris.target > 0)
print(grid_search_ABC.cv_results_['mean_test_score'])
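As a side note, cv_results_ is a plain dict of arrays, so a convenient way to inspect it (a small sketch) is to load it into a DataFrame:
import pandas as pd

# one row per parameter combination, sorted by mean cross-validated test score
results = pd.DataFrame(grid_search_ABC.cv_results_)
print(results[['params', 'mean_test_score', 'std_test_score']].sort_values('mean_test_score', ascending=False))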

How to predict data in python after training a stacked model?

I'm new to machine learning in Python, saw the concept of stacking models, and wanted to give it a shot. The problem is that I don't know how to predict new data, as I don't fully understand the implementation. The code I managed to scrape together looks like this:
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error,mean_squared_error
from sklearn.ensemble import ExtraTreesRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.ensemble import BaggingRegressor
from sklearn.ensemble import GradientBoostingRegressor
from catboost import CatBoostRegressor
from xgboost import XGBRegressor
from vecstack import stacking
import pandas as pd
X = pd.read_csv('db/file_name3.csv')
y = pd.read_csv('db/train_labels(1).csv')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
models = [
    CatBoostRegressor(iterations=200,
                      learning_rate=0.03,
                      depth=4,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    CatBoostRegressor(iterations=500,
                      learning_rate=0.06,
                      depth=3,
                      loss_function='RMSE',
                      eval_metric='RMSE',
                      random_seed=99,
                      od_type='Iter',
                      od_wait=50,
                      logging_level='Silent'),
    ExtraTreesRegressor(random_state=0, n_jobs=-1, n_estimators=100, max_depth=3),
    RandomForestRegressor(random_state=0, n_jobs=-1, n_estimators=300, max_depth=3),
    XGBRegressor(eta=0.02, reg_lambda=5, reg_alpha=1),
    XGBRegressor(eta=0.1, reg_lambda=1, reg_alpha=10),
    XGBRegressor(eta=0.02, reg_lambda=1, reg_alpha=10, n_estimators=300),
    XGBRegressor(eta=0.012, max_depth=3, n_estimators=200),
    GradientBoostingRegressor(),
    BaggingRegressor(),
]
test1 = pd.read_csv('db/Cleaned Data.csv')
S_train, S_test = stacking(models, X_train, y_train, X_train,
                           regression=True, metric=mean_absolute_error, n_folds=10,
                           shuffle=True, random_state=0, verbose=2)
model = model.fit(S_train, y_train)
y_pred = model.predict(S_test)
print(y_pred.shape)
As you can see, test1 is the data that I want to predict, but I couldn't figure out how. I can predict on my training set but not on the new data. I have not changed any of the model parameters from the documentation.
import matplotlib.pyplot as plt
import pandas as pd
import numpy as np
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import KFold, cross_val_score

# load your X, y and test1 data here
RF = RandomForestRegressor(random_state=0, n_jobs=-1,
                           n_estimators=300, max_depth=3)

# validation function
n_folds = 5

def rmsle_cv(model):
    # note: get_n_splits() returns an int, so cv=kf below is plain (unshuffled) 5-fold CV
    kf = KFold(n_folds, shuffle=True, random_state=42).get_n_splits(X.values)
    rmse = np.sqrt(-cross_val_score(model, X.values, y.values.ravel(),
                                    scoring="neg_mean_squared_error", cv=kf))
    return rmse

score = rmsle_cv(RF)
print("Random Forest score: {:.3f} ({:.3f})\n".format(score.mean(), score.std()))

RF.fit(X, y.values.ravel())
RF_train_pred = RF.predict(X)
RF_pred = RF.predict(test1)
print(RF_pred.shape)
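If you want to stay with the stacked setup from the question, a minimal sketch (reusing models, X_train, y_train, test1 and mean_absolute_error from above, and assuming test1 has the same feature columns as X_train): pass test1 as the test set so that stacking builds second-level features for it, then fit a meta-model on S_train and predict on S_test1:
from vecstack import stacking
from xgboost import XGBRegressor

# build level-1 (out-of-fold) features for the training data and for the new data
S_train, S_test1 = stacking(models, X_train, y_train, test1,
                            regression=True, metric=mean_absolute_error, n_folds=10,
                            shuffle=True, random_state=0, verbose=2)
# any regressor can serve as the meta-model; XGBRegressor is just one choice here
meta_model = XGBRegressor(eta=0.05, n_estimators=200)
meta_model.fit(S_train, y_train)
test1_pred = meta_model.predict(S_test1)
print(test1_pred.shape)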
