Here's the piece of the code:
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import LogisticRegressionCV
# Build a 5-fold stratified splitter.
skf = StratifiedKFold(n_splits=5)
# skf.split(...) is evaluated here on titanic_dataset / surv_titanic; its result
# (the index pairs), not the splitter object `skf`, is what gets stored in skf_1.
skf_1 = skf.split(titanic_dataset, surv_titanic)
# 500 candidate values for Cs, log-spaced between 10**-1 and 10**2.
ls_1 = np.logspace(-1.0, 2.0, num=500)
# NOTE(review): cv receives skf_1 (the output of skf.split), not `skf` itself.
clf = LogisticRegressionCV(Cs=ls_1, cv = skf_1, scoring = "roc_auc", n_jobs=-1, random_state=17)
# The fit uses x_train / y_train, whereas the split above was computed on
# titanic_dataset / surv_titanic. This is the call the traceback points at.
clf_model = clf.fit(x_train, y_train)
This says:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-130-b99a5912ff5a> in <module>
----> 1 clf_model = clf.fit(x_train, y_train)
H:\Anaconda_3\lib\site-packages\sklearn\linear_model\_logistic.py in fit(self, X, y, sample_weight)
2098 # (n_classes, n_folds, n_Cs . n_l1_ratios) or
2099 # (1, n_folds, n_Cs . n_l1_ratios)
-> 2100 coefs_paths, Cs, scores, n_iter_ = zip(*fold_coefs_)
2101 self.Cs_ = Cs[0]
2102 if multi_class == 'multinomial':
ValueError: not enough values to unpack (expected 4, got 0)
The train and test datasets had been prepared before, and they behave nicely with other classifiers.
Such a generic error message tells me nothing. What is the problem here?
In short, the issue was that you passed the result of skf.split(titanic_dataset, surv_titanic) to the cv argument on LogisticRegressionCV when you needed to pass StratifiedKFold(n_splits=5) directly instead.
Below I show the code that reproduced your error, and below that I show two alternative methods that accomplish what I believe you were trying to do.
# Self-contained reproduction: everything the snippet uses is imported here.
import numpy as np
from sklearn.datasets import load_breast_cancer
from sklearn.linear_model import LogisticRegressionCV
from sklearn.model_selection import StratifiedKFold

# Some example data
data = load_breast_cancer()
X = data['data']
y = data['target']

# Set up the stratifiedKFold
skf = StratifiedKFold(n_splits=5)

# Don't do this... only here to reproduce the error
skf_indicies = skf.split(X, y)

# Some regularization
ls_1 = np.logspace(-1.0, 2.0, num=5)

# This creates your error
clf_error = LogisticRegressionCV(Cs=ls_1,
                                 cv = skf_indicies,
                                 scoring = "roc_auc",
                                 n_jobs=-1,
                                 random_state=17)

# Error created by passing result of skf.split to cv.
# The failure is caught and printed so the two working variants below still
# run when the snippet is executed top to bottom.
try:
    clf_model = clf_error.fit(X, y)
except ValueError as err:
    print(f"Reproduced the error: {err}")

# This is probably what you meant to do: hand the splitter object itself to cv
clf_using_skf = LogisticRegressionCV(Cs=ls_1,
                                     cv = skf,
                                     scoring = "roc_auc",
                                     n_jobs=-1,
                                     random_state=17,
                                     max_iter=1_000)

# This will now fit without the error
clf_model_skf = clf_using_skf.fit(X, y)

# This is the easiest method, and from the docs also does the
# same thing as StratifiedKFold
# https://scikit-learn.org/stable/modules/generated/sklearn.linear_model.LogisticRegressionCV.html
clf_easiest = LogisticRegressionCV(Cs=ls_1,
                                   cv = 5,
                                   scoring = "roc_auc",
                                   n_jobs=-1,
                                   random_state=17,
                                   max_iter=1_000)

# This will now fit without the error
clf_model_easiest = clf_easiest.fit(X, y)
Related
I created a Pipeline with RFE and RandomForestClassifer in it and then applied RandomizedSearchCV to find the best hyperparameter values for both. This is what my code looks like -
# RandomForestClassifier lives in the sklearn.ensemble module.
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV

# Two-step pipeline: an RFE step wrapping a random forest, followed by a
# second random forest registered under the name "est".
steps = [
    ("rfe", RFE(estimator = RandomForestClassifier(random_state = 42))),
    ("est", RandomForestClassifier())
]
rf_clf_pl = Pipeline(steps = steps)

# Search space; each key is "<step name>__<parameter name>".
# smote_X_train, max_depth and max_leaf_nodes are defined outside this snippet.
params = {
    "rfe__n_features_to_select" : range(2, smote_X_train.shape[1] + 1),
    "est__random_state" : np.linspace(0, 42, 5).astype(int),
    "est__n_estimators" : range(50, 201, 10),
    "est__max_depth" : [None] + list(range(5, max_depth, 3)),
    "est__max_leaf_nodes" : [None] + list(range(100, max_leaf_nodes, 20))
}

# Randomized search over the pipeline: 100 sampled settings, 4-fold CV.
rs = RandomizedSearchCV(estimator = rf_clf_pl, cv = 4, param_distributions = params, n_jobs = -1, n_iter = 100, random_state = 42)
rs.fit(smote_X_train, smote_y_train)
I tried using the code below but got an error -
rf_clf_pl.named_steps["rfe"].support_
Error -
AttributeError Traceback (most recent call last)
<ipython-input-53-c73290f0e090> in <module>()
----> 1 rf_clf_pl.named_steps["rfe"].support_
AttributeError: 'RFE' object has no attribute 'support_'
How can I get the name of the retained features?
You can access the retained features of the best estimator as follows:
rs.best_estimator_.named_steps['rfe'].support_
Namely, you should access the best_estimator_ attribute of the RandomizedSearchCV fitted instance (i.e. the pipeline re-fitted with the best found hyperparameters thanks to the default parameter refit=True of RandomizedSearchCV).
Reading support_ from the pipeline instance rf_clf_pl does not work because that pipeline was never fitted itself: during the search, RandomizedSearchCV fits copies of the estimator you pass in (even though you call .fit() on the search object) and does not hand those fitted copies back. The only fitted pipeline it exposes is best_estimator_, in the case described above.
Here's an example:
import numpy as np  # needed for np.linspace in the search space below
from sklearn.datasets import load_iris
from sklearn.ensemble import RandomForestClassifier
from sklearn.feature_selection import RFE
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV, train_test_split

# Load iris as a DataFrame so the columns carry feature names.
iris = load_iris(as_frame=True)
X, y = iris.data, iris.target
X_train, X_test, y_train, y_test= train_test_split(X, y, random_state=0)

# Same two-step pipeline as in the question: RFE followed by a random forest.
steps = [
    ("rfe", RFE(estimator = RandomForestClassifier(random_state = 42))),
    ("est", RandomForestClassifier())
]
rf_clf_pl = Pipeline(steps = steps)

# Search space; each key is "<step name>__<parameter name>".
params = {
    "rfe__n_features_to_select" : range(2, X_train.shape[1] + 1),
    "est__random_state" : np.linspace(0, 42, 5).astype(int),
    "est__n_estimators" : range(50, 201, 10),
    "est__max_depth" : [None] + list(range(5, 16, 3)),
    "est__max_leaf_nodes" : [None] + list(range(100, 201, 20))
}

rs = RandomizedSearchCV(estimator = rf_clf_pl, cv = 4, param_distributions = params, n_jobs = -1, n_iter = 100, random_state = 42)
rs.fit(X_train, y_train)

# Read support_ from the refitted best pipeline, not from rf_clf_pl.
rs.best_estimator_.named_steps['rfe'].support_
Finally, if you want the explicit names of the retained features, you can retrieve them via rs.feature_names_in_[np.where(rs.best_estimator_.named_steps['rfe'].support_)[0]] (the names are available here because the training data is a DataFrame with named columns).
from sklearn import datasets
import numpy as np
# Assigning the petal length and petal width of the 150 flower samples to Matrix X
# Class labels of the flower to vector y
iris = datasets.load_iris()
# Keep only columns 2 and 3 of the feature matrix.
X = iris.data[:, [2, 3]]
y = iris.target
print('Class labels:', np.unique(y))
from sklearn.model_selection import train_test_split
# Hold out 30% of the samples for testing; stratify=y is passed so the split is stratified on the labels.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, random_state=1, stratify=y)
# Per-class sample counts for the full set and for each split.
print('Labels counts in y:', np.bincount(y))
print('Labels counts in y_train:', np.bincount(y_train))
print ('Labels counts in y_test:', np.bincount(y_test))
from sklearn.preprocessing import StandardScaler
# The scaler is fitted on the training split only and then applied to both splits.
sc = StandardScaler()
sc.fit(X_train)
X_train_std = sc.transform(X_train)
X_test_std = sc.transform(X_test)
from sklearn.linear_model import Perceptron
# NOTE(review): this is the line the traceback points at; the `n_iter` keyword
# is rejected by Perceptron.__init__ as an unexpected keyword argument.
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=1)
ppn.fit(X_train_std, y_train)
y_pred = ppn.predict(X_test_std)
# Count the test samples whose predicted label differs from the true label.
print('Misclassified samples: %d' % (y_test != y_pred).sum())
When I run I get this error message:
Traceback (most recent call last):
File "c:/Users/Desfios 5/Desktop/Python/Ch3.py", line 27, in <module>
ppn = Perceptron(n_iter=40, eta0=0.1, random_state=1)
File "C:\Users\Desfios 5\AppData\Roaming\Python\Python38\site-packages\sklearn\utils\validation.py", line 72, in inner_f
return f(**kwargs)
TypeError: __init__() got an unexpected keyword argument 'n_iter'
I've tried uninstalling and installing scikit-learn but that did not help. Any help?
I just changed n_iter to max_iter and it worked for me:
ppn = Perceptron(max_iter=40, eta0=0.3, random_state=0)
You receive this error
TypeError: __init__() got an unexpected keyword argument 'n_iter'
because the Perceptron constructor has no parameter named 'n_iter' that you can set before fitting.
You are probably thinking of the n_iter_ attribute, which is an "estimated attribute" (you can tell by the trailing underscore) and is only stored after the fit method has been called; see the Perceptron reference in the scikit-learn documentation.
Before fitting, the iteration-related parameters you can set are max_iter and n_iter_no_change; there is no n_iter.
I am working on a dataset TelcoSigtel which has 5k observations, 21 features, and an imbalanced target with 86% non-churner and 16% churner.
Sorry, I wanted to include an extract of the dataframe, but it is far too big, and whenever I take a small sample there are not enough churners in it.
My problem is the following: the two methods below should give the same results, but for some algorithms the results are dramatically different, while for others they are exactly the same.
Information about the dataset:
models = [('logit',
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=600,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ....]
# Method 1:
from sklearn import model_selection
from sklearn.model_selection import KFold
# Features: every column of telcom except "churn"; target: the "churn" column.
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
# Per-model score arrays and model names, collected for the box plot below.
results = []
names = []
seed = 0
# The metric is requested by name, so cross_val_score applies scikit-learn's
# built-in "roc_auc" scorer.
scoring = "roc_auc"
# NOTE(review): indentation was lost in this listing; the statements up to
# print(msg) presumably form the loop body.
for name, model in models:
kfold = model_selection.KFold(n_splits = 5, random_state = seed)
cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
# Report the mean and standard deviation of the fold scores for this model.
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
# Method 2:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
# Same fold configuration as Method 1: 5 splits, random_state 0.
kf = KFold(n_splits=5, random_state=0)
# Features: every column of telcom except "churn"; target: the "churn" column.
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
results = []
names = []
# Collects the training indices of every fold that is processed.
to_store1 = list()
seed = 0
scoring = "roc_auc"
# Fold scores of the model currently being evaluated; reset after each model.
cv_results = np.array([])
# NOTE(review): indentation was lost in this listing; the inner loop presumably
# runs through the np.append line, and the outer loop through the reset of cv_results.
for name, model in models:
for train_index, test_index in kf.split(X):
# split the data
X_train, X_test = X.loc[train_index,:].values, X.loc[test_index,:].values
y_train, y_test = np.ravel(Y[train_index]), np.ravel(Y[test_index])
model = model # Choose a model here
model.fit(X_train, y_train )
# NOTE(review): predict() output is what gets scored below, so the AUC is
# computed from predicted labels rather than probabilities or decision scores.
y_pred = model.predict(X_test)
to_store1.append(train_index)
# store fold results
result = roc_auc_score(y_test, y_pred)
cv_results = np.append(cv_results, result)
results.append(cv_results)
names.append(name)
# Report the mean and standard deviation of the fold scores for this model.
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
cv_results = np.array([])
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
The short answer is that you should use model.predict_proba(X_test)[:, 1] or model.decision_function(X_test) to get identical results since roc auc scorer needs class probabilities. The long answer is that you can reproduce the same behavior with a toy example:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, make_scorer
def assert_equal_scores(rnd_seed, needs_threshold):
    """Assert that two ROC-AUC scorings of the same model give equal results.

    A logistic regression is cross-validated twice on the breast-cancer data
    with identical folds: once with a scorer built by hand from
    ``roc_auc_score`` and once with the built-in ``'roc_auc'`` scorer.

    Parameters
    ----------
    rnd_seed : int
        Seed for the fold shuffling; ``rnd_seed + 10`` seeds the model.
    needs_threshold : bool
        Forwarded to ``make_scorer`` for the hand-built scorer.

    Raises
    ------
    AssertionError
        If the two arrays of fold scores are not equal.
    """
    # Ask for (X, y) directly instead of relying on the key order of the Bunch.
    X, y = load_breast_cancer(return_X_y=True)
    # random_state is only meaningful together with shuffle=True, and recent
    # scikit-learn releases reject it otherwise. A fixed integer seed makes
    # both cross_val_score calls see the same folds.
    kfold = KFold(shuffle=True, random_state=rnd_seed)
    lr = LogisticRegression(random_state=rnd_seed + 10)
    # NOTE(review): newer scikit-learn releases express this choice through
    # make_scorer's response_method argument; confirm against the installed version.
    roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=needs_threshold)
    cv_scores1 = cross_val_score(lr, X, y, cv=kfold, scoring=roc_auc_scorer)
    cv_scores2 = cross_val_score(lr, X, y, cv=kfold, scoring='roc_auc')
    np.testing.assert_equal(cv_scores1, cv_scores2)
Try assert_equal_scores(10, False) and assert_equal_scores(10, True) (or any other random seed). The first one raises an AssertionError. The difference is that roc auc scorer requires the needs_threshold parameter to be True.
I am trying to build a model for the LasVegasTripAdvisorReviews dataset
using a bagging algorithm.
I get an error ("Multilabel and multi-output classification is not supported").
Can you please help me and tell me how to solve the error?
Regards
The attachment contains a link to the Las Vegas dataset: LasVegasTripAdvisorReviews-Dataset
# Voting Ensemble for Classification
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.tree import DecisionTreeClassifier
from sklearn.svm import SVC
from sklearn.ensemble import VotingClassifier,GradientBoostingClassifier,AdaBoostClassifier,RandomForestClassifier
# Load the CSV and assign the column names explicitly.
url = "h:/LasVegasTripAdvisorReviews-Dataset.csv"
names = ['User country','Nr. reviews','Nr. hotel reviews','Helpful votes','Period of stay','Traveler type','Pool','Gym','Tennis court','Spa','Casino','Free internet','Hotel name','Hotel stars','Nr. rooms','User continent','Member years','Review month','Review weekday','Score']
dataframe = pandas.read_csv(url, names=names)
array = dataframe.values
# NOTE(review): both slices select every row and every column, so X and Y are
# the same full array and the target has as many columns as the features.
X = array[:,:]
Y = array[:,:]
seed = 7
kfold = model_selection.KFold(n_splits=10, random_state=seed)
# create the sub models
estimators = []
model1 = AdaBoostClassifier()
estimators.append(('AdaBoost', model1))
model2 = GradientBoostingClassifier()
estimators.append(('GradientBoosting', model2))
model3 = RandomForestClassifier()
estimators.append(('RandomForest', model3))
# create the ensemble model
ensemble = VotingClassifier(estimators)
# This is the call the traceback points at.
results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
print(results.mean())
Stacktrace:
NotImplementedError Traceback (most recent call last)
<ipython-input-9-bda887b4022f> in <module>
27 # create the ensemble model
28 ensemble = VotingClassifier(estimators)
---> 29 results = model_selection.cross_val_score(ensemble, X, Y, cv=kfold)
30 print(results.mean())
/usr/local/lib/python3.5/dist-packages/sklearn/model_selection/_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, error_score)
400 fit_params=fit_params,
401 pre_dispatch=pre_dispatch,
--> 402 error_score=error_score)
403 return cv_results['test_score']
404
...
...
NotImplementedError: Multilabel and multi-output classification is not supported.
You have the line:
X = array[:,:]
Y = array[:,:]
This means that your feature matrix (X) and target vector (Y) are the same.
You need to choose only one column to be your Y.
For example, let's suppose you want your last column to be Y.
Then, you should change the above lines to this:
# `array` is the variable holding dataframe.values in the question.
# Every column except the last is a feature; the last column alone is the
# target, taken with a plain index so that Y is one-dimensional.
X = array[:, :-1]
Y = array[:, -1]
This should solve the error you got. The error you have basically means: I don't support more than one column in Y.
The code below works fine with:
scorer = make_scorer(roc_auc_score)
but gives "ValueError: bad input shape" with:
scorer = make_scorer(roc_auc_score, needs_proba = True)
The code is:
clf = GaussianNB()
# 10 iterations with 20% of the samples held out as the test set each time.
# NOTE(review): passing the sample count positionally together with n_iter
# looks like the older ShuffleSplit signature; confirm which module it comes from.
cv = ShuffleSplit(features.shape[0], n_iter = 10, test_size = 0.2, random_state = 0)
# needs_proba = True is the variant reported to raise "ValueError: bad input shape".
scorer = make_scorer(roc_auc_score, needs_proba = True)
score = cross_val_score(clf, features, labels, cv=cv, scoring=scorer)
How would I get around this error so the score is based on probability estimates?
If you're using one of the default scoring metrics, you don't need to pass a callable to cross_val_score; you can just call it with the name of the metric you're using:
# The built-in ROC-AUC scorer is registered under the name 'roc_auc'
# ('roc_auc_score' is the metric function, not a valid scoring string).
score = cross_val_score(clf, features, labels, cv=cv, scoring='roc_auc')