GridSearch with SVM producing IndexError - python

I'm building a classifier using an SVM and want to perform a Grid Search to help automate finding the optimal model. Here's the code:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier

X.shape  # (22343, 323)
y.shape  # (22343, 1)

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.4, random_state=0
)

tuned_parameters = [
    {
        'estimator__kernel': ['rbf'],
        'estimator__gamma': [1e-3, 1e-4],
        'estimator__C': [1, 10, 100, 1000]
    },
    {
        'estimator__kernel': ['linear'],
        'estimator__C': [1, 10, 100, 1000]
    }
]

model_to_set = OneVsRestClassifier(SVC(), n_jobs=-1)
clf = GridSearchCV(model_to_set, tuned_parameters)
clf.fit(X_train, y_train)
and I get the following error message (this isn't the whole stack trace, just the last three calls):
----------------------------------------------------
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in split(self, X, y, groups)
88 X, y, groups = indexable(X, y, groups)
89 indices = np.arange(_num_samples(X))
---> 90 for test_index in self._iter_test_masks(X, y, groups):
91 train_index = indices[np.logical_not(test_index)]
92 test_index = indices[test_index]
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in _iter_test_masks(self, X, y, groups)
606
607 def _iter_test_masks(self, X, y=None, groups=None):
--> 608 test_folds = self._make_test_folds(X, y)
609 for i in range(self.n_splits):
610 yield test_folds == i
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in _make_test_folds(self, X, y, groups)
593 for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
594 for cls, (_, test_split) in zip(unique_y, per_cls_splits):
--> 595 cls_test_folds = test_folds[y == cls]
596 # the test split can be too big because we used
597 # KFold(...).split(X[:max(c, n_splits)]) when data is not 100%
IndexError: too many indices for array
Also, when I reshape the arrays so that y is (22343,), I find that the GridSearch never finishes, even if I set tuned_parameters to only the default values.
And here are the versions for all of the packages if that helps:
Python: 3.5.2
scikit-learn: 0.18
pandas: 0.19.0

It seems that there is no error in your implementation.
However, as mentioned in the sklearn documentation, SVC's "fit time complexity is more than quadratic with the number of samples which makes it hard to scale to dataset with more than a couple of 10000 samples". See the documentation here.
In your case, you have 22,343 samples, which can lead to computational and memory problems. That is why the grid search takes so long even with default parameters. Try reducing your training set to 10,000 samples or fewer.
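A minimal sketch of that suggestion, reusing X, y and tuned_parameters from your snippet (train_size=10000 is illustrative, and y is flattened to 1-D so that the stratified CV inside GridSearchCV works):
import numpy as np
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
from sklearn.svm import SVC

y = np.ravel(y)  # (22343, 1) -> (22343,)

# keep only ~10000 samples for the grid search
X_small, _, y_small, _ = train_test_split(
    X, y, train_size=10000, stratify=y, random_state=0
)

model_to_set = OneVsRestClassifier(SVC(), n_jobs=-1)
clf = GridSearchCV(model_to_set, tuned_parameters)
clf.fit(X_small, y_small)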

Related

Error when trying to tune MLPClassifier hidden_layer_sizes using BayesSearchCV

When trying to tune the sklearn MLPClassifier hidden_layer_sizes hyperparameter using BayesSearchCV, I get an error: ValueError: can only convert an array of size 1 to a Python scalar.
However, when I use GridSearchCV, it works great! What am I missing?
Here goes a reproducible example:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.datasets import load_iris
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV

X, y = load_iris(True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     train_size=0.75,
                                                     random_state=0)

# this does not work!
opt_bs = BayesSearchCV(MLPClassifier(),
                       {'learning_rate_init': Real(0.001, 0.05),
                        'solver': Categorical(["adam", 'sgd']),
                        'hidden_layer_sizes': Categorical([(10,5), (15,10,5)])},
                       n_iter=32,
                       random_state=0)

# this one does :)
opt_gs = GridSearchCV(MLPClassifier(),
                      {'learning_rate_init': [0.001, 0.05],
                       'solver': ["adam", 'sgd'],
                       'hidden_layer_sizes': [(10,5), (15,10,5)]})

# executes optimization using opt_gs or opt_bs
opt = opt_bs
res = opt.fit(X_train, y_train)
opt
Produces:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-64-78e6d29cae99> in <module>()
27 # executes optimization using opt_gs or opt_bs
28 opt = opt_bs
---> 29 res = opt.fit(X_train, y_train)
30 opt
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in fit(self, X, y, groups, callback)
678 optim_result = self._step(
679 X, y, search_space, optimizer,
--> 680 groups=groups, n_points=n_points_adjusted
681 )
682 n_iter -= n_points
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in _step(self, X, y, search_space, optimizer, groups, n_points)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in <listcomp>(.0)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in <listcomp>(.0)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
ValueError: can only convert an array of size 1 to a Python scalar
Unfortunately, BayesSearchCV accepts only parameters expressed as Categorical, Integer, or Real values. In your case there is no issue with the learning_rate_init and solver parameters, as they are clearly defined as Real and Categorical respectively. The problem is hidden_layer_sizes, where you have declared the number of neurons as Categorical values that are tuples, and BayesSearchCV is not yet equipped to handle search spaces over tuples; refer here for more details on this. However, as a temporary hack, you could create your own wrapper around MLPClassifier so that the parameters of the estimator are recognized properly. Please refer to the following code snippet for a sample:
from skopt import BayesSearchCV
from skopt.space import Integer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import itertools

X, y = load_iris(True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                     train_size=0.75,
                                                     random_state=0)

class MLPWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, layer1=10, layer2=10, layer3=10):
        self.layer1 = layer1
        self.layer2 = layer2
        self.layer3 = layer3

    def fit(self, X, y):
        model = MLPClassifier(
            hidden_layer_sizes=[self.layer1, self.layer2, self.layer3]
        )
        model.fit(X, y)
        self.model = model
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.score(X, y)

opt = BayesSearchCV(
    estimator=MLPWrapper(),
    search_spaces={
        'layer1': Integer(10, 100),
        'layer2': Integer(10, 100),
        'layer3': Integer(10, 100)
    },
    n_iter=11
)

opt.fit(X_train, y_train)
opt.score(X_test, y_test)
0.9736842105263158
Note: This assumes that you build an MLP network with three layers. You can modify it as per your need. Also, it becomes slightly tricky to create a class that constructs any MLP with an arbitrary number of layers.
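If you do need the depth itself to be part of the search, one possible workaround (a sketch only; FlexibleMLPWrapper, n_layers and layer_size are made-up names, not skopt or sklearn API) is to search over the number of layers and a single layer width instead of explicit tuples, reusing the imports and data from the snippet above:
class FlexibleMLPWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, n_layers=2, layer_size=10):
        self.n_layers = n_layers
        self.layer_size = layer_size

    def fit(self, X, y):
        # n_layers hidden layers with layer_size neurons each
        self.model_ = MLPClassifier(
            hidden_layer_sizes=[self.layer_size] * self.n_layers
        )
        self.model_.fit(X, y)
        return self

    def predict(self, X):
        return self.model_.predict(X)

    def score(self, X, y):
        return self.model_.score(X, y)

opt_flex = BayesSearchCV(
    estimator=FlexibleMLPWrapper(),
    search_spaces={'n_layers': Integer(1, 4), 'layer_size': Integer(10, 100)},
    n_iter=11
)
opt_flex.fit(X_train, y_train)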

Positional argument error when trying to train SGDClassifier on binary classification

I'm working through Aurelien Geron's Hands-On ML textbook and have got stuck trying to train an SGDClassifier.
I'm using the MNIST handwritten numbers data and running my code in a Jupyter Notebook via Anaconda. Both my anaconda (1.7.0) and sklearn (0.20.dev0) are updated. I've pasted the code I used to load the data, select the first 60k rows, shuffle the order and convert the labels to 1 (True) for all 5's and 0 (False) for all other numbers. Both X_train and y_train_5 are numpy arrays.
I've pasted the error message I get below.
Nothing seems to be wrong with the dimensions of the data. I tried converting X_train to a sparse matrix (the suggested format for SGDClassifier) and various max_iter values, and got the same error message each time. Am I missing something obvious? Do I need to use a different version of sklearn? I've searched online but couldn't find any posts describing similar issues with SGDClassifier. I'd be super grateful for any kind of pointer.
Code
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier

# Load MNIST data #
from scipy.io import loadmat
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
    content = response.read()
    f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}

# Assign X and y #
X, y = mnist['data'], mnist['target']

# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
Error Message
---------------------------------------------------------------------------
TypeError
Traceback (most recent call last)
<ipython-input-10-5a25eed28833> in <module>()
37 # Train SGDClassifier
38 sgd_clf = SGDClassifier(max_iter=5, random_state=42)
---> 39 sgd_clf.fit(X_train, y_train_5)
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit(self, X, y, coef_init, intercept_init, sample_weight)
712 loss=self.loss, learning_rate=self.learning_rate,
713 coef_init=coef_init, intercept_init=intercept_init,
--> 714 sample_weight=sample_weight)
715
716
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit(self, X, y, alpha, C, loss, learning_rate, coef_init, intercept_init, sample_weight)
570
571 self._partial_fit(X, y, alpha, C, loss, learning_rate, self._max_iter,
--> 572 classes, sample_weight, coef_init, intercept_init)
573
574 if (self._tol is not None and self._tol > -np.inf
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _partial_fit(self, X, y, alpha, C, loss, learning_rate, max_iter, classes, sample_weight, coef_init, intercept_init)
529 learning_rate=learning_rate,
530 sample_weight=sample_weight,
--> 531 max_iter=max_iter)
532 else:
533 raise ValueError(
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in _fit_binary(self, X, y, alpha, C, sample_weight, learning_rate, max_iter)
587 self._expanded_class_weight[1],
588 self._expanded_class_weight[0],
--> 589 sample_weight)
590
591 self.t_ += n_iter_ * X.shape[0]
~\Anaconda3\lib\site-packages\sklearn\linear_model\stochastic_gradient.py in fit_binary(est, i, X, y, alpha, C, learning_rate, max_iter, pos_weight, neg_weight, sample_weight)
419 pos_weight, neg_weight,
420 learning_rate_type, est.eta0,
--> 421 est.power_t, est.t_, intercept_decay)
422
423 else:
~\Anaconda3\lib\site-packages\sklearn\linear_model\sgd_fast.pyx in sklearn.linear_model.sgd_fast.plain_sgd()
TypeError: plain_sgd() takes at most 21 positional arguments (25 given)
It appears your version of scikit-learn is just a little outdated. Try running:
pip install -U scikit-learn
then your code will run (with some slight formatting updates):
from six.moves import urllib
from scipy.io import loadmat
import numpy as np
from sklearn.linear_model import SGDClassifier
from scipy.io import loadmat

# Load MNIST data #
mnist_alternative_url = "https://github.com/amplab/datascience-sp14/raw/master/lab7/mldata/mnist-original.mat"
mnist_path = "./mnist-original.mat"
response = urllib.request.urlopen(mnist_alternative_url)
with open(mnist_path, "wb") as f:
    content = response.read()
    f.write(content)
mnist_raw = loadmat(mnist_path)
mnist = {
    "data": mnist_raw["data"].T,
    "target": mnist_raw["label"][0],
    "COL_NAMES": ["label", "data"],
    "DESCR": "mldata.org dataset: mnist-original",
}

# Assign X and y #
X, y = mnist['data'], mnist['target']

# Select first 60000 numbers #
X_train, X_test, y_train, y_test = X[:60000], X[60000:], y[:60000], y[60000:]

# Shuffle order #
shuffle_index = np.random.permutation(60000)
X_train, y_train = X_train[shuffle_index], y_train[shuffle_index]

# Convert labels to binary (5 or "not 5") #
y_train_5 = (y_train == 5)
y_test_5 = (y_test == 5)

# Train SGDClassifier #
sgd_clf = SGDClassifier(max_iter=5, random_state=42)
sgd_clf.fit(X_train, y_train_5)
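A quick sanity check after upgrading, to confirm which version your notebook is actually picking up:
import sklearn
print(sklearn.__version__)  # e.g. 0.20 or later after the upgrade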

an issue with Machine Learning (Fitting a Model)

I'm using anaconda-navigator with Python 3.6. When I run the following code I get this error:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit  # needed for cv_sets below (0.17-style signature)

def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y]. """

    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)

    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()

    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': range(1, 10)}

    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)

    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)

    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)

    # Return the optimal model after fitting the data
    return grid.best_estimator_

# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)

# Produce the value for 'max_depth'
print("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
Here are the error messages:
ValueError Traceback (most recent call last)
<ipython-input-12-05857a84a7c5> in <module>()
1 # Fit the training data to the model using grid search
----> 2 reg = fit_model(X_train, y_train)
3
4 # Produce the value for 'max_depth'
5 print ("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
<ipython-input-11-2c0c19498236> in fit_model(X, y)
26 # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
27
---> 28 grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)
29
30 # Fit the grid search object to the data to compute the optimal model
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in __init__(self, estimator, param_grid, scoring, fit_params, n_jobs, iid, refit, cv, verbose, pre_dispatch, error_score)
819 refit, cv, verbose, pre_dispatch, error_score)
820 self.param_grid = param_grid
--> 821 _check_param_grid(param_grid)
822
823 def fit(self, X, y=None):
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in _check_param_grid(param_grid)
349 if True not in check:
350 raise ValueError("Parameter values for parameter ({0}) need "
--> 351 "to be a sequence.".format(name))
352
353 if len(v) == 0:
ValueError: Parameter values for parameter (max_depth) need to be a sequence.
grid_search.py checks this:
check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
It seems like you can't use a range. I would try this:
params = {'max_depth': np.arange(1,10)}
or without numpy:
params = {'max_depth': [x for x in range(1,10)]}
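If it helps, here is a minimal end-to-end sketch of the same fix. It uses the modern sklearn.model_selection imports rather than the deprecated sklearn.grid_search/sklearn.cross_validation modules, and a toy make_regression dataset with the default scorer in place of your data and performance_metric:
from sklearn.datasets import make_regression
from sklearn.tree import DecisionTreeRegressor
from sklearn.model_selection import GridSearchCV, ShuffleSplit

X, y = make_regression(n_samples=200, n_features=5, random_state=0)

cv_sets = ShuffleSplit(n_splits=10, test_size=0.20, random_state=0)
params = {'max_depth': list(range(1, 10))}  # a list, not a bare range object

grid = GridSearchCV(DecisionTreeRegressor(), params, cv=cv_sets)
grid.fit(X, y)
print(grid.best_estimator_.get_params()['max_depth'])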

Sklearn pass fit() parameters to xgboost in pipeline

Similar to "How to pass a parameter to only one part of a pipeline object in scikit learn?", I want to pass parameters to only one part of a pipeline. Usually, it should work fine like:
estimator = XGBClassifier()
pipeline = Pipeline([
    ('clf', estimator)
])
and executed like
pipeline.fit(X_train, y_train, clf__early_stopping_rounds=20)
but it fails with:
/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
114 """
115 Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
--> 116 self.steps[-1][-1].fit(Xt, yt, **fit_params)
117 return self
118
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/sklearn.py in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)
443 early_stopping_rounds=early_stopping_rounds,
444 evals_result=evals_result, obj=obj, feval=feval,
--> 445 verbose_eval=verbose)
446
447 self.objective = xgb_options["objective"]
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
201 evals=evals,
202 obj=obj, feval=feval,
--> 203 xgb_model=xgb_model, callbacks=callbacks)
204
205
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
97 end_iteration=num_boost_round,
98 rank=rank,
---> 99 evaluation_result_list=evaluation_result_list))
100 except EarlyStopException:
101 break
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/callback.py in callback(env)
196 def callback(env):
197 """internal function"""
--> 198 score = env.evaluation_result_list[-1][1]
199 if len(state) == 0:
200 init(env)
IndexError: list index out of range
Whereas a
estimator.fit(X_train, y_train, early_stopping_rounds=20)
works just fine.
For early stopping to work, you must always specify the validation set via the eval_set argument. Here is how the error in your code can be fixed:
pipeline.fit(X_train, y_train, clf__early_stopping_rounds=20, clf__eval_set=[(test_X, test_y)])
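For completeness, a self-contained sketch of that fix on toy data (note that the exact fit() signature depends on your xgboost version; newer releases expect early_stopping_rounds on the estimator itself rather than as a fit() argument):
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

X, y = make_classification(n_samples=500, n_features=20, random_state=0)
X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=0)

pipeline = Pipeline([('clf', XGBClassifier())])

# the eval_set is what early stopping is evaluated against
pipeline.fit(X_train, y_train,
             clf__early_stopping_rounds=20,
             clf__eval_set=[(X_val, y_val)])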
I recently used the following steps to use the eval_metric and eval_set parameters for xgboost (a self-contained toy version of these steps is sketched after step 6).
1. Create the pipeline with the pre-processing/feature-transformation steps:
This is built from a pipeline defined earlier (pipeline.cost_pipe), which includes the xgboost model as the last step.
pipeline_temp = pipeline.Pipeline(pipeline.cost_pipe.steps[:-1])
2. Fit this Pipeline
X_trans = pipeline_temp.fit_transform(X_train[FEATURES],y_train)
3. Create your eval_set by applying the transformations to the test set
eval_set = [(X_trans, y_train), (pipeline_temp.transform(X_test), y_test)]
4. Add your xgboost step back into the Pipeline
pipeline_temp.steps.append(pipeline.cost_pipe.steps[-1])
5. Fit the new pipeline, passing the parameters:
pipeline_temp.fit(X_train[FEATURES], y_train,
                  xgboost_model__eval_metric=ERROR_METRIC,
                  xgboost_model__eval_set=eval_set)
6. Persist the Pipeline if you wish to.
joblib.dump(pipeline_temp, save_path)
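If it helps, here is a self-contained toy version of those steps (minus the persistence step). The step names ('scaler', 'xgboost_model') and the synthetic data are made up for illustration, and it assumes an xgboost release that still accepts eval_set and early_stopping_rounds as fit() parameters:
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
from xgboost import XGBRegressor

X, y = make_regression(n_samples=300, n_features=10, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

full_pipe = Pipeline([('scaler', StandardScaler()),
                      ('xgboost_model', XGBRegressor(n_estimators=500))])

# Steps 1-2: pipeline without the final model, fitted on the training data
pre_pipe = Pipeline(full_pipe.steps[:-1])
X_trans = pre_pipe.fit_transform(X_train, y_train)

# Step 3: build eval_set in the transformed feature space
eval_set = [(X_trans, y_train), (pre_pipe.transform(X_test), y_test)]

# Steps 4-5: put the model back and fit, routing fit params to that step
pre_pipe.steps.append(full_pipe.steps[-1])
pre_pipe.fit(X_train, y_train,
             xgboost_model__eval_set=eval_set,
             xgboost_model__early_stopping_rounds=20,
             xgboost_model__verbose=False)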
This is the solution: https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13755/xgboost-early-stopping-and-other-issues. Both early_stopping_rounds and the watchlist / eval_set need to be passed. Unfortunately, this does not work for me, as the variables in the watchlist would require a preprocessing step which is only applied in the pipeline; I would need to apply this step manually.
Here's a solution that works in a Pipeline with GridSearchCV:
Override the XGBRegressor or XGBClassifier fit() function.
This step uses train_test_split() to select the specified number of validation records from X for the eval_set and then passes the remaining records along to fit().
A new parameter, eval_test_size, is added to .fit() to control the number of validation records (see the train_test_split test_size documentation).
**kwargs passes along any other parameters the user supplies for the XGBRegressor.fit() function.
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split

class XGBRegressor_ES(XGBRegressor):

    def fit(self, X, y, *, eval_test_size=None, **kwargs):
        if eval_test_size is not None:
            params = super(XGBRegressor, self).get_xgb_params()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=eval_test_size, random_state=params['random_state'])
            eval_set = [(X_test, y_test)]
            # Could add (X_train, y_train) to eval_set
            # to get .eval_results() for both train and test
            # eval_set = [(X_train, y_train), (X_test, y_test)]
            kwargs['eval_set'] = eval_set
        else:
            # No validation split requested: train on all of the data
            X_train, y_train = X, y
        return super(XGBRegressor_ES, self).fit(X_train, y_train, **kwargs)
Example Usage
Below is a multistep pipeline that includes multiple transformations to X. The pipeline's fit() function passes the new evaluation parameter to the XGBRegressor_ES class above as xgbr__eval_test_size=200. In this example:
X_train contains text documents passed to the pipeline.
XGBRegressor_ES.fit() uses train_test_split() to select 200 records from X_train for the validation set and early stopping. (This could also be a percentage such as xgbr__eval_test_size=0.2)
The remaining records in X_train are passed along to XGBRegressor.fit() for the actual fit().
Early stopping may now kick in after 75 rounds without improvement, separately for each CV fold in a grid search.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression

xgbr_pipe = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                            ('vt', VarianceThreshold()),
                            ('scaler', StandardScaler()),
                            ('Sp', SelectPercentile()),
                            ('xgbr', XGBRegressor_ES(n_estimators=2000,
                                                     objective='reg:squarederror',
                                                     eval_metric='mae',
                                                     learning_rate=0.0001,
                                                     random_state=7))])

X_train = train_idxs['f_text'].values
y_train = train_idxs['Pct_Change_20'].values
Example Fitting the Pipeline:
%time xgbr_pipe.fit(X_train, y_train,
                    xgbr__eval_test_size=200,
                    xgbr__eval_metric='mae',
                    xgbr__early_stopping_rounds=75)
Example Fitting GridSearchCV:
from sklearn.model_selection import GridSearchCV

learning_rate = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
param_grid = dict(xgbr__learning_rate=learning_rate)

grid_search = GridSearchCV(xgbr_pipe, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=10)
grid_result = grid_search.fit(X_train, y_train,
                              xgbr__eval_test_size=200,
                              xgbr__eval_metric='mae',
                              xgbr__early_stopping_rounds=75)

GridSearch for Multilabel OneVsRestClassifier?

I'm doing a grid search over multilabel data as follows:
# imports
import numpy as np
from sklearn.svm import SVC as classifier
from sklearn.pipeline import Pipeline
from sklearn.decomposition import RandomizedPCA
from sklearn.multiclass import OneVsRestClassifier
from sklearn.cross_validation import StratifiedKFold
from sklearn.grid_search import GridSearchCV

# classifier pipeline
clf_pipeline = OneVsRestClassifier(
    Pipeline([('reduce_dim', RandomizedPCA()),
              ('clf', classifier())
              ])
)

C_range = 10.0 ** np.arange(-2, 9)
gamma_range = 10.0 ** np.arange(-5, 4)
n_components_range = (10, 100, 200)
degree_range = (1, 2, 3, 4)

param_grid = dict(estimator__clf__gamma=gamma_range,
                  estimator__clf__C=C_range,
                  estimator__clf__degree=degree_range,
                  estimator__reduce_dim__n_components=n_components_range)

grid = GridSearchCV(clf_pipeline, param_grid,
                    cv=StratifiedKFold(y=Y, n_folds=3), n_jobs=1,
                    verbose=2)

grid.fit(X, Y)
I'm seeing the following traceback:
/Users/andrewwinterman/Documents/sparks-honey/classifier/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit_grid_point(X, y, base_clf, clf_params, train, test, loss_func, score_func, verbose, **fit_params)
107
108 if y is not None:
--> 109 y_test = y[safe_mask(y, test)]
110 y_train = y[safe_mask(y, train)]
111 clf.fit(X_train, y_train, **fit_params)
TypeError: only integer arrays with one element can be converted to an index
It looks like GridSearchCV objects to multilabel targets. How should I work around this? Do I need to explicitly iterate through the unique classes with label_binarizer, running the grid search on each sub-estimator?
I think there is a bug in grid_search.py.
Have you tried passing Y as a numpy array?
import numpy as np
Y = np.asarray(Y)
