an issue with Machine Learning (Fitting a Model) - python

I'm using Anaconda Navigator with Python 3.6. When I run the following code I get this error:
from sklearn.metrics import make_scorer
from sklearn.tree import DecisionTreeRegressor
from sklearn.grid_search import GridSearchCV
from sklearn.cross_validation import ShuffleSplit  # needed for the 0.17-style ShuffleSplit call below
def fit_model(X, y):
    """ Performs grid search over the 'max_depth' parameter for a
        decision tree regressor trained on the input data [X, y]. """
    # Create cross-validation sets from the training data
    # sklearn version 0.18: ShuffleSplit(n_splits=10, test_size=0.1, train_size=None, random_state=None)
    # sklearn version 0.17: ShuffleSplit(n, n_iter=10, test_size=0.1, train_size=None, random_state=None)
    cv_sets = ShuffleSplit(X.shape[0], n_iter=10, test_size=0.20, random_state=0)
    # TODO: Create a decision tree regressor object
    regressor = DecisionTreeRegressor()
    # TODO: Create a dictionary for the parameter 'max_depth' with a range from 1 to 10
    params = {'max_depth': range(1, 10)}
    # TODO: Transform 'performance_metric' into a scoring function using 'make_scorer'
    scoring_fnc = make_scorer(performance_metric)
    # TODO: Create the grid search cv object --> GridSearchCV()
    # Make sure to include the right parameters in the object:
    # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
    grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)
    # Fit the grid search object to the data to compute the optimal model
    grid = grid.fit(X, y)
    # Return the optimal model after fitting the data
    return grid.best_estimator_
# Fit the training data to the model using grid search
reg = fit_model(X_train, y_train)
# Produce the value for 'max_depth'
print ("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
Here are the error messages:
ValueError Traceback (most recent call last)
<ipython-input-12-05857a84a7c5> in <module>()
1 # Fit the training data to the model using grid search
----> 2 reg = fit_model(X_train, y_train)
3
4 # Produce the value for 'max_depth'
5 print ("Parameter 'max_depth' is {} for the optimal model.".format(reg.get_params()['max_depth']))
<ipython-input-11-2c0c19498236> in fit_model(X, y)
26 # (estimator, param_grid, scoring, cv) which have values 'regressor', 'params', 'scoring_fnc', and 'cv_sets' respectively.
27
---> 28 grid = GridSearchCV(regressor, params, scoring_fnc, cv=cv_sets)
29
30 # Fit the grid search object to the data to compute the optimal model
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in __init__(self, estimator, param_grid, scoring, fit_params, n_jobs, iid, refit, cv, verbose, pre_dispatch, error_score)
819 refit, cv, verbose, pre_dispatch, error_score)
820 self.param_grid = param_grid
--> 821 _check_param_grid(param_grid)
822
823 def fit(self, X, y=None):
~/anaconda3/lib/python3.6/site-packages/sklearn/grid_search.py in _check_param_grid(param_grid)
349 if True not in check:
350 raise ValueError("Parameter values for parameter ({0}) need "
--> 351 "to be a sequence.".format(name))
352
353 if len(v) == 0:
ValueError: Parameter values for parameter (max_depth) need to be a sequence.

grid_search.py checks this:
check = [isinstance(v, k) for k in (list, tuple, np.ndarray)]
It seems like you can't use a range. I would try this:
params = {'max_depth': np.arange(1,10)}
or without numpy:
params = {'max_depth': [x for x in range(1,10)]}
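For what it's worth, here is a minimal standalone sketch (not from the original post) showing why the check rejects a Python 3 range but accepts a list or NumPy array:
import numpy as np
for v in (range(1, 10), list(range(1, 10)), np.arange(1, 10)):
    # mirrors the isinstance check used by grid_search._check_param_grid
    print(type(v).__name__, isinstance(v, (list, tuple, np.ndarray)))
# prints: range False, list True, ndarray True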


Error when trying to tune MLPClassifier hidden_layer_sizes using BayesSearchCV

When trying to tune the sklearn MLPClassifier hidden_layer_sizes hyperparameter using BayesSearchCV, I get the error: ValueError: can only convert an array of size 1 to a Python scalar.
However, when I use GridSearchCV, it works great! What am I missing?
Here goes a reproducible example:
from skopt import BayesSearchCV
from skopt.space import Real, Categorical, Integer
from sklearn.datasets import load_iris
from sklearn.neural_network import MLPClassifier
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
X, y = load_iris(True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75,
                                                    random_state=0)
# this does not work!
opt_bs = BayesSearchCV(MLPClassifier(),
                       {'learning_rate_init': Real(0.001, 0.05),
                        'solver': Categorical(["adam", 'sgd']),
                        'hidden_layer_sizes': Categorical([(10,5), (15,10,5)])},
                       n_iter=32,
                       random_state=0)
# this one does :)
opt_gs = GridSearchCV(MLPClassifier(),
                      {'learning_rate_init': [0.001, 0.05],
                       'solver': ["adam", 'sgd'],
                       'hidden_layer_sizes': [(10,5), (15,10,5)]})
# executes optimization using opt_gs or opt_bs
opt = opt_bs
res = opt.fit(X_train, y_train)
opt
Produces:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-64-78e6d29cae99> in <module>()
27 # executes optimization using opt_gs or opt_bs
28 opt = opt_bs
---> 29 res = opt.fit(X_train, y_train)
30 opt
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in fit(self, X, y, groups, callback)
678 optim_result = self._step(
679 X, y, search_space, optimizer,
--> 680 groups=groups, n_points=n_points_adjusted
681 )
682 n_iter -= n_points
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in _step(self, X, y, search_space, optimizer, groups, n_points)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in <listcomp>(.0)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
/usr/local/lib/python3.6/dist-packages/skopt/searchcv.py in <listcomp>(.0)
553
554 # convert parameters to python native types
--> 555 params = [[np.array(v).item() for v in p] for p in params]
556
557 # make lists into dictionaries
ValueError: can only convert an array of size 1 to a Python scalar
Unfortunately, BayesSearchCV accepts only parameters of type Categorical, Integer, or Real. In your case there is no issue with the learning_rate_init and solver parameters, as they are clearly defined as Real and Categorical respectively. The problem comes from hidden_layer_sizes, where you have declared the number of neurons as Categorical values that are tuples, and BayesSearchCV is not yet equipped to handle search spaces over tuples; refer here for more details on this. However, as a temporary hack, you could create your own wrapper around MLPClassifier so that the parameters of the estimator are recognized properly. Please refer to the following code snippet for a sample:
from skopt import BayesSearchCV
from skopt.space import Integer
from sklearn.datasets import load_iris
from sklearn.model_selection import train_test_split
from sklearn.neural_network import MLPRegressor, MLPClassifier
from sklearn.base import BaseEstimator, ClassifierMixin
import itertools
X, y = load_iris(True)
X_train, X_test, y_train, y_test = train_test_split(X, y,
                                                    train_size=0.75,
                                                    random_state=0)

class MLPWrapper(BaseEstimator, ClassifierMixin):
    def __init__(self, layer1=10, layer2=10, layer3=10):
        self.layer1 = layer1
        self.layer2 = layer2
        self.layer3 = layer3

    def fit(self, X, y):
        model = MLPClassifier(
            hidden_layer_sizes=[self.layer1, self.layer2, self.layer3]
        )
        model.fit(X, y)
        self.model = model
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.score(X, y)

opt = BayesSearchCV(
    estimator=MLPWrapper(),
    search_spaces={
        'layer1': Integer(10, 100),
        'layer2': Integer(10, 100),
        'layer3': Integer(10, 100)
    },
    n_iter=11
)
opt.fit(X_train, y_train)
opt.score(X_test,y_test)
0.9736842105263158
Note: This assumes that you build an MLP network with three hidden layers. You can modify it to suit your needs. It also becomes slightly tricky to write a class that constructs an MLP with an arbitrary number of layers; one way to approximate that is sketched below.
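A minimal, untested sketch of that idea, reusing the imports from the snippet above; the parameter names n_layers and layer_size are made up here for illustration and are not part of the scikit-learn or skopt APIs:
class VariableMLPWrapper(BaseEstimator, ClassifierMixin):
    # hypothetical wrapper: every hidden layer gets the same width, and the
    # depth itself becomes a searchable integer parameter
    def __init__(self, n_layers=2, layer_size=10):
        self.n_layers = n_layers
        self.layer_size = layer_size

    def fit(self, X, y):
        self.model = MLPClassifier(hidden_layer_sizes=(self.layer_size,) * self.n_layers)
        self.model.fit(X, y)
        return self

    def predict(self, X):
        return self.model.predict(X)

    def score(self, X, y):
        return self.model.score(X, y)

opt = BayesSearchCV(
    estimator=VariableMLPWrapper(),
    search_spaces={'n_layers': Integer(1, 3), 'layer_size': Integer(10, 100)},
    n_iter=11
)
This keeps all hidden layers the same width, which is a restriction, but it lets BayesSearchCV explore network depth without needing tuple-valued search spaces.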

If no scoring is specified, the estimator passed should have a 'score' method

I am doing hyperparameter tuning and I wrote this code (from this tutorial: https://machinelearningmastery.com/grid-search-hyperparameters-deep-learning-models-python-keras/?unapproved=524264&moderation-hash=83f45bd57dd6c1c5e37699b257905830#comment-524264):
from sklearn.model_selection import GridSearchCV
import numpy as np

# fix random seed for reproducibility
seed = 7
np.random.seed(seed)
# define the grid search parameters
batch_size = [10, 20, 40, 60, 80, 100]
epochs = [10, 50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
grid_result = grid.fit(scaled_X, y)
# summarize results
print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
means = grid_result.cv_results_['mean_test_score']
stds = grid_result.cv_results_['std_test_score']
params = grid_result.cv_results_['params']
for mean, stdev, param in zip(means, stds, params):
    print("%f (%f) with: %r" % (mean, stdev, param))
And this is the error I got:
TypeError Traceback (most recent call last)
<ipython-input-39-3821841029c0> in <module>
11 param_grid = dict(batch_size=batch_size, epochs=epochs)
12 grid = GridSearchCV(estimator=model, param_grid=param_grid, n_jobs=-1, cv=3)
---> 13 grid_result = grid.fit(scaled_X, y)
14 # summarize results
15 print("Best: %f using %s" % (grid_result.best_score_, grid_result.best_params_))
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
607
608 scorers, self.multimetric_ = _check_multimetric_scoring(
--> 609 self.estimator, scoring=self.scoring)
610
611 if self.multimetric_:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\scorer.py in _check_multimetric_scoring(estimator, scoring)
340 if callable(scoring) or scoring is None or isinstance(scoring,
341 str):
--> 342 scorers = {"score": check_scoring(estimator, scoring=scoring)}
343 return scorers, False
344 else:
~\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\scorer.py in check_scoring(estimator, scoring, allow_none)
293 "If no scoring is specified, the estimator passed should "
294 "have a 'score' method. The estimator %r does not."
--> 295 % estimator)
296 elif isinstance(scoring, Iterable):
297 raise ValueError("For evaluating multiple scores, use "
TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator <keras.engine.sequential.Sequential object at 0x0000025F8892C248> does not.
I have tried the answer from:
Scikit-learn TypeError: If no scoring is specified, the estimator passed should have a 'score' method
I think you need to specify the type of score to use in GridSearchCV, because GridSearchCV maximizes a score over the grid of parameters. For example, for a classification problem you can use the f1, precision, or recall scores. If scoring is None, GridSearchCV will use the estimator's score method.
from sklearn.metrics import make_scorer
from sklearn.metrics import accuracy_score, precision_score, recall_score
from sklearn.model_selection import GridSearchCV

scorers = {
    'precision_score': make_scorer(precision_score),
    'recall_score': make_scorer(recall_score),
    'accuracy_score': make_scorer(accuracy_score)
}
grid_search_cv = GridSearchCV(model, param_grid, verbose=1, cv=3, scoring=scorers, refit="precision_score")
Check the scoring parameter documentation for GridSearchCV; you can use the built-in scoring parameters or define your own function.
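As a small illustrative sketch of the "define your own function" option (the metric here is made up for demonstration, and model / param_grid are the objects from the question):
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV

def false_negative_rate(y_true, y_pred):
    # toy metric: fraction of actual positives that were missed
    y_true, y_pred = np.asarray(y_true), np.asarray(y_pred)
    positives = (y_true == 1).sum()
    return ((y_true == 1) & (y_pred == 0)).sum() / max(positives, 1)

custom_scorer = make_scorer(false_negative_rate, greater_is_better=False)
grid_search_cv = GridSearchCV(model, param_grid, cv=3, scoring=custom_scorer)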
I had the same problem.
I needed to run:
estimator = KerasClassifier(build_fn=baseline_model, epochs=10, batch_size=10, verbose=1)
and then change the estimator:
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, n_jobs=-1, cv=3)
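For completeness, a minimal sketch of how the pieces fit together with the legacy keras.wrappers.scikit_learn wrapper; baseline_model and its layer sizes below are made-up placeholders, not taken from the question:
from keras.models import Sequential
from keras.layers import Dense
from keras.wrappers.scikit_learn import KerasClassifier
from sklearn.model_selection import GridSearchCV

def baseline_model():
    # hypothetical network; adjust input_dim and layers to your data
    model = Sequential()
    model.add(Dense(12, input_dim=8, activation='relu'))
    model.add(Dense(1, activation='sigmoid'))
    model.compile(loss='binary_crossentropy', optimizer='adam', metrics=['accuracy'])
    return model

estimator = KerasClassifier(build_fn=baseline_model, verbose=0)
param_grid = dict(batch_size=[10, 20, 40], epochs=[10, 50])
grid = GridSearchCV(estimator=estimator, param_grid=param_grid, cv=3)
# grid.fit(scaled_X, y)  # scaled_X and y come from the original question
Wrapping the Keras model in KerasClassifier is what gives GridSearchCV the 'score' method it was complaining about.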

Compute yScore of Learning Algorithm

I'm quite new to the ML Python environment. I need to plot the precision/recall graph, and as stated in this post: https://scikit-learn.org/stable/auto_examples/model_selection/plot_precision_recall.html you need to compute the y_score:
# Create a simple classifier
classifier = svm.LinearSVC(random_state=random_state)
classifier.fit(X_train, y_train)
y_score = classifier.decision_function(X_test)
So the question is: how can I compute the score using Multinomial NaiveBayes or LearningTree? In my code I have:
print("MultinomialNB - countVectorizer")
xTrain, xTest, yTrain, yTest=countVectorizer(db)
classifier = MultinomialNB()
model = classifier.fit(xTrain, yTrain)
yPred = model.predict(xTest)
print("confusion Matrix of MNB/ cVectorizer:\n")
print(confusion_matrix(yTest, yPred))
print("\n")
print("classificationReport Matrix of MNB/ cVectorizer:\n")
print(classification_report(yTest, yPred))
elapsed_time = time.time() - start_time
print("elapsed Time: %.3fs" %elapsed_time)
Plot function:
def plotLearningAlgorithm(yTest, yScore, algName):
    precision, recall, _ = precision_recall_curve(yTest, yScore)
    plt.step(recall, precision, color='b', alpha=0.2,
             where='post')
    plt.fill_between(recall, precision, alpha=0.2, color='b', **step_kwargs)
    plt.xlabel('Recall')
    plt.ylabel('Precision')
    plt.ylim([0.0, 1.05])
    plt.xlim([0.0, 1.0])
    plt.title('2-class Precision-Recall ' + algName + ' curve: AP={0:0.2f}'.format(average_precision))
Error with plot:
<ipython-input-43-d07c3365bfc2> in MultinomialNaiveBayesOPT()
11 yPred = model.predict(xTest)
12
---> 13 plotLearningAlgorithm(yTest,model.predict_proba(xTest),"MultinomialNB - countVectorizer")
14
15 print("confusion Matrix of MNB/ cVectorizer:\n")
<ipython-input-42-260aac9918f2> in plotLearningAlgorithm(yTest, yScore, algName)
1 def plotLearningAlgorithm(yTest,yScore,algName):
2
----> 3 precision, recall, _ = precision_recall_curve(yTest, yScore)
4
5 step_kwargs = ({'step': 'post'}
/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/ranking.py in precision_recall_curve(y_true, probas_pred, pos_label, sample_weight)
522 fps, tps, thresholds = _binary_clf_curve(y_true, probas_pred,
523 pos_label=pos_label,
--> 524 sample_weight=sample_weight)
525
526 precision = tps / (tps + fps)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/metrics/ranking.py in _binary_clf_curve(y_true, y_score, pos_label, sample_weight)
398 check_consistent_length(y_true, y_score, sample_weight)
399 y_true = column_or_1d(y_true)
--> 400 y_score = column_or_1d(y_score)
401 assert_all_finite(y_true)
402 assert_all_finite(y_score)
/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/validation.py in column_or_1d(y, warn)
758 return np.ravel(y)
759
--> 760 raise ValueError("bad input shape {0}".format(shape))
761
762
ValueError: bad input shape (9000, 2)
Where db contains my dataset already divided between train set and test set.
Any suggestions?
Solution:
def plot_pr(y_pred, y_true, l):
    precision, recall, thresholds = precision_recall_curve(y_true, y_pred, pos_label=l)
    return precision, recall

def plotPrecisionRecall(xTest, yTest, yPred, learningName, model):
    yPred_probability = model.predict_proba(xTest)
    yPred_probability = yPred_probability[:, 1]
    no_skill_probs = [0 for _ in range(len(yTest))]
    ns_precision, ns_recall, _ = precision_recall_curve(yTest, no_skill_probs, pos_label="L")
    precision, rec = plot_pr(yPred_probability, yTest, "L")
    plt.title(learningName)
    plt.plot(ns_recall, ns_precision, linestyle='--', label='No Skill')
    plt.plot(rec, precision, label='Skill')
    plt.xlabel("Recall")
    plt.ylabel("Precision")
    plt.legend()
    plt.show()
So as it turns out, the predicted probabilities needed to be transformed with:
yPred_probability = yPred_probability[:,1]
A big thank you to @ignoring_gravity for pointing me to the right solution; I've also plotted the no-skill line for extra readability of the graph.
What they call y_score is just the predicted probabilities output by your ML algorithm.
In multinomial NB and in a decision tree (I suppose that's what you mean by LearningTree?), you can get these with the method .predict_proba:
classifier = MultinomialNB()
model = classifier.fit(xTrain, yTrain)
yPred = model.predict_proba(xTest)
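Note that for a binary problem .predict_proba returns one column per class (hence the (9000, 2) shape in the error), while precision_recall_curve expects the scores of the positive class only. A minimal sketch, assuming the positive class is the second entry of model.classes_:
from sklearn.metrics import precision_recall_curve
yScore = model.predict_proba(xTest)[:, 1]  # probability of the positive class
precision, recall, _ = precision_recall_curve(yTest, yScore, pos_label=model.classes_[1])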
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split

# CountVectorizer is not used for the train/test split; use train_test_split instead
train_x, test_x, train_y, test_y = train_test_split(X, y, test_size=0.33, random_state=42)  # here X is your textual data, y is your target
vect = CountVectorizer()
countMatrix_train = vect.fit_transform(train_x)  # fit on your train data
countMatrix_test = vect.transform(test_x)        # transform (not fit_transform) the test data according to the train data
classifier = MultinomialNB()
classifier.fit(countMatrix_train, train_y)
ypred = classifier.predict(countMatrix_test)  # this gives the predicted class for your test data; use it for the classification report

Sklearn pass fit() parameters to xgboost in pipeline

Similar to How to pass a parameter to only one part of a pipeline object in scikit learn? I want to pass parameters to only one part of a pipeline. Usually, it should work fine like:
estimator = XGBClassifier()
pipeline = Pipeline([
('clf', estimator)
])
and executed like
pipeline.fit(X_train, y_train, clf__early_stopping_rounds=20)
but it fails with:
/usr/local/lib/python3.5/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
114 """
115 Xt, yt, fit_params = self._pre_transform(X, y, **fit_params)
--> 116 self.steps[-1][-1].fit(Xt, yt, **fit_params)
117 return self
118
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/sklearn.py in fit(self, X, y, sample_weight, eval_set, eval_metric, early_stopping_rounds, verbose)
443 early_stopping_rounds=early_stopping_rounds,
444 evals_result=evals_result, obj=obj, feval=feval,
--> 445 verbose_eval=verbose)
446
447 self.objective = xgb_options["objective"]
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/training.py in train(params, dtrain, num_boost_round, evals, obj, feval, maximize, early_stopping_rounds, evals_result, verbose_eval, learning_rates, xgb_model, callbacks)
201 evals=evals,
202 obj=obj, feval=feval,
--> 203 xgb_model=xgb_model, callbacks=callbacks)
204
205
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/training.py in _train_internal(params, dtrain, num_boost_round, evals, obj, feval, xgb_model, callbacks)
97 end_iteration=num_boost_round,
98 rank=rank,
---> 99 evaluation_result_list=evaluation_result_list))
100 except EarlyStopException:
101 break
/usr/local/lib/python3.5/site-packages/xgboost-0.6-py3.5.egg/xgboost/callback.py in callback(env)
196 def callback(env):
197 """internal function"""
--> 198 score = env.evaluation_result_list[-1][1]
199 if len(state) == 0:
200 init(env)
IndexError: list index out of range
Whereas a
estimator.fit(X_train, y_train, early_stopping_rounds=20)
works just fine.
For the early stopping rounds, you must always specify the validation set given by the argument eval_set. Here is how the error in your code can be fixed.
pipeline.fit(X_train, y_train, clf__early_stopping_rounds=20, clf__eval_set=[(test_X, test_y)])
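For reference, a minimal end-to-end sketch of that fix; the validation split and variable names here are illustrative, and it assumes an xgboost version where early_stopping_rounds is still a fit() argument, as in the question:
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from xgboost import XGBClassifier

X_tr, X_val, y_tr, y_val = train_test_split(X_train, y_train, test_size=0.2, random_state=0)

pipeline = Pipeline([
    ('clf', XGBClassifier())
])

# prefix fit parameters with the step name ('clf__') so the Pipeline
# routes them to XGBClassifier.fit()
pipeline.fit(X_tr, y_tr,
             clf__early_stopping_rounds=20,
             clf__eval_set=[(X_val, y_val)])
If the pipeline contains preprocessing steps, the eval_set has to be transformed with those steps first, which is exactly the complication discussed in the answers below.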
I recently used the following steps to use the eval_metric and eval_set parameters for XGBoost.
1. Create the pipeline with the pre-processing/feature transformation steps.
This was made from a pipeline defined earlier which includes the xgboost model as the last step.
pipeline_temp = pipeline.Pipeline(pipeline.cost_pipe.steps[:-1])
2. Fit this Pipeline
X_trans = pipeline_temp.fit_transform(X_train[FEATURES],y_train)
3. Create your eval_set by applying the transformations to the test set
eval_set = [(X_trans, y_train), (pipeline_temp.transform(X_test), y_test)]
4. Add your xgboost step back into the Pipeline
pipeline_temp.steps.append(pipeline.cost_pipe.steps[-1])
5. Fit the new pipeline, passing the parameters.
pipeline_temp.fit(X_train[FEATURES], y_train,
                  xgboost_model__eval_metric=ERROR_METRIC,
                  xgboost_model__eval_set=eval_set)
6. Persist the Pipeline if you wish to.
joblib.dump(pipeline_temp, save_path)
This is the solution: https://www.kaggle.com/c/otto-group-product-classification-challenge/forums/t/13755/xgboost-early-stopping-and-other-issues. Both early_stopping_rounds and the watchlist / eval_set need to be passed. Unfortunately, this does not work for me, as the variables in the watchlist would require a preprocessing step which is only applied in the pipeline; I would need to apply this step manually.
Here's a solution that works in a Pipeline with GridSearchCV: over-ride the XGBRegressor or XGBClassifier .fit() function.
This step uses train_test_split() to select the specified number of validation records from X for the eval_set and then passes the remaining records along to fit().
A new parameter eval_test_size is added to .fit() to control the number of validation records (see the train_test_split test_size documentation).
**kwargs passes along any other parameters added by the user for the XGBRegressor.fit() function.
from xgboost.sklearn import XGBRegressor
from sklearn.model_selection import train_test_split

class XGBRegressor_ES(XGBRegressor):

    def fit(self, X, y, *, eval_test_size=None, **kwargs):
        if eval_test_size is not None:
            params = super(XGBRegressor, self).get_xgb_params()
            X_train, X_test, y_train, y_test = train_test_split(
                X, y, test_size=eval_test_size, random_state=params['random_state'])
            eval_set = [(X_test, y_test)]
            # Could add (X_train, y_train) to eval_set
            # to get .eval_results() for both train and test
            # eval_set = [(X_train, y_train), (X_test, y_test)]
            kwargs['eval_set'] = eval_set
        else:
            # no validation split requested: train on all the data
            X_train, y_train = X, y
        return super(XGBRegressor_ES, self).fit(X_train, y_train, **kwargs)
Example Usage
Below is a multistep pipeline that includes multiple transformations to X. The pipeline's fit() function passes the new evaluation parameter to the XGBRegressor_ES class above as xgbr__eval_test_size=200. In this example:
X_train contains text documents passed to the pipeline.
XGBRegressor_ES.fit() uses train_test_split() to select 200 records from X_train for the validation set and early stopping. (This could also be a percentage such as xgbr__eval_test_size=0.2)
The remaining records in X_train are passed along to XGBRegressor.fit() for the actual fit().
Early stopping may now occur after 75 rounds of unchanged boosting for each cv fold in a gridsearch.
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import StandardScaler
from sklearn.feature_selection import SelectPercentile, f_regression
xgbr_pipe = Pipeline(steps=[('tfidf', TfidfVectorizer()),
                            ('vt', VarianceThreshold()),
                            ('scaler', StandardScaler()),
                            ('Sp', SelectPercentile()),
                            ('xgbr', XGBRegressor_ES(n_estimators=2000,
                                                     objective='reg:squarederror',
                                                     eval_metric='mae',
                                                     learning_rate=0.0001,
                                                     random_state=7))])
X_train = train_idxs['f_text'].values
y_train = train_idxs['Pct_Change_20'].values
Example Fitting the Pipeline:
%time xgbr_pipe.fit(X_train, y_train,
                    xgbr__eval_test_size=200,
                    xgbr__eval_metric='mae',
                    xgbr__early_stopping_rounds=75)
Example Fitting GridSearchCV:
learning_rate = [0.0001, 0.001, 0.01, 0.05, 0.1, 0.2, 0.3]
param_grid = dict(xgbr__learning_rate=learning_rate)
grid_search = GridSearchCV(xgbr_pipe, param_grid, scoring="neg_mean_absolute_error", n_jobs=-1, cv=10)
grid_result = grid_search.fit(X_train, y_train,
                              xgbr__eval_test_size=200,
                              xgbr__eval_metric='mae',
                              xgbr__early_stopping_rounds=75)

GridSearch with SVM producing IndexError

I'm building a classifier using an SVM and want to perform a Grid Search to help automate finding the optimal model. Here's the code:
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV
from sklearn.multiclass import OneVsRestClassifier
X.shape # (22343, 323)
y.shape # (22343, 1)
X_train, X_test, y_train, y_test = train_test_split(
    X, Y, test_size=0.4, random_state=0
)
tuned_parameters = [
    {
        'estimator__kernel': ['rbf'],
        'estimator__gamma': [1e-3, 1e-4],
        'estimator__C': [1, 10, 100, 1000]
    },
    {
        'estimator__kernel': ['linear'],
        'estimator__C': [1, 10, 100, 1000]
    }
]
model_to_set = OneVsRestClassifier(SVC(), n_jobs=-1)
clf = GridSearchCV(model_to_set, tuned_parameters)
clf.fit(X_train, y_train)
and I get the following error message (this isn't the whole stack trace, just the last 3 calls):
----------------------------------------------------
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in split(self, X, y, groups)
88 X, y, groups = indexable(X, y, groups)
89 indices = np.arange(_num_samples(X))
---> 90 for test_index in self._iter_test_masks(X, y, groups):
91 train_index = indices[np.logical_not(test_index)]
92 test_index = indices[test_index]
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in _iter_test_masks(self, X, y, groups)
606
607 def _iter_test_masks(self, X, y=None, groups=None):
--> 608 test_folds = self._make_test_folds(X, y)
609 for i in range(self.n_splits):
610 yield test_folds == i
/anaconda/lib/python3.5/site-packages/sklearn/model_selection/_split.py in _make_test_folds(self, X, y, groups)
593 for test_fold_indices, per_cls_splits in enumerate(zip(*per_cls_cvs)):
594 for cls, (_, test_split) in zip(unique_y, per_cls_splits):
--> 595 cls_test_folds = test_folds[y == cls]
596 # the test split can be too big because we used
597 # KFold(...).split(X[:max(c, n_splits)]) when data is not 100%
IndexError: too many indices for array
Also, when I try reshaping the arrays so that the y is (22343,) I find that the GridSearch never finishes even if I set the tuned_parameters to only default values.
And here are the versions for all of the packages if that helps:
Python: 3.5.2
scikit-learn: 0.18
pandas: 0.19.0
It seems that there is no error in your implementation.
However, as mentioned in the sklearn documentation, the SVC "fit time complexity is more than quadratic with the number of samples which makes it hard to scale to dataset with more than a couple of 10000 samples". See the documentation here.
In your case you have 22343 samples, which can lead to computational problems/memory issues. That is why the grid search takes so long even with default parameter values. Try to reduce your training set to 10000 samples or fewer; a small sketch of that, together with flattening y, follows.
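A minimal sketch of both workarounds mentioned in this thread: flattening y to shape (n_samples,) to avoid the indexing error, and subsampling the training data so the SVC grid search stays tractable. The names X_small / y_small and the 10000-sample cap are illustrative:
import numpy as np
from sklearn.model_selection import train_test_split

y = np.ravel(y)  # (22343, 1) -> (22343,), avoids "too many indices for array"

# keep at most ~10000 samples for the grid search
if X.shape[0] > 10000:
    X_small, _, y_small, _ = train_test_split(
        X, y, train_size=10000, stratify=y, random_state=0)
else:
    X_small, y_small = X, y

clf.fit(X_small, y_small)  # clf is the GridSearchCV object defined in the question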
