I'm trying to build a pipeline with my own functions. To do so, I inherited from BaseEstimator and TransformerMixin in sklearn.base and defined my own transform methods.
When I do pipeline.fit(X,y), it works fine.
The problem is when I try to create a GridSearchCV object with the pipeline. I get the following error:
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36).
730 is just the number of rows of matrix X divided by cv = 2, the number of folds I chose for the cross-validation in GridSearchCV.
I have no idea how to debug this. I've tried adding some prints in the middle of my functions, and the results are pretty weird.
I'm attaching the functions I created as well as the pipeline. I'd be really glad if someone could help.
Here are the functions I created for the Pipeline:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class MissingData(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None, strategies=("most_frequent", "mean")):
        print('Started MissingData')
        X_ = X.copy()
        # Categorical variables handling
        categorical_variables = list(X_.select_dtypes(include=['category', 'object']))
        imp_category = SimpleImputer(strategy=strategies[0])
        X_[categorical_variables] = pd.DataFrame(imp_category.fit_transform(X_[categorical_variables]))
        # Numeric variables handling
        numerical_variables = list(set(X_.columns) - set(categorical_variables))
        imp_numerical = SimpleImputer(strategy=strategies[1])
        X_[numerical_variables] = pd.DataFrame(imp_numerical.fit_transform(X_[numerical_variables]))
        print('Finished MissingData')
        print('Inf: ', X_.isnull().sum().sum())
        return X_
class OHEncode(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def encode_and_drop_original_and_first_dummy(self, df, feature_to_encode):
        # drop_first=True takes care of the dummy variable trap
        dummies = pd.get_dummies(df[feature_to_encode], prefix=feature_to_encode, drop_first=True)
        res = pd.concat([df, dummies], axis=1)
        res = res.drop([feature_to_encode], axis=1)
        return res

    def transform(self, X, y=None, categorical_variables=None):
        X_ = X.copy()
        if categorical_variables is None:
            categorical_variables = list(X_.select_dtypes(include=['category', 'object']))
        print('Started Encoding')
        # Update the matrix X with the one-hot encoded version of all features in categorical_variables
        for feature_to_encode in categorical_variables:
            X_ = self.encode_and_drop_original_and_first_dummy(X_, feature_to_encode)
        print('Finished Encoding')
        print('Inf: ', X_.isnull().sum().sum())
        return X_
Here is the Pipeline with the GridSearchCV:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LinearRegression
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

pca = PCA(n_components=10)
pipeline = Pipeline([('MissingData', MissingData()), ('OHEncode', OHEncode()),
                     ('scaler', StandardScaler()), ('pca', pca), ('rf', LinearRegression())])
parameters = {'pca__n_components': [5, 15, 30, 45, 64]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv = 2)
grid.fit(X, y)
And finally here is the full output including my prints and the error:
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
Started MissingData
Finished MissingData
Inf: 0
Started Encoding
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:765: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:706: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs)
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
FitFailedWarning)
Finished Encoding
Inf: 0
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-67-f78b56dad89d> in <module>
15
16 #pipeline.set_params(rf__n_estimators = 50)
---> 17 grid.fit(X, y)
18
19 #rf_val_predictions = pipeline.predict(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
710 return results
711
--> 712 self._run_search(evaluate_candidates)
713
714 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1151 def _run_search(self, evaluate_candidates):
1152 """Search all candidates in param_grid"""
-> 1153 evaluate_candidates(ParameterGrid(self.param_grid))
1154
1155
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
689 for parameters, (train, test)
690 in product(candidate_params,
--> 691 cv.split(X, y, groups)))
692
693 if len(out) < 1:
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
611 Xt = X
612 for _, name, transform in self._iter(with_final=False):
--> 613 Xt = transform.transform(Xt)
614 score_params = {}
615 if sample_weight is not None:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_data.py in transform(self, X, copy)
804 else:
805 if self.with_mean:
--> 806 X -= self.mean_
807 if self.with_std:
808 X /= self.scale_
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36)
First, I would suggest that you use the OneHotEncoder (OHE) class from sklearn. Then, in the constructor of OHEncode, define an OHE object and fit it with all the categorical values you have (to make them "seen" at each GridSearch iteration). Then, in the transform function of OHEncode, apply the transform using that OHE object.
DON'T fit the OHE object inside the fit function, because then you will get the same error: at each GridSearch iteration, the fit and transform functions are applied.
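A rough sketch of that suggestion follows. The all_categorical_data constructor argument and the column handling are assumed names, sparse=False assumes an older sklearn (newer releases use sparse_output=False), and the categorical columns handed to the constructor are assumed to have no missing values (or to have been imputed beforehand):
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class OHEncode(BaseEstimator, TransformerMixin):
    def __init__(self, all_categorical_data=None):
        # all_categorical_data: the categorical columns of the FULL dataset,
        # so every level is "seen" before GridSearchCV ever splits X
        self.all_categorical_data = all_categorical_data
        self.ohe_ = OneHotEncoder(handle_unknown='ignore', sparse=False)
        if all_categorical_data is not None:
            self.ohe_.fit(all_categorical_data)

    def fit(self, X, y=None):
        # deliberately a no-op: the encoder was already fitted on the full data,
        # so each CV fold produces the same set of dummy columns
        return self

    def transform(self, X):
        cols = list(self.all_categorical_data.columns)
        dummies = pd.DataFrame(self.ohe_.transform(X[cols]), index=X.index)
        return pd.concat([X.drop(columns=cols), dummies], axis=1)

# Usage sketch:
# cat_cols = list(X.select_dtypes(include=['category', 'object']))
# pipeline = Pipeline([('MissingData', MissingData()),
#                      ('OHEncode', OHEncode(X[cat_cols])),
#                      ('scaler', StandardScaler()), ('pca', pca), ('rf', LinearRegression())])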
Related
I'm trying to utilize make_pipeline() from scikit-learn along with GridSearchCV(). The Pipeline is simple and only includes two steps, a StandardScaler() and an MLPRegressor(). The GridSearchCV() is also pretty simple, with the slight wrinkle that I'm using TimeSeriesSplit() for cross-validation.
The error I'm getting is as follows:
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),('mlpregressor', MLPRegressor())]). Check the list of available parameters with estimator.get_params().keys().
Can someone help me understand how I can rectify this problem so I can use the make_pipeline() framework with both GridSearchCV() and MLPRegressor()?
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
import numpy as np

tscv = TimeSeriesSplit(n_splits=5)
pipe = make_pipeline(StandardScaler(), MLPRegressor())
param_grid = {'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
              'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
              'MLPRegressor__solver': ['adam', 'sgd']}
grid = GridSearchCV(pipe, param_grid=param_grid, cv=tscv)
features = np.random.random([1000, 10])
target = np.random.normal(0, 10, 1000)
grid.fit(features, target)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-7233f9f2005e> in <module>
----> 1 grid.fit(features, target)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
584 cloned_parameters[k] = clone(v, safe=False)
585
--> 586 estimator = estimator.set_params(**cloned_parameters)
587
588 start_time = time.time()
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
148 self
149 """
--> 150 self._set_params('steps', **kwargs)
151 return self
152
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
Solution
Yes. Make the pipeline first. Then treat the pipeline as your model and pass it to GridSearchCV.
Your problem is in the param_grid keys: the step prefix is mislabeled.
Replace MLPRegressor__ with mlpregressor__.
The Fix:
The pipeline named_step for the MLPRegressor estimator was mislabeled as MLPRegressor__ in the param_grid.
Changing it to mlpregressor__ fixed the problem.
# INCORRECT
param_grid = {
'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'MLPRegressor__solver': ['adam', 'sgd'],
}
# CORRECTED
param_grid = {
'mlpregressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'mlpregressor__solver': ['adam', 'sgd'],
}
Note
The key to understanding what was wrong here was to observe the last two lines of the error stack.
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
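If in doubt, you can list the exact parameter names the pipeline accepts; a small sketch:
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import make_pipeline

pipe = make_pipeline(StandardScaler(), MLPRegressor())
# make_pipeline names each step after the lowercased class name, so the valid
# keys are prefixed with 'mlpregressor__', not 'MLPRegressor__'
print(sorted(pipe.get_params().keys()))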
I'm trying to use GridSearchCV to search over specified parameters, scoring with negative log loss:
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = 'neg_log_loss', cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.
The same code but scoring with accuracy works fine. I found that for log loss we need to specify the labels, which works fine when using sklearn.metrics:
y_labels = np.unique(y_true)
y_labels
array([0., 1., 2.])
metrics.log_loss(y_true, y_pred, labels = y_labels )
So I tried:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
ValueError: not enough values to unpack (expected 2, got 1)
I've tried quite a few variations of the above and also creating my own scorer with:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
But everything I try comes down to one of the two above errors. Obviously I'm missing something, so any help much appreciated.
Update:
Made a small mistake in the above - this is a three-class problem, not a binary problem as I first implied.
I've tried Ben's suggestion (thanks!):
LogLoss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(order_inner_x, y_inner, groups=names_inner)
I'm getting a different error, so hopefully I'm one step closer. Here's the full traceback:
ValueError Traceback (most recent call last)
<ipython-input-164-43d9f1633dc9> in <module>
2
3 grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
----> 4 grid.fit(order_inner_x, y_inner, groups=names_inner)
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
133 ' but need classifier with two'
134 ' classes for {} scoring'.format(
--> 135 y_pred.shape, self._score_func.__name__))
136 if sample_weight is not None:
137 return self._sign * self._score_func(y, y_pred,
ValueError: got predict_proba of shape (200, 3), but need classifier with two classes for log_loss scoring
You're most of the way there: you need to provide the labels to your metric. In this attempt:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
you pass the labels, but to the grid search's fit method rather than the scoring parameter itself.
make_scorer allows other keyword arguments to be passed to the metric function, so this should work:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
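For the three-class problem described in the update, the same pattern applies with all three classes listed, i.e. labels=[0, 1, 2], as in the snippet shown there.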
I am learning RandomizedSearchCV with a toy example. Suppose that I want to build a linear model y = ax + b. I wrote a custom sklearn estimator that looks like the following:
import numpy as np
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
class testEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=1, beta=0):
        self.alpha = alpha
        self.beta = beta

    def fit(self, X, y=None):
        mu = np.ones((len(X)))
        for ii in range(len(X)):
            mu[ii] = self.alpha*X[ii] + self.beta
        self.mu_ = mu
        return self

    def predict(self, X):
        try:
            getattr(self, "mu_")
        except:
            raise RuntimeError("You must train the classifier before predicting data!")
        return self.mu_

    def score(self, X, y):
        print("y: ", y)
        print("mu: ", self.mu_)
        return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
Then, I need to test this.
# temp estimator
tempEs = testEstimator()
# temp params grid
params_grid_temp = {'alpha': [0,1,2,3,4,5,6], 'beta': [0,1,2,3,4]}
# test randomizedSearchCV
temp = RandomizedSearchCV(tempEs, params_grid_temp)
# define X,y
X = range(10)
y = np.dot(2, range(10)) + 4
# fit model
temp.fit(X,y)
However, I got the following error:
ValueError Traceback (most recent call last)
<ipython-input-8-72a46fdf9098> in <module>
9 y = np.dot(2, range(10)) + 4
10 # fit model
---> 11 temp.fit(X,y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
88 *args, **kwargs)
89 else:
---> 90 score = scorer(estimator, *args, **kwargs)
91 scores[name] = score
92 return scores
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
370 def _passthrough_scorer(estimator, *args, **kwargs):
371 """Function that wraps estimator.score"""
--> 372 return estimator.score(*args, **kwargs)
373
374
<ipython-input-7-0c2138d9bf96> in score(self, X, y)
20 print("y: ", y)
21 print("mu: ", self.mu_)
---> 22 return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
ValueError: operands could not be broadcast together with shapes (2,) (8,)
I figured out (2,) means the size of y and (8,) means the size of self.mu_. How does this happen? They are supposed to be 10.
Okay, I found the problem. Your predict method is wrong: it must return predictions for the X it receives instead of returning the stored mu_. That also explains the shapes: during cross-validation the estimator is fitted on one fold (8 samples here) and scored on the held-out fold (2 samples), so mu_ and y have different lengths.
def predict(self, X):
    return self.alpha*X + self.beta
That's it. You can also optimize the code in your fit method; see the sketch below.
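For example, a vectorized version of the estimator (a minimal sketch; np.asarray is used defensively here in case X arrives as a plain sequence such as range):
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class testEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=1, beta=0):
        self.alpha = alpha
        self.beta = beta

    def fit(self, X, y=None):
        # vectorized: no Python loop over the samples
        self.mu_ = self.alpha * np.asarray(X) + self.beta
        return self

    def predict(self, X):
        # predictions for the X actually passed in, not the stored training mu_
        return self.alpha * np.asarray(X) + self.beta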
I am testing some open-source code, but I keep running into this error. I am passing a CRF model into cross_val_predict, and that's where the error starts. Pystruct is being used for the CRF model. The CRF model is built from sparse matrices, which I think is causing the problem, but I am not entirely sure.
This is the code:
def evaluate_dataset(config):
    # Parameters
    folds = config['folds']
    results = {}
    crf = config['clf']
    sample_size = config['sample_size'] if 'sample_size' in config else None
    paths = list(Path(config['dir']).iterdir())
    preprocessor = SectionPreprocess(ground_truth=True)
    paths, X, y = preprocessor.preprocess(paths, sample_size=sample_size)
    # Dataset information
    results['num_binaries'] = len(y)
    results['avg_code_fraction'] = np.mean([np.mean(yy) for yy in y])
    results['params'] = crf.get_params()
    # Cross-validation
    # logreg = LogisticRegression()
    results['cv_folds'] = folds
    start = time.time()
    y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=1)
    end = time.time()
    results['cv_time'] = (end - start) / folds
    values = [
        (metrics.accuracy_score(true, pred),) + metrics.precision_recall_fscore_support(true, pred, average='binary')
        for (true, pred) in zip(y, y_pred)
    ]
    accuracy, precision, recall, f1, _ = zip(*values)
    results['cv_metrics'] = {
        'accuracy': (np.mean(accuracy), np.std(accuracy)),
        'precision': (np.mean(precision), np.std(precision)),
        'recall': (np.mean(recall), np.std(recall)),
        'f1': (np.mean(f1), np.std(f1)),
    }
    return results

evaluation_folder = Path('evaluation')
evaluation_folder.mkdir(exist_ok=True)

for (name, config) in configurations.items():
    print(config)
    %time results = evaluate_dataset(config)
    print("## " + name)
    print(results)
    print('')
    file_path = evaluation_folder / (name + '.json')
    with file_path.open('w') as f:
        json.dump(results, f)
And this is the error:
ValueError Traceback (most recent call last)
in
in evaluate_dataset(config)
24
25 start = time.time()
---> 26 y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=1)
27 end = time.time()
28 results['cv_time'] = (end - start) / folds
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
399 prediction_blocks = parallel(delayed(_fit_and_predict)(
400 clone(estimator), X, y, train, test, verbose, fit_params, method)
--> 401 for train, test in cv_iter)
402
403 # Concatenate the predictions
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in (.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
472 estimator.fit(X_train, **fit_params)
473 else:
--> 474 estimator.fit(X_train, y_train, **fit_params)
475 func = getattr(estimator, method)
476 predictions = func(X_test)
~/Google Drive (gdk244#nyu.edu)/MoMa Lab /CRF/code_section_identification/crf_models/CRFModel.py in fit(self, X, y, **fit_params)
74 verbose=self.verbose
75 )
---> 76 self._ssvm.fit(X, y)
77
78 return self
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in fit(self, X, Y, constraints, initialize)
295 self._frank_wolfe_batch(X, Y)
296 else:
--> 297 self._frank_wolfe_bc(X, Y)
298 except KeyboardInterrupt:
299 pass
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in _frank_wolfe_bc(self, X, Y)
220 i = perm[j]
221 x, y = X[i], Y[i]
--> 222 y_hat, delta_joint_feature, slack, loss = find_constraint(self.model, x, y, w)
223 # ws and ls
224 ws = delta_joint_feature * self.C
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/utils/inference.py in find_constraint(model, x, y, w, y_hat, relaxed, compute_difference)
63
64 if y_hat is None:
---> 65 y_hat = model.loss_augmented_inference(x, y, w, relaxed=relaxed)
66 joint_feature = model.joint_feature
67 if getattr(model, 'rescale_C', False):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/models/crf.py in loss_augmented_inference(self, x, y, w, relaxed, return_energy)
104 pairwise_potentials = self._get_pairwise_potentials(x, w)
105 edges = self._get_edges(x)
--> 106 loss_augment_unaries(unary_potentials, np.asarray(y), self.class_weight)
107
108 return inference_dispatch(unary_potentials, pairwise_potentials, edges,
utils.pyx in utils.loss_augment_unaries (src/utils.c:5132)()
ValueError: Buffer dtype mismatch, expected 'double' but got Python object
I'm trying to use custom cross-validation sets for a very specific dataset with scikit-optimize's BayesSearchCV. I've been able to replicate the error with scikit-learn's GridSearchCV.
Straight from the documentation:
cv : int, cross-validation generator or an iterable, optional
    Determines the cross-validation splitting strategy. Possible inputs for cv are:
    - None, to use the default 3-fold cross validation,
    - integer, to specify the number of folds in a (Stratified)KFold,
    - An object to be used as a cross-validation generator,
    - An iterable yielding train, test splits.
    For integer/None inputs, if the estimator is a classifier and y is either binary or multiclass, StratifiedKFold is used. In all other cases, KFold is used.
    Refer to the User Guide for the various cross-validation strategies that can be used here.
I can't use cv=10 in my specific dataset. This is only to illustrate the error.
I would like to use a list of lists for the cross-validation training-testing splits as it says in the documentation. How can I format my cross-validation lists correctly?
# Generate data
def iris_data(noise=None, palette="hls", desat=1):
    # Iris dataset
    X = pd.DataFrame(load_iris().data,
                     index=[*map(lambda x: f"iris_{x}", range(150))],
                     columns=[*map(lambda x: x.split(" (cm)")[0].replace(" ", "_"), load_iris().feature_names)])
    y = pd.Series(load_iris().target,
                  index=X.index,
                  name="Species")
    cmap = map_colors(y, mode=1, palette=palette, desat=desat)  # y.map(lambda x:{0:"red",1:"green",2:"blue"}[x])
    if noise is not None:
        X_noise = pd.DataFrame(
            np.random.RandomState(0).normal(size=(X.shape[0], noise)),
            index=X_iris.index,
            columns=[*map(lambda x: f"noise_{x}", range(noise))]
        )
        X = pd.concat([X, X_noise], axis=1)
    return (X, y, cmap)

X, y, c = iris_data(noise=50)

# Get cross-validations
cv = list()
for i in range(10):
    idx_tr = np.random.choice(np.arange(X.shape[0]), size=100, replace=False)
    idx_te = set(range(X.shape[0])) - set(idx_tr)
    tr_te_splits = [idx_tr.tolist(), list(idx_te)]
    cv.append(tr_te_splits)

# Get hyperparameter searchspace
search_spaces = {
    "n_estimators": [1, 10, 50],
    "criterion": ["gini", "entropy"],
    "max_features": ["sqrt", "log2", None],
    "min_samples_leaf": [1, 2, 3, 5, 8, 13],
}
opt = GridSearchCV(RandomForestClassifier(random_state=0), search_spaces, scoring="accuracy", n_jobs=1, cv=cv)
opt.fit(X, y)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
<ipython-input-26-d1117d10dfa6> in <module>()
59 }
60 opt = GridSearchCV(RandomForestClassifier(random_state=0), search_spaces, scoring="accuracy", n_jobs=1, cv=cv)
---> 61 opt.fit(X,y)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
446 start_time = time.time()
447
--> 448 X_train, y_train = _safe_split(estimator, X, y, train)
449 X_test, y_test = _safe_split(estimator, X, y, test, train)
450
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _safe_split(estimator, X, y, indices, train_indices)
198 X_subset = X[np.ix_(indices, train_indices)]
199 else:
--> 200 X_subset = safe_indexing(X, indices)
201
202 if y is not None:
~/anaconda/envs/python3/lib/python3.6/site-packages/sklearn/utils/__init__.py in safe_indexing(X, indices)
144 if hasattr(X, "iloc"):
145 # Work-around for indexing with read-only indices in pandas
--> 146 indices = indices if indices.flags.writeable else indices.copy()
147 # Pandas Dataframes and Series
148 try:
AttributeError: 'list' object has no attribute 'flags'
Since the input objects X and y are pandas objects, I believe they require named indices. If I convert them to NumPy via the .values method, then it works. You just need to make sure the orders are correct if you do it this way.
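A minimal sketch of that workaround, reusing X, y, and search_spaces from the question (the index lists are also converted to integer arrays, which scikit-learn handles natively):
import numpy as np
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV

# train/test index pairs as plain integer arrays
cv = []
for i in range(10):
    idx_tr = np.random.choice(np.arange(X.shape[0]), size=100, replace=False)
    idx_te = np.array(sorted(set(range(X.shape[0])) - set(idx_tr)))
    cv.append((idx_tr, idx_te))

opt = GridSearchCV(RandomForestClassifier(random_state=0), search_spaces,
                   scoring="accuracy", n_jobs=1, cv=cv)
# .values hands plain NumPy arrays to GridSearchCV, so positional indexing works
opt.fit(X.values, y.values)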