I'm trying to use TransformedTargetRegressor in a model pipeline and run a GridSearchCV on top of it.
Here is a minimal working example:
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor
X, y = make_regression()

model_pipe = Pipeline([
    ('model', TransformedTargetRegressor(RandomForestRegressor()))
])

params = {'model__n_estimators': [1, 10, 50]}
model = GridSearchCV(model_pipe, param_grid=params)
model.fit(X, y)
This model results in the following error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-48-828bdf0e7ede> in <module>
17 model = GridSearchCV(model_pipe, param_grid= params)
18
---> 19 model.fit(X,y)
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
686 return results
687
--> 688 self._run_search(evaluate_candidates)
689
690 # For multi-metric evaluation, store the best_index_, best_params_ and
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1147 def _run_search(self, evaluate_candidates):
1148 """Search all candidates in param_grid"""
-> 1149 evaluate_candidates(ParameterGrid(self.param_grid))
1150
1151
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
665 for parameters, (train, test)
666 in product(candidate_params,
--> 667 cv.split(X, y, groups)))
668
669 if len(out) < 1:
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1001 # remaining jobs.
1002 self._iterating = False
-> 1003 if self.dispatch_one_batch(iterator):
1004 self._iterating = self._original_iterator is not None
1005
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
832 return False
833 else:
--> 834 self._dispatch(tasks)
835 return True
836
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in _dispatch(self, batch)
751 with self._lock:
752 job_idx = len(self._jobs)
--> 753 job = self._backend.apply_async(batch, callback=cb)
754 # A job can complete so quickly than its callback is
755 # called before we get here, causing self._jobs to
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
199 def apply_async(self, func, callback=None):
200 """Schedule a func to be run"""
--> 201 result = ImmediateResult(func)
202 if callback:
203 callback(result)
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
580 # Don't delay the application, to avoid keeping the input
581 # arguments in memory
--> 582 self.results = batch()
583
584 def get(self):
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
501 train_scores = {}
502 if parameters is not None:
--> 503 estimator.set_params(**parameters)
504
505 start_time = time.time()
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
162 self
163 """
--> 164 self._set_params('steps', **kwargs)
165 return self
166
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
48 self._replace_estimator(attr, name, params.pop(name))
49 # 3. Step parameters and other initialisation arguments
---> 50 super().set_params(**params)
51 return self
52
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/base.py in set_params(self, **params)
231
232 for key, sub_params in nested_params.items():
--> 233 valid_params[key].set_params(**sub_params)
234
235 return self
~/miniconda3/envs/gymbo/lib/python3.6/site-packages/sklearn/base.py in set_params(self, **params)
222 'Check the list of available parameters '
223 'with `estimator.get_params().keys()`.' %
--> 224 (key, self))
225
226 if delim:
ValueError: Invalid parameter n_estimators for estimator TransformedTargetRegressor(check_inverse=True, func=None, inverse_func=None,
regressor=RandomForestRegressor(bootstrap=True,
criterion='mse',
max_depth=None,
max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1,
min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators='warn',
n_jobs=None,
oob_score=False,
random_state=None,
verbose=0,
warm_start=False),
transformer=None). Check the list of available parameters with `estimator.get_params().keys()`.
This model runs when I remove TransformedTargetRegressor from the pipeline and just pass the random forest. Why is this? How can I use TransformedTargetRegressor in a pipeline as I have shown above?
The RandomForestRegressor is stored as the regressor param inside the TransformedTargetRegressor, so the parameter path needs one more level. Hence, the right way to define the params for GridSearchCV is
params = {'model__regressor__n_estimators': [1, 10, 50]}
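If you're ever unsure which prefixes are valid, the error message itself points the way: `estimator.get_params().keys()` lists every parameter name the search will accept. A quick check on the pipeline above:
for name in sorted(model_pipe.get_params().keys()):
    # the nested key 'model__regressor__n_estimators' appears in this list
    print(name)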
It seems some people are having issues with zeros in y. Consider the following version using np.log1p and np.expm1. See another worked example here.
import numpy as np

X, y = make_regression()

model_pipe = Pipeline([
    ('model', TransformedTargetRegressor(regressor=RandomForestRegressor(),
                                         func=np.log1p,
                                         inverse_func=np.expm1))
])

params = {'model__regressor__n_estimators': [1, 10, 50]}
model = GridSearchCV(model_pipe, param_grid=params)
model.fit(X, y)
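After fitting, the winning setting can be read off the search object in the usual way, for example:
print(model.best_params_)   # e.g. {'model__regressor__n_estimators': 50}
print(model.best_score_)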
I've found the answer. The TransformedTargetRegressor needs to be wrapped around the grid search estimator, like so:
import numpy as np
from sklearn.datasets import make_regression
from sklearn.ensemble import RandomForestRegressor
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.compose import TransformedTargetRegressor

X, y = make_regression()

model_pipe = Pipeline([
    ('model', RandomForestRegressor())
])

params = {'model__n_estimators': [1, 10, 50]}
model = TransformedTargetRegressor(GridSearchCV(model_pipe, param_grid=params),
                                   func=np.log, inverse_func=np.exp)
model.fit(X, y)
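With this arrangement the fitted grid search ends up on the wrapper's regressor_ attribute, so the selected parameters can be retrieved like this:
# TransformedTargetRegressor stores the fitted inner estimator on `regressor_`,
# which here is the fitted GridSearchCV.
print(model.regressor_.best_params_)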
Related
I'm trying to utilize make_pipeline() from scikit-learn along with GridSearchCV(). The Pipeline is simple and only includes two steps, a StandardScaler() and an MLPRegressor(). The GridSearchCV() is also pretty simple, with the slight wrinkle that I'm using TimeSeriesSplit() for cross-validation.
The error I'm getting is as follows:
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()), ('mlpregressor', MLPRegressor())]). Check the list of available parameters with estimator.get_params().keys().
Can someone help me understand how I can rectify this problem so I can use the make_pipeline() framework with both GridSearchCV() and MLPRegressor()?
from sklearn.neural_network import MLPRegressor
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
from sklearn.pipeline import make_pipeline
import numpy as np

tscv = TimeSeriesSplit(n_splits=5)

pipe = make_pipeline(StandardScaler(), MLPRegressor())

param_grid = {'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
              'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
              'MLPRegressor__solver': ['adam', 'sgd']}

grid = GridSearchCV(pipe, param_grid=param_grid, cv=tscv)

features = np.random.random([1000, 10])
target = np.random.normal(0, 10, 1000)

grid.fit(features, target)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-7233f9f2005e> in <module>
----> 1 grid.fit(features, target)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
584 cloned_parameters[k] = clone(v, safe=False)
585
--> 586 estimator = estimator.set_params(**cloned_parameters)
587
588 start_time = time.time()
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
148 self
149 """
--> 150 self._set_params('steps', **kwargs)
151 return self
152
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
Solution
Yes. Make the pipeline first, then treat the pipeline as your model and pass it to GridSearchCV.
Your problem is in the param_grid keys: the step prefix is mislabeled.
Replace MLPRegressor__ with mlpregressor__.
The Fix:
make_pipeline() names each step after the lowercased class name, so the MLPRegressor step is registered as mlpregressor. The prefix MLPRegressor__ in param_grid therefore doesn't match any step; changing it to mlpregressor__ fixes the problem.
You may run and check it in this colab notebook.
# INCORRECT
param_grid = {
'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'MLPRegressor__solver': ['adam', 'sgd'],
}
# CORRECTED
param_grid = {
'mlpregressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'mlpregressor__solver': ['adam', 'sgd'],
}
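If you want to double-check the step names that make_pipeline() generated before building the param_grid, you can print them:
# Step names are the lowercased class names,
# e.g. dict_keys(['standardscaler', 'mlpregressor'])
print(pipe.named_steps.keys())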
Note
The key to understanding what was wrong here was to observe the last two lines of the error stack.
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
I'm trying to use GridSearchCV to search over specified parameters, scoring with negative log loss:
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = 'neg_log_loss', cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.
The same code but scoring with accuracy works fine. I found that for log loss we need to specify the labels, which works fine when using sklearn.metrics:
y_labels = np.unique(y_true)
y_labels
array([0., 1., 2.])
metrics.log_loss(y_true, y_pred, labels = y_labels )
So I tried:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
ValueError: not enough values to unpack (expected 2, got 1)
I've tried quite a few variations of the above and also creating my own scorer with:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
But everything I try comes down to one of the two above errors. Obviously I'm missing something, so any help much appreciated.
Update:
Made a small mistake in the above: this is a three-class problem, not a binary problem as I first implied.
I've tried Ben's suggestion (thanks!):
LogLoss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(order_inner_x, y_inner, groups=names_inner)
I'm getting a different error, so hopefully one step closer, here's the full traceback:
ValueError Traceback (most recent call last)
<ipython-input-164-43d9f1633dc9> in <module>
2
3 grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
----> 4 grid.fit(order_inner_x, y_inner, groups=names_inner)
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
133 ' but need classifier with two'
134 ' classes for {} scoring'.format(
--> 135 y_pred.shape, self._score_func.__name__))
136 if sample_weight is not None:
137 return self._sign * self._score_func(y, y_pred,
ValueError: got predict_proba of shape (200, 3), but need classifier with two classes for log_loss scoring
You're most of the way there: you need to provide the labels to your metric. In this attempt:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
you pass the labels, but to the grid search's fit method rather than the scoring parameter itself.
make_scorer allows other keyword arguments to be passed to the metric function, so this should work:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
I'm trying to build a pipeline with my own functions. To do so I inherited BaseEstimator and TransformerMixin from sklearn base and defined my own transform methods.
When I do pipeline.fit(X,y), it works fine.
The problem is when I try to create a GridSearchCV object with the pipeline. I get the following error:
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36).
730 is just the number of rows of matrix X divided by cv = 2, the number of folds I chose for the cross-validation in GridSearchCV.
I have no idea how to debug that. I've tried some prints in the middle of my functions, and the result is pretty weird.
I'm attaching the functions I created as well as the pipeline. I'd be really glad if someone could help.
Here are the functions I created for the Pipeline:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.impute import SimpleImputer

class MissingData(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None, strategies=("most_frequent", "mean")):
        print('Started MissingData')
        X_ = X.copy()

        # Categorical variables handling
        categorical_variables = list(X_.select_dtypes(include=['category', 'object']))
        imp_category = SimpleImputer(strategy=strategies[0])
        X_[categorical_variables] = pd.DataFrame(imp_category.fit_transform(X_[categorical_variables]))

        # Numeric variables handling
        numerical_variables = list(set(X_.columns) - set(categorical_variables))
        imp_numerical = SimpleImputer(strategy=strategies[1])
        X_[numerical_variables] = pd.DataFrame(imp_numerical.fit_transform(X_[numerical_variables]))

        print('Finished MissingData')
        print('Inf: ', X_.isnull().sum().sum())
        return X_

class OHEncode(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def encode_and_drop_original_and_first_dummy(self, df, feature_to_encode):
        # drop_first=True takes care of the dummy variable trap
        dummies = pd.get_dummies(df[feature_to_encode], prefix=feature_to_encode, drop_first=True)
        res = pd.concat([df, dummies], axis=1)
        res = res.drop([feature_to_encode], axis=1)
        return res

    def transform(self, X, y=None, categorical_variables=None):
        X_ = X.copy()
        if categorical_variables is None:
            categorical_variables = list(X_.select_dtypes(include=['category', 'object']))
        print('Started Encoding')
        # Update X with the one-hot encoded version of all features in categorical_variables
        for feature_to_encode in categorical_variables:
            X_ = self.encode_and_drop_original_and_first_dummy(X_, feature_to_encode)
        print('Finished Encoding')
        print('Inf: ', X_.isnull().sum().sum())
        return X_
Here is the Pipeline with the GridSearchCV:
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import GridSearchCV

pca = PCA(n_components=10)
pipeline = Pipeline([('MissingData', MissingData()), ('OHEncode', OHEncode()),
                     ('scaler', StandardScaler()), ('pca', pca), ('rf', LinearRegression())])

parameters = {'pca__n_components': [5, 15, 30, 45, 64]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=2)
grid.fit(X, y)
And finally here is the full output including my prints and the error:
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
Started MissingData
Finished MissingData
Inf: 0
Started Encoding
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:765: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:706: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs)
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
FitFailedWarning)
Finished Encoding
Inf: 0
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-67-f78b56dad89d> in <module>
15
16 #pipeline.set_params(rf__n_estimators = 50)
---> 17 grid.fit(X, y)
18
19 #rf_val_predictions = pipeline.predict(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
710 return results
711
--> 712 self._run_search(evaluate_candidates)
713
714 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1151 def _run_search(self, evaluate_candidates):
1152 """Search all candidates in param_grid"""
-> 1153 evaluate_candidates(ParameterGrid(self.param_grid))
1154
1155
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
689 for parameters, (train, test)
690 in product(candidate_params,
--> 691 cv.split(X, y, groups)))
692
693 if len(out) < 1:
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
611 Xt = X
612 for _, name, transform in self._iter(with_final=False):
--> 613 Xt = transform.transform(Xt)
614 score_params = {}
615 if sample_weight is not None:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_data.py in transform(self, X, copy)
804 else:
805 if self.with_mean:
--> 806 X -= self.mean_
807 if self.with_std:
808 X /= self.scale_
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36)
The first point: I would suggest you use the OneHotEncoder (OHE) class from sklearn. Then, define an OHE object in the constructor of OHEncode and fit it with all the categorical values you have (to make them "seen" at each GridSearch iteration). Then, in the transform function of OHEncode, apply transform using that OHE object.
DON'T fit the OHE object inside the fit function, because then you will have the same error; at each GridSearch iteration, the fit and transform functions are applied.
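Below is a minimal sketch of that idea using sklearn's OneHotEncoder. The constructor argument categorical_values (a DataFrame holding the categorical columns of the full X) is a name introduced here purely for illustration:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.preprocessing import OneHotEncoder

class OHEncode(BaseEstimator, TransformerMixin):
    def __init__(self, categorical_values):
        # Fit once, up front, on every category so each GridSearch fold
        # produces the same dummy columns.
        self.categorical_values = categorical_values
        self.ohe = OneHotEncoder(handle_unknown='ignore', sparse=False)
        self.ohe.fit(categorical_values)

    def fit(self, X, y=None):
        return self  # nothing to learn per fold

    def transform(self, X, y=None):
        X_ = X.copy()
        cat_cols = list(self.categorical_values.columns)
        encoded = pd.DataFrame(self.ohe.transform(X_[cat_cols]), index=X_.index)
        return pd.concat([X_.drop(columns=cat_cols), encoded], axis=1)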
I am using Python 3.7 in a Jupyter Notebook. I am creating classification models based on Jason Brownlee's ebook Machine Learning Mastery with Python. The code is essentially cut and pasted from the ebook into the Jupyter Notebook. The models work fine when I split the data, but when I use k-fold cross-validation it generates a FutureWarning message and then an error. I'll cut and paste the code and messages below. I entered error_score = np.nan and it didn't fix the problem, but I don't know where that code should be entered. I would appreciate any advice, but keep in mind that I am a novice. Thanks.
# Logistic Regression Classification
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression

df = pd.read_csv('Diabetes_Classification.csv')
array = df.values
X = array[:, 0:8]
Y = array[:, 8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression(solver='liblinear')
error_score = np.nan
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
/Users/roberthoyt/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:530: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.
  FutureWarning)
ValueError Traceback (most recent call last)
<ipython-input-105-010e5612fd63> in <module>
11 model = LogisticRegression(solver='liblinear')
12 error_score = np.nan
---> 13 results = cross_val_score(model, X, Y, cv=kfold)
14 print(results.mean())
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in
cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch,
error_score)
389 fit_params=fit_params,
390 pre_dispatch=pre_dispatch,
--> 391 error_score=error_score)
392 return cv_results['test_score']
393
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in
cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch,
return_train_score, return_estimator, error_score)
230 return_times=True, return_estimator=return_estimator,
231 error_score=error_score)
--> 232 for train, test in cv.split(X, y, groups))
233
234 zipped_scores = list(zip(*scores))
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self,
iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self,
func,
callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self,
batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _
fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params,
return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator,
error_score)
514 estimator.fit(X_train, **fit_params)
515 else:
--> 516 estimator.fit(X_train, y_train, **fit_params)
517
518 except Exception as e:
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y,
sample_weight)
1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
1532 accept_large_sparse=solver != 'liblinear')
-> 1533 check_classification_targets(y)
1534 self.classes_ = np.unique(y)
1535 n_samples, n_features = X.shape
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/multiclass.py in
check_classification_targets(y)
167 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
168 'multilabel-indicator', 'multilabel-sequences']:
--> 169 raise ValueError("Unknown label type: %r" % y_type)
170
171
ValueError: Unknown label type: 'continuous'
The problem is that your targets are continuous and you're doing a classification task. Make sure the column you're using as the target is categorical; you may have to convert it to integer. All of this is reported in the traceback:
check_classification_targets(y)
167 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
168 'multilabel-indicator', 'multilabel-sequences']:
--> 169 raise ValueError("Unknown label type: %r" % y_type)
Your target is not among the accepted target types; it is continuous:
ValueError: Unknown label type: 'continuous'
Check if your target is an integer with df.dtypes and change it to integer if it isn't.
Y = array[:,8].astype(int)
That is assuming you haven't made the mistake of running a classification task on genuinely continuous values. You can also check whether all values are 0s and 1s:
np.unique(array[:, 8])
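Putting the two checks together, here is a small sketch (assuming the column really does hold whole-number class labels):
import numpy as np

labels = np.unique(array[:, 8])
print(labels)  # should be a small, discrete set, e.g. [0. 1.]

# Only cast to int if the values are whole numbers, i.e. genuine class labels
if np.array_equal(labels, labels.astype(int)):
    Y = array[:, 8].astype(int)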
I am trying to carry out a grid search to optimise my parameters. My code is:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC

parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
               'C': [1, 10, 100, 1000]},
              {'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [1, 2, 3, 4]}]

clf = GridSearchCV(SVC(C=1), parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train)
My X_train, y_train are floating point numbers which are:
x_train = [[3.30049159], [2.25226244], [1.44078451], ..., [5.63927925], [5.431458], [4.35674369]]
y_train = [[0.2681013], [0.03454225], [0.02062136], ..., [0.21827915], [0.28768273], [0.27969417]]
I believe the error may be that I am using floating point numbers, and maybe only integers can be passed into the classifier. If this is the case, how would it be resolved? My full traceback error message is:
ValueError Traceback (most recent call last)
<ipython-input-51-fb016a0a90cc> in <module>()
11
12 clf = GridSearchCV (SVC(C=1), parameters, cv=5, scoring='f1_macro')
---> 13 clf.fit(X_train, y_train)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/grid_search.py in fit(self, X, y)
836
837 """
--> 838 return self._fit(X, y, ParameterGrid(self.param_grid))
839
840
~/anaconda3_501/lib/python3.6/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
572 self.fit_params, return_parameters=True,
573 error_score=self.error_score)
--> 574 for parameters in parameter_iterable
575 for train, test in cv)
576
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1673 estimator.fit(X_train, **fit_params)
1674 else:
-> 1675 estimator.fit(X_train, y_train, **fit_params)
1676
1677 except Exception as e:
~/anaconda3_501/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
148
149 X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 150 y = self._validate_targets(y)
151
152 sample_weight = np.asarray([]
~/anaconda3_501/lib/python3.6/site-packages/sklearn/svm/base.py in _validate_targets(self, y)
498 def _validate_targets(self, y):
499 y_ = column_or_1d(y, warn=True)
--> 500 check_classification_targets(y)
501 cls, y = np.unique(y_, return_inverse=True)
502 self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
--> 172 raise ValueError("Unknown label type: %r" % y_type)
173
174
ValueError: Unknown label type: 'continuous'
Help with this would be appreciated.
You are using a classifier; you can only classify binary or categorical variables. If you want to use support vector machines but predict numeric values, you should use support vector regression (SVR).
Otherwise you will have to discretize your y-values into groups.
This is a regression problem, not a classification problem. What the model is trying to do is fit X into classes defined by Y, which is continuous; the SVC classifier cannot handle that. Update your code to use SVR:
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
X_train = [[3.30049159], [2.25226244], [1.44078451]]
#1. Y should be 1d array of dimensions (n_samples,)
y_train = [0.2681013, 0.03454225, 0.02062136]
#Grid Search
parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
'C': [1, 10, 100, 1000]},
{'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [1, 2, 3, 4]}]
#2. Type of regressor
reg = SVR(C=1)
#3. Regression cannot be evaluated with f1_macro, so updated to neg_mean_squared_error
clf = GridSearchCV(reg, parameters, cv=5, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)