I am learning RandomizedSearchCV with a toy example. Suppose that I want to build a linear model y = ax + b. I wrote a custom sklearn estimator that looks like the following:
import numpy as np
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
class testEstimator(BaseEstimator,RegressorMixin):
def __init__(self, alpha=1, beta=0):
self.alpha = alpha
self.beta = beta
def fit(self, X, y=None):
mu = np.ones((len(X)))
for ii in range(len(X)):
mu[ii] = self.alpha*X[ii] + self.beta
self.mu_ = mu
return self
def predict(self, X):
try:
getattr(self,"mu_")
except:
raise RuntimeError("You must train classifer before predicting data!")
return self.mu_
def score(self, X, y):
print("y: ", y)
print("mu: ", self.mu_)
return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
Then, I need to test this.
# temp estimator
tempEs = testEstimator()
# temp params grid
params_grid_temp = {'alpha': [0,1,2,3,4,5,6], 'beta': [0,1,2,3,4]}
# test randomizedSearchCV
temp = RandomizedSearchCV(tempEs, params_grid_temp)
# define X,y
X = range(10)
y = np.dot(2, range(10)) + 4
# fit model
temp.fit(X,y)
However, I got the error
ValueError Traceback (most recent call last)
<ipython-input-8-72a46fdf9098> in <module>
9 y = np.dot(2, range(10)) + 4
10 # fit model
---> 11 temp.fit(X,y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
88 *args, **kwargs)
89 else:
---> 90 score = scorer(estimator, *args, **kwargs)
91 scores[name] = score
92 return scores
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
370 def _passthrough_scorer(estimator, *args, **kwargs):
371 """Function that wraps estimator.score"""
--> 372 return estimator.score(*args, **kwargs)
373
374
<ipython-input-7-0c2138d9bf96> in score(self, X, y)
20 print("y: ", y)
21 print("mu: ", self.mu_)
---> 22 return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
ValueError: operands could not be broadcast together with shapes (2,) (8,)
I figured out (2,) means the size of y and (8,) means the size of self.mu_. How does this happen? They are supposed to be 10.
Okay, I found the problem. Your predict method is totally wrong. It must return predicted value instead of returning mu_.
def predict(self, X):
return self.alpha*X + self.beta
That's it. You can also optimize code in your fit method
Related
I'm trying to utilize make_pipeline() from scikit-learn along with GridSearchCV(). The Pipeline is simple and only includes two steps, a StandardScaler() and an MLPRegressor(). The GridSearchCV()is also pretty simple with the slight wrinkle that I'm using TimeSeriesSplit() for cross-validation.
The error I'm getting is as follows:
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),('mlpregressor', MLPRegressor())]). Check the list of available parameters with estimator.get_params().keys().
Can someone help me understand how I can rectify this problem so I can use the make_pipeline() framework with both GridSearchCV() and MLPRegressor() .
from sklearn.neural_network import MLPRegressor
...: from sklearn.preprocessing import StandardScaler
...: from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
...: from sklearn.pipeline import make_pipeline
...: import numpy as np
In [2]: tscv = TimeSeriesSplit(n_splits = 5)
In [3]: pipe = make_pipeline(StandardScaler(), MLPRegressor())
In [4]: param_grid = {'MLPRegressor__hidden_layer_sizes': [(16,16,), (64,64,), (
...: 128,128,)], 'MLPRegressor__activation': ['identity', 'logistic', 'tanh',
...: 'relu'],'MLPRegressor__solver': ['adam','sgd']}
In [5]: grid = GridSearchCV(pipe, param_grid = param_grid, cv = tscv)
In [6]: features = np.random.random([1000,10])
In [7]: target = np.random.normal(0,10,1000)
In [8]: grid.fit(features, target)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-7233f9f2005e> in <module>
----> 1 grid.fit(features, target)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
584 cloned_parameters[k] = clone(v, safe=False)
585
--> 586 estimator = estimator.set_params(**cloned_parameters)
587
588 start_time = time.time()
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
148 self
149 """
--> 150 self._set_params('steps', **kwargs)
151 return self
152
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
Solution
Yes. Make the pipeline first. Then treat the pipeline as your model and pass it to GridSearchCV.
Your problem is in the following line (you had it mislabeled):
Replace MLPRegressor__ with mlpregressor__.
The Fix:
The pipeline named_step for MLPRegressor estimator was mislabeled as MLPRegressor__ in the param_grid.
Changing it to mlpregressor__ fixed the problem.
You may run and check it in this colab notebook.
# INCORRECT
param_grid = {
'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'MLPRegressor__solver': ['adam', 'sgd'],
}
# CORRECTED
param_grid = {
'mlpregressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'mlpregressor__solver': ['adam', 'sgd'],
}
Note
The key to understand what was wrong here, was to observe the last two lines of the error stack.
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
My Code:
from sklearn.model_selection import GridSearchCV
from gensim.sklearn_api import W2VTransformer
from sklearn.metrics import accuracy_score, make_scorer
s_obj = W2VTransformer()
params_grid = {
'size': [100,200,300],
'window':[10,15,20],
'min_count': [1,2,3,4,5,6],
'workers': [10,20],
'sg':[0,1],
'negative': [2,3,4,6,5],
'sample':[1e-5]
}
s_model = GridSearchCV(s_obj, params_grid, cv=3,
scoring=make_scorer(accuracy_score))
s_model.fit(sentences)
print(s_model.best_params_)
Error is : " TypeError: _score() missing 1 required positional argument: 'y_true' "
PS: I reached the point that the error is showing something about
y_true i.e needed a labelled data then I am not having labelled data,
working on unsupervised learning, so if I am correct then do we have
any other library to tune the unsupervised model?
full traceback
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-230-37cafb83162e> in <module>
14
15 s_model = GridSearchCV(s_obj,params_grid,cv=3,scoring=make_scorer(accuracy_score))
---> 16 s_model.fit(train)
17
18 print(s_model.best_params_)
~/anaconda3/lib/python3.8/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1186 def _run_search(self, evaluate_candidates):
1187 """Search all candidates in param_grid"""
-> 1188 evaluate_candidates(ParameterGrid(self.param_grid))
1189
1190
~/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/anaconda3/lib/python3.8/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/anaconda3/lib/python3.8/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~/anaconda3/lib/python3.8/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
603 scorer = _MultimetricScorer(**scorer)
604 if y_test is None:
--> 605 scores = scorer(estimator, X_test)
606 else:
607 scores = scorer(estimator, X_test, y_test)
~/anaconda3/lib/python3.8/site-packages/sklearn/metrics/_scorer.py in __call__(self, estimator, *args, **kwargs)
85 for name, scorer in self._scorers.items():
86 if isinstance(scorer, _BaseScorer):
---> 87 score = scorer._score(cached_call, estimator,
88 *args, **kwargs)
89 else:
TypeError: _score() missing 1 required positional argument: 'y_true'
Can anyone help me to solve this issue?
I'm trying to build a pipeline with my own functions. To do so I inherited BaseEstimator and TransformerMixin from sklearn base and defined my own transform methods.
When I do pipeline.fit(X,y), it works fine.
The problem is when I try to create a GridSearchCV object with the pipeline. I get the following error:
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36).
730 Is just the number of lines of matrix X divided by 'cv' = 2, the number of folds I choose for the cross-validation in the GridSearchCV.
I have no idea how to debug that. I've tried some prints in the middle of my functions, and the result is pretty weird.
I'm attaching the functions I created as well as the pipeline. I'd be really glad if someone could help.
Here are the functions I created for the Pipeline:
from sklearn.base import BaseEstimator, TransformerMixin
class MissingData(BaseEstimator, TransformerMixin):
def fit( self, X, y = None ):
return self
def transform(self, X , y = None, strategies = ( "most_frequent", "mean") ):
print('Started MissingData')
X_ = X.copy()
#Categorical Variables handling
categorical_variables = list(X_.select_dtypes(include=['category','object']))
imp_category = SimpleImputer(strategy = strategies[0])
X_[categorical_variables] = pd.DataFrame(imp_category.fit_transform(X_[categorical_variables]))
#Numeric varialbes handling
numerical_variables = list(set(X_.columns) - set(categorical_variables))
imp_numerical = SimpleImputer(strategy = strategies[1])
X_[numerical_variables] = pd.DataFrame(imp_numerical.fit_transform(X_[numerical_variables]))
print('Finished MissingData')
print('Inf: ',X_.isnull().sum().sum())
return X_
class OHEncode(BaseEstimator, TransformerMixin):
def fit(self, X, y = None ):
return self
def encode_and_drop_original_and_first_dummy(self,df, feature_to_encode):
dummies = pd.get_dummies(df[feature_to_encode] , prefix = feature_to_encode, drop_first=True) #Drop first equals true will take care of the dummies variables trap
res = pd.concat([df, dummies], axis=1)
res = res.drop([feature_to_encode], axis=1)
return(res)
def transform(self, X , y = None, categorical_variables = None ):
X_ = X.copy()
if categorical_variables == None:
categorical_variables = list(X_.select_dtypes(include=['category','object']))
print('Started Encoding')
#Let's update the matrix X with the one hot ecoded version of all features in categorical_variables
for feature_to_encode in categorical_variables:
X_ = self.encode_and_drop_original_and_first_dummy(X_ , feature_to_encode)
print('Finished Encoding')
print('Inf: ',X_.isnull().sum().sum())
return X_
Here is the Pipeline with the GridSearchCV:
pca = PCA(n_components=10)
pipeline = Pipeline([('MissingData', MissingData()), ('OHEncode', OHEncode()) ,
('scaler', StandardScaler()) , ('pca', pca), ('rf', LinearRegression())])
parameters = {'pca__n_components': [5, 15, 30, 45, 64]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv = 2)
grid.fit(X, y)
And finally here is the full output including my prints and the error:
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
Started MissingData
Finished MissingData
Inf: 0
Started Encoding
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:765: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:706: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs)
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
FitFailedWarning)
Finished Encoding
Inf: 0
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-67-f78b56dad89d> in <module>
15
16 #pipeline.set_params(rf__n_estimators = 50)
---> 17 grid.fit(X, y)
18
19 #rf_val_predictions = pipeline.predict(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
710 return results
711
--> 712 self._run_search(evaluate_candidates)
713
714 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1151 def _run_search(self, evaluate_candidates):
1152 """Search all candidates in param_grid"""
-> 1153 evaluate_candidates(ParameterGrid(self.param_grid))
1154
1155
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
689 for parameters, (train, test)
690 in product(candidate_params,
--> 691 cv.split(X, y, groups)))
692
693 if len(out) < 1:
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
611 Xt = X
612 for _, name, transform in self._iter(with_final=False):
--> 613 Xt = transform.transform(Xt)
614 score_params = {}
615 if sample_weight is not None:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_data.py in transform(self, X, copy)
804 else:
805 if self.with_mean:
--> 806 X -= self.mean_
807 if self.with_std:
808 X /= self.scale_
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36)
The first point, I would should you to use OneHotEncoder (OHE) class from sklearn. Then, define in the constructor of OHEncode an object of OHE and fit it with the all categorical values you have (to make them "seen" at each GridSearch iteration). Then in transform fuction of OHEncode, apply transform using the object of OHE.
DON'T fit the OHE object Inside the fit function because then you will have the same error; at each GridSearch iteration, the fit and transform functions are applied.
Python sci-kit learn KNN Grid Search Cross Validation error
I am trying to recreated KNN model for prediction of car destination.
https://github.com/carlosbkm/car-destination-prediction
The code is not working at Grid search cross validation here:
https://github.com/carlosbkm/car-destination-prediction/blob/master/k-nearest-model.ipynb
At first geodash was not working so I switched it to geodash2 and there was no problem.
When I try to fit the model I get.
TypeError: unsupported operand type(s) for /: 'str' and 'int'
When I try to fit X and y for Grid Search Cross Validation I get an error.
The problem is coming from
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
if score_func:
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
else:
gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X, y)
print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
best = gs.best_estimator_
return best
I can not fit the model to X and y:
gs.fit(X, y)
I tried to make X and y into floats but nothing changed
When I execute this:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-124-34b56429c6b5> in <module>()
4 #knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
5 knn_parameters = {"n_neighbors": [1,2,5]}
----> 6 knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
<ipython-input-116-1a00f84f1047> in cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func)
6 else:
7 gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
----> 8 gs.fit(X, y)
9 print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
10 best = gs.best_estimator_
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
258 else:
259 fit_time = time.time() - start_time
--> 260 test_score = _score(estimator, X_test, y_test, scorer)
261 score_time = time.time() - start_time - fit_time
262 if return_train_score:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
286 score = scorer(estimator, X_test)
287 else:
--> 288 score = scorer(estimator, X_test, y_test)
289 if hasattr(score, 'item'):
290 try:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
89 super(_PredictScorer, self).__call__(estimator, X, y_true,
90 sample_weight=sample_weight)
---> 91 y_pred = estimator.predict(X)
92 if sample_weight is not None:
93 return self._sign * self._score_func(y_true, y_pred,
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
151
152 if weights is None:
--> 153 y_pred = np.mean(_y[neigh_ind], axis=1)
154 else:
155 y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2907
2908 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2909 out=out, **kwargs)
2910
2911
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 if isinstance(ret, mu.ndarray):
72 ret = um.true_divide(
---> 73 ret, rcount, out=ret, casting='unsafe', subok=False)
74 if is_float16_result and out is None:
75 ret = arr.dtype.type(ret)
TypeError: unsupported operand type(s) for /: 'str' and 'int'
I am newbie in programming and machine learning. I am doing an assignment on KNN and amazon fine food reviews but getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3,99,2))
cv_scores = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error i am getting is as below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504
A MemoryError usually means that you ran out of RAM. And seeing the size of your dataset, I think it might be a plausible explanation.
To be sure, just look at your RAM usage while executing your code.