I am a newbie in programming and machine learning. I am doing an assignment on KNN with the Amazon fine food reviews dataset, but I am getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3, 99, 2))
cv_scores = []
for k in neighbors:
    knn = KNeighborsClassifier(n_neighbors=k, algorithm='brute')
    scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
    cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
import matplotlib.pyplot as plt
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error I am getting is below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504
A MemoryError usually means that you ran out of RAM, and given the size of your dataset, that is a plausible explanation.
To be sure, watch your RAM usage while the code is running.
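If adding RAM is not an option, the usual workaround is to shrink what has to be held in memory at once: cap the CountVectorizer vocabulary, tune k on a subsample of the data, and/or use fewer CV folds. A minimal sketch reusing the variable names from your code; max_features=20000, the 50000-row sample, and cv=3 are arbitrary choices to adjust to your machine:
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score

# Cap the vocabulary so the document-term matrix stays manageable.
vectorizer = CountVectorizer(max_features=20000)

# Tune k on a subsample of the training documents first.
n_sample = 50000
bow_sample = vectorizer.fit_transform(X_with_stop_train[:n_sample])
y_sample = y_train[:n_sample]

knn = KNeighborsClassifier(n_neighbors=3, algorithm='brute')
scores = cross_val_score(knn, bow_sample, y_sample, cv=3, scoring='accuracy')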
Related
I'm trying to use GridSearchCV to search over specified parameters, scoring with negative log loss:
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = 'neg_log_loss', cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.
The same code works fine when scoring with accuracy. I found that for log loss we need to specify the labels, which works fine when using sklearn.metrics directly:
y_labels = np.unique(y_true)
y_labels
array([0., 1., 2.])
metrics.log_loss(y_true, y_pred, labels=y_labels)
So I tried:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
ValueError: not enough values to unpack (expected 2, got 1)
I've tried quite a few variations of the above and also creating my own scorer with:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
But everything I try comes down to one of the two errors above. Obviously I'm missing something, so any help is much appreciated.
Update:
Made a small mistake in the above: this is a three-class problem, not a binary problem as I first implied.
I've tried Ben's suggestion (thanks!):
LogLoss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(order_inner_x, y_inner, groups=names_inner)
I'm getting a different error, so hopefully that's one step closer; here's the full traceback:
ValueError Traceback (most recent call last)
<ipython-input-164-43d9f1633dc9> in <module>
2
3 grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
----> 4 grid.fit(order_inner_x, y_inner, groups=names_inner)
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
133 ' but need classifier with two'
134 ' classes for {} scoring'.format(
--> 135 y_pred.shape, self._score_func.__name__))
136 if sample_weight is not None:
137 return self._sign * self._score_func(y, y_pred,
ValueError: got predict_proba of shape (200, 3), but need classifier with two classes for log_loss scoring
You're most of the way there: you need to provide the labels to your metric. In this attempt:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
you pass the labels, but to the grid search's fit method rather than the scoring parameter itself.
make_scorer allows other keyword arguments to be passed to the metric function, so this should work:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
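If you'd rather not hard-code the class list, you can derive it from the full target vector before building the scorer; a small sketch, reusing the variable names above:
import numpy as np
from sklearn.metrics import log_loss, make_scorer

# Use every class present in the full target so each CV fold is scored consistently.
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True,
                      labels=np.unique(y_true))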
I am learning RandomizedSearchCV with a toy example. Suppose that I want to build a linear model y = ax + b. I wrote a custom sklearn estimator that looks like the following:
import numpy as np
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
class testEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=1, beta=0):
        self.alpha = alpha
        self.beta = beta

    def fit(self, X, y=None):
        mu = np.ones((len(X)))
        for ii in range(len(X)):
            mu[ii] = self.alpha*X[ii] + self.beta
        self.mu_ = mu
        return self

    def predict(self, X):
        try:
            getattr(self, "mu_")
        except:
            raise RuntimeError("You must train classifer before predicting data!")
        return self.mu_

    def score(self, X, y):
        print("y: ", y)
        print("mu: ", self.mu_)
        return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
Then, I need to test this.
from sklearn.model_selection import RandomizedSearchCV

# temp estimator
tempEs = testEstimator()
# temp params grid
params_grid_temp = {'alpha': [0,1,2,3,4,5,6], 'beta': [0,1,2,3,4]}
# test randomizedSearchCV
temp = RandomizedSearchCV(tempEs, params_grid_temp)
# define X,y
X = range(10)
y = np.dot(2, range(10)) + 4
# fit model
temp.fit(X, y)
However, I got this error:
ValueError Traceback (most recent call last)
<ipython-input-8-72a46fdf9098> in <module>
9 y = np.dot(2, range(10)) + 4
10 # fit model
---> 11 temp.fit(X,y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
88 *args, **kwargs)
89 else:
---> 90 score = scorer(estimator, *args, **kwargs)
91 scores[name] = score
92 return scores
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
370 def _passthrough_scorer(estimator, *args, **kwargs):
371 """Function that wraps estimator.score"""
--> 372 return estimator.score(*args, **kwargs)
373
374
<ipython-input-7-0c2138d9bf96> in score(self, X, y)
20 print("y: ", y)
21 print("mu: ", self.mu_)
---> 22 return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
ValueError: operands could not be broadcast together with shapes (2,) (8,)
I figured out that (2,) is the size of y and (8,) is the size of self.mu_. How does this happen? They are both supposed to be 10.
Okay, I found the problem. Your predict method is wrong: it must return predictions for the X it is given instead of returning mu_, which was computed on the training data.
def predict(self, X):
    return self.alpha*X + self.beta
That's it. You can also simplify your fit method in the same way, as sketched below.
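For completeness, a minimal sketch of the whole estimator with the fixed predict (the vectorized fit and the fitted-flag check are my own simplifications, not part of the original code):
import numpy as np
from sklearn.base import BaseEstimator, RegressorMixin

class testEstimator(BaseEstimator, RegressorMixin):
    def __init__(self, alpha=1, beta=0):
        self.alpha = alpha
        self.beta = beta

    def fit(self, X, y=None):
        # Nothing has to be learned for fixed alpha/beta; just mark the estimator as fitted.
        self.is_fitted_ = True
        return self

    def predict(self, X):
        if not getattr(self, "is_fitted_", False):
            raise RuntimeError("You must train the estimator before predicting data!")
        # Predict for the X that is passed in, not the training X.
        return self.alpha * np.asarray(X) + self.beta

    def score(self, X, y):
        # Mean squared error, as in the question (note: lower is better,
        # so a search would normally use the negated value).
        resid = np.asarray(y) - self.predict(X)
        return np.dot(resid, resid) / len(X)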
Python scikit-learn KNN Grid Search Cross Validation error
I am trying to recreate a KNN model for predicting car destinations.
https://github.com/carlosbkm/car-destination-prediction
The code fails at the grid search cross validation step here:
https://github.com/carlosbkm/car-destination-prediction/blob/master/k-nearest-model.ipynb
At first geodash was not working so I switched it to geodash2 and there was no problem.
When I try to fit the model I get:
TypeError: unsupported operand type(s) for /: 'str' and 'int'
When I try to fit X and y for the grid search cross validation I get an error.
The problem is coming from
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
    if score_func:
        gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
    else:
        gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
    gs.fit(X, y)
    print("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
    best = gs.best_estimator_
    return best
I cannot fit the model to X and y:
gs.fit(X, y)
I tried converting X and y to floats, but nothing changed.
When I execute this:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-124-34b56429c6b5> in <module>()
4 #knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
5 knn_parameters = {"n_neighbors": [1,2,5]}
----> 6 knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
<ipython-input-116-1a00f84f1047> in cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func)
6 else:
7 gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
----> 8 gs.fit(X, y)
9 print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
10 best = gs.best_estimator_
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
258 else:
259 fit_time = time.time() - start_time
--> 260 test_score = _score(estimator, X_test, y_test, scorer)
261 score_time = time.time() - start_time - fit_time
262 if return_train_score:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
286 score = scorer(estimator, X_test)
287 else:
--> 288 score = scorer(estimator, X_test, y_test)
289 if hasattr(score, 'item'):
290 try:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
89 super(_PredictScorer, self).__call__(estimator, X, y_true,
90 sample_weight=sample_weight)
---> 91 y_pred = estimator.predict(X)
92 if sample_weight is not None:
93 return self._sign * self._score_func(y_true, y_pred,
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
151
152 if weights is None:
--> 153 y_pred = np.mean(_y[neigh_ind], axis=1)
154 else:
155 y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2907
2908 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2909 out=out, **kwargs)
2910
2911
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 if isinstance(ret, mu.ndarray):
72 ret = um.true_divide(
---> 73 ret, rcount, out=ret, casting='unsafe', subok=False)
74 if is_float16_result and out is None:
75 ret = arr.dtype.type(ret)
TypeError: unsupported operand type(s) for /: 'str' and 'int'
I am trying to use logistic regression to classify the iris dataset, but I'm getting a ValueError when fitting the model. I can't figure out why it raises this error. Any help is appreciated.
from sklearn import datasets
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, GridSearchCV

iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=81, test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='roc_auc')
gridcv.fit(x_train, y_train)
Then I get a ValueError when fitting:
ValueError Traceback (most recent call last)
<ipython-input-108-f4ab6e5f5a79> in <module>()
----> 1 gridcv.fit(x_train, y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
179 y_type = type_of_target(y)
180 if y_type not in ("binary", "multilabel-indicator"):
--> 181 raise ValueError("{0} format is not supported".format(y_type))
182
183 if is_regressor(clf):
ValueError: multiclass format is not supported
You will need to use a scoring metric that supports multiclass targets. An example is 'recall_micro':
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state=81, test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='recall_micro')
gridcv.fit(x_train, y_train)
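Any scorer that handles multiclass targets works here; for example, macro-averaged F1 is another common choice (same setup as above, only the scoring string changes):
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='f1_macro')
gridcv.fit(x_train, y_train)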
This is my first multiclass classification. I have values Xtrn and Ytrn. Ytrn has 5 classes [0,1,2,3,4], but when I run it I get "multiclass format is not supported".
Here is an example of the values:
Xtrn Ytrn
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777 2
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366 3
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229 2
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164 1
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648 2
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562 2
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119 2
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116 4
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704 1
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090 2
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229 2
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645 1
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487 1
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705 2
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729 0
This is the code:
#import data
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn import metrics, cross_validation, grid_search, preprocessing
Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')
#Number of unique values Ytrn
n_classes_ = len(np.unique(Ytrn))
#learning model
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
xgb_params = [{'num_class': n_classes_}]
xgb_params = [
{
"n_estimators": range(50, 501, 50),
}
]
#cv
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)
This is the error:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-233-77d3e8d4b8c3> in <module>()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
827
828 """
--> 829 return self._fit(X, y, ParameterGrid(self.param_grid))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
571 self.fit_params, return_parameters=True,
572 error_score=self.error_score)
--> 573 for parameters in parameter_iterable
574 for train, test in cv)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1682
1683 else:
-> 1684 test_score = _score(estimator, X_test, y_test, scorer)
1685 if return_train_score:
1686 train_score = _score(estimator, X_train, y_train, scorer)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1739 score = scorer(estimator, X_test)
1740 else:
-> 1741 score = scorer(estimator, X_test, y_test)
1742 if hasattr(score, 'item'):
1743 try:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
169 y_type = type_of_target(y)
170 if y_type not in ("binary", "multilabel-indicator"):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported
I found the answer. scoring='roc_auc' is only for binary classification, so you need a different scorer (e.g. accuracy). The line xgb_params = [{'num_class': n_classes_}] also needs to be deleted, as sketched below.
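Putting the two fixes together, the tuning part would look something like this (same variables and the same old grid_search/cross_validation modules as in the question; 'accuracy' is just one multiclass-capable choice):
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
xgb_params = [
    {
        "n_estimators": range(50, 501, 50),
    }
]

# Stratified shuffle split on the training labels, as in the question.
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='accuracy', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)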