MemoryError cross_val_score Jupyter Notebook - python

I am newbie in programming and machine learning. I am doing an assignment on KNN and amazon fine food reviews but getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3,99,2))
cv_scores = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error i am getting is as below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504

A MemoryError usually means that you ran out of RAM. And seeing the size of your dataset, I think it might be a plausible explanation.
To be sure, just look at your RAM usage while executing your code.

Related

GridSearchCV with score = neg_log_loss

I'm trying to use gridsearchCV to search over specified parameters scoring with neg log loss:
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = 'neg_log_loss', cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)
ValueError: y_true contains only one label (1.0). Please provide the true labels explicitly through the labels argument.
The same code but scoring with accuracy works fine. I found that for log loss we need to specify the labels, which works fine when using sklearn.metrics:
y_labels = np.unique(y_true)
y_labels
array([0., 1., 2.])
metrics.log_loss(y_true, y_pred, labels = y_labels )
So I tried:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
ValueError: not enough values to unpack (expected 2, got 1)
I've tried quite a few variations of the above and also creating my own scorer with:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True)
But everything I try comes down to one of the two above errors. Obviously I'm missing something, so any help much appreciated.
Update:
Made a small mistake in the above - this is a three class problem, not a binary problem as I first implied.
I've tried Ben's suggestion (thanks!):
LogLoss = metrics.make_scorer(metrics.log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1, 2])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(order_inner_x, y_inner, groups=names_inner)
I'm getting a different error, so hopefully one step closer, here's the full traceback:
ValueError Traceback (most recent call last)
<ipython-input-164-43d9f1633dc9> in <module>
2
3 grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
----> 4 grid.fit(order_inner_x, y_inner, groups=names_inner)
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
918 self._iterating = self._original_iterator is not None
919
--> 920 while self.dispatch_one_batch(iterator):
921 pass
922
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
566 fit_time = time.time() - start_time
567 # _score will return dict if is_multimetric is True
--> 568 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
569 score_time = time.time() - start_time - fit_time
570 if return_train_score:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
603 """
604 if is_multimetric:
--> 605 return _multimetric_score(estimator, X_test, y_test, scorer)
606 else:
607 if y_test is None:
~/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
633 score = scorer(estimator, X_test)
634 else:
--> 635 score = scorer(estimator, X_test, y_test)
636
637 if hasattr(score, 'item'):
~/anaconda3/lib/python3.7/site-packages/sklearn/metrics/scorer.py in __call__(self, clf, X, y, sample_weight)
133 ' but need classifier with two'
134 ' classes for {} scoring'.format(
--> 135 y_pred.shape, self._score_func.__name__))
136 if sample_weight is not None:
137 return self._sign * self._score_func(y, y_pred,
ValueError: got predict_proba of shape (200, 3), but need classifier with two classes for log_loss scoring
You're most of the way there: you need to provide the labels to your metric. In this attempt:
grid.fit(order_inner_x, y_inner, groups = names_inner, labels = y_labels)
you pass the labels, but to the grid search's fit method rather than the scoring parameter itself.
make_scorer allows other keyword arguments to be passed to the metric function, so this should work:
LogLoss = make_scorer(log_loss, greater_is_better=False, needs_proba=True, labels=[0, 1])
grid = GridSearchCV(spec_pipeline, param_grid = spec_params, scoring = LogLoss, cv = logo, verbose = 10)
grid.fit(X, y_true, groups = names)

Weird shape when I use randomized search cv

I am learning RandomizedSearchCV with a toy example. Suppose that I want to build a linear model y = ax + b. I wrote a custom sklearn estimator that looks like the following:
import numpy as np
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
class testEstimator(BaseEstimator,RegressorMixin):
def __init__(self, alpha=1, beta=0):
self.alpha = alpha
self.beta = beta
def fit(self, X, y=None):
mu = np.ones((len(X)))
for ii in range(len(X)):
mu[ii] = self.alpha*X[ii] + self.beta
self.mu_ = mu
return self
def predict(self, X):
try:
getattr(self,"mu_")
except:
raise RuntimeError("You must train classifer before predicting data!")
return self.mu_
def score(self, X, y):
print("y: ", y)
print("mu: ", self.mu_)
return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
Then, I need to test this.
# temp estimator
tempEs = testEstimator()
# temp params grid
params_grid_temp = {'alpha': [0,1,2,3,4,5,6], 'beta': [0,1,2,3,4]}
# test randomizedSearchCV
temp = RandomizedSearchCV(tempEs, params_grid_temp)
# define X,y
X = range(10)
y = np.dot(2, range(10)) + 4
# fit model
temp.fit(X,y)
However, I got the error
ValueError Traceback (most recent call last)
<ipython-input-8-72a46fdf9098> in <module>
9 y = np.dot(2, range(10)) + 4
10 # fit model
---> 11 temp.fit(X,y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
88 *args, **kwargs)
89 else:
---> 90 score = scorer(estimator, *args, **kwargs)
91 scores[name] = score
92 return scores
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
370 def _passthrough_scorer(estimator, *args, **kwargs):
371 """Function that wraps estimator.score"""
--> 372 return estimator.score(*args, **kwargs)
373
374
<ipython-input-7-0c2138d9bf96> in score(self, X, y)
20 print("y: ", y)
21 print("mu: ", self.mu_)
---> 22 return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
ValueError: operands could not be broadcast together with shapes (2,) (8,)
I figured out (2,) means the size of y and (8,) means the size of self.mu_. How does this happen? They are supposed to be 10.
Okay, I found the problem. Your predict method is totally wrong. It must return predicted value instead of returning mu_.
def predict(self, X):
return self.alpha*X + self.beta
That's it. You can also optimize code in your fit method

Grid Search Cross Validation error when trying to fit X, y with GridSearchCV sklearn

Python sci-kit learn KNN Grid Search Cross Validation error
I am trying to recreated KNN model for prediction of car destination.
https://github.com/carlosbkm/car-destination-prediction
The code is not working at Grid search cross validation here:
https://github.com/carlosbkm/car-destination-prediction/blob/master/k-nearest-model.ipynb
At first geodash was not working so I switched it to geodash2 and there was no problem.
When I try to fit the model I get.
TypeError: unsupported operand type(s) for /: 'str' and 'int'
When I try to fit X and y for Grid Search Cross Validation I get an error.
The problem is coming from
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
if score_func:
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
else:
gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X, y)
print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
best = gs.best_estimator_
return best
I can not fit the model to X and y:
gs.fit(X, y)
I tried to make X and y into floats but nothing changed
When I execute this:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-124-34b56429c6b5> in <module>()
4 #knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
5 knn_parameters = {"n_neighbors": [1,2,5]}
----> 6 knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
<ipython-input-116-1a00f84f1047> in cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func)
6 else:
7 gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
----> 8 gs.fit(X, y)
9 print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
10 best = gs.best_estimator_
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
258 else:
259 fit_time = time.time() - start_time
--> 260 test_score = _score(estimator, X_test, y_test, scorer)
261 score_time = time.time() - start_time - fit_time
262 if return_train_score:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
286 score = scorer(estimator, X_test)
287 else:
--> 288 score = scorer(estimator, X_test, y_test)
289 if hasattr(score, 'item'):
290 try:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
89 super(_PredictScorer, self).__call__(estimator, X, y_true,
90 sample_weight=sample_weight)
---> 91 y_pred = estimator.predict(X)
92 if sample_weight is not None:
93 return self._sign * self._score_func(y_true, y_pred,
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
151
152 if weights is None:
--> 153 y_pred = np.mean(_y[neigh_ind], axis=1)
154 else:
155 y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2907
2908 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2909 out=out, **kwargs)
2910
2911
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 if isinstance(ret, mu.ndarray):
72 ret = um.true_divide(
---> 73 ret, rcount, out=ret, casting='unsafe', subok=False)
74 if is_float16_result and out is None:
75 ret = arr.dtype.type(ret)
TypeError: unsupported operand type(s) for /: 'str' and 'int'

how to fix value error using logistic regression on iris dataset

I am trying to use logistic regression for classification of iris dataset but i'm facing a value error when fitting the model.
I am using iris dataset. I can't figure why it returns a value_error. Any help is appreciated.
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state= 81,
test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='roc_auc')
gridcv.fit(x_train, y_train)
Then I get value_error when fitting
ValueError Traceback (most recent call last)
<ipython-input-108-f4ab6e5f5a79> in <module>()
----> 1 gridcv.fit(x_train, y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
179 y_type = type_of_target(y)
180 if y_type not in ("binary", "multilabel-indicator"):
--> 181 raise ValueError("{0} format is not supported".format(y_type))
182
183 if is_regressor(clf):
ValueError: multiclass format is not supported
You will need to use scoring supported by in case of multi classes. an example is 'recall_micro'
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state= 81,
test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='recall_micro')
gridcv.fit(x_train, y_train)

Multi-class classification in xgboost (python)

My first multiclass classication. I have values Xtrn and Ytrn. Ytrn have 5 values [0,1,2,3,4]. But if i start then get "multiclass format is not supported".
This example of values:
Xtrn Ytrn
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777 2
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366 3
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229 2
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164 1
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648 2
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562 2
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119 2
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116 4
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704 1
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090 2
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229 2
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645 1
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487 1
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705 2
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729 0
This is code:
#import data
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn import metrics, cross_validation, grid_search, preprocessing
Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')
#Number of unique values Ytrn
n_classes_ = len(np.unique(Ytrn))
#learning model
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
xgb_params = [{'num_class': n_classes_}]
xgb_params = [
{
"n_estimators": range(50, 501, 50),
}
]
#cv
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)
This is error:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-233-77d3e8d4b8c3> in <module>()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
827
828 """
--> 829 return self._fit(X, y, ParameterGrid(self.param_grid))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
571 self.fit_params, return_parameters=True,
572 error_score=self.error_score)
--> 573 for parameters in parameter_iterable
574 for train, test in cv)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1682
1683 else:
-> 1684 test_score = _score(estimator, X_test, y_test, scorer)
1685 if return_train_score:
1686 train_score = _score(estimator, X_train, y_train, scorer)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1739 score = scorer(estimator, X_test)
1740 else:
-> 1741 score = scorer(estimator, X_test, y_test)
1742 if hasattr(score, 'item'):
1743 try:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
169 y_type = type_of_target(y)
170 if y_type not in ("binary", "multilabel-indicator"):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported
I found answer. Scoring='roc_auc' onle for binary classification. Need another (eg accuracy)
xgb_params = [{'num_class': n_classes_}] need delete

Categories

Resources