Multi-class classification in xgboost (python)

Multi-class classification in xgboost (python) - python

My first multiclass classication. I have values Xtrn and Ytrn. Ytrn have 5 values [0,1,2,3,4]. But if i start then get "multiclass format is not supported".
This example of values:
Xtrn Ytrn
-1.35173485 1.50224188 2.04951167 0.43759658 0.24381777 2
2.81047260 1.31259056 1.39265240 0.16384002 0.65438366 3
2.32878809 -1.92845940 -2.06453246 0.73132270 0.11771229 2
-0.12810555 -2.07268765 -2.40760215 0.97855042 0.11144164 1
1.88682063 0.75792329 -0.09754671 0.46571931 0.62111648 2
-1.09361266 1.74758304 2.49960891 0.36679883 0.88895562 2
0.71760095 -1.30711698 -2.15681966 0.33700593 0.07171119 2
4.60060308 -1.60544855 -1.88996123 0.94500124 0.63776116 4
-0.84223064 2.78233537 3.07299711 0.31470071 0.34424704 1
-0.71236435 0.53140549 0.46677096 0.12320728 0.58829090 2
-0.35333909 1.12463059 1.70104349 0.89084673 0.16585229 2
3.04322100 -1.36878116 -2.31056167 0.81178387 0.04095645 1
-1.04088918 -1.97497570 -1.93285343 0.54101882 0.02528487 1
-0.41624939 0.54592833 0.95458283 0.40004902 0.55062705 2
-1.77706795 0.29061278 0.68186697 0.17430716 0.75095729 0
This is code:
#import data
import pandas as pd
import numpy as np
from sklearn.cross_validation import train_test_split
import xgboost as xgb
from sklearn import metrics, cross_validation, grid_search, preprocessing
Xtrn = pd.read_csv('x_train_secret.csv', header=None, delimiter=';', na_values='?')
Ytrn = pd.read_csv('y_train_secret.csv', header=None)
Test = pd.read_csv('x_test_secret.csv', header=None, delimiter=';', na_values='?')
#Number of unique values Ytrn
n_classes_ = len(np.unique(Ytrn))
#learning model
X_train, X_test, y_train, y_test = train_test_split(Xtrn, Ytrn, test_size=0.30, random_state=42)
xgb_model = xgb.XGBClassifier(objective='multi:softmax')
xgb_params = [{'num_class': n_classes_}]
xgb_params = [
{
"n_estimators": range(50, 501, 50),
}
]
#cv
cv = cross_validation.StratifiedShuffleSplit(y_train, n_iter=5, test_size=0.3, random_state=42)
xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
xgb_grid.fit(X_train, y_train)
This is error:
Fitting 5 folds for each of 10 candidates, totalling 50 fits
[CV] n_estimators=50 .................................................
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-233-77d3e8d4b8c3> in <module>()
10
11 xgb_grid = grid_search.GridSearchCV(xgb_model, xgb_params, scoring='roc_auc', cv=cv, n_jobs=-1, verbose=3)
---> 12 xgb_grid.fit(X_train, y_train)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
827
828 """
--> 829 return self._fit(X, y, ParameterGrid(self.param_grid))
830
831
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
571 self.fit_params, return_parameters=True,
572 error_score=self.error_score)
--> 573 for parameters in parameter_iterable
574 for train, test in cv)
575
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.pyc in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1682
1683 else:
-> 1684 test_score = _score(estimator, X_test, y_test, scorer)
1685 if return_train_score:
1686 train_score = _score(estimator, X_train, y_train, scorer)
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1739 score = scorer(estimator, X_test)
1740 else:
-> 1741 score = scorer(estimator, X_test, y_test)
1742 if hasattr(score, 'item'):
1743 try:
/home/rudolf/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, clf, X, y, sample_weight)
169 y_type = type_of_target(y)
170 if y_type not in ("binary", "multilabel-indicator"):
--> 171 raise ValueError("{0} format is not supported".format(y_type))
172
173 if is_regressor(clf):
ValueError: multiclass format is not supported

I found answer. Scoring='roc_auc' onle for binary classification. Need another (eg accuracy)
xgb_params = [{'num_class': n_classes_}] need delete

Related

Grid Search Cross Validation error when trying to fit X, y with GridSearchCV sklearn

Python sci-kit learn KNN Grid Search Cross Validation error
I am trying to recreated KNN model for prediction of car destination.
https://github.com/carlosbkm/car-destination-prediction
The code is not working at Grid search cross validation here:
https://github.com/carlosbkm/car-destination-prediction/blob/master/k-nearest-model.ipynb
At first geodash was not working so I switched it to geodash2 and there was no problem.
When I try to fit the model I get.
TypeError: unsupported operand type(s) for /: 'str' and 'int'
When I try to fit X and y for Grid Search Cross Validation I get an error.
The problem is coming from
def cv_optimize(clf, parameters, X, y, n_jobs=1, n_folds=5, score_func=None):
if score_func:
gs = GridSearchCV(clf, param_grid=parameters, cv=n_folds, n_jobs=n_jobs, scoring=score_func)
else:
gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
gs.fit(X, y)
print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
best = gs.best_estimator_
return best
I can not fit the model to X and y:
gs.fit(X, y)
I tried to make X and y into floats but nothing changed
When I execute this:
# Create a k-Nearest Neighbors Regression estimator
knn_estimator = KNeighborsRegressor()
#knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
knn_parameters = {"n_neighbors": [1,2,5]}
knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
I get:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-124-34b56429c6b5> in <module>()
4 #knn_parameters = {"n_neighbors": [1,2,5,10,20,50,100]}
5 knn_parameters = {"n_neighbors": [1,2,5]}
----> 6 knn_best = cv_optimize(knn_estimator, knn_parameters, X_train, y_train, score_func='neg_mean_squared_error')
<ipython-input-116-1a00f84f1047> in cv_optimize(clf, parameters, X, y, n_jobs, n_folds, score_func)
6 else:
7 gs = GridSearchCV(clf, param_grid=parameters, n_jobs=n_jobs, cv=n_folds)
----> 8 gs.fit(X, y)
9 print ("BEST", gs.best_params_, gs.best_score_, gs.cv_results_)
10 best = gs.best_estimator_
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
258 else:
259 fit_time = time.time() - start_time
--> 260 test_score = _score(estimator, X_test, y_test, scorer)
261 score_time = time.time() - start_time - fit_time
262 if return_train_score:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _score(estimator, X_test, y_test, scorer)
286 score = scorer(estimator, X_test)
287 else:
--> 288 score = scorer(estimator, X_test, y_test)
289 if hasattr(score, 'item'):
290 try:
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/metrics/scorer.py in __call__(self, estimator, X, y_true, sample_weight)
89 super(_PredictScorer, self).__call__(estimator, X, y_true,
90 sample_weight=sample_weight)
---> 91 y_pred = estimator.predict(X)
92 if sample_weight is not None:
93 return self._sign * self._score_func(y_true, y_pred,
~/anaconda3/envs/datascience/lib/python3.6/site-packages/sklearn/neighbors/regression.py in predict(self, X)
151
152 if weights is None:
--> 153 y_pred = np.mean(_y[neigh_ind], axis=1)
154 else:
155 y_pred = np.empty((X.shape[0], _y.shape[1]), dtype=np.float64)
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/fromnumeric.py in mean(a, axis, dtype, out, keepdims)
2907
2908 return _methods._mean(a, axis=axis, dtype=dtype,
-> 2909 out=out, **kwargs)
2910
2911
~/anaconda3/envs/datascience/lib/python3.6/site-packages/numpy/core/_methods.py in _mean(a, axis, dtype, out, keepdims)
71 if isinstance(ret, mu.ndarray):
72 ret = um.true_divide(
---> 73 ret, rcount, out=ret, casting='unsafe', subok=False)
74 if is_float16_result and out is None:
75 ret = arr.dtype.type(ret)
TypeError: unsupported operand type(s) for /: 'str' and 'int'

MemoryError cross_val_score Jupyter Notebook

I am newbie in programming and machine learning. I am doing an assignment on KNN and amazon fine food reviews but getting this error.
My code:
from sklearn.model_selection import train_test_split
Y = data['Score'].values
X_with_stop= data['Text_with_stop'].values
X_no_stop = data['New_Text'].values
X_with_stop_train, X_with_stop_test, y_train, y_test = train_test_split(X_with_stop, Y, test_size=0.33, shuffle=False)
print(X_with_stop_train.shape, y_train.shape,y_test.shape)
from sklearn.feature_extraction.text import CountVectorizer
vectorizer = CountVectorizer()
bow_X_train_brute = vectorizer.fit_transform(X_with_stop_train)
bow_X_test_brute = vectorizer.transform(X_with_stop_test)
from sklearn.neighbors import KNeighborsClassifier
from sklearn.model_selection import cross_val_score
from sklearn.metrics import accuracy_score
neighbors = list(range(3,99,2))
cv_scores = []
for k in neighbors:
knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
cv_scores.append(scores.mean())
MSE = [1 - x for x in cv_scores]
# determining best k
optimal_k = neighbors[MSE.index(min(MSE))]
print ("The optimal number of neighbors is %d" % optimal_k)
# plot misclassification error vs k
plt.plot(neighbors, MSE)
plt.xlabel('Number of Neighbors K')
plt.ylabel('Misclassification Error')
plt.title("Plot for K vs Error for Brute force algorithm")
plt.show()
The output:
(413629,) (413629,) (203729,)
The error i am getting is as below:
MemoryError Traceback (most recent call last)
<ipython-input-17-f1ce8e46a2a3> in <module>()
43 for k in neighbors:
44 knn = KNeighborsClassifier(n_neighbors=k,algorithm='brute')
---> 45 scores = cross_val_score(knn, bow_X_train_brute, y_train, cv=10, scoring='accuracy')
46 cv_scores.append(scores.mean())
47
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch)
340 n_jobs=n_jobs, verbose=verbose,
341 fit_params=fit_params,
--> 342 pre_dispatch=pre_dispatch)
343 return cv_results['test_score']
344
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch, return_train_score)
204 fit_params, return_train_score=return_train_score,
205 return_times=True)
--> 206 for train, test in cv.split(X, y, groups))
207
208 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, estimator, X, y_true, sample_weight)
99 super(_PredictScorer, self).__call__(estimator, X, y_true,
100 sample_weight=sample_weight)
--> 101 y_pred = estimator.predict(X)
102 if sample_weight is not None:
103 return self._sign * self._score_func(y_true, y_pred,
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\classification.py in predict(self, X)
143 X = check_array(X, accept_sparse='csr')
144
--> 145 neigh_dist, neigh_ind = self.kneighbors(X)
146
147 classes_ = self.classes_
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\neighbors\base.py in kneighbors(self, X, n_neighbors, return_distance)
355 if self.effective_metric_ == 'euclidean':
356 dist = pairwise_distances(X, self._fit_X, 'euclidean',
--> 357 n_jobs=n_jobs, squared=True)
358 else:
359 dist = pairwise_distances(
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in pairwise_distances(X, Y, metric, n_jobs, **kwds)
1245 func = partial(distance.cdist, metric=metric, **kwds)
1246
-> 1247 return _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1248
1249
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in _parallel_pairwise(X, Y, func, n_jobs, **kwds)
1088 if n_jobs == 1:
1089 # Special case to avoid picklability checks in delayed
-> 1090 return func(X, Y, **kwds)
1091
1092 # TODO: in some cases, backend='threading' may be appropriate
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\pairwise.py in euclidean_distances(X, Y, Y_norm_squared, squared, X_norm_squared)
244 YY = row_norms(Y, squared=True)[np.newaxis, :]
245
--> 246 distances = safe_sparse_dot(X, Y.T, dense_output=True)
247 distances *= -2
248 distances += XX
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\utils\extmath.py in safe_sparse_dot(a, b, dense_output)
133 """
134 if issparse(a) or issparse(b):
--> 135 ret = a * b
136 if dense_output and hasattr(ret, "toarray"):
137 ret = ret.toarray()
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\base.py in __mul__(self, other)
477 if self.shape[1] != other.shape[0]:
478 raise ValueError('dimension mismatch')
--> 479 return self._mul_sparse_matrix(other)
480
481 # If it's a list or whatever, treat it like a matrix
C:\ProgramData\Anaconda3\lib\site-packages\scipy\sparse\compressed.py in _mul_sparse_matrix(self, other)
500 maxval=nnz)
501 indptr = np.asarray(indptr, dtype=idx_dtype)
--> 502 indices = np.empty(nnz, dtype=idx_dtype)
503 data = np.empty(nnz, dtype=upcast(self.dtype, other.dtype))
504

A MemoryError usually means that you ran out of RAM. And seeing the size of your dataset, I think it might be a plausible explanation.
To be sure, just look at your RAM usage while executing your code.

how to fix value error using logistic regression on iris dataset

I am trying to use logistic regression for classification of iris dataset but i'm facing a value error when fitting the model.
I am using iris dataset. I can't figure why it returns a value_error. Any help is appreciated.
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state= 81,
test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='roc_auc')
gridcv.fit(x_train, y_train)
Then I get value_error when fitting
ValueError Traceback (most recent call last)
<ipython-input-108-f4ab6e5f5a79> in <module>()
----> 1 gridcv.fit(x_train, y_train)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
637 error_score=self.error_score)
638 for parameters, (train, test) in product(candidate_params,
--> 639 cv.split(X, y, groups)))
640
641 # if one choose to see train score, "out" will contain train score info
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in apply_async(self, func, callback)
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, error_score)
486 fit_time = time.time() - start_time
487 # _score will return dict if is_multimetric is True
--> 488 test_scores = _score(estimator, X_test, y_test, scorer, is_multimetric)
489 score_time = time.time() - start_time - fit_time
490 if return_train_score:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, is_multimetric)
521 """
522 if is_multimetric:
--> 523 return _multimetric_score(estimator, X_test, y_test, scorer)
524 else:
525 if y_test is None:
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _multimetric_score(estimator, X_test, y_test, scorers)
551 score = scorer(estimator, X_test)
552 else:
--> 553 score = scorer(estimator, X_test, y_test)
554
555 if hasattr(score, 'item'):
C:\ProgramData\Anaconda3\lib\site-packages\sklearn\metrics\scorer.py in __call__(self, clf, X, y, sample_weight)
179 y_type = type_of_target(y)
180 if y_type not in ("binary", "multilabel-indicator"):
--> 181 raise ValueError("{0} format is not supported".format(y_type))
182
183 if is_regressor(clf):
ValueError: multiclass format is not supported

You will need to use scoring supported by in case of multi classes. an example is 'recall_micro'
iris = datasets.load_iris()
X, y = iris.data, iris.target
x_train, x_test, y_train, y_test = train_test_split(X, y, stratify=y, random_state= 81,
test_size=0.3)
logreg = LogisticRegression()
params_grid = {"C":[0.001, 0.01, 0.1, 1, 10, 100]}
gridcv = GridSearchCV(logreg, params_grid, cv=10, scoring='recall_micro')
gridcv.fit(x_train, y_train)

ValueError: Unknown label type: 'continuous

I am trying to carry out a grid search to optimise my parameters my code is:
from sklearn.grid_search import GridSearchCV
from sklearn.svm import SVC
parameters = [{'kernel':['rbf'], 'gamma' :[1e-2, 1e-3, 1e-4 ,1e-5],
'C': [1, 10, 100, 1000]},
{'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree':[1,2,3,4]}]
clf = GridSearchCV (SVC(C=1), parameters, cv=5, scoring='f1_macro')
clf.fit(X_train, y_train)
My X_train, y_train are floating point numbers which are:
x_train = [[3.30049159],[2.25226244],[1.44078451] ...,[5.63927925],[5.431458],[4.35674369]]
y_train = [[0.2681013],[0.03454225],[0.02062136]...,[0.21827915],[0.28768273,[0.27969417]]
I believe the error may be that I am using floating point numbers and maybe only integers are able to be passed into the classifier if this is the case, how would this be resolved? My full traceback error message is:
ValueError Traceback (most recent call last)
<ipython-input-51-fb016a0a90cc> in <module>()
11
12 clf = GridSearchCV (SVC(C=1), parameters, cv=5, scoring='f1_macro')
---> 13 clf.fit(X_train, y_train)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/grid_search.py in fit(self, X, y)
836
837 """
--> 838 return self._fit(X, y, ParameterGrid(self.param_grid))
839
840
~/anaconda3_501/lib/python3.6/site-packages/sklearn/grid_search.py in _fit(self, X, y, parameter_iterable)
572 self.fit_params, return_parameters=True,
573 error_score=self.error_score)
--> 574 for parameters in parameter_iterable
575 for train, test in cv)
576
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
777 # was dispatched. In particular this covers the edge
778 # case of Parallel used with an exhausted iterator.
--> 779 while self.dispatch_one_batch(iterator):
780 self._iterating = True
781 else:
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
623 return False
624 else:
--> 625 self._dispatch(tasks)
626 return True
627
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
586 dispatch_timestamp = time.time()
587 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 588 job = self._backend.apply_async(batch, callback=cb)
589 self._jobs.append(job)
590
109 def apply_async(self, func, callback=None):
110 """Schedule a func to be run"""
--> 111 result = ImmediateResult(func)
112 if callback:
113 callback(result)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
330 # Don't delay the application, to avoid keeping the input
331 # arguments in memory
--> 332 self.results = batch()
333
334 def get(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/anaconda3_501/lib/python3.6/site-packages/sklearn/cross_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1673 estimator.fit(X_train, **fit_params)
1674 else:
-> 1675 estimator.fit(X_train, y_train, **fit_params)
1676
1677 except Exception as e:
~/anaconda3_501/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
148
149 X, y = check_X_y(X, y, dtype=np.float64, order='C', accept_sparse='csr')
--> 150 y = self._validate_targets(y)
151
152 sample_weight = np.asarray([]
~/anaconda3_501/lib/python3.6/site-packages/sklearn/svm/base.py in _validate_targets(self, y)
498 def _validate_targets(self, y):
499 y_ = column_or_1d(y, warn=True)
--> 500 check_classification_targets(y)
501 cls, y = np.unique(y_, return_inverse=True)
502 self.class_weight_ = compute_class_weight(self.class_weight, cls, y_)
~/anaconda3_501/lib/python3.6/site-packages/sklearn/utils/multiclass.py in check_classification_targets(y)
170 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
171 'multilabel-indicator', 'multilabel-sequences']:
--> 172 raise ValueError("Unknown label type: %r" % y_type)
173
174
ValueError: Unknown label type: 'continuous'
Help with this would be appreciated.

You are using a classifier. You can only classify binary or categorical variables. If you want to use support vector but predict numeric values you should use support vector regression.
Otherwise you will have to classify your y-values into groups.

This is a regression problem not a classification problem. What the model is trying to do is - fit X into classes defined by Y (which are continuous). This is unknown to SVC classifier. Update your code with SVR
from sklearn.svm import SVR
from sklearn.grid_search import GridSearchCV
X_train = [[3.30049159], [2.25226244], [1.44078451]]
#1. Y should be 1d array of dimensions (n_samples,)
y_train = [0.2681013, 0.03454225, 0.02062136]
#Grid Search
parameters = [{'kernel': ['rbf'], 'gamma': [1e-2, 1e-3, 1e-4, 1e-5],
'C': [1, 10, 100, 1000]},
{'kernel': ['poly'], 'C': [1, 10, 100, 1000], 'degree': [1, 2, 3, 4]}]
#2. Type of regressor
reg = SVR(C=1)
#3. Regression evaluation cannot be done using f1_macro, so updated NMSE
clf = GridSearchCV(reg, parameters, cv=5, scoring='neg_mean_squared_error')
clf.fit(X_train, y_train)

Grid search with f1 as scoring function, several pages of error message

Want to use Gridsearch to find best parameters and use f1 as the scoring metric.
If i remove the scoring function, all works well and i get no errors.
Here is my code:
from sklearn import grid_search
parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
train_classifier(reg, X_train, y_train)
train_f1_score = predict_labels(reg, X_train, y_train)
print reg.best_params_
print "F1 score for training set: {}".format(train_f1_score)
print "F1 score for test set: {}".format(predict_labels(reg, X_test, y_test))
When i execute i get pages upon pages as errors, and i cannot make heads or tails of it :(
ValueError Traceback (most recent call last)
<ipython-input-17-3083ff8a20ea> in <module>()
3 parameters = {'n_neighbors':(1,3,5,10,15),'weights':('uniform','distance'),'algorithm':('ball_tree','kd_tree','brute'),'leaf_size':(5,10,20,30,50)}
4 reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring="f1")
----> 5 train_classifier(reg, X_train, y_train)
6 train_f1_score = predict_labels(reg, X_train, y_train)
7 print reg.best_params_
<ipython-input-9-b56ce25fd90b> in train_classifier(clf, X_train, y_train)
5 print "Training {}...".format(clf.__class__.__name__)
6 start = time.time()
----> 7 clf.fit(X_train, y_train)
8 end = time.time()
9 print "Done!\nTraining time (secs): {:.3f}".format(end - start)
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in fit(self, X, y)
802
803 """
--> 804 return self._fit(X, y, ParameterGrid(self.param_grid))
805
806
//anaconda/lib/python2.7/site-packages/sklearn/grid_search.pyc in _fit(self, X, y, parameter_iterable)
551 self.fit_params, return_parameters=True,
552 error_score=self.error_score)
--> 553 for parameters in parameter_iterable
554 for train, test in cv)
555
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self, iterable)
802 self._iterating = True
803
--> 804 while self.dispatch_one_batch(iterator):
805 pass
806
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in dispatch_one_batch(self, iterator)
660 return False
661 else:
--> 662 self._dispatch(tasks)
663 return True
664
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in _dispatch(self, batch)
568
569 if self._pool is None:
--> 570 job = ImmediateComputeBatch(batch)
571 self._jobs.append(job)
572 self.n_dispatched_batches += 1
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __init__(self, batch)
181 # Don't delay the application, to avoid keeping the input
182 # arguments in memory
--> 183 self.results = batch()
184
185 def get(self):
//anaconda/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.pyc in __call__(self)
70
71 def __call__(self):
---> 72 return [func(*args, **kwargs) for func, args, kwargs in self.items]
73
74 def __len__(self):
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, error_score)
1548
1549 else:
-> 1550 test_score = _score(estimator, X_test, y_test, scorer)
1551 if return_train_score:
1552 train_score = _score(estimator, X_train, y_train, scorer)
//anaconda/lib/python2.7/site-packages/sklearn/cross_validation.pyc in _score(estimator, X_test, y_test, scorer)
1604 score = scorer(estimator, X_test)
1605 else:
-> 1606 score = scorer(estimator, X_test, y_test)
1607 if not isinstance(score, numbers.Number):
1608 raise ValueError("scoring must return a number, got %s (%s) instead."
//anaconda/lib/python2.7/site-packages/sklearn/metrics/scorer.pyc in __call__(self, estimator, X, y_true, sample_weight)
88 else:
89 return self._sign * self._score_func(y_true, y_pred,
---> 90 **self._kwargs)
91
92
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in f1_score(y_true, y_pred, labels, pos_label, average, sample_weight)
637 return fbeta_score(y_true, y_pred, 1, labels=labels,
638 pos_label=pos_label, average=average,
--> 639 sample_weight=sample_weight)
640
641
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in fbeta_score(y_true, y_pred, beta, labels, pos_label, average, sample_weight)
754 average=average,
755 warn_for=('f-score',),
--> 756 sample_weight=sample_weight)
757 return f
758
//anaconda/lib/python2.7/site-packages/sklearn/metrics/classification.pyc in precision_recall_fscore_support(y_true, y_pred, beta, labels, pos_label, average, warn_for, sample_weight)
982 else:
983 raise ValueError("pos_label=%r is not a valid label: %r" %
--> 984 (pos_label, present_labels))
985 labels = [pos_label]
986 if labels is None:
ValueError: pos_label=1 is not a valid label: array(['no', 'yes'],
dtype='|S3')

Seems that you have label array with values 'no' and 'yes', you should convert them to binary 1-0 numerical representation, because your error states that scoring function cannot understand where 0's and 1's are in your label array.
Other possible way to solve it without modifying your label array:
from sklearn.metrics import f1_score
from sklearn.metrics import make_scorer
f1_scorer = make_scorer(f1_score, pos_label="yes")
reg = grid_search.GridSearchCV(estimator=neigh,param_grid=parameters,scoring=f1_scorer)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Multi-class classification in xgboost (python) - python

I found answer. Scoring='roc_auc' onle for binary classification. Need another (eg accuracy) xgb_params = [{'num_class': n_classes_}] need delete

Related

Grid Search Cross Validation error when trying to fit X, y with GridSearchCV sklearn

MemoryError cross_val_score Jupyter Notebook

how to fix value error using logistic regression on iris dataset

ValueError: Unknown label type: 'continuous

Grid search with f1 as scoring function, several pages of error message

Categories

Resources