BayesSearchCV - lightgbm - early stopping - "ValueError: not enough values to unpack" - python

Could you help me with the problem below? Many thanks in advance.
Without fit_params=fit_params, the code below works fine, but I want to try early stopping with LightGBM.
I did search for clues, but found only limited resources apart from a few GitHub issues on LightGBM and BayesSearchCV.
lg = lgb.LGBMClassifier(random_state=42, n_jobs=-1, objective='multiclass', n_estimators=5000)
fullPipeline = Pipeline(steps=[
    ('scaler', StandardScaler()),
    ('model', lg)
])
param_space = {
    'model__max_depth': [2, 63],
    'model__num_leaves': [7, 4095],
}
fit_params = {
    'early_stopping_rounds': 30,
    'eval_metric': 'accuracy',
    'eval_set': [(xValid, yValid)],
}
BSLGB = BayesSearchCV(fullPipeline, param_space, random_state=42, scoring='accuracy',
                      cv=5, n_iter=50, verbose=3, n_jobs=-1, fit_params=fit_params)
%time BSLGB.fit(xTrain.astype(float), yTrain)
Exception:
ValueError Traceback (most recent call last)
<timed eval> in <module>
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in fit(self, X, y, groups, callback)
652 optim_result = self._step(
653 X, y, search_space, optimizer,
--> 654 groups=groups, n_points=n_points_adjusted
655 )
656 n_iter -= n_points
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in _step(self, X, y, search_space, optimizer, groups, n_points)
548 refit = self.refit
549 self.refit = False
--> 550 self._fit(X, y, groups, params_dict)
551 self.refit = refit
552
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in _fit(self, X, y, groups, parameter_iterable)
401 error_score=self.error_score
402 )
--> 403 for parameters in parameter_iterable
404 for train, test in cv_iter)
405
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
C:\Anaconda3x64\envs\ml\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
C:\Anaconda3x64\envs\ml\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: not enough values to unpack (expected 2, got 1)

The root cause of this issue is that I passed a pipeline, not a bare model, into BayesSearchCV, while the keys in my fit_params had no step prefix. The fix is to prefix each key with the pipeline step name (model__) so the Pipeline routes them to the LGBMClassifier:
fit_params = {
    'model__early_stopping_rounds': 30,
    'model__eval_metric': 'multi_logloss',
    'model__eval_set': [(xValid, yValid)],
}
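For reference, a minimal sketch of the corrected end-to-end call with the prefixed fit_params, assuming xTrain, yTrain, xValid, and yValid are defined as in the question:
BSLGB = BayesSearchCV(fullPipeline, param_space, random_state=42, scoring='accuracy',
                      cv=5, n_iter=50, verbose=3, n_jobs=-1, fit_params=fit_params)
BSLGB.fit(xTrain.astype(float), yTrain)
Pipeline.fit strips the model__ prefix and forwards the remaining keyword arguments to the fit method of the 'model' step, which is where LGBMClassifier consumes early_stopping_rounds and eval_set. One caveat worth noting: the eval_set entries are passed to the model as-is, so they are not transformed by the StandardScaler step of the pipeline.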

Related

XGBoost image classification Bad Allocation error

So I am using XGBoost for an image classification problem, but I am getting a Bad Allocation error when running model.fit with the training data. I think the problem must be the data, yet it is in a 2D array as supported by XGBoost, and it worked with no problem when the data had first been passed through a VGG16 feature extractor. Any possible solutions, please?
x = np.array(allImages)
y = np.array(allLabels)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=123)
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)
del y_train
le.fit(y_test)
y_test_encoded = le.transform(y_test)
x_train = x_train.reshape(x_train.shape[0], -1)
model = xgb.XGBClassifier(use_label_encoder=False, objective='multi:softprob',
                          num_class=10)
model.fit(x_train, y_train_encoded)
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-12-d0749b6882f3> in <module>
5 model = xgb.XGBClassifier(use_label_encoder=False,objective='multi:softprob',
6 num_class= 10)
----> 7 model.fit(x_train,y_train_encoded)
8 #For the kFold
9 le.fit(y)
C:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
420 for k, arg in zip(sig.parameters, args):
421 kwargs[k] = arg
--> 422 return f(**kwargs)
423
424 return inner_f
C:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, feature_weights, callbacks)
901 self.n_features_in_ = self._features_count
902
--> 903 train_dmatrix, evals = self._wrap_evaluation_matrices(
904 X, y, group=None, sample_weight=sample_weight, base_margin=base_margin,
905 feature_weights=feature_weights,
C:\Anaconda\lib\site-packages\xgboost\sklearn.py in _wrap_evaluation_matrices(self, X, y, group, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, eval_group, label_transform)
263
264 y = label_transform(y)
--> 265 train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
266 base_margin=base_margin,
267 missing=self.missing, nthread=self.n_jobs)
C:\Anaconda\lib\site-packages\xgboost\core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
498
499 from .data import dispatch_data_backend
--> 500 handle, feature_names, feature_types = dispatch_data_backend(
501 data, missing=self.missing,
502 threads=self.nthread,
C:\Anaconda\lib\site-packages\xgboost\data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
528 return _from_scipy_csr(data.tocsr(), missing, feature_names, feature_types)
529 if _is_numpy_array(data):
--> 530 return _from_numpy_array(data, missing, threads, feature_names,
531 feature_types)
532 if _is_uri(data):
C:\Anaconda\lib\site-packages\xgboost\data.py in _from_numpy_array(data, missing, nthread, feature_names, feature_types)
145 flatten = _transform_np_array(data)
146 handle = ctypes.c_void_p()
--> 147 _check_call(_LIB.XGDMatrixCreateFromMat_omp(
148 flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
149 c_bst_ulong(data.shape[0]),
C:\Anaconda\lib\site-packages\xgboost\core.py in _check_call(ret)
187 """
188 if ret != 0:
--> 189 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
190
191
XGBoostError: bad allocation

PicklingError: Could not pickle the task to send it to the workers

I am working on an NLP Kaggle project and I am using RandomizedSearchCV. I have defined a function named GO which runs RandomizedSearchCV with KFold, a scoring criterion, and a parameter grid. Following is my code; when I call GO, it raises an error:
kf = KFold(n_splits=5, random_state=0, shuffle=True)
acc = lambda y, y_pred: accuracy_score(y, y_pred)
scorer = make_scorer(acc, greater_is_better=True)

def GO(model, grid, n_iter=100):
    search = RandomizedSearchCV(model, grid, n_iter, scorer, n_jobs=-1, cv=kf,
                                random_state=0, verbose=True)
    return search.fit(X_train, y_train)
This is the error I get:
PicklingError Traceback (most recent call last)
<ipython-input-131-310dea03e0ad> in <module>
3
4 for pipe, grid in zip(pipes, grids):
----> 5 fitted_models.append(GO(pipe, grid))
<ipython-input-129-98eb26241ea1> in GO(model, grid, n_iter)
1 def GO(model, grid, n_iter=100):
2 search = RandomizedSearchCV(model, grid, n_iter, scorer, n_jobs=-1, cv=kf, random_state=0, verbose=True)
----> 3 return search.fit(X_train, y_train)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1513 evaluate_candidates(ParameterSampler(
1514 self.param_distributions, self.n_iter,
-> 1515 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
PicklingError: Could not pickle the task to send it to the workers.
I have tried to resolve it but cannot. Can anyone here help me?

Custom scoring in learning_curve

I cannot customize the scoring in sklearn.model_selection.learning_curve. My estimator is an SVR, which is a regressor, but the scoring should treat it as a classifier, so I need to implement a translation from the continuous predicted values to classes.
I have followed the documentation: https://scikit-learn.org/stable/modules/model_evaluation.html and https://scikit-learn.org/stable/modules/generated/sklearn.model_selection.learning_curve.html#sklearn.model_selection.learning_curve
I am using scikit-learn 0.22 and Python 3.7.
This is my code:
def scorer(y_true, y_pred):
    closest = [y_true[i] if abs(y_true[i] - y_) <= 1.0
               else y_true.flat[np.abs(y_true - y_).argmin()]
               for i, y_ in enumerate(y_pred)]
    return accuracy_score(y_true, closest)

train_sizes, train_scores, test_scores, fit_times, _ = \
    learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
                   train_sizes=train_sizes,
                   return_times=True, scoring=make_scorer(scorer))
I got the following error:
AttributeError: 'Series' object has no attribute 'flat'
<ipython-input-10-bc8ce2a8f15e> in plot_learning_curve(estimator, title, X, y, scoring, axes, ylim, cv, n_jobs, train_sizes)
90 learning_curve(estimator, X, y, cv=cv, n_jobs=n_jobs,
91 train_sizes=train_sizes,
---> 92 return_times=True, scoring=scoring)
93 train_scores_mean = np.mean(train_scores, axis=1)
94 train_scores_std = np.std(train_scores, axis=1)
~/miniconda3/envs/dtscience/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in learning_curve(estimator, X, y, groups, train_sizes, cv, scoring, exploit_incremental_learning, n_jobs, pre_dispatch, verbose, shuffle, random_state, error_score, return_times)
1265 parameters=None, fit_params=None, return_train_score=True,
1266 error_score=error_score, return_times=return_times)
-> 1267 for train, test in train_test_proportions)
1268 out = np.array(out)
1269 n_cv_folds = out.shape[0] // n_unique_ticks
~/miniconda3/envs/dtscience/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~/miniconda3/envs/dtscience/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~/miniconda3/envs/dtscience/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~/miniconda3/envs/dtscience/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~/miniconda3/envs/dtscience/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
AttributeError: 'Series' object has no attribute 'flat'
We can't see the value of y in plot_learning_curve, but somewhere along the line y is being provided from a pandas DataFrame, or is a standalone Series. In the code for your scorer function you have the following:
y_true.flat[np.abs(y_true - y_).argmin()]
Because of this, y needs to be a numpy ndarray. Since y is a Series, you either want to do the following:
learning_curve(estimator, X, y.values, cv=cv, n_jobs=n_jobs,
               train_sizes=train_sizes,
               return_times=True, scoring=make_scorer(scorer))
Or make sure plot_learning_curve is called as follows in your other script:
plot_learning_curve(estimator, title, X, y.values, scoring, axes, ylim, cv, n_jobs, train_sizes)
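To make the fix concrete, here is a minimal self-contained sketch; the synthetic X and y are invented for illustration, while the scorer and the y.values conversion come from the question and the answer above:
import numpy as np
import pandas as pd
from sklearn.svm import SVR
from sklearn.metrics import accuracy_score, make_scorer
from sklearn.model_selection import learning_curve

def scorer(y_true, y_pred):
    # Keep y_true[i] when the prediction is within 1.0 of it; otherwise
    # substitute the true value nearest the prediction, then score as
    # a classification.
    closest = [y_true[i] if abs(y_true[i] - y_) <= 1.0
               else y_true.flat[np.abs(y_true - y_).argmin()]
               for i, y_ in enumerate(y_pred)]
    return accuracy_score(y_true, closest)

rng = np.random.RandomState(0)
X = rng.rand(100, 3)
y = pd.Series(rng.randint(0, 3, size=100))  # a pandas Series, as in the question

# Passing y.values hands the scorer an ndarray, which supports .flat.
train_sizes, train_scores, test_scores, fit_times, _ = learning_curve(
    SVR(), X, y.values, cv=5, n_jobs=1,
    train_sizes=np.linspace(0.2, 1.0, 4),
    return_times=True, scoring=make_scorer(scorer))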

Python KeyError: 0 when using D2VTransformer

I am getting KeyError: 0 when running this code in Python:
full_pipeline.fit(X_train, y_train)
Here is the complete code:
from gensim.sklearn_api import D2VTransformer
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
name_pipeline = Pipeline(steps=[
    ('feature_selector', FeatureSelector(['name'])),
    ('feature_transformer', D2VTransformer())
])
description_pipeline = Pipeline(steps=[
    ('feature_selector', FeatureSelector(['description'])),
    ('feature_transformer', D2VTransformer())
])
X_pipeline = FeatureUnion(transformer_list=[
    ('name_pipeline', name_pipeline),
    ('description_pipeline', description_pipeline)
])

# Split up the train and test data
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.1)

clf = LogisticRegression(random_state=0, class_weight='balanced', solver='lbfgs',
                         max_iter=1000, multi_class='multinomial')
full_pipeline = Pipeline(steps=[
    ('pipeline', X_pipeline),
    ('model', clf)
])
full_pipeline.fit(X_train, y_train)
And here is the error I'm getting:
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2896 try:
-> 2897 return self._engine.get_loc(key)
2898 except KeyError:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
During handling of the above exception, another exception occurred:
KeyError Traceback (most recent call last)
19 frames
<ipython-input-14-0ddbaedffb67> in <module>()
25 ( 'model', clf ) ] )
26
---> 27 full_pipeline.fit(X_train, y_train)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/usr/local/lib/python3.6/dist-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
910 sum of n_components (output dimension) over transformers.
911 """
--> 912 results = self._parallel_func(X, y, fit_params, _fit_transform_one)
913 if not results:
914 # All transformers are None
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _parallel_func(self, X, y, fit_params, func)
940 message=self._log_message(name, idx, len(transformers)),
941 **fit_params) for idx, (name, transformer,
--> 942 weight) in enumerate(transformers, 1))
943
944 def transform(self, X):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self, iterable)
1001 # remaining jobs.
1002 self._iterating = False
-> 1003 if self.dispatch_one_batch(iterator):
1004 self._iterating = self._original_iterator is not None
1005
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
832 return False
833 else:
--> 834 self._dispatch(tasks)
835 return True
836
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in _dispatch(self, batch)
751 with self._lock:
752 job_idx = len(self._jobs)
--> 753 job = self._backend.apply_async(batch, callback=cb)
754 # A job can complete so quickly than its callback is
755 # called before we get here, causing self._jobs to
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
199 def apply_async(self, func, callback=None):
200 """Schedule a func to be run"""
--> 201 result = ImmediateResult(func)
202 if callback:
203 callback(result)
/usr/local/lib/python3.6/dist-packages/joblib/_parallel_backends.py in __init__(self, batch)
580 # Don't delay the application, to avoid keeping the input
581 # arguments in memory
--> 582 self.results = batch()
583
584 def get(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/usr/local/lib/python3.6/dist-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
/usr/local/lib/python3.6/dist-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
554 else:
555 # fit method of arity 2 (supervised transformation)
--> 556 return self.fit(X, y, **fit_params).transform(X)
557
558
/usr/local/lib/python3.6/dist-packages/gensim/sklearn_api/d2vmodel.py in fit(self, X, y)
158
159 """
--> 160 if isinstance(X[0], doc2vec.TaggedDocument):
161 d2v_sentences = X
162 else:
/usr/local/lib/python3.6/dist-packages/pandas/core/frame.py in __getitem__(self, key)
2993 if self.columns.nlevels > 1:
2994 return self._getitem_multilevel(key)
-> 2995 indexer = self.columns.get_loc(key)
2996 if is_integer(indexer):
2997 indexer = [indexer]
/usr/local/lib/python3.6/dist-packages/pandas/core/indexes/base.py in get_loc(self, key, method, tolerance)
2897 return self._engine.get_loc(key)
2898 except KeyError:
-> 2899 return self._engine.get_loc(self._maybe_cast_indexer(key))
2900 indexer = self.get_indexer([key], method=method, tolerance=tolerance)
2901 if indexer.ndim > 1 or indexer.size > 1:
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas/_libs/hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 0
Does anyone know why this might happen? I think it has to do with D2VTransformer, because when I run the code below I get the same error:
model = D2VTransformer(min_count=1, size=5)
docvecs = model.fit_transform(X_train)
But when selecting only one column from the DataFrame:
docvecs = model.fit_transform(X_train['name'])
it doesn't throw an error. That is why I used only one column in each pipeline I created, but I am still getting the error.
This is how X_train looks:
name description
9107 way great entrepreneur push limit help succeed way great entrepreneur push limit
7706 dit het team week week dit het team week week
3995 decorate home jewel tone feel bold colour choice inspire fill home abun...
5220 attic meat district attic meat district
3412 tee apparel choose design item clothe accessory piece inde...
... ... ...
3830 marque web designer mode marque web designer
3261 design holiday rest bite try lear magazine dai... design holiday rest bite try lear
2415 hallucinatory house father spirit music room hold tower season rug produce early...
7223 jacket rise jacket rise
4697 cupcake bake explorer love love chocolate cupcake top kind easy foll...
And some more details about X_train:
X_train.shape
(7159, 2)
X_train.dtypes
name object
description object
dtype: object
It looks like there was a recent bug fix in gensim (October 2019, not yet in any official release) that makes D2VTransformer more tolerant of some pandas Series as data sources, resolving exactly the exception you've hit.
The changed line of code is exactly the one shown in your extended error stack: line 160 of d2vmodel.py, testing X[0].
I would suggest grabbing the raw source of the latest version of d2vmodel.py to use locally (instead of importing from gensim.sklearn_api) and checking whether that resolves your issue. See:
https://github.com/RaRe-Technologies/gensim/blob/develop/gensim/sklearn_api/d2vmodel.py
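If patching gensim locally is not convenient, here is a workaround sketch. It assumes, as the stack trace suggests, that the failure comes from pandas label-based indexing when D2VTransformer evaluates X[0]; converting the selected column to a plain list makes that lookup positional. The to_list step is a hypothetical helper, not part of gensim, and FeatureSelector is the custom selector from the question:
from sklearn.preprocessing import FunctionTransformer

# Hypothetical helper: squeeze the one-column DataFrame returned by
# FeatureSelector into a Series, then convert it to a plain Python list
# so that X[0] inside D2VTransformer indexes by position, not by label.
to_list = FunctionTransformer(lambda X: X.squeeze().tolist(), validate=False)

name_pipeline = Pipeline(steps=[
    ('feature_selector', FeatureSelector(['name'])),
    ('to_list', to_list),
    ('feature_transformer', D2VTransformer())
])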

GridSearchCV - TypeError: an integer is required

I am trying to find the best hyperparameters for my SVM using Grid Search. When doing it the following way:
from sklearn.model_selection import GridSearchCV
param_grid = {'coef0': [10, 5, 0.5, 0.001], 'C': [100, 50, 1, 0.001]}
poly_svm_search = SVC(kernel="poly", degree="2")
grid_search = GridSearchCV(poly_svm_search, param_grid, cv=5, scoring='f1')
grid_search.fit(train_data, train_labels)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-dadf5782618c> in <module>
8
----> 9 grid_search.fit(train_data, train_labels)
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
981 # remaining jobs.
982 self._iterating = False
--> 983 if self.dispatch_one_batch(iterator):
984 self._iterating = self._original_iterator is not None
985
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
823 return False
824 else:
--> 825 self._dispatch(tasks)
826 return True
827
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
780 with self._lock:
781 job_idx = len(self._jobs)
--> 782 job = self._backend.apply_async(batch, callback=cb)
783 # A job can complete so quickly than its callback is
784 # called before we get here, causing self._jobs to
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
543 # Don't delay the application, to avoid keeping the input
544 # arguments in memory
--> 545 self.results = batch()
546
547 def get(self):
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~/.local/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
210
211 seed = rnd.randint(np.iinfo('i').max)
--> 212 fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
213 # see comment on the other call to np.iinfo in this file
214
~/.local/lib/python3.6/site-packages/sklearn/svm/base.py in _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
291 sample_weight, self.nu, self.cache_size, self.epsilon,
292 int(self.shrinking), int(self.probability), self.max_iter,
--> 293 random_seed)
294
295 self._warn_from_fit_status()
sklearn/svm/libsvm_sparse.pyx in sklearn.svm.libsvm_sparse.libsvm_sparse_train()
TypeError: an integer is required
My train_labels variable contains a list of booleans, so I have a binary classification problem. train_data is a <class 'scipy.sparse.csr.csr_matrix'>, basically containing all scaled and one-hot encoded features.
What did I do wrong? It's hard for me to track down the issue here. Thank you in advance for any help ;).
When you initialize the SVC using this line:
poly_svm_search = SVC(kernel="poly", degree="2")
You are supplying the degree param as a string, because of the quotation marks around it. But according to the documentation, degree takes an integer value:
degree : int, optional (default=3)
    Degree of the polynomial kernel function ('poly'). Ignored by all other kernels.
So you need to do this:
poly_svm_search = SVC(kernel="poly", degree=2)
Notice that there are no quotation marks around the 2 here.
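Putting it together, the corrected search, with everything else unchanged from the question (train_data and train_labels are assumed to be defined as described there):
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC

param_grid = {'coef0': [10, 5, 0.5, 0.001], 'C': [100, 50, 1, 0.001]}
poly_svm_search = SVC(kernel="poly", degree=2)  # degree is an int, not the string "2"
grid_search = GridSearchCV(poly_svm_search, param_grid, cv=5, scoring='f1')
grid_search.fit(train_data, train_labels)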
