I am trying to fit an XGBoost model with early stopping, which requires an eval_set parameter. However, I am using a pipeline that does preprocessing before the model-fitting step. I want to pass "eval_set" to that particular step and have used the syntax "stepname__eval_set=...", which doesn't seem to work.
Here is my code:
XGB=XGBRegressor(n_estimators=10000,learning_rate=0.05,verbose=False)
myPip=Pipeline(steps=[("preprocessing",preprocessor),
("model",XGB)])
myPip.fit(X_train2,y_train,model__eval_set=[(X_val2,y_val)],model__early_stopping_rounds=5)
It returns the following error
ValueError Traceback (most recent call last)
C:\Users\PCGZ~1\AppData\Local\Temp/ipykernel_17976/459508294.py in <module>
2 myPip=Pipeline(steps=[("preprocessing",preprocessor),
3 ("model",XGB)])
----> 4 myPip.fit(X_train2,y_train,model__eval_set=[(X_val2,y_val)],model__early_stopping_rounds=5)
5 y_pred_val=myPip.predict(X_val2)
6 y_pred_train=myPip.predict(X_train2)
~\anaconda3\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
344 if self._final_estimator != 'passthrough':
345 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
--> 346 self._final_estimator.fit(Xt, y, **fit_params_last_step)
347
348 return self
~\anaconda3\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
618 for k, arg in zip(sig.parameters, args):
619 kwargs[k] = arg
--> 620 return func(**kwargs)
621
622 return inner_f
~\anaconda3\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, base_margin_eval_set, feature_weights, callbacks)
1012 with config_context(verbosity=self.verbosity):
1013 evals_result: TrainingCallback.EvalsLog = {}
-> 1014 train_dmatrix, evals = _wrap_evaluation_matrices(
1015 missing=self.missing,
1016 X=X,
~\anaconda3\lib\site-packages\xgboost\sklearn.py in _wrap_evaluation_matrices(missing, X, y, group, qid, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, base_margin_eval_set, eval_group, eval_qid, create_dmatrix, enable_categorical, feature_types)
497 evals.append(train_dmatrix)
498 else:
--> 499 m = create_dmatrix(
500 data=valid_X,
501 label=valid_y,
~\anaconda3\lib\site-packages\xgboost\sklearn.py in _create_dmatrix(self, ref, **kwargs)
932 except TypeError: # `QuantileDMatrix` supports lesser types than DMatrix
933 pass
--> 934 return DMatrix(**kwargs, nthread=self.n_jobs)
935
936 def _set_evaluation_result(self, evals_result: TrainingCallback.EvalsLog) -> None:
~\anaconda3\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
618 for k, arg in zip(sig.parameters, args):
619 kwargs[k] = arg
--> 620 return func(**kwargs)
621
622 return inner_f
~\anaconda3\lib\site-packages\xgboost\core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, group, qid, label_lower_bound, label_upper_bound, feature_weights, enable_categorical)
741 return
742
--> 743 handle, feature_names, feature_types = dispatch_data_backend(
744 data,
745 missing=self.missing,
~\anaconda3\lib\site-packages\xgboost\data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
955 return _from_tuple(data, missing, threads, feature_names, feature_types)
956 if _is_pandas_df(data):
--> 957 return _from_pandas_df(data, enable_categorical, missing, threads,
958 feature_names, feature_types)
959 if _is_pandas_series(data):
~\anaconda3\lib\site-packages\xgboost\data.py in _from_pandas_df(data, enable_categorical, missing, nthread, feature_names, feature_types)
402 feature_types: Optional[FeatureTypes],
403 ) -> DispatchedDataBackendReturnType:
--> 404 data, feature_names, feature_types = _transform_pandas_df(
405 data, enable_categorical, feature_names, feature_types
406 )
~\anaconda3\lib\site-packages\xgboost\data.py in _transform_pandas_df(data, enable_categorical, feature_names, feature_types, meta, meta_type)
376 for dtype in data.dtypes
377 ):
--> 378 _invalid_dataframe_dtype(data)
379
380 feature_names, feature_types = _pandas_feature_info(
~\anaconda3\lib\site-packages\xgboost\data.py in _invalid_dataframe_dtype(data)
268 type_err = "DataFrame.dtypes for data must be int, float, bool or category."
269 msg = f"""{type_err} {_ENABLE_CAT_ERR} {err}"""
--> 270 raise ValueError(msg)
271
272
ValueError: DataFrame.dtypes for data must be int, float, bool or category. When categorical type is supplied, The experimental DMatrix parameter`enable_categorical` must be set to `True`. Invalid columns:MSZoning: object, Street: object, LotShape: object, LandContour: object, Utilities: object, LotConfig: object, LandSlope: object, Neighborhood: object, Condition1: object, Condition2: object, BldgType: object, HouseStyle: object, RoofStyle: object, RoofMatl: object, Exterior1st: object, Exterior2nd: object, MasVnrType: object, ExterQual: object, ExterCond: object, Foundation: object, BsmtQual: object, BsmtCond: object, BsmtExposure: object, BsmtFinType1: object, BsmtFinType2: object, Heating: object, HeatingQC: object, CentralAir: object, Electrical: object, KitchenQual: object, Functional: object, GarageType: object, GarageFinish: object, GarageQual: object, GarageCond: object, PavedDrive: object, SaleType: object, SaleCondition: object
PS: The preprocessing pipeline isn't the issue, since it worked fine with other models that do not take the eval_set parameter.
Thank you in advance for your kind help.
I have found "a" solution to this particular problem. The cause was passing unprocessed data as the eval_set parameter to a model that was being fitted on preprocessed data; evaluating on unprocessed data, which ultimately had a different column structure, produced the error shown above.
The idea is to run the pipeline step by step, like so:
XGB=XGBRegressor(n_estimators=10000,learning_rate=0.05,verbose=False)
#This is our original Pipeline
myPip=Pipeline(steps=[("preprocessing",preprocessor),
("model",XGB)])
#We fit the preprocessing step on the unprocessed training data
myPip[0].fit(X_train2,y_train)
#And transform both the training and validation data
X_trainXGB=myPip[0].transform(X_train2)
X_valXGB=myPip[0].transform(X_val2)
#We fit the model on the clean data
myPip[1].fit(X_trainXGB,y_train,eval_set=[(X_valXGB,y_val)],early_stopping_rounds=5)
#And predict the result using the preprocessed (transformed) validation data
y_preds=myPip[1].predict(X_valXGB)
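A variant that keeps a single fit call on the pipeline is to fit the preprocessing step on the training data, transform only the validation set with it, and pass that transformed validation data through model__eval_set. This is a minimal sketch, assuming the preprocessing step is deterministic, so refitting it inside the pipeline on the same X_train2 reproduces the columns used for the transformed validation set:
#Fit the preprocessing step alone and transform the validation data
myPip.named_steps["preprocessing"].fit(X_train2)
X_valXGB=myPip.named_steps["preprocessing"].transform(X_val2)
#One fit call on the pipeline; the training data is preprocessed internally
myPip.fit(X_train2,y_train,model__eval_set=[(X_valXGB,y_val)],model__early_stopping_rounds=5)
#The fitted pipeline can now predict directly on raw validation data
y_pred_val=myPip.predict(X_val2)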
I am trying to train a StackingClassifier in sklearn, but I keep running into an error where the fit method seems to be complaining that I passed it numpy arrays. To my knowledge, this is how all fit methods in sklearn are supposed to work. I read and followed the example from the documentation and expanded on it to include a more complex and comprehensive pipeline that processes categorical, ordinal, scalar, and text data.
Sorry in advance for the lengthy code sample, but I felt it was necessary to provide a complete reproducible example. Breaking the pipeline down into its constituent estimators and testing those individually did not raise any exceptions, so I figure the error somehow comes from the combined estimator.
Select Features
categorical_data = [
"race",
"gender",
"admission_type_id",
"discharge_disposition_id",
"admission_source_id",
"insulin",
"diabetesMed",
"change",
"payer_code",
"A1Cresult",
"metformin",
"repaglinide",
"nateglinide",
"chlorpropamide",
"glimepiride",
"glipizide",
"glyburide",
"tolbutamide",
"pioglitazone",
"rosiglitazone",
"acarbose",
"miglitol",
"tolazamide",
"glyburide.metformin",
"glipizide.metformin",
]
ordinal_data = [
"age"
]
scalar_data = [
"num_medications",
"time_in_hospital",
"num_lab_procedures",
"num_procedures",
"number_outpatient",
"number_emergency",
"number_inpatient",
"number_diagnoses",
]
text_data = [
"diag_1_desc",
"diag_2_desc",
"diag_3_desc"
]
Create Column Transformers
impute_trans = compose.make_column_transformer(
(
impute.SimpleImputer(
strategy="constant",
fill_value="missing"
),
categorical_data
)
)
encode_trans = compose.make_column_transformer(
(
preprocessing.OneHotEncoder(
sparse=False,
handle_unknown="ignore"
),
categorical_data
),
(
preprocessing.OrdinalEncoder(),
ordinal_data
)
)
scalar_trans = compose.make_column_transformer(
(preprocessing.StandardScaler(), scalar_data),
)
text_trans = compose.make_column_transformer(
(TfidfVectorizer(ngram_range=(1,2)), "diag_1_desc"),
(TfidfVectorizer(ngram_range=(1,2)), "diag_2_desc"),
(TfidfVectorizer(ngram_range=(1,2)), "diag_3_desc"),
)
Create Estimators
cat_pre_pipe = make_pipeline(impute_trans, encode_trans)
logreg = LogisticRegression(
solver = "saga",
penalty="elasticnet",
l1_ratio=0.5,
max_iter=1000
)
text_pipe = make_pipeline(text_trans, logreg)
scalar_pipe = make_pipeline(scalar_trans, logreg)
cat_pipe = make_pipeline(cat_pre_pipe, logreg)
estimators = [
("cat", cat_pipe),
("text", text_pipe),
("scalar", scalar_pipe)
]
Create Stacking Classifier
stack_clf = StackingClassifier(
estimators=estimators,
final_estimator=logreg
)
diabetes_data = pd.read_csv("8k_diabetes.csv", delimiter=',')
x_train, x_test, y_train, y_test = train_test_split(
pd.concat([
preprocess_dataframe(diabetes_data[text_data]),
diabetes_data[categorical_data + scalar_data]
], axis=1),
diabetes_data["readmitted"].astype(int)
)
stack_clf.fit(x_train, y_train)
Complete Stack Trace
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/utils/__init__.py:409, in _get_column_indices(X, key)
408 try:
--> 409 all_columns = X.columns
410 except AttributeError:
AttributeError: 'numpy.ndarray' object has no attribute 'columns'
During handling of the above exception, another exception occurred:
ValueError Traceback (most recent call last)
Input In [19], in <cell line: 1>()
----> 1 stack_clf.fit(x_train, y_train)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/ensemble/_stacking.py:488, in StackingClassifier.fit(self, X, y, sample_weight)
486 self._le = LabelEncoder().fit(y)
487 self.classes_ = self._le.classes_
--> 488 return super().fit(X, self._le.transform(y), sample_weight)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/ensemble/_stacking.py:158, in _BaseStacking.fit(self, X, y, sample_weight)
153 stack_method = [self.stack_method] * len(all_estimators)
155 # Fit the base estimators on the whole training data. Those
156 # base estimators will be used in transform, predict, and
157 # predict_proba. They are exposed publicly.
--> 158 self.estimators_ = Parallel(n_jobs=self.n_jobs)(
159 delayed(_fit_single_estimator)(clone(est), X, y, sample_weight)
160 for est in all_estimators
161 if est != "drop"
162 )
164 self.named_estimators_ = Bunch()
165 est_fitted_idx = 0
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/parallel.py:1043, in Parallel.__call__(self, iterable)
1034 try:
1035 # Only set self._iterating to True if at least a batch
1036 # was dispatched. In particular this covers the edge
(...)
1040 # was very quick and its callback already dispatched all the
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1046 while self.dispatch_one_batch(iterator):
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/parallel.py:861, in Parallel.dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/parallel.py:779, in Parallel._dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
782 # grow. To ensure correct results ordering, .insert is
783 # used (rather than .append) in the following line
784 self._jobs.insert(job_idx, job)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/_parallel_backends.py:208, in SequentialBackend.apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/_parallel_backends.py:572, in ImmediateResult.__init__(self, batch)
569 def __init__(self, batch):
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/parallel.py:262, in BatchedCalls.__call__(self)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/parallel.py:262, in <listcomp>(.0)
258 def __call__(self):
259 # Set the default nested backend to self._backend but do not set the
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/utils/fixes.py:216, in _FuncWrapper.__call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/ensemble/_base.py:42, in _fit_single_estimator(estimator, X, y, sample_weight, message_clsname, message)
40 else:
41 with _print_elapsed_time(message_clsname, message):
---> 42 estimator.fit(X, y)
43 return estimator
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/pipeline.py:390, in Pipeline.fit(self, X, y, **fit_params)
364 """Fit the model.
365
366 Fit all the transformers one after the other and transform the
(...)
387 Pipeline with fitted steps.
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/pipeline.py:348, in Pipeline._fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
351 y,
352 None,
353 message_clsname="Pipeline",
354 message=self._log_message(step_idx),
355 **fit_params_steps[name],
356 )
357 # Replace the transformer of the step with the fitted
358 # transformer. This is necessary when loading the transformer
359 # from the cache.
360 self.steps[step_idx] = (name, fitted_transformer)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/joblib/memory.py:349, in NotMemorizedFunc.__call__(self, *args, **kwargs)
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/pipeline.py:893, in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/pipeline.py:434, in Pipeline.fit_transform(self, X, y, **fit_params)
432 fit_params_last_step = fit_params_steps[self.steps[-1][0]]
433 if hasattr(last_step, "fit_transform"):
--> 434 return last_step.fit_transform(Xt, y, **fit_params_last_step)
435 else:
436 return last_step.fit(Xt, y, **fit_params_last_step).transform(Xt)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:672, in ColumnTransformer.fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
675 result = self._fit_transform(X, y, _fit_transform_one)
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/compose/_column_transformer.py:352, in ColumnTransformer._validate_column_callables(self, X)
350 columns = columns(X)
351 all_columns.append(columns)
--> 352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
354 self._columns = all_columns
355 self._transformer_to_input_indices = transformer_to_input_indices
File ~/anaconda3/envs/assignment2/lib/python3.8/site-packages/sklearn/utils/__init__.py:411, in _get_column_indices(X, key)
409 all_columns = X.columns
410 except AttributeError:
--> 411 raise ValueError(
412 "Specifying the columns using strings is only "
413 "supported for pandas DataFrames"
414 )
415 if isinstance(key, str):
416 columns = [key]
ValueError: Specifying the columns using strings is only supported for pandas DataFrames
Full Pipeline Diagram
Your categorical pipeline chains two ColumnTransformers together. After the first one, the output is a numpy array, but the second one can then no longer select columns by name as you've requested. Notice that the final error message is the informative one here: ValueError: Specifying the columns using strings is only supported for pandas DataFrames.
For this reason, I'd suggest using one ColumnTransformer containing separate pipelines instead of one pipeline containing multiple ColumnTransformers.
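A minimal sketch of that restructuring, reusing the column lists and the logreg estimator defined in the question (illustrative only, not run against the full dataset):
from sklearn import compose, impute, preprocessing
from sklearn.pipeline import make_pipeline

# One ColumnTransformer whose categorical branch is itself a Pipeline
# (impute, then encode), so every column selection is resolved against
# the original pandas DataFrame.
cat_trans = compose.make_column_transformer(
    (
        make_pipeline(
            impute.SimpleImputer(strategy="constant", fill_value="missing"),
            preprocessing.OneHotEncoder(sparse=False, handle_unknown="ignore"),
        ),
        categorical_data,
    ),
    (preprocessing.OrdinalEncoder(), ordinal_data),
)

cat_pipe = make_pipeline(cat_trans, logreg)  # replaces cat_pre_pipe + logreg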
So I am using XGBoost for an image classification problem, but I am getting a "bad allocation" error when running model.fit on the training data. I think the problem must be the data, but it is in a 2D array as XGBoost expects; also, when the data had been passed through a VGG16 feature extractor, it worked with no problem. Any possible solutions, please?
x = np.array(allImages)
y = np.array(allLabels)
x_train,x_test,y_train,y_test = train_test_split(x,y,test_size=0.2,random_state=123)
le = preprocessing.LabelEncoder()
le.fit(y_train)
y_train_encoded = le.transform(y_train)
del y_train
le.fit(y_test)
y_test_encoded = le.transform(y_test)
x_train = x_train.reshape(x_train.shape[0],-1)
model = xgb.XGBClassifier(use_label_encoder=False,objective='multi:softprob',
                          num_class= 10)
model.fit(x_train,y_train_encoded)
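One thing worth checking, under the assumption that "bad allocation" means XGBoost ran out of memory while copying the matrix into its internal float32 DMatrix: how large the flattened array actually is, and whether casting it to float32 up front lets the fit go through. A sketch using the variables from the snippet above:
import numpy as np

# Rough memory footprint of the flattened image matrix (assumption: memory is the culprit)
print(x_train.dtype, x_train.shape, round(x_train.nbytes / 1e9, 2), "GB")

# XGBoost stores features as float32 internally; if x_train is float64,
# casting it first roughly halves the peak memory during DMatrix creation
x_train = x_train.astype(np.float32)
model.fit(x_train, y_train_encoded)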
---------------------------------------------------------------------------
XGBoostError Traceback (most recent call last)
<ipython-input-12-d0749b6882f3> in <module>
5 model = xgb.XGBClassifier(use_label_encoder=False,objective='multi:softprob',
6 num_class= 10)
----> 7 model.fit(x_train,y_train_encoded)
8 #For the kFold
9 le.fit(y)
C:\Anaconda\lib\site-packages\xgboost\core.py in inner_f(*args, **kwargs)
420 for k, arg in zip(sig.parameters, args):
421 kwargs[k] = arg
--> 422 return f(**kwargs)
423
424 return inner_f
C:\Anaconda\lib\site-packages\xgboost\sklearn.py in fit(self, X, y, sample_weight, base_margin, eval_set, eval_metric, early_stopping_rounds, verbose, xgb_model, sample_weight_eval_set, feature_weights, callbacks)
901 self.n_features_in_ = self._features_count
902
--> 903 train_dmatrix, evals = self._wrap_evaluation_matrices(
904 X, y, group=None, sample_weight=sample_weight, base_margin=base_margin,
905 feature_weights=feature_weights,
C:\Anaconda\lib\site-packages\xgboost\sklearn.py in _wrap_evaluation_matrices(self, X, y, group, sample_weight, base_margin, feature_weights, eval_set, sample_weight_eval_set, eval_group, label_transform)
263
264 y = label_transform(y)
--> 265 train_dmatrix = DMatrix(data=X, label=y, weight=sample_weight,
266 base_margin=base_margin,
267 missing=self.missing, nthread=self.n_jobs)
C:\Anaconda\lib\site-packages\xgboost\core.py in __init__(self, data, label, weight, base_margin, missing, silent, feature_names, feature_types, nthread, enable_categorical)
498
499 from .data import dispatch_data_backend
--> 500 handle, feature_names, feature_types = dispatch_data_backend(
501 data, missing=self.missing,
502 threads=self.nthread,
C:\Anaconda\lib\site-packages\xgboost\data.py in dispatch_data_backend(data, missing, threads, feature_names, feature_types, enable_categorical)
528 return _from_scipy_csr(data.tocsr(), missing, feature_names, feature_types)
529 if _is_numpy_array(data):
--> 530 return _from_numpy_array(data, missing, threads, feature_names,
531 feature_types)
532 if _is_uri(data):
C:\Anaconda\lib\site-packages\xgboost\data.py in _from_numpy_array(data, missing, nthread, feature_names, feature_types)
145 flatten = _transform_np_array(data)
146 handle = ctypes.c_void_p()
--> 147 _check_call(_LIB.XGDMatrixCreateFromMat_omp(
148 flatten.ctypes.data_as(ctypes.POINTER(ctypes.c_float)),
149 c_bst_ulong(data.shape[0]),
C:\Anaconda\lib\site-packages\xgboost\core.py in _check_call(ret)
187 """
188 if ret != 0:
--> 189 raise XGBoostError(py_str(_LIB.XGBGetLastError()))
190
191
XGBoostError: bad allocation
I am using Python 3.7 in a Jupyter Notebook. I am creating classification models based on Jason Brownlee's ebook Machine Learning Mastery with Python; the code is essentially cut and pasted from the ebook into the notebook. The models work fine when I split the data, but when I use k-fold cross-validation it generates a FutureWarning. I'll paste the code and message below. I entered error_score = np.nan and it didn't fix the problem, but I don't know where that code should go. I would appreciate any advice, but keep in mind that I am a novice. Thanks.
# Logistic Regression Classification
import numpy as np
import pandas as pd
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.linear_model import LogisticRegression
df = pd.read_csv('Diabetes_Classification.csv')
array = df.values
X = array[:,0:8]
Y = array[:,8]
kfold = KFold(n_splits=10, random_state=7)
model = LogisticRegression(solver='liblinear')
error_score = np.nan
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())
/Users/roberthoyt/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py:530: FutureWarning: From version 0.22, errors during fit will result in a cross validation score of NaN by default. Use error_score='raise' if you want an exception raised or error_score=np.nan to adopt the behavior from version 0.22.
FutureWarning)
ValueError Traceback (most recent call last)
<ipython-input-105-010e5612fd63> in <module>
11 model = LogisticRegression(solver='liblinear')
12 error_score = np.nan
---> 13 results = cross_val_score(model, X, Y, cv=kfold)
14 print(results.mean())
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in
cross_val_score(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch,
error_score)
389 fit_params=fit_params,
390 pre_dispatch=pre_dispatch,
--> 391 error_score=error_score)
392 return cv_results['test_score']
393
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in
cross_validate(estimator, X, y, groups, scoring, cv, n_jobs, verbose, fit_params, pre_dispatch,
return_train_score, return_estimator, error_score)
230 return_times=True, return_estimator=return_estimator,
231 error_score=error_score)
--> 232 for train, test in cv.split(X, y, groups))
233
234 zipped_scores = list(zip(*scores))
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
919 # remaining jobs.
920 self._iterating = False
--> 921 if self.dispatch_one_batch(iterator):
922 self._iterating = self._original_iterator is not None
923
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in dispatch_one_batch(self,
iterator)
757 return False
758 else:
--> 759 self._dispatch(tasks)
760 return True
761
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in _dispatch(self, batch)
714 with self._lock:
715 job_idx = len(self._jobs)
--> 716 job = self._backend.apply_async(batch, callback=cb)
717 # A job can complete so quickly than its callback is
718 # called before we get here, causing self._jobs to
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in apply_async(self,
func,
callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/opt/anaconda3/lib/python3.7/site-packages/joblib/_parallel_backends.py in __init__(self,
batch)
547 # Don't delay the application, to avoid keeping the input
548 # arguments in memory
--> 549 self.results = batch()
550
551 def get(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in __call__(self)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/joblib/parallel.py in <listcomp>(.0)
223 with parallel_backend(self._backend, n_jobs=self._n_jobs):
224 return [func(*args, **kwargs)
--> 225 for func, args, kwargs in self.items]
226
227 def __len__(self):
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in _
fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params,
return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator,
error_score)
514 estimator.fit(X_train, **fit_params)
515 else:
--> 516 estimator.fit(X_train, y_train, **fit_params)
517
518 except Exception as e:
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/linear_model/logistic.py in fit(self, X, y,
sample_weight)
1531 X, y = check_X_y(X, y, accept_sparse='csr', dtype=_dtype, order="C",
1532 accept_large_sparse=solver != 'liblinear')
-> 1533 check_classification_targets(y)
1534 self.classes_ = np.unique(y)
1535 n_samples, n_features = X.shape
~/opt/anaconda3/lib/python3.7/site-packages/sklearn/utils/multiclass.py in
check_classification_targets(y)
167 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
168 'multilabel-indicator', 'multilabel-sequences']:
--> 169 raise ValueError("Unknown label type: %r" % y_type)
170
171
ValueError: Unknown label type: 'continuous'
The problem is that your targets are continuous while you're doing a classification task. Make sure the column you're using as the target is categorical; you may have to convert it to an integer. All of this is reported in the traceback:
check_classification_targets(y)
167 if y_type not in ['binary', 'multiclass', 'multiclass-multioutput',
168 'multilabel-indicator', 'multilabel-sequences']:
--> 169 raise ValueError("Unknown label type: %r" % y_type)
Your target is not among the accepted label types; it is continuous:
ValueError: Unknown label type: 'continuous'
Check whether your target is an integer with df.dtypes and cast it to an integer if it isn't:
Y = array[:,8].astype(int)
That assumes you aren't mistakenly running a classification task on genuinely continuous values. You can also check whether all values are 0s and 1s:
np.unique(array[:, 8])
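Putting the two checks together (a sketch reusing array, model, and kfold from the question):
import numpy as np

print(np.unique(array[:, 8]))      # should list only class labels, e.g. 0. and 1.
Y = array[:, 8].astype(int)        # cast only if those values really are class labels
results = cross_val_score(model, X, Y, cv=kfold)
print(results.mean())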
I am using GridSearchCV in order to find the best parameters for my pipeline.
My pipeline seems to work well as I can apply:
pipeline.fit(X_train, y_train)
preds = pipeline.predict(X_test)
And I get a decent result.
But GridSearchCV obviously doesn't like something, and I cannot figure it out.
My pipeline:
feats = FeatureUnion([('age', age),
('education_num', education_num),
('is_education_favo', is_education_favo),
('is_marital_status_favo', is_marital_status_favo),
('hours_per_week', hours_per_week),
('capital_diff', capital_diff),
('sex', sex),
('race', race),
('native_country', native_country)
])
pipeline = Pipeline([
('adhocFC',AdHocFeaturesCreation()),
('imputers', KnnImputer(target = 'native-country', n_neighbors = 5)),
('features',feats),('clf',LogisticRegression())])
My GridSearch:
hyperparameters = {'imputers__n_neighbors' : [5,21,41], 'clf__C' : [1.0, 2.0]}
GSCV = GridSearchCV(pipeline, hyperparameters, cv=3, scoring = 'roc_auc' , refit = False) #change n_jobs = 2, refit = False
GSCV.fit(X_train, y_train)
I receive 11 similar warnings:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:11:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
and this is the error message:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:11:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:12:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/ipykernel/main.py:14:
SettingWithCopyWarning: A value is trying to be set on a copy of a
slice from a DataFrame. Try using .loc[row_indexer,col_indexer] =
value instead
See the caveats in the documentation:
http://pandas.pydata.org/pandas-docs/stable/indexing.html#indexing-view-versus-copy
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
in ()
3 GSCV = GridSearchCV(pipeline, hyperparameters, cv=3, scoring = 'roc_auc' ,refit = False) #change n_jobs = 2, refit = False
4
----> 5 GSCV.fit(X_train, y_train)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_search.py
in fit(self, X, y, groups)
943 train/test set.
944 """
--> 945 return self._fit(X, y, groups, ParameterGrid(self.param_grid))
946
947
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_search.py
in _fit(self, X, y, groups, parameter_iterable)
562 return_times=True, return_parameters=True,
563 error_score=self.error_score)
--> 564 for parameters in parameter_iterable
565 for train, test in cv_iter)
566
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py
in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py
in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py
in <listcomp>(.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/model_selection/_validation.py
in _fit_and_score(estimator, X, y, scorer, train, test, verbose,
parameters, fit_params, return_train_score, return_parameters,
return_n_test_samples, return_times, error_score)
236 estimator.fit(X_train, **fit_params)
237 else:
--> 238 estimator.fit(X_train, y_train, **fit_params)
239
240 except Exception as e:
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/pipeline.py
in fit(self, X, y, **fit_params)
266 This estimator
267 """
--> 268 Xt, fit_params = self._fit(X, y, **fit_params)
269 if self._final_estimator is not None:
270 self._final_estimator.fit(Xt, y, **fit_params)
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/pipeline.py
in _fit(self, X, y, **fit_params)
232 pass
233 elif hasattr(transform, "fit_transform"):
--> 234 Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
235 else:
236 Xt = transform.fit(Xt, y, **fit_params_steps[name]) \
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/sklearn/base.py
in fit_transform(self, X, y, **fit_params)
495 else:
496 # fit method of arity 2 (supervised transformation)
--> 497 return self.fit(X, y, **fit_params).transform(X)
498
499
in fit(self, X, y)
16 self.ohe.fit(X_full)
17 #Create a Dataframe that does not contain any nulls, categ variables are OHE, with all each rows
---> 18 X_ohe_full = self.ohe.transform(X_full[~X[self.col].isnull()].drop(self.col,
axis=1))
19
20 #Fit the classifier on lines where col is null
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/frame.py
in __getitem__(self, key)
2057 return self._getitem_multilevel(key)
2058 else:
-> 2059 return self._getitem_column(key)
2060
2061 def _getitem_column(self, key):
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/frame.py
in _getitem_column(self, key)
2064 # get column
2065 if self.columns.is_unique:
-> 2066 return self._get_item_cache(key)
2067
2068 # duplicate columns & possible reduce dimensionality
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/generic.py
in _get_item_cache(self, item)
1384 res = cache.get(item)
1385 if res is None:
-> 1386 values = self._data.get(item)
1387 res = self._box_item_values(item, values)
1388 cache[item] = res
/home/jo/anaconda2/envs/py35/lib/python3.5/site-packages/pandas/core/internals.py
in get(self, item, fastpath)
3550 loc = indexer.item()
3551 else:
-> 3552 raise ValueError("cannot label index with a null key")
3553
3554 return self.iget(loc, fastpath=fastpath)
ValueError: cannot label index with a null key
Without additional information, I believe it is because your X_train and y_train variables are pandas DataFrames; the core scikit-learn library isn't fully compatible with these: e.g., the .fit method of a classifier expects an array-like object.
By feeding in pandas DataFrames, you are inadvertently indexing them like numpy arrays, which is not that stable in pandas.
Try converting your training data to numpy arrays:
X_train_arr = X_train.to_numpy()
y_train_arr = y_train.to_numpy()
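Then fit the grid search on the converted arrays. Note that .to_numpy() only exists in pandas 0.24+; on older versions, .values is the equivalent:
# Equivalent on older pandas: X_train_arr = X_train.values
GSCV.fit(X_train_arr, y_train_arr)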
I am trying to find the best hyperparameters for my SVM using Grid Search. When doing it the following way:
from sklearn.model_selection import GridSearchCV
param_grid = {'coef0': [10, 5, 0.5, 0.001], 'C': [100, 50, 1, 0.001]}
poly_svm_search = SVC(kernel="poly", degree="2")
grid_search = GridSearchCV(poly_svm_search, param_grid, cv=5, scoring='f1')
grid_search.fit(train_data, train_labels)
I get this error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-72-dadf5782618c> in <module>
8
----> 9 grid_search.fit(train_data, train_labels)
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1189 def _run_search(self, evaluate_candidates):
1190 """Search all candidates in param_grid"""
-> 1191 evaluate_candidates(ParameterGrid(self.param_grid))
1192
1193
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
981 # remaining jobs.
982 self._iterating = False
--> 983 if self.dispatch_one_batch(iterator):
984 self._iterating = self._original_iterator is not None
985
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
823 return False
824 else:
--> 825 self._dispatch(tasks)
826 return True
827
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
780 with self._lock:
781 job_idx = len(self._jobs)
--> 782 job = self._backend.apply_async(batch, callback=cb)
783 # A job can complete so quickly than its callback is
784 # called before we get here, causing self._jobs to
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
180 def apply_async(self, func, callback=None):
181 """Schedule a func to be run"""
--> 182 result = ImmediateResult(func)
183 if callback:
184 callback(result)
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
543 # Don't delay the application, to avoid keeping the input
544 # arguments in memory
--> 545 self.results = batch()
546
547 def get(self):
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
~/.local/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in <listcomp>(.0)
259 with parallel_backend(self._backend):
260 return [func(*args, **kwargs)
--> 261 for func, args, kwargs in self.items]
262
263 def __len__(self):
~/.local/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
526 estimator.fit(X_train, **fit_params)
527 else:
--> 528 estimator.fit(X_train, y_train, **fit_params)
529
530 except Exception as e:
~/.local/lib/python3.6/site-packages/sklearn/svm/base.py in fit(self, X, y, sample_weight)
210
211 seed = rnd.randint(np.iinfo('i').max)
--> 212 fit(X, y, sample_weight, solver_type, kernel, random_seed=seed)
213 # see comment on the other call to np.iinfo in this file
214
~/.local/lib/python3.6/site-packages/sklearn/svm/base.py in _sparse_fit(self, X, y, sample_weight, solver_type, kernel, random_seed)
291 sample_weight, self.nu, self.cache_size, self.epsilon,
292 int(self.shrinking), int(self.probability), self.max_iter,
--> 293 random_seed)
294
295 self._warn_from_fit_status()
sklearn/svm/libsvm_sparse.pyx in sklearn.svm.libsvm_sparse.libsvm_sparse_train()
TypeError: an integer is required
My train_labels variable contains a list of booleans, so I have a binary classification problem. train_data is a <class 'scipy.sparse.csr.csr_matrix'>, basically containing all scaled and one-hot encoded features.
What did I do wrong? It's hard for me to track down what the issue is here. Thanks in advance for any help ;).
When you initialize the SVC using this line:
poly_svm_search = SVC(kernel="poly", degree="2")
You are supplying the degree parameter as a string because of the quotation marks around it. But according to the documentation, degree takes an integer value.
degree : int, optional (default=3) Degree of the polynomial kernel
function (‘poly’). Ignored by all other kernels.
So you need to do this:
poly_svm_search = SVC(kernel="poly", degree=2)
Notice that there are no quotation marks here.
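With that change, the rest of the snippet from the question stays the same:
poly_svm_search = SVC(kernel="poly", degree=2)
grid_search = GridSearchCV(poly_svm_search, param_grid, cv=5, scoring='f1')
grid_search.fit(train_data, train_labels)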