Related
I am using TimeseriesGenerator for my problem.
The shapes for my train and test data are:
x_train - (306720, 20)
x_test - (306720,)
y_train - (4321, 20)
y_test - (4321,)
Their dtype is float64, so I don't need to use to_numpy() anymore.
I then use TimeseriesGenerator:
train_data = TimeseriesGenerator(x_train, x_test, length=144, batch_size=100)
test_data = TimeseriesGenerator(y_train, y_test, length=144, batch_size=100)
When I try to run
GRU = keras.models.Sequential([keras.layers.GRU(100), keras.layers.Dense(32, activation= 'relu')])
GRU.compile(loss="mae", optimizer="adam")
resultsGRU = GRU.fit(train_data, test_data, epochs = 5)
I get the following error:
File ~\Anaconda3\lib\site-packages\keras\utils\traceback_utils.py:70, in filter_traceback.<locals>.error_handler(*args, **kwargs)
67 filtered_tb = _process_traceback_frames(e.__traceback__)
68 # To get the full stack trace, call:
69 # `tf.debugging.disable_traceback_filtering()`
---> 70 raise e.with_traceback(filtered_tb) from None
71 finally:
72 del filtered_tb
File ~\Anaconda3\lib\site-packages\keras\engine\data_adapter.py:997, in KerasSequenceAdapter.__init__(self, x, y, sample_weights, shuffle, workers, use_multiprocessing, max_queue_size, model, **kwargs)
984 def __init__(
985 self,
986 x,
(...)
994 **kwargs
995 ):
996 if not is_none_or_empty(y):
--> 997 raise ValueError(
998 "`y` argument is not supported when using "
999 "`keras.utils.Sequence` as input."
1000 )
1001 if not is_none_or_empty(sample_weights):
1002 raise ValueError(
1003 "`sample_weight` argument is not supported when using "
1004 "`keras.utils.Sequence` as input."
1005 )
ValueError: `y` argument is not supported when using `keras.utils.Sequence` as input.
I tried
x, y = train_data[0]
print(x.shape, y.shape)
to convert it to float before I use GRU.fit(), but I get this error
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3621, in Index.get_loc(self, key, method, tolerance)
3620 try:
-> 3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:136, in pandas._libs.index.IndexEngine.get_loc()
File ~\Anaconda3\lib\site-packages\pandas\_libs\index.pyx:163, in pandas._libs.index.IndexEngine.get_loc()
File pandas\_libs\hashtable_class_helper.pxi:2131, in pandas._libs.hashtable.Int64HashTable.get_item()
File pandas\_libs\hashtable_class_helper.pxi:2140, in pandas._libs.hashtable.Int64HashTable.get_item()
KeyError: 4331
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
Input In [205], in <cell line: 1>()
----> 1 x, y = train_data[0]
2 print(x.shape, y.shape)
File ~\Anaconda3\lib\site-packages\keras\preprocessing\sequence.py:189, in TimeseriesGenerator.__getitem__(self, index)
177 rows = np.arange(
178 i,
179 min(i + self.batch_size * self.stride, self.end_index + 1),
180 self.stride,
181 )
183 samples = np.array(
184 [
185 self.data[row - self.length : row : self.sampling_rate]
186 for row in rows
187 ]
188 )
--> 189 targets = np.array([self.targets[row] for row in rows])
191 if self.reverse:
192 return samples[:, ::-1, ...], targets
File ~\Anaconda3\lib\site-packages\keras\preprocessing\sequence.py:189, in <listcomp>(.0)
177 rows = np.arange(
178 i,
179 min(i + self.batch_size * self.stride, self.end_index + 1),
180 self.stride,
181 )
183 samples = np.array(
184 [
185 self.data[row - self.length : row : self.sampling_rate]
186 for row in rows
187 ]
188 )
--> 189 targets = np.array([self.targets[row] for row in rows])
191 if self.reverse:
192 return samples[:, ::-1, ...], targets
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:958, in Series.__getitem__(self, key)
955 return self._values[key]
957 elif key_is_scalar:
--> 958 return self._get_value(key)
960 if is_hashable(key):
961 # Otherwise index.get_value will raise InvalidIndexError
962 try:
963 # For labels that don't resolve as scalars like tuples and frozensets
File ~\Anaconda3\lib\site-packages\pandas\core\series.py:1069, in Series._get_value(self, label, takeable)
1066 return self._values[label]
1068 # Similar to Index.get_value, but we do not fall back to positional
-> 1069 loc = self.index.get_loc(label)
1070 return self.index._get_values_for_loc(self, loc, label)
File ~\Anaconda3\lib\site-packages\pandas\core\indexes\base.py:3623, in Index.get_loc(self, key, method, tolerance)
3621 return self._engine.get_loc(casted_key)
3622 except KeyError as err:
-> 3623 raise KeyError(key) from err
3624 except TypeError:
3625 # If we have a listlike key, _check_indexing_error will raise
3626 # InvalidIndexError. Otherwise we fall through and re-raise
3627 # the TypeError.
3628 self._check_indexing_error(key)
KeyError: 4331
Can anyone please explain what is wrong?
My whole code worked fine before. I re-ran it to check that everything really works, and now I suddenly have this problem and I don't know how to fix it.
As part of my thesis work, I am working on an anomaly detection workflow. It uses SGDOneClassSVM from Scikit-Learn because of the large amount of data (~265k records and 45 features). I have successfully implemented the model and fine-tuned the hyperparameters.
Using a number of libraries, the data is first scaled with StandardScaler() and then used to find principal components with IncrementalPCA(). Then an SGDOneClassSVM() model was built, and a good result was obtained:
model = SGDOneClassSVM(nu=0.1810486, shuffle=True, fit_intercept=True, random_state=42, tol=1e-3)
model.fit(X_train_pca, y_train)
y_pred = model.predict(X_valid_pca)
cm = confusion_matrix(y_valid, y_pred, labels=[-1, 1])
print(cm)
print(f1_score(y_valid, y_pred))
[[ 497 13]
[ 15 35035]]
0.9996005592170961
The validation curve of the above model
Having plotted the calibration curve, I realized there is room for further improvement:
Calibration curve of the above model
I tried to calibrate the model in different ways, but none worked.
1,
final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv='prefit')
prob_iso = final_model_isotonic.decision_function(X_calib)
X_plot, y_iso = calibration_curve(y_calib, prob_iso, n_bins = 10, normalize = True)
plt.figure(figsize=(12, 9))
plt.plot([0, 1], [0, 1], color='blue', linestyle = '--', label = 'Reference')
plt.plot(y_iso, X_plot, color='green', marker = '.', label = 'Final Model - Isotonic Calibration')
leg = plt.legend(loc = 'upper left')
plt.xlabel('Average Predicted Probability in each bin')
plt.ylabel('Ratio of positives')
plt.show()
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29952/1350381664.py in <module>
1 final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv='prefit')
----> 2 prob_iso = final_model_isotonic.decision_function(X_calib)
3
4 X_plot, y_iso = calibration_curve(y_calib, prob_iso, n_bins = 10, normalize = True)
5
AttributeError: 'CalibratedClassifierCV' object has no attribute 'decision_function'
2, Here, as far as I can see, the error says that fit has not happened. Based on the first code block, it actually has happened.
final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv='prefit')
final_model_isotonic.fit(X_valid_pca, y_valid)
prob = final_model_isotonic.decision_function(X_calib_pca)
# Creating Calibration Curve
x, y = calibration_curve(y_calib, prob, n_bins = 10, normalize = True)
# Plot calibration curve
plt.figure(figsize=(13, 7))
# Plot perfectly calibrated
plt.plot([0, 1], [0, 1], color='blue', linestyle = '--', label = 'Reference')
# Plot model's calibration curve
plt.plot(y, x, color='green', marker = '.', label = 'Final Model')
leg = plt.legend(loc = 'upper left')
plt.xlabel('Average Predicted Probability in each bin')
plt.ylabel('Ratio of positives')
plt.show()
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29952/4100438610.py in <module>
1 final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv='prefit')
----> 2 final_model_isotonic.fit(X_valid_pca, y_valid)
3 prob = final_model_isotonic.decision_function(X_calib_pca)
4
5 # Creating Calibration Curve
~\Anaconda3\lib\site-packages\sklearn\calibration.py in fit(self, X, y, sample_weight)
281 if self.cv == "prefit":
282 # `classes_` should be consistent with that of base_estimator
--> 283 check_is_fitted(self.base_estimator, attributes=["classes_"])
284 self.classes_ = self.base_estimator.classes_
285
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
1220
1221 if not fitted:
-> 1222 raise NotFittedError(msg % {"name": type(estimator).__name__})
1223
1224
NotFittedError: This SGDOneClassSVM instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
3,
final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=3)
final_model_isotonic.fit(X_valid_pca)
y_hat_calib = final_model_isotonic.predict(X_calib_pca)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29952/3807651700.py in <module>
1 final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=3)
----> 2 final_model_isotonic.fit(X_valid_pca)
3 y_hat_calib = final_model_isotonic.predict(X_calib_pca)
4
5
TypeError: fit() missing 1 required positional argument: 'y'
4,
final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=3)
final_model_isotonic.fit(X_valid_pca, y_valid)
y_hat_calib = final_model_isotonic.predict(X_calib_pca)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29952/2453888579.py in <module>
1 final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=3)
----> 2 final_model_isotonic.fit(X_valid_pca, y_valid)
3 y_hat_calib = final_model_isotonic.predict(X_calib_pca)
4
5
~\Anaconda3\lib\site-packages\sklearn\calibration.py in fit(self, X, y, sample_weight)
339 parallel = Parallel(n_jobs=self.n_jobs)
340
--> 341 self.calibrated_classifiers_ = parallel(
342 delayed(_fit_classifier_calibrator_pair)(
343 clone(base_estimator),
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1045
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
863
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
217
218
~\Anaconda3\lib\site-packages\sklearn\calibration.py in _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, method, classes, sample_weight)
515 predictions = _compute_predictions(pred_method, method_name, X_test, n_classes)
516
--> 517 calibrated_classifier = _fit_calibrator(
518 estimator, predictions, y_test, classes, method, sample_weight=sw_test
519 )
~\Anaconda3\lib\site-packages\sklearn\calibration.py in _fit_calibrator(clf, predictions, y, classes, method, sample_weight)
623 Y = label_binarize(y, classes=classes)
624 label_encoder = LabelEncoder().fit(classes)
--> 625 pos_class_indices = label_encoder.transform(clf.classes_)
626 calibrators = []
627 for class_idx, this_pred in zip(pos_class_indices, predictions.T):
AttributeError: 'SGDOneClassSVM' object has no attribute 'classes_'
5,
X = np.concatenate([X_train_pca, X_valid_pca])
y = np.concatenate([y_train, y_valid])
valid_fold = [-1 for _ in range(X_train.shape[0])] + [0 for _ in range(X_valid.shape[0])]
ps = PredefinedSplit(valid_fold)
final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=ps)
final_model_isotonic.fit(X, y)
y_hat_calib = final_model_isotonic.predict(X_calib_pca)
---------------------------------------------------------------------------
AttributeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_29952/2567788326.py in <module>
5
6 final_model_isotonic = CalibratedClassifierCV(model, method="isotonic", cv=ps)
----> 7 final_model_isotonic.fit(X, y)
8 y_hat_calib = final_model_isotonic.predict(X_calib_pca)
9
~\Anaconda3\lib\site-packages\sklearn\calibration.py in fit(self, X, y, sample_weight)
339 parallel = Parallel(n_jobs=self.n_jobs)
340
--> 341 self.calibrated_classifiers_ = parallel(
342 delayed(_fit_classifier_calibrator_pair)(
343 clone(base_estimator),
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1041 # remaining jobs.
1042 self._iterating = False
-> 1043 if self.dispatch_one_batch(iterator):
1044 self._iterating = self._original_iterator is not None
1045
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
859 return False
860 else:
--> 861 self._dispatch(tasks)
862 return True
863
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
777 with self._lock:
778 job_idx = len(self._jobs)
--> 779 job = self._backend.apply_async(batch, callback=cb)
780 # A job can complete so quickly than its callback is
781 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~\Anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
214 def __call__(self, *args, **kwargs):
215 with config_context(**self.config):
--> 216 return self.function(*args, **kwargs)
217
218
~\Anaconda3\lib\site-packages\sklearn\calibration.py in _fit_classifier_calibrator_pair(estimator, X, y, train, test, supports_sw, method, classes, sample_weight)
515 predictions = _compute_predictions(pred_method, method_name, X_test, n_classes)
516
--> 517 calibrated_classifier = _fit_calibrator(
518 estimator, predictions, y_test, classes, method, sample_weight=sw_test
519 )
~\Anaconda3\lib\site-packages\sklearn\calibration.py in _fit_calibrator(clf, predictions, y, classes, method, sample_weight)
623 Y = label_binarize(y, classes=classes)
624 label_encoder = LabelEncoder().fit(classes)
--> 625 pos_class_indices = label_encoder.transform(clf.classes_)
626 calibrators = []
627 for class_idx, this_pred in zip(pos_class_indices, predictions.T):
AttributeError: 'SGDOneClassSVM' object has no attribute 'classes_'
I would be grateful if you could advise me. I had planned to share the whole workflow and the data, too, but the data is too large to make available on, e.g., GitHub. I am sorry.
I am currently working on the "French Motor Claims Datasets freMTPL2freq" Kaggle competition (https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq). Unfortunately I get a "NotFittedError: All estimators failed to fit" error whenever I am using RandomizedSearchCV and I cannot figure out why that is.
Any help is much appreciated.
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
data_freq = pd.read_csv('freMTPL2freq.csv')
data_freq['Area'] = data_freq['Area'].str.replace('\'','')
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace('\'','')
data_freq['VehGas'] = data_freq['VehGas'].str.replace('\'','')
data_freq['Region'] = data_freq['Region'].str.replace('\'','')
data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']
y = data_freq['frequency']
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis = 1)
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle = True, random_state = 42)
pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']
from xgboost import XGBRegressor
ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
('ohe', OneHotEncoder(), cat_columns)])
pipe_xgbr = Pipeline([('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())
])
param = {'xgb_regressor__n_estimators':[3, 5],
'xgb_regressor__max_depth':[3, 5, 7],
'xgb_regressor__learning_rate':[0.1, 0.5],
'xgb_regressor__colsample_bytree':[0.5, 0.8],
'xgb_regressor__subsample':[0.5, 0.8]
}
rscv = RandomizedSearchCV(pipe_xgbr, param_distributions = param, n_iter = 2, scoring = mean_squared_error, n_jobs = -1, cv = 5, error_score = 'raise')
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
The first five rows of the original dataframe data_freq look like this:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1.0 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3.0 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5.0 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10.0 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11.0 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
The error I get is as follows:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 340, in fit
fit_params_steps = self._check_fit_params(**fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 261, in _check_fit_params
fit_params_steps[step][param] = pval
KeyError: 'xgbr_regressor'
"""
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-68-0c1886d1e985> in <module>
----> 1 rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
KeyError: 'xgbr_regressor'
I also tried running fit without the sample_weight parameter. In this case the error changes to:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 336, in mean_squared_error
y_true, y_pred, multioutput)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 88, in _check_reg_targets
check_consistent_length(y_true, y_pred)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 249, in _num_samples
raise TypeError(message)
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-69-a9be9cc5df4a> in <module>
----> 1 rscv.fit(X_train, y_train)#, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
When setting verbose = 10 and n_jobs = 1 the following error message shows up:
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: FutureWarning: Pass sample_weight=406477 1.0
393150 0.0
252885 0.0
260652 0.0
661256 0.0
...
154663 0.0
398414 0.0
42890 0.0
640774 0.0
114446 0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
"will result in an error", FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
72 "will result in an error", FutureWarning)
73 kwargs.update(zip(sig.parameters, args))
---> 74 return f(**kwargs)
75 return inner_f
76
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
334 """
335 y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336 y_true, y_pred, multioutput)
337 check_consistent_length(y_true, y_pred, sample_weight)
338 output_errors = np.average((y_true - y_pred) ** 2, axis=0,
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86 the dtype argument passed to check_array.
87 """
---> 88 check_consistent_length(y_true, y_pred)
89 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in <listcomp>(.0)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _num_samples(x)
247 if hasattr(x, 'fit') and callable(x.fit):
248 # Don't get num_samples from an ensembles length!
--> 249 raise TypeError(message)
250
251 if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
Wow, that was a mess of a traceback, but I think I've finally found it. You set scoring=mean_squared_error, and should instead use scoring="neg_mean_squared_error".
The metric function mean_squared_error has signature (y_true, y_pred, *, <kwargs>), whereas the scorer obtained by using the string "neg_mean_squared_error" has signature (estimator, X_test, y_test). So in the traceback, where you see
--> 687 scores = scorer(estimator, X_test, y_test)
it is calling mean_squared_error with y_true=estimator, y_pred=X_test, and sample_weight=y_test (the first kwarg, and hence the FutureWarning about specifying keyword arguments as positional). Going deeper into the traceback, we see a check that the lengths of y_true and y_pred are consistent, but it thinks the former is your pipeline object (and hence the final error message)!
According to your error message, KeyError: 'xgbr_regressor', the code can't find the key xgbr_regressor in your Pipeline. In your pipeline, you have named the step xgb_regressor:
pipe_xgbr = Pipeline(
[('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())])
But when you try to fit, you call it with a reference to xgbr_regressor which is why the KeyError is thrown:
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])
Therefore, you must change the above line, replacing xgbr_regressor__sample_weight with xgb_regressor__sample_weight, and this should eliminate that error.
I am trying to use GridSearchCV with CatBoostClassifier for a multiclass problem (3 classes), and am getting an error. The code seems to work OK in this Kaggle notebook. The estimator also works successfully without GridSearchCV.
Here is the code and error:
model = CatBoostClassifier()
params = {'iterations': [500],
'depth': [4, 5, 6],
'loss_function': ['Logloss', 'CrossEntropy'],
'l2_leaf_reg': np.logspace(-20, -19, 3),
'leaf_estimation_iterations': [10],
'eval_metric': ['Accuracy'],
'use_best_model': ['True'],
'logging_level':['Silent'],
'random_seed': [42]
}
scorer = make_scorer(accuracy_score)
clf_grid = GridSearchCV(estimator=model, param_grid=params, scoring=scorer, cv=10)
clf_grid.fit(X_train, y_train)
Error:
NotFittedError Traceback (most recent call last)
<ipython-input-49-d6ecb7a4f83f> in <module>
----> 1 clf_grid.fit(X_train, y_train,eval_set=(X_train,y_train))
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1286 def _run_search(self, evaluate_candidates):
1287 """Search all candidates in param_grid"""
-> 1288 evaluate_candidates(ParameterGrid(self.param_grid))
1289
1290
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
825 # of out will be done in `_insert_error_scores`.
826 if callable(self.scoring):
--> 827 _insert_error_scores(out, self.error_score)
828 all_candidate_params.extend(candidate_params)
829 all_out.extend(out)
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _insert_error_scores(results, error_score)
295
296 if successful_score is None:
--> 297 raise NotFittedError("All estimators failed to fit")
298
299 if isinstance(successful_score, dict):
NotFittedError: All estimators failed to fit
Could you help me with the problem below? Many thanks in advance.
Without fit_params=fit_params, the code below works fine, but I want to try early stopping with lgbm.
I did search for clues but found only limited resources: a few GitHub issues on lightgbm and BayesSearchCV.
lg = lgb.LGBMClassifier(random_state=42, n_jobs=-1, objective='multiclass', n_estimators=5000)
fullPipeline = Pipeline(steps=[
('scaler', StandardScaler()),
('model', lg)
])
param_space = {'model__max_depth': [2, 63],
'model__num_leaves': [7, 4095],
}
fit_params = {
'early_stopping_rounds':30,
'eval_metric':'accuracy',
'eval_set':[(xValid, yValid)],
}
BSLGB = BayesSearchCV(fullPipeline, param_space, random_state=42, scoring='accuracy', cv=5, n_iter=50, verbose=3, n_jobs=-1,
fit_params=fit_params)
%time BSLGB.fit(xTrain.astype(float), yTrain)
Exception:
ValueError Traceback (most recent call last)
<timed eval> in <module>
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in fit(self, X, y, groups, callback)
652 optim_result = self._step(
653 X, y, search_space, optimizer,
--> 654 groups=groups, n_points=n_points_adjusted
655 )
656 n_iter -= n_points
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in _step(self, X, y, search_space, optimizer, groups, n_points)
548 refit = self.refit
549 self.refit = False
--> 550 self._fit(X, y, groups, params_dict)
551 self.refit = refit
552
C:\Anaconda3x64\envs\ml\lib\site-packages\skopt\searchcv.py in _fit(self, X, y, groups, parameter_iterable)
401 error_score=self.error_score
402 )
--> 403 for parameters in parameter_iterable
404 for train, test in cv_iter)
405
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
C:\Anaconda3x64\envs\ml\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
C:\Anaconda3x64\envs\ml\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
C:\Anaconda3x64\envs\ml\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: not enough values to unpack (expected 2, got 1)
The root cause of this issue is that I passed a pipeline, not a bare model, into BayesSearchCV, while the keys in my fit_params did not carry the pipeline step prefix. To fix, prefix each key with the step name (model__):
fit_params = {
'model__early_stopping_rounds':30,
'model__eval_metric':'multi_logloss',
'model__eval_set':[(xValid, yValid)],
}