Related
I am currently working on the "French Motor Claims Datasets freMTPL2freq" Kaggle competition (https://www.kaggle.com/floser/french-motor-claims-datasets-fremtpl2freq). Unfortunately I get a "NotFittedError: All estimators failed to fit" error whenever I am using RandomizedSearchCV and I cannot figure out why that is.
Any help is much appreciated.
import numpy as np
import statsmodels.api as sm
import scipy.stats as stats
from matplotlib import pyplot as plt
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import KBinsDiscretizer
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.metrics import mean_poisson_deviance
from sklearn.metrics import mean_squared_error
from sklearn.ensemble import VotingRegressor
from sklearn.ensemble import StackingRegressor
from sklearn.metrics import mean_gamma_deviance
from sklearn.metrics import mean_squared_error
from xgboost import XGBRegressor
# Load the freMTPL2freq claims table.
# NOTE(review): `pd` is used here, but no `import pandas as pd` appears among the imports shown with this snippet.
data_freq = pd.read_csv('freMTPL2freq.csv')
# Remove single-quote characters from the string-valued columns.
data_freq['Area'] = data_freq['Area'].str.replace('\'','')
data_freq['VehBrand'] = data_freq['VehBrand'].str.replace('\'','')
data_freq['VehGas'] = data_freq['VehGas'].str.replace('\'','')
data_freq['Region'] = data_freq['Region'].str.replace('\'','')
# Target: number of claims divided by exposure.
data_freq['frequency'] = data_freq['ClaimNb'] / data_freq['Exposure']
y = data_freq['frequency']
# Features: every column except the target, the raw claim count and IDpol ('Exposure' stays in X).
X = data_freq.drop(['frequency', 'ClaimNb', 'IDpol'], axis = 1)
# Hold out 20% of the rows for validation (shuffled, fixed seed).
X_train, X_val, y_train, y_val = train_test_split(X,y, test_size=0.2, shuffle = True, random_state = 42)
# Columns forwarded unchanged vs. columns that get one-hot encoded.
# 'Exposure' is in neither list; it is only used as the sample weight in the fit call below.
pt_columns = ['VehPower', 'VehAge', 'DrivAge', 'BonusMalus', 'Density']
cat_columns = ['Area', 'Region', 'VehBrand', 'VehGas']
from xgboost import XGBRegressor
ct = ColumnTransformer([('pt', 'passthrough', pt_columns),
('ohe', OneHotEncoder(), cat_columns)])
# Pipeline: column transformer -> scaling without centering -> XGBoost regressor.
# The final step is named 'xgb_regressor'; every parameter prefix must use exactly this name.
pipe_xgbr = Pipeline([('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())
])
# Candidate hyperparameter values, addressed through the 'xgb_regressor' step name.
param = {'xgb_regressor__n_estimators':[3, 5],
'xgb_regressor__max_depth':[3, 5, 7],
'xgb_regressor__learning_rate':[0.1, 0.5],
'xgb_regressor__colsample_bytree':[0.5, 0.8],
'xgb_regressor__subsample':[0.5, 0.8]
}
# NOTE(review): `scoring` is given the metric function mean_squared_error(y_true, y_pred, ...),
# but the search calls its scorer as scorer(estimator, X_test, y_test). This mismatch yields
# "TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>".
# The scorer string "neg_mean_squared_error" has the expected signature.
rscv = RandomizedSearchCV(pipe_xgbr, param_distributions = param, n_iter = 2, scoring = mean_squared_error, n_jobs = -1, cv = 5, error_score = 'raise')
# NOTE(review): the fit-parameter prefix 'xgbr_regressor' does not match the pipeline step
# name 'xgb_regressor'; this is what raises KeyError: 'xgbr_regressor'.
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
The first five rows of the original dataframe data_freq look like this:
IDpol ClaimNb Exposure Area VehPower VehAge DrivAge BonusMalus VehBrand VehGas Density Region
0 1.0 1 0.10 D 5 0 55 50 B12 Regular 1217 R82
1 3.0 1 0.77 D 5 0 55 50 B12 Regular 1217 R82
2 5.0 1 0.75 B 6 2 52 50 B12 Diesel 54 R22
3 10.0 1 0.09 B 7 0 46 50 B12 Diesel 76 R72
4 11.0 1 0.84 B 7 0 46 50 B12 Diesel 76 R72
The error I get is as follows:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 598, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 340, in fit
fit_params_steps = self._check_fit_params(**fit_params)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\pipeline.py", line 261, in _check_fit_params
fit_params_steps[step][param] = pval
KeyError: 'xgbr_regressor'
"""
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
<ipython-input-68-0c1886d1e985> in <module>
----> 1 rscv.fit(X_train, y_train, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
KeyError: 'xgbr_regressor'
I also tried running fit without the sample_weight parameter. In this case the error changes to:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 418, in _process_worker
r = call_item()
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 625, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 687, in _score
scores = scorer(estimator, X_test, y_test)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 74, in inner_f
return f(**kwargs)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 336, in mean_squared_error
y_true, y_pred, multioutput)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\metrics\_regression.py", line 88, in _check_reg_targets
check_consistent_length(y_true, y_pred)
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in check_consistent_length
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 316, in <listcomp>
lengths = [_num_samples(X) for X in arrays if X is not None]
File "C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 249, in _num_samples
raise TypeError(message)
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
"""
The above exception was the direct cause of the following exception:
TypeError Traceback (most recent call last)
<ipython-input-69-a9be9cc5df4a> in <module>
----> 1 rscv.fit(X_train, y_train)#, xgbr_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
~\anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
~\anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
When setting verbose = 10 and n_jobs = 1 the following error message shows up:
Fitting 5 folds for each of 2 candidates, totalling 10 fits
[CV 1/5; 1/2] START xgb_regressor__colsample_bytree=0.5, xgb_regressor__learning_rate=0.5, xgb_regressor__max_depth=5, xgb_regressor__n_estimators=5, xgb_regressor__subsample=0.5
C:\Users\Jan\anaconda3\lib\site-packages\sklearn\utils\validation.py:72: FutureWarning: Pass sample_weight=406477 1.0
393150 0.0
252885 0.0
260652 0.0
661256 0.0
...
154663 0.0
398414 0.0
42890 0.0
640774 0.0
114446 0.0
Name: frequency, Length: 108482, dtype: float64 as keyword args. From version 1.0 (renaming of 0.25) passing these as positional arguments will result in an error
"will result in an error", FutureWarning)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-84-74435f74c470> in <module>
----> 1 rscv.fit(X_train, y_train, xgb_regressor__sample_weight = X_train['Exposure'])
2 #pipe_xgbr.fit(X_train, y_train)
3 #X_train.describe(include = 'all')
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1633 evaluate_candidates(ParameterSampler(
1634 self.param_distributions, self.n_iter,
-> 1635 random_state=self.random_state))
~\anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1002 # remaining jobs.
1003 self._iterating = False
-> 1004 if self.dispatch_one_batch(iterator):
1005 self._iterating = self._original_iterator is not None
1006
~\anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\anaconda3\lib\site-packages\sklearn\utils\fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
623
624 fit_time = time.time() - start_time
--> 625 test_scores = _score(estimator, X_test, y_test, scorer, error_score)
626 score_time = time.time() - start_time - fit_time
627 if return_train_score:
~\anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer, error_score)
685 scores = scorer(estimator, X_test)
686 else:
--> 687 scores = scorer(estimator, X_test, y_test)
688 except Exception:
689 if error_score == 'raise':
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
72 "will result in an error", FutureWarning)
73 kwargs.update(zip(sig.parameters, args))
---> 74 return f(**kwargs)
75 return inner_f
76
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in mean_squared_error(y_true, y_pred, sample_weight, multioutput, squared)
334 """
335 y_type, y_true, y_pred, multioutput = _check_reg_targets(
--> 336 y_true, y_pred, multioutput)
337 check_consistent_length(y_true, y_pred, sample_weight)
338 output_errors = np.average((y_true - y_pred) ** 2, axis=0,
~\anaconda3\lib\site-packages\sklearn\metrics\_regression.py in _check_reg_targets(y_true, y_pred, multioutput, dtype)
86 the dtype argument passed to check_array.
87 """
---> 88 check_consistent_length(y_true, y_pred)
89 y_true = check_array(y_true, ensure_2d=False, dtype=dtype)
90 y_pred = check_array(y_pred, ensure_2d=False, dtype=dtype)
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in check_consistent_length(*arrays)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in <listcomp>(.0)
314 """
315
--> 316 lengths = [_num_samples(X) for X in arrays if X is not None]
317 uniques = np.unique(lengths)
318 if len(uniques) > 1:
~\anaconda3\lib\site-packages\sklearn\utils\validation.py in _num_samples(x)
247 if hasattr(x, 'fit') and callable(x.fit):
248 # Don't get num_samples from an ensembles length!
--> 249 raise TypeError(message)
250
251 if not hasattr(x, '__len__') and not hasattr(x, 'shape'):
TypeError: Expected sequence or array-like, got <class 'sklearn.pipeline.Pipeline'>
Wow, that was a mess of a traceback, but I think I've finally found it. You set scoring=mean_squared_error, and should instead use scoring="neg_mean_squared_error".
The metric function mean_squared_error has signature (y_true, y_pred, *, <kwargs>), whereas the scorer obtained by using the string "neg_mean_squared_error" has signature (estimator, X_test, y_test). So in the traceback, where you see
--> 687 scores = scorer(estimator, X_test, y_test)
it is calling mean_squared_error with y_true=estimator, y_pred=X_test, and sample_weight=y_test (the first keyword-only argument, hence the FutureWarning about passing keyword arguments positionally). Going deeper into the traceback, we see a check that the shapes of y_true and y_pred are compatible, but it thinks the former is your pipeline object (hence the final error message)!
According to your error message, KeyError: 'xgbr_regressor', the code can't find the key xgbr_regressor in your Pipeline. In your pipeline, you have named the step xgb_regressor:
pipe_xgbr = Pipeline(
[('cf_trans', ct),
('ssc', StandardScaler(with_mean = False)),
('xgb_regressor', XGBRegressor())])
But when you try to fit, you call it with a reference to xgbr_regressor which is why the KeyError is thrown:
rscv.fit(X_train, y_train, xgbr_regressor__sample_weight=X_train['Exposure'])
Therefore, you must change the above line to replace xgbr_regressor__sample_weight with xgb_regressor__sample_weight, and this should eliminate that error.
I am trying to train my model (Image classification) using Tensorflow. I keep getting an error when I try to run the following cell:
# Train for 100 epochs with the directory generators and keep only the
# `.history` attribute of the object returned by fit().
hist = model.fit(
train_generator,
epochs=100,
verbose=1,
steps_per_epoch=steps_per_epoch,
validation_data=valid_generator,
validation_steps=val_steps_per_epoch).history
Error is:
Epoch 1/100
27/31 [=========================>....] - ETA: 1s - loss: 0.7309 - acc: 0.6181
---------------------------------------------------------------------------
UnknownError Traceback (most recent call last)
<ipython-input-36-b1c104100211> in <module>
2 val_steps_per_epoch = np.ceil(valid_generator.samples/valid_generator.batch_size)
3
----> 4 hist = model.fit(
5 train_generator,
6 epochs=100,
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/training.py in fit(self, x, y, batch_size, epochs, verbose, callbacks, validation_split, validation_data, shuffle, class_weight, sample_weight, initial_epoch, steps_per_epoch, validation_steps, validation_batch_size, validation_freq, max_queue_size, workers, use_multiprocessing)
1098 _r=1):
1099 callbacks.on_train_batch_begin(step)
-> 1100 tmp_logs = self.train_function(iterator)
1101 if data_handler.should_sync:
1102 context.async_wait()
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in __call__(self, *args, **kwds)
826 tracing_count = self.experimental_get_tracing_count()
827 with trace.Trace(self._name) as tm:
--> 828 result = self._call(*args, **kwds)
829 compiler = "xla" if self._experimental_compile else "nonXla"
830 new_tracing_count = self.experimental_get_tracing_count()
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/def_function.py in _call(self, *args, **kwds)
853 # In this case we have created variables on the first call, so we run the
854 # defunned version which is guaranteed to never create variables.
--> 855 return self._stateless_fn(*args, **kwds) # pylint: disable=not-callable
856 elif self._stateful_fn is not None:
857 # Release the lock early so that multiple threads can perform the call
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in __call__(self, *args, **kwargs)
2940 (graph_function,
2941 filtered_flat_args) = self._maybe_define_function(args, kwargs)
-> 2942 return graph_function._call_flat(
2943 filtered_flat_args, captured_inputs=graph_function.captured_inputs) # pylint: disable=protected-access
2944
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in _call_flat(self, args, captured_inputs, cancellation_manager)
1916 and executing_eagerly):
1917 # No tape is watching; skip to running the function.
-> 1918 return self._build_call_outputs(self._inference_function.call(
1919 ctx, args, cancellation_manager=cancellation_manager))
1920 forward_backward = self._select_forward_and_backward_functions(
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/function.py in call(self, ctx, args, cancellation_manager)
553 with _InterpolateFunctionError(self):
554 if cancellation_manager is None:
--> 555 outputs = execute.execute(
556 str(self.signature.name),
557 num_outputs=self._num_outputs,
/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/eager/execute.py in quick_execute(op_name, num_outputs, inputs, attrs, ctx, name)
57 try:
58 ctx.ensure_initialized()
---> 59 tensors = pywrap_tfe.TFE_Py_Execute(ctx._handle, device_name, op_name,
60 inputs, attrs, num_outputs)
61 except core._NotOkStatusException as e:
UnknownError: UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fc88d55c9a0>
Traceback (most recent call last):
File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/ops/script_ops.py", line 249, in __call__
ret = func(*args)
File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/autograph/impl/api.py", line 620, in wrapper
return func(*args, **kwargs)
File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 891, in generator_py_func
values = next(generator_state.get_iterator(iterator_id))
File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 807, in wrapped_generator
for data in generator_fn():
File "/opt/anaconda3/lib/python3.8/site-packages/tensorflow/python/keras/engine/data_adapter.py", line 933, in generator_fn
yield x[i]
File "/opt/anaconda3/lib/python3.8/site-packages/keras_preprocessing/image/iterator.py", line 65, in __getitem__
return self._get_batches_of_transformed_samples(index_array)
File "/opt/anaconda3/lib/python3.8/site-packages/keras_preprocessing/image/iterator.py", line 227, in _get_batches_of_transformed_samples
img = load_img(filepaths[j],
File "/opt/anaconda3/lib/python3.8/site-packages/keras_preprocessing/image/utils.py", line 114, in load_img
img = pil_image.open(io.BytesIO(f.read()))
File "/opt/anaconda3/lib/python3.8/site-packages/PIL/Image.py", line 2943, in open
raise UnidentifiedImageError(
PIL.UnidentifiedImageError: cannot identify image file <_io.BytesIO object at 0x7fc88d55c9a0>
[[{{node PyFunc}}]]
[[IteratorGetNext]] [Op:__inference_train_function_24233]
Function call stack:
train_function
I tried changing from loss='categorical_crossentropy' to loss='binary_crossentropy' but still the issue persists. I wish to train the model but the Epoch keeps getting stuck.
Edit:
The train generator function and where it is used is as follows:
# Target size passed to flow_from_directory for every loaded image.
IMAGE_SHAPE = (224, 224)
TRAINING_DATA_DIR = str(data_root)
# Settings shared by both generators: rescale by 1/255 and reserve 20% of the
# files as the validation subset.
datagen_kwargs = dict(rescale=1./255, validation_split=.20)

# Validation subset of the directory.
valid_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)
valid_generator = valid_datagen.flow_from_directory(
    TRAINING_DATA_DIR,
    subset="validation",
    shuffle=True,
    target_size=IMAGE_SHAPE,
)

# Training subset of the same directory.
train_datagen = tf.keras.preprocessing.image.ImageDataGenerator(**datagen_kwargs)
train_generator = train_datagen.flow_from_directory(
    TRAINING_DATA_DIR,
    subset="training",
    shuffle=True,
    target_size=IMAGE_SHAPE,
)

# Pull a single batch to inspect its shapes. The loop body has to be indented,
# otherwise the snippet does not parse.
for image_batch, label_batch in train_generator:
    break
image_batch.shape, label_batch.shape
Output: ((32, 224, 224, 3), (32, 2))
# Show the class-name -> index mapping held by the training generator.
print(train_generator.class_indices)
# Write the class names to labels.txt, one per line, in sorted order.
labels = '\n'.join(sorted(train_generator.class_indices.keys()))
with open('labels.txt', 'w') as f:
    # The write must be indented under `with` for the snippet to parse.
    f.write(labels)
Output: {'off': 0, 'on': 1}
One of the image files could not be read and was causing the error, as pointed out by @Lescurel. To find the offending file you can run the following:
from pathlib import Path

# Import the Image submodule explicitly: a bare `import PIL` does not guarantee
# that `PIL.Image` has been loaded.
from PIL import Image, UnidentifiedImageError

# Walk the dataset folder recursively and print every .jpeg that Pillow cannot
# identify as an image.
for img_p in Path("INSERT PATH HERE").rglob("*.jpeg"):
    try:
        # Image.open identifies the file lazily; the context manager makes
        # sure the underlying file handle is closed again.
        with Image.open(img_p):
            pass
    except UnidentifiedImageError:
        print(img_p)
You can also do the same for png or other formats. If there is an issue with your image, it will list it as soon as you run it
Similar to @EverydayDeveloper's answer, but using glob to collect all the image paths across the class folders.
import glob

# Import the Image submodule explicitly: a bare `import PIL` does not guarantee
# that `PIL.Image` has been loaded.
from PIL import Image, UnidentifiedImageError

# Collect every .jpg exactly one directory level below train/
# (presumably one sub-folder per class).
imgs_ = glob.glob("/home/ubuntu/imageTrain_dobby/SKJEWELLERY/classification/dataset/jewellery_dataset/train/*/*.jpg")
for img_path in imgs_:
    try:
        # Open only to check that Pillow can identify the file; the context
        # manager closes the file handle, and the path variable is not rebound.
        with Image.open(img_path):
            pass
    except UnidentifiedImageError:
        print(img_path)
I had the same issue when accessing a training dataset on an external hard drive on a mac. The flow_from_directory function returned more files than I had placed in the folders.
Moving the whole training dataset to the local drive resolved the issue.
I encountered the same error when attempting to call Image.open() but I was calling this function on bytes received from a flutter client application via a multipart request.
The problem was that I didn't include the filename argument on the client side:
MultipartFile.fromBytes('image', await image.readAsBytes(),
filename: image.name)
After adding filename: image.name, Image.open was able to identify the image file.
I have the following code which works normally but got a
UserWarning: One or more of the test scores are non-finite: [nan nan]
category=UserWarning
when I revised it into a more concise version (shown in the subsequent code snippet). Is the output of the one-hot encoder the culprit of the issue?
import pandas as pd
from sklearn.model_selection import StratifiedKFold
from sklearn.linear_model import RidgeClassifier
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.model_selection import GridSearchCV
# Load the training and test tables (the test table is not used further in this snippet).
train = pd.read_csv('/train.csv')
test = pd.read_csv('/test.csv')
# Columns whose name starts with 'cat' are one-hot encoded; all remaining
# columns except 'target' are standardized.
sparse_features = [col for col in train.columns if col.startswith('cat')]
dense_features = [col for col in train.columns if col not in sparse_features+['target']]
X = train.drop(['target'], axis=1)
y = train['target'].values
skf = StratifiedKFold(n_splits=5)
clf = RidgeClassifier()
full_pipeline = ColumnTransformer(transformers=[
('num', StandardScaler(), dense_features),
('cat', OneHotEncoder(), sparse_features)
])
# The preprocessing is fitted ONCE on the complete training frame, before any
# cross-validation split happens, so the encoder has seen every row of X.
X_prepared = full_pipeline.fit_transform(X)
# Parameter names are unprefixed because the bare classifier is the estimator.
param_grid = {
'alpha': [ 0.1],
'fit_intercept': [False]
}
gs = GridSearchCV(
estimator=clf,
param_grid=param_grid,
scoring='roc_auc',
n_jobs=-1,
cv=skf
)
# Only the classifier is re-fitted per fold; the input is the already transformed matrix.
gs.fit(X_prepared, y)
The revision is shown below.
# Revised version: preprocessing and classifier are combined into one Pipeline
# that is handed to the grid search as a whole.
clf2 = RidgeClassifier()
preprocess_pipeline2 = ColumnTransformer([
('num', StandardScaler(), dense_features),
('cat', OneHotEncoder(), sparse_features)
])
from sklearn.pipeline import Pipeline
final_pipeline = Pipeline(steps=[
('p', preprocess_pipeline2),
('c', clf2)
])
# Classifier parameters are addressed through the 'c' step name.
param_grid2 = {
'c__alpha': [0.4, 0.1],
'c__fit_intercept': [False]
}
gs2 = GridSearchCV(
estimator=final_pipeline,
param_grid=param_grid2,
scoring='roc_auc',
n_jobs=-1,
cv=skf
)
# The raw frame X is passed in, so the whole pipeline (including the one-hot
# encoder) is fitted inside the search rather than once up front.
# NOTE(review): presumably a category that occurs only in a held-out fold is
# then unknown to the encoder at scoring time, which would explain the
# non-finite scores; OneHotEncoder(handle_unknown='ignore') is the usual
# remedy — confirm against the full traceback.
gs2.fit(X, y)
Can anyone point out which part goes wrong?
EDIT: After setting error_score='raise', I receive more feedback about the issue. It seems to me that I need to fit the one-hot encoder on a merged dataset that combines the training set and the test set. Am I correct? But if that is the case, why doesn't the first version complain about the same issue? Also, does it make sense to pass handle_unknown='ignore' to the encoder to handle this issue?
ValueError
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 431, in _process_worker
r = call_item()
File "/opt/conda/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 285, in __call__
return self.fn(*self.args, **self.kwargs)
File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 595, in __call__
return self.func(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 620, in _fit_and_score
test_scores = _score(estimator, X_test, y_test, scorer, error_score)
File "/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 674, in _score
scores = scorer(estimator, X_test, y_test)
File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 200, in __call__
sample_weight=sample_weight)
File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 334, in _score
y_pred = method_caller(clf, "decision_function", X)
File "/opt/conda/lib/python3.7/site-packages/sklearn/metrics/_scorer.py", line 53, in _cached_call
return getattr(estimator, method)(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py", line 120, in <lambda>
out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 493, in decision_function
Xt = transform.transform(Xt)
File "/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py", line 565, in transform
Xs = self._fit_transform(X, None, _transform_one, fitted=True)
File "/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py", line 444, in _fit_transform
self._iter(fitted=fitted, replace_strings=True), 1))
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 1044, in __call__
while self.dispatch_one_batch(iterator):
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 859, in dispatch_one_batch
self._dispatch(tasks)
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 777, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 208, in apply_async
result = ImmediateResult(func)
File "/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 572, in __init__
self.results = batch()
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in __call__
for func, args, kwargs in self.items]
File "/opt/conda/lib/python3.7/site-packages/joblib/parallel.py", line 263, in <listcomp>
for func, args, kwargs in self.items]
File "/opt/conda/lib/python3.7/site-packages/sklearn/utils/fixes.py", line 222, in __call__
return self.function(*args, **kwargs)
File "/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py", line 733, in _transform_one
res = transformer.transform(X)
File "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 462, in transform
force_all_finite='allow-nan')
File "/opt/conda/lib/python3.7/site-packages/sklearn/preprocessing/_encoders.py", line 136, in _transform
raise ValueError(msg)
ValueError: Found unknown categories ['MR', 'MW', 'DA'] in column 10 during transform
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-48-b81f3b7b0724> in <module>
21 cv=skf
22 )
---> 23 gs2.fit(X, y)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1286 def _run_search(self, evaluate_candidates):
1287 """Search all candidates in param_grid"""
-> 1288 evaluate_candidates(ParameterGrid(self.param_grid))
1289
1290
/opt/conda/lib/python3.7/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
807 (split_idx, (train, test)) in product(
808 enumerate(candidate_params),
--> 809 enumerate(cv.split(X, y, groups))))
810
811 if len(out) < 1:
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1052
1053 with self._backend.retrieval_context():
-> 1054 self.retrieve()
1055 # Make sure that we get a last message telling us we are done
1056 elapsed_time = time.time() - self._start_time
/opt/conda/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
931 try:
932 if getattr(self._backend, 'supports_timeout', False):
--> 933 self._output.extend(job.get(timeout=self.timeout))
934 else:
935 self._output.extend(job.get())
/opt/conda/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
540 AsyncResults.get from multiprocessing."""
541 try:
--> 542 return future.result(timeout=timeout)
543 except CfTimeoutError as e:
544 raise TimeoutError from e
/opt/conda/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
/opt/conda/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: Found unknown categories ['MR', 'MW', 'DA'] in column 10 during transform
First, I'd like to say that I had a similar issue, and thanks for noting
After setting error_score to raise
which really helped me with my issue. I was using a custom transformer and I had some code that was creating variables in the training fold, and then not creating them in the validation fold because those categories were not present in validation. I think you're having a similar problem.
It looks like OneHotEncoder learns its categories from your training fold and then finds categories in your validation fold that are unknown to it because they did not occur in the training fold.
ValueError: Found unknown categories ['MR', 'MW', 'DA'] in column 10
during transform
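To get around this, my suggestion would be to look into using custom transformers instead, since your data is more complex. A simpler option, as you suggest in your edit, is OneHotEncoder(handle_unknown='ignore'), which encodes a category not seen during fit as all zeros instead of raising. https://towardsdatascience.com/custom-transformers-and-ml-data-pipelines-with-python-20ea2a7adb65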
Remove scoring='roc_auc' if your problem is multiclass; the two do not play well together. Use the default scoring or choose a different metric.
I stumbled here as I got the same warning message while trying to do multiclass classification. My problem was that I tried to use scoring='roc_auc' with GridSearchCV, which doesn't work with multiclass. I used scoring='f1_micro' instead, which worked fine with multiclass.
F1 scoring is discussed e.g. in here:
How to do GridSearchCV for F1-score in classification problem with scikit-learn?
A list of different scoring options can be found here (part 3.3.1):
https://scikit-learn.org/stable/modules/model_evaluation.html
I am working on an NLP Kaggle project and I am using RandomizedSearchCV in my project. I have defined a function named GO which implements RandomizedSearchCV with KFold and scoring criteria and grid_param. Following is my code and when I call the function GO, it gives an error:
# 5-fold cross-validation with shuffling and a fixed seed.
kf = KFold(n_splits=5, random_state=0, shuffle=True)
# Scorer built from a lambda that simply forwards to accuracy_score.
acc = lambda y, y_pred: accuracy_score(y, y_pred)
scorer = make_scorer(acc, greater_is_better=True)
# GO: build a RandomizedSearchCV over `model` with `grid`, fit it on the
# module-level X_train / y_train, and return whatever fit() returns.
# n_iter and scorer are passed positionally as the 3rd and 4th arguments.
# NOTE(review): newer scikit-learn releases presumably require these as
# keywords (n_iter=..., scoring=...) - confirm against the installed version.
# NOTE(review): with n_jobs=-1 each task is pickled for the worker processes;
# the lambda-based scorer is a candidate cause of the PicklingError reported
# below, but the pipelines passed as `model` are not shown - confirm.
def GO(model, grid, n_iter=100):
search = RandomizedSearchCV(model, grid, n_iter, scorer, n_jobs=-1, cv=kf, random_state=0, verbose=True)
return search.fit(X_train, y_train)
This is the error I get:
PicklingError Traceback (most recent call last)
<ipython-input-131-310dea03e0ad> in <module>
3
4 for pipe, grid in zip(pipes, grids):
----> 5 fitted_models.append(GO(pipe, grid))
<ipython-input-129-98eb26241ea1> in GO(model, grid, n_iter)
1 def GO(model, grid, n_iter=100):
2 search = RandomizedSearchCV(model, grid, n_iter, scorer, n_jobs=-1, cv=kf, random_state=0, verbose=True)
----> 3 return search.fit(X_train, y_train)
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
720 return results_container[0]
721
--> 722 self._run_search(evaluate_candidates)
723
724 results = results_container[0]
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1513 evaluate_candidates(ParameterSampler(
1514 self.param_distributions, self.n_iter,
-> 1515 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
709 for parameters, (train, test)
710 in product(candidate_params,
--> 711 cv.split(X, y, groups)))
712
713 all_candidate_params.extend(candidate_params)
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in __call__(self, iterable)
928
929 with self._backend.retrieval_context():
--> 930 self.retrieve()
931 # Make sure that we get a last message telling us we are done
932 elapsed_time = time.time() - self._start_time
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py in retrieve(self)
831 try:
832 if getattr(self._backend, 'supports_timeout', False):
--> 833 self._output.extend(job.get(timeout=self.timeout))
834 else:
835 self._output.extend(job.get())
~\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
519 AsyncResults.get from multiprocessing."""
520 try:
--> 521 return future.result(timeout=timeout)
522 except LokyTimeoutError:
523 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in result(self, timeout)
430 raise CancelledError()
431 elif self._state == FINISHED:
--> 432 return self.__get_result()
433 else:
434 raise TimeoutError()
~\Anaconda3\lib\concurrent\futures\_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
PicklingError: Could not pickle the task to send it to the workers.
I tried to resolve it but could not. Can anyone here help me?
I am attempting to run node2vec on a directed networkx network I have created. The network looks like this:
OutEdgeDataView([(7, 1, {'senderId': 7, 'weight': 273}), (7, 8, {'senderId': 7, 'weight': 319}), (7, 9, {'senderId': 7, 'weight': 137})....
Each node has an integer ID, and each edge carries a weight linking one node to another.
I am trying to use the node2vec module on this network as:
from node2vec import Node2Vec
# mail_n_basic is the directed networkx graph described above.
# workers=4 requests four workers; the traceback below shows the walks being
# generated through joblib's Parallel inside Node2Vec.__init__.
node2vec = Node2Vec(mail_n_basic, dimensions=64, walk_length=30, num_walks=200, workers=4)
I get the following error; any help explaining it would be much appreciated:
---------------------------------------------------------------------------
_RemoteTraceback Traceback (most recent call last)
_RemoteTraceback:
"""
Traceback (most recent call last):
File "C:\Users\Andrew\Anaconda3\lib\site-packages\joblib\externals\loky\process_executor.py", line 398, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "C:\Users\Andrew\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 561, in __call__
return self.func(*args, **kwargs)
File "C:\Users\Andrew\Anaconda3\lib\site-packages\joblib\parallel.py", line 224, in __call__
for func, args, kwargs in self.items]
File "C:\Users\Andrew\Anaconda3\lib\site-packages\joblib\parallel.py", line 224, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\Andrew\Anaconda3\lib\site-packages\node2vec\node2vec.py", line 51, in parallel_generate_walks
walk_to = np.random.choice(walk_options, size=1)[0]
File "mtrand.pyx", line 1126, in mtrand.RandomState.choice
ValueError: a must be non-empty
"""
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<ipython-input-58-3ac160061528> in <module>()
1
----> 2 node2vec = Node2Vec(mail_n_basic, dimensions=64, walk_length=30, num_walks=200, workers=4)
~\Anaconda3\lib\site-packages\node2vec\node2vec.py in __init__(self, graph, dimensions, walk_length, num_walks, p, q, weight_key, workers, sampling_strategy)
111
112 self.d_graph = self._precompute_probabilities()
--> 113 self.walks = self._generate_walks()
114
115 def _precompute_probabilities(self):
~\Anaconda3\lib\site-packages\node2vec\node2vec.py in _generate_walks(self)
178 self.NEIGHBORS_KEY,
179 self.PROBABILITIES_KEY) for idx, num_walks
--> 180 in enumerate(num_walks_lists, 1))
181
182 walks = flatten(walk_results)
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
960
961 with self._backend.retrieval_context():
--> 962 self.retrieve()
963 # Make sure that we get a last message telling us we are done
964 elapsed_time = time.time() - self._start_time
~\Anaconda3\lib\site-packages\joblib\parallel.py in retrieve(self)
863 try:
864 if getattr(self._backend, 'supports_timeout', False):
--> 865 self._output.extend(job.get(timeout=self.timeout))
866 else:
867 self._output.extend(job.get())
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in wrap_future_result(future, timeout)
513 AsyncResults.get from multiprocessing."""
514 try:
--> 515 return future.result(timeout=timeout)
516 except LokyTimeoutError:
517 raise TimeoutError()
~\Anaconda3\lib\site-packages\joblib\externals\loky\_base.py in result(self, timeout)
429 raise CancelledError()
430 elif self._state == FINISHED:
--> 431 return self.__get_result()
432 else:
433 raise TimeoutError()
~\Anaconda3\lib\site-packages\joblib\externals\loky\_base.py in __get_result(self)
380 def __get_result(self):
381 if self._exception:
--> 382 raise self._exception
383 else:
384 return self._result
ValueError: a must be non-empty
I'm the author of this library.
If you are using Windows, parallel execution won't work because of issues with joblib on Windows.
Run the same code with the updated version (pip install -U node2vec), and pass workers=1 when constructing the Node2Vec class.