Consider the following sklearn.pipeline.Pipeline:
pipeline = Pipeline(
    [
        (
            "encoding",
            ColumnTransformer(
                [
                    (
                        "encode categorical",
                        OneHotEncoder(handle_unknown="ignore"),
                        categorical_features,
                    )
                ],
                remainder="passthrough",
            ),
        ),
        ("regressor", regressor),
    ]
)
This pipeline needs to run on a feature dataset X with many missing values. Dropping rows with missing values, or imputing them, would be problematic. Fortunately, LightGBM handles missing values out of the box.
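(As a quick illustration of that claim, here is a minimal sketch with made-up data; the arrays below are my own toy example, not part of the original setup:)

import numpy as np
import lightgbm

# Toy data with missing values sprinkled into X; LightGBM treats NaN as
# "missing" natively and routes such samples during tree splits.
rng = np.random.default_rng(0)
X = rng.normal(size=(100, 3))
X[rng.random(size=X.shape) < 0.3] = np.nan
y = rng.normal(size=100)

lightgbm.LGBMRegressor(n_jobs=-1).fit(X, y)  # fits without a NaN/finiteness error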
This works well when regressor is defined as follows:
regressor = MultiOutputRegressor(
    lightgbm.LGBMRegressor(n_jobs=-1)
)
However, it fails when using a RegressorChain:
regressor = RegressorChain(
    base_estimator=lightgbm.LGBMRegressor(n_jobs=-1),
    order=list(range(len(params_pipelines["targets"]))),
)
Traceback:
Traceback (most recent call last):
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/parallel_runner.py", line 135, in _run_node_synchronization
return run_node(node, catalog, is_async, run_id)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 213, in run_node
node = _run_node_sequential(node, catalog, run_id)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 238, in _run_node_sequential
raise exc
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 228, in _run_node_sequential
outputs = node.run(inputs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 433, in run
raise exc
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 424, in run
outputs = self._run_with_list(inputs, self._inputs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 471, in _run_with_list
return self._decorated_func(*[inputs[item] for item in node_inputs])
File "/Users/cls/Documents/Work/Projects/H/PackMeasurement/src/packaging_measurement/schema.py", line 79, in transform_wrapper
out = transform(*args, **kwargs)
File "/Users/cls/Documents/Work/Projects/H/PackMeasurement/src/packaging_measurement/nodes/estimation.py", line 219, in fit_estimator
pipelines[level]
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/multioutput.py", line 954, in fit
super().fit(X, Y, **fit_params)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/multioutput.py", line 556, in fit
X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 964, in check_X_y
X = check_array(
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 720, in check_array
array = _ensure_sparse_format(
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 479, in _ensure_sparse_format
_assert_all_finite(spmatrix.data, allow_nan=force_all_finite == "allow-nan")
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 114, in _assert_all_finite
raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
Why does this happen? Is there a reason why the RegressorChain cannot be a drop-in replacement for the MultiOutputRegressor? It is starting to look like a scikit-learn bug.
Related
There are other Stack Overflow questions like this one, but I still couldn't find the correct fix.
import numpy as np
from pyod.models.iforest import IForest
from sklearn.model_selection import GridSearchCV

def _scorerForUnSupervised(estimator, X):
    return np.mean(estimator.decision_function(X))

ifclassifier = IForest(behaviour='new',
                       max_samples="auto",
                       random_state=np.random.RandomState(42),
                       verbose=1,
                       n_jobs=-1)

IF_Hyperparams = {'n_estimators': [100, 200], 'contamination': [0.01, 0.05], 'bootstrap': [True, False]}

ifgrid = GridSearchCV(ifclassifier,
                      IF_Hyperparams,
                      scoring=_scorerForUnSupervised,
                      cv=3,
                      n_jobs=-1)

grid_result = ifgrid.fit(train_data)
The fit throws the warning below, and I am also getting an OSError:
C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\sklearn\model_selection\_validation.py:615: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
Traceback (most recent call last):
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\sklearn\model_selection\_validation.py", line 596, in _fit_and_score
estimator.fit(X_train, **fit_params)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\pyod\models\iforest.py", line 230, in fit
self.detector_.fit(X=X, y=None, sample_weight=None)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\sklearn\ensemble\_iforest.py", line 278, in fit
super()._fit(X, y, max_samples,
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\sklearn\ensemble\_bagging.py", line 370, in _fit
all_results = Parallel(n_jobs=n_jobs, verbose=self.verbose,
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\parallel.py", line 1041, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\parallel.py", line 859, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\parallel.py", line 777, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\_parallel_backends.py", line 531, in apply_async
future = self._workers.submit(SafeFunction(func))
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\reusable_executor.py", line 177, in submit
return super(_ReusablePoolExecutor, self).submit(
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\process_executor.py", line 1122, in submit
self._ensure_executor_running()
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\process_executor.py", line 1096, in _ensure_executor_running
self._adjust_process_count()
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\process_executor.py", line 1087, in _adjust_process_count
p.start()
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\multiprocessing\process.py", line 121, in start
self._popen = self._Popen(self)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\process.py", line 39, in _Popen
return Popen(process_obj)
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\popen_loky_win32.py", line 54, in __init__
prep_data = spawn.get_preparation_data(
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\spawn.py", line 86, in get_preparation_data
_resource_tracker.ensure_running()
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\resource_tracker.py", line 102, in ensure_running
if self._check_alive():
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\resource_tracker.py", line 182, in _check_alive
self._send('PROBE', '', '')
File "C:\Users\AD\AppData\Local\Programs\Python\Python39\Lib\site-packages\joblib\externals\loky\backend\resource_tracker.py", line 209, in _send
nbytes = os.write(self._fd, msg)
OSError: [Errno 22] Invalid argument
The data and parameters are good; there are no missing values or typos in the parameters.
The data is:
train_data: [[0.39646672]
[0.32037798]
[0.09515201]
[0.08167625]
[0.06491372]
[0.07173377]
[0.16557108]
[0.62966311]
[1. ]
[0.06244864]
....
I would like to know why that warning appears. Can I suppress it?
Also, I am not sure why I am getting "OSError: [Errno 22] Invalid argument". I have seen this before with n_jobs=-1, and switching to n_jobs=1 made it go away; in this case the OSError persists even with n_jobs=1.
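(For the suppression part: warnings of this category can be silenced with the standard warnings machinery. This is only a sketch; it hides genuine fit failures rather than fixing them, and it may not catch warnings emitted inside parallel worker processes:)

import warnings
from sklearn.exceptions import FitFailedWarning

# Silence FitFailedWarning in this process; the failed fits still score nan.
warnings.filterwarnings("ignore", category=FitFailedWarning)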
This error occurred when I used scikit-learn to perform model fusion (stacking) on 7 sub-models. I checked the official documentation and found nothing relevant.
Code:
# model fusion
estimators = [
    ('DT', model_dt_x),
    ('KNN', model_knn_x),
    ('SVR', model_svr_x),
    ('ANN', model_ann_x),
    ('RF', model_rf_x),
    ('GBDT', model_gbdt_x),
    ('XGBT', model_xgbt_x),
]
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor.fit(X_train, y_train)
error:
Traceback (most recent call last):
File "i:/Lab/20210xxx/ex.py", line 86, in <module>
stacking_regressor.fit(X_train, y_train)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 680, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 148, in fit
for est in all_estimators if est != 'drop'
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 921, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 182, in
apply_async
result = ImmediateResult(func)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py", line 40, in
_fit_single_estimator
estimator.fit(X, y)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 641, in fit
return self._fit(X, y, incremental=False)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 321, in _fit
self._validate_hyperparameters()
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 385, in _validate_hyperparameters
if self.max_fun <= 0:
TypeError: '<=' not supported between instances of 'NoneType' and 'int'
Problem solved. Judging from the traceback, max_fun on the MLPRegressor came through as None (and, judging from the fix below, ccp_alpha on the tree-based models had the same issue), so hyperparameter validation failed when comparing None with an int. Setting these parameters explicitly fixes it:
model_ann_x = MLPRegressor(**model_ann.get_params())
model_ann_x.set_params(max_fun=15000)  # max_fun was None after get_params(); set it explicitly
model_rf_x = RandomForestRegressor(**model_rf.get_params())
model_rf_x.set_params(ccp_alpha=0.0)  # same class of problem as max_fun; set it explicitly
model_gbdt_x = GradientBoostingRegressor(**model_gbdt.get_params())
model_gbdt_x.set_params(ccp_alpha=0.0)  # same class of problem as max_fun; set it explicitly
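(A more general way to sidestep this class of problem, sketched under the assumption that the None values come from parameters copied off models built against a different scikit-learn version: drop None-valued entries before re-instantiating, so the estimator's own defaults apply.)

from sklearn.neural_network import MLPRegressor

# Keep only parameters that actually carry a value; omitted keys fall back
# to MLPRegressor's defaults instead of an invalid None.
params = {k: v for k, v in model_ann.get_params().items() if v is not None}
model_ann_x = MLPRegressor(**params)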
I am trying to create a machine learning model to predict who would survive on the Titanic. Every time I try to fit my model, I get this error:
Traceback (most recent call last):
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\ptvsd_launcher.py", line 48, in <module>
main(ptvsdArgs)
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\lib\python\old_ptvsd\ptvsd\__main__.py", line 432, in main
run()
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\lib\python\old_ptvsd\ptvsd\__main__.py", line 316, in run_file
runpy.run_path(target, run_name='__main__')
File "D:\Python\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "D:\Python\lib\runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "D:\Python\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "d:\Kaggle\Titanic\titanic4.py", line 100, in <module>
cat_cols2 = pd.DataFrame(OneHot1.fit_transform(new_df[cat_columns]))
File "D:\Python\lib\site-packages\pandas\core\frame.py", line 2806, in __getitem__
indexer = self.loc._get_listlike_indexer(key, axis=1, raise_missing=True)[1]
File "D:\Python\lib\site-packages\pandas\core\indexing.py", line 1552, in _get_listlike_indexer
self._validate_read_indexer(
File "D:\Python\lib\site-packages\pandas\core\indexing.py", line 1640, in _validate_read_indexer
raise KeyError(f"None of [{key}] are in the [{axis_name}]")
KeyError: "None of [Index(['Name', 'Sex', 'Ticket', 'Cabin', 'Embarked'], dtype='object')] are in the [columns]"
Running the script again, I get a different error:
Traceback (most recent call last):
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\ptvsd_launcher.py", line 48, in <module>
main(ptvsdArgs)
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\lib\python\old_ptvsd\ptvsd\__main__.py", line 432, in main
run()
File "c:\Users\seand\.vscode\extensions\ms-python.python-2020.6.89148\pythonFiles\lib\python\old_ptvsd\ptvsd\__main__.py", line 316, in run_file
runpy.run_path(target, run_name='__main__')
File "D:\Python\lib\runpy.py", line 265, in run_path
return _run_module_code(code, init_globals, run_name,
File "D:\Python\lib\runpy.py", line 97, in _run_module_code
_run_code(code, mod_globals, init_globals,
File "D:\Python\lib\runpy.py", line 87, in _run_code
exec(code, run_globals)
File "d:\Kaggle\Titanic\titanic4.py", line 143, in <module>
my_pipeline.fit(new_df,y)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "D:\Python\lib\site-packages\joblib\memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "D:\Python\lib\site-packages\sklearn\compose\_column_transformer.py", line 531, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "D:\Python\lib\site-packages\sklearn\compose\_column_transformer.py", line 458, in _fit_transform
return Parallel(n_jobs=self.n_jobs)(
File "D:\Python\lib\site-packages\joblib\parallel.py", line 1032, in __call__
while self.dispatch_one_batch(iterator):
File "D:\Python\lib\site-packages\joblib\parallel.py", line 847, in dispatch_one_batch
self._dispatch(tasks)
File "D:\Python\lib\site-packages\joblib\parallel.py", line 765, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "D:\Python\lib\site-packages\joblib\_parallel_backends.py", line 206, in apply_async
result = ImmediateResult(func)
File "D:\Python\lib\site-packages\joblib\_parallel_backends.py", line 570, in __init__
self.results = batch()
File "D:\Python\lib\site-packages\joblib\parallel.py", line 252, in __call__
return [func(*args, **kwargs)
File "D:\Python\lib\site-packages\joblib\parallel.py", line 252, in <listcomp>
return [func(*args, **kwargs)
res = transformer.fit_transform(X, y, **fit_params)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 367, in fit_transform
Xt = self._fit(X, y, **fit_params_steps)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 292, in _fit
X, fitted_transformer = fit_transform_one_cached(
File "D:\Python\lib\site-packages\joblib\memory.py", line 352, in __call__
return self.func(*args, **kwargs)
File "D:\Python\lib\site-packages\sklearn\pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "D:\Python\lib\site-packages\sklearn\base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "D:\Python\lib\site-packages\sklearn\impute\_base.py", line 459, in transform
coordinates = np.where(mask.transpose())[::-1]
AttributeError: 'bool' object has no attribute 'transpose'
The code I am running is the following:
from xgboost import XGBClassifier
from sklearn.preprocessing import OneHotEncoder
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.feature_selection import SelectFromModel
from itertools import combinations
import pandas as pd
import numpy as np
#read in data
training_data = pd.read_csv('train.csv')
testing_data = pd.read_csv('test.csv')
#separate X and Y
X_train_full = training_data.copy()
y = X_train_full.Survived
X_train_full.drop(['Survived'], axis=1, inplace=True)
y_test = testing_data
#get all str columns
cat_columns1 = [cname for cname in X_train_full.columns if
                X_train_full[cname].dtype == "object"]
interactions = pd.DataFrame(index=X_train_full)
#create new features
for combination in combinations(cat_columns1, 2):
    imputer = SimpleImputer(strategy='constant')
    new_col_name = '_'.join(combination)
    col1 = X_train_full[combination[0]]
    col2 = X_train_full[combination[1]]
    col1 = np.array(col1).reshape(-1, 1)
    col2 = np.array(col2).reshape(-1, 1)
    col1 = imputer.fit_transform(col1)
    col2 = imputer.fit_transform(col2)
    new_vals = col1 + '_' + col2
    OneHot = OneHotEncoder()
    interactions[new_col_name] = OneHot.fit_transform(new_vals)
interactions = interactions.reset_index(drop=True)
#create new dataframe with new features included
new_df = X_train_full.join(interactions)
#do the same for the test file
interactions2 = pd.DataFrame(index=y_test)
for combination in combinations(cat_columns1, 2):
    imputer = SimpleImputer(strategy='constant')
    new_col_name = '_'.join(combination)
    col1 = y_test[combination[0]]
    col2 = y_test[combination[1]]
    col1 = np.array(col1).reshape(-1, 1)
    col2 = np.array(col2).reshape(-1, 1)
    col1 = imputer.fit_transform(col1)
    col2 = imputer.fit_transform(col2)
    new_vals = col1 + '_' + col2
    OneHot = OneHotEncoder()
    interactions2[new_col_name] = OneHot.fit_transform(new_vals)
    interactions2[new_col_name] = new_vals
interactions2 = interactions2.reset_index(drop=True)
y_test = y_test.join(interactions2)
#get names of cat columns (with new features added)
cat_columns = [cname for cname in new_df.columns if
               new_df[cname].dtype == "object"]
# Select numerical columns
num_columns = [cname for cname in new_df.columns if
               new_df[cname].dtype in ['int64', 'float64']]
#set up pipeline
numerical_transformer = SimpleImputer(strategy='constant')
categorical_transformer = Pipeline(steps=[
    ('imputer', SimpleImputer(strategy='constant')),
    ('onehot', OneHotEncoder(handle_unknown='ignore'))
])
preprocessor = ColumnTransformer(
    transformers=[
        ('num', numerical_transformer, num_columns),
        ('cat', categorical_transformer, cat_columns)
    ])
model = XGBClassifier()
my_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                              ('model', model)
                              ])
#fit model
my_pipeline.fit(new_df, y)
The CSV files I am reading are available from Kaggle at this link:
https://www.kaggle.com/c/titanic/data
I cannot figure out what is causing this problem. Any help would be much appreciated.
This probably happens because your data contains pd.NA values. pd.NA was introduced in pandas 1.0.0, but is still marked as experimental.
SimpleImputer will ultimately run data == np.nan, which usually returns a numpy array of booleans. Instead, it returns a single boolean scalar when data contains pd.NA values.
An example:
import pandas as pd
import numpy as np

test_pd_na = pd.DataFrame({"A": [1, 2, 3, pd.NA]})
test_np_nan = pd.DataFrame({"A": [1, 2, 3, np.nan]})

test_np_nan.to_numpy() == np.nan
> array([[False],
[False],
[False],
[False]])

test_pd_na.to_numpy() == np.nan
> False
The solution would be to convert all pd.NA values to np.nan before running SimpleImputer. You can use .replace({pd.NA: np.nan}) on your data frames for this purpose. The downside is obviously that you lose the benefits pd.NA brings, such as integer columns with missing data, instead of those columns being converted to float columns.
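(For example, a minimal sketch reusing the test_pd_na frame from above; the astype(float) step reflects the float-conversion downside just mentioned:)

import numpy as np
import pandas as pd
from sklearn.impute import SimpleImputer

df = pd.DataFrame({"A": [1, 2, 3, pd.NA]})
df = df.replace({pd.NA: np.nan}).astype(float)  # integer column becomes float

SimpleImputer(strategy="constant").fit_transform(df)  # no longer raises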
I'm trying to build a classifier which takes an array of floats as an input.
Despite following the steps here and here to include an array as the input feature, I keep getting a TypeError whereby the estimator doesn't recognise the shape of the input.
How do you include an array as a feature for an estimator? Can you simply pass in the numeric_column with an appropriate shape as expected in the docs?
Sample code here:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
z = [[1, 2], [3,4]]
df = pd.DataFrame(z)
df = df.apply(lambda x: np.array(x), axis=1)
feature_columns = []
for col in ['feature']:
    feature_columns.append(feature_column.numeric_column(col, shape=(2,)))
df = pd.DataFrame(df)
df.columns = ['feature']
df['target'] = 1
y_train = df.pop('target')
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(20)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching.
        dataset = dataset.batch(5)
        return dataset
    return input_fn
train_input_fn = make_input_fn(df, y_train)
linear_est = tf.estimator.LinearRegressor(feature_columns)
linear_est.train(train_input_fn, max_steps=100)
which gives this stack trace:
Traceback (most recent call last):
File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydevd_bundle/pydevd_exec2.py", line 3, in Exec
exec(exp, global_vars, local_vars)
File "<string>", line 39, in <module>
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 359, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1139, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1166, in _train_model_default
input_fn, ModeKeys.TRAIN))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1003, in _get_features_and_labels_from_input_fn
self._call_input_fn(input_fn, mode))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1094, in _call_input_fn
return input_fn(**kwargs)
File "<string>", line 23, in input_fn
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 279, in from_tensor_slices
return TensorSliceDataset(tensors)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2091, in __init__
for i, t in enumerate(nest.flatten(tensors))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2091, in <listcomp>
for i, t in enumerate(nest.flatten(tensors))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1050, in convert_to_tensor
return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1108, in convert_to_tensor_v2
as_ref=False)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1186, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 304, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 245, in constant
allow_broadcast=True)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 283, in _constant_impl
allow_broadcast=allow_broadcast))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/tensor_util.py", line 574, in make_tensor_proto
append_fn(tensor_proto, proto_values)
File "tensorflow/python/framework/fast_tensor_util.pyx", line 127, in tensorflow.python.framework.fast_tensor_util.AppendObjectArrayToTensorProto
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got array([1, 2])
I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X; model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
#arrFinal contains all the features and the labels. Last 16 columns are labels and features are from 1 to 521. 17th column from the last is not taken
X = np.array(arrFinal[:, 1:-17])
Xtest = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)), ('rbf', SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
But I am getting the following error:
Traceback (most recent call last):
File "C:\Users\50004182\Documents\callee.py", line 10, in <module>
combine.combine_main(dict_ids,inv_dict_ids,noOfIDs)
File "C:\Users\50004182\Documents\combine.py", line 201, in combine_main
clf.fit(X, Y)
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
for i, column in enumerate(columns))
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 804, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 662, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 570, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 183, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_binary
estimator.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
Xt, fit_params = self._pre_transform(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_transform
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 331, in fit
self.scores_, self.pvalues_ = self.score_func(X, y)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 213, in chi2
if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()
So, after a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
Ensure the dtype is correct (it was originally string):
X = np.array(arrFinal[:, 1:-17]).astype(np.float64)
Xtest = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
Then use VarianceThreshold to remove constant (all-zero) columns prior to chi2:
clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC())
])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
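(To see why the astype(np.float64) step matters, here is a minimal repro sketch with toy arrays standing in for arrFinal: chi2 internally evaluates X < 0, which is undefined when X holds strings.)

import numpy as np
from sklearn.feature_selection import chi2

X_str = np.array([["1", "0"], ["0", "2"]])  # string dtype, like the raw arrFinal slice
y = np.array([0, 1])

# chi2(X_str, y) raises: comparing a string array with 0 is unorderable.
scores, pvalues = chi2(X_str.astype(np.float64), y)  # casting first works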