I am dealing with an error that I am unable to handle. Considering that I am using a simple dataset with both categorical and numerical features, I am a bit surprised by the error ValueError: All columns must be Categorical dtype when 'categories="auto"'.
The dataset before any transformation looks like this:
Dask DataFrame Structure:
card1_hand card2_hand card1_table card2_table card3_table card1_color_hand card2_color_hand card1_color_table card2_color_table card3_color_table actions_preflop actions_flop actions_turn actions_river best_hand prob opp win_proba checker
npartitions=1
int64 int64 int64 int64 int64 object object object object object object object object object object float64 float64 float64 float64
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Dask Name: split, 7 tasks
As I am using dask instead of sklearn, I cannot rely on something like OneHotEncoder(handle_unknown='ignore').
What would you recommend to fix that?
from dask import dataframe as pd  # note: dask.dataframe is aliased as pd throughout
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import MinMaxScaler, OneHotEncoder
from dask_ml.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dask_ml.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std
from numpy import absolute

numeric_features = ['prob', 'opp', 'win_proba', 'checker']
categorical_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table',
                        'card1_color_hand', 'card2_color_hand', 'card1_color_table', 'card2_color_table',
                        'card3_color_table', 'actions_preflop', 'actions_flop', 'actions_turn',
                        'actions_river', 'best_hand']
all_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table', 'card1_color_hand',
                'card2_color_hand', 'card1_color_table', 'card2_color_table', 'card3_color_table',
                'actions_preflop', 'actions_flop', 'actions_turn', 'actions_river', 'best_hand',
                'call', 'prob', 'opp', 'win_proba', 'checker']
output_col = 'call'

def train(x_train, y_train):
    numeric_transformer = MinMaxScaler()
    categorical_transformer = OneHotEncoder()
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LinearRegression()),
        ]
    )
    model.fit(x_train, y_train)
    return model

'''
def evaluate_model(model, x_test, y_test):
    # predict_test = model.predict(x_test)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, x_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores = absolute(scores)
    print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
'''

def predict_from_dict(model, data):
    x = pd.DataFrame.from_dict({key: [value] for key, value in data.items()})
    return model.predict(x)[0]

if __name__ == '__main__':
    df = pd.read_csv('output_test.csv')
    df = df[all_features]
    y = df[output_col]
    x = df.drop(['call'], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    model = train(x_train, y_train)
Traceback below:
Traceback (most recent call last):
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 73, in <module>
model = train(x_train, y_train)
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 46, in train
model.fit(x_train, y_train)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 355, in _fit
**fit_params_steps[name],
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 615, in _fit_transform
for idx, (name, trans, column, weight) in enumerate(transformers, 1)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 1007, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 590, in __init__
self.results = batch()
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_encoders.py", line 488, in fit_transform
return super().fit_transform(X, y)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py", line 855, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 153, in fit
self._fit(X, handle_unknown=self.handle_unknown)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 225, in _fit
"All columns must be Categorical dtype when "
ValueError: All columns must be Categorical dtype when 'categories="auto"'.
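For reference, a minimal sketch of one plausible fix, assuming the df and categorical_features defined above: dask_ml's OneHotEncoder, unlike scikit-learn's, requires its input columns to already have a Categorical dtype, so the categorical columns can be converted first, either on the DataFrame itself or via a Categorizer step inside the pipeline.
# Sketch only: convert the categorical columns before encoding.
from dask_ml.preprocessing import Categorizer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Option 1: categorize up front on the dask DataFrame.
df = df.categorize(columns=categorical_features)

# Option 2: let the pipeline convert; the columns are listed explicitly since
# Categorizer only picks up object/category dtype columns by default (the
# card columns above are int64).
categorical_transformer = Pipeline(steps=[
    ("categorize", Categorizer(columns=categorical_features)),
    ("onehot", OneHotEncoder()),
])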
Consider the following sklearn.pipeline.Pipeline:
pipeline = Pipeline(
    [
        (
            "encoding",
            ColumnTransformer(
                [
                    (
                        "encode categorial",
                        OneHotEncoder(handle_unknown="ignore"),
                        categorial_features,
                    )
                ],
                remainder="passthrough",
            ),
        ),
        ("regressor", regressor),
    ]
)
This pipeline needs to be run on a feature data set X with many missing values. Dropping rows with missing values or imputation would be problematic. Fortunately, lightgbm handles missing values out of the box.
This works well when regressor is defined as follows:
regressor = MultiOutputRegressor(
    lightgbm.LGBMRegressor(n_jobs=-1)
)
However, it fails when using a RegressorChain:
regressor = RegressorChain(
    base_estimator=lightgbm.LGBMRegressor(n_jobs=-1),
    order=list(range(len(params_pipelines["targets"]))),
)
Traceback:
Traceback (most recent call last):
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/concurrent/futures/process.py", line 239, in _process_worker
r = call_item.fn(*call_item.args, **call_item.kwargs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/parallel_runner.py", line 135, in _run_node_synchronization
return run_node(node, catalog, is_async, run_id)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 213, in run_node
node = _run_node_sequential(node, catalog, run_id)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 238, in _run_node_sequential
raise exc
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/runner/runner.py", line 228, in _run_node_sequential
outputs = node.run(inputs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 433, in run
raise exc
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 424, in run
outputs = self._run_with_list(inputs, self._inputs)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/kedro/pipeline/node.py", line 471, in _run_with_list
return self._decorated_func(*[inputs[item] for item in node_inputs])
File "/Users/cls/Documents/Work/Projects/H/PackMeasurement/src/packaging_measurement/schema.py", line 79, in transform_wrapper
out = transform(*args, **kwargs)
File "/Users/cls/Documents/Work/Projects/H/PackMeasurement/src/packaging_measurement/nodes/estimation.py", line 219, in fit_estimator
pipelines[level]
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/pipeline.py", line 394, in fit
self._final_estimator.fit(Xt, y, **fit_params_last_step)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/multioutput.py", line 954, in fit
super().fit(X, Y, **fit_params)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/multioutput.py", line 556, in fit
X, Y = self._validate_data(X, Y, multi_output=True, accept_sparse=True)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/base.py", line 581, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 964, in check_X_y
X = check_array(
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 720, in check_array
array = _ensure_sparse_format(
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 479, in _ensure_sparse_format
_assert_all_finite(spmatrix.data, allow_nan=force_all_finite == "allow-nan")
File "/Users/cls/miniforge3/envs/packaging-measurement/lib/python3.8/site-packages/sklearn/utils/validation.py", line 114, in _assert_all_finite
raise ValueError(
ValueError: Input contains NaN, infinity or a value too large for dtype('float64')
Why does this happen? Is there a reason why the RegressorChain cannot be a drop-in replacement for the MultiOutputRegressor? It is starting to look like a scikit-learn bug.
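A minimal reproduction sketch (made-up data, not from the original project) of what the traceback suggests: RegressorChain.fit validates X itself, since it must append each target's predictions to X for the next link in the chain, so NaNs are rejected before lightgbm ever sees them, whereas MultiOutputRegressor delegates X validation to the base estimator.
import numpy as np
import lightgbm
from sklearn.multioutput import MultiOutputRegressor, RegressorChain

# Tiny made-up data with a NaN in the features.
X = np.array([[1.0, np.nan], [2.0, 3.0], [4.0, 5.0], [6.0, 7.0]])
Y = np.array([[1.0, 2.0], [2.0, 3.0], [3.0, 4.0], [4.0, 5.0]])

MultiOutputRegressor(lightgbm.LGBMRegressor()).fit(X, Y)  # fits: lightgbm tolerates the NaN
RegressorChain(base_estimator=lightgbm.LGBMRegressor()).fit(X, Y)  # raises the ValueError above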
This error occurred when I used scikit-learn to perform model fusion on 7 sub-models. I checked the official documentation and found nothing relevant.
code:
# model fusion
estimators = [('DT', model_dt_x), ('KNN', model_knn_x), ('SVR', model_svr_x), ('ANN', model_ann_x), ('RF', model_rf_x), ('GBDT', model_gbdt_x), ('XGBT', model_xgbt_x)]
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor.fit(X_train, y_train)
error:
Traceback (most recent call last):
File "i:/Lab/20210xxx/ex.py", line 86, in <module>
stacking_regressor.fit(X_train, y_train)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 680, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 148, in fit
for est in all_estimators if est != 'drop'
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 921, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 182, in
apply_async
result = ImmediateResult(func)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py", line 40, in
_fit_single_estimator
estimator.fit(X, y)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 641, in fit
return self._fit(X, y, incremental=False)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 321, in _fit
self._validate_hyperparameters()
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 385, in _validate_hyperparameters
if self.max_fun <= 0:
TypeError: '<=' not supported between instances of 'NoneType' and 'int'
Problem solved. The parameters copied via get_params() included None values that fail validation at fit time, so I set them explicitly:
model_ann_x = MLPRegressor(**model_ann.get_params())
model_ann_x.set_params(max_fun=15000)  # max_fun was None; set explicitly to avoid the TypeError
model_rf_x = RandomForestRegressor(**model_rf.get_params())
model_rf_x.set_params(ccp_alpha=0.0)  # same workaround
model_gbdt_x = GradientBoostingRegressor(**model_gbdt.get_params())
model_gbdt_x.set_params(ccp_alpha=0.0)  # same workaround
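A hedged diagnostic sketch, assuming the sub-models above: since StackingRegressor clones and refits every estimator, a None-valued hyperparameter only blows up at fit time, and listing the None-valued params per sub-model helps spot the offenders (many Nones, such as random_state, are legitimate defaults, so this is only a starting point):
# Hypothetical check, assuming the estimators list defined above.
for name, est in estimators:
    none_params = [k for k, v in est.get_params().items() if v is None]
    print(name, none_params)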
I'm trying to tune the alpha parameter of a Multinomial Naive Bayes classifier on the 20newsgroups dataset. This is my code so far:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np
# Divide dataset
dataset_train = fetch_20newsgroups(subset='train', shuffle=True)
dataset_test = fetch_20newsgroups(subset='test', shuffle=True)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(sublinear_tf=True)),
                     ('clf', MultinomialNB())])
param_grid = {'tfidf__use_idf': (True, False),
              'clf__alpha': np.linspace(0.001, 1, 100)}
grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='precision', cv = None)
# Training
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
#prediction
predicted = text_clf.predict(dataset_test.data)
print("NB Accuracy:", 100*np.mean(predicted == dataset_test.target), '%')
print(classification_report(dataset_test.target, predicted, target_names=dataset_train.target_names))
print("Best estimator for alpha in order to get precision ", grid_search.best_estimator_)
The problem is I'm getting the following error:
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
Traceback (most recent call last):
File "<ipython-input-12-d478372ef22a>", line 1, in <module>
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/omarl/Downloads/new_NB.py", line 28, in <module>
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 197, in _fit
step, param = pname.split('__', 1)
ValueError: not enough values to unpack (expected 2, got 1)
I have no clue why this is happening, because from the code I have reviewed so far this should work. I also searched the scikit-learn website but didn't find anything. Thanks.
In this line:
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
average=None is being interpreted as a fit_param, which is not what you intend.
After removing this, you will get this error:
ValueError: Target is multiclass but average='binary'. Please choose another average setting.
This is because precision, with its default binary averaging, is not defined in the multi-class setting. If you change your scoring parameter to 'accuracy', the code works.
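If precision is what you actually want, a hedged sketch (reusing text_clf and param_grid from the question) is to pick an explicit multi-class averaging instead of the implicit binary one:
from sklearn.metrics import make_scorer, precision_score

# scoring='precision_macro' as a string would work too; make_scorer is the explicit form.
grid_search = GridSearchCV(text_clf, param_grid=param_grid,
                           scoring=make_scorer(precision_score, average='macro'),
                           cv=None)
grid_search.fit(dataset_train.data, dataset_train.target)  # note: no average= kwarg here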
I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X. Model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
# arrFinal contains all the features and the labels. The last 16 columns are labels
# and the features run from column 1 to 521; the 17th column from the end is not taken.
X = np.array(arrFinal[:, 1:-17])
Xtest = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)), ('rbf', SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
But I am getting the following error:
Traceback (most recent call last):
File "C:\Users\50004182\Documents\\callee.py", line 10, in <module>
combine.combine_main(dict_ids,inv_dict_ids,noOfIDs)
File "C:\Users\50004182\Documents\combine.py", line 201, in combine_main
clf.fit(X, Y)
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
for i, column in enumerate(columns))
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 804, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 662, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 570, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 183, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_binary
estimator.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
Xt, fit_params = self._pre_transform(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_transform
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 331, in fit
self.scores_, self.pvalues_ = self.score_func(X, y)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 213, in chi2
if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()
So, after a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
Ensure the type is correct (it was originally string):
X=np.array(arrFinal[:,1:-17]).astype(np.float64)
Xtest=np.array(X)
Y=np.array(arrFinal[:,522:]).astype(int)
Then use VarianceThreshold to remove constant (all-zero) columns prior to chi2:
clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC()),
])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
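For completeness, a hedged illustration of the root cause: chi2 internally evaluates X < 0 (visible in the traceback above), and comparing a string-typed NumPy array with an int raises exactly this kind of TypeError under Python 3, which is why the astype(np.float64) conversion is the real fix:
import numpy as np

arr = np.array([['1', '2'], ['3', '4']])  # string dtype, as arrFinal originally was
# arr < 0                                 # raises a TypeError ("unorderable types" on Python 3.4)
arr.astype(np.float64) < 0                # fine: elementwise boolean comparison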