ValueError: not enough values to unpack in GridSearchCV with scikit-learn - Python

I'm trying to tune the alpha parameter of a Multinomial Naive Bayes classifier on the 20 Newsgroups dataset. This is my code so far:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

# Divide dataset
dataset_train = fetch_20newsgroups(subset='train', shuffle=True)
dataset_test = fetch_20newsgroups(subset='test', shuffle=True)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(sublinear_tf=True)),
                     ('clf', MultinomialNB())])

param_grid = {'tfidf__use_idf': (True, False),
              'clf__alpha': np.linspace(0.001, 1, 100)}

grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='precision', cv=None)

# Training
text_clf = grid_search.fit(dataset_train.data, dataset_train.target, average=None)

# Prediction
predicted = text_clf.predict(dataset_test.data)

print("NB Accuracy:", 100 * np.mean(predicted == dataset_test.target), '%')
print(classification_report(dataset_test.target, predicted, target_names=dataset_train.target_names))
print("Best estimator for alpha in order to get precision ", grid_search.best_estimator_)
The problem is I'm getting the following error:
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
Traceback (most recent call last):
File "<ipython-input-12-d478372ef22a>", line 1, in <module>
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/omarl/Downloads/new_NB.py", line 28, in <module>
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 197, in _fit
step, param = pname.split('__', 1)
ValueError: not enough values to unpack (expected 2, got 1)
I have no clue why this is happening; from the code I have reviewed so far, this should work. I also searched the scikit-learn website but didn't find anything. Thanks.

In this line:
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
average=None is being interpreted as a fit_param, which is not what you intend. Pipeline.fit expects fit parameter names of the form step__param, so pname.split('__', 1) on 'average' fails with the unpacking error shown in your traceback. After removing it, you will get this error instead:
ValueError: Target is multiclass but average='binary'. Please choose another average setting.
This is because the default 'precision' scorer uses average='binary', which is not defined for multi-class targets. If you change your scoring parameter to 'accuracy' (or a multi-class precision average such as 'precision_macro'), the code works.
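A minimal sketch of the corrected search, with the stray keyword dropped and an accuracy scorer; the pipeline and grid are unchanged from the question:

# Same text_clf and param_grid as above; the two fixes are the scoring
# string and the removal of average=None from fit().
grid_search = GridSearchCV(text_clf, param_grid=param_grid,
                           scoring='accuracy', cv=None)
grid_search.fit(dataset_train.data, dataset_train.target)

predicted = grid_search.predict(dataset_test.data)
print("Best alpha:", grid_search.best_params_['clf__alpha'])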

Related

Dask one-hot encoding - ValueError: All columns must be Categorical dtype when 'categories="auto"'

I am dealing with an error that I am unable to handle. Considering that I am using a simple dataset that has both categorical and numerical features, I am a bit surprised by the error ValueError: All columns must be Categorical dtype when 'categories="auto"'.
The structure of the dataset before any transformation is shown after the traceback below.
As I am using dask instead of sklearn, I cannot rely on something like OneHotEncoder(handle_unknown='ignore')
What would you recommend to fix that?
from dask import dataframe as pd  # dask.dataframe aliased as pd
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import MinMaxScaler, OneHotEncoder
from dask_ml.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dask_ml.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std
from numpy import absolute

numeric_features = ['prob', 'opp', 'win_proba', 'checker']
categorical_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table',
                        'card1_color_hand', 'card2_color_hand', 'card1_color_table', 'card2_color_table',
                        'card3_color_table', 'actions_preflop', 'actions_flop', 'actions_turn',
                        'actions_river', 'best_hand']
all_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table', 'card1_color_hand',
                'card2_color_hand', 'card1_color_table', 'card2_color_table', 'card3_color_table',
                'actions_preflop', 'actions_flop', 'actions_turn', 'actions_river', 'best_hand',
                'call', 'prob', 'opp', 'win_proba', 'checker']
output_col = 'call'

def train(x_train, y_train):
    numeric_transformer = MinMaxScaler()
    categorical_transformer = OneHotEncoder()
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LinearRegression())
        ]
    )
    model.fit(x_train, y_train)
    return model

'''
def evaluate_model(model, x_test, y_test):
    #predict_test = model.predict(x_test)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, x_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores = absolute(scores)
    print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
'''

def predict_from_dict(model, data):
    x = pd.DataFrame.from_dict({key: [value] for key, value in data.items()})
    return model.predict(x)[0]

if __name__ == '__main__':
    df = pd.read_csv('output_test.csv')
    df = df[all_features]
    y = df[output_col]
    x = df.drop(['call'], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    model = train(x_train, y_train)
Traceback below
Traceback (most recent call last):
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 73, in <module>
model = train(x_train, y_train)
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 46, in train
model.fit(x_train, y_train)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 355, in _fit
**fit_params_steps[name],
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 615, in _fit_transform
for idx, (name, trans, column, weight) in enumerate(transformers, 1)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 1007, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 590, in __init__
self.results = batch()
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_encoders.py", line 488, in fit_transform
return super().fit_transform(X, y)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py", line 855, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 153, in fit
self._fit(X, handle_unknown=self.handle_unknown)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 225, in _fit
"All columns must be Categorical dtype when "
ValueError: All columns must be Categorical dtype when 'categories="auto"'.
The Dask DataFrame structure is:
Dask DataFrame Structure:
card1_hand card2_hand card1_table card2_table card3_table card1_color_hand card2_color_hand card1_color_table card2_color_table card3_color_table actions_preflop actions_flop actions_turn actions_river best_hand prob opp win_proba checker
npartitions=1
int64 int64 int64 int64 int64 object object object object object object object object object object float64 float64 float64 float64
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Dask Name: split, 7 tasks
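The error message itself points at the direction of a fix: with categories="auto", dask-ml's OneHotEncoder requires its input columns to already have Categorical dtype, and the DataFrame structure above shows they are plain object columns. A minimal sketch of one way to do that, assuming dask_ml.preprocessing.Categorizer fits this pipeline (a suggestion, not a confirmed solution for this dataset):

from sklearn.pipeline import Pipeline
from dask_ml.compose import ColumnTransformer
from dask_ml.preprocessing import Categorizer, MinMaxScaler, OneHotEncoder

# Hedged sketch: Categorizer converts the object columns selected by the
# ColumnTransformer to Categorical dtype, which is what dask-ml's
# OneHotEncoder needs when categories="auto". numeric_features and
# categorical_features are the lists defined in the question.
categorical_transformer = Pipeline(steps=[
    ("categorize", Categorizer()),
    ("onehot", OneHotEncoder()),
])

preprocessor = ColumnTransformer(
    transformers=[
        ("num", MinMaxScaler(), numeric_features),
        ("cat", categorical_transformer, categorical_features),
    ]
)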

ZeroDivisionError when using sklearn's BaggingClassifier with GridSearchCV

I'm trying to improve a perfectly working Bernoulli Naive Bayes model with bagging.
But when I try to cross-validate the BaggingClassifier, I get a very unexpected ZeroDivisionError coming from parallel.py.
I've tried to change all the parameters I know and restarted Python, but nothing worked.
Here is a reproducible example with a binary-modified iris dataset:
#%% run
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.datasets import load_iris

data = load_iris()
data.targetbin = (data.target != 0).astype("int")

param_grid2 = {'max_samples': np.linspace(0.5, 1.0, 3),
               'base_estimator__alpha': np.linspace(0.1, 1, 3),
               'base_estimator__binarize': [*np.linspace(0.0, 1, 3)],
               'base_estimator__fit_prior': [True, False]}
param_grid2 = {'max_samples': [0.7]}  # overrides the grid above

clf = GridSearchCV(
    BaggingClassifier(
        BernoulliNB(),
        n_estimators=10, max_features=0.5),
    param_grid2,
    scoring="accuracy",
    verbose=-1)
clf.fit(data.data, data.targetbin)
And here is the stacktrace of my error:
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
  File "", line 33, in <module>
    clf.fit(data.data, data.targetbin)
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
    self._run_search(evaluate_candidates)
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
    evaluate_candidates(ParameterGrid(self.param_grid))
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
    cv.split(X, y, groups)))
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
    if self.dispatch_one_batch(iterator):
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
    job = self._backend.apply_async(batch, callback=cb)
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 184, in apply_async
    callback(result)
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 306, in __call__
    self.parallel.print_progress()
  File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 806, in print_progress
    if (is_last_item or cursor % frequency):
ZeroDivisionError: integer division or modulo by zero
What am I doing wrong?
I tried to debug the library and found that self.verbose in sklearn/externals/joblib/parallel.py is -1, even though it is supposed to be at least 0 by default. So I think the negative verbose passed to GridSearchCV is the cause.
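A minimal sketch of the fix under that diagnosis, assuming the only change needed is a non-negative verbose (joblib's print_progress derives a modulo frequency from verbose, which ends up zero when verbose is negative); imports and data are as in the question above:

# Hedged sketch: identical search, but verbose >= 0 so joblib's
# progress printer never computes a zero modulo frequency.
clf = GridSearchCV(
    BaggingClassifier(
        BernoulliNB(),
        n_estimators=10, max_features=0.5),
    param_grid2,
    scoring="accuracy",
    verbose=0)  # 0 (silent) or any positive int

clf.fit(data.data, data.targetbin)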

Memory error in Python while using sklearn

I am trying to test my logistic regression model but I get a memory error and cannot solve it. Is it because my sentences take too much space? I would appreciate any help.
From Line 267 in my code:
self.X, self.y = self.transform_to_dataset(training_sentences, _pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
And the error I get after running this:
Traceback (most recent call last):
File "tagger_lr_chunk.py", line 342, in <module>
tagger.train(data_dir + 'train.txt')
File "tagger_lr_chunk.py", line 271, in train
self.clf.fit(self.X, self.y)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 230, in fit_transform
return self._transform(X, fitting=True)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 204, in _transform
result_matrix = result_matrix.toarray()
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/compressed.py", line 943, in toarray
out = self._process_toarray_args(order, out)
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.py", line 1130, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
I solved this memory issue by changing the sparse parameter of DictVectorizer so that it produces scipy.sparse matrices. With sparse=False the vectorizer calls toarray() and materializes a dense array the size of the full feature matrix, which is the np.zeros allocation that fails in the traceback above.
self.X, self.y = self.transform_to_dataset(training_sentences, _pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)

ValueError: could not convert string to float: 'Status'

To do logistic regression in Python, this is my code below:
Imported dataset: Facebook Metrics
import numpy as np
import pandas
from sklearn import model_selection
from sklearn.linear_model import LogisticRegression
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.neighbors import KNeighborsClassifier
from sklearn.tree import DecisionTreeClassifier
from sklearn.naive_bayes import GaussianNB
from sklearn.svm import SVC

# Load dataset
url = "dataset_Facebook.csv"
dataset1 = pandas.read_csv(url, sep=";", header=0)

# Split-out validation dataset
array = dataset1.values
X = array[:, 0:4]
Y = array[:, 4]
validation_size = 0.20
seed = 7
X_train, X_validation, Y_train, Y_validation = model_selection.train_test_split(
    X, Y, test_size=validation_size, random_state=seed)

# Test options and evaluation metric
seed = 7
scoring = 'accuracy'

# Spot-check algorithms
models = []
models.append(('LR', LogisticRegression()))
models.append(('LDA', LinearDiscriminantAnalysis()))
models.append(('KNN', KNeighborsClassifier()))
models.append(('CART', DecisionTreeClassifier()))
models.append(('NB', GaussianNB()))
models.append(('SVM', SVC()))

# Evaluate each model in turn
results = []
names = []
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = np.log10(model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring))
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
When running the program, I get this set of errors:
Traceback (most recent call last):
File "/Users/ernestsoo/Desktop/WESTWORLD (Season 01) DUB 720/Assignment2.JackyTen.ErnestSoo/assignment2.py", line 93, in <module>
cv_results = np.log10(model_selection.cross_val_score(model, X_train, Y_train, cv=kfold, scoring=scoring))
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/model_selection/_validation.py", line 140, in cross_val_score
for train, test in cv_iter)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/model_selection/_validation.py", line 238, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/linear_model/logistic.py", line 1173, in fit
order="C")
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 521, in check_X_y
ensure_min_features, warn_on_dtype, estimator)
File "/Library/Frameworks/Python.framework/Versions/3.5/lib/python3.5/site-packages/sklearn/utils/validation.py", line 382, in check_array
array = np.array(array, dtype=dtype, order=order, copy=copy)
ValueError: could not convert string to float: 'Status'
Because it seemed like a data-type problem, I tried casting the dataset values to float:
array = float(dataset1.values)
But this is not working.
How can I solve this problem?
ValueError: could not convert string to float: 'Status'
This error means that at some point your code is trying to convert the string 'Status' to float. Casting your data to float won't solve the problem. The problem is that your code is trying to cast something it shouldn't.
If you execute float("Hello"), it raises ValueError: could not convert string to float: 'Hello'. Use the error information to debug your code: try to find where the 'Status' string is being passed where a float is expected.
Hope it will help you to debug your code.
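A minimal sketch of that debugging step. In the Facebook Metrics dataset the string 'Status' typically comes from a categorical column such as Type; treating that column name as an assumption, one hypothetical way to make the sliced columns numeric before building X and Y:

import pandas

dataset1 = pandas.read_csv("dataset_Facebook.csv", sep=";", header=0)
print(dataset1.dtypes)  # reveals which columns hold strings (dtype object)

# Hypothetical fix: encode the assumed string column 'Type' as integer
# category codes so that slicing dataset1.values yields purely numeric data.
dataset1["Type"] = dataset1["Type"].astype("category").cat.codes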

Error during feature selection

I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X; model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
# arrFinal contains all the features and the labels. The last 16 columns are
# labels; features are columns 1 to 521 (the 17th column from the end is not used).
X = np.array(arrFinal[:, 1:-17])
X_test = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)), ('rbf', SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(X_test)
But I am getting the following error:
Traceback (most recent call last):
  File "C:\Users\50004182\Documents\\callee.py", line 10, in <module>
    combine.combine_main(dict_ids,inv_dict_ids,noOfIDs)
  File "C:\Users\50004182\Documents\combine.py", line 201, in combine_main
    clf.fit(X, Y)
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
    for i, column in enumerate(columns))
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 804, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 662, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 570, in _dispatch
    job = ImmediateComputeBatch(batch)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 183, in __init__
    self.results = batch()
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_binary
    estimator.fit(X, y)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
    Xt, fit_params = self._pre_transform(X, y, **fit_params)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_transform
    Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
  File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 331, in fit
    self.scores_, self.pvalues_ = self.score_func(X, y)
  File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 213, in chi2
    if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()
After a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
Ensure the type is correct (it was originally string):
X = np.array(arrFinal[:, 1:-17]).astype(np.float64)
X_test = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
Then, first in the pipeline, use VarianceThreshold to remove constant (all-zero) columns prior to chi2. The astype(np.float64) cast is what clears the TypeError (chi2 evaluates X < 0, which is unorderable on string data); VarianceThreshold then drops constant columns before selection:
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.multiclass import OneVsRestClassifier
from sklearn.pipeline import Pipeline
from sklearn.svm import SVC

clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC())
])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(X_test)
