Related
I am trying to predict toxic comments using the Toxic Comment dataset from Kaggle:
import skmultilearn, sys
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
y = csr_matrix(data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])
binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(Xfeatures,y)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [y.columns.values[prediction].tolist() for prediction in br_prediction]
print(predictions)
However, I got this error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\base\base.py", line 86, in _ensure_input_format
return X.toarray()
File "...\scipy\sparse\compressed.py", line 1031, in toarray
out = self._process_toarray_args(order, out)
File "...\scipy\sparse\base.py", line 1202, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 226. GiB for an array with shape (159571, 189775) and data type float64
And even if I try to pass the param require_dense=False, I get another error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\skmultilearn\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\sklearn\naive_bayes.py", line 612, in fit
X, y = self._check_X_y(X, y)
File "...\sklearn\naive_bayes.py", line 477, in _check_X_y
return self._validate_data(X, y, accept_sparse='csr')
File "...\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 826, in check_X_y
y = column_or_1d(y, warn=True)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 864, in column_or_1d
raise ValueError(
ValueError: y should be a 1d array, got an array of shape () instead.
How can I fix that and train on the entire dataset?
It seems that you specified the require_dense argument incorrectly. You need require_dense=[False, True] in order to pass the X values in sparse format but not the y values. In the second-to-last row (predictions = ...) you need to use the label DataFrame from before it was converted to a sparse matrix (cats below), so you can access the column names.
The following code should work.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
import numpy as np
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
cats = data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
y = csr_matrix(cats)
binary_rel_clf = BinaryRelevance(MultinomialNB(), require_dense=[False, True])
binary_rel_clf.fit(Xfeatures, y)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [cats.columns[prediction].tolist() for prediction in br_prediction]
print(predictions)
Output:
[['toxic', 'obscene', 'insult']]
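If you also want per-label confidence scores, skmultilearn's BinaryRelevance additionally exposes a predict_proba method, which, like predict, should return a sparse matrix. A short sketch reusing the objects fitted above:

probabilities = binary_rel_clf.predict_proba(X_predict).toarray()
# One probability per label, in the same column order as cats
for label, p in zip(cats.columns, probabilities[0]):
    print(label, round(float(p), 3))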
I'm trying to build a classifier that takes an array of floats as an input.
Despite following the steps here and here to include an array as the input feature, I keep getting a TypeError whereby the estimator doesn't recognise the shape of the input.
How do you include an array as a feature for an estimator? Can you simply pass in the numeric_column with an appropriate shape, as expected in the docs?
Sample code here:
import numpy as np
import pandas as pd
import tensorflow as tf
from tensorflow import feature_column
z = [[1, 2], [3,4]]
df = pd.DataFrame(z)
df = df.apply(lambda x: np.array(x), axis=1)
feature_columns = []
for col in ['feature']:
    feature_columns.append(feature_column.numeric_column(col, shape=(2,)))
df = pd.DataFrame(df)
df.columns = ['feature']
df['target'] = 1
y_train = df.pop('target')
def make_input_fn(X, y, n_epochs=None, shuffle=True):
    def input_fn():
        dataset = tf.data.Dataset.from_tensor_slices((dict(X), y))
        if shuffle:
            dataset = dataset.shuffle(20)
        # For training, cycle through the dataset as many times as needed (n_epochs=None).
        dataset = dataset.repeat(n_epochs)
        # In-memory training doesn't use batching.
        dataset = dataset.batch(5)
        return dataset
    return input_fn
train_input_fn = make_input_fn(df, y_train)
linear_est = tf.estimator.LinearRegressor(feature_columns)
linear_est.train(train_input_fn, max_steps=100)
which gives a stack trace of
Traceback (most recent call last):
File "/Applications/PyCharm.app/Contents/helpers/pydev/_pydevd_bundle/pydevd_exec2.py", line 3, in Exec
exec(exp, global_vars, local_vars)
File "<string>", line 39, in <module>
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 359, in train
loss = self._train_model(input_fn, hooks, saving_listeners)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1139, in _train_model
return self._train_model_default(input_fn, hooks, saving_listeners)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1166, in _train_model_default
input_fn, ModeKeys.TRAIN))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1003, in _get_features_and_labels_from_input_fn
self._call_input_fn(input_fn, mode))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow_estimator/python/estimator/estimator.py", line 1094, in _call_input_fn
return input_fn(**kwargs)
File "<string>", line 23, in input_fn
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 279, in from_tensor_slices
return TensorSliceDataset(tensors)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2091, in __init__
for i, t in enumerate(nest.flatten(tensors))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/data/ops/dataset_ops.py", line 2091, in <listcomp>
for i, t in enumerate(nest.flatten(tensors))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1050, in convert_to_tensor
return convert_to_tensor_v2(value, dtype, preferred_dtype, name)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1108, in convert_to_tensor_v2
as_ref=False)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/ops.py", line 1186, in internal_convert_to_tensor
ret = conversion_func(value, dtype=dtype, name=name, as_ref=as_ref)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 304, in _constant_tensor_conversion_function
return constant(v, dtype=dtype, name=name)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 245, in constant
allow_broadcast=True)
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/constant_op.py", line 283, in _constant_impl
allow_broadcast=allow_broadcast))
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/framework/tensor_util.py", line 574, in make_tensor_proto
append_fn(tensor_proto, proto_values)
File "tensorflow/python/framework/fast_tensor_util.pyx", line 127, in tensorflow.python.framework.fast_tensor_util.AppendObjectArrayToTensorProto
File "/Users/nicholashilton/.virtualenvs/fantifi/lib/python3.7/site-packages/tensorflow/python/util/compat.py", line 61, in as_bytes
(bytes_or_text,))
TypeError: Expected binary or unicode string, got array([1, 2])
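For what it's worth, the TypeError comes from the object-dtype column: after the apply, each cell of df['feature'] holds an entire numpy array, and tf.data.Dataset.from_tensor_slices cannot convert an object array to a tensor. A minimal sketch of one possible workaround, reusing the question's variable names, is to stack the column into a single dense array before building the dataset:

import numpy as np
import tensorflow as tf

# Stack the column of arrays into one dense (n_samples, 2) float array;
# from_tensor_slices can convert that directly.
features = np.stack(df['feature'].values).astype(np.float32)
dataset = tf.data.Dataset.from_tensor_slices(({'feature': features}, y_train.values))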
I'm having trouble fitting an instance of an MLkNN model (from scikit-multilearn) after running GridSearchCV (from scikit-learn). I am getting an error. Here is the relevant code:
# From MachineLearningMastery: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
def series_to_supervised(n_lags, n_vars, data, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence (t-n, ..., t-1); i counts down from n_lags to 1
    for i in range(n_lags, 0, -1):
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence (t, t+1, ..., t+n)
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    agg = concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
def testexamples():
    def average_precision_wrapper(estimator, X, y):
        if X.ndim == 2:
            X = X.reshape((-1))  # [1, 1497] becomes [1497,], needed for average_precision
        if y.ndim == 2:
            y = y.reshape((-1))  # [1, 1497] ... as above
        y_pred = estimator.predict(X).toarray()
        return average_precision_score(y, y_pred)

    true_values = np.random.choice([0, 1], size=(500, 1497), p=[0.99, 0.01])
    # Need to convert this to supervised learning. Use previous 2 days to predict (lag=2).
    n_lags = 2
    n_vars = true_values.shape[1]
    all_data = np.asarray(series_to_supervised(n_lags, n_vars, data=true_values))
    train_x = all_data[:400, :int(n_vars*n_lags)]
    train_y = all_data[:400, int(n_vars*n_lags):]
    test_x = all_data[-100:, :int(n_vars*n_lags)]
    test_y = all_data[-100:, int(n_vars*n_lags):]
    parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
    checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
    print('type: train_x: ', type(train_x), ' type: train_y: ', type(train_y))
    checked_model.fit(train_x, train_y)
Full trace:
user#GPU8:~/path/to/dir$ python May15_mlknn.py
type: train_x: <type 'numpy.ndarray'> type: train_y: <type 'numpy.ndarray'>
Traceback (most recent call last):
File "May15_mlknn.py", line 380, in <module>
testexamples()
File "May15_mlknn.py", line 340, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py", line 196, in __call__
return self._sign * self._score_func(y, y_pred, **self._kwargs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/ranking.py", line 184, in average_precision_score
average, sample_weight=sample_weight)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/base.py", line 88, in _average_binary_score
y_score = check_array(y_score)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 380, in check_array
force_all_finite)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 243, in _ensure_sparse_format
raise TypeError('A sparse matrix was passed, but dense '
TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
I have already seen this and this and this. My question is different because I checked the types of train_x and train_y, and both are dense numpy arrays.
What am I doing wrong and how can I fix it?
EDIT:
I'm now trying the answer provided below, but with a modification due to the error I got (answer here):
def average_precision_wrapper(estimator, X, y):
    if X.ndim == 2:
        X = X.reshape((-1))  # (1, 1497) becomes (1497,), needed for average_precision
    if y.ndim == 2:
        y = y.reshape((-1))  # (1, 1497) ... as above
    y_pred = estimator.predict(X).toarray()
    return average_precision_score(y, y_pred)
EDIT 2: That was no good after all; flattening the test sample to 1-D means its dimension no longer matches the training data, and I get ValueError: query data dimension must match training data dimension. Here's the trace:
/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
DeprecationWarning)
Traceback (most recent call last):
File "May15_mlknn_to_so.py", line 393, in <module>
testexamples()
File "May15_mlknn_to_so.py", line 353, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "May15_mlknn_to_so.py", line 307, in average_precision_wrapper
y_pred = estimator.predict(X).toarray()
File "May15_mlknn_to_so.py", line 237, in predict
self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours, return_distance=False)]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py", line 381, in kneighbors
for s in gen_even_slices(X.shape[0], n_jobs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "sklearn/neighbors/binary_tree.pxi", line 1294, in sklearn.neighbors.kd_tree.BinaryTree.query (sklearn/neighbors/kd_tree.c:11337)
ValueError: query data dimension must match training data dimension
The MLkNN.predict method returns a scipy.sparse matrix, while the 'average_precision' scorer expects a dense numpy array. You can write a small wrapper that makes this conversion yourself:
import numpy as np
from sklearn.model_selection import GridSearchCV
from skmultilearn.adapt import MLkNN
from sklearn.metrics import average_precision_score

def average_precision_wrapper(estimator, X, y):
    y_pred = estimator.predict(X).toarray()
    return average_precision_score(y, y_pred)

# Make dummy features of shape (100, 5)
train_x = np.random.random((100, 5))
# Make dummy one-hot encoded labels of shape (100, 4)
train_y = np.zeros((100, 4), dtype=int)
for i in range(100):
    train_y[i, i % 4] = 1

parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
checked_model = GridSearchCV(MLkNN(), parameters, scoring=average_precision_wrapper)
checked_model.fit(train_x, train_y)
Solved, with help from user2653663: I changed the metric to the Hamming loss, but had to create a scorer to do that using make_scorer from sklearn.metrics.
import time
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import hamming_loss, make_scorer
from skmultilearn.adapt import MLkNN

parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='f1_samples')
start = time.time()
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
hloss_scorer = make_scorer(hamming_loss, greater_is_better=False)
checked_model = GridSearchCV(MLkNN(), parameters, scoring=hloss_scorer)
checked_model.fit(train_x, train_y)
end = time.time()
print('best parameters: ', checked_model.best_params_, 'best Hamming loss: ', checked_model.best_score_)

best_model = MLkNN(k=checked_model.best_params_['k'], s=checked_model.best_params_['s'])
best_model.fit(train_x, train_y)
pred_values = best_model.predict(test_x)  # returns 0/1 classes, not probabilities
pred_values = np.array(pred_values.todense())
true_values = test_y

# Metrics
bincross = []
ap = []
ap_weighted = []
h_loss = []
for i in range(1, pred_values.shape[0]):
    true_vals = true_values[i, :]
    pred_vals = pred_values[i, :]
    pred_vals = np.squeeze(pred_vals)
    h_loss.append(hamming_loss(true_vals, pred_vals))

print("***********************")
print("MLKNN with k=best")
print("***********************")
print("Hamming loss: ", h_loss)
h_loss = np.asarray(h_loss)
print("total Hamming loss: ", np.sum(h_loss))
I am trying to test my logistic regression model, but I get a memory error and cannot solve it. Is it because my sentences take too much space? I would appreciate any help.
From Line 267 in my code:
self.X, self.y = self.transform_to_dataset(training_sentences,_pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
And the error I get after running this:
Traceback (most recent call last):
File "tagger_lr_chunk.py", line 342, in <module>
tagger.train(data_dir + 'train.txt')
File "tagger_lr_chunk.py", line 271, in train
self.clf.fit(self.X, self.y)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 230, in fit_transform
return self._transform(X, fitting=True)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 204, in _transform
result_matrix = result_matrix.toarray()
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/compressed.py", line 943, in toarray
out = self._process_toarray_args(order, out)
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.py", line 1130, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
I solved this memory issue by changing the sparse parameter of DictVectorizer so that it produces scipy.sparse matrices:
self.X, self.y = self.transform_to_dataset(training_sentences,_pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
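For illustration, here is a minimal, self-contained sketch (toy feature dicts, not the question's data) of what the parameter changes: with sparse=True the vectorizer returns a scipy.sparse matrix instead of materializing a huge dense array, and LogisticRegression accepts sparse input directly, so nothing downstream forces a dense conversion.

from sklearn.feature_extraction import DictVectorizer

# With sparse=True the result stays sparse end to end
vec = DictVectorizer(sparse=True)
X = vec.fit_transform([{'word=the': 1, 'pos=DT': 1},
                       {'word=dog': 1, 'pos=NN': 1}])
print(type(X))  # a scipy.sparse matrix, not a dense ndarray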
I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X; model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
#arrFinal contains all the features and the labels. Last 16 columns are labels and features are from 1 to 521. 17th column from the last is not taken
X = np.array(arrFinal[:, 1:-17])
Xtest = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)), ('rbf', SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
But I am getting the following error:
Traceback (most recent call last):
File "C:\Users\50004182\Documents\callee.py", line 10, in <module>
combine.combine_main(dict_ids,inv_dict_ids,noOfIDs)
File "C:\Users\50004182\Documents\combine.py", line 201, in combine_main
clf.fit(X, Y)
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
for i, column in enumerate(columns))
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 804, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 662, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 570, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 183, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_binary
estimator.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
Xt, fit_params = self._pre_transform(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_transform
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 331, in fit
self.scores_, self.pvalues_ = self.score_func(X, y)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 213, in chi2
if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()
So, after a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
First, ensure the type is correct (it was originally string):
X = np.array(arrFinal[:, 1:-17]).astype(np.float64)
Xtest = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
Then use VarianceThreshold to remove constant (all-zero) columns prior to chi2:
clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC())
])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(Xtest)
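For context on the original TypeError: building the array from text columns without an explicit dtype yields string entries, and chi2's internal non-negativity check (X < 0) cannot compare those to an int under Python 3. A small illustration (toy array, not the question's data) of why the astype call matters:

import numpy as np

# Strings sneak in when the array comes from text columns
X_str = np.array([['1.0', '2.5'], ['0.0', '3.1']])
print(X_str.dtype)              # '<U3' -- strings, so comparing against 0 fails
X_num = X_str.astype(np.float64)
print(bool((X_num < 0).any()))  # False -- chi2's non-negativity check now passes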