Related
I am dealing with an error that I am unable to resolve. Given that I am using a simple dataset with both categorical and numerical features, I am a bit surprised by the error ValueError: All columns must be Categorical dtype when 'categories="auto"'.
The dataset before any transformation looks like this (the Dask DataFrame structure is shown after the traceback below).
As I am using dask_ml rather than plain scikit-learn, I cannot rely on something like OneHotEncoder(handle_unknown='ignore').
What would you recommend to fix this?
from dask import dataframe as pd  # note: dask.dataframe aliased as pd
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import MinMaxScaler, OneHotEncoder
from dask_ml.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dask_ml.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std
from numpy import absolute

numeric_features = ['prob', 'opp', 'win_proba', 'checker']
categorical_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table',
                        'card1_color_hand', 'card2_color_hand', 'card1_color_table', 'card2_color_table',
                        'card3_color_table', 'actions_preflop', 'actions_flop', 'actions_turn',
                        'actions_river', 'best_hand']
all_features = ['card1_hand', 'card2_hand', 'card1_table', 'card2_table', 'card3_table', 'card1_color_hand',
                'card2_color_hand', 'card1_color_table', 'card2_color_table', 'card3_color_table',
                'actions_preflop', 'actions_flop', 'actions_turn', 'actions_river', 'best_hand',
                'call', 'prob', 'opp', 'win_proba', 'checker']
output_col = 'call'

def train(x_train, y_train):
    numeric_transformer = MinMaxScaler()
    categorical_transformer = OneHotEncoder()
    preprocessor = ColumnTransformer(
        transformers=[
            ("num", numeric_transformer, numeric_features),
            ("cat", categorical_transformer, categorical_features),
        ]
    )
    model = Pipeline(
        steps=[
            ("preprocessor", preprocessor),
            ("classifier", LinearRegression())
        ]
    )
    model.fit(x_train, y_train)
    return model

'''
def evaluate_model(model, x_test, y_test):
    #predict_test = model.predict(x_test)
    cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
    # evaluate model
    scores = cross_val_score(model, x_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
    scores = absolute(scores)
    print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
'''

def predict_from_dict(model, data):
    x = pd.DataFrame.from_dict({key: [value] for key, value in data.items()})
    return model.predict(x)[0]

if __name__ == '__main__':
    df = pd.read_csv('output_test.csv')
    df = df[all_features]
    y = df[output_col]
    x = df.drop(['call'], axis=1)
    x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
    model = train(x_train, y_train)
Traceback below:
Traceback (most recent call last):
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 73, in <module>
model = train(x_train, y_train)
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 46, in train
model.fit(x_train, y_train)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 355, in _fit
**fit_params_steps[name],
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 615, in _fit_transform
for idx, (name, trans, column, weight) in enumerate(transformers, 1)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 1007, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 590, in __init__
self.results = batch()
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_encoders.py", line 488, in fit_transform
return super().fit_transform(X, y)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py", line 855, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 153, in fit
self._fit(X, handle_unknown=self.handle_unknown)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 225, in _fit
"All columns must be Categorical dtype when "
ValueError: All columns must be Categorical dtype when 'categories="auto"'.
Dask DataFrame Structure:
card1_hand card2_hand card1_table card2_table card3_table card1_color_hand card2_color_hand card1_color_table card2_color_table card3_color_table actions_preflop actions_flop actions_turn actions_river best_hand prob opp win_proba checker
npartitions=1
int64 int64 int64 int64 int64 object object object object object object object object object object float64 float64 float64 float64
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Dask Name: split, 7 tasks
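dask_ml's OneHotEncoder cannot lazily scan a dask DataFrame to infer categories, which is why it insists on Categorical dtype; in this dataset the categorical columns are plain int64 and object. A minimal sketch of one possible fix, assuming the column lists defined above: categorize those columns before encoding, e.g. with dask_ml's Categorizer as an extra pipeline step in train() (the columns argument is passed explicitly because Categorizer only converts object-dtype columns by default, and several of the card columns here are int64).

from dask_ml.preprocessing import Categorizer, OneHotEncoder
from sklearn.pipeline import Pipeline

# Sketch: cast the categorical columns to Categorical dtype before one-hot
# encoding, so OneHotEncoder can read the known categories off the dtype.
categorical_transformer = Pipeline(steps=[
    ("categorize", Categorizer(columns=categorical_features)),
    ("onehot", OneHotEncoder()),
])

Alternatively, calling x_train = x_train.categorize(columns=categorical_features) before fitting should satisfy the same requirement.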
I'm having trouble fitting an MLkNN model (from scikit-multilearn) via GridSearchCV (from scikit-learn); I am getting an error. Here is the relevant code:
#From MachineLearningMastery: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
import numpy as np
from pandas import DataFrame, concat
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import average_precision_score
from skmultilearn.adapt import MLkNN

def series_to_supervised(n_lags, n_vars, data, n_out=1, dropnan=True):
    n_vars = 1 if type(data) is list else data.shape[1]
    df = DataFrame(data)
    cols, names = list(), list()
    # input sequence t-n, ..., t-1
    for i in range(n_lags, 0, -1):  # counts i from n_lags down to 1
        cols.append(df.shift(i))
        names += [('var%d(t-%d)' % (j+1, i)) for j in range(n_vars)]
    # forecast sequence t, t+1, ..., t+n
    for i in range(0, n_out):
        cols.append(df.shift(-i))
        if i == 0:
            names += [('var%d(t)' % (j+1)) for j in range(n_vars)]
        else:
            names += [('var%d(t+%d)' % (j+1, i)) for j in range(n_vars)]
    agg = concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg

def testexamples():
    # (defined but not used in this version; GridSearchCV is given the
    # 'average_precision' scoring string below)
    def average_precision_wrapper(estimator, X, y):
        if X.ndim == 2:
            X = X.reshape((-1))  # (1, 1497) becomes (1497,), needed for average_precision
        if y.ndim == 2:
            y = y.reshape((-1))  # (1, 1497) ... as above
        y_pred = estimator.predict(X).toarray()
        return average_precision_score(y, y_pred)

    true_values = np.random.choice([0, 1], size=(500, 1497), p=[0.99, 0.01])
    # Need to convert this to supervised learning. Use previous 2 days to predict (lag=2)
    n_lags = 2
    n_vars = true_values.shape[1]
    all_data = np.asarray(series_to_supervised(n_lags, n_vars, data=true_values))
    train_x = all_data[:400, :int(n_vars*n_lags)]
    train_y = all_data[:400, int(n_vars*n_lags):]
    test_x = all_data[-100:, :int(n_vars*n_lags)]
    test_y = all_data[-100:, int(n_vars*n_lags):]
    parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
    checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
    print('type: train_x: ', type(train_x), ' type: train_y: ', type(train_y))
    checked_model.fit(train_x, train_y)
Full trace:
user@GPU8:~/path/to/dir$ python May15_mlknn.py
type: train_x: <type 'numpy.ndarray'> type: train_y: <type 'numpy.ndarray'>
Traceback (most recent call last):
File "May15_mlknn.py", line 380, in <module>
testexamples()
File "May15_mlknn.py", line 340, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py", line 196, in __call__
return self._sign * self._score_func(y, y_pred, **self._kwargs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/ranking.py", line 184, in average_precision_score
average, sample_weight=sample_weight)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/base.py", line 88, in _average_binary_score
y_score = check_array(y_score)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 380, in check_array
force_all_finite)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 243, in _ensure_sparse_format
raise TypeError('A sparse matrix was passed, but dense '
TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
I have already seen this and this and this. My question is different because I checked the types of train_x and train_y, and both are dense numpy arrays.
What am I doing wrong and how can I fix it?
EDIT:
I'm now trying the answer provided below, but with a modification due to the error I got (answer here):
def average_precision_wrapper(estimator, X, y):
    if X.ndim == 2:
        X = X.reshape((-1))  # (1, 1497) becomes (1497,), needed for average_precision
    if y.ndim == 2:
        y = y.reshape((-1))  # (1, 1497) ... as above
    y_pred = estimator.predict(X).toarray()
    return average_precision_score(y, y_pred)
EDIT 2: That was no good after all. I now get ValueError: query data dimension must match training data dimension, presumably because reshaping X to 1-D makes the k-NN query dimensionality differ from what the model was trained on. Here's the trace:
/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
DeprecationWarning)
Traceback (most recent call last):
File "May15_mlknn_to_so.py", line 393, in <module>
testexamples()
File "May15_mlknn_to_so.py", line 353, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "May15_mlknn_to_so.py", line 307, in average_precision_wrapper
y_pred = estimator.predict(X).toarray()
File "May15_mlknn_to_so.py", line 237, in predict
self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours, return_distance=False)]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py", line 381, in kneighbors
for s in gen_even_slices(X.shape[0], n_jobs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "sklearn/neighbors/binary_tree.pxi", line 1294, in sklearn.neighbors.kd_tree.BinaryTree.query (sklearn/neighbors/kd_tree.c:11337)
ValueError: query data dimension must match training data dimension
The MLkNN.predict method returns a scipy.sparse matrix, but the 'average_precision' scorer expects a dense numpy array. You can write a small wrapper that makes this conversion yourself:
import numpy as np
from sklearn.model_selection import GridSearchCV
from skmultilearn.adapt import MLkNN
from sklearn.metrics import average_precision_score

def average_precision_wrapper(estimator, X, y):
    y_pred = estimator.predict(X).toarray()
    return average_precision_score(y, y_pred)

# Make dummy features of shape (100, 5)
train_x = np.random.random((100, 5))
# Make dummy one-hot encoded labels of shape (100, 4)
train_y = np.zeros((100, 4), dtype=int)
for i in range(100):
    train_y[i, i % 4] = 1

parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
checked_model = GridSearchCV(MLkNN(), parameters, scoring=average_precision_wrapper)
checked_model.fit(train_x, train_y)
Solved, with help from user2653663: I changed the metric to the Hamming loss and created a scorer for it with make_scorer from sklearn.metrics. (Note that with greater_is_better=False, make_scorer negates the metric, so best_score_ below is actually the negative Hamming loss.)
import time
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import make_scorer, hamming_loss
from skmultilearn.adapt import MLkNN

parameters = {'k': range(1, 5), 's': [0.5, 0.75, 1]}
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='f1_samples')
start = time.time()
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
hloss_scorer = make_scorer(hamming_loss, greater_is_better=False)
checked_model = GridSearchCV(MLkNN(), parameters, scoring=hloss_scorer)
checked_model.fit(train_x, train_y)
end = time.time()
print('best parameters: ', checked_model.best_params_, 'best Hamming loss: ', checked_model.best_score_)

best_model = MLkNN(k=checked_model.best_params_['k'], s=checked_model.best_params_['s'])
best_model.fit(train_x, train_y)
pred_values = best_model.predict(test_x)  # returns 0/1 classes, not probabilities
pred_values = np.array(pred_values.todense())
true_values = test_y

# Metrics
bincross = []
ap = []
ap_weighted = []
h_loss = []
for i in range(1, pred_values.shape[0]):
    true_vals = true_values[i, :]
    pred_vals = pred_values[i, :]
    pred_vals = np.squeeze(pred_vals)
    h_loss.append(hamming_loss(true_vals, pred_vals))

print("***********************")
print("MLKNN with k=best")
print("***********************")
print("Hamming loss: ", h_loss)
h_loss = np.asarray(h_loss)
print("total Hamming loss: ", np.sum(h_loss))
I am trying to test my logistic regression model, but I get a memory error and cannot solve it. Is it because my sentences take up too much space? I would appreciate any help.
From Line 267 in my code:
self.X, self.y = self.transform_to_dataset(training_sentences, _pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=False)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
And the error I get after running this:
Traceback (most recent call last):
File "tagger_lr_chunk.py", line 342, in <module>
tagger.train(data_dir + 'train.txt')
File "tagger_lr_chunk.py", line 271, in train
self.clf.fit(self.X, self.y)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 230, in fit_transform
return self._transform(X, fitting=True)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 204, in _transform
result_matrix = result_matrix.toarray()
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/compressed.py", line 943, in toarray
out = self._process_toarray_args(order, out)
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.py", line 1130, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
I solved this memory issue by changing the sparse parameter of DictVectorizer so that it produces scipy.sparse matrices instead of materializing a dense array:
self.X, self.y = self.transform_to_dataset(training_sentences, _pos__sentences)
self.clf = Pipeline([
    ('vectorizer', DictVectorizer(sparse=True)),
    ('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
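To see why this fixes the MemoryError: with sparse=False, DictVectorizer materializes the full dense matrix (the np.zeros call in the trace above), while the sparse format stores only the nonzero entries. A rough, purely hypothetical sizing comparison:

# Hypothetical numbers for illustration only: 100000 samples x 50000 one-hot
# features of float64. Real shapes depend on the training data.
n_samples, n_features = 100000, 50000
avg_nonzero_per_row = 20  # assumed: a handful of active features per sample
dense_bytes = n_samples * n_features * 8                    # full dense array
sparse_bytes = n_samples * avg_nonzero_per_row * (8 + 4)    # CSR data + column indices, approx.
print(dense_bytes / 1e9, 'GB dense vs', sparse_bytes / 1e9, 'GB sparse')
# -> 40.0 GB dense vs 0.024 GB sparse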
I'm trying to tune the alpha parameter of a Multinomial Naive Bayes on the 20newsgroups dataset. This is my code so far:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np

# Divide dataset
dataset_train = fetch_20newsgroups(subset='train', shuffle=True)
dataset_test = fetch_20newsgroups(subset='test', shuffle=True)

text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(sublinear_tf=True)),
                     ('clf', MultinomialNB())])

param_grid = {'tfidf__use_idf': (True, False),
              'clf__alpha': np.linspace(0.001, 1, 100)}

grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='precision', cv=None)

# Training
text_clf = grid_search.fit(dataset_train.data, dataset_train.target, average=None)

# Prediction
predicted = text_clf.predict(dataset_test.data)

print("NB Accuracy:", 100*np.mean(predicted == dataset_test.target), '%')
print(classification_report(dataset_test.target, predicted, target_names=dataset_train.target_names))
print("Best estimator for alpha in order to get precision ", grid_search.best_estimator_)
The problem is I'm getting the following error:
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
Traceback (most recent call last):
File "<ipython-input-12-d478372ef22a>", line 1, in <module>
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/omarl/Downloads/new_NB.py", line 28, in <module>
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 197, in _fit
step, param = pname.split('__', 1)
ValueError: not enough values to unpack (expected 2, got 1)
I have no clue why this is happening, because from the code I have reviewed so far this should work. I also searched the scikit-learn website but didn't find anything. Thanks.
In this line:
text_clf = grid_search.fit(dataset_train.data, dataset_train.target, average=None)
average=None is being interpreted as a fit parameter for the pipeline steps (hence the pname.split('__', 1) failure in the trace), which is not what you intend.
After removing it, you will get this error:
ValueError: Target is multiclass but average='binary'. Please choose another average setting.
This is because plain precision scoring is only defined for binary targets, so it needs an averaging strategy for a multi-class problem. If you change your scoring parameter to 'accuracy', the code works.
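A minimal sketch of the corrected call, using the accuracy scoring suggested above (a multi-class-aware scorer such as make_scorer(precision_score, average='macro') would be an alternative if you specifically want precision):

from sklearn.model_selection import GridSearchCV

# Corrected: use a scorer defined for multi-class targets and do not pass
# average=None through to fit().
grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='accuracy', cv=None)
grid_search.fit(dataset_train.data, dataset_train.target)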
I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X; model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
# arrFinal contains all the features and the labels. The last 16 columns are the labels;
# the features run from column 1 to 521 (the 17th column from the end is not used).
X = np.array(arrFinal[:, 1:-17])
X_test = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)), ('rbf', SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(X_test)
But I am getting the following error:
Traceback (most recent call last):
  File "C:\Users\50004182\Documents\callee.py", line 10, in <module>
    combine.combine_main(dict_ids, inv_dict_ids, noOfIDs)
  File "C:\Users\50004182\Documents\combine.py", line 201, in combine_main
    clf.fit(X, Y)
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
    for i, column in enumerate(columns))
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 804, in __call__
    while self.dispatch_one_batch(iterator):
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 662, in dispatch_one_batch
    self._dispatch(tasks)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 570, in _dispatch
    job = ImmediateComputeBatch(batch)
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 183, in __init__
    self.results = batch()
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in __call__
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", line 72, in <listcomp>
    return [func(*args, **kwargs) for func, args, kwargs in self.items]
  File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_binary
    estimator.fit(X, y)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
    Xt, fit_params = self._pre_transform(X, y, **fit_params)
  File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_transform
    Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
  File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transform
    return self.fit(X, y, **fit_params).transform(X)
  File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 331, in fit
    self.scores_, self.pvalues_ = self.score_func(X, y)
  File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selection.py", line 213, in chi2
    if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()
So, after a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
First, ensure the dtype is numeric (the data was originally strings):
import numpy as np
from sklearn.pipeline import Pipeline
from sklearn.feature_selection import VarianceThreshold, SelectKBest, chi2
from sklearn.svm import SVC
from sklearn.multiclass import OneVsRestClassifier

X = np.array(arrFinal[:, 1:-17]).astype(np.float64)
X_test = np.array(X)
Y = np.array(arrFinal[:, 522:]).astype(int)
Then use VarianceThreshold to remove constant (zero-variance) columns before chi2:
clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC())
])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans = clf.predict(X_test)