Error during feature selection - python

I am trying to do feature selection for multilabel classification. I extracted the features on which the model will be trained into X. Model testing is done on the same X. I am using a Pipeline and selecting the best 100 features:
#arrFinal contains all the features and the labels. Last 16 columns are labels and features are from 1 to 521. 17th column from the last is not taken
X=np.array(arrFinal[:,1:-17])
Xtest=np.array(X)
Y=np.array(arrFinal[:,522:]).astype(int)
clf = Pipeline([('chi2', SelectKBest(chi2, k=100)),('rbf',SVC())])
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
ans=clf.predict(X_test)
But I am getting the following error-
Traceback (most recent call last):
File "C:\Users\50004182\Documents\\callee.py", line 10, in <module
>
combine.combine_main(dict_ids,inv_dict_ids,noOfIDs)
File "C:\Users\50004182\Documents\combine.py", line 201, in combi
ne_main
clf.fit(X, Y)
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 287, in fit
for i, column in enumerate(columns))
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 804, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 662, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 570, in _dispatch
job = ImmediateComputeBatch(batch)
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 183, in __init__
self.results = batch()
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 72, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\externals\joblib\parallel.py", lin
e 72, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Python34\lib\site-packages\sklearn\multiclass.py", line 74, in _fit_b
inary
estimator.fit(X, y)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 164, in fit
Xt, fit_params = self._pre_transform(X, y, **fit_params)
File "C:\Python34\lib\site-packages\sklearn\pipeline.py", line 145, in _pre_tr
ansform
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "C:\Python34\lib\site-packages\sklearn\base.py", line 458, in fit_transfo
rm
return self.fit(X, y, **fit_params).transform(X)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selec
tion.py", line 331, in fit
self.scores_, self.pvalues_ = self.score_func(X, y)
File "C:\Python34\lib\site-packages\sklearn\feature_selection\univariate_selec
tion.py", line 213, in chi2
if np.any((X.data if issparse(X) else X) < 0):
TypeError: unorderable types: numpy.ndarray() < int()

After a debugging session in the comments above with @JamieBull and @Joker, the solution we came up with was:
Ensure type is correct (originally string)
X=np.array(arrFinal[:,1:-17]).astype(np.float64)
Xtest=np.array(X)
Y=np.array(arrFinal[:,522:]).astype(int)
First use VarianceThreshold to remove constant (0) columns prior to chi2.
# Drop constant (zero-variance) columns first, then keep the 100 features that
# score best under chi2, and finally fit the SVC step on what is left.
clf = Pipeline([
    ('vt', VarianceThreshold()),
    ('chi2', SelectKBest(chi2, k=100)),
    ('rbf', SVC()),
])
# Wrap the whole pipeline so it is fitted against the multilabel target Y.
clf = OneVsRestClassifier(clf)
clf.fit(X, Y)
# Predict on the array defined above as `Xtest`; the original referenced the
# undefined name `X_test`, which raises NameError.
ans = clf.predict(Xtest)

Related

Dask hot encoding - ValueError: All columns must be Categorical dtype when 'categories="auto"'

I am dealing with an error that I am unable to handle. Considering that I am using a simple dataset that has both categorical and numerical features, I am a bit surprised by the error ValueError: All columns must be Categorical dtype when 'categories="auto"'.
The dataset before any transformation looks like that
As I am using dask instead of sklearn, I cannot rely on something like OneHotEncoder(handle_unknown='ignore')
What would you recommend to fix that?
from dask import dataframe as pd
from dask_ml.model_selection import train_test_split
from dask_ml.preprocessing import MinMaxScaler, OneHotEncoder
from dask_ml.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from dask_ml.linear_model import LinearRegression
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import RepeatedKFold
from numpy import mean
from numpy import std
from numpy import absolute
numeric_features = ['prob','opp','win_proba','checker']
categorical_features = ['card1_hand','card2_hand','card1_table','card2_table','card3_table',
'card1_color_hand','card2_color_hand','card1_color_table','card2_color_table',
'card3_color_table','actions_preflop','actions_flop','actions_turn','actions_river','best_hand']
all_features = ['card1_hand','card2_hand','card1_table','card2_table','card3_table','card1_color_hand',
'card2_color_hand','card1_color_table','card2_color_table','card3_color_table',
'actions_preflop','actions_flop','actions_turn','actions_river','best_hand',
'call','prob','opp','win_proba','checker']
output_col = 'call'
def train(x_train, y_train):
numeric_transformer = MinMaxScaler()
categorical_transformer = OneHotEncoder()
preprocessor = ColumnTransformer(
transformers=[
("num", numeric_transformer, numeric_features),
("cat", categorical_transformer, categorical_features),
]
)
model = Pipeline(
steps=[
("preprocessor", preprocessor),
("classifier", LinearRegression())
]
)
model.fit(x_train, y_train)
return model
'''
def evaluate_model(model, x_test, y_test):
#predict_test = model.predict(x_test)
cv = RepeatedKFold(n_splits=10, n_repeats=3, random_state=1)
# evaluate model
scores = cross_val_score(model, x_test, y_test, scoring='neg_mean_absolute_error', cv=cv, n_jobs=-1)
scores = absolute(scores)
print('Mean MAE: %.3f (%.3f)' % (mean(scores), std(scores)))
'''
def predict_from_dict(model, data):
x = pd.DataFrame.from_dict({key:[value] for key, value in data.items()})
return model.predict(x)[0]
if __name__ == '__main__':
df = pd.read_csv('output_test.csv')
df = df[all_features]
y = df[output_col]
x = df.drop(['call'], axis=1)
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.2, random_state=1)
model = train(x_train, y_train)
Traceback below
Traceback (most recent call last):
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 73, in <module>
model = train(x_train, y_train)
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 46, in train
model.fit(x_train, y_train)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 355, in _fit
**fit_params_steps[name],
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 615, in _fit_transform
for idx, (name, trans, column, weight) in enumerate(transformers, 1)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 1007, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 590, in __init__
self.results = batch()
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_encoders.py", line 488, in fit_transform
return super().fit_transform(X, y)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py", line 855, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 153, in fit
self._fit(X, handle_unknown=self.handle_unknown)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 225, in _fit
"All columns must be Categorical dtype when "
ValueError: All columns must be Categorical dtype when 'categories="auto"'.
runfile('C:/Users/rapha/Desktop/Consulting/Poker/Cepheus/Model/Dask/dask regression.py', wdir='C:/Users/rapha/Desktop/Consulting/Poker/Cepheus/Model/Dask')
Dask DataFrame Structure:
card1_hand card2_hand card1_table card2_table card3_table card1_color_hand card2_color_hand card1_color_table card2_color_table card3_color_table actions_preflop actions_flop actions_turn actions_river best_hand prob opp win_proba checker
npartitions=1
int64 int64 int64 int64 int64 object object object object object object object object object object float64 float64 float64 float64
... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ... ...
Dask Name: split, 7 tasks
runfile('C:/Users/rapha/Desktop/Consulting/Poker/Cepheus/Model/Dask/dask regression.py', wdir='C:/Users/rapha/Desktop/Consulting/Poker/Cepheus/Model/Dask')
Traceback (most recent call last):
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 73, in <module>
model = train(x_train, y_train)
File "C:\Users\rapha\Desktop\Consulting\Poker\Cepheus\Model\Dask\dask regression.py", line 46, in train
model.fit(x_train, y_train)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 390, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 355, in _fit
**fit_params_steps[name],
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 675, in fit_transform
result = self._fit_transform(X, y, _fit_transform_one)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\compose\_column_transformer.py", line 615, in _fit_transform
for idx, (name, trans, column, weight) in enumerate(transformers, 1)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 1007, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 835, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 754, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 209, in apply_async
result = ImmediateResult(func)
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 590, in __init__
self.results = batch()
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "C:\Users\rapha\anaconda3\lib\site-packages\joblib\parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\fixes.py", line 216, in __call__
return self.function(*args, **kwargs)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py", line 893, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_encoders.py", line 488, in fit_transform
return super().fit_transform(X, y)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py", line 855, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 153, in fit
self._fit(X, handle_unknown=self.handle_unknown)
File "C:\Users\rapha\AppData\Roaming\Python\Python37\site-packages\dask_ml\preprocessing\_encoders.py", line 225, in _fit
"All columns must be Categorical dtype when "
ValueError: All columns must be Categorical dtype when 'categories="auto"'.

Problems with StackingRegressor

This error occurred when I used scikit-learn to perform model fusion (stacking) on 7 sub-models. I checked the official documentation and found nothing relevant.
code:
# model fusion
estimators = [('DT', model_dt_x), ('KNN', model_knn_x), ('SVR', model_svr_x), ('ANN', model_ann_x), ('RF', model_rf_x), ('GBDT', model_gbdt_x), ('XGBT', model_xgbt_x)]
stacking_regressor = StackingRegressor(estimators=estimators, final_estimator=RidgeCV())
stacking_regressor.fit(X_train, y_train)
error:
Traceback (most recent call last):
File "i:/Lab/20210xxx/ex.py", line 86, in <module>
stacking_regressor.fit(X_train, y_train)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 680, in fit
return super().fit(X, y, sample_weight)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_stacking.py", line 148, in fit
for est in all_estimators if est != 'drop'
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 921, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 182, in
apply_async
result = ImmediateResult(func)
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\_parallel_backends.py", line 549, in __init__
self.results = batch()
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in __call__
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\joblib\parallel.py", line 225, in <listcomp>
for func, args, kwargs in self.items]
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\ensemble\_base.py", line 40, in
_fit_single_estimator
estimator.fit(X, y)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 641, in fit
return self._fit(X, y, incremental=False)
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 321, in _fit
self._validate_hyperparameters()
File "C:\Users\xxx\Anaconda3\lib\site-packages\sklearn\neural_network\_multilayer_perceptron.py",
line 385, in _validate_hyperparameters
if self.max_fun <= 0:
TypeError: '<=' not supported between instances of 'NoneType' and 'int'
Problem solved
model_ann_x = MLPRegressor(**model_ann.get_params())
model_ann_x.set_params(max_fun=15000) # get rid of bug
model_rf_x = RandomForestRegressor(**model_rf.get_params())
model_rf_x.set_params(ccp_alpha=0.0) # get rid of bug
model_gbdt_x = GradientBoostingRegressor(**model_gbdt.get_params())
model_gbdt_x.set_params(ccp_alpha=0.0) # get rid of bug

TypeError: A sparse matrix was passed, but dense data is required (multilabel K nearest neighbours)

I'm having trouble with fitting an instance of an MLkNN model (from scikit-multilearn) after doing GridSearchCV (from scikit-learn). I am getting an error. Here is the appropriate code:
#From MachineLearningMastery: https://machinelearningmastery.com/multivariate-time-series-forecasting-lstms-keras/
def series_to_supervised(n_lags, n_vars, data, n_out=1, dropnan=True):
    """Frame a time series as a supervised-learning table of shifted columns.

    Parameters:
        n_lags: number of past steps (t-n_lags ... t-1) emitted as input columns.
        n_vars: kept only for backward compatibility; the number of variables
            is always derived from ``data`` (the original overwrote it too).
        data: the observations, one row per time step, as a flat list, a
            nested list, or an array.
        n_out: number of steps (t ... t+n_out-1) emitted as output columns.
        dropnan: when true, drop the rows that contain NaN after shifting.

    Returns:
        A DataFrame whose columns are named ``var<j>(t-<i>)``, ``var<j>(t)``
        and ``var<j>(t+<i>)``, in that order.
    """
    df = DataFrame(data)
    # Count the variables on the frame itself. The original used 1 for every
    # list (wrong for nested lists, which then failed when the column names
    # were assigned) and data.shape[1] otherwise (IndexError on 1-D arrays).
    n_vars = df.shape[1]
    cols, names = [], []
    # input sequence t-n, ..., t-1
    for lag in range(n_lags, 0, -1):
        cols.append(df.shift(lag))
        names += ['var%d(t-%d)' % (j + 1, lag) for j in range(n_vars)]
    # forecast sequence t, t+1, ..., t+n
    for step in range(n_out):
        cols.append(df.shift(-step))
        if step == 0:
            names += ['var%d(t)' % (j + 1) for j in range(n_vars)]
        else:
            names += ['var%d(t+%d)' % (j + 1, step) for j in range(n_vars)]
    agg = concat(cols, axis=1)
    agg.columns = names
    if dropnan:
        agg.dropna(inplace=True)
    return agg
def testexamples():
def average_precision_wrapper(estimator, X, y):
if X.ndim == 2:
X = X.reshape((-1)) #[1, 1497] becomes [1497,], needed for average_precision
if y.ndim == 2:
y = y.reshape((-1)) #[1, 1497] ... as above
y_pred = estimator.predict(X).toarray()
return average_precision_score(y, y_pred)
true_values = np.random.choice([0,1], size=(500, 1497), p=[0.99, 0.01])
#Need to convert this to supervised learning. Use previous 2 days to predict (lag=2)
n_lags = 2
n_vars = true_values.shape[1]
all_data = np.asarray(series_to_supervised(n_lags, n_vars, data=true_values))
train_x = all_data[:400, :int(n_vars*n_lags)]
train_y = all_data[:400, int(n_vars*n_lags):]
test_x = all_data[-100:, :int(n_vars*n_lags)]
test_y = all_data[-100:, int(n_vars*n_lags):]
parameters = {'k': range(1,5), 's': [0.5, 0.75, 1]}
checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
print('type: train_x: ', type(train_x), ' type: train_y: ', type(train_y))
checked_model.fit(train_x, train_y)
Full trace:
user@GPU8:~/path/to/dir$ python May15_mlknn.py
type: train_x: <type 'numpy.ndarray'> type: train_y: <type 'numpy.ndarray'>
Traceback (most recent call last):
File "May15_mlknn.py", line 380, in <module>
testexamples()
File "May15_mlknn.py", line 340, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/scorer.py", line 196, in __call__
return self._sign * self._score_func(y, y_pred, **self._kwargs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/ranking.py", line 184, in average_precision_score
average, sample_weight=sample_weight)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/metrics/base.py", line 88, in _average_binary_score
y_score = check_array(y_score)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 380, in check_array
force_all_finite)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py", line 243, in _ensure_sparse_format
raise TypeError('A sparse matrix was passed, but dense '
TypeError: A sparse matrix was passed, but dense data is required. Use X.toarray() to convert to a dense numpy array.
I have already seen this and this and this. My question is different because I checked the types of train_x and train_y, and both are dense numpy arrays.
What am I doing wrong and how can I fix it?
EDIT:
I'm now trying the answer provided below, but with a modification due to the error I got (answer here):
def average_precision_wrapper(estimator, X, y):
if X.ndim == 2:
X = X.reshape((-1)) #(1, 1497) becomes (1497,), needed for average_precision
if y.ndim == 2:
y = y.reshape((-1)) #(1, 1497) ... as above
y_pred = estimator.predict(X).toarray()
return average_precision_score(y, y_pred)
EDIT 2: That was no good after all. I get ValueError: query data dimension must match training data dimension. Here's the trace:
/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/utils/validation.py:395: DeprecationWarning: Passing 1d arrays as data is deprecated in 0.17 and will raise ValueError in 0.19. Reshape your data either using X.reshape(-1, 1) if your data has a single feature or X.reshape(1, -1) if it contains a single sample.
DeprecationWarning)
Traceback (most recent call last):
File "May15_mlknn_to_so.py", line 393, in <module>
testexamples()
File "May15_mlknn_to_so.py", line 353, in testexamples
checked_model.fit(train_x, train_y)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 945, in fit
return self._fit(X, y, groups, ParameterGrid(self.param_grid))
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_search.py", line 564, in _fit
for parameters in parameter_iterable
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 260, in _fit_and_score
test_score = _score(estimator, X_test, y_test, scorer)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/model_selection/_validation.py", line 288, in _score
score = scorer(estimator, X_test, y_test)
File "May15_mlknn_to_so.py", line 307, in average_precision_wrapper
y_pred = estimator.predict(X).toarray()
File "May15_mlknn_to_so.py", line 237, in predict
self.knn_.kneighbors(X, self.k + self.ignore_first_neighbours, return_distance=False)]
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/neighbors/base.py", line 381, in kneighbors
for s in gen_even_slices(X.shape[0], n_jobs)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 326, in __init__
self.results = batch()
File "/user/pkgs/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "sklearn/neighbors/binary_tree.pxi", line 1294, in sklearn.neighbors.kd_tree.BinaryTree.query (sklearn/neighbors/kd_tree.c:11337)
ValueError: query data dimension must match training data dimension
The MLkNN.predict method returns a scipy.sparse array. The scorer 'average_precision' expects a numpy array. You can write a small wrapper that makes this conversion yourself:
from sklearn.model_selection import GridSearchCV
from skmultilearn.adapt import MLkNN
from sklearn.metrics import average_precision_score
def average_precision_wrapper(estimator, X, y):
y_pred = estimator.predict(X).toarray()
return average_precision_score(y, y_pred)
# Make dummy features of shape (100,5)
train_x = np.random.random((100,5))
# Make dummy one-hot encoded labels of shape (100,4)
train_y = np.zeros((100,4), dtype=int)
for i in range(100):
train_y[i, i%4] = 1
parameters = {'k': range(1,5), 's': [0.5, 0.75, 1]}
checked_model = GridSearchCV(MLkNN(), parameters, scoring=average_precision_wrapper)
checked_model.fit(train_x, train_y)
Solved, with help from user2653663: I changed the metric to the Hamming loss, but had to create a scorer to do that using make_scorer from sklearn.metrics.
parameters = {'k': range(1,5), 's': [0.5, 0.75, 1]}
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='f1_samples')
start = time.time()
#checked_model = GridSearchCV(MLkNN(), parameters, scoring='average_precision')
hloss_scorer = make_scorer(hamming_loss, greater_is_better=False)
checked_model = GridSearchCV(MLkNN(), parameters, scoring=hloss_scorer)
checked_model.fit(train_x, train_y)
end = time.time()
print('best parameters: ', checked_model.best_params_, 'best Hamming loss: ', checked_model.best_score_)
best_model = MLkNN(k=checked_model.best_params_['k'], s=checked_model.best_params_['s'])
best_model.fit(train_x, train_y)
pred_values = best_model.predict(test_x) #returns 0/1 classes, not probabilities
pred_values = np.array(pred_values.todense())
true_values = test_y
#Metrics
bincross = []
ap = []
ap_weighted = []
h_loss = []
# Score every test sample, starting at row 0; the original loop started at 1
# and silently left the first sample out of the reported totals.
for i in range(pred_values.shape[0]):
    true_vals = true_values[i, :]
    # Squeeze so the prediction row is 1-D like the true-label row.
    pred_vals = np.squeeze(pred_values[i, :])
    h_loss.append(hamming_loss(true_vals, pred_vals))
print("***********************")
print("MLKNN with k=best")
print("***********************")
print("Hamming loss: ", h_loss)
h_loss = np.asarray(h_loss)
print("total Hamming loss: ", np.sum(h_loss))

Memory error in Python while using sklearn

I am trying to test my logistic regression model but I get a memory error and cannot solve it. Is it because my sentences take too much space? I will appreciate any help.
From Line 267 in my code:
self.X, self.y = self.transform_to_dataset(training_sentences,_pos__sentences)
self.clf = Pipeline([
('vectorizer', DictVectorizer(sparse=False)),
('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)
And the error I get after running this:
Traceback (most recent call last):
File "tagger_lr_chunk.py", line 342, in <module>
tagger.train(data_dir + 'train.txt')
File "tagger_lr_chunk.py", line 271, in train
self.clf.fit(self.X, self.y)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 213, in _fit
**fit_params_steps[name])
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/externals/joblib/memory.py", line 362, in __call__
return self.func(*args, **kwargs)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/pipeline.py", line 581, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 230, in fit_transform
return self._transform(X, fitting=True)
File "/home/selub/anaconda2/lib/python2.7/site-packages/sklearn/feature_extraction/dict_vectorizer.py", line 204, in _transform
result_matrix = result_matrix.toarray()
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/compressed.py", line 943, in toarray
out = self._process_toarray_args(order, out)
File "/home/selub/anaconda2/lib/python2.7/site-packages/scipy/sparse/base.py", line 1130, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
MemoryError
I solved this memory issue by changing the sparse parameter of DictVectorizer to True, so that it produces scipy.sparse matrices instead of a dense array:
self.X, self.y = self.transform_to_dataset(training_sentences,_pos__sentences)
self.clf = Pipeline([
('vectorizer', DictVectorizer(sparse=True)),
('classifier', LogisticRegression())])
self.clf.fit(self.X, self.y)

sklearn FeatureUnion won't work with cross_val_score?

I'm playing with FeatureUnion and Pipeline, but got stuck with this use case,
# simply return a column in a Pandas DataFrame
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, col):
self.col = col
def fit(self, x, y=None):
return self
def transform(self, x):
return x[self.col] # return a column
# convert categorical features into one-hot encoding format
class CategoricalEncoder(BaseEstimator, TransformerMixin):
def __init__(self):
self.lb = LabelBinarizer()
def fit(self, x, y=None):
self.lb.fit(x)
return self
def transform(self, x):
rez = self.lb.transform(x)
return rez
# This dummy one just combines the above 2 transformers into one for convenience reasons
class DummyEncoder(BaseEstimator, TransformerMixin):
    """Select one DataFrame column and one-hot encode it.

    Parameters:
        col: label of the column to select and encode.
    """

    def __init__(self, col):
        # Store the constructor argument under its own name and do nothing
        # else: scikit-learn's get_params()/clone() (used by cross_val_score)
        # read it back from the attribute named after the parameter. Without
        # it, cloning fails or loses the column name, depending on the
        # scikit-learn version.
        self.col = col

    def fit(self, x, y=None):
        """Build and fit the select-then-encode union on ``x``; return self."""
        # Built here rather than in __init__ so that it always reflects the
        # current value of self.col (e.g. after set_params).
        self.union_ = FeatureUnion([
            ('one', Pipeline([
                ('select', ItemSelector(self.col)),
                ('encode', CategoricalEncoder()),
            ])),
        ])
        self.union_.fit(x)
        return self

    def transform(self, x):
        """Return the encoding of the selected column of ``x``."""
        return self.union_.transform(x)
# Test driver: a tiny frame with one categorical feature 'X' and a target 'Y'.
df = pd.DataFrame(data={'Y': [1, 2, 1, 2, 1], 'X': ['a', 'b', 'a', 'b', 'c']})

# Encode column 'X' through the union, then classify.
encoder_union = FeatureUnion([('union_0', DummyEncoder('X'))])
pipe_conf = [('union', encoder_union), ('clf', LogisticRegression())]
pipe = Pipeline(pipe_conf)

# 1) Fitting and predicting explicitly on the full frame.
pipe.fit(df, df['Y'])
pipe.predict(df)

# 2) The same pipeline evaluated through 2-fold cross-validation;
#    this is the call the question is about.
cross_val_score(pipe, df, df['Y'], cv=2)
Is there an error in the code above? Please give me a hint.
The error is as follows:
Traceback (most recent call last):
File "a.py", line 65, in <module>
cross_val_score(pipe, df, df['Y'], cv=2)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/model_selection/_validation.py", line 140, in cross_val_score
for train, test in cv.split(X, y, groups))
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 322, in __init__
self.results = batch()
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/model_selection/_validation.py", line 238, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 268, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 234, in _fit
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 737, in fit_transform
for name, trans, weight in self._iter())
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 322, in __init__
self.results = batch()
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 580, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/base.py", line 497, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "a.py", line 38, in fit
self.union.fit(x)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 712, in fit
for _, trans, _ in self._iter())
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 758, in __call__
while self.dispatch_one_batch(iterator):
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 608, in dispatch_one_batch
self._dispatch(tasks)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 571, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 109, in apply_async
result = ImmediateResult(func)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/_parallel_backends.py", line 322, in __init__
self.results = batch()
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/externals/joblib/parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 566, in _fit_one_transformer
return transformer.fit(X, y)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 268, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/pipeline.py", line 234, in _fit
Xt = transform.fit_transform(Xt, y, **fit_params_steps[name])
File "/Users/home/miniconda3/lib/python3.5/site-packages/sklearn/base.py", line 494, in fit_transform
return self.fit(X, **fit_params).transform(X)
File "a.py", line 19, in transform
return x[self.col]
File "/Users/home/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 1992, in __getitem__
return self._getitem_column(key)
File "/Users/home/miniconda3/lib/python3.5/site-packages/pandas/core/frame.py", line 1999, in _getitem_column
return self._get_item_cache(key)
File "/Users/home/miniconda3/lib/python3.5/site-packages/pandas/core/generic.py", line 1345, in _get_item_cache
values = self._data.get(item)
File "/Users/home/miniconda3/lib/python3.5/site-packages/pandas/core/internals.py", line 3234, in get
raise ValueError("cannot label index with a null key")
ValueError: cannot label index with a null key

Categories

Resources