I'm trying to improve a working Bernoulli Naive Bayes model by wrapping it in bagging.
But when I cross-validate the BaggingClassifier, I get a very unexpected ZeroDivisionError coming from parallel.py.
I've tried changing every parameter I know of and restarting Python, but nothing worked.
Here is a reproducible example with a binarized iris target:
#%% run
import numpy as np
from sklearn.model_selection import GridSearchCV
from sklearn.ensemble import BaggingClassifier
from sklearn.naive_bayes import BernoulliNB
from sklearn.datasets import load_iris

data = load_iris()
data.targetbin = (data.target != 0).astype("int")  # binarize the target: class 0 vs. the rest

param_grid2 = {'max_samples': np.linspace(0.5, 1.0, 3),
               'base_estimator__alpha': np.linspace(0.1, 1, 3),
               'base_estimator__binarize': [*np.linspace(0.0, 1, 3)],
               'base_estimator__fit_prior': [True, False]}
param_grid2 = {'max_samples': [0.7]}  # overrides the grid above; even this single-point grid reproduces the error

clf = GridSearchCV(
    BaggingClassifier(
        BernoulliNB(),
        n_estimators=10, max_features=0.5),
    param_grid2,
    scoring="accuracy",
    verbose=-1)
clf.fit(data.data, data.targetbin)
And here is the stack trace of my error:
[Parallel(n_jobs=1)]: Using backend SequentialBackend with 1 concurrent workers.
Traceback (most recent call last):
File "", line 33, in <module>
clf.fit(data.data, data.targetbin)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 722, in fit
self._run_search(evaluate_candidates)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 1191, in _run_search
evaluate_candidates(ParameterGrid(self.param_grid))
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 711, in evaluate_candidates
cv.split(X, y, groups)))
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 917, in __call__
if self.dispatch_one_batch(iterator):
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 759, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 716, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 184, in apply_async
callback(result)
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 306, in __call__
self.parallel.print_progress()
File "C:\Users\Dan\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 806, in print_progress
if (is_last_item or cursor % frequency):
ZeroDivisionError: integer division or modulo by zero
What am I doing wrong?
I tried to debug the library and found that self.verbose in sklearn/externals/joblib/parallel.py is -1, even though it is supposed to be at least 0 by default. With a negative verbose the frequency used by print_progress can work out to 0, and cursor % frequency then raises the ZeroDivisionError. So I think it's a bug.
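If that diagnosis is right, a minimal workaround (a sketch of my own, not an official fix) is simply to pass a non-negative verbose value to GridSearchCV, since 0 is its documented default:
# Hedged sketch: the same grid search as above, but with verbose=0 so joblib's
# print_progress never computes a zero frequency.
clf = GridSearchCV(
    BaggingClassifier(
        BernoulliNB(),
        n_estimators=10, max_features=0.5),
    param_grid2,
    scoring="accuracy",
    verbose=0)
clf.fit(data.data, data.targetbin)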
I'm currently implementing machine learning using SMOTE from imblearn.over_sampling, and as I synthesize data for it I see a very sharp cutoff at which the SMOTE method breaks. When I synthesize data using the following code and run it through SMOTE (courtesy of Jason Brownlee):
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=15, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
It works fine. However, when the number of features is 16...
from imblearn.over_sampling import SMOTE
from sklearn.datasets import make_classification
X, y = make_classification(n_samples=10000, n_features=16, n_redundant=0,
                           n_clusters_per_class=1, weights=[0.99], flip_y=0, random_state=1)
oversample = SMOTE()
X, y = oversample.fit_resample(X, y)
SMOTE breaks. Why is this? Does anyone know of a SMOTE variant that works for more than 15 features? By SMOTE breaking, I mean I get the error below:
Traceback (most recent call last):
File "\\arete\shared\Los Angeles\Users\Active\bbonifacio\New ADVANCE\untitled1.py", line 13, in <module>
X, y = oversample.fit_resample(X, y)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\imblearn\base.py", line 83, in fit_resample
output = self._fit_resample(X, y)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\imblearn\over_sampling\_smote\base.py", line 324, in _fit_resample
nns = self.nn_k_.kneighbors(X_class, return_distance=False)[:, 1:]
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\sklearn\neighbors\_base.py", line 763, in kneighbors
results = PairwiseDistancesArgKmin.compute(
File "sklearn\metrics\_pairwise_distances_reduction.pyx", line 691, in sklearn.metrics._pairwise_distances_reduction.PairwiseDistancesArgKmin.compute
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\sklearn\utils\fixes.py", line 151, in threadpool_limits
return threadpoolctl.threadpool_limits(limits=limits, user_api=user_api)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 171, in __init__
self._original_info = self._set_threadpool_limits()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 268, in _set_threadpool_limits
modules = _ThreadpoolInfo(prefixes=self._prefixes,
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 340, in __init__
self._load_modules()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 373, in _load_modules
self._find_modules_with_enum_process_module_ex()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 485, in _find_modules_with_enum_process_module_ex
self._make_module_from_path(filepath)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 515, in _make_module_from_path
module = module_class(filepath, prefix, user_api, internal_api)
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 606, in __init__
self.version = self.get_version()
File "C:\Users\bbonifacio\Anaconda3\lib\site-packages\threadpoolctl.py", line 646, in get_version
config = get_config().split()
AttributeError: 'NoneType' object has no attribute 'split'
And here are the package versions:
Sklearn: 1.1.1
Imblearn: 0.9.1
Threadpoolctl: 2.1.0
Big thanks to rickhg12hs for this answer!
The solution is to update threadpoolctl. It was not working with my version, 2.1.0, but it works with the updated version. If anyone else is having this problem, type
pip install -U threadpoolctl
in your command terminal, and it should be fixed. Happy coding!
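As a quick sanity check (a minimal sketch, not part of the original answer), you can confirm which threadpoolctl version Python actually picks up after the upgrade:
# Hedged sketch: print the threadpoolctl version seen by the interpreter;
# the AttributeError above was observed with 2.1.0.
import threadpoolctl
print(threadpoolctl.__version__)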
I am trying to predict toxic comments using the Toxic Comment data from Kaggle:
import skmultilearn, sys
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
y = csr_matrix(data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']])
binary_rel_clf = BinaryRelevance(MultinomialNB())
binary_rel_clf.fit(Xfeatures,y)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [y.columns.values[prediction].tolist() for prediction in br_prediction]
print(predictions)
However, I got this error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\base\base.py", line 86, in _ensure_input_format
return X.toarray()
File "...\scipy\sparse\compressed.py", line 1031, in toarray
out = self._process_toarray_args(order, out)
File "...\scipy\sparse\base.py", line 1202, in _process_toarray_args
return np.zeros(self.shape, dtype=self.dtype, order=order)
numpy.core._exceptions._ArrayMemoryError: Unable to allocate 226. GiB for an array with shape (159571, 189775) and data type float64
And even if I try to pass the param "require_dense=False", I get another error:
Traceback (most recent call last):
File "...\multi_label_toxic.py", line 15, in <module>
binary_rel_clf.fit(Xfeatures,y)
File "...\skmultilearn\problem_transform\br.py", line 161, in fit
classifier.fit(self._ensure_input_format(
File "...\sklearn\naive_bayes.py", line 612, in fit
X, y = self._check_X_y(X, y)
File "...\sklearn\naive_bayes.py", line 477, in _check_X_y
return self._validate_data(X, y, accept_sparse='csr')
File "...\sklearn\base.py", line 433, in _validate_data
X, y = check_X_y(X, y, **check_params)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 826, in check_X_y
y = column_or_1d(y, warn=True)
File "...\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "...\sklearn\utils\validation.py", line 864, in column_or_1d
raise ValueError(
ValueError: y should be a 1d array, got an array of shape () instead.
How can I fix that and train the model on the entire dataset?
It seems that you specified the require_dense argument incorrectly. You need require_dense=[False, True] in order to keep the X values in sparse format while passing the y values as dense. In the second-to-last line (predictions = ...) you need the DataFrame of labels from before it was converted to a sparse matrix, so that you can access the column names.
The following code should work.
from sklearn.feature_extraction.text import TfidfVectorizer
import pandas as pd
from scipy.sparse import csr_matrix, issparse
from sklearn.naive_bayes import MultinomialNB
from skmultilearn.problem_transform import BinaryRelevance
import numpy as np
data_frame = pd.read_csv('data/train.csv')
corpus = data_frame['comment_text']
tfidf = TfidfVectorizer()
Xfeatures = csr_matrix(tfidf.fit_transform(corpus))
cats = data_frame[['toxic','severe_toxic','obscene','threat','insult','identity_hate']]
y = csr_matrix(cats)
binary_rel_clf = BinaryRelevance(MultinomialNB(), require_dense=[False, True])
binary_rel_clf.fit(Xfeatures, y)
predict_text = ['fuck die shit moron suck']
X_predict = tfidf.transform(predict_text)
br_prediction = binary_rel_clf.predict(X_predict)
br_prediction = br_prediction.toarray().astype(bool)
predictions = [cats.columns[prediction].tolist() for prediction in br_prediction]
print(predictions)
Output:
[['toxic', 'obscene', 'insult']]
I'm trying to tune the alpha parameter of a Multinomial Naive Bayes classifier on the 20newsgroups dataset. This is my code so far:
from sklearn.datasets import fetch_20newsgroups
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_extraction.text import TfidfTransformer
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.metrics import classification_report
from sklearn.model_selection import GridSearchCV
import numpy as np
# Divide dataset
dataset_train = fetch_20newsgroups(subset='train', shuffle=True)
dataset_test = fetch_20newsgroups(subset='test', shuffle=True)
text_clf = Pipeline([('vect', CountVectorizer()),
                     ('tfidf', TfidfTransformer(sublinear_tf=True)),
                     ('clf', MultinomialNB())])
param_grid = {'tfidf__use_idf': (True, False),
              'clf__alpha': np.linspace(0.001, 1, 100)}
grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='precision', cv = None)
# Training
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
#prediction
predicted = text_clf.predict(dataset_test.data)
print("NB Accuracy:", 100*np.mean(predicted == dataset_test.target), '%')
print(classification_report(dataset_test.target, predicted, target_names=dataset_train.target_names))
print("Best estimator for alpha in order to get precision ", grid_search.best_estimator_)
The problem is I'm getting the following error:
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
Traceback (most recent call last):
File "<ipython-input-12-d478372ef22a>", line 1, in <module>
runfile('C:/Users/omarl/Downloads/new_NB.py', wdir='C:/Users/omarl/Downloads')
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 705, in runfile
execfile(filename, namespace)
File "C:\Users\omarl\Anaconda3\lib\site-packages\spyder\utils\site\sitecustomize.py", line 102, in execfile
exec(compile(f.read(), filename, 'exec'), namespace)
File "C:/Users/omarl/Downloads/new_NB.py", line 28, in <module>
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py", line 639, in fit
cv.split(X, y, groups)))
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 779, in __call__
while self.dispatch_one_batch(iterator):
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 625, in dispatch_one_batch
self._dispatch(tasks)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 588, in _dispatch
job = self._backend.apply_async(batch, callback=cb)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 111, in apply_async
result = ImmediateResult(func)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\_parallel_backends.py", line 332, in __init__
self.results = batch()
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in __call__
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\externals\joblib\parallel.py", line 131, in <listcomp>
return [func(*args, **kwargs) for func, args, kwargs in self.items]
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py", line 458, in _fit_and_score
estimator.fit(X_train, y_train, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 248, in fit
Xt, fit_params = self._fit(X, y, **fit_params)
File "C:\Users\omarl\Anaconda3\lib\site-packages\sklearn\pipeline.py", line 197, in _fit
step, param = pname.split('__', 1)
ValueError: not enough values to unpack (expected 2, got 1)
I have no clue why this is happening, because from the code I have reviewed so far this should work. I also searched the scikit-learn website but didn't find anything. Thanks.
In this line:
text_clf = grid_search.fit(dataset_train.data,dataset_train.target, average=None)
average=None is being interpreted as a fit_param, which is not what you intend.
After removing this, you will get this error:
ValueError: Target is multiclass but average='binary'. Please choose another average setting.
This is because the default 'precision' scorer uses average='binary', which is not defined in the multi-class setting. If you change your scoring parameter to 'accuracy', the code works.
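A minimal sketch of the corrected call, assuming the rest of the pipeline stays as in the question: drop average=None from fit() and use a multi-class-safe scoring value.
# Hedged sketch: average=None removed (it is not a fit parameter of the pipeline),
# and scoring switched to 'accuracy', which is defined for multi-class targets.
grid_search = GridSearchCV(text_clf, param_grid=param_grid, scoring='accuracy', cv=None)
text_clf = grid_search.fit(dataset_train.data, dataset_train.target)
predicted = text_clf.predict(dataset_test.data)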
I have fitted a Random Forest Classifier on my dataset, which contains 7 features and about 1 million rows.
Following is my code.
randForestClassifier = RandomForestClassifier(n_estimators=10, max_depth=3)
randForestClassifier.fit(X_train, y)
pred = randForestClassifier.predict(featues_test)
I am getting a MemoryError when I use the predict method of my classifier. How do I fix it?
Following is my complete log:
randForestClassifier.predict(featues_test)
Traceback (most recent call last):
File "<ipython-input-15-0b7612d6e958>", line 1, in <module>
randForestClassifier.predict(featues_test)
File "C:\Python27\lib\site-packages\sklearn\ensemble\forest.py", line 462, in predict
proba = self.predict_proba(X)
File "C:\Python27\lib\site-packages\sklearn\ensemble\forest.py", line 513, in predict_proba
for e in self.estimators_)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 659, in __call__
self.dispatch(function, args, kwargs)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 406, in dispatch
job = ImmediateApply(func, args, kwargs)
File "C:\Python27\lib\site-packages\sklearn\externals\joblib\parallel.py", line 140, in __init__
self.results = func(*args, **kwargs)
File "C:\Python27\lib\site-packages\sklearn\ensemble\forest.py", line 106, in _parallel_helper
return getattr(obj, methodname)(*args, **kwargs)
File "C:\Python27\lib\site-packages\sklearn\tree\tree.py", line 592, in predict_proba
proba = self.tree_.predict(X)
File "sklearn/tree/_tree.pyx", line 3207, in sklearn.tree._tree.Tree.predict (sklearn\tree\_tree.c:24468)
File "sklearn/tree/_tree.pyx", line 3209, in sklearn.tree._tree.Tree.predict (sklearn\tree\_tree.c:24340)
MemoryError
Yes, you are getting the MemoryError at randForestClassifier.predict(featues_test), as shown by the stack trace:
File "<ipython-input-15-0b7612d6e958>", line 1, in <module>
randForestClassifier.predict(featues_test)
The remaining lines of the stack trace show that the problem comes from sklearn, in the C code: sklearn\tree\_tree.c:24340
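One common way to work around this kind of MemoryError (a sketch of my own, not part of the quoted answer) is to predict in smaller batches, so the test features and per-tree probability arrays are never materialized for all rows at once:
import numpy as np

# Hedged sketch: split the test features into chunks, predict chunk by chunk,
# then stitch the predictions back together. featues_test is the array from the question.
chunks = np.array_split(featues_test, 100)
pred = np.concatenate([randForestClassifier.predict(chunk) for chunk in chunks])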
I am writing a digit recognition program in Python. The basic code is as follows:
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
filteredColumns = delete_useless_columns()  # user-defined helper (not shown) that selects the pixel columns to keep
train = pd.read_csv('C:\\Users\\abchauhan\\Downloads\\train.csv')
trainData = train.loc[0:24998, filteredColumns]
target = train['label']
targetData = target[0:24999]
rf = RandomForestClassifier(n_estimators=150, min_samples_split=2, n_jobs=-1)
x = trainData/255 #Feature scaling
print('Fitting the data')
rf.fit(x, targetData)
The feature scaling line gives the error TypeError: Could not operate 255 with block values. If I remove the RandomForestClassifier import statement, the feature scaling works fine, but then the program is obviously of no use. Why does the division work without the import statement?
Edit:
trainData.info() is as follows:
<class 'pandas.core.frame.DataFrame'>
Int64Index: 24999 entries, 0 to 24998
Columns: 708 entries, pixel12 to pixel779
dtypes: int64(708)
memory usage: 135.2 MB
None
Stack Trace is as follows:
Traceback (most recent call last):
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 965, in eval
result = get_result(other)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 949, in get_result
return self._try_coerce_result(func(values, other))
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\ops.py", line 765, in na_op
op, str_rep, x, y, raise_on_error=True, **eval_kwargs)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\computation\expressions.py", line 218, in evaluate
**eval_kwargs)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\computation\expressions.py", line 71, in _evaluate_standard
return op(a, b)
MemoryError
During handling of the above exception, another exception occurred:
Traceback (most recent call last):
File "C:/Users/abchauhan/PycharmProjects/DigitRecognition/PreProcess/RandomForest.py", line 20, in <module>
x = trainData/255
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\ops.py", line 831, in f
return self._combine_const(other, na_op)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\frame.py", line 3111, in _combine_const
new_data = self._data.eval(func=func, other=other, raise_on_error=raise_on_error)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 2478, in eval
return self.apply('eval', **kwargs)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 2457, in apply
applied = getattr(b, f)(**kwargs)
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 972, in eval
result = handle_error()
File "C:\Python34\lib\site-packages\pandas-0.15.2-py3.4-win32.egg\pandas\core\internals.py", line 956, in handle_error
% (repr(other), str(detail)))
TypeError: Could not operate 255 with block values
Process finished with exit code 1
This is really a memory error, as indicated by the first exception in your stack trace (MemoryError); pandas catches it in handle_error() and re-raises it as the TypeError you see.
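A minimal sketch of one way to shrink the memory footprint of that scaling step (my own suggestion, not part of the answer above): cast the pixel columns to float32 before dividing, so the scaled copy takes half the space of the default float64 result.
import numpy as np

# Hedged sketch: scale in float32 instead of float64 to halve the memory
# needed for the intermediate result of trainData / 255.
x = trainData.astype(np.float32) / 255
rf.fit(x, targetData)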