random forest calculation with Huge Sparse Data - python

I am trying to fit a random forest on huge, sparse multilabel data. The dataset has 94 targets; some of them are barely used (2 out of 650000) and some of them aren't used at all. I run out of RAM (32 GB), so I can't fit everything in one go. I therefore followed this guide for "batching" a random forest:
https://stats.stackexchange.com/questions/327335/batch-learning-w-random-forest-sklearn
When I tried to predict something I got the error (below).
So I tried a different approach: Calculating a random forest on dataparts and merging them afterwards:
# Train one single-tree forest per mini-batch and merge it into a growing
# ensemble with combine(); forest_model_final ends up holding the merged forest.
forest_model = None
forest_model_final = None
start = time.time()
for e in range(5): # 5 passes through the data
print("Epoch:", e)
for batch_index, (X, y) in enumerate(dataloader_dict['Train_and_Validation']):
# A fresh one-tree forest is created for every batch (warm_start is off),
# so each tree is fitted on that batch alone.
forest_model = RandomForestClassifier(warm_start = False, n_estimators = 1, n_jobs=parameters['num_workers'])
# Convert the batch to numpy and drop the length-1 axis at position 1.
X = np.squeeze(X.numpy(), axis=1)
y = np.squeeze(y.numpy(), axis=1)
# Binarise the targets: 1 where the value exceeds the threshold, 0 otherwise.
y_one_hot = np.array(y > parameters['threshold'], dtype=int)
forest_model.fit(X,y_one_hot)
# NOTE(review): presumably a batch in which a target never fires yields a tree
# that knows a single class for that target, while other batches yield two;
# that would match the (50,1) vs (50,2) broadcast error reported for
# predict_proba on the merged forest - confirm.
if forest_model_final is not None:
forest_model_final = combine([forest_model_final, forest_model])
else:
forest_model_final = forest_model
end = time.time()
print("Time (s): %s"%(end-start))
def combine(all_ensembles):
    """Merge the sub-estimators of several fitted ensembles into one ensemble.

    The result is a shallow copy of the first ensemble whose ``estimators_``
    list is replaced by the concatenation, in input order, of every input
    ensemble's ``estimators_``; ``n_estimators`` is set to the new total.
    Only those two attributes are updated: everything else is taken over
    unchanged from the first ensemble. The input ensembles are not modified.

    >>> from sklearn.datasets import load_iris
    >>> from sklearn.ensemble import ExtraTreesClassifier
    >>> iris = load_iris()
    >>> X, y = iris.data, iris.target
    >>> all_ensembles = [ExtraTreesClassifier(n_estimators=4).fit(X, y)
    ...                  for i in range(3)]
    >>> big = combine(all_ensembles)
    >>> len(big.estimators_)
    12
    >>> big.n_estimators
    12
    >>> big.score(X, y)
    1.0
    """
    merged = copy(all_ensembles[0])
    # Build a brand-new list so the first ensemble's own list is left untouched.
    pooled = [tree for member in all_ensembles for tree in member.estimators_]
    merged.estimators_ = pooled
    # Required in old versions of sklearn
    merged.n_estimators = len(pooled)
    return merged
I get the same error when I try to predict something with the calculated random forest.
Error:
ValueError: non-broadcastable output operand with shape (50,1) doesn't match the broadcast shape (50,2)
I found a similar question, "Unexpected exception when combining random forest trees", but I don't understand what I should do now.
Full Traceback:
ValueError Traceback (most recent call last)
<ipython-input-10-4f8ce9181286> in <module>
7 yval = np.squeeze(yval.numpy(), axis=1)
8 y_one_hot = yval > parameters['threshold']
----> 9 yval_pred = forest_model_final.predict_proba(Xval)
10 #Todo stuff
11 acc_batch = accuracy_score(y_one_hot, yval_pred)
~/anaconda3/envs/column-labeling/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X)
667 delayed(_accumulate_prediction)(e.predict_proba, X, all_proba,
668 lock)
--> 669 for e in self.estimators_)
670
671 for proba in all_proba:
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self, iterable)
1014
1015 with self._backend.retrieval_context():
-> 1016 self.retrieve()
1017 # Make sure that we get a last message telling us we are done
1018 elapsed_time = time.time() - self._start_time
~/.local/lib/python3.6/site-packages/joblib/parallel.py in retrieve(self)
906 try:
907 if getattr(self._backend, 'supports_timeout', False):
--> 908 self._output.extend(job.get(timeout=self.timeout))
909 else:
910 self._output.extend(job.get())
~/anaconda3/envs/column-labeling/lib/python3.6/multiprocessing/pool.py in get(self, timeout)
642 return self._value
643 else:
--> 644 raise self._value
645
646 def _set(self, i, obj):
~/anaconda3/envs/column-labeling/lib/python3.6/multiprocessing/pool.py in worker(inqueue, outqueue, initializer, initargs, maxtasks, wrap_exception)
117 job, i, func, args, kwds = task
118 try:
--> 119 result = (True, func(*args, **kwds))
120 except Exception as e:
121 if wrap_exception and func is not _helper_reraises_exception:
~/.local/lib/python3.6/site-packages/joblib/_parallel_backends.py in __call__(self, *args, **kwargs)
598 def __call__(self, *args, **kwargs):
599 try:
--> 600 return self.func(*args, **kwargs)
601 except KeyboardInterrupt:
602 # We capture the KeyboardInterrupt and reraise it as
~/.local/lib/python3.6/site-packages/joblib/parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/.local/lib/python3.6/site-packages/joblib/parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~/anaconda3/envs/column-labeling/lib/python3.6/site-packages/sklearn/ensemble/_forest.py in _accumulate_prediction(predict, X, out, lock)
453 else:
454 for i in range(len(out)):
--> 455 out[i] += prediction[i]
456
457
ValueError: non-broadcastable output operand with shape (50,1) doesn't match the broadcast shape (50,2)

Related

Is it possible to use both MLPRegressor() and make_pipeline() within GridSearchCV()?

I'm trying to use make_pipeline() from scikit-learn along with GridSearchCV(). The pipeline is simple and only includes two steps, a StandardScaler() and an MLPRegressor(). The GridSearchCV() is also pretty simple, with the slight wrinkle that I'm using TimeSeriesSplit() for cross-validation.
The error I'm getting is as follows:
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),('mlpregressor', MLPRegressor())]). Check the list of available parameters with estimator.get_params().keys().
Can someone help me understand how I can rectify this problem so I can use the make_pipeline() framework with both GridSearchCV() and MLPRegressor()?
from sklearn.neural_network import MLPRegressor
...: from sklearn.preprocessing import StandardScaler
...: from sklearn.model_selection import TimeSeriesSplit, GridSearchCV
...: from sklearn.pipeline import make_pipeline
...: import numpy as np
In [2]: tscv = TimeSeriesSplit(n_splits = 5)
In [3]: pipe = make_pipeline(StandardScaler(), MLPRegressor())
In [4]: param_grid = {'MLPRegressor__hidden_layer_sizes': [(16,16,), (64,64,), (
...: 128,128,)], 'MLPRegressor__activation': ['identity', 'logistic', 'tanh',
...: 'relu'],'MLPRegressor__solver': ['adam','sgd']}
In [5]: grid = GridSearchCV(pipe, param_grid = param_grid, cv = tscv)
In [6]: features = np.random.random([1000,10])
In [7]: target = np.random.normal(0,10,1000)
In [8]: grid.fit(features, target)
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-8-7233f9f2005e> in <module>
----> 1 grid.fit(features, target)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
61 extra_args = len(args) - len(all_args)
62 if extra_args <= 0:
---> 63 return f(*args, **kwargs)
64
65 # extra_args > 0
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in fit(self, X, y, groups, **fit_params)
839 return results
840
--> 841 self._run_search(evaluate_candidates)
842
843 # multimetric is determined here because in the case of a callable
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in _run_search(self, evaluate_candidates)
1294 def _run_search(self, evaluate_candidates):
1295 """Search all candidates in param_grid"""
-> 1296 evaluate_candidates(ParameterGrid(self.param_grid))
1297
1298
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_search.py in evaluate_candidates(candidate_params, cv, more_results)
793 n_splits, n_candidates, n_candidates * n_splits))
794
--> 795 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
796 X, y,
797 train=train, test=test,
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self, iterable)
1039 # remaining jobs.
1040 self._iterating = False
-> 1041 if self.dispatch_one_batch(iterator):
1042 self._iterating = self._original_iterator is not None
1043
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in dispatch_one_batch(self, iterator)
857 return False
858 else:
--> 859 self._dispatch(tasks)
860 return True
861
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in _dispatch(self, batch)
775 with self._lock:
776 job_idx = len(self._jobs)
--> 777 job = self._backend.apply_async(batch, callback=cb)
778 # A job can complete so quickly than its callback is
779 # called before we get here, causing self._jobs to
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in __call__(self)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/joblib/parallel.py in <listcomp>(.0)
260 # change the default number of processes to -1
261 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 262 return [func(*args, **kwargs)
263 for func, args, kwargs in self.items]
264
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/fixes.py in __call__(self, *args, **kwargs)
220 def __call__(self, *args, **kwargs):
221 with config_context(**self.config):
--> 222 return self.function(*args, **kwargs)
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/model_selection/_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, split_progress, candidate_progress, error_score)
584 cloned_parameters[k] = clone(v, safe=False)
585
--> 586 estimator = estimator.set_params(**cloned_parameters)
587
588 start_time = time.time()
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/pipeline.py in set_params(self, **kwargs)
148 self
149 """
--> 150 self._set_params('steps', **kwargs)
151 return self
152
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/utils/metaestimators.py in _set_params(self, attr, **params)
52 self._replace_estimator(attr, name, params.pop(name))
53 # 3. Step parameters and other initialisation arguments
---> 54 super().set_params(**params)
55 return self
56
~/opt/miniconda3/envs/practice/lib/python3.9/site-packages/sklearn/base.py in set_params(self, **params)
228 key, delim, sub_key = key.partition('__')
229 if key not in valid_params:
--> 230 raise ValueError('Invalid parameter %s for estimator %s. '
231 'Check the list of available parameters '
232 'with `estimator.get_params().keys()`.' %
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.
Solution
Yes. Make the pipeline first. Then treat the pipeline as your model and pass it to GridSearchCV.
Your problem is in the following line (you had it mislabeled):
Replace MLPRegressor__ with mlpregressor__.
The Fix:
The pipeline named_step for MLPRegressor estimator was mislabeled as MLPRegressor__ in the param_grid.
Changing it to mlpregressor__ fixed the problem.
You may run and check it in this colab notebook.
# INCORRECT
param_grid = {
'MLPRegressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'MLPRegressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'MLPRegressor__solver': ['adam', 'sgd'],
}
# CORRECTED
param_grid = {
'mlpregressor__hidden_layer_sizes': [(16, 16,), (64, 64,), (128, 128,)],
'mlpregressor__activation': ['identity', 'logistic', 'tanh', 'relu'],
'mlpregressor__solver': ['adam', 'sgd'],
}
Note
The key to understanding what was wrong here was to look at the last two lines of the error stack.
ValueError: Invalid parameter MLPRegressor for estimator Pipeline(steps=[('standardscaler', StandardScaler()),
('mlpregressor', MLPRegressor())]). Check the list of available parameters with `estimator.get_params().keys()`.

Weird shape when I use randomized search cv

I am learning RandomizedSearchCV with a toy example. Suppose that I want to build a linear model y = ax + b. I wrote a custom sklearn estimator that looks like the following:
import numpy as np
# import the base estimator
from sklearn.base import BaseEstimator, RegressorMixin
# Toy estimator for the linear model y = alpha*x + beta; alpha and beta are
# plain constructor parameters, not values learned from the data.
class testEstimator(BaseEstimator,RegressorMixin):
def __init__(self, alpha=1, beta=0):
self.alpha = alpha
self.beta = beta
# fit() evaluates alpha*x + beta for every training sample and stores the
# result in mu_; y is accepted but never used.
def fit(self, X, y=None):
mu = np.ones((len(X)))
for ii in range(len(X)):
mu[ii] = self.alpha*X[ii] + self.beta
self.mu_ = mu
return self
# NOTE: predict() ignores its X argument and returns the mu_ stored by fit(),
# so the result always has the length of the training data, not of X.
def predict(self, X):
try:
# Only probes that fit() has been called; the value itself is discarded.
getattr(self,"mu_")
except:
raise RuntimeError("You must train classifer before predicting data!")
return self.mu_
# score() prints y and mu_, then returns the sum of squared residuals between
# y and predict(X) divided by len(X). Because predict() returns training-sized
# values, y (test-sized) and the prediction can differ in length here.
def score(self, X, y):
print("y: ", y)
print("mu: ", self.mu_)
return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
Then, I need to test this.
# Reproduction script: randomized search over alpha/beta on ten points
# generated from y = 2*x + 4.
# temp estimator
tempEs = testEstimator()
# temp params grid
params_grid_temp = {'alpha': [0,1,2,3,4,5,6], 'beta': [0,1,2,3,4]}
# test randomizedSearchCV
temp = RandomizedSearchCV(tempEs, params_grid_temp)
# define X,y
# NOTE: X is a plain range object here, not a numpy array.
X = range(10)
y = np.dot(2, range(10)) + 4
# fit model
temp.fit(X,y)
However, I got the error
ValueError Traceback (most recent call last)
<ipython-input-8-72a46fdf9098> in <module>
9 y = np.dot(2, range(10)) + 4
10 # fit model
---> 11 temp.fit(X,y)
~\Anaconda3\lib\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
734 return results
735
--> 736 self._run_search(evaluate_candidates)
737
738 # For multi-metric evaluation, store the best_index_, best_params_ and
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1527 def _run_search(self, evaluate_candidates):
1528 """Search n_iter candidates from param_distributions"""
-> 1529 evaluate_candidates(ParameterSampler(
1530 self.param_distributions, self.n_iter,
1531 random_state=self.random_state))
~\Anaconda3\lib\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
706 n_splits, n_candidates, n_candidates * n_splits))
707
--> 708 out = parallel(delayed(_fit_and_score)(clone(base_estimator),
709 X, y,
710 train=train, test=test,
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self, iterable)
1027 # remaining jobs.
1028 self._iterating = False
-> 1029 if self.dispatch_one_batch(iterator):
1030 self._iterating = self._original_iterator is not None
1031
~\Anaconda3\lib\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
845 return False
846 else:
--> 847 self._dispatch(tasks)
848 return True
849
~\Anaconda3\lib\site-packages\joblib\parallel.py in _dispatch(self, batch)
763 with self._lock:
764 job_idx = len(self._jobs)
--> 765 job = self._backend.apply_async(batch, callback=cb)
766 # A job can complete so quickly than its callback is
767 # called before we get here, causing self._jobs to
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
206 def apply_async(self, func, callback=None):
207 """Schedule a func to be run"""
--> 208 result = ImmediateResult(func)
209 if callback:
210 callback(result)
~\Anaconda3\lib\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
570 # Don't delay the application, to avoid keeping the input
571 # arguments in memory
--> 572 self.results = batch()
573
574 def get(self):
~\Anaconda3\lib\site-packages\joblib\parallel.py in __call__(self)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\joblib\parallel.py in <listcomp>(.0)
250 # change the default number of processes to -1
251 with parallel_backend(self._backend, n_jobs=self._n_jobs):
--> 252 return [func(*args, **kwargs)
253 for func, args, kwargs in self.items]
254
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
558 else:
559 fit_time = time.time() - start_time
--> 560 test_scores = _score(estimator, X_test, y_test, scorer)
561 score_time = time.time() - start_time - fit_time
562 if return_train_score:
~\Anaconda3\lib\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
605 scores = scorer(estimator, X_test)
606 else:
--> 607 scores = scorer(estimator, X_test, y_test)
608
609 error_msg = ("scoring must return a number, got %s (%s) "
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
88 *args, **kwargs)
89 else:
---> 90 score = scorer(estimator, *args, **kwargs)
91 scores[name] = score
92 return scores
~\Anaconda3\lib\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
370 def _passthrough_scorer(estimator, *args, **kwargs):
371 """Function that wraps estimator.score"""
--> 372 return estimator.score(*args, **kwargs)
373
374
<ipython-input-7-0c2138d9bf96> in score(self, X, y)
20 print("y: ", y)
21 print("mu: ", self.mu_)
---> 22 return np.dot(y - self.predict(X), y - self.predict(X))/len(X)
ValueError: operands could not be broadcast together with shapes (2,) (8,)
I figured out (2,) means the size of y and (8,) means the size of self.mu_. How does this happen? They are supposed to be 10.
Okay, I found the problem. Your predict method is wrong: it must return the predicted values for the X it is given instead of returning mu_, which was computed from the training data in fit.
def predict(self, X):
    """Return the linear prediction ``alpha * X + beta`` for the given samples.

    X is converted with ``np.asarray`` first, so plain sequences such as a
    list or a ``range`` are handled element-wise. Without the conversion,
    ``alpha * X`` on a list repeats the list (or raises TypeError) instead
    of scaling its values.

    Returns a numpy array with one prediction per entry of X.
    """
    return self.alpha * np.asarray(X) + self.beta
That's it. You can also optimize the code in your fit method.

ValueError: Buffer dtype mismatch, expected 'double' but got Python object

I am testing an open-source code but I keep running into this error. I am passing a crf model into cross_val_predict and that's where the error starts. Pystruct is being used for the crf model. The crf model is built from sparse matrices which I think is causing the problem but I am not entirely sure.
This is the code:
def evaluate_dataset(config):
    """Cross-validate the configured model on one dataset and summarise the run.

    Reads ``folds``, ``clf``, ``dir`` and the optional ``sample_size`` from
    *config*, hands the entries of the directory to
    ``SectionPreprocess.preprocess`` and runs ``cross_val_predict`` on the
    resulting X and y.

    Returns a dict with the number of samples, the mean of the per-sample
    label means, the estimator parameters, the fold count, the elapsed time
    divided by the fold count, and (mean, std) of accuracy, precision,
    recall and F1 computed per (true, predicted) pair.
    """
    # Parameters
    n_folds = config['folds']
    model = config['clf']
    limit = config.get('sample_size')
    files = list(Path(config['dir']).iterdir())

    loader = SectionPreprocess(ground_truth=True)
    _paths, X, y = loader.preprocess(files, sample_size=limit)

    # Dataset information
    results = {
        'num_binaries': len(y),
        'avg_code_fraction': np.mean([np.mean(labels) for labels in y]),
        'params': model.get_params(),
        'cv_folds': n_folds,
    }

    # Cross-validation
    t0 = time.time()
    y_pred = cross_val_predict(model, X, y, cv=n_folds, n_jobs=1)
    elapsed = time.time() - t0
    results['cv_time'] = elapsed / n_folds

    # One (accuracy, precision, recall, f1, support) tuple per sample.
    per_sample = []
    for truth, guess in zip(y, y_pred):
        acc = metrics.accuracy_score(truth, guess)
        prf = metrics.precision_recall_fscore_support(truth, guess, average='binary')
        per_sample.append((acc,) + prf)

    accuracy, precision, recall, f1, _ = zip(*per_sample)
    named_scores = (
        ('accuracy', accuracy),
        ('precision', precision),
        ('recall', recall),
        ('f1', f1),
    )
    results['cv_metrics'] = {
        label: (np.mean(scores), np.std(scores)) for label, scores in named_scores
    }
    return results
# Evaluate every entry of `configurations` and write each result dict to
# evaluation/<name>.json.
evaluation_folder = Path('evaluation')
evaluation_folder.mkdir(exist_ok=True)
for (name, config) in configurations.items():
print(config)
# `%time` is an IPython magic, so this line only runs inside IPython/Jupyter.
%time results = evaluate_dataset(config)
print("## " + name)
print(results)
print('')
file_path = evaluation_folder / (name + '.json')
with file_path.open('w') as f:
json.dump(results, f)
And this is the error:
ValueError Traceback (most recent call last)
in
in evaluate_dataset(config)
24
25 start = time.time()
---> 26 y_pred = cross_val_predict(crf, X, y, cv=folds, n_jobs=1)
27 end = time.time()
28 results['cv_time'] = (end - start) / folds
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
399 prediction_blocks = parallel(delayed(_fit_and_predict)(
400 clone(estimator), X, y, train, test, verbose, fit_params, method)
--> 401 for train, test in cv_iter)
402
403 # Concatenate the predictions
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self, iterable)
756 # was dispatched. In particular this covers the edge
757 # case of Parallel used with an exhausted iterator.
--> 758 while self.dispatch_one_batch(iterator):
759 self._iterating = True
760 else:
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in dispatch_one_batch(self, iterator)
606 return False
607 else:
--> 608 self._dispatch(tasks)
609 return True
610
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in _dispatch(self, batch)
569 dispatch_timestamp = time.time()
570 cb = BatchCompletionCallBack(dispatch_timestamp, len(batch), self)
--> 571 job = self._backend.apply_async(batch, callback=cb)
572 self._jobs.append(job)
573
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in apply_async(self, func, callback)
107 def apply_async(self, func, callback=None):
108 """Schedule a func to be run"""
--> 109 result = ImmediateResult(func)
110 if callback:
111 callback(result)
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/_parallel_backends.py in __init__(self, batch)
324 # Don't delay the application, to avoid keeping the input
325 # arguments in memory
--> 326 self.results = batch()
327
328 def get(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in __call__(self)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/externals/joblib/parallel.py in (.0)
129
130 def __call__(self):
--> 131 return [func(*args, **kwargs) for func, args, kwargs in self.items]
132
133 def __len__(self):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/sklearn/model_selection/_validation.py in _fit_and_predict(estimator, X, y, train, test, verbose, fit_params, method)
472 estimator.fit(X_train, **fit_params)
473 else:
--> 474 estimator.fit(X_train, y_train, **fit_params)
475 func = getattr(estimator, method)
476 predictions = func(X_test)
~/Google Drive (gdk244#nyu.edu)/MoMa Lab /CRF/code_section_identification/crf_models/CRFModel.py in fit(self, X, y, **fit_params)
74 verbose=self.verbose
75 )
---> 76 self._ssvm.fit(X, y)
77
78 return self
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in fit(self, X, Y, constraints, initialize)
295 self._frank_wolfe_batch(X, Y)
296 else:
--> 297 self._frank_wolfe_bc(X, Y)
298 except KeyboardInterrupt:
299 pass
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/learners/frankwolfe_ssvm.py in _frank_wolfe_bc(self, X, Y)
220 i = perm[j]
221 x, y = X[i], Y[i]
--> 222 y_hat, delta_joint_feature, slack, loss = find_constraint(self.model, x, y, w)
223 # ws and ls
224 ws = delta_joint_feature * self.C
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/utils/inference.py in find_constraint(model, x, y, w, y_hat, relaxed, compute_difference)
63
64 if y_hat is None:
---> 65 y_hat = model.loss_augmented_inference(x, y, w, relaxed=relaxed)
66 joint_feature = model.joint_feature
67 if getattr(model, 'rescale_C', False):
~/.pyenv/versions/pythonvenv3.6.1/lib/python3.6/site-packages/pystruct/models/crf.py in loss_augmented_inference(self, x, y, w, relaxed, return_energy)
104 pairwise_potentials = self._get_pairwise_potentials(x, w)
105 edges = self._get_edges(x)
--> 106 loss_augment_unaries(unary_potentials, np.asarray(y), self.class_weight)
107
108 return inference_dispatch(unary_potentials, pairwise_potentials, edges,
utils.pyx in utils.loss_augment_unaries (src/utils.c:5132)()
ValueError: Buffer dtype mismatch, expected 'double' but got Python object

Problem when branching Sklearn Pipiline into a GridSearchCV

I'm trying to build a pipeline with my own functions. To do so I inherited BaseEstimator and TransformerMixin from sklearn base and defined my own transform methods.
When I do pipeline.fit(X,y), it works fine.
The problem is when I try to create a GridSearchCV object with the pipeline. I get the following error:
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36).
730 is just the number of rows of matrix X divided by 'cv' = 2, the number of folds I chose for the cross-validation in the GridSearchCV.
I have no idea how to debug that. I've tried some prints in the middle of my functions, and the result is pretty weird.
I'm attaching the functions I created as well as the pipeline. I'd be really glad if someone could help.
Here are the functions I created for the Pipeline:
from sklearn.base import BaseEstimator, TransformerMixin
# Pipeline step that fills missing values: categorical columns (dtype
# 'category' or 'object') with strategies[0], all remaining columns with
# strategies[1].
class MissingData(BaseEstimator, TransformerMixin):
# fit() learns nothing; both imputers are created and fitted inside transform()
# on whatever data transform() receives.
def fit( self, X, y = None ):
return self
def transform(self, X , y = None, strategies = ( "most_frequent", "mean") ):
print('Started MissingData')
# Work on a copy so the caller's frame is not modified.
X_ = X.copy()
#Categorical variables handling
categorical_variables = list(X_.select_dtypes(include=['category','object']))
imp_category = SimpleImputer(strategy = strategies[0])
# NOTE(review): the imputed array is wrapped in a new pd.DataFrame before being
# assigned back. Presumably that frame carries a fresh default index, so the
# assignment aligns on labels and can leave NaNs when X_ has a different index
# (e.g. a cross-validation fold) - this would explain the non-zero count
# printed after "Finished MissingData"; confirm.
X_[categorical_variables] = pd.DataFrame(imp_category.fit_transform(X_[categorical_variables]))
#Numeric variables handling
numerical_variables = list(set(X_.columns) - set(categorical_variables))
imp_numerical = SimpleImputer(strategy = strategies[1])
X_[numerical_variables] = pd.DataFrame(imp_numerical.fit_transform(X_[numerical_variables]))
print('Finished MissingData')
# Despite the 'Inf: ' label, this prints the total number of null cells.
print('Inf: ',X_.isnull().sum().sum())
return X_
# Pipeline step that replaces every categorical column with dummy columns
# produced by pd.get_dummies (first level dropped).
class OHEncode(BaseEstimator, TransformerMixin):
# fit() learns nothing; the dummy columns are derived inside transform() from
# the data passed to it.
def fit(self, X, y = None ):
return self
# Returns df with `feature_to_encode` replaced by its dummy columns, which are
# appended at the end of the frame.
def encode_and_drop_original_and_first_dummy(self,df, feature_to_encode):
dummies = pd.get_dummies(df[feature_to_encode] , prefix = feature_to_encode, drop_first=True) #Drop first equals true will take care of the dummies variables trap
res = pd.concat([df, dummies], axis=1)
res = res.drop([feature_to_encode], axis=1)
return(res)
def transform(self, X , y = None, categorical_variables = None ):
X_ = X.copy()
# Default: encode every column of dtype 'category' or 'object'.
if categorical_variables == None:
categorical_variables = list(X_.select_dtypes(include=['category','object']))
print('Started Encoding')
#Let's update the matrix X with the one hot encoded version of all features in categorical_variables
# NOTE(review): the dummy columns depend on the values present in this X, so
# two different inputs (e.g. train fold vs. test fold) can presumably end up
# with different column counts - this would match the (730,36) vs (228,)
# mismatch raised by the scaler in the traceback; confirm.
for feature_to_encode in categorical_variables:
X_ = self.encode_and_drop_original_and_first_dummy(X_ , feature_to_encode)
print('Finished Encoding')
# Despite the 'Inf: ' label, this prints the total number of null cells.
print('Inf: ',X_.isnull().sum().sum())
return X_
Here is the Pipeline with the GridSearchCV:
# Pipeline: impute -> one-hot encode -> standardise -> PCA -> linear regression,
# grid-searched over the number of PCA components with 2-fold cross-validation.
pca = PCA(n_components=10)
# NOTE: the final step is named 'rf' but holds a LinearRegression.
pipeline = Pipeline([('MissingData', MissingData()), ('OHEncode', OHEncode()) ,
('scaler', StandardScaler()) , ('pca', pca), ('rf', LinearRegression())])
# The grid overrides the n_components=10 given to PCA above.
parameters = {'pca__n_components': [5, 15, 30, 45, 64]}
grid = GridSearchCV(pipeline, param_grid=parameters, cv = 2)
grid.fit(X, y)
And finally here is the full output including my prints and the error:
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
Started MissingData
Finished MissingData
Inf: 0
Started Encoding
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:765: RuntimeWarning: invalid value encountered in true_divide
updated_mean = (last_sum + new_sum) / updated_sample_count
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\extmath.py:706: RuntimeWarning: Degrees of freedom <= 0 for slice.
result = op(x, *args, **kwargs)
C:\Users\menoci\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py:536: FitFailedWarning: Estimator fit failed. The score on this train-test partition for these parameters will be set to nan. Details:
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
FitFailedWarning)
Finished Encoding
Inf: 0
Started MissingData
Finished MissingData
Inf: 57670
Started Encoding
Finished Encoding
Inf: 26280
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-67-f78b56dad89d> in <module>
15
16 #pipeline.set_params(rf__n_estimators = 50)
---> 17 grid.fit(X, y)
18
19 #rf_val_predictions = pipeline.predict(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
710 return results
711
--> 712 self._run_search(evaluate_candidates)
713
714 # For multi-metric evaluation, store the best_index_, best_params_ and
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in _run_search(self, evaluate_candidates)
1151 def _run_search(self, evaluate_candidates):
1152 """Search all candidates in param_grid"""
-> 1153 evaluate_candidates(ParameterGrid(self.param_grid))
1154
1155
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in evaluate_candidates(candidate_params)
689 for parameters, (train, test)
690 in product(candidate_params,
--> 691 cv.split(X, y, groups)))
692
693 if len(out) < 1:
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self, iterable)
1005 self._iterating = self._original_iterator is not None
1006
-> 1007 while self.dispatch_one_batch(iterator):
1008 pass
1009
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in dispatch_one_batch(self, iterator)
833 return False
834 else:
--> 835 self._dispatch(tasks)
836 return True
837
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in _dispatch(self, batch)
752 with self._lock:
753 job_idx = len(self._jobs)
--> 754 job = self._backend.apply_async(batch, callback=cb)
755 # A job can complete so quickly than its callback is
756 # called before we get here, causing self._jobs to
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in apply_async(self, func, callback)
207 def apply_async(self, func, callback=None):
208 """Schedule a func to be run"""
--> 209 result = ImmediateResult(func)
210 if callback:
211 callback(result)
~\AppData\Roaming\Python\Python37\site-packages\joblib\_parallel_backends.py in __init__(self, batch)
588 # Don't delay the application, to avoid keeping the input
589 # arguments in memory
--> 590 self.results = batch()
591
592 def get(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in __call__(self)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\joblib\parallel.py in <listcomp>(.0)
254 with parallel_backend(self._backend, n_jobs=self._n_jobs):
255 return [func(*args, **kwargs)
--> 256 for func, args, kwargs in self.items]
257
258 def __len__(self):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _fit_and_score(estimator, X, y, scorer, train, test, verbose, parameters, fit_params, return_train_score, return_parameters, return_n_test_samples, return_times, return_estimator, error_score)
542 else:
543 fit_time = time.time() - start_time
--> 544 test_scores = _score(estimator, X_test, y_test, scorer)
545 score_time = time.time() - start_time - fit_time
546 if return_train_score:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_validation.py in _score(estimator, X_test, y_test, scorer)
589 scores = scorer(estimator, X_test)
590 else:
--> 591 scores = scorer(estimator, X_test, y_test)
592
593 error_msg = ("scoring must return a number, got %s (%s) "
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in __call__(self, estimator, *args, **kwargs)
87 *args, **kwargs)
88 else:
---> 89 score = scorer(estimator, *args, **kwargs)
90 scores[name] = score
91 return scores
~\AppData\Roaming\Python\Python37\site-packages\sklearn\metrics\_scorer.py in _passthrough_scorer(estimator, *args, **kwargs)
369 def _passthrough_scorer(estimator, *args, **kwargs):
370 """Function that wraps estimator.score"""
--> 371 return estimator.score(*args, **kwargs)
372
373
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in score(self, X, y, sample_weight)
611 Xt = X
612 for _, name, transform in self._iter(with_final=False):
--> 613 Xt = transform.transform(Xt)
614 score_params = {}
615 if sample_weight is not None:
~\AppData\Roaming\Python\Python37\site-packages\sklearn\preprocessing\_data.py in transform(self, X, copy)
804 else:
805 if self.with_mean:
--> 806 X -= self.mean_
807 if self.with_std:
808 X /= self.scale_
ValueError: operands could not be broadcast together with shapes (730,36) (228,) (730,36)
First, I would advise you to use the OneHotEncoder (OHE) class from sklearn. Then, in the constructor of OHEncode, create an OHE object and fit it on all the categorical values you have (so that every category has been "seen" at each GridSearch iteration). Then, in the transform function of OHEncode, apply transform using that OHE object.
DON'T fit the OHE object inside the fit function, because then you will get the same error: at each GridSearch iteration, both the fit and transform functions are applied.

ValueError when running MaskedAutoregressiveFlow example

I am trying to run the example for MaskedAutoregressiveFlow at https://www.tensorflow.org/api_docs/python/tf/contrib/distributions/bijectors/MaskedAutoregressiveFlow. It's a plain copy from the docs but I receive the following error. I've tried event_shape=[dims, 1] but that doesn't seem to help (different error). I'm not sure what to make of it.
Has anyone seen this as well?
import tensorflow as tf
import tensorflow.contrib.distributions as tfd
from tensorflow.contrib.distributions import bijectors as tfb
dims = 5
# A common choice for a normalizing flow is to use a Gaussian for the base
# distribution. (However, any continuous distribution would work.) E.g.,
maf = tfd.TransformedDistribution(
distribution=tfd.Normal(loc=0., scale=1.),
bijector=tfb.MaskedAutoregressiveFlow(
shift_and_log_scale_fn=tfb.masked_autoregressive_default_template(
hidden_layers=[512, 512])),
event_shape=[dims])
x = maf.sample() # Expensive; uses `tf.while_loop`, no Bijector caching.
maf.log_prob(x) # Almost free; uses Bijector caching.
maf.log_prob(0.) # Cheap; no `tf.while_loop` despite no Bijector caching.
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-2-3b2fcb2af309> in <module>()
11
12
---> 13 x = maf.sample() # Expensive; uses `tf.while_loop`, no Bijector caching.
14 maf.log_prob(x) # Almost free; uses Bijector caching.
15 maf.log_prob(0.) # Cheap; no `tf.while_loop` despite no Bijector caching.
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/distribution.py in sample(self, sample_shape, seed, name)
687 samples: a `Tensor` with prepended dimensions `sample_shape`.
688 """
--> 689 return self._call_sample_n(sample_shape, seed, name)
690
691 def _log_prob(self, value):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/transformed_distribution.py in _call_sample_n(self, sample_shape, seed, name, **kwargs)
411 # work, it is imperative that this is the last modification to the
412 # returned result.
--> 413 y = self.bijector.forward(x, **kwargs)
414 y = self._set_sample_static_shape(y, sample_shape)
415
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/bijector_impl.py in forward(self, x, name)
618 NotImplementedError: if `_forward` is not implemented.
619 """
--> 620 return self._call_forward(x, name)
621
622 def _inverse(self, y):
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/distributions/bijector_impl.py in _call_forward(self, x, name, **kwargs)
599 if mapping.y is not None:
600 return mapping.y
--> 601 mapping = mapping.merge(y=self._forward(x, **kwargs))
602 self._cache(mapping)
603 return mapping.y
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in _forward(self, x)
245 y0 = array_ops.zeros_like(x, name="y0")
246 # call the template once to ensure creation
--> 247 _ = self._shift_and_log_scale_fn(y0)
248 def _loop_body(index, y0):
249 """While-loop body for autoregression calculation."""
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py in __call__(self, *args, **kwargs)
358 custom_getter=self._custom_getter) as vs:
359 self._variable_scope = vs
--> 360 result = self._call_func(args, kwargs)
361 return result
362
/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py in _call_func(self, args, kwargs)
300 trainable_at_start = len(
301 ops.get_collection(ops.GraphKeys.TRAINABLE_VARIABLES))
--> 302 result = self._func(*args, **kwargs)
303
304 if self._variables_created:
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in _fn(x)
478 activation=activation,
479 *args,
--> 480 **kwargs)
481 x = masked_dense(
482 inputs=x,
/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py in masked_dense(inputs, units, num_blocks, exclusive, kernel_initializer, reuse, name, *args, **kwargs)
386 *args,
387 **kwargs)
--> 388 return layer.apply(inputs)
389
390
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in apply(self, inputs, *args, **kwargs)
807 Output tensor(s).
808 """
--> 809 return self.__call__(inputs, *args, **kwargs)
810
811 def _add_inbound_node(self,
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in __call__(self, inputs, *args, **kwargs)
671
672 # Check input assumptions set before layer building, e.g. input rank.
--> 673 self._assert_input_compatibility(inputs)
674 if input_list and self._dtype is None:
675 try:
/usr/local/lib/python3.6/dist-packages/tensorflow/python/layers/base.py in _assert_input_compatibility(self, inputs)
1195 ', found ndim=' + str(ndim) +
1196 '. Full shape received: ' +
-> 1197 str(x.get_shape().as_list()))
1198 # Check dtype.
1199 if spec.dtype is not None:
ValueError: Input 0 of layer dense_1 is incompatible with the layer: : expected min_ndim=2, found ndim=1. Full shape received: [5]
originally defined at:
File "<ipython-input-2-3b2fcb2af309>", line 9, in <module>
hidden_layers=[512, 512])),
File "/usr/local/lib/python3.6/dist-packages/tensorflow/contrib/distributions/python/ops/bijectors/masked_autoregressive.py", line 499, in masked_autoregressive_default_template
"masked_autoregressive_default_template", _fn)
File "/usr/local/lib/python3.6/dist-packages/tensorflow/python/ops/template.py", line 152, in make_template
**kwargs)

Categories

Resources