RandomForestClassifier throwing estimator error - python

I am attempting to build a stacking classifier using multiple combinations of available models, however, when I have a RandomForestClassifier the loop throws an error. Here is what I have attempted:
'RandomForestClassifier' object has no attribute 'estimators_'. Did you mean: 'estimator_'?
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split
# Load the breast-cancer dataset and hold out a test split.
data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

# Candidate models; each one is tried both as a base learner and as the
# final (meta) estimator.
RF = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.5)
RF1 = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.2, oob_score=True)
ABC = AdaBoostClassifier(random_state=1250)
GBC = GradientBoostingClassifier(random_state=1250)
stackModels = [RF, RF1, GBC, ABC]

# StackingClassifier validates `estimators` as a list of (string, estimator)
# tuples. Passing the bare estimator objects is what raised the
# "'RandomForestClassifier' object has no attribute 'estimators_'" error
# quoted in the traceback below, so every model is given a unique name here.
named_models = [('rf', RF), ('rf_oob', RF1), ('gbc', GBC), ('abc', ABC)]

from itertools import combinations

# Every unordered pair of named base models.
classifier_combinations = [list(pair) for pair in combinations(named_models, 2)]

# The key was misspelled 'final_estimaor', which made the last append in the
# loop raise KeyError.
Stackresults = {'estimators': [], 'final_estimator': [], 'accuracy': []}
for list_class in classifier_combinations:
    for classify in stackModels:
        CLASS = StackingClassifier(estimators=list_class, final_estimator=classify)
        CLASS.fit(X_train, y_train)
        ypred = CLASS.predict(X_test)
        accuracy = accuracy_score(y_test, ypred)
        Stackresults['accuracy'].append(accuracy)
        Stackresults['estimators'].append(list_class)
        Stackresults['final_estimator'].append(classify)
FULL TRACEBACK:
/var/folders/dr/9wh_z8y10fl79chj86pq7knc0000gn/T/ipykernel_7755/3533362225.py in <module>
24 for classify in stackModels:
25 CLASS = StackingClassifier(estimators = list_class, final_estimator=classify)
---> 26 CLASS.fit(X_train, y_train)
27 ypred = CLASS.predict(X_test)
28 accuracy = accuracy_score(y_test, ypred)
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
486 self._le = LabelEncoder().fit(y)
487 self.classes_ = self._le.classes_
--> 488 return super().fit(X, self._le.transform(y), sample_weight)
489
490 #if_delegate_has_method(delegate="final_estimator_")
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
148 # all_estimators contains all estimators, the one to be fitted and the
149 # 'drop' string.
--> 150 names, all_estimators = self._validate_estimators()
151 self._validate_final_estimator()
152
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_base.py in _validate_estimators(self)
245 " of (string, estimator) tuples."
...
--> 188 return iter(self.estimators_)
189
190
AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'

Related

NotFittedError - Titanic Project Kaggle

I am trying different machine learning projects from Kaggle to make myself better. Here is the model that I am using:
from sklearn.ensemble import RandomForestClassifier
# Target column and the feature subset used for training.
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
# Dummy-encode the selected features for the train and test frames.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
# NOTE(review): this rebinds the `fit` attribute to the tuple (X, y) instead of
# calling fit(X, y), so the model is never trained -- this is the line behind
# the NotFittedError reported below.
model.fit = (X, y)
predictions = model.predict(X_test)
# Collect the passenger ids and predictions and write them to a CSV file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index = False)
print('Your submission was successfully saved!')
Here is the error I get:
---------------------------------------------------------------------------
NotFittedError Traceback (most recent call last)
/tmp/ipykernel_33/1528591149.py in <module>
9 forest_clf = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
10 forest_clf.fit = (X, y)
---> 11 predictions = forest_clf.predict(X_test)
12
13 output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict(self, X)
806 The predicted classes.
807 """
--> 808 proba = self.predict_proba(X)
809
810 if self.n_outputs_ == 1:
/opt/conda/lib/python3.7/site-packages/sklearn/ensemble/_forest.py in predict_proba(self, X)
846 classes corresponds to that in the attribute :term:`classes_`.
847 """
--> 848 check_is_fitted(self)
849 # Check data
850 X = self._validate_X_predict(X)
/opt/conda/lib/python3.7/site-packages/sklearn/utils/validation.py in check_is_fitted(estimator, attributes, msg, all_or_any)
1220
1221 if not fitted:
-> 1222 raise NotFittedError(msg % {"name": type(estimator).__name__})
1223
1224
NotFittedError: This RandomForestClassifier instance is not fitted yet. Call 'fit' with appropriate arguments before using this estimator.
I think this is an example of the estimator cloning itself, but I am not sure which line is the issue here. This is the Titanic project that is seen on Kaggle, whose tutorial code I have copied amidst trying to learn. Any help is appreciated.
As @Blackgaurd pointed out, just change `model.fit = (X, y)` to `model.fit(X, y)`.
Your current code overwrites the fit method of your Random Forest Classifier.
Full code of yours with correction:
from sklearn.ensemble import RandomForestClassifier
# Target column and the feature subset used for training.
y = train_data["Survived"]
features = ["Pclass", "Sex", "SibSp", "Parch"]
# Dummy-encode the selected features for the train and test frames.
X = pd.get_dummies(train_data[features])
X_test = pd.get_dummies(test_data[features])
model = RandomForestClassifier(n_estimators = 100, max_depth = 5, random_state = 1)
model.fit(X, y) # <- fixed line: fit is now called instead of being assigned to
predictions = model.predict(X_test)
# Collect the passenger ids and predictions and write them to a CSV file.
output = pd.DataFrame({'PassengerId': test_data.PassengerId, 'Survived': predictions})
output.to_csv('submission.csv', index = False)
print('Your submission was successfully saved!')

What does the error "AttributeError: lower not found" mean in a classification project?

I have a Python project, written in a Jupyter notebook, that deals with a classification task on an unbalanced dataset. To handle the imbalance I used SMOTE, but when I split the dataset and created a pipeline to train a machine learning model, the code crashed and displayed the error below:
--------------------------------------------------------------------------- AttributeError Traceback (most recent call last) <ipython-input-17-7ae8518f1892> in <module>
15 ('clf',MultinomialNB()), # model classifier
16 ])
---> 17 nb.fit(x_train,y_train)
f:\AIenv\lib\site-packages\sklearn\pipeline.py in fit(self, X, y,
**fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
f:\AIenv\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y,
**fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
f:\AIenv\lib\site-packages\joblib\memory.py in __call__(self, *args,
**kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
f:\AIenv\lib\site-packages\sklearn\pipeline.py in
_fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y) 1197 1198 vocabulary, X = self._count_vocab(raw_documents,
-> 1199 self.fixed_vocabulary_) 1200 1201 if self.binary:
f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in
_count_vocab(self, raw_documents, fixed_vocab) 1108 for doc in raw_documents: 1109 feature_counter = {}
-> 1110 for feature in analyze(doc): 1111 try: 1112 feature_idx = vocabulary[feature]
f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in
_analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
102 else:
103 if preprocessor is not None:
--> 104 doc = preprocessor(doc)
105 if tokenizer is not None:
106 doc = tokenizer(doc)
f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in
_preprocess(doc, accent_function, lower)
67 """
68 if lower:
---> 69 doc = doc.lower()
70 if accent_function is not None:
71 doc = accent_function(doc)
f:\AIenv\lib\site-packages\scipy\sparse\base.py in __getattr__(self, attr)
685 return self.getnnz()
686 else:
--> 687 raise AttributeError(attr + " not found")
688
689 def transpose(self, axes=None, copy=False):
AttributeError: lower not found
code:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE# for inbalance dataset
from sklearn.feature_extraction.text import CountVectorizer,TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB
# Load the raw dataset and add a cleaned-text column.
df = pd.read_csv("data/emotion_dataset_raw.csv")
df["clean_text"] = df["Text"].apply(clean_text)
# Vectorize the cleaned text with TF-IDF over unigrams and bigrams.
vectorizer =TfidfVectorizer(ngram_range=(1,2))
vect_df =vectorizer.fit_transform(df["clean_text"])
# Resample the vectorized features with SMOTE to balance the classes.
oversample = SMOTE(random_state = 42)
x_smote,y_smote = oversample.fit_resample(vect_df, df["Emotion"])
print("shape x before SMOTE: {}".format(vect_df.shape))
print("shape x after SMOTE: {}".format(x_smote.shape))
print("balance of targets feild %")
y_smote.value_counts(normalize = True)*100
# Output of the code above:
# shape x before SMOTE: (34792, 209330)
# shape x after SMOTE: (88360, 209330)
x_train,x_test,y_train,y_test = train_test_split(x_smote,y_smote,test_size = 0.2,random_state =42)
# Naive Bayes classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer
# NOTE(review): x_train is already the TF-IDF-vectorized, SMOTE-resampled
# matrix built above, so the CountVectorizer step receives sparse rows rather
# than raw strings -- presumably the source of "lower not found" in the
# traceback, which ends inside scipy.sparse.
nb = Pipeline([
('vect',CountVectorizer(ngram_range=(1,2))),
('tfidf',TfidfTransformer()),
('clf',MultinomialNB()), # model classifier
])
nb.fit(x_train,y_train)
Where is the error in my code, and what does it mean?
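The error occurs because x_train is already a vectorized sparse matrix (the output of TfidfVectorizer followed by SMOTE), so the CountVectorizer step in the pipeline receives sparse rows instead of raw strings and fails when it calls .lower() on them. I believe that the TfidfTransformer is good enough to generate text embeddings. You can drop the CountVectorizer and run the code again. It should work! Alternatively, do the vectorizing and resampling inside a pipeline that is fed the raw text: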
# Vectorize, oversample and classify inside one pipeline that is fed raw text.
# NOTE(review): the imports are not shown; a sampler step presumably requires
# imblearn's Pipeline rather than sklearn's -- confirm which one is in scope.
pipe = Pipeline(
[
('tfidf', TfidfVectorizer()),
('sampler', RandomOverSampler(sampling_strategy='not majority', random_state=7)),
('model', XGBClassifier())
]
)
pipe.fit(data['features'], data['labels'])

Does sklearn pipeline() feed both X and y to the following steps?

So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None as an argument for the transform() method, however, since I need to change y (i.e. remove outliers from y), I need to be able to access it. Here's my custom transformer for outlier removal.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
class OutlierExtractor1(BaseEstimator, TransformerMixin):
    """Flag outlier rows with LocalOutlierFactor and drop them from X and y.

    ``fit`` standardizes X, runs LocalOutlierFactor on the result and stores
    the per-row labels; ``transform`` keeps only the rows whose label is
    not -1.
    """

    def __init__(self):
        self.threshold = 2  # not read anywhere in this class
        self.isInlier = None  # per-row labels from fit_predict, set by fit()

    # NOTE(review): TransformerMixin.fit_transform calls ``transform(X)`` only,
    # so inside a Pipeline the required ``y`` argument is never supplied; this
    # is the TypeError shown in the traceback below. The signature is kept
    # as posted because it is the subject of the question.
    def transform(self, X, y):
        """Return ``(X, y)`` restricted to the rows whose stored label is not -1."""
        ind = [label != -1 for label in self.isInlier]
        return (X.loc[ind, :], y.loc[ind])

    def fit(self, X, y):
        """Standardize X, run LocalOutlierFactor on it and store the labels.

        ``y`` is accepted for pipeline compatibility but is not used.
        Returns ``self``.
        """
        norm = StandardScaler().fit_transform(np.asarray(X))
        # Rebuild a DataFrame so the original column names are preserved.
        normalized_X = pd.DataFrame(norm, columns=X.columns)
        lcf = LocalOutlierFactor(metric='euclidean')
        self.isInlier = list(lcf.fit_predict(normalized_X))
        return self
And here is the pipeline where I use said transformer:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
# Hyper-parameter grid for the 'rf' step of the pipeline.
space = {'rf__max_depth': [9, 11, 12, 14],
         'rf__n_estimators': [80, 90, 100]}

# Outlier removal followed by a random forest.
pipe = Pipeline([('outliers', OutlierExtractor1()),
                 ('rf', RandomForestClassifier(criterion='entropy',
                                               min_samples_split=4,
                                               min_samples_leaf=2,
                                               min_impurity_decrease=0.01,
                                               random_state=0))])

# F-beta scorer with beta=2 (this assignment was duplicated in the original).
ftwo_scorer = make_scorer(fbeta_score, beta=2)
search = GridSearchCV(pipe, param_grid=space, scoring=ftwo_scorer, cv=4, return_train_score=True, verbose=1)
search.fit(X=downsampled, y=target)
pd.DataFrame(search.cv_results_)
I get this error.
TypeError Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 search.fit(X = downsampled, y = target)
23 pd.DataFrame(search.cv_results_)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
763 refit_start_time = time.time()
764 if y is not None:
--> 765 self.best_estimator_.fit(X, y, **fit_params)
766 else:
767 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\AppData\Roaming\Python\Python37\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return self.fit(X, y, **fit_params).transform(X)
694
695
TypeError: transform() missing 1 required positional argument: 'y'
The error goes away if I set y=None; however, y is then not changed! It looks like the pipeline only feeds X to the preprocessing steps. Can someone help, please?
EDIT
The pipeline() function source code feeds X and y to the fit() method of each step, however, it only feeds X to the transform() method, so y cannot be changed.
My solution was to do the outlier extraction outside of the pipeline, and consequently outside of cross-validation, which is a bummer.
One thing about detecting outliers inside the train/test folds: bear in mind that you are working with a smaller subset, so the detection might be less accurate. If the purpose is simply exclusion, you can do that before passing the data to a pipeline.
If you do need to do this, then it makes more sense to do the outlier detection within the fit. Below is a modification of the code, following a comment by jnothman on GitHub:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier
class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
    """Classifier wrapper that drops detected outliers before fitting.

    Parameters
    ----------
    outlier_detector : estimator
        Object exposing ``fit_predict``; rows for which it returns 1 are kept.
    classifier : estimator
        Classifier that is trained on the remaining rows.
    """

    def __init__(self, outlier_detector, classifier):
        self.outlier_detector = outlier_detector
        self.classifier = classifier

    def fit(self, X, y):
        """Fit a clone of the detector, then a clone of the classifier on the kept rows."""
        # Clones are used so the estimators passed to __init__ stay unfitted.
        self.outlier_detector_ = clone(self.outlier_detector)
        mask = self.outlier_detector_.fit_predict(X, y) == 1
        self.classifier_ = clone(self.classifier).fit(X[mask], y[mask])
        return self

    def predict(self, X):
        """Predict with the fitted classifier; no rows are filtered here."""
        return self.classifier_.predict(X)
We can test this
import numpy as np
np.random.seed(111)
x = np.random.normal(0,1,(200,3))
y = np.random.binomial(1,0.5,200)
We expect 4 outliers:
# fit_predict labels outliers as -1 and inliers as 1, so count the -1 labels.
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == -1).sum()
4
I set oob_score = True to show that the classifier is trained on the subset we expect:
# Wrap a random forest so it is trained only on the rows the detector keeps.
rf = WithoutOutliersClassifier(LocalOutlierFactor(metric='euclidean'),
RandomForestClassifier(oob_score=True))
rf.fit(x,y)
# One out-of-bag row per training sample, so the first dimension shows how
# many rows the inner classifier was fitted on.
rf.classifier_.oob_decision_function_.shape
(196, 2)
Now put this into a pipeline, note the change in names of your param:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
# Parameters of the wrapped classifier are addressed through the 'rf' step and
# then the wrapper's `classifier` attribute, hence the 'rf__classifier__' prefix.
space = {'rf__classifier__max_depth': [3,4],
'rf__classifier__n_estimators' : [50,100]}
pipe = Pipeline([('scale', StandardScaler()),
('rf', rf)])
search = GridSearchCV(pipe, param_grid = space)
search.fit(X = x, y = y)

cross_val_predict is not producing the same number of variables after transforming data with a Pipeline

I'm running into a ValueError when using cross_val_predict(). For my project I want to test the performance of my model for different values of k in SelectKBest(). The maximum value of k is equal to the number of variables in my dataset (447 variables).
One of my evaluation methods is a confusion matrix based on cross-validation. Prior to creating the confusion matrix, I transform the data and tune a model using sklearn's Pipeline and RandomizedSearchCV. For this I use the following code:
from functools import partial

import pandas as pd
import numpy as np
from scipy.stats import loguniform, uniform, randint
from sklearn.ensemble import RandomForestClassifier
from sklearn.compose import ColumnTransformer
from sklearn.pipeline import Pipeline
from sklearn.model_selection import RandomizedSearchCV
from sklearn.model_selection import StratifiedKFold
from sklearn.model_selection import cross_val_predict
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import cross_validate
from sklearn.model_selection import train_test_split
from sklearn.feature_selection import SelectKBest
from sklearn.feature_selection import mutual_info_classif
from sklearn.preprocessing import MinMaxScaler
from sklearn.preprocessing import OneHotEncoder
from sklearn.metrics import plot_confusion_matrix
from sklearn.metrics import confusion_matrix
from sklearn.metrics import ConfusionMatrixDisplay
from sklearn.feature_selection import VarianceThreshold
from sklearn.preprocessing import LabelEncoder
# Shuffled 80/20 split, stratified on the target.
X_train, X_test, y_train, y_test = train_test_split(
X,
y,
test_size=0.2,
random_state=41,
stratify= y,
shuffle= True
)
# Create pipeline
# Numeric columns: min-max scale, then drop zero-variance features.
numeric_pipe = Pipeline([
('scale', MinMaxScaler()),
('variance', VarianceThreshold(0)),
])
# Nominal columns: one-hot encode, then drop columns whose variance is at or
# below 0.8 * (1 - 0.8), i.e. the variance of a Bernoulli(0.8) variable.
nominal_pipe = Pipeline([
('onehot', OneHotEncoder(
sparse= False,
drop= None,
handle_unknown='ignore',
)),
('variance', VarianceThreshold(.8 * (1 - .8)))
])
# Binary columns: the same variance filter, without any encoding.
binary_pipe = Pipeline([
('variance', VarianceThreshold(.8 * (1 - .8)))
])
# Full pipeline: per-type preprocessing -> mutual-information selection -> forest.
pipe = Pipeline([
('preprocess', ColumnTransformer([
('numeric', numeric_pipe, numeric_vars),
('nominal', nominal_pipe, nominal_vars),
('binary', binary_pipe, binary_vars)
], remainder= 'drop', n_jobs= -1)),
# NOTE(review): k is hard-coded to 447; the ValueError reported below comes
# from a fit in which only 446 features reach this step.
('mi', SelectKBest(partial(mutual_info_classif, random_state=42, n_neighbors= 10), k= 447)),
('predictor', RandomForestClassifier()),
], verbose= 2)
# Search space for the 'predictor' (RandomForestClassifier) step of the pipeline.
# Lists are sampled uniformly; scipy distributions are sampled via their rvs().
rf_params = {
    'predictor__bootstrap': [True],
    'predictor__ccp_alpha': loguniform(0.001, 0.01),
    'predictor__class_weight': [None],
    'predictor__criterion': ['gini', 'entropy'],
    # The original line ended in a stray double comma (",,"), a syntax error.
    'predictor__max_depth': randint(1, 100),
    'predictor__max_features': ['auto'],
    'predictor__max_leaf_nodes': [None],
    'predictor__max_samples': [None],
    'predictor__min_impurity_decrease': [0.0],
    'predictor__min_impurity_split': [None],
    'predictor__min_samples_leaf': loguniform(0.0001, 1),
    'predictor__min_samples_split': loguniform(0.0001, 1),
    'predictor__min_weight_fraction_leaf': [0],
    'predictor__n_estimators': randint(100, 1000),
    'predictor__n_jobs': [-1],
    'predictor__oob_score': [False],
    'predictor__random_state': [12, 78, 35, 245],
    'predictor__verbose': [0],
    'predictor__warm_start': [True],
}
# Shuffled, stratified 10-fold splitter, reused later for cross_val_predict.
cv = StratifiedKFold(n_splits= 10, shuffle= True, random_state= 42)
# Randomized search: 10 sampled configurations, scored with weighted F1;
# refit=True retrains the best configuration on the data passed to fit().
random_search = RandomizedSearchCV(
estimator= pipe,
param_distributions= rf_params,
n_iter= 10,
scoring= "f1_weighted",
n_jobs= -1,
cv= cv,
refit= True,
verbose= 2,
random_state= 42,
return_train_score= True
)
random_search.fit(X_train, y_train)
When I print the number of variables left when I transform the data I get 447:
print(random_search.best_estimator_[:-1].transform(X_train).shape[1])
Output:
447
When I try cross_val_predict:
y_train_pred = cross_val_predict(random_search.best_estimator_, X_train, y_train, cv= cv, n_jobs=-1, verbose= 2)
I get this error:
ValueError: k should be >=0, <= n_features = 446; got 447. Use k='all' to return all features.
with the following traceback:
Traceback (most recent call last):
File "/databricks/python/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 418, in _process_worker
r = call_item()
File "/databricks/python/lib/python3.7/site-packages/joblib/externals/loky/process_executor.py", line 272, in __call__
return self.fn(*self.args, **self.kwargs)
File "/databricks/python/lib/python3.7/site-packages/joblib/_parallel_backends.py", line 608, in __call__
return self.func(*args, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/joblib/parallel.py", line 256, in __call__
for func, args, kwargs in self.items]
File "/databricks/python/lib/python3.7/site-packages/joblib/parallel.py", line 256, in <listcomp>
for func, args, kwargs in self.items]
File "/databricks/python/lib/python3.7/site-packages/sklearn/model_selection/_validation.py", line 862, in _fit_and_predict
estimator.fit(X_train, y_train, **fit_params)
File "/databricks/python/lib/python3.7/site-packages/sklearn/pipeline.py", line 330, in fit
Xt = self._fit(X, y, **fit_params_steps)
File "/databricks/python/lib/python3.7/site-packages/sklearn/pipeline.py", line 296, in _fit
**fit_params_steps[name])
File "/databricks/python/lib/python3.7/site-packages/joblib/memory.py", line 355, in __call__
return self.func(*args, **kwargs)
File "/databricks/python/lib/python3.7/site-packages/sklearn/pipeline.py", line 740, in _fit_transform_one
res = transformer.fit_transform(X, y, **fit_params)
File "/databricks/python/lib/python3.7/site-packages/sklearn/base.py", line 693, in fit_transform
return self.fit(X, y, **fit_params).transform(X)
File "/databricks/python/lib/python3.7/site-packages/sklearn/feature_selection/_univariate_selection.py", line 352, in fit
self._check_params(X, y)
File "/databricks/python/lib/python3.7/site-packages/sklearn/feature_selection/_univariate_selection.py", line 525, in _check_params
% (X.shape[1], self.k))
ValueError: k should be >=0, <= n_features = 446; got 447. Use k='all' to return all features.
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
<command-1173964809015648> in <module>
----> 1 y_train_pred = cross_val_predict(random_search.best_estimator_, X_train, y_train, cv= cv, n_jobs=-1, verbose= 2)
/databricks/python/lib/python3.7/site-packages/sklearn/utils/validation.py in inner_f(*args, **kwargs)
71 FutureWarning)
72 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 73 return f(**kwargs)
74 return inner_f
75
/databricks/python/lib/python3.7/site-packages/sklearn/model_selection/_validation.py in cross_val_predict(estimator, X, y, groups, cv, n_jobs, verbose, fit_params, pre_dispatch, method)
771 prediction_blocks = parallel(delayed(_fit_and_predict)(
772 clone(estimator), X, y, train, test, verbose, fit_params, method)
--> 773 for train, test in cv.split(X, y, groups))
774
775 # Concatenate the predictions
/databricks/python/lib/python3.7/site-packages/joblib/parallel.py in __call__(self, iterable)
1015
1016 with self._backend.retrieval_context():
-> 1017 self.retrieve()
1018 # Make sure that we get a last message telling us we are done
1019 elapsed_time = time.time() - self._start_time
/databricks/python/lib/python3.7/site-packages/joblib/parallel.py in retrieve(self)
907 try:
908 if getattr(self._backend, 'supports_timeout', False):
--> 909 self._output.extend(job.get(timeout=self.timeout))
910 else:
911 self._output.extend(job.get())
/databricks/python/lib/python3.7/site-packages/joblib/_parallel_backends.py in wrap_future_result(future, timeout)
560 AsyncResults.get from multiprocessing."""
561 try:
--> 562 return future.result(timeout=timeout)
563 except LokyTimeoutError:
564 raise TimeoutError()
/usr/lib/python3.7/concurrent/futures/_base.py in result(self, timeout)
433 raise CancelledError()
434 elif self._state == FINISHED:
--> 435 return self.__get_result()
436 else:
437 raise TimeoutError()
/usr/lib/python3.7/concurrent/futures/_base.py in __get_result(self)
382 def __get_result(self):
383 if self._exception:
--> 384 raise self._exception
385 else:
386 return self._result
ValueError: k should be >=0, <= n_features = 446; got 447. Use k='all' to return all features.
However when I use cross_validate or cross_val_score everything works well:
y_train_pred = cross_validate(random_search.best_estimator_, X_train, y_train, scoring = "f1_weighted", cv= cv, n_jobs=-1, verbose= 2)
y_train_pred = cross_val_score(random_search.best_estimator_, X_train, y_train, scoring = "f1_weighted", cv= cv, n_jobs=-1, verbose= 2)
I have solved the problem by first transforming the data and then applying cross_val_predict(), but this is not the desired solution as it creates data leakage:
model = random_search.best_estimator_
y_train_pred = cross_val_predict(model[-1], model[:-1].transform(X_train), y_train, cv= cv, n_jobs=-1, verbose= 2)
To my understanding, the error is caused by the variance threshold: the variances change from fold to fold under 10-fold cross-validation, which results in a different number of variables.
But why no error appears in cross_val_score() and cross_validate()? Am I missing something, am I doing something wrong or is it a bug?
Unfortunately I am not able to share my data due to a Non Disclosure Agreement. I tried to replicate it using make_classification() but without any luck.

Get 'function' object has no attribute 'loss' when doing GridsearchCV

I want to try GridsearchCV on my model, my import is :
from keras import models
from keras import layers
from keras import regularizers
from sklearn.model_selection import GridSearchCV
from keras.wrappers.scikit_learn import KerasClassifier
my code is:
def build_model(X_train=X_train, neurons=4, optimizer='Adam'):
    """Build and compile a small binary-classification network.

    The first Dense layer has as many units as ``X_train`` has columns,
    followed by a hidden layer of ``neurons`` units and a single sigmoid
    output. Both ReLU layers use L2 regularization (0.001) and are followed
    by batch normalization.

    Note: the default for ``X_train`` is bound when the function is defined.
    """
    model = models.Sequential()
    model.add(layers.Dense(X_train.shape[1], kernel_regularizer=regularizers.l2(0.001),
                           activation='relu', input_shape=(X_train.shape[1],)))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(neurons, kernel_regularizer=regularizers.l2(0.001), activation='relu'))
    model.add(layers.BatchNormalization())
    model.add(layers.Dense(1, activation='sigmoid'))
    model.compile(optimizer=optimizer, loss='binary_crossentropy', metrics=['accuracy'])
    # NOTE(review): this returns the function object itself rather than the
    # compiled `model`; it is the line behind "'function' object has no
    # attribute 'loss'" below. Kept as posted because it is the subject of the
    # question -- the fix is `return model`.
    return build_model
# Wrap the model-building function so scikit-learn can treat it as an estimator.
model = KerasClassifier(build_fn=build_model, verbose=1)
# Grid of training hyper-parameters to search over.
batch_size = [16, 32, 64]
epochs = [50, 100]
param_grid = dict(batch_size=batch_size, epochs=epochs)
# Exhaustive search over the grid with 10-fold cross-validation.
grid = GridSearchCV(estimator=model,
param_grid=param_grid,
cv=10,
verbose=2)
grid_result = grid.fit(X_train, y_train)
but I get a bug as below:
AttributeError Traceback (most recent call last)
<ipython-input-93-2eb813d3aab7> in <module>
12 verbose=2) # include n_jobs=-1 if you are using CPU
13
---> 14 grid_result = grid.fit(X_train, y_train)
15
16 print(model)
/anaconda3/envs/lance/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py in fit(self, x, y, sample_weight, **kwargs)
208 if sample_weight is not None:
209 kwargs['sample_weight'] = sample_weight
--> 210 return super(KerasClassifier, self).fit(x, y, **kwargs)
211
212 def predict(self, x, **kwargs):
/anaconda3/envs/lance/lib/python3.7/site-packages/keras/wrappers/scikit_learn.py in fit(self, x, y, **kwargs)
141 self.model = self.build_fn(**self.filter_sk_params(self.build_fn))
142
--> 143 loss_name = self.model.loss
144 if hasattr(loss_name, '__name__'):
145 loss_name = loss_name.__name__
AttributeError: 'function' object has no attribute 'loss'
I can't understand what the bug is. I'm sure the data processing is correct, because training works fine without the grid search. Did I do something wrong?
At the end of the build_model function, you write return build_model. This returns a reference to the function itself, not to the model object you've been building so far. I'm pretty sure you want return model instead.

Categories

Resources