Does sklearn's Pipeline feed both X and y to the following steps? - python

So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None as an argument for the transform() method; however, since I need to change y (i.e. remove the outlier rows from y as well), I need to be able to access it. Here's my custom transformer for outlier removal.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

class OutlierExtractor1(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.threshold = 2
        self.isInlier = None

    def transform(self, X, y):
        ind = [False if i == -1 else True for i in self.isInlier]
        return (X.loc[ind, :], y.loc[ind])

    def fit(self, X, y):
        X2 = np.asarray(X)
        y2 = np.asarray(y)
        scaler = StandardScaler()
        norm = scaler.fit_transform(X2)
        normalized_X = pd.DataFrame(norm, columns=X.columns)
        lcf = LocalOutlierFactor(metric='euclidean')
        self.isInlier = list(lcf.fit_predict(normalized_X))
        return self
And here is the pipeline where I use said transformer:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

space = {'rf__max_depth': [9, 11, 12, 14],
         'rf__n_estimators': [80, 90, 100]}

pipe = Pipeline([('outliers', OutlierExtractor1()),
                 ('rf', RandomForestClassifier(criterion='entropy',
                                               min_samples_split=4,
                                               min_samples_leaf=2,
                                               min_impurity_decrease=0.01,
                                               random_state=0))])

ftwo_scorer = make_scorer(fbeta_score, beta=2)
search = GridSearchCV(pipe, param_grid=space, scoring=ftwo_scorer, cv=4,
                      return_train_score=True, verbose=1)
search.fit(X=downsampled, y=target)
pd.DataFrame(search.cv_results_)
I get this error.
TypeError Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 search.fit(X = downsampled, y = target)
23 pd.DataFrame(search.cv_results_)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
763 refit_start_time = time.time()
764 if y is not None:
--> 765 self.best_estimator_.fit(X, y, **fit_params)
766 else:
767 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\AppData\Roaming\Python\Python37\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return self.fit(X, y, **fit_params).transform(X)
694
695
TypeError: transform() missing 1 required positional argument: 'y'
The error goes away if I set y=None; however, y is not changed! It looks like the pipeline only feeds X to the transform() method of the preprocessing steps. Can someone help please?
EDIT
The Pipeline source code feeds X and y to the fit() method of each step; however, it only feeds X to the transform() method, so y cannot be changed there.
My solution was to do the outlier extraction outside of the pipeline, and consequently outside of cross-validation, which is a bummer.

One thing about detecting outliers inside the train/test split: bear in mind you are working with a smaller subset, so the detection might be less accurate. If the purpose is simply exclusion, you can do that before passing the data to the pipeline.
If you do need to do this inside cross-validation, then it makes more sense to do the outlier detection within fit. Below is a modification of the code, following a comment by jnothman on GitHub:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier

class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, outlier_detector, classifier):
        self.outlier_detector = outlier_detector
        self.classifier = classifier

    def fit(self, X, y):
        # detect outliers on the training fold only, then fit the
        # classifier on the inliers (fit_predict returns 1 for inliers)
        self.outlier_detector_ = clone(self.outlier_detector)
        mask = self.outlier_detector_.fit_predict(X, y) == 1
        self.classifier_ = clone(self.classifier).fit(X[mask], y[mask])
        return self

    def predict(self, X):
        return self.classifier_.predict(X)
We can test this:
import numpy as np
np.random.seed(111)
x = np.random.normal(0, 1, (200, 3))
y = np.random.binomial(1, 0.5, 200)
We expect 4 outliers:
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == -1).sum()
4
I set oob_score=True to show that the classifier is trained on the subset we expect (200 samples minus the 4 outliers = 196):
rf = WithoutOutliersClassifier(LocalOutlierFactor(metric='euclidean'),
                               RandomForestClassifier(oob_score=True))
rf.fit(x, y)
rf.classifier_.oob_decision_function_.shape
(196, 2)
Now put this into a pipeline. Note the change in your parameter names: the classifier is now nested inside WithoutOutliersClassifier, so the grid keys gain a classifier__ segment:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

space = {'rf__classifier__max_depth': [3, 4],
         'rf__classifier__n_estimators': [50, 100]}

pipe = Pipeline([('scale', StandardScaler()),
                 ('rf', rf)])

search = GridSearchCV(pipe, param_grid=space)
search.fit(X=x, y=y)
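A further option, not part of the original answer: the imbalanced-learn package provides a Pipeline that accepts "sampler" steps, which are allowed to resample both X and y and which run during fit only. A minimal sketch, assuming imblearn is installed (drop_outliers is a helper name introduced here), reusing x and y from above:
from imblearn import FunctionSampler
from imblearn.pipeline import Pipeline as ImbPipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.neighbors import LocalOutlierFactor

def drop_outliers(X, y):
    # keep only the rows LocalOutlierFactor labels as inliers (+1)
    mask = LocalOutlierFactor(metric='euclidean').fit_predict(X) == 1
    return X[mask], y[mask]

imb_pipe = ImbPipeline([
    ('outliers', FunctionSampler(func=drop_outliers)),
    ('rf', RandomForestClassifier(random_state=0)),
])
imb_pipe.fit(x, y)  # rows are dropped here, but never at predict time
Because the sampler is skipped at predict time, test folds keep all their rows, which is what you want for honest cross-validation scores.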

Related

RandomForestClassifier throwing estimator error

I am attempting to build a stacking classifier using multiple combinations of the available models; however, whenever a RandomForestClassifier is included, the loop throws the error below:
'RandomForestClassifier' object has no attribute 'estimators_'. Did you mean: 'estimator_'?
Here is what I have attempted:
import numpy as np
from sklearn.ensemble import RandomForestClassifier, AdaBoostClassifier, GradientBoostingClassifier, StackingClassifier
from sklearn.metrics import accuracy_score
from sklearn.datasets import load_breast_cancer
from sklearn.model_selection import train_test_split

data = load_breast_cancer()
X, y = data.data, data.target
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)

RF = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.5)
RF1 = RandomForestClassifier(n_estimators=500, random_state=1250, criterion='entropy', max_depth=2, min_impurity_decrease=0.2, oob_score=True)
ABC = AdaBoostClassifier(random_state=1250)
GBC = GradientBoostingClassifier(random_state=1250)
stackModels = [RF, RF1, GBC, ABC]

from itertools import combinations
classifier_combinations = [list(np.array(stackModels)[list(x)]) for x in list(combinations(range(len(stackModels)), 2))]

Stackresults = {'estimators': [], 'final_estimator': [], 'accuracy': []}
for list_class in classifier_combinations:
    for classify in stackModels:
        CLASS = StackingClassifier(estimators=list_class, final_estimator=classify)
        CLASS.fit(X_train, y_train)
        ypred = CLASS.predict(X_test)
        accuracy = accuracy_score(y_test, ypred)
        Stackresults['accuracy'].append(accuracy)
        Stackresults['estimators'].append(list_class)
        Stackresults['final_estimator'].append(classify)
FULL TRACEBACK:
/var/folders/dr/9wh_z8y10fl79chj86pq7knc0000gn/T/ipykernel_7755/3533362225.py in <module>
24 for classify in stackModels:
25 CLASS = StackingClassifier(estimators = list_class, final_estimator=classify)
---> 26 CLASS.fit(X_train, y_train)
27 ypred = CLASS.predict(X_test)
28 accuracy = accuracy_score(y_test, ypred)
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
486 self._le = LabelEncoder().fit(y)
487 self.classes_ = self._le.classes_
--> 488 return super().fit(X, self._le.transform(y), sample_weight)
489
490 #if_delegate_has_method(delegate="final_estimator_")
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_stacking.py in fit(self, X, y, sample_weight)
148 # all_estimators contains all estimators, the one to be fitted and the
149 # 'drop' string.
--> 150 names, all_estimators = self._validate_estimators()
151 self._validate_final_estimator()
152
~/opt/anaconda3/lib/python3.9/site-packages/sklearn/ensemble/_base.py in _validate_estimators(self)
245 " of (string, estimator) tuples."
...
--> 188 return iter(self.estimators_)
189
190
AttributeError: 'RandomForestClassifier' object has no attribute 'estimators_'
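No answer is attached here, but the truncated traceback already names the likely culprit: _validate_estimators expects a list of "(string, estimator) tuples", while the loop above passes bare estimators. A hedged sketch of the fix (the est{i} names are invented for illustration; bookkeeping appends omitted):
# StackingClassifier wants named (string, estimator) tuples, per the
# "(string, estimator) tuples" message visible in the traceback above.
named_combinations = [
    [(f'est{i}', est) for i, est in enumerate(combo)]
    for combo in classifier_combinations
]

for list_class in named_combinations:
    for classify in stackModels:
        CLASS = StackingClassifier(estimators=list_class, final_estimator=classify)
        CLASS.fit(X_train, y_train)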

What does the error "AttributeError: lower not found" mean in a classification project?

I have a Python classification project written in a Jupyter notebook. The dataset is unbalanced, so I used SMOTE; but when I tried to split the dataset and create a pipeline to train a machine learning model, the system crashed and displayed the error below:
---------------------------------------------------------------------------
AttributeError                            Traceback (most recent call last)
<ipython-input-17-7ae8518f1892> in <module>
     15     ('clf', MultinomialNB()),  # model classifier
     16 ])
---> 17 nb.fit(x_train, y_train)

f:\AIenv\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
    328         """
    329         fit_params_steps = self._check_fit_params(**fit_params)
--> 330         Xt = self._fit(X, y, **fit_params_steps)
    331         with _print_elapsed_time('Pipeline',
    332                                  self._log_message(len(self.steps) - 1)):

f:\AIenv\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
    294                 message_clsname='Pipeline',
    295                 message=self._log_message(step_idx),
--> 296                 **fit_params_steps[name])
    297             # Replace the transformer of the step with the fitted
    298             # transformer. This is necessary when loading the transformer

f:\AIenv\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
    353
    354     def __call__(self, *args, **kwargs):
--> 355         return self.func(*args, **kwargs)
    356
    357     def call_and_shelve(self, *args, **kwargs):

f:\AIenv\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
    738     with _print_elapsed_time(message_clsname, message):
    739         if hasattr(transformer, 'fit_transform'):
--> 740             res = transformer.fit_transform(X, y, **fit_params)
    741         else:
    742             res = transformer.fit(X, y, **fit_params).transform(X)

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in fit_transform(self, raw_documents, y)
   1197
   1198         vocabulary, X = self._count_vocab(raw_documents,
-> 1199                                           self.fixed_vocabulary_)
   1200
   1201         if self.binary:

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _count_vocab(self, raw_documents, fixed_vocab)
   1108         for doc in raw_documents:
   1109             feature_counter = {}
-> 1110             for feature in analyze(doc):
   1111                 try:
   1112                     feature_idx = vocabulary[feature]

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _analyze(doc, analyzer, tokenizer, ngrams, preprocessor, decoder, stop_words)
    102     else:
    103         if preprocessor is not None:
--> 104             doc = preprocessor(doc)
    105         if tokenizer is not None:
    106             doc = tokenizer(doc)

f:\AIenv\lib\site-packages\sklearn\feature_extraction\text.py in _preprocess(doc, accent_function, lower)
     67     """
     68     if lower:
---> 69         doc = doc.lower()
     70     if accent_function is not None:
     71         doc = accent_function(doc)

f:\AIenv\lib\site-packages\scipy\sparse\base.py in __getattr__(self, attr)
    685             return self.getnnz()
    686         else:
--> 687             raise AttributeError(attr + " not found")
    688
    689     def transpose(self, axes=None, copy=False):

AttributeError: lower not found
code:
import pandas as pd
import numpy as np
from imblearn.over_sampling import SMOTE  # for the imbalanced dataset
from sklearn.feature_extraction.text import CountVectorizer, TfidfVectorizer
from sklearn.model_selection import train_test_split
from sklearn.naive_bayes import MultinomialNB

df = pd.read_csv("data/emotion_dataset_raw.csv")
df["clean_text"] = df["Text"].apply(clean_text)  # clean_text is defined elsewhere in the notebook

vectorizer = TfidfVectorizer(ngram_range=(1, 2))
vect_df = vectorizer.fit_transform(df["clean_text"])

oversample = SMOTE(random_state=42)
x_smote, y_smote = oversample.fit_resample(vect_df, df["Emotion"])

print("shape x before SMOTE: {}".format(vect_df.shape))
print("shape x after SMOTE: {}".format(x_smote.shape))
print("balance of targets field %")
y_smote.value_counts(normalize=True) * 100
# the result of the code above:
# shape x before SMOTE: (34792, 209330)
# shape x after SMOTE: (88360, 209330)

x_train, x_test, y_train, y_test = train_test_split(x_smote, y_smote, test_size=0.2, random_state=42)

# Naive Bayes Classifier
from sklearn.naive_bayes import MultinomialNB
from sklearn.pipeline import Pipeline
from sklearn.feature_extraction.text import TfidfTransformer

nb = Pipeline([
    ('vect', CountVectorizer(ngram_range=(1, 2))),
    ('tfidf', TfidfTransformer()),
    ('clf', MultinomialNB()),  # model classifier
])
nb.fit(x_train, y_train)
Where is the error in my code, and what does it mean?
The AttributeError comes from the pipeline's CountVectorizer: x_train already contains TF-IDF vectors (sparse matrices), but CountVectorizer expects raw text documents and tries to call .lower() on them. I believe the TfidfVectorizer on its own is good enough to generate the text features, so you can drop the CountVectorizer, feed the pipeline raw text, and run the code again. It should work!
from imblearn.over_sampling import RandomOverSampler
from imblearn.pipeline import Pipeline  # imblearn's Pipeline allows sampler steps
from sklearn.feature_extraction.text import TfidfVectorizer
from xgboost import XGBClassifier

pipe = Pipeline(
    [
        ('tfidf', TfidfVectorizer()),
        ('sampler', RandomOverSampler(sampling_strategy='not majority', random_state=7)),
        ('model', XGBClassifier())
    ]
)
pipe.fit(data['features'], data['labels'])
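Note that a sampler step like RandomOverSampler only works inside imbalanced-learn's own Pipeline (imblearn.pipeline.Pipeline, imported above); scikit-learn's Pipeline has no notion of steps that resample y. imblearn applies the sampler during fit only, so the oversampling never leaks into predict or scoring.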

ValueError when using ColumnTransformer() in an sklearn Pipeline - using a custom Spacy class for a GloveVectorizer

I've got a dataset with multiple text columns and a target column. I'm trying to use a custom Spacy class to get GloVe embeddings for my text columns, and I'm trying to do it with a Pipeline. But I'm getting a ValueError. Following is my code:
data_features = df.copy()[["title", "description"]]
train_data, test_data, train_target, test_target = train_test_split(data_features, df['target'], test_size = 0.1)
I created this custom class to use glove embeddings. I got the code from this tutorial.
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [self.nlp(text).vector for text in X]
Loading the nlp model:
nlp = spacy.load("en_core_web_sm")
This is the column transformer that I'm trying to use in my pipeline:
col_preprocessor = ColumnTransformer(
    [
        ('title_glove', SpacyVectorTransformer(nlp), 'title'),
        ('description_glove', SpacyVectorTransformer(nlp), 'description'),
    ],
    remainder='drop',
    n_jobs=1
)
Here is my pipeline:
pipeline_glove = Pipeline([
    ('col_preprocessor', col_preprocessor),
    ('classifier', LogisticRegression())
])
When I run the fit method, I get the error that follows:
pipeline_glove.fit(train_data, train_target)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-219-8543ea744205> in <module>
----> 1 pipeline_glove.fit(train_data, train_target)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
549
550 self._update_fitted_transformers(transformers)
--> 551 self._validate_output(Xs)
552
553 return self._hstack(list(Xs))
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_output(self, result)
410 raise ValueError(
411 "The output of the '{0}' transformer should be 2D (scipy "
--> 412 "matrix, array, or pandas DataFrame).".format(name))
413
414 def _validate_features(self, n_features, feature_names):
ValueError: The output of the 'title_glove' transformer should be 2D (scipy matrix, array, or pandas DataFrame).
The error message tells you what you need to fix:
ValueError: The output of the 'title_glove' transformer should be 2D
(scipy matrix, array, or pandas DataFrame).
But what your current transformer (SpacyVectorTransformer) returns is a list. You can fix it by turning the list into a pandas DataFrame, for instance like this:
import pandas as pd

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return pd.DataFrame([self.nlp(text).vector for text in X])
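An equivalent fix, for what it's worth, would be to return a 2D NumPy array, e.g. np.asarray([self.nlp(text).vector for text in X]), since the check only asks for a scipy matrix, array, or pandas DataFrame.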
Next time, please also provide a minimal, reproducible example. In your provided code there are no imports, and there is no DataFrame called "df".

Unable to fit() a Scikit-Learn pipeline without being returned a ValueError

I need your help!
I've been getting the ValueError below when trying to fit my Pipeline.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference = self.col_1 - self.col_2
        return difference.values
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# The Pipeline object is a 2-step process: first a feature union transforming
# and combining the business features, cycle_1 features and the time feature;
# followed by fitting the transformed features into a RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Trying to fit my Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))
Some additional context: I'm building this model to have its predict_proba method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means that the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer that selects columns from a dataframe and returns the
# selection as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])
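As a quick illustration of why the custom selector is needed (the rows and column values below are made up for the example), it accepts both of the input types the grader may send:
import pandas as pd

rows = [{'OWNERSHIP': 'for profit', 'CERTIFICATION': 'medicare'},
        {'OWNERSHIP': 'non profit', 'CERTIFICATION': 'medicaid'}]

cst = ColumnSelectTransformer(['OWNERSHIP'])
print(cst.transform(pd.DataFrame(rows)))  # 2D array of shape (2, 1) from a DataFrame
print(cst.transform(rows))                # same 2D array from a list of dicts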
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
919
920 if any(sparse.issparse(f) for f in Xs):
--> 921 Xs = sparse.hstack(Xs).tocsr()
922 else:
923 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
Further, the data and metadata can be obtained here:
%%bash
mkdir data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data
Changing my TimedeltaTransformer seems to have helped:
first converting the timedeltas to a series of numbers (total seconds), and then reshaping the result with reshape(-1, 1).
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
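Note that this version still builds the timedelta from columns captured during fit; transform() ignores the X it receives, so it only behaves correctly when transform is called on exactly the rows the transformer was fitted on. The follow-up question below runs into that limitation at predict time.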

Raising ValueError (extra rows) when submitting pipeline.predict into grader

I've been getting the ValueError below when trying to submit my Pipeline to a grader, and I'm not sure where I'm supposed to shave off 12500 rows of data.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# The Pipeline object is a 2-step process: first a feature union transforming
# and combining the business features, cycle_1 features and the time feature;
# followed by fitting the transformed features into a RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Fitting my pipeline produces no error
survey_model.fit(data, cycle_2_score.astype(int))

# Calling the predict function and passing it into the grader raises a ValueError
grader.score.ml__survey_model(survey_model.predict)
The fitted pipeline looks like this
Pipeline(memory=None,
steps=[('features',
FeatureUnion(n_jobs=None,
transformer_list=[('business',
FeatureUnion(n_jobs=None,
transformer_list=[('simple',
Pipeline(memory=None,
steps=[('cst',
ColumnSelectTransformer(columns=['BEDCERT',
'RESTOT',
'INHOSP',
'CCRC_FACIL',
'SFF',
'CHOW_LAST_12MOS',
'SPRINKLER_STATUS',
'EXP_TOTAL',
'ADJ_TOTAL'])),
('imputer',
SimpleImpute...
transformer_weights=None, verbose=False)),
('forest',
RandomForestRegressor(bootstrap=True, criterion='mse',
max_depth=None, max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
Some additional context: I'm building this model to have its predict method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means that the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer that selects columns from a dataframe and returns the
# selection as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
96
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
91
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
Fixing my TimedeltaTransformer helped.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        timedelta_series = (pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col]))
        array_list = []
        for x in timedelta_series:
            array_list.append(x.total_seconds())
        return np.array(array_list).reshape(-1, 1)
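The key change is that transform() now derives the timedelta from the X it is given rather than from state captured during fit, so the transformer is stateless and works for any number of rows at predict time. For reference, a minimal sketch of the same transformer without the explicit loop (standard pandas/numpy/sklearn imports assumed; the behavior is intended to be identical):
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin

class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        delta = pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col])
        # .dt.total_seconds() converts the whole timedelta series at once
        return delta.dt.total_seconds().to_numpy().reshape(-1, 1)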
