Raising ValueError (extra rows) when submitting pipeline.predict into grader

I've been getting the ValueError below when trying to submit my Pipeline to a grader, and I'm not sure where I'm supposed to shave off the extra ~12,000 rows of data.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)
# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming
# and combining the business features, cycle_1 features as well as time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])
# Fitting my pipeline produces no error
survey_model.fit(data, cycle_2_score.astype(int))
# Calling the predict function and passing it into the grader raises a ValueError
grader.score.ml__survey_model(survey_model.predict)
The fitted pipeline looks like this
Pipeline(memory=None,
steps=[('features',
FeatureUnion(n_jobs=None,
transformer_list=[('business',
FeatureUnion(n_jobs=None,
transformer_list=[('simple',
Pipeline(memory=None,
steps=[('cst',
ColumnSelectTransformer(columns=['BEDCERT',
'RESTOT',
'INHOSP',
'CCRC_FACIL',
'SFF',
'CHOW_LAST_12MOS',
'SPRINKLER_STATUS',
'EXP_TOTAL',
'ADJ_TOTAL'])),
('imputer',
SimpleImpute...
transformer_weights=None, verbose=False)),
('forest',
RandomForestRegressor(bootstrap=True, criterion='mse',
max_depth=None, max_features='auto',
max_leaf_nodes=None,
min_impurity_decrease=0.0,
min_impurity_split=None,
min_samples_leaf=1, min_samples_split=2,
min_weight_fraction_leaf=0.0,
n_estimators=10, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
verbose=False)
Some additional context: I'm building this model to have its predict method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer that selects columns from a DataFrame and returns
# them as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values
simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features),
])
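Since the grader sends records rather than a frame, it may help to see the selector handle both input types. A minimal hedged sketch with made-up values (only the column names come from the post):
# Hypothetical records in the shape the grader sends (a list of dicts)
records = [{'BEDCERT': 90, 'RESTOT': 85.0}, {'BEDCERT': 120, 'RESTOT': 110.0}]
cst = ColumnSelectTransformer(['BEDCERT', 'RESTOT'])
print(cst.fit_transform(records))
# [[ 90.  85.]
#  [120. 110.]]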
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
96
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
91
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.

Fixing my TimedeltaTransformer helped. The original version computed the datetime difference in fit() from the training data and ignored X entirely in transform(), so at grading time it kept emitting 13892 training rows while the other FeatureUnion branches produced one row per test record (1544). Computing the difference from the X passed into transform() fixes that.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        timedelta_series = (pd.to_datetime(X[self.t1_col])
                            - pd.to_datetime(X[self.t2_col]))
        array_list = []
        for x in timedelta_series:
            array_list.append(x.total_seconds())
        return np.array(array_list).reshape(-1, 1)
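A quick hedged check (the dates are made up) that the fixed transformer's row count now follows whatever X it is given:
records = [{'CYCLE_1_SURVEY_DATE': '2017-01-15', 'CYCLE_2_SURVEY_DATE': '2018-06-01'}]
tt = TimedeltaTransformer('CYCLE_1_SURVEY_DATE', 'CYCLE_2_SURVEY_DATE')
print(tt.fit(records).transform(records).shape)  # (1, 1) -- one row per input record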


Custom Transformer to add additional column [duplicate]

I am trying to replicate my lambda function in my pipeline:
def determine_healthy(_list):
    if ('no' in _list['smoker'] and (_list['bmi'] >= 18.5) and (_list['bmi'] <= 24.9)):
        return True
    else:
        return False

df['healthy'] = df.apply(lambda row: determine_healthy(row), axis=1)
The problem comes when I integrate it into my pipeline; I'm not sure whether the issue is that an additional column 'healthy' is being added. This error is thrown when I try to transform my X_train:
from sklearn.base import BaseEstimator, TransformerMixin

class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    def __init__(self, items=None):
        if items is None: items = []
        self.l = items

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        #X = X.copy()
        temp_cols = X.columns.to_list()
        temp_cols = temp_cols.append('healthy')
        new_cols = {k: v for k, v in zip(range(len(temp_cols)), temp_cols)}
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df
num_col = ['age', 'bmi']
cat_col = ['sex', 'smoker', 'region', 'children', 'healthy']
y = df.pop('charges')
X = df
all_col = X.columns
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=42)

transform_pipeline = ColumnTransformer([
    ('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

price_pipeline = Pipeline([
    ('transform', transform_pipeline),
    ('lasso', Lasso()),
])

health_transform = HealthyAttributeAdder()
health_transform.fit_transform(X_train)
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/500623650.py in <module>
----> 1 health_transform.fit_transform(X_train)
~\Venv\hdbtest\lib\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
850 if y is None:
851 # fit method of arity 1 (unsupervised transformation)
--> 852 return self.fit(X, **fit_params).transform(X)
853 else:
854 # fit method of arity 2 (supervised transformation)
~\AppData\Local\Temp/ipykernel_19796/3713134512.py in transform(self, X)
11 temp_cols = X.columns.to_list()
12 temp_cols = temp_cols.append('healthy')
---> 13 new_cols = {k:v for k,v in zip(range(len(temp_cols)),temp_cols)}
14 healthy = X.apply(lambda row: determine_healthy(row), axis=1)
15 combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
TypeError: object of type 'NoneType' has no len()
Error when I use it to predict:
price_pipeline.fit(X_train,y_train)
y_pred = price_pipeline.predict(X_test)
---------------------------------------------------------------------------
KeyError Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3360 try:
-> 3361 return self._engine.get_loc(casted_key)
3362 except KeyError as err:
~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
~\Venv\hdbtest\lib\site-packages\pandas\_libs\index.pyx in pandas._libs.index.IndexEngine.get_loc()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
pandas\_libs\hashtable_class_helper.pxi in pandas._libs.hashtable.PyObjectHashTable.get_item()
KeyError: 'healthy'
The above exception was the direct cause of the following exception:
KeyError Traceback (most recent call last)
~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
432 for col in columns:
--> 433 col_idx = all_columns.get_loc(col)
434 if not isinstance(col_idx, numbers.Integral):
~\Venv\hdbtest\lib\site-packages\pandas\core\indexes\base.py in get_loc(self, key, method, tolerance)
3362 except KeyError as err:
-> 3363 raise KeyError(key) from err
3364
KeyError: 'healthy'
The above exception was the direct cause of the following exception:
ValueError Traceback (most recent call last)
~\AppData\Local\Temp/ipykernel_19796/993407432.py in <module>
----> 1 price_pipeline.fit(X_train,y_train)
2 y_pred = price_pipeline.predict(X_test)
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
388 """
389 fit_params_steps = self._check_fit_params(**fit_params)
--> 390 Xt = self._fit(X, y, **fit_params_steps)
391 with _print_elapsed_time("Pipeline", self._log_message(len(self.steps) - 1)):
392 if self._final_estimator != "passthrough":
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
346 cloned_transformer = clone(transformer)
347 # Fit or load from cache the current transformer
--> 348 X, fitted_transformer = fit_transform_one_cached(
349 cloned_transformer,
350 X,
~\Venv\hdbtest\lib\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
347
348 def __call__(self, *args, **kwargs):
--> 349 return self.func(*args, **kwargs)
350
351 def call_and_shelve(self, *args, **kwargs):
~\Venv\hdbtest\lib\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
891 with _print_elapsed_time(message_clsname, message):
892 if hasattr(transformer, "fit_transform"):
--> 893 res = transformer.fit_transform(X, y, **fit_params)
894 else:
895 res = transformer.fit(X, y, **fit_params).transform(X)
~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in fit_transform(self, X, y)
670 self._check_n_features(X, reset=True)
671 self._validate_transformers()
--> 672 self._validate_column_callables(X)
673 self._validate_remainder(X)
674
~\Venv\hdbtest\lib\site-packages\sklearn\compose\_column_transformer.py in _validate_column_callables(self, X)
350 columns = columns(X)
351 all_columns.append(columns)
--> 352 transformer_to_input_indices[name] = _get_column_indices(X, columns)
353
354 self._columns = all_columns
~\Venv\hdbtest\lib\site-packages\sklearn\utils\__init__.py in _get_column_indices(X, key)
439
440 except KeyError as e:
--> 441 raise ValueError("A given column is not a column of the dataframe") from e
442
443 return column_indices
ValueError: A given column is not a column of the dataframe
The first issue is actually independent of the ColumnTransformer usage; it is due to a bug in the transform method's implementation in your HealthyAttributeAdder class.
In order to get a consistent result you should modify line
temp_cols = temp_cols.append('healthy')
into
temp_cols.append('healthy')
Actually, the issue is the one described here.
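In short, list.append mutates the list in place and returns None, so the assignment throws the updated list away:
cols = ['age', 'bmi']
print(cols.append('healthy'))  # None -- append mutates in place and returns None
print(cols)                    # ['age', 'bmi', 'healthy']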
On the other hand, when you switch to ColumnTransformer, the issue is the one described either here or here, for example (you'll find other related posts, too). Namely, ColumnTransformer applies its transformers in parallel (to the X_train dataset you're passing); therefore, when it comes to one-hot encoding your categorical features, OneHotEncoder is asked to transform the 'healthy' column (as listed in cat_col), but that column is not present in X_train.
A possible way of solving the problem is to define a separate pipeline for HealthyAttributeAdder and to apply it before your ColumnTransformer instance transform_pipeline.
class HealthyAttributeAdder(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        #X = X.copy()
        temp_cols = X.columns.to_list()
        temp_cols.append('healthy')
        new_cols = {k: v for k, v in zip(range(len(temp_cols)), temp_cols)}
        healthy = X.apply(lambda row: determine_healthy(row), axis=1)
        combined_df = pd.DataFrame(np.c_[X, healthy]).rename(columns=new_cols)
        return combined_df
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.linear_model import Lasso

transform_pipeline = ColumnTransformer([
    #('healthy', HealthyAttributeAdder(), all_col),
    ('ss', StandardScaler(), num_col),
    ('ohe', OneHotEncoder(drop='first'), cat_col),
])

healthy_pipeline = Pipeline([
    ('healthy', HealthyAttributeAdder()),
])

price_pipeline = Pipeline([
    ('add_healthy', healthy_pipeline),
    ('transform', transform_pipeline),
    ('lasso', Lasso()),
])

price_pipeline.fit(X_train, y_train)
y_pred = price_pipeline.predict(X_test)
This way, the first step (add_healthy) of your price_pipeline adds the healthy column to X_train; this transformed X_train is then passed in parallel to both StandardScaler() and OneHotEncoder(), and in particular OneHotEncoder() no longer has any problem one-hot encoding the 'healthy' column.
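As a quick hedged sanity check (reusing the question's X_train), you can confirm the column exists before the ColumnTransformer runs:
Xt = HealthyAttributeAdder().fit_transform(X_train)
print('healthy' in Xt.columns)  # True -- so OneHotEncoder can now find it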

Does sklearn pipeline() feed both X and y to the following steps?

So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None as an argument for the transform() method; however, since I need to change y (i.e., remove outliers from y), I need to be able to access it. Here's my custom transformer for outlier removal.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler

class OutlierExtractor1(BaseEstimator, TransformerMixin):
    def __init__(self):
        self.threshold = 2
        self.isInlier = None

    def transform(self, X, y):
        ind = [False if i == -1 else True for i in self.isInlier]
        return (X.loc[ind, :], y.loc[ind])

    def fit(self, X, y):
        X2 = np.asarray(X)
        y2 = np.asarray(y)
        scaler = StandardScaler()
        norm = scaler.fit_transform(X2)
        normalized_X = pd.DataFrame(norm, columns=X.columns)
        lcf = LocalOutlierFactor(metric='euclidean')
        self.isInlier = list(lcf.fit_predict(normalized_X))
        return self
And here is the pipeline where I use said transformer:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer

space = {'rf__max_depth': [9, 11, 12, 14],
         'rf__n_estimators': [80, 90, 100]}

pipe = Pipeline([('outliers', OutlierExtractor1()),
                 ('rf', RandomForestClassifier(criterion='entropy',
                                               min_samples_split=4,
                                               min_samples_leaf=2,
                                               min_impurity_decrease=0.01,
                                               random_state=0))])
ftwo_scorer = make_scorer(fbeta_score, beta=2)
search = GridSearchCV(pipe, param_grid=space, scoring=ftwo_scorer, cv=4, return_train_score=True, verbose=1)
search.fit(X=downsampled, y=target)
pd.DataFrame(search.cv_results_)
I get this error.
TypeError Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 search.fit(X = downsampled, y = target)
23 pd.DataFrame(search.cv_results_)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\validation.py in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
74
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\_search.py in fit(self, X, y, groups, **fit_params)
763 refit_start_time = time.time()
764 if y is not None:
--> 765 self.best_estimator_.fit(X, y, **fit_params)
766 else:
767 self.best_estimator_.fit(X, **fit_params)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\AppData\Roaming\Python\Python37\site-packages\joblib\memory.py in __call__(self, *args, **kwargs)
350
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
353
354 def call_and_shelve(self, *args, **kwargs):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\base.py in fit_transform(self, X, y, **fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return self.fit(X, y, **fit_params).transform(X)
694
695
TypeError: transform() missing 1 required positional argument: 'y'
The error goes away if I set y=None; however, y is not changed! It looks like the pipeline only feeds X to the preprocessing steps' transform. Can someone help, please?
EDIT
The Pipeline source code feeds X and y to the fit() method of each step, but it only feeds X to the transform() method, so y cannot be changed.
My solution was to do the outlier extraction outside of the pipeline, and consequently outside of cross-validation, which is a bummer.
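A minimal sketch of the constraint (Probe is a made-up name; the behavior is standard sklearn): each step's fit receives y, but its transform is called with X only, so there is no hook for dropping rows from y.
import numpy as np
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.linear_model import LogisticRegression

class Probe(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        print('fit received y:', y is not None)
        return self
    def transform(self, X):
        # Pipeline never passes y here, so y cannot be resampled
        return X

pipe = Pipeline([('probe', Probe()), ('clf', LogisticRegression())])
pipe.fit(np.random.rand(20, 2), np.random.randint(0, 2, 20))
# prints "fit received y: True"; transform ran with X only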
One thing about detecting outliers inside the train/test split: bear in mind you are working with a smaller subset, so it might be less accurate. If the purpose is simply exclusion, you can do that before passing the data to a pipeline.
If you do need to do this, then it makes more sense to do the outlier detection within fit. Below is a modification of the code following a comment by jnothman on GitHub:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier

class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
    def __init__(self, outlier_detector, classifier):
        self.outlier_detector = outlier_detector
        self.classifier = classifier

    def fit(self, X, y):
        self.outlier_detector_ = clone(self.outlier_detector)
        mask = self.outlier_detector_.fit_predict(X, y) == 1
        self.classifier_ = clone(self.classifier).fit(X[mask], y[mask])
        return self

    def predict(self, X):
        return self.classifier_.predict(X)
We can test this:
import numpy as np
np.random.seed(111)
x = np.random.normal(0, 1, (200, 3))
y = np.random.binomial(1, 0.5, 200)
We expect 4 outliers (fit_predict returns -1 for outliers, so we count the -1s):
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == -1).sum()
4
I set oob_score=True to show that the classifier is trained on the subset we expect:
rf = WithoutOutliersClassifier(LocalOutlierFactor(metric='euclidean'),
                               RandomForestClassifier(oob_score=True))
rf.fit(x, y)
rf.classifier_.oob_decision_function_.shape
(196, 2)
Now put this into a pipeline; note the change in the names of your params:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV

space = {'rf__classifier__max_depth': [3, 4],
         'rf__classifier__n_estimators': [50, 100]}

pipe = Pipeline([('scale', StandardScaler()),
                 ('rf', rf)])
search = GridSearchCV(pipe, param_grid=space)
search.fit(X=x, y=y)
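If you're ever unsure what the nested parameter names are, get_params() lists the valid keys (standard sklearn; keys join step and parameter names with '__'):
print([k for k in pipe.get_params() if k.endswith('max_depth')])
# expected: ['rf__classifier__max_depth']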

ValueError when using ColumnTransformer() in an Sklearn Pipeline - Using custom class of Spacy for GloveVectorizer

I've got a dataset with multiple text columns and a target column. I'm trying to use a custom class with spaCy to get GloVe embeddings for my text columns, and I'm trying to do it with a Pipeline. But I'm getting a ValueError. Following is my code:
data_features = df.copy()[["title", "description"]]
train_data, test_data, train_target, test_target = train_test_split(data_features, df['target'], test_size = 0.1)
I created this custom class to use glove embeddings. I got the code from this tutorial.
class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return [self.nlp(text).vector for text in X]
Loading the nlp model:
nlp = spacy.load("en_core_web_sm")
This is the column transformer that I'm trying to use in my pipeline:
col_preprocessor = ColumnTransformer(
    [
        ('title_glove', SpacyVectorTransformer(nlp), 'title'),
        ('description_glove', SpacyVectorTransformer(nlp), 'description'),
    ],
    remainder='drop',
    n_jobs=1
)
Here is my pipeline:
pipeline_glove = Pipeline([
    ('col_preprocessor', col_preprocessor),
    ('classifier', LogisticRegression())
])
When I run the fit method, I get the error that follows:
pipeline_glove.fit(train_data, train_target)
Error:
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-219-8543ea744205> in <module>
----> 1 pipeline_glove.fit(train_data, train_target)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in fit_transform(self, X, y)
549
550 self._update_fitted_transformers(transformers)
--> 551 self._validate_output(Xs)
552
553 return self._hstack(list(Xs))
/opt/conda/lib/python3.7/site-packages/sklearn/compose/_column_transformer.py in _validate_output(self, result)
410 raise ValueError(
411 "The output of the '{0}' transformer should be 2D (scipy "
--> 412 "matrix, array, or pandas DataFrame).".format(name))
413
414 def _validate_features(self, n_features, feature_names):
ValueError: The output of the 'title_glove' transformer should be 2D (scipy matrix, array, or pandas DataFrame).
The error message tells you what you need to fix:
ValueError: The output of the 'title_glove' transformer should be 2D
(scipy matrix, array, or pandas DataFrame).
But what you are returning with your current transformer (SpacyVectorTransformer) is a list. You can fix it by turning the list into a pandas DataFrame, for instance like this:
import pandas as pd

class SpacyVectorTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, nlp):
        self.nlp = nlp
        self.dim = 300

    def fit(self, X, y):
        return self

    def transform(self, X):
        return pd.DataFrame([self.nlp(text).vector for text in X])
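A quick hedged check of the fix (the vector width depends on the loaded model; note that en_core_web_sm does not ship true GloVe vectors, so a vectors-enabled model such as en_core_web_md is the usual assumption for GloVe):
texts = ["red shoes", "blue jacket"]
t = SpacyVectorTransformer(nlp)
out = t.fit(texts, None).transform(texts)
print(out.shape)  # (2, <vector_dim>) -- 2-D, which is what ColumnTransformer requires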
Next time, please also provide a minimal, reproducible example. In the code you provided, there are no imports, and there is no DataFrame called "df".

Unable to fit() a Scikit-Learn pipeline without being returned a ValueError

I need your help!
I've been getting the ValueError below when trying to fit my Pipeline.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference = self.col_1 - self.col_2
        return difference.values
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)
# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming
# and combining the business features, cycle_1 features as well as time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])
# Trying to fit my Pipeline throws the ValueError described above
survey_model.fit(data, cycle_2_score.astype(int))
Some additional context: I'm building this model to have its predict_proba method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer that selects columns from a DataFrame and returns
# them as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values
simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features),
])
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1 survey_model.fit(data, cycle_2_score.astype(int))
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/memory.py in __call__(self, *args, **kwargs)
353
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
356
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res = transformer.fit(X, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
919
920 if any(sparse.issparse(f) for f in Xs):
--> 921 Xs = sparse.hstack(Xs).tocsr()
922 else:
923 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
Further, the data and metadata can be obtained here:
%%bash
mkdir data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-train.csv -nc -P ./ml-data
wget http://dataincubator-wqu.s3.amazonaws.com/mldata/providers-metadata.csv -nc -P ./ml-data
Changing my TimedeltaTransformer seems to have helped: first by converting the timedeltas to a series of total seconds, and then reshaping the result with reshape(-1, 1) so it forms a single column with one row per sample rather than a 1-D array.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
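The reshape matters because scipy's sparse stacking treats a 1-D array as a single row, which is exactly where the shape[0] == 1 in the error came from. A small illustration:
import numpy as np
from scipy import sparse

arr = np.arange(5)                                  # shape (5,)
print(sparse.coo_matrix(arr).shape)                 # (1, 5) -- one row!
print(sparse.coo_matrix(arr.reshape(-1, 1)).shape)  # (5, 1) -- one row per sample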

Combine CountVectorizer and SelectKBest causes labels to disappear

I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it on to CountVectorizer and the KBest selector. If I remove KBest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    # returns a single column from a DF
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]
class LogisticRegressionWithWordFeatures(object):
    def __init__(self):
        self.model = LogisticRegression()

    def fit(self, df, labels):
        self.pipeline = self.get_preprocessing_pipeline(df)
        fitted_df = self.pipeline.fit_transform(df)
        self.model.fit(fitted_df, labels)
        return self

    def predict(self, df):
        fitted_df = self.pipeline.transform(df)
        y = self.model.predict(fitted_df)
        return y

    def get_preprocessing_pipeline(self, data_frame):
        """
        Get data frame containing features and labels from raw feature input DF.
        :param input_file: input DF
        """
        process_and_join_features = Pipeline([
            ('features', FeatureUnion([
                ('count_lemma_features', Pipeline([
                    ('selector', ItemSelector(key='clean_Invoice_Description')),
                    ('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
            ('reducer', SelectKBest(chi2, k=1000))
        ])
        return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9 self.model.fit(fitted_df, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making their way into the pipeline. I tried adding another ItemSelector for the training labels:
process_and_join_features = Pipeline([
    ('features', FeatureUnion([
        ('count_lemma_features', Pipeline([
            ('selector', ItemSelector(key='clean_Invoice_Description')),
            ('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
        ('labels', ItemSelector(key='Expense_Category'))])),
    ('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a KeyError for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with doing it step by step is that there are other features in other columns that I would like to use along with these, and a pipeline provides a nice way of combining them without having to do it manually.
Solved it. The main issue was the nesting of each feature. Pipeline() expects a list of tuples, where the first item in the tuple is the feature/pipe name and the second is the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
    """
    Get data frame containing features and labels from raw feature input csv file"""
    process_and_join_features = Pipeline([
        ('features',
         FeatureUnion([
             ('tokens',
              Pipeline([
                  ('selector', ItemSelector(key='clean_Invoice_Description')),
                  ('vec', CountVectorizer(analyzer="word", stop_words='english')),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ])),
             ('hypernyms',
              Pipeline([
                  ('selector', ItemSelector(key='hypernyms_combined')),
                  ('vec', TfidfVectorizer(analyzer="word")),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ]))]))])
    return process_and_join_features
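One caveat the answer leaves implicit (my assumption, not stated in the original post): the nested SelectKBest steps still need y at fit time, and Pipeline/FeatureUnion forward y to each step's fit, so the class's fit method presumably has to pass the labels through, along these lines:
def fit(self, df, labels):
    self.pipeline = self.get_preprocessing_pipeline(df)
    # y is forwarded to every step's fit, which the chi2-based
    # SelectKBest steps require
    fitted_df = self.pipeline.fit_transform(df, labels)
    self.model.fit(fitted_df, labels)
    return self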
