Combine CountVectorizer and SelectKBest causes labels to disappear - python

I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it onto CountVectorizer and Kbest selector. If i remove Kbest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
# returns a single column from a DF
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class LogisticRegressionWithWordFeatures(object):
def __init__(self):
self.model = LogisticRegression()
def fit(self, df, labels):
self.pipeline = self.get_preprocessing_pipeline(df)
fitted_df = self.pipeline.fit_transform(df), labels)
return self
def predict(self, df):
fitted_df = self.pipeline.transform(df)
y = self.model.predict(fitted_df)
return y
def get_preprocessing_pipeline(self, data_frame):
Get data frame containing features and labels from raw feature input DF.
:param input_file: input DF
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
('reducer', SelectKBest(chi2, k=1000))
return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures(), train_labels)
test_y = model.predict(test_data)
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean =, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/ in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/ in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making its way into the pipeline. I tried adding another itemselector for the training labels:
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
('labels', ItemSelector(key='Expense_Category'))])),
('reducer', SelectKBest(chi2, k=1000))
return process_and_join_features
But this causes a key error for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
>>> (6130, 1000)
The problem with doing the step-by-step way is that there are other features in other columns that I would like to use in along with these, and pipeline provides a nice way of doing so without having to manually combine them.

Solved it. The main issue being the nesting of each feature. Pipelines() expects a list of tuples, where the first item in the tuple is the feature/pipe name, and the second being the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
Get data frame containing features and labels from raw feature input csv file"""
process_and_join_features = Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('vec', CountVectorizer(analyzer="word", stop_words='english')),
('dim_red', SelectKBest(chi2, k=5000))
('selector', ItemSelector(key='hypernyms_combined')),
('vec', TfidfVectorizer(analyzer="word")),
('dim_red', SelectKBest(chi2, k=5000))
return process_and_join_features


Does sklearn pipeline() feed both X and y to the following steps?

So I'm trying to do outlier removal and supervised feature selection in the pipeline before classifier training. For this I had to create custom transformers to feed into the pipeline. All the examples I found had y=None as an argument for the transform() method, however, since I need to change y (i.e. remove outliers from y), I need to be able to access it. Here's my custom transformer for outlier removal.
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.neighbors import LocalOutlierFactor
from sklearn.preprocessing import StandardScaler
class OutlierExtractor1(BaseEstimator, TransformerMixin):
def __init__(self):
self.threshold = 2
self.isInlier = None
def transform(self, X, y):
ind = [False if i == -1 else True for i in self.isInlier]
return (X.loc[ind,:], y.loc[ind])
def fit(self, X, y):
X2 = np.asarray(X)
y2 = np.asarray(y)
scaler = StandardScaler()
norm = scaler.fit_transform(X2)
normalized_X = pd.DataFrame(norm, columns=X.columns)
lcf = LocalOutlierFactor(metric = 'euclidean')
self.isInlier = list(lcf.fit_predict(normalized_X))
return self
And here is the pipeline where I use said transformer:
from sklearn.pipeline import Pipeline
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import GridSearchCV
from sklearn.metrics import fbeta_score, make_scorer
space = {'rf__max_depth': [9, 11, 12, 14],
'rf__n_estimators': [80, 90, 100]}
pipe = Pipeline([('outliers', OutlierExtractor1()),
('rf', RandomForestClassifier(criterion = 'entropy',
min_samples_split = 4,
min_samples_leaf = 2,
min_impurity_decrease = 0.01,
ftwo_scorer = make_scorer(fbeta_score, beta=2)
ftwo_scorer = make_scorer(fbeta_score, beta=2)
search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1) = downsampled, y = target)
I get this error.
TypeError Traceback (most recent call last)
<ipython-input-34-d10a6e74d8e8> in <module>
20 ftwo_scorer = make_scorer(fbeta_score, beta=2)
21 search = GridSearchCV(pipe, param_grid = space, scoring = ftwo_scorer, cv = 4, return_train_score = True, verbose = 1)
---> 22 = downsampled, y = target)
23 pd.DataFrame(search.cv_results_)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\utils\ in inner_f(*args, **kwargs)
70 FutureWarning)
71 kwargs.update({k: arg for k, arg in zip(sig.parameters, args)})
---> 72 return f(**kwargs)
73 return inner_f
~\AppData\Roaming\Python\Python37\site-packages\sklearn\model_selection\ in fit(self, X, y, groups, **fit_params)
763 refit_start_time = time.time()
764 if y is not None:
--> 765, y, **fit_params)
766 else:
767, **fit_params)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\ in fit(self, X, y, **fit_params)
328 """
329 fit_params_steps = self._check_fit_params(**fit_params)
--> 330 Xt = self._fit(X, y, **fit_params_steps)
331 with _print_elapsed_time('Pipeline',
332 self._log_message(len(self.steps) - 1)):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\ in _fit(self, X, y, **fit_params_steps)
294 message_clsname='Pipeline',
295 message=self._log_message(step_idx),
--> 296 **fit_params_steps[name])
297 # Replace the transformer of the step with the fitted
298 # transformer. This is necessary when loading the transformer
~\AppData\Roaming\Python\Python37\site-packages\joblib\ in __call__(self, *args, **kwargs)
351 def __call__(self, *args, **kwargs):
--> 352 return self.func(*args, **kwargs)
354 def call_and_shelve(self, *args, **kwargs):
~\AppData\Roaming\Python\Python37\site-packages\sklearn\ in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
738 with _print_elapsed_time(message_clsname, message):
739 if hasattr(transformer, 'fit_transform'):
--> 740 res = transformer.fit_transform(X, y, **fit_params)
741 else:
742 res =, y, **fit_params).transform(X)
~\AppData\Roaming\Python\Python37\site-packages\sklearn\ in fit_transform(self, X, y, **fit_params)
691 else:
692 # fit method of arity 2 (supervised transformation)
--> 693 return, y, **fit_params).transform(X)
TypeError: transform() missing 1 required positional argument: 'y'
The error goes a way if I set y=None, however, y is not changed! It looks like the pipeline function only feeds X to the pre processing steps. Can someone help please?
The pipeline() function source code feeds X and y to the fit() method of each step, however, it only feeds X to the transform() method, so y cannot be changed.
My solution was to do the outlier extraction outside of the pipeline and consequentely outside of cross validation, which is a bummer.
One thing about detecting outlier inside the train / test, bear in mind you are working with a smaller subset, so it might be less accurate. If the purpose is to simply exclusion, you can do that before passing it to a pipeline.
If you do need to do this, then it makes more sense to do the outlier detection within the fit. Below is modification of the code following a comment by jnothman in github:
from sklearn.base import BaseEstimator, ClassifierMixin, clone
from sklearn.neighbors import LocalOutlierFactor
from sklearn.ensemble import RandomForestClassifier
class WithoutOutliersClassifier(BaseEstimator, ClassifierMixin):
def __init__(self, outlier_detector, classifier):
self.outlier_detector = outlier_detector
self.classifier = classifier
def fit(self, X, y):
self.outlier_detector_ = clone(self.outlier_detector)
mask = self.outlier_detector_.fit_predict(X, y) == 1
self.classifier_ = clone(self.classifier).fit(X[mask], y[mask])
return self
def predict(self, X):
return self.classifier_.predict(X)
We can test this
import numpy as np
x = np.random.normal(0,1,(200,3))
y = np.random.binomial(1,0.5,200)
We expect 4 outliers:
(LocalOutlierFactor(metric='euclidean').fit_predict(x) == 1).sum()
I set oob_score = True to show that the classifier is trained on the subset we expect:
rf = WithoutOutliersClassifier(LocalOutlierFactor(metric='euclidean'),
(196, 2)
Now put this into a pipeline, note the change in names of your param:
from sklearn.preprocessing import StandardScaler
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
space = {'rf__classifier__max_depth': [3,4],
'rf__classifier__n_estimators' : [50,100]}
pipe = Pipeline([('scale', StandardScaler()),
('rf', rf)])
search = GridSearchCV(pipe, param_grid = space) = x, y = y)

Unable to fit() a Scikit-Learn pipeline without being returned a ValueError

I need your help!
I've been getting a ValueError below when trying to fit my Pipeline.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
def __init__(self, t1_col, t2_col):
self.t1_col = t1_col
self.t2_col = t2_col
def fit(self, X, y=None):
self.col_1 = X[self.t1_col].apply(pd.to_datetime)
self.col_2 = X[self.t2_col].apply(pd.to_datetime)
return self
def transform(self, X):
difference = self.col_1 - self.col_2
return difference.values
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)
# Using a custom column selecter transformer to extract cycle_1_features
cycle_1_features = Pipeline([
('cst2', ColumnSelectTransformer(cycle_1_cols)),
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming
# and combining the business features, cycle_1 features as well as time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
('features', FeatureUnion([
('business', business_features),
('survey', cycle_1_features),
('time', time_feature),
('forest', RandomForestRegressor()),
# Trying to fit my Pipeline throws the ValueError described above, cycle_2_score.astype(int))
Some additional context: I'm building this model to have its predict_proba method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means that the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and returns the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
return X[self.columns].values
simple_features = Pipeline([
('cst', ColumnSelectTransformer(simple_cols)),
('imputer', SimpleImputer(strategy='mean')),
owner_onehot = Pipeline([
('cst', ColumnSelectTransformer(['OWNERSHIP'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
cert_onehot = Pipeline([
('cst', ColumnSelectTransformer(['CERTIFICATION'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
categorical_features = FeatureUnion([
('owner_onehot', owner_onehot),
('cert_onehot', cert_onehot),
business_features = FeatureUnion([
('simple', simple_features),
('categorical', categorical_features)
Finally, below is the full error raised
ValueError Traceback (most recent call last)
<ipython-input-218-046724d81b69> in <module>()
----> 1, cycle_2_score.astype(int))
/opt/conda/lib/python3.7/site-packages/sklearn/ in fit(self, X, y, **fit_params)
350 This estimator
351 """
--> 352 Xt, fit_params = self._fit(X, y, **fit_params)
353 with _print_elapsed_time('Pipeline',
354 self._log_message(len(self.steps) - 1)):
/opt/conda/lib/python3.7/site-packages/sklearn/ in _fit(self, X, y, **fit_params)
315 message_clsname='Pipeline',
316 message=self._log_message(step_idx),
--> 317 **fit_params_steps[name])
318 # Replace the transformer of the step with the fitted
319 # transformer. This is necessary when loading the transformer
/opt/conda/lib/python3.7/site-packages/joblib/ in __call__(self, *args, **kwargs)
354 def __call__(self, *args, **kwargs):
--> 355 return self.func(*args, **kwargs)
357 def call_and_shelve(self, *args, **kwargs):
/opt/conda/lib/python3.7/site-packages/sklearn/ in _fit_transform_one(transformer, X, y, weight, message_clsname, message, **fit_params)
714 with _print_elapsed_time(message_clsname, message):
715 if hasattr(transformer, 'fit_transform'):
--> 716 res = transformer.fit_transform(X, y, **fit_params)
717 else:
718 res =, y, **fit_params).transform(X)
/opt/conda/lib/python3.7/site-packages/sklearn/ in fit_transform(self, X, y, **fit_params)
920 if any(sparse.issparse(f) for f in Xs):
--> 921 Xs = sparse.hstack(Xs).tocsr()
922 else:
923 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/ in hstack(blocks, format, dtype)
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/ in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 1, expected 13892.
Further, the data and metadata can be gotten here
mkdir data
wget -nc -P ./ml-data
wget -nc -P ./ml-data
Changing my TimeDeltaConverter seems to have helped.
Firstly by changing it to be a series of ints and then reshaping it to be reshape(-1,1).
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
def __init__(self, t1_col, t2_col):
self.t1_col = t1_col
self.t2_col = t2_col
def fit(self, X, y=None):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
self.col_1 = X[self.t1_col].apply(pd.to_datetime)
self.col_2 = X[self.t2_col].apply(pd.to_datetime)
return self
def transform(self, X):
difference_list = []
difference = self.col_1 - self.col_2
for obj in difference:
return np.array(difference_list).reshape(-1,1)

Raising ValueError (extra rows) when submitting pipeline.predict into grader

I've been getting a ValueError below when trying to submit my Pipeline into a grader. And I'm not sure where I'm supposed to shave off 12500 rows of data.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
def __init__(self, t1_col, t2_col):
self.t1_col = t1_col
self.t2_col = t2_col
def fit(self, X, y=None):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
self.col_1 = X[self.t1_col].apply(pd.to_datetime)
self.col_2 = X[self.t2_col].apply(pd.to_datetime)
return self
def transform(self, X):
difference_list = []
difference = self.col_1 - self.col_2
for obj in difference:
return np.array(difference_list).reshape(-1,1)
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)
# Using a custom column selecter transformer to extract cycle_1_features
cycle_1_features = Pipeline([
('cst2', ColumnSelectTransformer(cycle_1_cols)),
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process, first a feature union transforming
# and combining the business features, cycle_1 features as well as time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
('features', FeatureUnion([
('business', business_features),
('survey', cycle_1_features),
('time', time_feature),
('forest', RandomForestRegressor()),
# Fitting my pipeline produces no error, cycle_2_score.astype(int))
# Calling the predict function and passing it into the grader raises a ValueError
The fitted pipeline looks like this
transformer_weights=None, verbose=False)),
RandomForestRegressor(bootstrap=True, criterion='mse',
max_depth=None, max_features='auto',
min_samples_leaf=1, min_samples_split=2,
n_estimators=10, n_jobs=None,
oob_score=False, random_state=None,
verbose=0, warm_start=False))],
Some additional context: I'm building this model to have its predict method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means that the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and returns the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
def __init__(self, columns):
self.columns = columns
def fit(self, X, y=None):
return self
def transform(self, X):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
return X[self.columns].values
simple_features = Pipeline([
('cst', ColumnSelectTransformer(simple_cols)),
('imputer', SimpleImputer(strategy='mean')),
owner_onehot = Pipeline([
('cst', ColumnSelectTransformer(['OWNERSHIP'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
cert_onehot = Pipeline([
('cst', ColumnSelectTransformer(['CERTIFICATION'])),
('imputer', SimpleImputer(strategy='most_frequent')),
('encoder', OneHotEncoder()),
categorical_features = FeatureUnion([
('owner_onehot', owner_onehot),
('cert_onehot', cert_onehot),
business_features = FeatureUnion([
('simple', simple_features),
('categorical', categorical_features)
Finally, below is the full error raised
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/ in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
/opt/conda/lib/python3.7/site-packages/static_grader/ in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/ in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/ in <lambda>(*args, **kwargs)
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/ in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
/opt/conda/lib/python3.7/site-packages/sklearn/ in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/ in hstack(blocks, format, dtype)
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/ in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
Fixing my TimeDeltaTransformer helped.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
def __init__(self, t1_col, t2_col):
self.t1_col = t1_col
self.t2_col = t2_col
def fit(self, X, y=None):
return self
def transform(self, X):
if not isinstance(X, pd.DataFrame):
X = pd.DataFrame(X)
timedelta_series = (pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col]))
array_list = []
for x in timedelta_series:
return np.array(array_list).reshape(-1,1)

How to fix "TypeError: fit_transform() takes 2 positional arguments but 3 were given"

I have tried to write complicated pipeline with own classes and I got this error:
TypeError: fit_transform() takes 2 positional arguments but 3 were given
I tried to apply solutions from similar issue by using custom LabelBinarizer, but the error did not fixed.
class NewLabelBinarizer(LabelBinarizer):
def fit(self, X, y=None):
return super(NewLabelBinarizer, self).fit(X)
def transform(self, X, y=None):
return super(NewLabelBinarizer, self).transform(X)
def fit_transform(self, X, y=None):
return super(NewLabelBinarizer, self).fit(X).transform(X)
class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
def __init__(self, *args, **kwargs):
def fit(self, y,X=None):
super(LabelPreprocessing, self).fit(y)
return self
def transform(self, y, X=None):
y = y.str.findall(r'([a-zA-Z]{3,})')
y = y.replace(regex=r'(film)', value=' ')
return y.values
class PlotPreprocessing(BaseEstimator, TransformerMixin):
def __init__(self, *args, **kwargs):
self.REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|#,;]')
self.BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X = X.str.lower()
X = x: re.sub(self.REPLACE_BY_SPACE_RE," ",x))
X = x: re.sub(r'\s+'," ",x))
return X.values
pipeline = Pipeline([
'text_preparation', FeatureUnion([
('label', Pipeline([
('labelPreprocessing', LabelPreprocessing()),
('mlb', MultiLabelBinarizer())
('plot', PlotPreprocessing()
('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1,2),max_df=0.9,min_df=5,token_pattern=r'(\S+)')
('model', LinearRegression())
train_X, train_y, test_X, test_y = train_test_split(plot, label), train_y)
y_pred = pipeline.predict( test_X )
And I get
TypeError Traceback (most recent call last)
<ipython-input-27-a8a60de025fd> in <module>()
17 train_X, train_y, test_X, test_y = train_test_split(plot, label)
---> 19, train_y)
20 y_pred = pipeline.predict( test_X )
14 frames
/usr/local/lib/python3.6/dist-packages/sklearn/ in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Also I tried to add **fit_params to fit/predict params.
I think that the problem is with MultiLabelBinarizer signature. Try out replacing it with:
class MLBinarizer(MultiLabelBinarizer):
def fit_transform(self, X, y=None):
return super(MultiLabelBinarizer, self).fit_transform(X)
note that methods fit, fit_transform take X as the first argument. So I recommend rewriting your LabelPreprocessing like this:
class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
def fit(self, X, y=None):
super(LabelPreprocessing, self).fit(X)
return self
def transform(self, y, X=None):
y = y.str.findall(r'([a-zA-Z]{3,})')
y = y.replace(regex=r'(film)', value=' ')
return y.values

How use leave one out encoding in sklearn pipelines

I would like to test different encoding strategies as implemented in categorical encoding package using sklearn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('std_scaler', StandardScaler()),
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', LeaveOneOutEncoder()),
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
housing_prepared = full_pipeline.fit_transform(housing)
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let show just part of the code as I do. I add XGBRegressor because I think you may predict housing price
class MultiColumn(BaseEstimator, TransformerMixin):
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]
class Imputation(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
return X.fillna(NUMERIC.median())
def fit_transform(self, X, y=None, **fit_params):, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
class Cat(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
enc = DictVectorizer(sparse = False)
encc =
enc_data = encc.transform(X.T.to_dict().values())
enc_data[np.isnan(enc_data)] = 1
return enc_data
def fit_transform(self, X, y=None, **fit_params):, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
And Pipeline
pipeline = Pipeline([
# Use FeatureUnion to combine the features
('union', FeatureUnion(
# numeric
('numeric', Pipeline([
('selector', MultiColumn(columns=['var1', 'var2'])),
('imp', Imputation()),
('scaling', preprocessing.StandardScaler(with_mean = 0.))
# categorical
('categorical', Pipeline([
('selector', MultiColumn(columns=['var3', 'var4'])),
('one_hot', Cat()),
('model_fitting', xgb.XGBRegressor()),
Your categorical encoder (LeaveOneOutEncoder) needs the target variable to adjust and replace the new labels (levels) for your variables defined in cat_attribs. So, you just need to invoke fit_transform method joined with y_train:
housing_prepared = full_pipeline.fit_transform(housing, y_train)

