ColumnTransformer fit_transform not working with pipeline - python

I am writing a pipeline with custom transformer. When calling fit_transform of categorical pipeline I am getting the desired result but when calling fit_transform of ColumnTransformer, whatever I have initialised in init of custom transformer is getting lost.
Note: not including code of numericalTransformer for readability
class categoryTransformer(BaseEstimator, TransformerMixin):
def __init__(self, use_dates=['year', 'month', 'day']):
self._use_dates = use_dates
print('==========>',self._use_dates)
def fit(self, X, y=None):
return self
def get_year(self, obj):
return str(obj)[:4]
def get_month(self, obj):
return str(obj)[4:6]
def get_day(self, obj):
return str(obj)[6:8]
def create_boolean(self, obj):
if obj == '0':
return 'No'
else:
return 'Yes'
def transform(self, X, y=None):
print(self._use_dates)
for spec in self._use_dates:
print(spec)
exec("X.loc[:,'{}'] = X['date'].apply(self.get_{})".format(spec, spec))
X = X.drop('date', axis=1)
X.loc[:,'yr_renovated'] = X['yr_renovated'].apply(self.create_boolean)
X.loc[:, 'view'] = X['view'].apply(self.create_boolean)
return X.values
cat_pipe = Pipeline([
('cat_transform', categoryTransformer()),
('one_hot', OneHotEncoder(sparse=False))])
num_pipe = Pipeline([
('num_transform', numericalTransformer()),
('imputer', SimpleImputer(strategy = 'median')),
('std_scaler', StandardScaler())])
full_pipe = ColumnTransformer([
('num', num_pipe, numerical_features),
('cat', cat_pipe, categorical_features)])
cat_pipe.fit_transform(data[categorical_features])#working fine
df2 = full_pipe.fit_transform(X_train)# __init__ initialisation lost
"output"
==========> ['year', 'month', 'day']
['year', 'month', 'day']
year
month
day
==========> None
None
After that long traceback that I am not able to debug. Workaround is if I can create use_dates=['year', 'month', 'day'] in transform function itself but I want to understand why this is happening.

The parameters of __init__ need to have the same names as the attributes that get set (so use_dates and _use_dates is the problem).
This is required for cloning to work properly, and ColumnTransformer clones all its transformers before fitting.
https://scikit-learn.org/stable/developers/develop.html#instantiation

Related

Transform y_train in scikit learn pipeline

I have a test train split named X_train and y_train that I pass to a sci-kit learn pipeline. Is it possible to have a custom step to only transform y_train i.e remove nan and infs from y_train.
class columnDropperTransformer():
def __init__(self,columns):
self.columns=columns
def transform(self,X,y=None):
print('---- Dropping ID cols :', self.columns)
return X.drop(self.columns,axis=1)
def fit(self, X, y=None):
return self
print('---- Making pipeline')
drop_cols = Pipeline(steps=[
("columnDropper", columnDropperTransformer(id_cols))
])
feature_remover = Pipeline(steps=[
("columnDropper", missingRemover())
])
fill_na_zero_transformer = Pipeline(steps=[
('zero_imputer', SimpleImputer(strategy='constant', fill_value=0))
])
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy = "constant", fill_value=-1, add_indicator=True)),
('scaler', StandardScaler())
])
class SkipSimpleImputer(SimpleImputer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def transform(self, X, y=None):
if 'MARITAL_STATUS' in X.columns:
print('\t---- MARITAL STATUS found in skipsimpleimpute, all cols are: ', X.columns)
transformed_X = super().transform(X['MARITAL_STATUS'])
X['MARITAL_STATUS'] = transformed_X
return X
def fit(self, X, y=None):
return self
categorical_transformer = Pipeline(steps=[
('categorical_imputer', SkipSimpleImputer(strategy="constant", fill_value='Unknown')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocess_ppl = ColumnTransformer(
transformers=[
('encode', categorical_transformer, make_column_selector(dtype_include=object)),
('zero_impute', fill_na_zero_transformer, lambda X: [col for col in fill_zero_cols if col in X.columns] ),
('numeric', numeric_transformer, lambda X: [col for col in num_cols if col in X.columns])
]
)
pipeline2 = Pipeline(
steps=[
('dropper', drop_cols),
('remover',feature_remover),
("preprocessor", preprocess_ppl),
("estimator", customOLS(sm.OLS, LinearRegression()))
]
)
Could this be done via custom column transformer or via pipeline step transformer. In custom COlumntransformer we return X, how do we update y?

Getting feature names from a pipeline with tfidfvectorizer

I have been trying to get the feature names on my model for quite some time now but have a hard time understanding how to do it. I have tried many posts on here but can't get it to work. Here is my code:
loading the classes I need to combine tfidfvectorizer with other features
from sklearn.base import TransformerMixin, BaseEstimator
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class FeatureTypeSelector(TransformerMixin, BaseEstimator):
FEATURE_TYPES = {
'categorical': [
'COLUMN_A','COLUMN_B'
],
'continuous': [
'COLULMN_C','COLUMN_D'
]
}
def __init__(self, feature_type):
self.columns = self.FEATURE_TYPES[feature_type]
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
class RowToDictTransformer(TransformerMixin, BaseEstimator):
def fit(self, X, y=None):
return self
def transform(self, X):
return (row[1] for row in X.iterrows())
Then the code to put everything in a pipeline and run the regressor
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import MinMaxScaler
# Create the preprocessor
preprocessor = make_union(
make_pipeline(
ItemSelector(key='TEXT_COLUMN'),
TfidfVectorizer(lowercase=False, min_df=1),
),
make_pipeline(
FeatureTypeSelector('continuous'),
MinMaxScaler(),
),
make_pipeline(
FeatureTypeSelector('categorical'),
RowToDictTransformer(),
DictVectorizer(sparse=False), # set sparse=True if you get MemoryError
),
)
# fit and transform the data
preprocessor.fit_transform(x_train)
# choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()
# Create the model
model = make_pipeline(preprocessor, estimator)
# Training the model
model.fit(x_train, y_train)
# Predicting the model
predicted = model.predict(x_test)
I can run the model.coef_ to get all the coefficients but I want to see how each item of the TEXT_COLUMN is affected by which weight. I have tried calling get_feature_names() or tried passing them in the pipeline but with no succes (most of google's results are purple by now).
Anyone that can give me a bit of guidance how to pass the feature names to the end of the pipeline? The ideal result would be a dataframe with the feature (row from the TEXT_COLUMN) and feature_weight as value.

Combine CountVectorizer and SelectKBest causes labels to disappear

I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it onto CountVectorizer and Kbest selector. If i remove Kbest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
# returns a single column from a DF
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class LogisticRegressionWithWordFeatures(object):
def __init__(self):
self.model = LogisticRegression()
def fit(self, df, labels):
self.pipeline = self.get_preprocessing_pipeline(df)
fitted_df = self.pipeline.fit_transform(df)
self.model.fit(fitted_df, labels)
return self
def predict(self, df):
fitted_df = self.pipeline.transform(df)
y = self.model.predict(fitted_df)
return y
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input DF.
:param input_file: input DF
"""
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9 self.model.fit(fitted_df, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making its way into the pipeline. I tried adding another itemselector for the training labels:
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
('labels', ItemSelector(key='Expense_Category'))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a key error for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with doing the step-by-step way is that there are other features in other columns that I would like to use in along with these, and pipeline provides a nice way of doing so without having to manually combine them.
Solved it. The main issue being the nesting of each feature. Pipelines() expects a list of tuples, where the first item in the tuple is the feature/pipe name, and the second being the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input csv file"""
process_and_join_features = Pipeline([
('features',
FeatureUnion([
('tokens',
Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('vec', CountVectorizer(analyzer="word", stop_words='english')),
('dim_red', SelectKBest(chi2, k=5000))
])),
('hypernyms',
Pipeline([
('selector', ItemSelector(key='hypernyms_combined')),
('vec', TfidfVectorizer(analyzer="word")),
('dim_red', SelectKBest(chi2, k=5000))
]))]))])
return process_and_join_features

customed transformer in sklearn

I tried to make my pipe and add my customed transformer as follow:
class DataFrameSelector(BaseEstimator, TransformerMixin):
def __init__(self, attribute_names):
self.attribute_names = attribute_names
def fit(self, X, y=None):
return self
def transform(self, X):
return X[list(self.attribute_names)]
and
class DummyTransform(BaseEstimator, TransformerMixin):
def __init__(self):
return None
def transform(self, X):
return pd.get_dummies(X).values
def fit(self, X, y=None):
return self
but when i do:
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)
pipe= Pipeline(steps=[
('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
('Encoder', DummyTransform() )
('clf',RF)
])
rforest=pipe.fit(X_train,Y_train)
I have the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-168-108f5c7552a0> in <module>()
4 ('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
5 ('Encoder', DummyTransform() )
----> 6 ('clf',RF)
7 ])
8 rforest=pipe.fit(X_train,Y_train)
TypeError: 'tuple' object is not callable
Why ???
PS: strangely this one works:
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)
pipe= Pipeline(steps=[
('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
('Encoder', DummyTransform() )
#('clf',DecisionTreeClassifier())
])
X=pipe.fit_transform(X_train,Y_train)
RF.fit(X,Y_train)
Edit: RF stand for this line of code
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)
There's a missing a comma one line above your error, at the end, it worked when you commented it because the comma was missing on the last item

How use leave one out encoding in sklearn pipelines

I would like to test different encoding strategies as implemented in categorical encoding package using sklearn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', LeaveOneOutEncoder()),
])
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let show just part of the code as I do. I add XGBRegressor because I think you may predict housing price
class MultiColumn(BaseEstimator, TransformerMixin):
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]
class Imputation(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
return X.fillna(NUMERIC.median())
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
class Cat(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
enc = DictVectorizer(sparse = False)
encc = enc.fit(CATEGORICAL.T.to_dict().values())
enc_data = encc.transform(X.T.to_dict().values())
enc_data[np.isnan(enc_data)] = 1
return enc_data
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
And Pipeline
pipeline = Pipeline([
# Use FeatureUnion to combine the features
('union', FeatureUnion(
transformer_list=[
# numeric
('numeric', Pipeline([
('selector', MultiColumn(columns=['var1', 'var2'])),
('imp', Imputation()),
('scaling', preprocessing.StandardScaler(with_mean = 0.))
])),
# categorical
('categorical', Pipeline([
('selector', MultiColumn(columns=['var3', 'var4'])),
('one_hot', Cat()),
(CategoricalImputer())
])),
])),
('model_fitting', xgb.XGBRegressor()),
])
Your categorical encoder (LeaveOneOutEncoder) needs the target variable to adjust and replace the new labels (levels) for your variables defined in cat_attribs. So, you just need to invoke fit_transform method joined with y_train:
housing_prepared = full_pipeline.fit_transform(housing, y_train)

Categories

Resources