I have a test train split named X_train and y_train that I pass to a sci-kit learn pipeline. Is it possible to have a custom step to only transform y_train i.e remove nan and infs from y_train.
class columnDropperTransformer():
def __init__(self,columns):
self.columns=columns
def transform(self,X,y=None):
print('---- Dropping ID cols :', self.columns)
return X.drop(self.columns,axis=1)
def fit(self, X, y=None):
return self
print('---- Making pipeline')
drop_cols = Pipeline(steps=[
("columnDropper", columnDropperTransformer(id_cols))
])
feature_remover = Pipeline(steps=[
("columnDropper", missingRemover())
])
fill_na_zero_transformer = Pipeline(steps=[
('zero_imputer', SimpleImputer(strategy='constant', fill_value=0))
])
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy = "constant", fill_value=-1, add_indicator=True)),
('scaler', StandardScaler())
])
class SkipSimpleImputer(SimpleImputer):
def __init__(self, **kwargs):
super().__init__(**kwargs)
def transform(self, X, y=None):
if 'MARITAL_STATUS' in X.columns:
print('\t---- MARITAL STATUS found in skipsimpleimpute, all cols are: ', X.columns)
transformed_X = super().transform(X['MARITAL_STATUS'])
X['MARITAL_STATUS'] = transformed_X
return X
def fit(self, X, y=None):
return self
categorical_transformer = Pipeline(steps=[
('categorical_imputer', SkipSimpleImputer(strategy="constant", fill_value='Unknown')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocess_ppl = ColumnTransformer(
transformers=[
('encode', categorical_transformer, make_column_selector(dtype_include=object)),
('zero_impute', fill_na_zero_transformer, lambda X: [col for col in fill_zero_cols if col in X.columns] ),
('numeric', numeric_transformer, lambda X: [col for col in num_cols if col in X.columns])
]
)
pipeline2 = Pipeline(
steps=[
('dropper', drop_cols),
('remover',feature_remover),
("preprocessor", preprocess_ppl),
("estimator", customOLS(sm.OLS, LinearRegression()))
]
)
Could this be done via custom column transformer or via pipeline step transformer. In custom COlumntransformer we return X, how do we update y?
Related
I am writing a pipeline with custom transformer. When calling fit_transform of categorical pipeline I am getting the desired result but when calling fit_transform of ColumnTransformer, whatever I have initialised in init of custom transformer is getting lost.
Note: not including code of numericalTransformer for readability
class categoryTransformer(BaseEstimator, TransformerMixin):
def __init__(self, use_dates=['year', 'month', 'day']):
self._use_dates = use_dates
print('==========>',self._use_dates)
def fit(self, X, y=None):
return self
def get_year(self, obj):
return str(obj)[:4]
def get_month(self, obj):
return str(obj)[4:6]
def get_day(self, obj):
return str(obj)[6:8]
def create_boolean(self, obj):
if obj == '0':
return 'No'
else:
return 'Yes'
def transform(self, X, y=None):
print(self._use_dates)
for spec in self._use_dates:
print(spec)
exec("X.loc[:,'{}'] = X['date'].apply(self.get_{})".format(spec, spec))
X = X.drop('date', axis=1)
X.loc[:,'yr_renovated'] = X['yr_renovated'].apply(self.create_boolean)
X.loc[:, 'view'] = X['view'].apply(self.create_boolean)
return X.values
cat_pipe = Pipeline([
('cat_transform', categoryTransformer()),
('one_hot', OneHotEncoder(sparse=False))])
num_pipe = Pipeline([
('num_transform', numericalTransformer()),
('imputer', SimpleImputer(strategy = 'median')),
('std_scaler', StandardScaler())])
full_pipe = ColumnTransformer([
('num', num_pipe, numerical_features),
('cat', cat_pipe, categorical_features)])
cat_pipe.fit_transform(data[categorical_features])#working fine
df2 = full_pipe.fit_transform(X_train)# __init__ initialisation lost
"output"
==========> ['year', 'month', 'day']
['year', 'month', 'day']
year
month
day
==========> None
None
After that long traceback that I am not able to debug. Workaround is if I can create use_dates=['year', 'month', 'day'] in transform function itself but I want to understand why this is happening.
The parameters of __init__ need to have the same names as the attributes that get set (so use_dates and _use_dates is the problem).
This is required for cloning to work properly, and ColumnTransformer clones all its transformers before fitting.
https://scikit-learn.org/stable/developers/develop.html#instantiation
I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it onto CountVectorizer and Kbest selector. If i remove Kbest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
# returns a single column from a DF
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class LogisticRegressionWithWordFeatures(object):
def __init__(self):
self.model = LogisticRegression()
def fit(self, df, labels):
self.pipeline = self.get_preprocessing_pipeline(df)
fitted_df = self.pipeline.fit_transform(df)
self.model.fit(fitted_df, labels)
return self
def predict(self, df):
fitted_df = self.pipeline.transform(df)
y = self.model.predict(fitted_df)
return y
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input DF.
:param input_file: input DF
"""
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9 self.model.fit(fitted_df, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making its way into the pipeline. I tried adding another itemselector for the training labels:
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
('labels', ItemSelector(key='Expense_Category'))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a key error for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with doing the step-by-step way is that there are other features in other columns that I would like to use in along with these, and pipeline provides a nice way of doing so without having to manually combine them.
Solved it. The main issue being the nesting of each feature. Pipelines() expects a list of tuples, where the first item in the tuple is the feature/pipe name, and the second being the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input csv file"""
process_and_join_features = Pipeline([
('features',
FeatureUnion([
('tokens',
Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('vec', CountVectorizer(analyzer="word", stop_words='english')),
('dim_red', SelectKBest(chi2, k=5000))
])),
('hypernyms',
Pipeline([
('selector', ItemSelector(key='hypernyms_combined')),
('vec', TfidfVectorizer(analyzer="word")),
('dim_red', SelectKBest(chi2, k=5000))
]))]))])
return process_and_join_features
I'm writing a custom transformer for a scikit-learn Pipeline. The transformer seems to work on it's own, and the fit() and transform() methods work individually, but when I include it in a pipeline, it raises an error stating:
AttributeError: 'NoneType' object has no attribute 'transform'
For reference, here is the code for my custom transformer:
class feature_union(TransformerMixin, BaseEstimator):
def __init__(self):
self.Xt = None
self.PI2_categories = ['D3', 'D4', 'A6', 'A5', 'D1', 'D2', 'A8', 'B2', 'E1',
'A1', 'A2', 'C1', 'C4', 'A7', 'C2', 'C3', 'A4', 'A3', 'B1']
def fit(self, X, y=None):
product_columns = ['Product_Info_1', 'Product_Info_3', 'Product_Info_5', 'Product_Info_6', 'Product_Info_7'] + self.PI2_categories
product_idx = [col for col in range(X.shape[1]) if X.columns[col] in product_columns]
personal_columns = ['Ins_Age', 'Ht', 'Wt', 'BMI']
personal_idx = [col for col in range(X.shape[1]) if X.columns[col] in personal_columns]
medical_hist_columns = ["Medical_History_{}".format(x) for x in range(1, 42, 1)]
medical_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_hist_columns]
family_hist_columns = ["Family_Hist_{}".format(x) for x in range(1, 6, 1)]
family_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in family_hist_columns]
insured_info_columns = ["InsuredInfo_{}".format(x) for x in range(1, 8, 1)]
insured_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in insured_info_columns]
insurance_hist_columns = ["Insurance_History_{}".format(x) for x in range(1, 10, 1)]
insurance_hist_idx = [col for col in range(X.shape[1]) if X.columns[col] in insurance_hist_columns]
employment_info_columns = ["Employment_Info_{}".format(x) for x in range(1, 7, 1)]
employment_info_idx = [col for col in range(X.shape[1]) if X.columns[col] in employment_info_columns]
medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]
medical_keyword_columns = ["Medical_Keyword_{}".format(x) for x in range(1, 49, 1)]
medical_keyword_idx = [col for col in range(X.shape[1]) if X.columns[col] in medical_keyword_columns]
get_original_features = lambda X: X
get_product_columns = lambda X: X[:, product_idx]
get_personal_columns = lambda X: X[:, personal_idx]
get_medical_hist_columns = lambda X: X[:, medical_hist_idx]
get_family_hist_columns = lambda X: X[:, family_hist_idx]
get_insured_info_columns = lambda X: X[:, insured_info_idx]
get_insurance_hist_columns = lambda X: X[:, insurance_hist_idx]
get_employment_info_columns = lambda X: X[:, employment_info_idx]
get_medical_keyword_columns = lambda X: X[:, medical_keyword_idx]
get_medical_and_family = lambda X: X[:, medical_keyword_idx + medical_hist_idx + family_hist_idx]
union = FeatureUnion([
("original_features", FunctionTransformer(get_original_features)),
("product_interaction", Pipeline([('select_product', FunctionTransformer(get_product_columns)),
('product_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("personal_interaction", Pipeline([('select_personal', FunctionTransformer(get_personal_columns)),
('personal_interaction', PolynomialFeatures(4, include_bias=False, interaction_only=True))
])),
("medical_hist_interaction", Pipeline([('select_medical', FunctionTransformer(get_medical_hist_columns)),
('medical_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("family_hist_interaction", Pipeline([('select_family_hist', FunctionTransformer(get_family_hist_columns)),
('family_hist_interaction', PolynomialFeatures(5, include_bias=False, interaction_only=True))
])),
("insured_info_interaction", Pipeline([('select_insured_info', FunctionTransformer(get_insured_info_columns)),
('insured_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("insurance_hist_interaction", Pipeline([('select_insurance_hist', FunctionTransformer(get_insurance_hist_columns)),
('insurance_hist_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("employment_info_interaction", Pipeline([('select_employment_info', FunctionTransformer(get_employment_info_columns)),
('employment_info_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
("medical_keyword_interaction", Pipeline([('select_medical_keyword', FunctionTransformer(get_medical_keyword_columns)),
('medical_keyword_interaction', PolynomialFeatures(2, include_bias=False, interaction_only=True))
])),
])
Xt = union.fit_transform(X)
return self.Xt
def transform(self, X, y=None):
Xt = self.Xt
return Xt
And when I use it in a pipeline like this:
pipeline_feat_union = Pipeline([('preprocess', preprocess()),
('feat_union', feature_union()),
('classifier', GaussianNB())])
It raises the following error:
AttributeError: 'NoneType' object has no attribute 'transform'
When writing custom transformer for a sklearn pipeline, your fit() method needs to return self or something with a similar interface, like so:
class Intercept(BaseEstimator, TransformerMixin):
def __init__(self):
# maybe do some initialization here, if your transformer needs it
def fit(self, X,y=None):
# Do something here to "fit" your transformer
return self # Always return self or something with a similar interface.
def transform(self, X,y=None):
# apply your transformation here
return some_awesome_transformation(X)
and for reference, this is most likely the line that is throwing the exception (which is helpful because you can see why you need to return self in the fit() method)
I ran into the same problem. The GuassianNB() class doesn't have a transform method defined.
But you don't need to use the transform method at all if you are including your classifier in the pipeline. The only two methods that you need are the fit method and the predict method.
pipeline_feat_union.fit(X_train, y_train)
pipeline_feat_union.predict(X_train)
I would like to test different encoding strategies as implemented in categorical encoding package using sklearn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', LeaveOneOutEncoder()),
])
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let show just part of the code as I do. I add XGBRegressor because I think you may predict housing price
class MultiColumn(BaseEstimator, TransformerMixin):
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]
class Imputation(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
return X.fillna(NUMERIC.median())
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
class Cat(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
enc = DictVectorizer(sparse = False)
encc = enc.fit(CATEGORICAL.T.to_dict().values())
enc_data = encc.transform(X.T.to_dict().values())
enc_data[np.isnan(enc_data)] = 1
return enc_data
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
And Pipeline
pipeline = Pipeline([
# Use FeatureUnion to combine the features
('union', FeatureUnion(
transformer_list=[
# numeric
('numeric', Pipeline([
('selector', MultiColumn(columns=['var1', 'var2'])),
('imp', Imputation()),
('scaling', preprocessing.StandardScaler(with_mean = 0.))
])),
# categorical
('categorical', Pipeline([
('selector', MultiColumn(columns=['var3', 'var4'])),
('one_hot', Cat()),
(CategoricalImputer())
])),
])),
('model_fitting', xgb.XGBRegressor()),
])
Your categorical encoder (LeaveOneOutEncoder) needs the target variable to adjust and replace the new labels (levels) for your variables defined in cat_attribs. So, you just need to invoke fit_transform method joined with y_train:
housing_prepared = full_pipeline.fit_transform(housing, y_train)
I defined a class for feature union. The python 2.7 complain "AttributeError: 'module' object has no attribute "TextTransformer". The code can be runned on Kaggle's platform but cannot run on my local ipython.
from sklearn.base import BaseEstimator, TransformerMixin
class TextTransformer(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key].apply(str)
rfr = RandomForestRegressor()
tfidf = TfidfVectorizer()
tsvd = TruncatedSVD(n_components=10)
clf = pipeline.Pipeline([
('union', FeatureUnion(
transformer_list = [
('txt1', pipeline.Pipeline([('s1', TextTransformer(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
('txt2', pipeline.Pipeline([('s2', TextTransformer(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
('txt3', pipeline.Pipeline([('s3', TextTransformer(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
('txt4', pipeline.Pipeline([('s4', TextTransformer(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
],
transformer_weights = {
'txt1': 0.5,
'txt2': 0.25,
'txt3': 0.25,
'txt4': 0.5
},
n_jobs = -1
)),
('rfr', rfr)])
param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = grid_search.GridSearchCV(estimator = clf, param_grid = param_grid,n_jobs = -1, cv = 10)
model.fit(X_train, y_train)
You probably forgot some import. Try this, it is working for me.
from sklearn.base import TransformerMixin
from sklearn.ensemble import RandomForestRegressor
from sklearn.feature_extraction import *
from sklearn.feature_extraction.text import *
from sklearn.decomposition import *
from sklearn.pipeline import *
from sklearn.grid_search import *
class TextTransformer(TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key].apply(str)
rfr = RandomForestRegressor()
tfidf = TfidfVectorizer()
tsvd = TruncatedSVD(n_components=10)
clf = Pipeline([
('union', FeatureUnion(
transformer_list = [
('txt1', Pipeline([('s1', TextTransformer(key='search_term')), ('tfidf1', tfidf), ('tsvd1', tsvd)])),
('txt2', Pipeline([('s2', TextTransformer(key='product_title')), ('tfidf2', tfidf), ('tsvd2', tsvd)])),
('txt3', Pipeline([('s3', TextTransformer(key='product_description')), ('tfidf3', tfidf), ('tsvd3', tsvd)])),
('txt4', Pipeline([('s4', TextTransformer(key='brand')), ('tfidf4', tfidf), ('tsvd4', tsvd)]))
],
transformer_weights = {
'txt1': 0.5,
'txt2': 0.25,
'txt3': 0.25,
'txt4': 0.5
},
n_jobs = -1
)),
('rfr', rfr)])
param_grid = {'rfr__max_features': [10], 'rfr__max_depth': [20]}
model = GridSearchCV(estimator = clf, param_grid = param_grid,n_jobs = -1, cv = 10)
model.fit(X_train, y_train)