I would like to test different encoding strategies, as implemented in the categorical-encoding package (category_encoders), using scikit-learn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]
num_pipeline = Pipeline([
('selector', DataFrameSelector(num_attribs)),
('imputer', Imputer(strategy="median")),
('std_scaler', StandardScaler()),
])
cat_pipeline = Pipeline([
('selector', DataFrameSelector(cat_attribs)),
('cat_encoder', LeaveOneOutEncoder()),
])
from sklearn.pipeline import FeatureUnion
full_pipeline = FeatureUnion(transformer_list=[
("num_pipeline", num_pipeline),
("cat_pipeline", cat_pipeline),
])
housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let me show just part of the code, the way I do it. I added an XGBRegressor because I think you may want to predict housing prices:
class MultiColumn(BaseEstimator, TransformerMixin):
def __init__(self,columns = None):
self.columns = columns # array of column names to encode
def fit(self,X,y=None):
return self
def transform(self, X):
return X[self.columns]
NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]
class Imputation(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
return X.fillna(NUMERIC.median())
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
class Cat(BaseEstimator, TransformerMixin):
def transform(self, X, y=None, **fit_params):
enc = DictVectorizer(sparse = False)
encc = enc.fit(CATEGORICAL.T.to_dict().values())
enc_data = encc.transform(X.T.to_dict().values())
enc_data[np.isnan(enc_data)] = 1
return enc_data
def fit_transform(self, X, y=None, **fit_params):
self.fit(X, y, **fit_params)
return self.transform(X)
def fit(self, X, y=None, **fit_params):
return self
And the Pipeline:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                ('scaling', preprocessing.StandardScaler(with_mean=0.))
            ])),
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('cat_imputer', CategoricalImputer()),  # impute before encoding; every step must be a (name, transformer) pair
                ('one_hot', Cat())
            ])),
        ])),
    ('model_fitting', xgb.XGBRegressor()),
])
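With this in place, fitting and predicting is one call each. A minimal sketch, assuming df holds the var1–var4 columns and y is the target series:

pipeline.fit(df, y)
predictions = pipeline.predict(df)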
Your categorical encoder (LeaveOneOutEncoder) needs the target variable to compute the values that replace the labels (levels) of the variables defined in cat_attribs. So you just need to pass y_train along when invoking the fit_transform method:
housing_prepared = full_pipeline.fit_transform(housing, y_train)
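At prediction time only transform is called, so the target is not needed there; for a held-out set (assuming an X_test frame with the same columns) it is simply:

X_test_prepared = full_pipeline.transform(X_test)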
Related
I have a train/test split named X_train and y_train that I pass to a scikit-learn pipeline. Is it possible to have a custom step that transforms only y_train, i.e. removes NaNs and infs from y_train?
class columnDropperTransformer():
def __init__(self,columns):
self.columns=columns
def transform(self,X,y=None):
print('---- Dropping ID cols :', self.columns)
return X.drop(self.columns,axis=1)
def fit(self, X, y=None):
return self
print('---- Making pipeline')
drop_cols = Pipeline(steps=[
("columnDropper", columnDropperTransformer(id_cols))
])
feature_remover = Pipeline(steps=[
("columnDropper", missingRemover())
])
fill_na_zero_transformer = Pipeline(steps=[
('zero_imputer', SimpleImputer(strategy='constant', fill_value=0))
])
numeric_transformer = Pipeline(steps=[
('imputer', SimpleImputer(strategy = "constant", fill_value=-1, add_indicator=True)),
('scaler', StandardScaler())
])
class SkipSimpleImputer(SimpleImputer):
    def __init__(self, **kwargs):
        super().__init__(**kwargs)

    def fit(self, X, y=None):
        if 'MARITAL_STATUS' in X.columns:
            super().fit(X[['MARITAL_STATUS']])  # fit on a 2-D slice so the imputer learns its statistics
        return self

    def transform(self, X, y=None):
        if 'MARITAL_STATUS' in X.columns:
            print('\t---- MARITAL STATUS found in skipsimpleimpute, all cols are: ', X.columns)
            # SimpleImputer expects 2-D input; ravel() flattens the result back into the column
            X['MARITAL_STATUS'] = super().transform(X[['MARITAL_STATUS']]).ravel()
        return X
categorical_transformer = Pipeline(steps=[
('categorical_imputer', SkipSimpleImputer(strategy="constant", fill_value='Unknown')),
('encoder', OneHotEncoder(handle_unknown='ignore'))
])
preprocess_ppl = ColumnTransformer(
transformers=[
('encode', categorical_transformer, make_column_selector(dtype_include=object)),
('zero_impute', fill_na_zero_transformer, lambda X: [col for col in fill_zero_cols if col in X.columns] ),
('numeric', numeric_transformer, lambda X: [col for col in num_cols if col in X.columns])
]
)
pipeline2 = Pipeline(
steps=[
('dropper', drop_cols),
('remover',feature_remover),
("preprocessor", preprocess_ppl),
("estimator", customOLS(sm.OLS, LinearRegression()))
]
)
Could this be done via a custom ColumnTransformer or via a pipeline step transformer? In a custom ColumnTransformer we return X, so how do we update y?
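One hedged workaround, since sklearn transformers may return a modified X but never a modified y: filter the target outside the pipeline, before fitting. A minimal sketch, assuming y_train is a numeric Series that shares its index with X_train:

import numpy as np

mask = np.isfinite(y_train)  # False for NaN and +/-inf
pipeline2.fit(X_train.loc[mask], y_train.loc[mask])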
I am writing a pipeline with a custom transformer. When calling fit_transform of the categorical pipeline I get the desired result, but when calling fit_transform of the ColumnTransformer, whatever I have initialised in the __init__ of the custom transformer is lost.
Note: the code of numericalTransformer is not included, for readability.
class categoryTransformer(BaseEstimator, TransformerMixin):
def __init__(self, use_dates=['year', 'month', 'day']):
self._use_dates = use_dates
print('==========>',self._use_dates)
def fit(self, X, y=None):
return self
def get_year(self, obj):
return str(obj)[:4]
def get_month(self, obj):
return str(obj)[4:6]
def get_day(self, obj):
return str(obj)[6:8]
def create_boolean(self, obj):
if obj == '0':
return 'No'
else:
return 'Yes'
    def transform(self, X, y=None):
        print(self._use_dates)
        for spec in self._use_dates:
            print(spec)
            # equivalent to the original exec-based line, without exec: looks up self.get_year, etc.
            X.loc[:, spec] = X['date'].apply(getattr(self, 'get_' + spec))
        X = X.drop('date', axis=1)
        X.loc[:, 'yr_renovated'] = X['yr_renovated'].apply(self.create_boolean)
        X.loc[:, 'view'] = X['view'].apply(self.create_boolean)
        return X.values
cat_pipe = Pipeline([
('cat_transform', categoryTransformer()),
('one_hot', OneHotEncoder(sparse=False))])
num_pipe = Pipeline([
('num_transform', numericalTransformer()),
('imputer', SimpleImputer(strategy = 'median')),
('std_scaler', StandardScaler())])
full_pipe = ColumnTransformer([
('num', num_pipe, numerical_features),
('cat', cat_pipe, categorical_features)])
cat_pipe.fit_transform(data[categorical_features])#working fine
df2 = full_pipe.fit_transform(X_train)# __init__ initialisation lost
"output"
==========> ['year', 'month', 'day']
['year', 'month', 'day']
year
month
day
==========> None
None
After that comes a long traceback that I am not able to debug. A workaround would be to create use_dates=['year', 'month', 'day'] in the transform function itself, but I want to understand why this is happening.
The parameters of __init__ need to have the same names as the attributes that get set (so the mismatch between use_dates and _use_dates is the problem).
This is required for cloning to work properly, and ColumnTransformer clones all its transformers before fitting.
https://scikit-learn.org/stable/developers/develop.html#instantiation
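So the fix is to store the argument under its own name (and to read self.use_dates in transform). A sketch of the corrected constructor:

class categoryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, use_dates=['year', 'month', 'day']):
        self.use_dates = use_dates  # same name as the parameter, so get_params()/clone() can round-trip it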
I have been trying to get the feature names from my model for quite some time now, but I have a hard time understanding how to do it. I have tried many posts on here but can't get it to work. Here is my code:
Loading the classes I need to combine TfidfVectorizer with other features:
from sklearn.base import TransformerMixin, BaseEstimator
class ItemSelector(BaseEstimator, TransformerMixin):
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class FeatureTypeSelector(TransformerMixin, BaseEstimator):
FEATURE_TYPES = {
'categorical': [
'COLUMN_A','COLUMN_B'
],
'continuous': [
'COLUMN_C', 'COLUMN_D'
]
}
def __init__(self, feature_type):
self.columns = self.FEATURE_TYPES[feature_type]
def fit(self, X, y=None):
return self
def transform(self, X):
return X[self.columns]
class RowToDictTransformer(TransformerMixin, BaseEstimator):
def fit(self, X, y=None):
return self
def transform(self, X):
return (row[1] for row in X.iterrows())
Then the code to put everything in a pipeline and run the regressor:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer  # used below but missing from the original imports
from sklearn.preprocessing import RobustScaler
from sklearn.linear_model import LogisticRegression, LinearRegression
from sklearn.preprocessing import MinMaxScaler
# Create the preprocessor
preprocessor = make_union(
make_pipeline(
ItemSelector(key='TEXT_COLUMN'),
TfidfVectorizer(lowercase=False, min_df=1),
),
make_pipeline(
FeatureTypeSelector('continuous'),
MinMaxScaler(),
),
make_pipeline(
FeatureTypeSelector('categorical'),
RowToDictTransformer(),
DictVectorizer(sparse=False), # set sparse=True if you get MemoryError
),
)
# fit and transform the data
preprocessor.fit_transform(x_train)
# choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()
# Create the model
model = make_pipeline(preprocessor, estimator)
# Training the model
model.fit(x_train, y_train)
# Predicting the model
predicted = model.predict(x_test)
I can run model.coef_ to get all the coefficients, but I want to see how each item of the TEXT_COLUMN is affected by which weight. I have tried calling get_feature_names() and tried passing the names through the pipeline, but with no success (most of Google's results are purple by now).
Can anyone give me a bit of guidance on how to pass the feature names to the end of the pipeline? The ideal result would be a dataframe with the feature (a row from the TEXT_COLUMN) and its feature_weight as value.
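A hedged sketch of one way to do this, assuming the auto-generated step names that make_pipeline/make_union produce ('featureunion', 'pipeline-1', 'tfidfvectorizer', 'linearregression'), and relying on the TF-IDF block being first in the union so its columns occupy the first positions of coef_:

import pandas as pd

union = model.named_steps['featureunion']
tfidf = dict(union.transformer_list)['pipeline-1'].named_steps['tfidfvectorizer']
text_names = tfidf.get_feature_names_out()  # use get_feature_names() on older sklearn
coefs = model.named_steps['linearregression'].coef_

# pair each TF-IDF token with its regression weight
weights = pd.DataFrame({'feature': text_names, 'weight': coefs[:len(text_names)]})
weights = weights.sort_values('weight', ascending=False)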
I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it on to CountVectorizer and the KBest selector. If I remove KBest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin
class ItemSelector(BaseEstimator, TransformerMixin):
# returns a single column from a DF
def __init__(self, key):
self.key = key
def fit(self, x, y=None):
return self
def transform(self, data_dict):
return data_dict[self.key]
class LogisticRegressionWithWordFeatures(object):
def __init__(self):
self.model = LogisticRegression()
def fit(self, df, labels):
self.pipeline = self.get_preprocessing_pipeline(df)
fitted_df = self.pipeline.fit_transform(df)
self.model.fit(fitted_df, labels)
return self
def predict(self, df):
fitted_df = self.pipeline.transform(df)
y = self.model.predict(fitted_df)
return y
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input DF.
:param input_file: input DF
"""
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9 self.model.fit(fitted_df, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making their way into the pipeline. I tried adding another ItemSelector for the training labels:
process_and_join_features = Pipeline([
('features', FeatureUnion([
('count_lemma_features', Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
('labels', ItemSelector(key='Expense_Category'))])),
('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a KeyError for the label (Expense_Category), even though that column is present in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with doing it step by step is that there are other features in other columns that I would like to use along with these, and a pipeline provides a nice way of doing so without having to combine them manually.
Solved it. The main issue was the nesting of each feature. Pipeline() expects a list of tuples, where the first item in the tuple is the feature/pipe name and the second is the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
"""
Get data frame containing features and labels from raw feature input csv file"""
process_and_join_features = Pipeline([
('features',
FeatureUnion([
('tokens',
Pipeline([
('selector', ItemSelector(key='clean_Invoice_Description')),
('vec', CountVectorizer(analyzer="word", stop_words='english')),
('dim_red', SelectKBest(chi2, k=5000))
])),
('hypernyms',
Pipeline([
('selector', ItemSelector(key='hypernyms_combined')),
('vec', TfidfVectorizer(analyzer="word")),
('dim_red', SelectKBest(chi2, k=5000))
]))]))])
return process_and_join_features
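One caveat worth adding: SelectKBest(chi2, ...) is supervised, so the labels still have to reach it. When fitting this preprocessing pipeline on its own, pass them explicitly (otherwise the original TypeError comes back):

fitted_df = self.pipeline.fit_transform(df, labels)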
I have tried to write a complicated pipeline with my own classes, and I got this error:
TypeError: fit_transform() takes 2 positional arguments but 3 were given
I tried to apply solutions from a similar issue by using a custom LabelBinarizer, but the error was not fixed.
class NewLabelBinarizer(LabelBinarizer):
def fit(self, X, y=None):
return super(NewLabelBinarizer, self).fit(X)
def transform(self, X, y=None):
return super(NewLabelBinarizer, self).transform(X)
def fit_transform(self, X, y=None):
return super(NewLabelBinarizer, self).fit(X).transform(X)
class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
def __init__(self, *args, **kwargs):
pass
def fit(self, y,X=None):
super(LabelPreprocessing, self).fit(y)
return self
def transform(self, y, X=None):
y = y.str.findall(r'([a-zA-Z]{3,})')
y = y.replace(regex=r'(film)', value=' ')
return y.values
class PlotPreprocessing(BaseEstimator, TransformerMixin):
def __init__(self, *args, **kwargs):
self.REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|#,;]')
self.BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')
def fit(self, X, y=None):
return self
def transform(self, X, y=None):
X = X.str.lower()
X = X.map(lambda x: re.sub(self.REPLACE_BY_SPACE_RE," ",x))
X = X.map(lambda x: re.sub(r'\s+'," ",x))
return X.values
pipeline = Pipeline([
    ('text_preparation', FeatureUnion([
        ('label', Pipeline([
            ('labelPreprocessing', LabelPreprocessing()),
            ('mlb', MultiLabelBinarizer())
        ])),
        ('plot', PlotPreprocessing())
    ])),
    ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')),
    ('model', LinearRegression())
])
train_X, test_X, train_y, test_y = train_test_split(plot, label)  # train_test_split returns X_train, X_test, y_train, y_test
pipeline.fit(train_X, train_y)
y_pred = pipeline.predict( test_X )
And I get
TypeError Traceback (most recent call last)
<ipython-input-27-a8a60de025fd> in <module>()
17 train_X, test_X, train_y, test_y = train_test_split(plot, label)
18
---> 19 pipeline.fit(train_X, train_y)
20 y_pred = pipeline.predict( test_X )
14 frames
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
I also tried adding **fit_params to the fit/predict parameters.
I think the problem is with the MultiLabelBinarizer signature. Try replacing it with:
class MLBinarizer(MultiLabelBinarizer):
    def fit_transform(self, X, y=None):
        # accept (X, y) like any other transformer, then delegate to MultiLabelBinarizer
        return super().fit_transform(X)
Note that the fit and fit_transform methods take X as the first argument, so I recommend rewriting your LabelPreprocessing like this:
class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
    def fit(self, X, y=None):
        super(LabelPreprocessing, self).fit(X)
        return self

    def transform(self, X, y=None):
        X = X.str.findall(r'([a-zA-Z]{3,})')
        X = X.replace(regex=r'(film)', value=' ')
        return X.values
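With that, the patched binarizer can be dropped into the original FeatureUnion unchanged (hypothetical usage, mirroring the pipeline above):

('label', Pipeline([
    ('labelPreprocessing', LabelPreprocessing()),
    ('mlb', MLBinarizer())
])),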