Custom transformer in sklearn - Python

I tried to build my pipeline and add my custom transformer as follows:
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attribute_names):
        self.attribute_names = attribute_names
    def fit(self, X, y=None):
        return self
    def transform(self, X):
        return X[list(self.attribute_names)]
and
class DummyTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        return None
    def transform(self, X):
        return pd.get_dummies(X).values
    def fit(self, X, y=None):
        return self
but when I do:
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)
pipe= Pipeline(steps=[
    ('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
    ('Encoder', DummyTransform() )
    ('clf',RF)
])
rforest=pipe.fit(X_train,Y_train)
I get the following error:
---------------------------------------------------------------------------
TypeError Traceback (most recent call last)
<ipython-input-168-108f5c7552a0> in <module>()
4 ('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
5 ('Encoder', DummyTransform() )
----> 6 ('clf',RF)
7 ])
8 rforest=pipe.fit(X_train,Y_train)
TypeError: 'tuple' object is not callable
Why?
PS: strangely, this one works:
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)
pipe= Pipeline(steps=[
    ('Selector', DataFrameSelector(attribute_names=('lat','long','type'))), # selects the second and 4th column
    ('Encoder', DummyTransform() )
    #('clf',DecisionTreeClassifier())
])
X=pipe.fit_transform(X_train,Y_train)
RF.fit(X,Y_train)
Edit: RF stands for this line of code:
RF=RandomForestClassifier(n_estimators=100,oob_score=True,random_state=3)

There's a comma missing at the end of the line just above the one flagged in your error. It worked when you commented out the ('clf', RF) step because the line missing the comma then became the last item in the list, where no trailing comma is needed.
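Without that comma, Python parses the two adjacent tuples as a call, ('Encoder', DummyTransform())('clf', RF), which is exactly the "'tuple' object is not callable" error. A minimal sketch of the corrected pipeline:

pipe = Pipeline(steps=[
    ('Selector', DataFrameSelector(attribute_names=('lat', 'long', 'type'))),
    ('Encoder', DummyTransform()),  # <-- comma added here
    ('clf', RF)
])
rforest = pipe.fit(X_train, Y_train)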


ColumnTransformer fit_transform not working with pipeline

I am writing a pipeline with a custom transformer. Calling fit_transform on the categorical pipeline gives the desired result, but when calling fit_transform on the ColumnTransformer, whatever I initialised in __init__ of the custom transformer gets lost.
Note: the code of numericalTransformer is omitted for readability.
class categoryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, use_dates=['year', 'month', 'day']):
        self._use_dates = use_dates
        print('==========>', self._use_dates)

    def fit(self, X, y=None):
        return self

    def get_year(self, obj):
        return str(obj)[:4]

    def get_month(self, obj):
        return str(obj)[4:6]

    def get_day(self, obj):
        return str(obj)[6:8]

    def create_boolean(self, obj):
        if obj == '0':
            return 'No'
        else:
            return 'Yes'

    def transform(self, X, y=None):
        print(self._use_dates)
        for spec in self._use_dates:
            print(spec)
            exec("X.loc[:,'{}'] = X['date'].apply(self.get_{})".format(spec, spec))
        X = X.drop('date', axis=1)
        X.loc[:,'yr_renovated'] = X['yr_renovated'].apply(self.create_boolean)
        X.loc[:, 'view'] = X['view'].apply(self.create_boolean)
        return X.values
cat_pipe = Pipeline([
    ('cat_transform', categoryTransformer()),
    ('one_hot', OneHotEncoder(sparse=False))])

num_pipe = Pipeline([
    ('num_transform', numericalTransformer()),
    ('imputer', SimpleImputer(strategy='median')),
    ('std_scaler', StandardScaler())])

full_pipe = ColumnTransformer([
    ('num', num_pipe, numerical_features),
    ('cat', cat_pipe, categorical_features)])

cat_pipe.fit_transform(data[categorical_features])  # working fine
df2 = full_pipe.fit_transform(X_train)  # __init__ initialisation lost
"output"
==========> ['year', 'month', 'day']
['year', 'month', 'day']
year
month
day
==========> None
None
After that comes a long traceback that I am not able to debug. A workaround would be to create use_dates=['year', 'month', 'day'] in the transform function itself, but I want to understand why this is happening.
The parameters of __init__ need to have the same names as the attributes that get set (so the mismatch between use_dates and _use_dates is the problem).
This is required for cloning to work properly, and ColumnTransformer clones all its transformers before fitting.
https://scikit-learn.org/stable/developers/develop.html#instantiation
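Concretely, the fix is to store the argument under the exact name __init__ receives it as; a minimal sketch showing only the changed lines (the rest of the class stays as in the question):

class categoryTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, use_dates=['year', 'month', 'day']):
        self.use_dates = use_dates  # same name as the parameter, so get_params()/clone() preserve it

    def transform(self, X, y=None):
        for spec in self.use_dates:  # read back under the matching name
            ...                      # body unchanged from the question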

Raising ValueError (extra rows) when submitting pipeline.predict into grader

I've been getting the ValueError below when trying to submit my Pipeline to a grader, and I'm not sure where I'm supposed to shave off 12500 rows of data.
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
I've been tasked to build a model that combines the business_features of nursing homes with their cycle 1 survey results, as well as the time between the cycle 1 and cycle 2 survey to predict the cycle 2 total score.
This is my code that I'm using to accomplish the task above.
# Creating a custom transformer to calculate the difference between survey
# 1 & survey 2 times
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        self.col_1 = X[self.t1_col].apply(pd.to_datetime)
        self.col_2 = X[self.t2_col].apply(pd.to_datetime)
        return self

    def transform(self, X):
        difference_list = []
        difference = self.col_1 - self.col_2
        for obj in difference:
            difference_list.append(obj.total_seconds())
        return np.array(difference_list).reshape(-1, 1)
# Creating TimedeltaTransformer object
cycle_1_date = 'CYCLE_1_SURVEY_DATE'
cycle_2_date = 'CYCLE_2_SURVEY_DATE'
time_feature = TimedeltaTransformer(cycle_1_date, cycle_2_date)

# Using a custom column selector transformer to extract cycle_1_features
cycle_1_cols = ['CYCLE_1_DEFS', 'CYCLE_1_NFROMDEFS', 'CYCLE_1_NFROMCOMP',
                'CYCLE_1_DEFS_SCORE', 'CYCLE_1_NUMREVIS',
                'CYCLE_1_REVISIT_SCORE', 'CYCLE_1_TOTAL_SCORE']
cycle_1_features = Pipeline([
    ('cst2', ColumnSelectTransformer(cycle_1_cols)),
])
# Creating my survey_model Pipeline object
# Pipeline object is a 2 step process: first a feature union transforming
# and combining the business features, cycle_1 features as well as the time
# feature; followed by fitting the transformed features into a
# RandomForestRegressor
survey_model = Pipeline([
    ('features', FeatureUnion([
        ('business', business_features),
        ('survey', cycle_1_features),
        ('time', time_feature),
    ])),
    ('forest', RandomForestRegressor()),
])

# Fitting my pipeline produces no error
survey_model.fit(data, cycle_2_score.astype(int))

# Calling the predict function and passing it into the grader raises a ValueError
grader.score.ml__survey_model(survey_model.predict)
The fitted pipeline looks like this:
Pipeline(memory=None,
    steps=[('features',
        FeatureUnion(n_jobs=None,
            transformer_list=[('business',
                FeatureUnion(n_jobs=None,
                    transformer_list=[('simple',
                        Pipeline(memory=None,
                            steps=[('cst',
                                ColumnSelectTransformer(columns=['BEDCERT', 'RESTOT',
                                    'INHOSP', 'CCRC_FACIL', 'SFF', 'CHOW_LAST_12MOS',
                                    'SPRINKLER_STATUS', 'EXP_TOTAL', 'ADJ_TOTAL'])),
                                ('imputer',
                                    SimpleImpute...
            transformer_weights=None, verbose=False)),
        ('forest',
            RandomForestRegressor(bootstrap=True, criterion='mse',
                max_depth=None, max_features='auto', max_leaf_nodes=None,
                min_impurity_decrease=0.0, min_impurity_split=None,
                min_samples_leaf=1, min_samples_split=2,
                min_weight_fraction_leaf=0.0, n_estimators=10, n_jobs=None,
                oob_score=False, random_state=None, verbose=0,
                warm_start=False))],
    verbose=False)
Some additional context: I'm building this model to have its predict method passed into a custom grader for a project. The grader passes a list of dictionaries to the predict or predict_proba method of my estimator, not a DataFrame. This means that the model must work with both data types. For this reason, I need to provide a custom ColumnSelectTransformer to use instead of scikit-learn's own ColumnTransformer.
Below is additional code related to the business features and ColumnSelectTransformer
# Custom transformer to select columns from a dataframe and return the
# dataframe as an array
class ColumnSelectTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, columns):
        self.columns = columns

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        return X[self.columns].values

simple_features = Pipeline([
    ('cst', ColumnSelectTransformer(simple_cols)),
    ('imputer', SimpleImputer(strategy='mean')),
])

owner_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['OWNERSHIP'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

cert_onehot = Pipeline([
    ('cst', ColumnSelectTransformer(['CERTIFICATION'])),
    ('imputer', SimpleImputer(strategy='most_frequent')),
    ('encoder', OneHotEncoder()),
])

categorical_features = FeatureUnion([
    ('owner_onehot', owner_onehot),
    ('cert_onehot', cert_onehot),
])

business_features = FeatureUnion([
    ('simple', simple_features),
    ('categorical', categorical_features)
])
Finally, below is the full error raised
---------------------------------------------------------------------------
ValueError Traceback (most recent call last)
<ipython-input-165-790ca6139493> in <module>()
----> 1 grader.score.ml__survey_model(survey_model.predict)
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in func(*args, **kw)
92 def __getattr__(self, method):
93 def func(*args, **kw):
---> 94 return self(method, *args, **kw)
95 return func
96
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in __call__(self, question_name, func)
88 return
89 test_cases = json.loads(resp.text)
---> 90 test_cases_grading(question_name, func, test_cases)
91
92 def __getattr__(self, method):
/opt/conda/lib/python3.7/site-packages/static_grader/grader.py in test_cases_grading(question_name, func, test_cases)
40 for test_case in test_cases:
41 if inspect.isroutine(func):
---> 42 sub_res = func(*test_case['args'], **test_case['kwargs'])
43 elif not test_case['args'] and not test_case['kwargs']:
44 sub_res = func
/opt/conda/lib/python3.7/site-packages/sklearn/utils/metaestimators.py in <lambda>(*args, **kwargs)
114
115 # lambda, but not partial, allows help() to work with update_wrapper
--> 116 out = lambda *args, **kwargs: self.fn(obj, *args, **kwargs)
117 # update the docstring of the returned function
118 update_wrapper(out, self.fn)
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in predict(self, X, **predict_params)
419 Xt = X
420 for _, name, transform in self._iter(with_final=False):
--> 421 Xt = transform.transform(Xt)
422 return self.steps[-1][-1].predict(Xt, **predict_params)
423
/opt/conda/lib/python3.7/site-packages/sklearn/pipeline.py in transform(self, X)
963 return np.zeros((X.shape[0], 0))
964 if any(sparse.issparse(f) for f in Xs):
--> 965 Xs = sparse.hstack(Xs).tocsr()
966 else:
967 Xs = np.hstack(Xs)
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in hstack(blocks, format, dtype)
463
464 """
--> 465 return bmat([blocks], format=format, dtype=dtype)
466
467
/opt/conda/lib/python3.7/site-packages/scipy/sparse/construct.py in bmat(blocks, format, dtype)
584 exp=brow_lengths[i],
585 got=A.shape[0]))
--> 586 raise ValueError(msg)
587
588 if bcol_lengths[j] == 0:
ValueError: blocks[0,:] has incompatible row dimensions. Got blocks[0,2].shape[0] == 13892, expected 1544.
Fixing my TimedeltaTransformer helped. The original version cached the date columns of the training data in fit and ignored the X passed to transform, so it always returned 13892 rows (one per training sample) even when the grader passed in 1544 records.
class TimedeltaTransformer(BaseEstimator, TransformerMixin):
    def __init__(self, t1_col, t2_col):
        self.t1_col = t1_col
        self.t2_col = t2_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        if not isinstance(X, pd.DataFrame):
            X = pd.DataFrame(X)
        timedelta_series = (pd.to_datetime(X[self.t1_col]) - pd.to_datetime(X[self.t2_col]))
        array_list = []
        for x in timedelta_series:
            array_list.append(x.total_seconds())
        return np.array(array_list).reshape(-1, 1)
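In the fixed version, fit is stateless, so the number of output rows always matches whatever X is actually passed to transform, and the FeatureUnion blocks line up whether the input is the training DataFrame or the grader's list of dictionaries.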

Combining CountVectorizer and SelectKBest causes labels to disappear

I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it on to CountVectorizer and the KBest selector. If I remove KBest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin

class ItemSelector(BaseEstimator, TransformerMixin):
    # returns a single column from a DF
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class LogisticRegressionWithWordFeatures(object):
    def __init__(self):
        self.model = LogisticRegression()

    def fit(self, df, labels):
        self.pipeline = self.get_preprocessing_pipeline(df)
        fitted_df = self.pipeline.fit_transform(df)
        self.model.fit(fitted_df, labels)
        return self

    def predict(self, df):
        fitted_df = self.pipeline.transform(df)
        y = self.model.predict(fitted_df)
        return y

    def get_preprocessing_pipeline(self, data_frame):
        """
        Get data frame containing features and labels from raw feature input DF.
        :param input_file: input DF
        """
        process_and_join_features = Pipeline([
            ('features', FeatureUnion([
                ('count_lemma_features', Pipeline([
                    ('selector', ItemSelector(key='clean_Invoice_Description')),
                    ('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
            ('reducer', SelectKBest(chi2, k=1000))
        ])
        return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)
<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
6 def fit(self, df, labels):
7 self.pipeline = self.get_preprocessing_pipeline(df)
----> 8 fitted_df = self.pipeline.fit_transform(df)
9 self.model.fit(fitted_df, labels)
10 return self
~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
551 if y is None:
552 # fit method of arity 1 (unsupervised transformation)
--> 553 return self.fit(X, **fit_params).transform(X)
554 else:
555 # fit method of arity 2 (supervised transformation)
TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making their way into the pipeline. I tried adding another ItemSelector for the training labels:
process_and_join_features = Pipeline([
    ('features', FeatureUnion([
        ('count_lemma_features', Pipeline([
            ('selector', ItemSelector(key='clean_Invoice_Description')),
            ('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
        ('labels', ItemSelector(key='Expense_Category'))])),
    ('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a KeyError for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with the step-by-step way is that there are other features in other columns that I would like to use along with these, and a pipeline provides a nice way of doing so without having to combine them manually.
Solved it. The main issue was the nesting of each feature. Pipeline() expects a list of tuples, where the first item in the tuple is the feature/pipe name and the second is the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
    """
    Get data frame containing features and labels from raw feature input csv file"""
    process_and_join_features = Pipeline([
        ('features',
         FeatureUnion([
             ('tokens',
              Pipeline([
                  ('selector', ItemSelector(key='clean_Invoice_Description')),
                  ('vec', CountVectorizer(analyzer="word", stop_words='english')),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ])),
             ('hypernyms',
              Pipeline([
                  ('selector', ItemSelector(key='hypernyms_combined')),
                  ('vec', TfidfVectorizer(analyzer="word")),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ]))]))])
    return process_and_join_features
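Note that with SelectKBest nested inside each sub-pipeline, the labels still have to reach chi2 at fit time, so the fit method presumably now forwards them as well (an assumption; that line is not shown above):

fitted_df = self.pipeline.fit_transform(df, labels)  # y is passed down to each SelectKBest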

How to fix "TypeError: fit_transform() takes 2 positional arguments but 3 were given"

I tried to write a complicated pipeline with my own classes and got this error:
TypeError: fit_transform() takes 2 positional arguments but 3 were given
I tried to apply solutions from a similar issue by using a custom LabelBinarizer, but the error was not fixed.
class NewLabelBinarizer(LabelBinarizer):
    def fit(self, X, y=None):
        return super(NewLabelBinarizer, self).fit(X)

    def transform(self, X, y=None):
        return super(NewLabelBinarizer, self).transform(X)

    def fit_transform(self, X, y=None):
        return super(NewLabelBinarizer, self).fit(X).transform(X)

class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
    def __init__(self, *args, **kwargs):
        pass

    def fit(self, y, X=None):
        super(LabelPreprocessing, self).fit(y)
        return self

    def transform(self, y, X=None):
        y = y.str.findall(r'([a-zA-Z]{3,})')
        y = y.replace(regex=r'(film)', value=' ')
        return y.values

class PlotPreprocessing(BaseEstimator, TransformerMixin):
    def __init__(self, *args, **kwargs):
        self.REPLACE_BY_SPACE_RE = re.compile('[/(){}\[\]\|#,;]')
        self.BAD_SYMBOLS_RE = re.compile('[^0-9a-z #+_]')

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X.str.lower()
        X = X.map(lambda x: re.sub(self.REPLACE_BY_SPACE_RE, " ", x))
        X = X.map(lambda x: re.sub(r'\s+', " ", x))
        return X.values

pipeline = Pipeline([
    ('text_preparation', FeatureUnion([
        ('label', Pipeline([
            ('labelPreprocessing', LabelPreprocessing()),
            ('mlb', MultiLabelBinarizer())
        ])),
        ('plot', PlotPreprocessing())
    ])),
    ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1,2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')),
    ('model', LinearRegression())
])
train_X, train_y, test_X, test_y = train_test_split(plot, label)
pipeline.fit(train_X, train_y)
y_pred = pipeline.predict( test_X )
And I get
TypeError Traceback (most recent call last)
<ipython-input-27-a8a60de025fd> in <module>()
17 train_X, train_y, test_X, test_y = train_test_split(plot, label)
18
---> 19 pipeline.fit(train_X, train_y)
20 y_pred = pipeline.predict( test_X )
14 frames
/usr/local/lib/python3.6/dist-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
391 return Xt
392 if hasattr(last_step, 'fit_transform'):
--> 393 return last_step.fit_transform(Xt, y, **fit_params)
394 else:
395 return last_step.fit(Xt, y, **fit_params).transform(Xt)
TypeError: fit_transform() takes 2 positional arguments but 3 were given
Also I tried to add **fit_params to fit/predict params.
I think the problem is with the MultiLabelBinarizer signature. Try replacing it with:
class MLBinarizer(MultiLabelBinarizer):
    def fit_transform(self, X, y=None):
        return super(MultiLabelBinarizer, self).fit_transform(X)
Note that the methods fit and fit_transform take X as the first argument. So I recommend rewriting your LabelPreprocessing like this:
class LabelPreprocessing(NewLabelBinarizer, TransformerMixin):
    def fit(self, X, y=None):
        super(LabelPreprocessing, self).fit(X)
        return self

    def transform(self, y, X=None):
        y = y.str.findall(r'([a-zA-Z]{3,})')
        y = y.replace(regex=r'(film)', value=' ')
        return y.values
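With that in place, only the 'mlb' step of the pipeline needs to change, swapping MLBinarizer in for MultiLabelBinarizer (a sketch; every other step stays as in the question):

pipeline = Pipeline([
    ('text_preparation', FeatureUnion([
        ('label', Pipeline([
            ('labelPreprocessing', LabelPreprocessing()),
            ('mlb', MLBinarizer())  # custom subclass whose fit_transform ignores the extra y argument
        ])),
        ('plot', PlotPreprocessing())
    ])),
    ('tfidf_vectorizer', TfidfVectorizer(ngram_range=(1, 2), max_df=0.9, min_df=5, token_pattern=r'(\S+)')),
    ('model', LinearRegression())
])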

How to use leave-one-out encoding in sklearn pipelines

I would like to test different encoding strategies as implemented in the categorical encoding package, using sklearn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', LeaveOneOutEncoder()),
])

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let me show just part of the code, the way I do it. I added XGBRegressor because I think you may be predicting housing prices.
class MultiColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]

class Imputation(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.fillna(NUMERIC.median())

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

class Cat(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        enc = DictVectorizer(sparse=False)
        encc = enc.fit(CATEGORICAL.T.to_dict().values())
        enc_data = encc.transform(X.T.to_dict().values())
        enc_data[np.isnan(enc_data)] = 1
        return enc_data

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
And the Pipeline:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                ('scaling', preprocessing.StandardScaler(with_mean=0.))
            ])),
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('one_hot', Cat()),
                ('cat_imputer', CategoricalImputer())  # each step needs a (name, transformer) tuple
            ])),
        ])),
    ('model_fitting', xgb.XGBRegressor()),
])
Your categorical encoder (LeaveOneOutEncoder) needs the target variable in order to fit and replace the labels (levels) of the variables defined in cat_attribs. So you just need to invoke the fit_transform method together with y_train:
housing_prepared = full_pipeline.fit_transform(housing, y_train)
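At prediction time there is no target available, so only transform is called; the usual pattern with a supervised encoder looks like this (housing_test here is a hypothetical held-out frame):

housing_prepared = full_pipeline.fit_transform(housing, y_train)  # y is required to fit LeaveOneOutEncoder
housing_test_prepared = full_pipeline.transform(housing_test)     # no y needed afterwards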
