AttributeError: 'Pipeline' object has no attribute 'get_feature_names' - python

I have a Pipeline built as follows:
Pipeline(steps=[('preprocessor',
                 ColumnTransformer(remainder='passthrough',
                                   transformers=[('text',
                                                  Pipeline(steps=[('CV', CountVectorizer())]),
                                                  'Tweet'),
                                                 ('category',
                                                  OneHotEncoder(handle_unknown='ignore'),
                                                  ['Tweet_ID']),
                                                 ('numeric',
                                                  Pipeline(steps=[('knnImputer', KNNImputer(n_neighbors=2)),
                                                                  ('scaler', MinMaxScale...
                                                  'CS', 'UC', 'CL', 'S', 'SS', 'UW', ...])])),
                ('classifier', LogisticRegression())])
I am trying to get feature names:
feature_names = lr['preprocessor'].transformers_[0][1].get_feature_names()
coefs = lr.named_steps["classifier"].coef_.flatten()
zipped = zip(feature_names, coefs)
features_df = pd.DataFrame(zipped, columns=["feature", "value"])
features_df["ABS"] = features_df["value"].apply(lambda x: abs(x))
features_df["colors"] = features_df["value"].apply(lambda x: "green" if x > 0 else "red")
features_df = features_df.sort_values("ABS", ascending=False)
features_df
However I am getting an error:
----> 6 feature_names = lr['preprocessor'].transformers_[0][1].get_feature_names()
7 coefs = lr.named_steps["classifier"].coef_.flatten()
8
AttributeError: 'Pipeline' object has no attribute 'get_feature_names'
I already went through the following answers:
'OneHotEncoder' object has no attribute 'get_feature_names'
'Pipeline' object has no attribute 'get_feature_names' in scikit-learn
but unfortunately they were not as helpful as I had expected.
Does anyone know how to fix it?
Happy to provide more info, if needed.
An example of the pipeline is the following:
lr = Pipeline(steps=[('preprocessor', preprocessing),
                     ('classifier', LogisticRegression(C=5, tol=0.01, solver='lbfgs', max_iter=10000))])
where preprocessing is
preprocessing = ColumnTransformer(
    transformers=[
        ('text', text_preprocessing, 'Tweet'),
        ('category', categorical_preprocessing, c_feat),
        ('numeric', numeric_preprocessing, n_feat)
    ], remainder='passthrough')
Before splitting into train and test sets, I separate the different types of features:
text_columns=['Tweet']
target=['Label']
c_feat=['Tweet_ID']
num_features=['CS','UC','CL','S','SS','UW']
Following David's answer and the linked example, I have tried the following:
For numerical:
class NumericalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        # Numerical features to pass down the numerical pipeline
        X = X[num_features]  # num_features is already a list of column names
        X = X.replace([np.inf, -np.inf], np.nan)
        return X.values

# Defining the steps in the numerical pipeline
numerical_pipeline = Pipeline(steps=[
    ('num_transformer', NumericalTransformer()),
    ('imputer', KNNImputer(n_neighbors=2)),
    ('minmax', MinMaxScaler())])
For categorical:
class CategoricalTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        # Categorical features to pass down the categorical pipeline
        return X[c_feat].values  # c_feat is already a list of column names

# Defining the steps in the categorical pipeline
categorical_pipeline = Pipeline(steps=[
    ('cat_transformer', CategoricalTransformer()),
    ('one_hot_encoder', OneHotEncoder(handle_unknown='ignore'))])
and for the text feature:
class TextTransformer(BaseEstimator, TransformerMixin):
    def __init__(self):
        super().__init__()

    # Return self, nothing else to do here
    def fit(self, X, y=None):
        return self

    # Helper function that converts values to Binary depending on input
    def create_binary(self, obj):
        if obj == 0:
            return 'No'
        else:
            return 'Yes'

    # Transformer method for this transformer
    def transform(self, X, y=None):
        # Text feature to pass down the text pipeline;
        # CountVectorizer expects a 1-D iterable of strings
        return X['Tweet'].values

# Defining the steps in the text pipeline
text_pipeline = Pipeline(steps=[
    ('text_transformer', TextTransformer()),
    ('cv', CountVectorizer())])
Then I combine the numerical, text and categorical pipelines horizontally into one big pipeline:
# using FeatureUnion
union_pipeline = FeatureUnion(transformer_list=[
    ('categorical_pipeline', categorical_pipeline),
    ('numerical_pipeline', numerical_pipeline),
    ('text_pipeline', text_pipeline)])
and finally:
# Combining the custom imputer with the categorical, text and numerical pipeline
preprocess_pipeline = Pipeline(steps=[('custom_imputer', CustomImputer()),
                                      ('full_pipeline', union_pipeline)])
What is still not clear is how to get the feature names.

Since you are using custom transformers, you need to implement a dedicated get_feature_names method on them yourself.
Please refer to this question for details, where you can find a code example.
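As an illustration, here is a minimal, untested sketch of the idea, using the transformer and pipeline names defined in the question (note that on older scikit-learn versions OneHotEncoder and CountVectorizer expose get_feature_names(); newer versions use get_feature_names_out() instead):
# Illustrative sketch only: give the custom numerical transformer a
# get_feature_names method so it can report the columns it passes through.
class NumericalTransformer(BaseEstimator, TransformerMixin):
    def fit(self, X, y=None):
        return self

    def transform(self, X, y=None):
        X = X[num_features].replace([np.inf, -np.inf], np.nan)
        return X.values

    def get_feature_names(self):
        # the numeric columns keep their original names
        return num_features

# After fitting preprocess_pipeline, the names of all output columns can be
# collected from the fitted sub-steps, in the same order in which
# FeatureUnion concatenates them (categorical, numerical, text):
union = preprocess_pipeline.named_steps['full_pipeline']
cat_names = union.transformer_list[0][1].named_steps['one_hot_encoder'].get_feature_names()
num_names = union.transformer_list[1][1].named_steps['num_transformer'].get_feature_names()
text_names = union.transformer_list[2][1].named_steps['cv'].get_feature_names()
feature_names = list(cat_names) + list(num_names) + list(text_names)
The concatenation order here has to match the transformer_list order of the FeatureUnion, because that is the order in which the transformed blocks are stacked and therefore the order the classifier coefficients refer to.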

Related

Create iterator from a Data Frame in Python

I am working on an NLP project using Seq2Seq. I created a data frame from my dataset and then created a batch iterator using DataLoader; see the following code:
# creates lists containing each pair
original_word_pairs = [[w for w in l.split('\t')] for l in lines[:num_examples]]
data = pd.DataFrame(original_word_pairs, columns=["src", "trg"])

# convert the data to tensors and pass to the Dataloader
# to create a batch iterator
class MyData(Dataset):
    def __init__(self, X, y):
        self.data = X
        self.target = y
        # TODO: convert this into torch code if possible
        self.length = [np.sum(1 - np.equal(x, 0)) for x in X]

    def __getitem__(self, index):
        x = self.data[index]
        y = self.target[index]
        x_len = self.length[index]
        return x, y, x_len

    def __len__(self):
        return len(self.data)

train_dataset = MyData(input_tensor_train, target_tensor_train)
val_dataset = MyData(input_tensor_val, target_tensor_val)

train_dataset = DataLoader(train_dataset, batch_size=BATCH_SIZE,
                           drop_last=True,
                           shuffle=True)
test_dataset = DataLoader(val_dataset, batch_size=BATCH_SIZE,
                          drop_last=True,
                          shuffle=True)
That is part of my code; the thing is, I want to use the iterator like this:
for i, batch in enumerate(iterator):
    src = batch.src
    trg = batch.trg
But I got an error "AttributeError: 'list' object has no attribute 'src'"
How can I use the iterator and access a specific column?
You can redefine __getitem__ in your Dataset to return a dictionary:
def __getitem__(self, index):
    x = self.data[index]
    y = self.target[index]
    x_len = self.length[index]
    return {"src": x, "trg": y, "x_len": x_len}
The default collate_fn of DataLoader will then provide a dictionary of batches instead of single observations, but you need to convert x_len to a tensor inside __getitem__ for this to work (or you can pass a custom collate_fn).
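For example, a minimal sketch (assuming, as in the question, that the items stored in self.data and self.target are already tensors, and that train_dataset is the DataLoader defined above):
import torch

def __getitem__(self, index):
    x = self.data[index]
    y = self.target[index]
    # wrap the length in a tensor so the default collate_fn can batch it
    x_len = torch.tensor(self.length[index])
    return {"src": x, "trg": y, "x_len": x_len}

# the DataLoader then yields dictionaries whose values are batched tensors
for i, batch in enumerate(train_dataset):
    src = batch["src"]
    trg = batch["trg"]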

Why is target encoder encoding some values as NaN?

I am using a target encoder from category_encoders to encode a feature; here is the code I am using:
from category_encoders import TargetEncoder
def encode_large_features(features, X_train, X_test, y_train):
    print('target encoding features ...')
    for _ in features:
        target_encoder = TargetEncoder(_)
        target_encoder.fit(X_train[_], y_train)
        name = _ + '_encoded'
        X_train[name] = target_encoder.transform(X_train[_])
        X_train.drop([_], axis=1, inplace=True)
        X_test[name] = target_encoder.transform(X_test[_])
        X_test.drop([_], axis=1, inplace=True)
    return X_train, X_test
The target encoder encodes some values as NaN and I don't know why. Here is an example:
Faced the same issue: Raised Issue in Repo.
Found a workaround by building a custom KFold target encoder, which works better than the library version: a KFold target encoder is less susceptible to data leakage and has fewer chances of overfitting.
It will not return NaN in the training dataset the way the category_encoders library does.
Example below: chid is a categorical column; apply KFoldTargetEncoder on it.
Libraries required:
import numpy as np
from tqdm import tqdm
from sklearn.model_selection import KFold
from sklearn import base
Training Dataset:
class KFoldTargetEncoderTrain(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, colnames, targetName, n_fold=5, verbosity=True, discardOriginal_col=False):
        self.colnames = colnames
        self.targetName = targetName
        self.n_fold = n_fold
        self.verbosity = verbosity
        self.discardOriginal_col = discardOriginal_col

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        assert(type(self.targetName) == str)
        assert(type(self.colnames) == str)
        assert(self.colnames in X.columns)
        assert(self.targetName in X.columns)
        mean_of_target = X[self.targetName].mean()
        kf = KFold(n_splits=self.n_fold, shuffle=False, random_state=2019)
        col_mean_name = self.colnames + '_' + 'Kfold_Target_Enc'
        X[col_mean_name] = np.nan
        for tr_ind, val_ind in kf.split(X):
            X_tr, X_val = X.iloc[tr_ind], X.iloc[val_ind]
            X.loc[X.index[val_ind], col_mean_name] = X_val[self.colnames].map(
                X_tr.groupby(self.colnames)[self.targetName].mean())
        X[col_mean_name].fillna(mean_of_target, inplace=True)
        if self.verbosity:
            encoded_feature = X[col_mean_name].values
            print('Correlation between the new feature, {} and, {} is {}.'.format(
                col_mean_name,
                self.targetName,
                np.corrcoef(X[self.targetName].values, encoded_feature)[0][1]))
        if self.discardOriginal_col:
            X = X.drop(self.targetName, axis=1)
        return X
Fit_Transform on Training Data:
targetc_chid = KFoldTargetEncoderTrain('chid','target',n_fold=5)
train_df = targetc_chid.fit_transform(train_df)
Test Dataset:
class KFoldTargetEncoderTest(base.BaseEstimator, base.TransformerMixin):
    def __init__(self, train, colNames, encodedName):
        self.train = train
        self.colNames = colNames
        self.encodedName = encodedName

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        mean = self.train[[self.colNames,
                           self.encodedName]].groupby(self.colNames).mean().reset_index()
        dd = {}
        for row in tqdm(mean.itertuples(index=False)):
            dd[row[0]] = row[1]
        X[self.encodedName] = X[self.colNames]
        X[self.encodedName] = X[self.encodedName].map(dd.get)
        return X
Fit on Test Data:
test_targetc_chid = KFoldTargetEncoderTest(train_df,'chid','chid_Kfold_Target_Enc')
valid_df = test_targetc_chid.fit_transform(valid_df)

Pipeline with Custom Transformer Class does not work within a full Pipeline using Featureunion

I am preparing the data from the German credit dataset (https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)). I built a custom transformer to extract features from an attribute in the dataset, and it works on its own in a small pipeline.
The custom transformer (AddGenderStatus) adds gender and status as features.
The problem arises when I put this pipeline into a full pipeline using FeatureUnion:
KeyError: "['gender', 'status'] not in index"
import pandas as pd
from sklearn.base import TransformerMixin, BaseEstimator
from sklearn.preprocessing import OneHotEncoder
from sklearn.pipeline import FeatureUnion, Pipeline
from sklearn.model_selection import train_test_split
# %% set column names
attributes = ['checking_balance', 'months_loan_duration', 'credit_history', 'purpose',
              'amount', 'savings_balance', 'employment_duration', 'installment_rate_income',
              'status_gender', 'debtors_guarantors', 'residence_years', 'property', 'age',
              'other_installment', 'housing', 'existing_loans_count', 'job', 'dependents',
              'phone', 'class']
# %% load the data
# https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)
url ='https://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data'
credit = pd.read_csv(url, sep=' ',header=None, names=attributes, index_col=False)
# %% Split the data
X=credit.drop('class', axis=1)
y=credit['class']
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.25, random_state = 0)
# check class balance
y.value_counts()/len(y)
y_train.value_counts()/len(y_train)
y_test.value_counts()/len(y_test)
# %% Class to extract gender and status features
""" Attribute 9: (qualitative)
Personal status and sex - status_sex
A91 : male : divorced/separated
A92 : female : divorced/separated/married
A93 : male : single
A94 : male : married/widowed
A95 : (female : single - does not exist)
"""
class AddGenderStatus(TransformerMixin, BaseEstimator):
    def __init__(self, key):
        # key is the column name as str
        self.key = key

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        function_gender = lambda x: 'male' if x == 'A91' or x == 'A93' or x == 'A94' else 'female'
        function_status = lambda x: 'divorced' if x == 'A91' else ('married' if x == 'A92' or x == 'A94' else 'single')
        X_new = X.copy()
        X_new["status"] = X[self.key].map(function_status)
        X_new["gender"] = X[self.key].map(function_gender)
        X_new.drop([self.key], axis=1, inplace=True)
        return X_new

# %% Pipeline new_attribs
gender_status_attribs = Pipeline([
    ('AddGenderStatus', AddGenderStatus(key='status_gender'))
])
X_train_check = gender_status_attribs.transform(X_train)
'gender' and 'status' in list(X_train_check)  # True
# %% Create a class to select numerical or categorical columns
class ColumnExtractor(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        # key is the column name as str
        self.key = key

    def fit(self, X, y=None):
        # stateless transformer
        return self

    def transform(self, X):
        # assumes X is a DataFrame
        return X[self.key]
# %% Encoding categorical data
cat_attribs = ['checking_balance', 'credit_history', 'purpose', 'savings_balance',
               'employment_duration', 'debtors_guarantors', 'property', 'other_installment',
               'housing', 'job', 'phone', 'gender', 'status']
# %% Pipeline categorical
categorical_attribs = Pipeline([
    ('selector', ColumnExtractor(key=cat_attribs)),
    ('encoder', OneHotEncoder(drop='first', sparse=False))
])
# %% Full Pipeline
full_pipeline = FeatureUnion(transformer_list=[("gender_status_attribs", gender_status_attribs),
                                               ("categorical_attribs", categorical_attribs),
                                               ])
X_train_prepared = full_pipeline.transform(X_train)
# KeyError: "['gender', 'status'] not in index"
I see two problems in your code:
Rather than a FeatureUnion you need a Pipeline, because the second transformer expects the output of the first (if you really do want a feature union, you need a combination of FeatureUnion and a Pipeline).
full_pipeline.transform should become full_pipeline.fit_transform.
When you change those lines to:
# %% Full Pipeline
full_pipeline = Pipeline([("gender_status_attribs", gender_status_attribs),
                          ("categorical_attribs", categorical_attribs),
                          ])
X_train_prepared = full_pipeline.fit_transform(X_train)
your code will run without an error.
Edit
Should you insist on using FeatureUnion you may consider:
ppl = Pipeline([("gender_status", gender_status_attribs),
                ("categorical_attribs", categorical_attribs)])
full_pipeline = FeatureUnion([("gender_status_attribs", gender_status_attribs),
                              ("pipeline", ppl)])
X_train_prepared = full_pipeline.fit_transform(X_train)

getting different scaling in sklearn grid search

I'm trying to set up a GridSearchCV in sklearn that uses a TimeSeriesSplit with the data normalized on the training set. What I did is create a TransformerMixin called DivisorTransform that computes the divisor for the normalization and stores it. The DivisorTransform is instantiated before the Pipeline. In the pipeline I put the DivisorTransform (in order to fit it), and then a NormalizeTransformer that takes the DivisorTransform as input and performs the division. However, when this pipeline is used inside GridSearchCV, the transformers are pickled. This causes the DivisorTransform to be pickled and fitted, but the NormalizeTransformer is pickled as well, and since it holds the DivisorTransform inside itself, the DivisorTransform is pickled again. As a result, the NormalizeTransformer ends up using an un-fitted DivisorTransform.
Here is an example
dt = DivisorTransform()
pipe = Pipeline([('divisor',dt),('normalize',NormalizeTransformer(dt))])
gridS = GridSearchCV(pipe,params={...},cv=TimeSeriesSplit())
How should different normalizations be managed within a GridSearchCV? What are the best practices?
Here is a Python example:
import pandas as pd
import numpy as np
from sklearn.base import BaseEstimator
from sklearn.base import TransformerMixin
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import TimeSeriesSplit

class DivisorTransform(BaseEstimator, TransformerMixin):
    def __init__(self):
        pass

    def fit(self, X, y=None):
        print(f'{type(self).__name__} id {id(self)} fit')
        self.divisor_ = X.max()
        return self

    def transform(self, X):
        print(f'{type(self).__name__} id {id(self)} transform')
        return X

    def getDivisor(self):
        return self.divisor_

class NormalizationTransform(BaseEstimator, TransformerMixin):
    def __init__(self, divisorTransform, fakeParam):
        self.divTrns = divisorTransform
        self.fakeParam = fakeParam
        print(f'{type(self).__name__} id {id(self)} init saving {type(self.divTrns).__name__} at {id(self.divTrns)}')

    def fit(self, X, y=None):
        print(f'{type(self).__name__} id {id(self)} fit going to fit {type(self.divTrns).__name__} {id(self.divTrns)}')
        self.divisor_ = self.divTrns.fit(X).getDivisor()
        return self

    def transform(self, X):
        print(f'{type(self).__name__} id {id(self)} transform')
        res = X.copy()
        res = res / self.divisor_
        print('_______________________________________')
        print(res)
        return res

    def anti_transform(self, X):
        res = X.copy()
        res = res * self.divisor_
        return res

    def score(self, X, y=None, sample_weight=None):
        return 1

x = pd.DataFrame([[i + j * 10 for j in range(3)] for i in range(10)], columns=['A', 'B', 'C'])
dvT = DivisorTransform()
print(type(dvT).__name__)
pipe = Pipeline([('divisor', dvT), ('normalization', NormalizationTransform(dvT, 0))])
res1 = pipe.fit_transform(x)
params = {'normalization__fakeParam': [0, 1]}
gs = GridSearchCV(pipe, params, cv=TimeSeriesSplit(n_splits=3).split(x))
print('Starting Grid Search')
gs.fit(x)
This produces the following output:
Starting Grid Search
NormalizationTransform id 140321510292896 init saving NoneType at 94405154462352
NormalizationTransform id 140321722266344 init saving NoneType at 94405154462352
That shows the problem

Use attributes of preceding estimators as parameters in ML pipeline

I am using Pipelines from PySpark's ML library to preprocess text and calculate the TF-IDF values for all tokens. I also created a custom Transformer that returns, for each text snippet, the 5 tokens with the highest TF-IDF values. The main code looks like this:
%pyspark
tokenizer = RegexTokenizer(inputCol="text", outputCol="tokenized", pattern="\\W")
remover = StopWordsRemover(inputCol="tokenized", outputCol="filtered")
count_vectorizer = CountVectorizer(inputCol="filtered", outputCol="count", vocabSize=pow(2,10))
idf = IDF(inputCol="count", outputCol="TF-IDF")
normalizer = Normalizer(inputCol="TF-IDF", outputCol="normalized", p=2.0)
top_token_extractor = TopTokenExtractor(inputCol="normalized", outputCol="topTokens", vocabulary=model.stages[2].vocabulary) # !!! does not work
pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer, top_token_extractor])
model = pipeline.fit(df)
And here is the implementation of TopTokenExtractor:
%pyspark
from pyspark import keyword_only
from pyspark.ml.pipeline import Transformer
from pyspark.ml.param.shared import HasInputCol, HasOutputCol, Param
from pyspark.sql.functions import udf
from pyspark.sql.types import ArrayType, StringType

class TopTokenExtractor(Transformer, HasInputCol, HasOutputCol):
    @keyword_only
    def __init__(self, inputCol=None, outputCol=None, vocabulary=None):
        super(TopTokenExtractor, self).__init__()
        self.vocabulary = Param(self, "vocabulary", "")
        self._setDefault(vocabulary=set())
        kwargs = self._input_kwargs
        self.setParams(**kwargs)

    @keyword_only
    def setParams(self, inputCol=None, outputCol=None, vocabulary=None):
        kwargs = self._input_kwargs
        return self._set(**kwargs)

    def setVocabulary(self, value):
        self._paramMap[self.vocabulary] = value
        return self

    def getVocabulary(self):
        return self.getOrDefault(self.vocabulary)

    def _transform(self, dataset):
        out_col = self.getOutputCol()
        in_col = dataset[self.getInputCol()]
        vocabulary = self.getVocabulary()

        def f(s):
            token_tuples = sorted(list(zip(s.indices, s.values)), key=lambda x: x[1], reverse=True)
            top_tokens = list()
            for i in range(0, min(5, len(token_tuples))):
                top_tokens.append(vocabulary[token_tuples[i][0]])
            return top_tokens

        t = ArrayType(StringType())
        return dataset.withColumn(out_col, udf(f, t)(in_col))
The problem is that in order to return a list of tokens rather than indices, I need to pass the vocabulary from the CountVectorizer as a parameter to TopTokenExtractor. After calling pipeline.fit(df) the vocabulary can be accessed via model.stages[2].vocabulary, but I could not figure out how to pass it as a parameter within a single pipeline. Is this possible at all?
As a workaround, I might split up the pipeline into two parts, but I would really prefer to have a single pipeline if possible.
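For reference, a rough, untested sketch of that two-part workaround, reusing the names from the snippets above, could look like this:
%pyspark
# Sketch only: fit everything up to the normalizer first, read the fitted
# vocabulary from the CountVectorizerModel, then apply the custom transformer.
first_pipeline = Pipeline(stages=[tokenizer, remover, count_vectorizer, idf, normalizer])
first_model = first_pipeline.fit(df)
vocabulary = first_model.stages[2].vocabulary  # fitted CountVectorizerModel

top_token_extractor = TopTokenExtractor(inputCol="normalized", outputCol="topTokens",
                                        vocabulary=vocabulary)
result = top_token_extractor.transform(first_model.transform(df))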
