Getting feature names from a pipeline with TfidfVectorizer - Python

I have been trying to get the feature names out of my model for quite some time now, but I have a hard time understanding how to do it. I have tried suggestions from many posts on here but can't get it to work. Here is my code:
Loading the classes I need to combine TfidfVectorizer with other features:
from sklearn.base import TransformerMixin, BaseEstimator

class ItemSelector(BaseEstimator, TransformerMixin):
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class FeatureTypeSelector(TransformerMixin, BaseEstimator):
    FEATURE_TYPES = {
        'categorical': ['COLUMN_A', 'COLUMN_B'],
        'continuous': ['COLUMN_C', 'COLUMN_D'],
    }

    def __init__(self, feature_type):
        self.columns = self.FEATURE_TYPES[feature_type]

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

class RowToDictTransformer(TransformerMixin, BaseEstimator):
    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return (row[1] for row in X.iterrows())
Then the code that puts everything in a pipeline and runs the regressor:
from sklearn.pipeline import make_union, make_pipeline
from sklearn.feature_extraction import DictVectorizer
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import MinMaxScaler

# Create the preprocessor
preprocessor = make_union(
    make_pipeline(
        ItemSelector(key='TEXT_COLUMN'),
        TfidfVectorizer(lowercase=False, min_df=1),
    ),
    make_pipeline(
        FeatureTypeSelector('continuous'),
        MinMaxScaler(),
    ),
    make_pipeline(
        FeatureTypeSelector('categorical'),
        RowToDictTransformer(),
        DictVectorizer(sparse=False),  # set sparse=True if you get a MemoryError
    ),
)
# Fit and transform the data
preprocessor.fit_transform(x_train)

# Choose some estimator
# estimator = MultinomialNB()
estimator = LinearRegression()

# Create the model
model = make_pipeline(preprocessor, estimator)

# Train the model
model.fit(x_train, y_train)

# Predict with the model
predicted = model.predict(x_test)
I can read model.coef_ to get all the coefficients, but I want to see which weight each term from the TEXT_COLUMN receives. I have tried calling get_feature_names() and tried passing the names through the pipeline, but with no success (most of Google's results are purple by now).
Can anyone give me a bit of guidance on how to carry the feature names to the end of the pipeline? The ideal result would be a dataframe with the feature (a term from the TEXT_COLUMN) and its feature_weight as the value.
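A minimal sketch of one way to do this, assuming the fitted model from above (with make_union/make_pipeline, the step names are the lowercased class names; on scikit-learn older than 1.0, use get_feature_names() instead of get_feature_names_out()):

import pandas as pd

# Dig the fitted TfidfVectorizer out of the FeatureUnion.
union = model.named_steps['featureunion']
tfidf = union.transformer_list[0][1].named_steps['tfidfvectorizer']
tfidf_names = tfidf.get_feature_names_out()

# The tfidf pipeline is the first transformer in the union, so its
# features occupy the first block of columns and of coefficients.
coefs = model.named_steps['linearregression'].coef_
weights = pd.DataFrame({
    'feature': tfidf_names,
    'feature_weight': coefs[:len(tfidf_names)],
}).sort_values('feature_weight', ascending=False)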

Related

RFE from scikit-learn feature_selection with NegativeBinomial from statsmodels as estimator

I'm trying to use RFE from scikit-learn with a statsmodels NegativeBinomial estimator.
So I created my own class:
import numpy as np
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm

class MyEstimator(BaseEstimator):
    def __init__(self, formula_, data_, family_):
        self.model = sm.formula.glm(formula_, data=data_, family=family_)

    def fit(self, **kwargs):
        self.model.fit()
        self.coef_ = self.model.params.values

    def predict(self, X):
        result = self.model.predict(X)
        return np.array(result)

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
dataset = pd.DataFrame({'X1': X[:, 0], 'X2': X[:, 1], 'X3': X[:, 2], 'y': y})
estimator = MyEstimator("y ~ X1 + X2 + X3", dataset, sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit()
But I get this error:
TypeError: fit() missing 2 required positional arguments: 'X' and 'y'
Does anyone have an idea?
You can modify your code to take endog and exog variables directly, instead of using the formula API:
import numpy as np
import pandas as pd
from sklearn.datasets import make_friedman1
from sklearn.feature_selection import RFE
from sklearn.base import BaseEstimator
import statsmodels.api as sm

class MyEstimator(BaseEstimator):
    def __init__(self, family_):
        self.family_ = family_

    def fit(self, exog, endog):
        self.model = sm.GLM(endog, exog, family=self.family_)
        self.fit_results_ = self.model.fit()
        self.coef_ = self.fit_results_.params

    def predict(self, X):
        # predict from the fitted results (GLM.predict on the unfitted
        # model would interpret X as the params vector)
        return np.array(self.fit_results_.predict(X))

X, y = make_friedman1(n_samples=50, n_features=10, random_state=0)
estimator = MyEstimator(sm.families.NegativeBinomial())
selector = RFE(estimator, n_features_to_select=5, step=1)
selector = selector.fit(X, y.reshape(-1, 1))
print(selector.ranking_)
# [1 1 3 1 1 5 1 6 4 2]
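As a small follow-up (standard RFE attributes, not part of the original answer), the boolean mask of the kept columns is exposed as selector.support_:

kept = np.flatnonzero(selector.support_)  # indices of the 5 columns ranked 1 above
print(kept)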

AttributeError: 'Specificity' object has no attribute 'update_state_fn'

I am trying to dump a model from the MIL package, together with its metrics, to disk using pickle. The dump itself raises no issues, but when I try to load the contents back (the model and the metrics), I get an attribute error.
I have added the code snippet for the metrics class here. You can also refer to the trainer class from the author here.
Is there any fix for this?
import numpy as np
import tensorflow as tf
from tensorflow.keras.backend import epsilon
from mil import metrics

class Metric(tf.keras.metrics.Metric):
    """ Custom base class for implementing a metric;
        each Metric subclass has to implement these methods. """

    def __init__(self, name, **kwargs):
        super(Metric, self).__init__(name=name, **kwargs)

    def update_state_fn(self, y_true, y_pred, sample_weight=None):
        """ Update the state of the metric

        Parameters
        ----------
        y_true : array-like containing the ground truth
        y_pred : array-like containing the network predictions
        sample_weight : optional; weights some predictions more heavily.
        """
        raise NotImplementedError

    def result(self):
        """ Get the result of a metric """
        raise NotImplementedError

    def reset_states(self):
        """ Reset the state of the metric """
        raise NotImplementedError

class Specificity(Metric):
    def __init__(self, name='specificity', **kwargs):
        super(Specificity, self).__init__(name=name, **kwargs)
        self.specificity = self.add_weight(name='specificity', initializer='zeros')
        self.specificity.assign(np.nan)
        self.tn = metrics.TrueNegatives()
        self.fp = metrics.FalsePositives()

    def update_state_fn(self, y_true, y_pred, sample_weight=None):
        self.tn.update_state(y_true, y_pred, sample_weight)
        self.fp.update_state(y_true, y_pred, sample_weight)
        tn = self.tn.result()
        fp = self.fp.result()
        value = tf.where(tn + fp == 0, np.nan, tn / (tn + fp + epsilon()))
        self.specificity.assign(value)

    def result(self):
        return self.specificity

    def reset_states(self):
        self.specificity.assign(np.nan)
        self.tn.reset_states()
        self.fp.reset_states()
I have added the entire code that calls this (from GitHub):
# importing dataset
from mil.data.datasets import musk1
# importing bag representation
from mil.bag_representation import MILESMapping
# importing validation strategy
from mil.validators import LeaveOneOut
# importing the final model, which in this case is the SVC classifier from sklearn
from mil.models import SVC
# importing trainer
from mil.trainer import Trainer
# importing preprocessing
from mil.preprocessing import StandarizerBagsList
# importing metrics, which in this case are from tf.keras metrics
from mil.metrics import AUC

# loading dataset
(bags_train, y_train), (bags_test, y_test) = musk1.load()

# instantiate trainer
trainer = Trainer()

# preparing trainer
metrics = ['spec']
model = SVC(kernel='linear', C=1, class_weight='balanced')
pipeline = [('scale', StandarizerBagsList()), ('disc_mapping', MILESMapping())]
trainer.prepare(model, preprocess_pipeline=pipeline, metrics=metrics)

# fitting trainer
valid = LeaveOneOut()
history = trainer.fit(bags_train, y_train, sample_weights='balanced',
                      validation_strategy=valid, verbose=1)

# printing validation results for each fold
print(history['metrics_val'])

# predicting metrics for the test set
trainer.predict_metrics(bags_test, y_test)

import pickle

with open('model_pkl.pkl', 'wb') as files:
    pickle.dump(trainer, files)

with open('/home/mylaptop/Res/model_pkl.pkl', 'rb') as sample_test:
    abc = pickle.load(sample_test)
Stack trace:
AttributeError                            Traceback (most recent call last)
<ipython-input-84-9214b6d16ff4> in <module>
      1 with open('/home/mylaptop/Res/model_pkl.pkl','rb') as sample_test:
----> 2     abc = pickle.load(sample_test)

AttributeError: 'Specificity' object has no attribute 'update_state_fn'

Combine CountVectorizer and SelectKBest causes labels to disappear

I have a class that creates a feature extraction pipeline and fits a logistic regression model. The input is a set of string data in a DF structure. The ItemSelector class just returns the column that has the clean data from the original data frame, then passes it on to CountVectorizer and the KBest selector. If I remove KBest, this pipeline works:
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.feature_selection import SelectKBest, chi2
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline, FeatureUnion

class ItemSelector(BaseEstimator, TransformerMixin):
    # returns a single column from a DF
    def __init__(self, key):
        self.key = key

    def fit(self, x, y=None):
        return self

    def transform(self, data_dict):
        return data_dict[self.key]

class LogisticRegressionWithWordFeatures(object):
    def __init__(self):
        self.model = LogisticRegression()

    def fit(self, df, labels):
        self.pipeline = self.get_preprocessing_pipeline(df)
        fitted_df = self.pipeline.fit_transform(df)
        self.model.fit(fitted_df, labels)
        return self

    def predict(self, df):
        fitted_df = self.pipeline.transform(df)
        y = self.model.predict(fitted_df)
        return y

    def get_preprocessing_pipeline(self, data_frame):
        """
        Get data frame containing features and labels from raw feature input DF.
        :param input_file: input DF
        """
        process_and_join_features = Pipeline([
            ('features', FeatureUnion([
                ('count_lemma_features', Pipeline([
                    ('selector', ItemSelector(key='clean_Invoice_Description')),
                    ('counts', CountVectorizer(analyzer="word", stop_words='english'))]))])),
            ('reducer', SelectKBest(chi2, k=1000))
        ])
        return process_and_join_features
If I try to fit/transform based on this pipeline I get this error:
model = LogisticRegressionWithWordFeatures()
model.fit(train_data, train_labels)
test_y = model.predict(test_data)
>>>
TypeError                                 Traceback (most recent call last)
<ipython-input-183-536a1c9c0a09> in <module>
      1 b_logistic_regression_with_hypers_bow_clean = LogisticRegressionWithWordFeatures()
----> 2 b_logistic_regression_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.fit(b_ebay_train_data, b_ebay_train_labels)
      3 b_ebay_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(b_ebay_test_data)
      4 b_gold_y_with_hypers_bow_clean = b_logistic_regression_with_hypers_bow_clean.predict(gold_df)

<ipython-input-181-6974b6ea2a5b> in fit(self, df, labels)
      6     def fit(self, df, labels):
      7         self.pipeline = self.get_preprocessing_pipeline(df)
----> 8         fitted_df = self.pipeline.fit_transform(df)
      9         self.model.fit(fitted_df, labels)
     10         return self

~/anaconda3/lib/python3.7/site-packages/sklearn/pipeline.py in fit_transform(self, X, y, **fit_params)
    391             return Xt
    392         if hasattr(last_step, 'fit_transform'):
--> 393             return last_step.fit_transform(Xt, y, **fit_params)
    394         else:
    395             return last_step.fit(Xt, y, **fit_params).transform(Xt)

~/anaconda3/lib/python3.7/site-packages/sklearn/base.py in fit_transform(self, X, y, **fit_params)
    551         if y is None:
    552             # fit method of arity 1 (unsupervised transformation)
--> 553             return self.fit(X, **fit_params).transform(X)
    554         else:
    555             # fit method of arity 2 (supervised transformation)

TypeError: fit() missing 1 required positional argument: 'y'
Clearly the issue is that the training labels aren't making their way into the pipeline. I tried adding another ItemSelector for the training labels:
process_and_join_features = Pipeline([
    ('features', FeatureUnion([
        ('count_lemma_features', Pipeline([
            ('selector', ItemSelector(key='clean_Invoice_Description')),
            ('counts', CountVectorizer(analyzer="word", stop_words='english'))])),
        ('labels', ItemSelector(key='Expense_Category'))])),
    ('reducer', SelectKBest(chi2, k=1000))
])
return process_and_join_features
But this causes a key error for the label (Expense_Category), even though that column is there in the training data.
If I do it step by step, this works:
item_selector = ItemSelector(key='clean_Invoice_Description').fit(train_data)
count_selector = CountVectorizer(analyzer="word", stop_words='english')
k_best = SelectKBest(chi2, k=1000)
invoice_desc = item_selector.transform(train_data)
invoice_desc = count_selector.fit_transform(invoice_desc)
reduced_desc = k_best.fit_transform(invoice_desc, train_labels)
print(reduced_desc.shape)
>>> (6130, 1000)
The problem with doing it step by step is that there are other features in other columns that I would like to use along with these, and a pipeline provides a nice way of doing so without having to combine them manually.
Solved it. The main issue was the nesting of each feature. Pipeline() expects a list of tuples, where the first item in the tuple is the feature/pipe name and the second is the actual class. It is very easy to lose track of the nesting as you add more features. Here is the final code:
def get_preprocessing_pipeline(self, data_frame):
    """
    Get data frame containing features and labels from raw feature input csv file.
    """
    process_and_join_features = Pipeline([
        ('features',
         FeatureUnion([
             ('tokens',
              Pipeline([
                  ('selector', ItemSelector(key='clean_Invoice_Description')),
                  ('vec', CountVectorizer(analyzer="word", stop_words='english')),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ])),
             ('hypernyms',
              Pipeline([
                  ('selector', ItemSelector(key='hypernyms_combined')),
                  ('vec', TfidfVectorizer(analyzer="word")),
                  ('dim_red', SelectKBest(chi2, k=5000))
              ]))]))])
    return process_and_join_features
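One caveat worth adding (standard sklearn behaviour, not part of the original answer): SelectKBest is a supervised step, so the labels still have to reach it at fit time. Pipeline.fit_transform forwards y to every step, so the fit method from earlier should pass the labels through:

fitted_df = self.pipeline.fit_transform(df, labels)  # labels reach chi2 inside SelectKBest
self.model.fit(fitted_df, labels)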

How to use leave-one-out encoding in sklearn pipelines

I would like to test different encoding strategies, as implemented in the categorical encoding package, using sklearn pipelines.
I mean something like this:
num_attribs = list(housing_num)
cat_attribs = ["ocean_proximity"]

num_pipeline = Pipeline([
    ('selector', DataFrameSelector(num_attribs)),
    ('imputer', Imputer(strategy="median")),
    ('std_scaler', StandardScaler()),
])

cat_pipeline = Pipeline([
    ('selector', DataFrameSelector(cat_attribs)),
    ('cat_encoder', LeaveOneOutEncoder()),
])

from sklearn.pipeline import FeatureUnion

full_pipeline = FeatureUnion(transformer_list=[
    ("num_pipeline", num_pipeline),
    ("cat_pipeline", cat_pipeline),
])

housing_prepared = full_pipeline.fit_transform(housing)
housing_prepared
But I get an error:
TypeError: fit() missing 1 required positional argument: 'y'
Can anyone suggest a solution?
Let me show just part of the code as I do it. I added XGBRegressor because I assume you want to predict housing prices.
class MultiColumn(BaseEstimator, TransformerMixin):
    def __init__(self, columns=None):
        self.columns = columns  # array of column names to encode

    def fit(self, X, y=None):
        return self

    def transform(self, X):
        return X[self.columns]

NUMERIC = df[['var1', 'var2']]
CATEGORICAL = df[['var3', 'var4']]

class Imputation(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        return X.fillna(NUMERIC.median())

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self

class Cat(BaseEstimator, TransformerMixin):
    def transform(self, X, y=None, **fit_params):
        enc = DictVectorizer(sparse=False)
        encc = enc.fit(CATEGORICAL.T.to_dict().values())
        enc_data = encc.transform(X.T.to_dict().values())
        enc_data[np.isnan(enc_data)] = 1
        return enc_data

    def fit_transform(self, X, y=None, **fit_params):
        self.fit(X, y, **fit_params)
        return self.transform(X)

    def fit(self, X, y=None, **fit_params):
        return self
And the pipeline:
pipeline = Pipeline([
    # Use FeatureUnion to combine the features
    ('union', FeatureUnion(
        transformer_list=[
            # numeric
            ('numeric', Pipeline([
                ('selector', MultiColumn(columns=['var1', 'var2'])),
                ('imp', Imputation()),
                ('scaling', preprocessing.StandardScaler(with_mean=0.))
            ])),
            # categorical
            ('categorical', Pipeline([
                ('selector', MultiColumn(columns=['var3', 'var4'])),
                ('one_hot', Cat()),
                ('cat_imputer', CategoricalImputer())  # each step needs a (name, transformer) tuple
            ])),
        ])),
    ('model_fitting', xgb.XGBRegressor()),
])
Your categorical encoder (LeaveOneOutEncoder) needs the target variable to adjust and replace the new labels (levels) for the variables defined in cat_attribs. So you just need to invoke the fit_transform method together with y_train:
housing_prepared = full_pipeline.fit_transform(housing, y_train)
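A minimal sketch of how this extends when the union feeds an estimator (my addition; Pipeline.fit forwards y to every step's fit, so the encoder receives the target automatically):

from sklearn.pipeline import Pipeline
from sklearn.linear_model import LinearRegression

model = Pipeline([
    ('preprocess', full_pipeline),  # the FeatureUnion from the question
    ('reg', LinearRegression()),
])
model.fit(housing, y_train)  # y_train also reaches LeaveOneOutEncoder.fit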

Pipeline Sklearn (missing basic) Predictions Error

Need some assistance; I am kind of stuck on the concept of implementing pipelines using sklearn. The dataset is the KC Housing Dataset from Kaggle. I am trying to build a simple linear regression using pipelines. However, I am missing something quite basic about the concept, as I am unable to get past the error pasted at the bottom of this post. Please advise; it's really appreciated. This is the complete code; feel free to mend it where necessary.
ERROR:

Traceback (most recent call last):
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 123, in <module>
    main()
  File "/media/JBook/Software/PythonProjects/KCH/OneFi.py", line 118, in main
    predictions_some_data = lin_reg.predict(some_data_prepared)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 256, in predict
    return self._decision_function(X)
  File "/usr/local/lib/python3.5/dist-packages/sklearn/linear_model/base.py", line 241, in _decision_function
    dense_output=True) + self.intercept_
  File "/usr/local/lib/python3.5/dist-packages/sklearn/utils/extmath.py", line 135, in safe_sparse_dot
    ret = a * b
  File "/usr/local/lib/python3.5/dist-packages/scipy/sparse/base.py", line 387, in __mul__
    raise ValueError('dimension mismatch')
ValueError: dimension mismatch
PS: The problem I am facing is almost at the end of this code:
predictions_some_data = lin_reg.predict(some_data_prepared)
import pandas as pd
import os, sys
import numpy as np
import matplotlib.pyplot as plt
import data_visualize
from sklearn.model_selection import StratifiedShuffleSplit
import dataPrep
from sklearn.model_selection import train_test_split
from pandas.plotting import scatter_matrix
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import Imputer
from sklearn.preprocessing import LabelBinarizer
from sklearn.pipeline import Pipeline
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.pipeline import FeatureUnion

## Loading the data
KC_housing_path = "/media/JBook/Software/PythonProjects/KCH/datasets"

def load_housing_data(housing_path=KC_housing_path):
    '''if not os.path.isfile("datasets/kc_house_data.csv"):
           print("Check file location, program exiting..")
       else:
           csv_path = os.path.join(housing_path, "kc_house_data.csv")
           print("reading csv file ...")
           return pd.read_csv(csv_path)'''
    try:
        csv_path = os.path.join(housing_path, "kc_house_data.csv")
        print("reading csv file -->")
        return pd.read_csv(csv_path)
    except FileNotFoundError:
        print("Check file location, program exiting ...")
        sys.exit()

### Defining 2 classes for custom transformers
class DataFrameSelector(BaseEstimator, TransformerMixin):
    def __init__(self, attributeNames):
        self.attributes = attributeNames
        # print('\n In constructor', self.attributes)

    def fit(self, X, y=None):
        # print("__DF Fit Method:\n", (X[self.attributes].values).shape)
        return self

    def transform(self, X):
        # print("__Transform Method:\n", (X[self.attributes].values).shape)
        return X[self.attributes].values

class LabelBinarizerPipelineFriendly(LabelBinarizer):
    def fit(self, X, y=None):
        """this would allow us to fit the model based on the X input."""
        # print("LB-->X.shape", X.shape)
        super(LabelBinarizerPipelineFriendly, self).fit(X)

    def transform(self, X, y=None):
        # print("LB-Transform-X.shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).transform(X)

    def fit_transform(self, X, y=None):
        # print("LB-FIT_TRANSFORM-X.Shape", X.shape)
        return super(LabelBinarizerPipelineFriendly, self).fit(X).transform(X)

def main():
    # Loading house data
    housing = load_housing_data()
    housing_labels = housing['price']

    # Removing unneeded features & the label (price)
    rem_attributes = ['id', 'date', 'price']
    housing_col_removed = housing.drop(rem_attributes, axis=1, inplace=False)

    ### Splitting the data
    train_set, test_set = train_test_split(housing_col_removed, test_size=0.3, random_state=42)

    #### Pipeline for numeric & categorical attribute transformations:
    #### adding the median to missing values & one-hot encoding categorical attributes
    data_numeric = housing_col_removed.drop('ocean_proximity', axis=1, inplace=False)
    numeric_attrib = list(data_numeric)
    cat_attrib = ['ocean_proximity']

    num_pipeline = Pipeline([
        ('selector', DataFrameSelector(numeric_attrib)),
        ('imputing', Imputer(missing_values=0, strategy='median')),
    ])
    cat_pipeline = Pipeline([
        ('selector', DataFrameSelector(cat_attrib)),
        ('label_Bin', LabelBinarizerPipelineFriendly(sparse_output=True)),
    ])
    full_pipeline = FeatureUnion(transformer_list=[
        ('num_pipeline', num_pipeline),
        ('cat_pipeline', cat_pipeline),
    ])

    #### Fitting the linear regression model
    # print('This housing data is passed to prepare\n', housing_col_removed.head())
    housing_prepared = dataPrep.prepData(housing_col_removed)
    lin_reg = LinearRegression()
    lin_reg.fit(housing_prepared, housing_labels)
    print('Housing Prepared Shape: \n', housing_prepared.shape)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n',
          lin_reg.predict(housing_prepared))

    ### Below section tries to use some data (5 rows) from the whole data set to predict values
    some_data = housing_col_removed[:5]
    some_labels = housing_labels.iloc[:5]
    some_data_prepared = full_pipeline.fit_transform(some_data)
    print('\t\t\tSome Data Prepared is\n', some_data_prepared)
    predictions_some_data = lin_reg.predict(some_data_prepared)
    print('\t\t\t\t************* Predictions from Linear Regression Are ***********\n',
          predictions_some_data)
    # print('\t\t\t\t************* Labels Are ***********\n', list(some_labels))

main()
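No answer was posted here, but a likely culprit (my reading, not a confirmed fix): some_data_prepared comes from fit_transform on only 5 rows, so the LabelBinarizer re-learns a smaller set of categories and produces fewer one-hot columns than the matrix the regressor was trained on, hence the dimension mismatch. A sketch of the usual remedy, which also uses the pipeline for the training data instead of dataPrep.prepData so both matrices share the same columns:

# Fit the full pipeline once on all the data, then reuse it without refitting.
housing_prepared = full_pipeline.fit_transform(housing_col_removed)
lin_reg = LinearRegression()
lin_reg.fit(housing_prepared, housing_labels)

some_data = housing_col_removed[:5]
some_data_prepared = full_pipeline.transform(some_data)  # transform only, no refit
predictions_some_data = lin_reg.predict(some_data_prepared)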
