How can I create a custom Pipeline in Python? I tried with the sklearn Pipeline, but it does not seem to run successfully. Essentially, I need my preprocessing as a custom pipeline step combined with a logistic regression model.
import pandas as pd
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder
from sklearn import metrics
import matplotlib.pyplot as plt
import seaborn as sns
%matplotlib inline
import eli5
from eli5.sklearn import PermutationImportance
from sklearn.pipeline import Pipeline, make_pipeline, FeatureUnion
from sklearn.preprocessing import FunctionTransformer
from sklearn.compose import ColumnTransformer
path = 'C:/Users/Desktop/'
df = pd.read_excel (path + "df.xlsx", sheet_name='df')
# import the BaseEstimator
from sklearn.base import BaseEstimator
# define the class OutletTypeEncoder
# custom transformer must have methods fit and transform
class OutletTypeEncoder(BaseEstimator):
    def __init__(self):
        pass

    def fit(self, documents, y=None):
        return self

    def transform(self, df):
        # replace NaN values
        df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']] = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present']].fillna(value=-999)
        df[['pdf_tbl_cnt']] = df[['pdf_tbl_cnt']].fillna(value=0)
        # map the count columns to 1/0
        df['pdf_tbl_cnt'] = np.where((df['pdf_tbl_cnt'] == '1'), 1, 0)
        df['part_cnt'] = np.where((df['part_cnt'] == '1'), 1, 0)
        # split into categorical and numeric columns
        obj_df = df[['pdf_tbl_pn_identifier', 'pdf_tbl_qty_identifier', 'pdf_header_present',
                     'pdf_body_pn_identifier', 'pdf_body_qty_identifier', 'pdf_model_rel_returned', 'pdf_model_ent_returned']]
        num_df = df[['pdf_tbl_cnt', 'pdf_model_avg_relationship_score', 'pdf_model_avg_entity_score', 'part_cnt', 'matching']]
        # label-encode the categorical columns, then recombine
        obj_df = obj_df.apply(LabelEncoder().fit_transform)
        df = pd.concat([obj_df, num_df], axis=1)
        df.reset_index(inplace=True, drop=True)
        df.pdf_tbl_pn_identifier = df.pdf_tbl_pn_identifier.astype(str)
        df.pdf_tbl_qty_identifier = df.pdf_tbl_qty_identifier.astype(str)
        df.pdf_body_pn_identifier = df.pdf_body_pn_identifier.astype(str)
        df.pdf_body_qty_identifier = df.pdf_body_qty_identifier.astype(str)
        df.pdf_model_rel_returned = df.pdf_model_rel_returned.astype(str)
        df.pdf_model_ent_returned = df.pdf_model_ent_returned.astype(str)
        df.pdf_header_present = df.pdf_header_present.astype(str)
        df.matching = df.matching.astype(str)
        #df['pdf_tbl_cnt'] = df['pdf_tbl_cnt'].apply(np.int64)
        df.pdf_tbl_cnt = df.pdf_tbl_cnt.apply(np.int64)
        return df
feature_cols = df.drop(['matching'], axis=1)
X = feature_cols # Features
y = df.matching # Target variable
# split into train test sets
X_train,X_test,y_train,y_test=train_test_split(X,y,test_size=0.25,random_state=0)
logreg = LogisticRegression()
model_pipeline = Pipeline(steps=[('preprocess', OutletTypeEncoder()),
                                 ('logreg', LogisticRegression())])
# fit the pipeline with the training data
model_pipeline.fit(X_train,y_train)
I am getting the error below. Please help me out.
UnboundLocalError: local variable 'df' referenced before assignment
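For reference, here is a minimal, self-contained sketch of the pattern being attempted: a custom fit/transform transformer chained with LogisticRegression in a Pipeline. The column names and toy data below are made up for illustration and are not taken from the original df.xlsx.
import numpy as np
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline

class SimplePreprocessor(BaseEstimator, TransformerMixin):
    # toy custom transformer: fills NaNs and casts a count column to int
    def fit(self, X, y=None):
        # nothing is learned from the data in this toy example
        return self

    def transform(self, X):
        X = X.copy()  # avoid mutating the caller's DataFrame
        X['count_col'] = X['count_col'].fillna(0).astype(int)
        X['flag_col'] = X['flag_col'].fillna(-999)
        return X

# hypothetical toy data
X = pd.DataFrame({'count_col': [1, np.nan, 3, 2],
                  'flag_col': [1.0, np.nan, 0.0, 1.0]})
y = pd.Series([0, 1, 0, 1])

pipe = Pipeline([('preprocess', SimplePreprocessor()),
                 ('logreg', LogisticRegression())])
pipe.fit(X, y)
print(pipe.predict(X))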
I'm trying to fit a dataframe with a scikit-learn DecisionTree using the following code, but I get the error Length of feature_names, 9 does not match number of features, 8. The DecisionTree seems to have been fitted only on the categorical features after one-hot encoding, not the numerical feature. How can I include the numerical feature in the DecisionTree model?
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer, make_column_transformer
from sklearn.pipeline import make_pipeline, Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import StandardScaler, OneHotEncoder
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
from sklearn import tree
from matplotlib import pyplot as plt
import graphviz
df = pd.DataFrame({'brand': ['aaaa', 'asdfasdf', 'sadfds', 'NaN'],
                   'category': ['asdf', 'asfa', 'asdfas', 'as'],
                   'num1': [1, 1, 0, 0],
                   'target': [1, 0, 0, 1]})
df
dtarget=df['target']
dfeatures=df.drop('target', axis=1)
num = dfeatures.select_dtypes(include=["int64"]).columns.tolist()
cat = dfeatures.select_dtypes(include=["object"]).columns.tolist()
transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat),
    ]
)
clf= DecisionTreeClassifier(criterion="entropy", max_depth = 5)
pipe = Pipeline(steps=[
    ('onehotenc', transformer),
    ('decisiontree', clf)
])
#Fit the training data to the pipeline
pipe.fit(dfeatures, dtarget)
pipe.named_steps['onehotenc'].get_feature_names_out().tolist()
dot_data = tree.export_graphviz(clf,
                                out_file=None,
                                feature_names=num + pipe.named_steps['onehotenc'].get_feature_names_out().tolist(),
                                class_names=['1', '0'],
                                filled=True)
The numeric feature isn't in your transformer. Since you don't want to make any changes to it, let it pass through. You can either explicitly define passthrough columns or use the remainder option; remainder='passthrough' is fine if you know that's the only other column that could ever be sent to the model.
transformer = ColumnTransformer(
    transformers=[
        ("cat", OneHotEncoder(), cat),
    ],
    remainder='passthrough'
)
With this, you will see that your feature names include the num1 column:
pipe.named_steps['onehotenc'].get_feature_names_out().tolist()
Output
['cat__brand_NaN',
'cat__brand_aaaa',
'cat__brand_asdfasdf',
'cat__brand_sadfds',
'cat__category_as',
'cat__category_asdf',
'cat__category_asdfas',
'cat__category_asfa',
'remainder__num1']
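With remainder='passthrough' in place, the transformer's feature names already cover every column the tree sees, so the export_graphviz call from the question can drop the num + ... concatenation and use them directly. A sketch reusing the variables defined above:
feature_names = pipe.named_steps['onehotenc'].get_feature_names_out().tolist()
dot_data = tree.export_graphviz(pipe.named_steps['decisiontree'],
                                out_file=None,
                                feature_names=feature_names,  # 9 names for 9 features
                                class_names=['1', '0'],
                                filled=True)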
I'm trying to use SequentialFeatureSelector, and for the estimator parameter I'm passing it a pipeline that includes a step that imputes the missing values:
model = Pipeline(steps=[('preprocessing',
ColumnTransformer(transformers=[('pipeline-1',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value=-1,
strategy='constant')),
('preprocessing',
StandardScaler())]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300013d0>),
('pipeline-2',
Pipeline(steps=[('imputing',
SimpleImputer(fill_value='missing',
strategy='constant')),
('encoding',
OrdinalEncoder(handle_unknown='ignore'))]),
<sklearn.compose._column_transformer.make_column_selector object at 0x1300015b0>)])),
('model',
LGBMClassifier(class_weight='balanced', random_state=1,
reg_lambda=0.1))])
Nonetheless, when passing this to the selector it raises an error, which does not make sense since I have already fit and evaluated my model and it runs fine:
fselector = SequentialFeatureSelector(estimator = model, scoring= "roc_auc", cv = 3, n_jobs= -1, ).fit(X, target)
_assert_all_finite(X, allow_nan, msg_dtype)
101 not allow_nan and not np.isfinite(X).all()):
102 type_err = 'infinity' if allow_nan else 'NaN, infinity'
--> 103 raise ValueError(
104 msg_err.format
105 (type_err,
ValueError: Input contains NaN, infinity or a value too large for dtype('float64').
EDIT:
Reproducible example:
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer

X, y = load_iris(return_X_y=True)
X[:10, 0] = np.NaN
clf = Pipeline([("preprocessing", SimpleImputer(missing_values=np.NaN)),
                ("model", LogisticRegression(random_state=1))])
SequentialFeatureSelector(estimator=clf,
                          scoring="accuracy",
                          cv=3).fit(X, y)
It shows the same error, even though clf can be fit without problems.
scikit-learn's documentation does not state that SequentialFeatureSelector works with pipeline objects; it only states that the class accepts an unfitted estimator. In view of this, you could remove the classifier from your pipeline, preprocess X, and then pass it along with an unfitted classifier for feature selection, as shown in the example below.
import numpy as np
from sklearn.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
from sklearn.preprocessing import MaxAbsScaler
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
pipe = Pipeline([("preprocessing", SimpleImputer(missing_values=np.NaN)),
                 ('scaler', MaxAbsScaler())])

# Preprocess your data
X = pipe.fit_transform(X)

# Run the SequentialFeatureSelector
sfs = SequentialFeatureSelector(estimator=LogisticRegression(),
                                scoring="accuracy",
                                cv=3).fit(X, y)
# Check which features are important and transform X
sfs.get_support()
X = sfs.transform(X)
You can use SequentialFeatureSelector from the mlxtend package:
https://rasbt.github.io/mlxtend/
from mlxtend.feature_selection import SequentialFeatureSelector
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_iris
from sklearn.pipeline import Pipeline
from sklearn.impute import SimpleImputer
import numpy as np
X, y = load_iris(return_X_y = True)
X[:10,0] = np.NaN
clf = Pipeline([
    ("preprocessing", SimpleImputer(missing_values=np.NaN)),
    ("model", LogisticRegression(random_state=1))
])

sfs = SequentialFeatureSelector(estimator=clf,
                                forward=True,
                                k_features='best',
                                scoring="accuracy",
                                cv=3, n_jobs=-1).fit(X, y)
sfs.k_feature_idx_
>>> (0, 1, 2, 3)
I wanted to test different scalers in my pipeline, so I created a class that takes as a parameter the scaler I want (StandardScaler, MinMaxScaler, etc.).
It works fine, but I want to specify in my param_grid that the pipeline should also be tested without any scaler. I added 'passthrough' to my pipeline estimators, but it doesn't work and I get the following error:
AttributeError: 'str' object has no attribute 'set_params'
I know it might have something to do with the construction of my class, but I can't figure out how to fix it.
Here is the code you can run to reproduce it.
# import dependencies
from sklearn.tree import DecisionTreeClassifier
from sklearn.pipeline import Pipeline
from sklearn.compose import ColumnTransformer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import GridSearchCV
from sklearn.preprocessing import StandardScaler, MinMaxScaler, RobustScaler
from sklearn.datasets import load_breast_cancer
from sklearn.base import BaseEstimator, TransformerMixin
import pandas as pd
class ScalerSelector(BaseEstimator, TransformerMixin):
    def __init__(self, scaler=StandardScaler()):
        super().__init__()
        self.scaler = scaler

    def fit(self, X, y=None):
        return self.scaler.fit(X)

    def transform(self, X, y=None):
        return self.scaler.transform(X)
data = load_breast_cancer()
features = data["data"]
target = data["target"]
data = pd.DataFrame(data['data'], columns=data['feature_names'])
col_names = data.columns.tolist()
# scaler and encoder options
my_scaler = ScalerSelector()
preprocessor = ColumnTransformer(transformers=[('numerical', my_scaler, col_names)])

# combine the preprocessor with LogisticRegression() using Pipeline
full_pipeline = Pipeline(steps=[('preprocessor', preprocessor),
                                ('log_reg', LogisticRegression())])
# set params combination I want to try
scaler_options = {'preprocessor': ['passthrough'],
                  'preprocessor__numerical__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()]}
# : ['passthrough', ScalerSelector()]
# initialize GridSearchCV using full_pipeline as final estimator
grid_cv = GridSearchCV(full_pipeline, param_grid = scaler_options)
# fit the data
grid_cv.fit(data, target)
# best params :
grid_cv.best_params_
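A guess at what is going on, not taken from the original post: GridSearchCV expands a single param_grid dict as a Cartesian product, so it also builds candidates where preprocessor is the string 'passthrough' and then tries to set preprocessor__numerical__scaler on it, which raises the 'str' object has no attribute 'set_params' error. Splitting the grid into a list of dicts keeps the incompatible settings apart; a sketch reusing the names defined above:
scaler_options = [
    # candidate 1: skip the preprocessor step entirely
    {'preprocessor': ['passthrough']},
    # candidates 2-4: keep the preprocessor and vary the scaler
    {'preprocessor__numerical__scaler': [StandardScaler(), RobustScaler(), MinMaxScaler()]},
]
grid_cv = GridSearchCV(full_pipeline, param_grid=scaler_options)
grid_cv.fit(data, target)
grid_cv.best_params_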
I'm using the scikit-learn pipeline to preprocess a dataset. I have a dataset with four variables: ['monetary', 'frequency1', 'frequency2', 'recency'], and I want to preprocess all of them except recency. For the preprocessing, I first want to take the log and then standardize. However, when I get the transformed data from the pipeline, I get 7 columns (3 log-transformed, 3 standardized, recency). Is there a way to chain the transformations so that I can take the log, standardize the result, and end up with a 4-feature dataset?
def create_pipeline(df):
    all_but_recency = ['monetary', 'frequency1', 'frequency2']

    # Preprocess
    preprocessor = ColumnTransformer(
        transformers=[
            ('log', FunctionTransformer(np.log), all_but_recency),
            ('standardize', preprocessing.StandardScaler(), all_but_recency)],
        remainder='passthrough')

    # Pipeline
    estimators = [('preprocess', preprocessor)]
    pipe = Pipeline(steps=estimators)
    print(pipe.set_params().fit_transform(df).shape)
Thanks in advance
You have to apply the two transformers sequentially, one ColumnTransformer after the other. Try this:
def create_pipeline(df):
    all_but_recency = ['monetary', 'frequency1', 'frequency2']

    # Preprocess
    preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_recency)], remainder='passthrough')
    # preprocessor1 outputs a NumPy array (no column names), so the second transformer selects by position
    preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), [0, 1, 2])], remainder='passthrough')

    # Pipeline
    estimators = [('preprocess1', preprocessor1), ('standardize', preprocessor2)]
    pipe = Pipeline(steps=estimators)
    print(pipe.set_params().fit_transform(df).shape)
working example
from sklearn.datasets import load_iris
import pandas as pd
import numpy as np
from sklearn.compose import ColumnTransformer
from sklearn.preprocessing import Normalizer
from sklearn.preprocessing import FunctionTransformer
from sklearn.pipeline import Pipeline
from sklearn import preprocessing
iris = load_iris()
X, y = iris.data, iris.target
df = pd.DataFrame(X, columns=iris.feature_names)
all_but_one = [0, 1, 2]

# Preprocess
preprocessor1 = ColumnTransformer([('log', FunctionTransformer(np.log), all_but_one)], remainder='passthrough')
preprocessor2 = ColumnTransformer([('standardize', preprocessing.StandardScaler(), all_but_one)], remainder='passthrough')

# Pipeline
estimators = [('preprocess1', preprocessor1), ('scaling', preprocessor2)]
pipe = Pipeline(steps=estimators)
pipe.fit_transform(df)
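An alternative worth noting, not part of the original answer: the two steps can also be chained inside a single ColumnTransformer by nesting a Pipeline, which likewise yields four output columns on the iris example above:
log_then_scale = Pipeline([('log', FunctionTransformer(np.log)),
                           ('standardize', preprocessing.StandardScaler())])
preprocessor = ColumnTransformer([('log_scale', log_then_scale, all_but_one)],
                                 remainder='passthrough')
preprocessor.fit_transform(df).shape  # (150, 4)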
I would like to use GridSearchCV and pipelines in sklearn not only to select the best hyper-parameters for the chosen classifier but also to select the best categorical encoding strategy.
Considering the Titanic dataset (https://www.kaggle.com/c/titanic) and using sklearn-pandas, I can define some DataFrameMappers to select and encode some features, then cross-validate a RandomForestClassifier() to search for its best hyper-parameters.
Consider the following code:
from __future__ import division
import csv as csv
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
from category_encoders import BinaryEncoder, LeaveOneOutEncoder
from sklearn_pandas import DataFrameMapper
df_train = pd.read_csv('train.csv', header = 0, index_col = 'PassengerId')
df_test = pd.read_csv('test.csv', header = 0, index_col = 'PassengerId')
df = pd.concat([df_train, df_test], keys=["train", "test"])
df['Title'] = df['Name'].apply(lambda c: c[c.index(',') + 2 : c.index('.')])
df['LastName'] = df['Name'].apply(lambda n: n[0:n.index(',')])
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1
df.loc[df['Embarked'].isnull(), 'Embarked'] = df['Embarked'].mode()[0]
df.loc[df['Fare'].isnull(), 'Fare'] = df['Fare'].mode()[0]
df['FamilyID'] = df['LastName'] + ':' + df['FamilySize'].apply(str)
df.loc[df['FamilySize'] <= 2, 'FamilyID'] = 'Small_Family'
df['AgeOriginallyNaN'] = df['Age'].isnull().astype(int)
medians_by_title = pd.DataFrame(df.groupby('Title')['Age'].median()).rename(columns = {'Age': 'AgeFilledMedianByTitle'})
df = df.merge(medians_by_title, left_on = 'Title', right_index = True).sort_index(level = 0).sort_index(level = 1)
df_train = df.loc['train']
df_test = df.loc['test']
y_train = df_train['Survived']
X_train = df_train[df_train.columns.drop('Survived')]
mapper1 = DataFrameMapper([
    ('Embarked', BinaryEncoder()),
    (['AgeFilledMedianByTitle'], StandardScaler()),
    ('Pclass', LeaveOneOutEncoder())
])

mapper2 = DataFrameMapper([
    ('Embarked', LeaveOneOutEncoder()),
    (['AgeFilledMedianByTitle'], StandardScaler()),
    ('Pclass', LeaveOneOutEncoder())
])
pipe = Pipeline([('featurize', mapper1),
                 ('forest', RandomForestClassifier(n_estimators=10))])

param_grid = dict(forest__n_estimators=[2, 16, 32, 64],
                  forest__criterion=['gini', 'entropy'])
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy')
best_pipeline = grid_search.fit(X_train, y_train).best_estimator_
best_pipeline.get_params()['forest']
grid_search.best_score_
Is it possible to use the Pipeline in GridSearchCV to select the best mapper (mapper1 or mapper2)? How?
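A sketch of one possible approach, offered as a suggestion rather than a verified solution: scikit-learn treats a whole pipeline step as just another parameter, so the two mappers can be listed under the step name ('featurize') in param_grid and GridSearchCV will cross-validate every combination of mapper and forest settings:
param_grid = dict(featurize=[mapper1, mapper2],
                  forest__n_estimators=[2, 16, 32, 64],
                  forest__criterion=['gini', 'entropy'])
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy')
grid_search.fit(X_train, y_train)
grid_search.best_params_['featurize']  # the better-performing mapper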