I would like to use GridSearchCV and pipelines in sklearn not only to select the best hyper-parameters for the chosen classifier but also to select the best categorical encoding strategy.
Considering the Titanic dataset ([https://www.kaggle.com/c/titanic][1]) and using sklearn-pandas, I could define some DataFrameMappers to select and encode some features, then cross-validate a RandomForestClassifier() to search for its best hyper-parameters.
Consider the following code:
from __future__ import division
import csv as csv
import numpy as np
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import KFold, StratifiedKFold, cross_val_score, GridSearchCV
from sklearn.pipeline import Pipeline, FeatureUnion
from sklearn.linear_model import LogisticRegression
from sklearn.preprocessing import LabelEncoder, LabelBinarizer, StandardScaler
from category_encoders import BinaryEncoder, LeaveOneOutEncoder
from sklearn_pandas import DataFrameMapper
# Load both Kaggle splits and stack them under a two-level index whose outer
# level is "train" / "test", so features are engineered once for both.
df_train = pd.read_csv('train.csv', header=0, index_col='PassengerId')
df_test = pd.read_csv('test.csv', header=0, index_col='PassengerId')
df = pd.concat([df_train, df_test], keys=["train", "test"])

# Names look like "<last>, <title>. <rest>": the title sits between ", " and ".".
df['Title'] = df['Name'].map(lambda full: full[full.index(',') + 2:full.index('.')])
df['LastName'] = df['Name'].map(lambda full: full[:full.index(',')])
df['FamilySize'] = df['SibSp'] + df['Parch'] + 1

# Missing Embarked / Fare values take the most frequent value of the column.
df['Embarked'] = df['Embarked'].fillna(df['Embarked'].mode()[0])
df['Fare'] = df['Fare'].fillna(df['Fare'].mode()[0])

# Families of at most two members share a single identifier.
family_id = df['LastName'] + ':' + df['FamilySize'].map(str)
df['FamilyID'] = family_id.mask(df['FamilySize'] <= 2, 'Small_Family')

df['AgeOriginallyNaN'] = df['Age'].isna().astype(int)

# Median age per title, attached to every row carrying that title.
medians_by_title = (
    df.groupby('Title')['Age']
    .median()
    .rename('AgeFilledMedianByTitle')
    .to_frame()
)
df = df.merge(medians_by_title, left_on='Title', right_index=True)
df = df.sort_index(level=0).sort_index(level=1)

# Split back into the original partitions and separate the target.
df_train = df.loc['train']
df_test = df.loc['test']
y_train = df_train['Survived']
X_train = df_train.drop(columns='Survived')
# Two candidate feature encoders; they differ only in how 'Embarked' is encoded.
mapper1 = DataFrameMapper([
    ('Embarked', BinaryEncoder()),
    (['AgeFilledMedianByTitle'], StandardScaler()),
    ('Pclass', LeaveOneOutEncoder()),
])
mapper2 = DataFrameMapper([
    ('Embarked', LeaveOneOutEncoder()),
    (['AgeFilledMedianByTitle'], StandardScaler()),
    ('Pclass', LeaveOneOutEncoder()),
])

# mapper1 is only the initial value of the 'featurize' step; the grid below
# swaps the step itself.
pipe = Pipeline([
    ('featurize', mapper1),
    ('forest', RandomForestClassifier(n_estimators=10)),
])

# A pipeline step is a parameter like any other, so listing both mappers under
# the step name makes the search compare encoding strategies together with the
# forest hyper-parameters.
param_grid = dict(
    featurize=[mapper1, mapper2],
    forest__n_estimators=[2, 16, 32, 64],
    forest__criterion=['gini', 'entropy'],
)
grid_search = GridSearchCV(pipe, param_grid=param_grid, scoring='accuracy')
best_pipeline = grid_search.fit(X_train, y_train).best_estimator_

# Notebook-style inspection of the winning configuration.
best_pipeline.get_params()['forest']
grid_search.best_params_['featurize']
grid_search.best_score_
Is it possible to use a Pipeline in GridSearchCV to select the best mapper (mapper1 or mapper2)? How?
Related
I have the following toy code.
I use a pipeline to automatically normalize numerical variables and apply one-hot-encoding to the categorical ones.
I can get the coefficients of the logistic regression model easily using pipe['logisticregression'].coef_, but how can I get all the feature names in the same order in which they appear in the coefficient matrix?
from sklearn.compose import ColumnTransformer
import numpy as np, pandas as pd
from sklearn.compose import make_column_transformer, make_column_selector
from sklearn.preprocessing import OneHotEncoder, StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import make_pipeline
from sklearn.model_selection import train_test_split
# Data: https://www.kaggle.com/datasets/uciml/adult-census-income
# Only the first 3000 rows are used.
data = pd.read_csv("adult.csv").iloc[:3000]

target = "workclass"
X = data.drop(columns=target)
y = data[target]

# Partition the predictor columns by dtype: object columns are one-hot
# encoded, every other column is standardized.
numerical_columns_selector = make_column_selector(dtype_exclude=object)
categorical_columns_selector = make_column_selector(dtype_include=object)
categorical_columns = categorical_columns_selector(X)
numerical_columns = numerical_columns_selector(X)

ct = ColumnTransformer(
    transformers=[
        ('onehot', OneHotEncoder(handle_unknown='ignore'), categorical_columns),
        ('std', StandardScaler(), numerical_columns),
    ]
)
model = LogisticRegression(max_iter=500)
pipe = make_pipeline(ct, model)

data_train, data_test, target_train, target_test = train_test_split(
    X, y, random_state=42
)
pipe.fit(data_train, target_train)

# Shape of the fitted coefficient matrix (notebook-style inspection).
pipe.named_steps['logisticregression'].coef_.shape
My ML project is about "Loan Eligibility prediction"
For that I used data below : https://www.kaggle.com/code/sazid28/home-loan-prediction/data?select=train.csv
and my code is as shown :
import random
import pandas as pd
import numpy as np
from sklearn.compose import make_column_transformer
from sklearn.experimental import enable_iterative_imputer
from sklearn.impute import \
SimpleImputer, KNNImputer, IterativeImputer
from sklearn.preprocessing import \
OneHotEncoder, OrdinalEncoder, StandardScaler, MinMaxScaler, RobustScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import make_pipeline
from sklearn.linear_model import LogisticRegression, SGDClassifier
from sklearn.feature_selection import RFECV
df_train_original = pd.read_csv("train.csv.xls")
# The first column is dropped before modelling.
df = df_train_original.drop(df_train_original.columns[0], axis=1)

# Replace missing 'Credit_History' entries with a value drawn at random from
# the observed ones. The draw goes through NumPy's global generator, so that
# is the generator that must be seeded for the imputation to be reproducible;
# seeding only the stdlib `random` module has no effect on np.random.choice.
random.seed(0)
np.random.seed(0)
# Observed values are collected once instead of once per missing entry.
observed_credit_history = df['Credit_History'].dropna().values
df['Credit_History'] = df['Credit_History'].apply(
    lambda x: np.random.choice(observed_credit_history) if np.isnan(x) else x
)

# Last column is the target; everything before it is a predictor.
X = df.iloc[:, :-1]
y = df.iloc[:, -1]
# Data pre-processing: object-dtype columns are treated as categorical,
# all remaining columns as numerical.
categorical_feature = [col for col in X.columns if X[col].dtype == 'O']
numerical_feature = [col for col in X.columns if X[col].dtype != 'O']

imputer = IterativeImputer(random_state=0)
scaler = StandardScaler()
encoder = OrdinalEncoder()

# Categorical gaps are filled with the most frequent value of the column.
# Author's per-column notes (presumably counts of missing entries):
#   Gender (-13), Married (-3), Self_Employed (-32),
#   LoanAmount (-22), Loan_Amount_Term (-14), Credit_History (-50)
numerical_pipeline = make_pipeline(imputer, scaler)
categorical_pipeline = make_pipeline(
    SimpleImputer(strategy='most_frequent'),
    encoder,
)
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_feature),
    (categorical_pipeline, categorical_feature),
    remainder='passthrough',
)

# The iteration cap is tied to the number of rows in the data.
clf = LogisticRegression(random_state=0, max_iter=len(df))

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=0.2, random_state=0
)

# Hyper-parameter grid keyed by the pipeline step name 'logisticregression'.
params = {
    'logisticregression__class_weight': [None, 'balanced'],
    'logisticregression__solver': ['newton-cg', 'lbfgs', 'liblinear', 'sag', 'saga'],
    'logisticregression__C': np.linspace(0.001, 0.1, 30),
}

model = make_pipeline(preprocessor, clf)
# RFECV receives the complete pipeline (preprocessing + classifier) as its estimator.
selector = RFECV(model, step=1, min_features_to_select=2, cv=5)
selector.fit(X_train, y_train)
when I run the code I get :
ValueError: could not convert string to float: 'Male'
I think that the data is not fitted and transformed before going through RFECV.
How to fix this?
RFECV does not work with a pipeline as the estimator, as it requires the estimator to expose either a coef_ or a feature_importances_ attribute. Pipelines do not, and even if they did, there would be no guarantee that the feature importances of the final estimator correspond to the features that were input to the pipeline, given the arbitrary transformations applied in between.
What you can do is make the RFECV transformer an element of your pipeline, between the preprocessing and the final estimator, i.e.
# Pipeline layout: preprocessing -> recursive feature elimination -> classifier.
preprocessor = make_column_transformer(
    (numerical_pipeline, numerical_feature),
    (categorical_pipeline, categorical_feature),
    remainder='passthrough',
)

# Two separate estimator instances: `clf` is the final model, `clf_fs` is the
# one RFECV refits while ranking features.
clf = LogisticRegression(random_state=0, max_iter=len(df))
clf_fs = LogisticRegression(random_state=0, max_iter=len(df))

feature_selector = RFECV(estimator=clf_fs, step=1, min_features_to_select=2, cv=5)
model = make_pipeline(preprocessor, feature_selector, clf)
The number of rows is close to 1 million. I define the models I want to test as such
# The three free-text columns used as predictors.
X = df_final[['short_description', 'details', 'root_cause']]
cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=0)

# TF-IDF over 1- to 3-grams, capped at 1500 features.
tfidf_pipeline = Pipeline(steps=[
    ('tfidf', TfidfVectorizer(
        max_features=1500,
        ngram_range=(1, 3),
        stop_words='english',
        strip_accents='ascii',
    )),
])

# Binary unigram presence indicators, capped at 1500 features.
countvec_pipeline = Pipeline(steps=[
    ('countvec', CountVectorizer(
        max_features=1500,
        ngram_range=(1, 1),
        stop_words='english',
        strip_accents='ascii',
        binary=True,
    )),
])

# One vectorizer per text column.
preprocessor_pipeline = ColumnTransformer(
    transformers=[
        ('short_description', countvec_pipeline, 'short_description'),
        ('details', tfidf_pipeline, 'details'),
        ('root_cause', countvec_pipeline, 'root_cause'),
    ]
)

# Candidate classifiers as (short name, estimator) pairs.
models = [
    ('rf', RandomForestClassifier(
        n_estimators=100,
        max_depth=3,
        random_state=0,
        n_jobs=-1,
    )),
    ('svc', LinearSVC()),
    ('nb', MultinomialNB()),
    ('lr', LogisticRegression(
        random_state=0,
        solver='saga',
        n_jobs=-1,
    )),
]
I then do a fit_transform as so
# Fit the column transformer on all of X and keep the transformed features.
X_prepped = preprocessor_pipeline.fit_transform(X)
The part below is where my kernel breaks, as soon as it reaches the first model (the random forest).
# Cross-validate every candidate model on the pre-computed features.
# Each model's scores are kept under its name; `results_df` on its own is
# rebound on every pass and therefore only ever holds the latest model.
all_results = {}
for model_name, model in models:
    print(model_name)
    results_dict = cross_validate(
        model,
        X_prepped,
        labels,
        cv=cv,
        scoring='accuracy',
        return_train_score=True,
    )
    results_df = pd.DataFrame(results_dict)
    all_results[model_name] = results_df
Is there something I am doing wrong here?
import pandas as pd
from sklearn.preprocessing import FunctionTransformer
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline, FeatureUnion, make_pipeline
from sklearn.preprocessing import OneHotEncoder
from sklearn.preprocessing import StandardScaler
from sklearn.preprocessing import LabelEncoder
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.naive_bayes import MultinomialNB
import nltk
from sklearn.impute import SimpleImputer
from sklearn.feature_extraction.text import CountVectorizer
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.svm import LinearSVC
from sklearn.metrics import roc_curve
import matplotlib.pyplot as plt
from sklearn.multiclass import OneVsRestClassifier
from sklearn.feature_selection import chi2, SelectKBest
import spacy
from sklearn.preprocessing import MaxAbsScaler
# Load the input table from the working directory.
df=pd.read_csv('input_data.csv')
# Column groups used by the feature-selection transformers.
NUMERIC = ['Cost', 'Field2']
TEXT = ['Text1', 'Text2']


def combine_text_columns(data_frame, text_labels=TEXT):
    """Join the text columns of each row into one space-separated string.

    Args:
        data_frame: DataFrame containing the columns named in ``text_labels``.
        text_labels: Names of the columns to combine; defaults to ``TEXT``.

    Returns:
        A Series with one combined string per row of ``data_frame``.
    """
    # fillna returns a new frame, so the caller's data is never touched and no
    # in-place fill is attempted on a column selection. Missing cells become
    # empty strings; astype(str) lets non-string cells be joined as well.
    text_data = data_frame[text_labels].fillna("").astype(str)
    # Join all text items of a row with a single space in between.
    return text_data.apply(" ".join, axis=1)
# spaCy English model and the stop-word collection that ships with spaCy.
nlp = spacy.load('en_core_web_sm')
stopwords = spacy.lang.en.stop_words.STOP_WORDS

# Presumably the k for a chi2-based SelectKBest step; the active pipeline
# does not use it.
chi_k = 300

# Token pattern: a run of ASCII letters/digits that is followed by whitespace.
TOKENS_ALPHANUMERIC = '[A-Za-z0-9]+(?=\\s+)'

# Stateless selectors: one merges the text columns into a single string per
# row, the other keeps only the NUMERIC columns.
get_text_data = FunctionTransformer(func=combine_text_columns, validate=False)
get_numeric_data = FunctionTransformer(func=lambda frame: frame[NUMERIC], validate=False)
# Numeric branch: pick the numeric columns, then impute missing values.
_numeric_branch = Pipeline([
    ('selector', get_numeric_data),
    ('imputer', SimpleImputer()),
])

# Text branch: merge the text columns, then TF-IDF vectorize.
# Disabled alternative kept from the original:
#   ('vectorizer', CountVectorizer(stop_words=stopwords, token_pattern=TOKENS_ALPHANUMERIC, ngram_range=(1,2))),
#   ('dim_red', SelectKBest(chi2, chi_k))
_text_branch = Pipeline([
    ('selector', get_text_data),
    ('vectorizer', TfidfVectorizer(stop_words='english')),
])

# Full model: both branches side by side, max-abs scaling, then a
# one-vs-rest logistic regression.
# Disabled final-step alternatives kept from the original:
#   ('svc', LinearSVC())
#   ('nb', MultinomialNB())
#   ('clf', OneVsRestClassifier(RandomForestClassifier(n_estimators=15)))
pl = Pipeline([
    ('union', FeatureUnion(transformer_list=[
        ('numeric_features', _numeric_branch),
        ('text_features', _text_branch),
    ])),
    ('scale', MaxAbsScaler()),
    ('lr', OneVsRestClassifier(LogisticRegression(C=100))),
])
TARGET = ['IsApartment']
sc_X = StandardScaler()
encoder = LabelEncoder()

# Integer-encode the opportunity name. The Series is given df's index and an
# explicit name so that it lines up row by row in the concat below and every
# column label of the frame handed to the scaler is a string.
label_enc = pd.Series(
    encoder.fit_transform(df['OpportunityName']),
    index=df.index,
    name='opportunityName_enc',
)

# Numeric column names plus the encoded column, in the order they are concatenated.
NUMERIC2 = list(NUMERIC) + ['opportunityName_enc']

# Standardize the numeric block (original numeric columns + encoded name).
X2 = pd.concat([df[NUMERIC], label_enc], axis=1)
X2 = pd.DataFrame(sc_X.fit_transform(X2), columns=NUMERIC2, index=df.index)

# Assemble the feature frame from scratch: scaled numeric block followed by
# the raw text columns. (X has no earlier definition in this script, so it
# cannot be used as a starting point for the concat.)
X = pd.concat([X2, df[TEXT]], axis=1)
y = df[TARGET]

X_train, X_test, y_train, y_test = train_test_split(
    X, y, test_size=.3, random_state=42
)
# Fit the pipeline; every metric below is computed on the training split.
pl.fit(X_train, y_train)
predictions = pl.predict(X_train)
accuracy = pl.score(X_train, y_train)
# score() yields a fraction, so it is scaled by 100 to agree with the "%"
# sign in the message.
print("\nAccuracy on sample data - numeric, no nans:{:.2f}% ".format(accuracy * 100))

# Predicted probability of the class in column 1.
ytrain_pred_probas = pl.predict_proba(X_train)[:, 1]
fpr, tpr, thresholds = roc_curve(y_train, ytrain_pred_probas)  # precision_recall_curve
roc = pd.DataFrame({'FPR': fpr, 'TPR': tpr, 'Thresholds': thresholds})

# ROC curve with a dashed vertical marker at FPR = 0.1.
_ = plt.figure()
plt.plot(roc.FPR, roc.TPR)
plt.axvline(0.1, color='#00C851', linestyle='--')
plt.xlabel("FPR")
plt.ylabel("TPR")
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import cross_val_score
from sklearn.datasets import load_iris
import pandas as pd
# Iris data as a DataFrame with the class label in a 'target' column.
data = load_iris()
df = pd.DataFrame(data['data'], columns=data['feature_names'])
df['target'] = data['target']

y = df['target']
X = df.drop(columns=['target'])

clf = RandomForestClassifier(n_estimators=50, max_depth=4)

num_features = len(X.columns)
print(num_features)

# Score each feature on its own: mean 10-fold CV score of the forest trained
# on that single column, stored as a truncated integer percentage.
scores = []
for col in X.columns:
    single_feature = X[[col]].to_numpy()
    fold_scores = cross_val_score(clf, single_feature, y, cv=10)
    scores.append((int(fold_scores.mean() * 100), col))

print(sorted(scores, reverse=True))
I intended to perform 10-fold cross-validation to select the most important features, but I am confused about my approach. It doesn't seem right! Also, how can I plot the most important features? I appreciate your suggestions!
I want to add an evaluation model using the cross-validation and confusion matrix k-fold (k = 10) method, but I'm confused
dataset : https://github.com/fadholifh/dats/blob/master/cpas.txt
Using Python 3.7
import sklearn.metrics
import sen
import csv
import os
import re
import nltk
import scipy
import numpy as np
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn import svm
from sklearn.externals import joblib
from sklearn.pipeline import Pipeline
from sklearn import model_selection
from sklearn.model_selection import train_test_split
from sklearn.model_selection import KFold
from sklearn.metrics import confusion_matrix
from Sastrawi.Stemmer.StemmerFactory import StemmerFactory
from Sastrawi.StopWordRemover.StopWordRemoverFactory import StopWordRemoverFactory
# Build the Sastrawi stemmer once, at module level.
factorys = StemmerFactory()
stemmer = factorys.create_stemmer()

# Script entry point. NOTE(review): main() is not defined in the visible part
# of this file — confirm it exists before running.
if __name__ == "__main__":
    main()
The expected result is a confusion matrix and, for the k-fold evaluation, the F1-score, precision, and recall of each fold.
# Tab-separated file without a header row: column 0 is the label, column 1 the text.
df = pd.read_csv("cpas.txt", header=None, sep="\t")
y = df[0].to_numpy()
X = df[1].to_numpy()

# NOTE(review): `stopwords` and `PorterStemmer` are presumably NLTK's
# (nltk.corpus.stopwords / nltk.stem.PorterStemmer); their imports are not
# shown here — confirm.
stemmer = PorterStemmer()
stop_words = stopwords.words('english')
def clean_text(text, stop_words, stemmer):
    """Return ``text`` with stop words and numeric tokens removed and the rest stemmed.

    The text is split with ``word_tokenize``; surviving tokens are passed
    through ``stemmer.stem`` and joined back with single spaces.
    """
    kept = []
    for token in word_tokenize(text):
        # Skip stop words and purely numeric tokens.
        if token in stop_words or token.isnumeric():
            continue
        kept.append(stemmer.stem(token))
    return " ".join(kept)
# Clean every document once, before any splitting.
X = np.array([clean_text(doc, stop_words, stemmer) for doc in X])

kfold = KFold(n_splits=3, shuffle=True, random_state=33)
for i, (train_idx, test_idx) in enumerate(kfold.split(X), start=1):
    y_train, y_test = y[train_idx], y[test_idx]

    # The vectorizer is fitted on the training fold only and then applied
    # to the held-out fold.
    vectorizer = TfidfVectorizer()
    X_train = vectorizer.fit_transform(X[train_idx])
    X_test = vectorizer.transform(X[test_idx])

    model = LinearSVC()
    model.fit(X_train, y_train)

    print ("Fold : {0}".format(i))
    print (classification_report(y_test, model.predict(X_test)))
A common reason to use cross-validation is parameter tuning when the amount of data is limited. One can use grid search with CV to do this.
# Tab-separated file without a header row: column 0 is the label, column 1 the text.
df = pd.read_csv("cpas.txt", header=None, delimiter="\t")
X = df[1].values
labels = df[0].values
text = np.array([clean_text(doc, stop_words, stemmer) for doc in X])

# Shuffle texts and labels with the same permutation so pairs stay aligned.
idx = np.arange(len(text))
np.random.shuffle(idx)
text = text[idx]
labels = labels[idx]

pipeline = Pipeline([
    ('vectorizer', TfidfVectorizer()),
    ('svm', LinearSVC()),
])

# Vectorizer settings explored by the grid search.
params = {
    'vectorizer__ngram_range': [(1, 1), (1, 2), (2, 2)],
    'vectorizer__lowercase': [True, False],
    'vectorizer__norm': ['l1', 'l2'],
}

model = GridSearchCV(pipeline, params, cv=3, verbose=1)
# Fit against `labels`, the array that was permuted together with `text`;
# any other label array would not be in the shuffled order.
model.fit(text, labels)