I am continuing to investigate pipelines. My aim is to execute every step of the machine learning workflow inside a single pipeline, which should make it more flexible and easier to adapt to another use case. Here is what I do:
Step 1: Fill NaN Values
Step 2: Transforming Categorical Values into Numbers
Step 3: Classifier
Step 4: GridSearch
Step 5: Add metrics (failed)
Here is my code:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
class FillNa(BaseEstimator, TransformerMixin):
    def transform(self, x, y=None):
        non_numerics_columns = x.columns.difference(
            x._get_numeric_data().columns)
        for column in x.columns:
            if column in non_numerics_columns:
                # Fill missing categorical values with the most frequent value
                x.loc[:, column] = x.loc[:, column].fillna(
                    x.loc[:, column].value_counts().idxmax())
            else:
                # Fill missing numeric values with the column mean
                x.loc[:, column] = x.loc[:, column].fillna(
                    x.loc[:, column].mean())
        return x

    def fit(self, x, y=None):
        return self
class CategoricalToNumerical(BaseEstimator, TransformerMixin):
    def transform(self, x, y=None):
        non_numerics_columns = x.columns.difference(
            x._get_numeric_data().columns)
        le = LabelEncoder()
        for column in non_numerics_columns:
            x.loc[:, column] = x.loc[:, column].fillna(
                x.loc[:, column].value_counts().idxmax())
            le.fit(x.loc[:, column])
            x.loc[:, column] = le.transform(x.loc[:, column]).astype(int)
        return x

    def fit(self, x, y=None):
        return self
class Perf(BaseEstimator, TransformerMixin):
    def fit(self, clf, x, y, perf="all"):
        """Only for classifier models.

        Return AUC, ROC curve, confusion matrix and F1 score for a fitted
        classifier and a dataframe. perf is a comma-separated string of the
        evaluations to run, e.g. perf="auc,roc,cm,f1" or perf="all".
        """
        evals = {}
        y_pred_proba = clf.predict_proba(x)[:, 1]
        y_pred = clf.predict(x)
        perf_list = perf.split(',')
        if "all" in perf_list or "roc" in perf_list:
            fpr, tpr, _ = roc_curve(y, y_pred_proba)
            roc_auc = round(auc(fpr, tpr), 3)
            plt.style.use('bmh')
            plt.figure(figsize=(12, 9))
            plt.title('ROC Curve')
            plt.plot(fpr, tpr, 'b',
                     label='AUC = {}'.format(roc_auc))
            plt.legend(loc='lower right', borderpad=1, labelspacing=1,
                       prop={"size": 12}, facecolor='white')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([-0.1, 1.])
            plt.ylim([-0.1, 1.])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.show()
        if "all" in perf_list or "auc" in perf_list:
            fpr, tpr, _ = roc_curve(y, y_pred_proba)
            evals['auc'] = auc(fpr, tpr)
        if "all" in perf_list or "cm" in perf_list:
            evals['cm'] = confusion_matrix(y, y_pred)
        if "all" in perf_list or "f1" in perf_list:
            evals['f1'] = f1_score(y, y_pred)
        return evals
path = '~/proj/akd-doc/notebooks/data/'
df = pd.read_csv(path + 'titanic_tuto.csv', sep=';')
y = df.pop('Survival-Status').replace(to_replace=['dead', 'alive'],
value=[0., 1.])
X = df.copy()
X_train, X_test, y_train, y_test = train_test_split(
X.copy(), y.copy(), test_size=0.2, random_state=42)
percent = 0.50
nb_features = round(percent * df.shape[1]) + 1
clf = RandomForestClassifier()
pipeline = Pipeline([('fillna', FillNa()),
('categorical_to_numerical', CategoricalToNumerical()),
('features_selection', SelectKBest(k=nb_features)),
('random_forest', clf),
('perf', Perf())])
params = dict(random_forest__max_depth=list(range(8, 12)),
random_forest__n_estimators=list(range(30, 110, 10)))
cv = GridSearchCV(pipeline, param_grid=params)
cv.fit(X_train, y_train)
I am aware that it is not ideal to plot a ROC curve inside the pipeline, but that's not the problem right now.
So, when I execute this code I get:
TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('fillna', FillNa()), ('categorical_to_numerical', CategoricalToNumerical()), ('features_selection', SelectKBest(k=10, score_func=<function f_classif at 0x7f4ed4c3eae8>)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None,...=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)), ('perf', Perf())]) does not.
I'm interested in all ideas...
As the error states, you need to specify the scoring parameter in GridSearchCV.
Use
GridSearchCV(pipeline, param_grid=params, scoring = 'accuracy')
Edit (Based on questions in comments):
If you need the ROC curve, AUC and F1 for the entire X_train and y_train (and not for each split of GridSearchCV), it's better to keep the Perf class out of the pipeline.
pipeline = Pipeline([('fillna', FillNa()),
('categorical_to_numerical', CategoricalToNumerical()),
('features_selection', SelectKBest(k=nb_features)),
('random_forest', clf)])
#Fit the data in the pipeline
pipeline.fit(X_train, y_train)
performance_meas = Perf()
performance_meas.fit(pipeline, X_train, y_train)
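If the goal is instead to have GridSearchCV itself score every split on several metrics, newer scikit-learn versions also accept a dict of scorers together with a refit key; a minimal sketch, reusing the pipeline and params defined above:

from sklearn.model_selection import GridSearchCV

# Score each CV split on accuracy, ROC AUC and F1; refit the best model by accuracy.
scoring = {'accuracy': 'accuracy', 'roc_auc': 'roc_auc', 'f1': 'f1'}
cv = GridSearchCV(pipeline, param_grid=params, scoring=scoring, refit='accuracy')
cv.fit(X_train, y_train)

# Per-split and mean scores for every metric end up in cv_results_.
print(cv.cv_results_['mean_test_roc_auc'])
print(cv.cv_results_['mean_test_f1'])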
Related
I want to create a Negative Predictive Value (NPV) metric to evaluate data inside a GridSearchCV.
I prepared an example with the iris dataset.
from sklearn import svm, datasets
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X = iris.data
target = iris.target
names = iris.target_names
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = iris.target
#df['species'] = df['species'].replace(to_replace= [0, 1, 2], value = ['setosa', 'versicolor', 'virginica'])
# Drop class 2 so that the problem is binary (setosa vs. versicolor)
indexNames = df[df['species'] == 2].index
df.drop(indexNames, inplace=True)
x_data = pd.DataFrame({'sepal length': df[df.columns[0]],
'sepal width': df[df.columns[1]],
'petal length': df[df.columns[2]],
'petal width': df[df.columns[3]]})
y_data = pd.DataFrame(df['species']).astype(float)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.5)
parameters = {
'kernel':('linear', 'rbf'),
'C':[1, 10]
}
svc = svm.SVC()
scoring = ['accuracy']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
param_grid=parameters,
scoring=scoring,
refit='accuracy',
n_jobs=-1,
cv=kfold,
verbose=0)
grid_result = grid_search.fit(x_train, y_train.values.ravel())
grid_result
print(f'best accuracy score train set {grid_result.best_score_:.4f}')
print(f'best hyperparameters {grid_result.best_params_}')
print(f'accuracy score test set {grid_search.score(x_test, y_test):.4f}')
Unfortunately this example code is not working as intended: the accuracy is always 1. That is one issue I ran into when I adapted my real code to an example with the iris data.
The code below works, but it is for accuracy only, and I need a binary classification metric such as precision or NPV.
from sklearn import svm, datasets
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
scoring = ['accuracy']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
param_grid=parameters,
scoring=scoring,
refit='accuracy',
n_jobs=-1,
cv=kfold,
verbose=0)
grid_result = grid_search.fit(iris.data, iris.target)
print(f'best accuracy score {grid_result.best_score_:.4f}')
print(f'best hyperparameters {grid_result.best_params_}')
print(f'accuracy score {grid_search.score(iris.data, iris.target):.4f}')
To sum up: why is the first code block not working as intended, and what would an implementation of the NPV metric look like?
Ideas are welcome.
Also, in other classification tasks I wrote the predicted values into columns, so I could compute TP, FP, TN, and FN directly and derive my metrics. Inside GridSearchCV I couldn't figure out how to access the predictions.
In your first code block, the two classes are simply very well separated, so an accuracy of 1 is plausible. To remove any doubt, print the confusion matrix and you can see that this is the case.
from sklearn.metrics import confusion_matrix
y_pred = grid_search.best_estimator_.predict(x_test)
print(confusion_matrix(y_test, y_pred))
Output
[[22 0]
[ 0 28]]
As for a custom criterion, you can use make_scorer and feed it a custom_scorer function that computes whatever you want. Here I take the confusion_matrix and extract its values to calculate the score. Double-check the formula itself, because I may have it wrong.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
def custom_scorer(y_true, y_pred, **kwargs):
    cnm = confusion_matrix(y_true, y_pred)
    # Note: cnm[1, 1] / (cnm[1, 1] + cnm[0, 1]) is TP / (TP + FP), i.e. precision (PPV).
    # NPV would be cnm[0, 0] / (cnm[0, 0] + cnm[1, 0]), i.e. TN / (TN + FN).
    return cnm[1, 1] / (cnm[1, 1] + cnm[0, 1])
mysc = make_scorer(custom_scorer, greater_is_better=True)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
param_grid=parameters,
scoring=mysc,
refit=True,  # a single custom scorer is used here, so refit is just a boolean
n_jobs=-1,
cv=kfold,
verbose=0)
grid_result = grid_search.fit(x_train, y_train.values.ravel())
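To address the NPV part directly, here is a minimal sketch of an NPV scorer, plus cross_val_predict as a way to get per-sample out-of-fold predictions (from which TP, FP, TN and FN can be computed by hand); it reuses svc, parameters, kfold, x_train and y_train from above:

from sklearn.metrics import confusion_matrix, make_scorer
from sklearn.model_selection import GridSearchCV, cross_val_predict

def npv_score(y_true, y_pred):
    # NPV = TN / (TN + FN)
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    return tn / (tn + fn)

npv_scorer = make_scorer(npv_score, greater_is_better=True)

# NPV alongside accuracy; refit the best model by accuracy as before.
grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           scoring={'accuracy': 'accuracy', 'npv': npv_scorer},
                           refit='accuracy',
                           cv=kfold)
grid_result = grid_search.fit(x_train, y_train.values.ravel())
print(grid_result.cv_results_['mean_test_npv'])

# Out-of-fold predictions for the whole training set, from which the
# confusion-matrix cells can be read off directly.
y_oof = cross_val_predict(svc, x_train, y_train.values.ravel(), cv=kfold)
print(confusion_matrix(y_train.values.ravel(), y_oof))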
import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot
import shap
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
df1=pd.read_csv("./wine.data",sep=",",encoding='utf_8_sig')
X_train = df1
le = preprocessing.LabelEncoder()
X_train['alc_class'] = le.fit_transform(X_train.alc_class.values)
print(X_train.columns)
print(X_train.describe())
y = X_train['alc_class']
X = X_train.drop(columns='alc_class')
import xgboost as xgb
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
from sklearn.metrics import mean_squared_error
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 2100, stratify = y)
# DMatrix objects for the native xgboost API (not used by the sklearn-style
# GridSearchCV below, which works on the DataFrames directly).
DM_train = xgb.DMatrix(data = X_train,
                       label = y_train)
DM_test = xgb.DMatrix(data = X_test,
                      label = y_test)
xgb_param_grid = {
'colsample_bytree': np.linspace(0.5, 0.9, 2),
'n_estimators':[30],
'max_depth': [5],
'learning_rate':[0.01],
'alpha':[10],
'objective':['binary:logistic'],
'tree_method':['hist'],
'min_child_weight': [1],
'gamma': [0.5],
'subsample': [0.6],
}
# instantiate the classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="auc")
# perform 5 fold cross-validation using mean square error as a scoring method
grid_mse = GridSearchCV(estimator = xgb_clf, param_grid = xgb_param_grid, scoring = 'neg_mean_squared_error', cv = 5, verbose = 1)
# Fit grid_mse to the data, get best parameters and best score (lowest RMSE)
grid_mse.fit(X_train, y_train)
print("Best parameters found: ",grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
#Predict using the test data
y_pred = grid_mse.predict(X_test)
y_pred_prob = grid_mse.predict_proba(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)))
from sklearn.metrics import accuracy_score, roc_curve, auc,recall_score,precision_score, precision_recall_curve,f1_score, classification_report, confusion_matrix,roc_auc_score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print('XGBoost model F1 score: {0:0.4f}'. format(f1_score(y_test, y_pred, average='weighted')))
precision, recall, thresholds = precision_recall_curve(y_test, y_pred)
area = auc(recall, precision)
print("----------------")
print("\n\n Evaluation Metrics \n\n")
aucroc_score = roc_auc_score(y_test, y_pred_prob[:,1])
print("Area Under ROC Curve: ",aucroc_score)
# roc curve for models
fpr, tpr, thresh = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
# roc curve for tpr = fpr
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
print("confusion_matrix ", confusion_matrix(y_test,y_pred))
print("classification_report ", classification_report(y_test,y_pred))
explainer = shap.TreeExplainer(grid_mse.best_estimator_)
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values, plot_size = 1.8, max_display = 13)
print(grid_mse.best_estimator_.feature_importances_)
for col, score in zip(X_train.columns, grid_mse.best_estimator_.feature_importances_):
    print('%s, %0.3f ' % (col, score))
I have long feature names, and when I plot the beeswarm Shapley plots the feature names get truncated. I would like the full feature names to be displayed on the y-axis. Any help would be greatly appreciated.
I have tried changing the plot size but it did not work.
Pass show=False to suppress the plot, then save it with a tight bounding box so the long y-axis labels are not clipped:
import matplotlib.pyplot as plt

path = 'save_path_here.png'
shap.plots.beeswarm(shap_values, plot_size = 1.8, max_display = 13, show=False)
plt.savefig(path, bbox_inches='tight', dpi=300)
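If the plot should still be shown interactively rather than saved, a variant of the same idea is to let the beeswarm draw on a larger figure and re-layout it before showing; a rough sketch (the figure size values are arbitrary):

import matplotlib.pyplot as plt
import shap

shap.plots.beeswarm(shap_values, max_display=13, show=False)
plt.gcf().set_size_inches(14, 8)   # widen the figure so long feature names fit
plt.tight_layout()                 # recompute margins so y-tick labels are not clipped
plt.show()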
I am working on a dataset, TelcoSigtel, which has 5k observations, 21 features, and an imbalanced target with 86% non-churners and 16% churners.
Sorry, I wanted to include an extract of the dataframe, but it is too big, and when I take a small sample there are not enough churners in it.
My problem is the following: the two methods below should give the same results, but the results are dramatically different for some algorithms, while for others they are exactly the same.
Information about the dataset:
models = [('logit',
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=600,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ....]
# Method 1:
from sklearn import model_selection
from sklearn.model_selection import KFold
import matplotlib.pyplot as plt
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
results = []
names = []
seed = 0
scoring = "roc_auc"
for name, model in models:
    kfold = model_selection.KFold(n_splits=5, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
# Method 2:
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
kf = KFold(n_splits=5, random_state=0)
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
results = []
names = []
to_store1 = list()
seed = 0
scoring = "roc_auc"
cv_results = np.array([])
for name, model in models:
    for train_index, test_index in kf.split(X):
        # split the data
        X_train, X_test = X.loc[train_index, :].values, X.loc[test_index, :].values
        y_train, y_test = np.ravel(Y[train_index]), np.ravel(Y[test_index])
        model = model  # Choose a model here
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        to_store1.append(train_index)
        # store fold results
        result = roc_auc_score(y_test, y_pred)
        cv_results = np.append(cv_results, result)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    cv_results = np.array([])
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
The short answer is that you should use model.predict_proba(X_test)[:, 1] or model.decision_function(X_test) to get identical results, since the ROC AUC scorer needs class probabilities (or decision scores) rather than hard labels. The long answer is that you can reproduce the same behaviour with a toy example:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, make_scorer
def assert_equal_scores(rnd_seed, needs_threshold):
    """Assert that two different scorings return equal results."""
    X, y, *_ = load_breast_cancer().values()
    kfold = KFold(random_state=rnd_seed)
    lr = LogisticRegression(random_state=rnd_seed + 10)
    roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=needs_threshold)
    cv_scores1 = cross_val_score(lr, X, y, cv=kfold, scoring=roc_auc_scorer)
    cv_scores2 = cross_val_score(lr, X, y, cv=kfold, scoring='roc_auc')
    np.testing.assert_equal(cv_scores1, cv_scores2)
Try assert_equal_scores(10, False) and assert_equal_scores(10, True) (or any other random seed). The first one raises an AssertionError. The difference is that the ROC AUC scorer requires the needs_threshold parameter to be True.
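Applied to Method 2 above, the fix amounts to scoring on probabilities instead of hard labels inside the fold loop; a minimal sketch of the changed lines, assuming the model exposes predict_proba (otherwise use decision_function):

# inside the fold loop of Method 2
model.fit(X_train, y_train)
y_score = model.predict_proba(X_test)[:, 1]   # probability of the positive class
result = roc_auc_score(y_test, y_score)       # now matches scoring="roc_auc"
cv_results = np.append(cv_results, result)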
I'm trying to make a heart disease prediction program using Naive Bayes. When I finished the classifier, cross-validation showed a mean accuracy of 80%. However, when I try to make a prediction on a given sample, the prediction is often wrong. The dataset is the heart disease dataset from the UCI repository and contains 303 samples. There are two classes, 0 (healthy) and 1 (ill); when I make a prediction on a sample taken from the dataset, it doesn't predict the true value except for very few samples. Here is the code:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler
class Predict:
    def Read_Clean(self, dataset):
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        df = df.replace('[?]', np.nan, regex=True)
        df = pd.DataFrame(Imputer(missing_values='NaN', strategy='mean', axis=0)
                          .fit_transform(df), columns=header_row)
        df = df.astype(float)
        return df

    def Train_Test_Split_data(self, dataset):
        Y = dataset['Num'].apply(lambda x: 1 if x > 0 else 0)
        X = dataset.drop('Num', axis=1)
        validation_size = 0.20
        seed = 42
        X_train, X_test, Y_train, Y_test = train_test_split(X, Y, test_size=validation_size, random_state=seed)
        return X_train, X_test, Y_train, Y_test

    def Scaler(self, X_train, X_test):
        scaler = StandardScaler()
        X_train = scaler.fit_transform(X_train)
        X_test = scaler.transform(X_test)
        return X_train, X_test

    def Cross_Validate(self, clf, X_train, Y_train, cv=5):
        scores = cross_val_score(clf, X_train, Y_train, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))
        return score, scores

    def Fit_Score(self, clf, X_train, Y_train, X_test, Y_test, label='x'):
        clf.fit(X_train, Y_train)
        fit_score = clf.score(X_train, Y_train)
        pred_score = clf.score(X_test, Y_test)
        print("%s: fit score %.5f, predict score %.5f" % (label, fit_score, pred_score))
        return pred_score

    def ReturnPredictionValue(self, clf, sample):
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        data = self.Read_Clean(dataset_path)
        X_train, X_test, Y_train, Y_test = self.Train_Test_Split_data(data)
        X_train, X_test = self.Scaler(X_train, X_test)
        self.NB = GaussianNB()
        self.Fit_Score(self.NB, X_train, Y_train, X_test, Y_test, label='NB')
        self.Cross_Validate(self.NB, X_train, Y_train, 10)
        return self.ReturnPredictionValue(self.NB, sample)
When I run:
if __name__ == '__main__':
    sample = [41.0, 0.0, 2.0, 130.0, 204.0, 0.0, 2.0, 172.0, 0.0, 1.4, 1.0, 0.0, 3.0]
    p = Predict()
    print("Prediction value: {}".format(p.PredictionMain(sample)))
The result is:
NB: fit score 0.84711, predict score 0.83607
CV scores mean: 0.8000
Prediction value: 1
I get 1 instead of 0 (this sample is taken directly from the dataset).
I did this for more than one sample from the dataset and I get the wrong result most of the time; it's as if the accuracy is not 80% at all!
Any help would be appreciated.
Thanks in advance.
Edit:
Problem solved using Pipeline. The final code is:
import pandas as pd
import numpy as np
from sklearn.naive_bayes import GaussianNB
from sklearn.model_selection import cross_val_score, train_test_split
from sklearn.preprocessing import Imputer, StandardScaler, OneHotEncoder
from sklearn.pipeline import Pipeline
class Predict:
    def __init__(self):
        self.X = []
        self.Y = []

    def Read_Clean(self, dataset):
        header_row = ['Age', 'Gender', 'Chest_Pain', 'Resting_Blood_Pressure', 'Serum_Cholestrol',
                      'Fasting_Blood_Sugar', 'Resting_ECG', 'Max_Heart_Rate',
                      'Exercise_Induced_Angina', 'OldPeak',
                      'Slope', 'CA', 'Thal', 'Num']
        df = pd.read_csv(dataset, names=header_row)
        df = df.replace('[?]', np.nan, regex=True)
        df = pd.DataFrame(Imputer(missing_values='NaN', strategy='mean', axis=0)
                          .fit_transform(df), columns=header_row)
        df = df.astype(float)
        return df

    def Split_Dataset(self, df):
        self.Y = df['Num'].apply(lambda x: 1 if x > 0 else 0)
        self.X = df.drop('Num', axis=1)

    def Create_Pipeline(self):
        estimators = []
        estimators.append(('standardize', StandardScaler()))
        estimators.append(('bayes', GaussianNB()))
        model = Pipeline(estimators)
        return model

    def Cross_Validate(self, clf, cv=5):
        scores = cross_val_score(clf, self.X, self.Y, cv=cv, scoring='f1')
        score = scores.mean()
        print("CV scores mean: %.4f " % (score))

    def Fit_Score(self, clf, label='x'):
        clf.fit(self.X, self.Y)
        fit_score = clf.score(self.X, self.Y)
        print("%s: fit score %.5f" % (label, fit_score))

    def ReturnPredictionValue(self, clf, sample):
        y = clf.predict([sample])
        return y[0]

    def PredictionMain(self, sample, dataset_path='dataset/processed.cleveland.data'):
        print("dataset: " + dataset_path)
        data = self.Read_Clean(dataset_path)
        self.Split_Dataset(data)
        self.model = self.Create_Pipeline()
        self.Fit_Score(self.model, label='NB')
        self.Cross_Validate(self.model, 10)
        return self.ReturnPredictionValue(self.model, sample)
Now making a prediction on the same sample from the question returns [0], which is the true value. Also, by running the following method:
def CheckTrue(self):
    # requires: from sklearn.model_selection import cross_val_predict
    clf = self.Create_Pipeline()
    out = cross_val_predict(clf, self.X, self.Y)
    p = [out == self.Y]
    c = 0
    for i in range(303):
        if p[0][i] == True:
            c += 1
    print("Samples with true values: {}".format(c))
I get 249 true samples using the pipeline code, whereas I got only 150 before.
You're not applying the StandardScaler to the sample. The classifier expects scaled data, because it was trained on the output of StandardScaler.transform, but the sample is not scaled the same way as the training data.
It is easy to make mistakes like this when combining multiple steps (scaling, preprocessing, classification) manually. To avoid such issues it is a good idea to use a scikit-learn Pipeline.
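For completeness, if the steps are kept manual rather than wrapped in a Pipeline, the same fitted scaler must be applied to any new sample before calling predict. A minimal sketch reusing X_train, Y_train and sample from the question:

from sklearn.naive_bayes import GaussianNB
from sklearn.preprocessing import StandardScaler

# Fit the scaler and the classifier on the training data.
scaler = StandardScaler()
X_train_scaled = scaler.fit_transform(X_train)
clf = GaussianNB().fit(X_train_scaled, Y_train)

# Any new sample must go through the *same fitted* scaler before predict.
prediction = clf.predict(scaler.transform([sample]))[0]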
I would like to predict probabilities from a logistic regression model with cross-validation. I know you can get the cross-validation scores, but is it possible to return the values from predict_proba instead of the scores?
# imports
from sklearn.linear_model import LogisticRegression
from sklearn.cross_validation import (StratifiedKFold, cross_val_score,
train_test_split)
from sklearn import datasets
# setup data
iris = datasets.load_iris()
X = iris.data
y = iris.target
# setup model
cv = StratifiedKFold(y, 10)
logreg = LogisticRegression()
# cross-validation scores
scores = cross_val_score(logreg, X, y, cv=cv)
# predict probabilities
Xtrain, Xtest, ytrain, ytest = train_test_split(X, y)
logreg.fit(Xtrain, ytrain)
proba = logreg.predict_proba(Xtest)
This is now implemented as part of scikit-learn version 0.18: you can pass a method string parameter to cross_val_predict (see its documentation).
Example:
proba = cross_val_predict(logreg, X, y, cv=cv, method='predict_proba')
Also note that this is part of the new sklearn.model_selection package so you will need this import:
from sklearn.model_selection import cross_val_predict
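Note that the cv object in the question comes from the old sklearn.cross_validation API (StratifiedKFold(y, 10)); with sklearn.model_selection the splitter takes n_splits instead and the labels are passed at call time. A minimal end-to-end sketch on the iris data:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import StratifiedKFold, cross_val_predict

X, y = load_iris(return_X_y=True)
logreg = LogisticRegression(max_iter=1000)

# model_selection-style splitter: labels go to cross_val_predict, not the constructor
skf = StratifiedKFold(n_splits=10)
proba = cross_val_predict(logreg, X, y, cv=skf, method='predict_proba')
print(proba.shape)  # (n_samples, n_classes)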
An easy workaround for this is to create a wrapper class, which for your case would be
class proba_logreg(LogisticRegression):
    def predict(self, X):
        return LogisticRegression.predict_proba(self, X)
and then pass an instance of it as the classifier object to cross_val_predict
# cross validation probabilities
probas = cross_val_predict(proba_logreg(), X, y, cv=cv)
There is a function cross_val_predict that gives you the predicted values, but there is no such function for "predict_proba" yet. Maybe we could make that an option.
This is easy to implement:
import numpy as np
from sklearn.model_selection import KFold

def my_cross_val_predict(
    m, X, y, cv=KFold(),
    predict=lambda m, x: m.predict_proba(x),
    combine=np.vstack
):
    preds = []
    for train, test in cv.split(X):
        # fit on the training folds, predict on the held-out fold
        m.fit(X[train, :], y[train])
        pred = predict(m, X[test, :])
        preds.append(pred)
    return combine(preds)
This one returns predict_proba.
If you need both predict and predict_proba just change predict and combine arguments:
def stack(arrs):
    if arrs[0].ndim == 1:
        return np.hstack(arrs)
    else:
        return np.vstack(arrs)

def my_cross_val_predict(
    m, X, y, cv=KFold(),
    predict=lambda m, x: [m.predict(x),
                          m.predict_proba(x)],
    combine=lambda preds: list(map(stack, zip(*preds)))
):
    preds = []
    for train, test in cv.split(X):
        m.fit(X[train, :], y[train])
        pred = predict(m, X[test, :])
        preds.append(pred)
    return combine(preds)
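A brief usage sketch of this second version, assuming NumPy arrays as input and an unshuffled KFold so that the stacked predictions come back in the original sample order:

from sklearn.datasets import load_iris
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold

X, y = load_iris(return_X_y=True)
labels, probas = my_cross_val_predict(
    LogisticRegression(max_iter=1000), X, y, cv=KFold(n_splits=5))
print(labels.shape, probas.shape)  # (150,) and (150, 3)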