model_selection.KFold gives different results than kf.split - python

I am working on a dataset, TelcoSigtel, which has 5k observations, 21 features, and an imbalanced target: 86% non-churners and 14% churners.
I would have liked to include an extract of the dataframe, but it is too large, and a small sample does not contain enough churners.
My problem is the following: the two methods below should give the same results, yet on some algorithms they differ dramatically while on others they agree exactly.
The models being compared are defined like this:
models = [('logit',
           LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
                              intercept_scaling=1, l1_ratio=None, max_iter=600,
                              multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
                              solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ....]
# Method 1:
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.model_selection import KFold

X = telcom.drop("churn", axis=1)
Y = telcom["churn"]

results = []
names = []
seed = 0
scoring = "roc_auc"

for name, model in models:
    kfold = model_selection.KFold(n_splits=5, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
# Method 2:
import numpy as np
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score

kf = KFold(n_splits=5, random_state=0)

X = telcom.drop("churn", axis=1)
Y = telcom["churn"]

results = []
names = []
to_store1 = list()
seed = 0
scoring = "roc_auc"
cv_results = np.array([])

for name, model in models:
    for train_index, test_index in kf.split(X):
        # split the data
        X_train, X_test = X.loc[train_index, :].values, X.loc[test_index, :].values
        y_train, y_test = np.ravel(Y[train_index]), np.ravel(Y[test_index])
        model = model  # Choose a model here
        model.fit(X_train, y_train)
        y_pred = model.predict(X_test)
        to_store1.append(train_index)
        # store fold results
        result = roc_auc_score(y_test, y_pred)
        cv_results = np.append(cv_results, result)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
    cv_results = np.array([])

# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()

The short answer is that you should use model.predict_proba(X_test)[:, 1] or model.decision_function(X_test) instead of model.predict(X_test), because the ROC AUC score needs class probabilities (or decision scores) rather than hard class predictions. The long answer is that you can reproduce the same behavior with a toy example:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, make_scorer

def assert_equal_scores(rnd_seed, needs_threshold):
    """Assert that two different scorings return equal results."""
    X, y, *_ = load_breast_cancer().values()
    kfold = KFold(random_state=rnd_seed)
    lr = LogisticRegression(random_state=rnd_seed + 10)
    roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=needs_threshold)
    cv_scores1 = cross_val_score(lr, X, y, cv=kfold, scoring=roc_auc_scorer)
    cv_scores2 = cross_val_score(lr, X, y, cv=kfold, scoring='roc_auc')
    np.testing.assert_equal(cv_scores1, cv_scores2)
Try assert_equal_scores(10, False) and assert_equal_scores(10, True) (or any other random seed). The first call raises an AssertionError; the second passes. The difference is that the ROC AUC scorer requires the needs_threshold parameter to be True.
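Applied to the question's Method 2, a minimal sketch of the inner loop (reusing the kf, X, Y and models objects defined there; models without predict_proba would need decision_function instead) could look like this:
# Sketch, mirrors Method 2 but scores on probabilities; assumes kf, X, Y, models from the question
for name, model in models:
    fold_scores = []
    for train_index, test_index in kf.split(X):
        X_train, X_test = X.iloc[train_index].values, X.iloc[test_index].values
        y_train, y_test = Y.iloc[train_index].values, Y.iloc[test_index].values
        model.fit(X_train, y_train)
        # probability of the positive class is what roc_auc_score expects
        y_score = model.predict_proba(X_test)[:, 1]
        fold_scores.append(roc_auc_score(y_test, y_score))
    print("%s: %f (%f)" % (name, np.mean(fold_scores), np.std(fold_scores)))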

Related

Got ValueError when calling cross_val_score

I am working on a machine learning project and want to perform an accuracy evaluation of multiple algorithms. I am using this CSV and loading only the Date, Time and CO columns (I renamed them manually in the CSV). After preparing my training data, I try to run the evaluations, but I get:
ValueError: Supported target types are: ('binary', 'multiclass'). Got 'unknown' instead.
The shapes for the vectors used for evaluations (X_train and Y_train) are:
(9357, 2)
(9357,)
The class:
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import train_test_split, StratifiedKFold, cross_val_score
from sklearn.naive_bayes import GaussianNB
from sklearn.neighbors import KNeighborsClassifier
from sklearn.svm import SVC
from sklearn.tree import DecisionTreeClassifier

class Models:
    test_size: float
    random_state: int

    def __init__(self, test_size: float = 0.20, random_state: int = 1) -> None:
        super().__init__()
        self.test_size = test_size
        self.random_state = random_state

    @staticmethod
    def init_models() -> list:
        return [
            ('LR', LogisticRegression(solver='liblinear', multi_class='ovr')),
            ('LDA', LinearDiscriminantAnalysis()),
            ('KNN', KNeighborsClassifier()),
            ('CART', DecisionTreeClassifier()),
            ('NB', GaussianNB()),
            ('SVM', SVC(gamma='auto'))
        ]

    def train(self, x: list, y: list):
        x_train, x_validation, y_train, y_validation = train_test_split(x, y, test_size=self.test_size,
                                                                        random_state=self.random_state)
        return x_train, x_validation, y_train, y_validation

    def evaluate(self, x_train: list, y_train: list, splits: int = 10, random_state: int = 1):
        results = []
        names = []
        models = self.init_models()
        for name, model in models:
            kfold = StratifiedKFold(n_splits=splits, random_state=random_state)
            cv_results = cross_val_score(estimator=model, X=x_train, y=y_train, cv=kfold, scoring='accuracy')
            results.append(cv_results)
            names.append(name)
            print('%s: %f (%f)' % (name, cv_results.mean(), cv_results.std()))
And I am calling my class as:
models_helper = Models()
array = dataset.values
X = array[:, 1:3]
Y = array[:, 2]
prepared = models_helper.train(X, Y)
classification = models_helper.evaluate(prepared[0], prepared[2])
I avoided this problem by first computing predicted values with cross_val_predict and then scoring those predictions against the true labels with metrics.accuracy_score.
# Function that runs the requested algorithm and returns the accuracy metrics.
# Pass the sklearn model as an argument along with the cv value and training data.
from sklearn import metrics, model_selection

def fit_ml_algo(algo, X_train, y_train, cv):
    # One pass over the full training set
    model = algo.fit(X_train, y_train)
    acc = round(model.score(X_train, y_train) * 100, 2)
    # Cross validation
    train_pred = model_selection.cross_val_predict(algo,
                                                   X_train,
                                                   y_train,
                                                   cv=cv,
                                                   n_jobs=-1)
    # Cross-validation accuracy metric
    acc_cv = round(metrics.accuracy_score(y_train, train_pred) * 100, 2)
    return train_pred, acc, acc_cv

'PolynomialFeatures' object has no attribute 'predict'

I want to apply k-fold cross validation on the following regression models:
Linear Regression
Polynomial Regression
Support Vector Regression
Decision Tree Regression
Random Forest Regression
I am able to apply k-fold cross validation to all of them except polynomial regression, which gives me the error 'PolynomialFeatures' object has no attribute 'predict'. How can I work around this issue? Also, am I doing this correctly? My main motive is to see which model performs better, so is there a better way to do this job?
# Compare Algorithms
import pandas
import matplotlib.pyplot as plt
from sklearn import model_selection
from sklearn.linear_model import LinearRegression
from sklearn.preprocessing import PolynomialFeatures
from sklearn.svm import SVR
from sklearn.tree import DecisionTreeRegressor
from sklearn.ensemble import RandomForestRegressor
# load dataset
names = ['YearsExperience', 'Salary']
dataframe = pandas.read_csv('Salary_Data.csv', names=names)
array = dataframe.values
X = array[1:,0]
Y = array[1:,1]
X = X.reshape(-1, 1)
Y = Y.reshape(-1, 1)
# prepare configuration for cross validation test harness
seed = 7
# prepare models
models = []
models.append(('LR', LinearRegression()))
models.append(('PR', PolynomialFeatures(degree = 4)))
models.append(('SVR', SVR(kernel = 'rbf')))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('RFR', RandomForestRegressor(n_estimators = 10)))
# evaluate each model in turn
results = []
names = []
scoring = 'neg_mean_absolute_error'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    cv_results = model_selection.cross_val_score(model, X, Y.ravel(), cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.show()
In sklearn you get polynomial regression by:
generating polynomial and interaction features on your original dataset by using sklearn.preprocessing.PolynomialFeatures
running ordinary least squares Linear Regression on the transformed dataset by using sklearn.linear_model.LinearRegression
Toy example:
from sklearn.preprocessing import PolynomialFeatures
from sklearn import linear_model

# Expand the features, then fit an ordinary linear regression on them
poly = PolynomialFeatures(degree=3)
X_train = poly.fit_transform(X_train)
X_test = poly.transform(X_test)  # reuse the transformer fitted on the training data
model = linear_model.LinearRegression()
model.fit(X_train, y_train)
print(model.score(X_train, y_train))
Here is the changed part of the code, if someone wants it for reference:
# prepare models
models = []
models.append(('LR', LinearRegression()))
models.append(('PR', LinearRegression()))
models.append(('SVR', SVR(kernel = 'rbf')))
models.append(('DTR', DecisionTreeRegressor()))
models.append(('RFR', RandomForestRegressor(n_estimators = 10)))
# evaluate each model in turn
results = []
names = []
scoring = 'neg_mean_absolute_error'
for name, model in models:
    kfold = model_selection.KFold(n_splits=10, random_state=seed)
    if name == 'PR':
        poly_reg = PolynomialFeatures(degree = 4)
        X_poly = poly_reg.fit_transform(X)
        cv_results = model_selection.cross_val_score(model, X_poly, Y.ravel(), cv=kfold, scoring=scoring)
    else:
        cv_results = model_selection.cross_val_score(model, X, Y.ravel(), cv=kfold, scoring=scoring)
    results.append(cv_results)
    names.append(name)
    msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
    print(msg)
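If the goal is to cross-validate the polynomial model directly, another option (my own sketch, not part of the original answer) is to chain PolynomialFeatures and LinearRegression into a single estimator, so the feature expansion is refit inside every fold:
from sklearn.pipeline import make_pipeline

# Sketch: polynomial regression as one estimator, assuming X, Y, seed and the
# model_selection import from the question's script
poly_model = make_pipeline(PolynomialFeatures(degree=4), LinearRegression())
kfold = model_selection.KFold(n_splits=10, random_state=seed)
cv_results = model_selection.cross_val_score(poly_model, X, Y.ravel(), cv=kfold,
                                             scoring='neg_mean_absolute_error')
print("PR: %f (%f)" % (cv_results.mean(), cv_results.std()))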

UndefinedMetricWarning in Sklearn

I'm not able to see my resulting accuracy score in my final graph, and I get warnings that precision/recall are ill-defined, although I don't see any 0's.
I'm using this yeast data: https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data
I've tried making the whole set my training set by making train_frac=1.
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB
df = pd.read_csv("<my_dir>",names = ['sample','mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc','site'])
df=df.drop(columns=['sample'])
model_type = GaussianNB()
target = 'site'
train_frac = 0.5
Y = df[target]
df2 = df.drop(columns=[target])
# df2.columns holds every feature column (everything but 'site')
X = df[df2.columns[:]]
def naive_split(X, Y, n):
    # Take first n lines of X and Y for training and the rest for testing
    X_train = X[:n]
    X_test = X[n:]
    Y_train = Y[:n]
    Y_test = Y[n:]
    return (X_train, X_test, Y_train, Y_test)

def train_model(n=int(train_frac * df.shape[0])):
    X_train, X_test, Y_train, Y_test = naive_split(X, Y, n)
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    return (X_test, Y_test, clf)
X_test, Y_test, clf = train_model()
import sklearn.metrics as metrics
from sklearn import model_selection
sizes = np.arange(0.98,0.01, -0.02)
result = {}
for size in sizes:
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=size, random_state=200)
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    precision = metrics.precision_score(Y_test, clf.predict(X_test), average='weighted')
    recall = metrics.recall_score(Y_test, clf.predict(X_test), average='weighted')
    result[len(Y_train)] = (score, precision, recall)
result = pd.DataFrame(result).transpose()
result.columns = ['Accuracy','Precision', 'Recall']
result.plot(marker='*', figsize=(15,5))
plt.title('Metrics measures using random train/test splitting')
plt.xlabel('Size of training set')
plt.ylabel('Value');
I get the following warnings when I expect it to run without error:
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.'precision', 'predicted', average, warn_for)
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 'recall', 'true', average, warn_for)

Random in SVM answer

I am using an SVM to see if I can take baseball data, classify hits, and estimate home runs. I get different results every time I run the model, so I turned it into a simulation that runs the model 100 times, but I don't understand what is causing the variation. Can someone please explain why this happens? I did set random_state=42.
import pandas as pd
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import metrics
import statistics
import numpy as np
result_array = []
players = [488768, 517369, 461314, 477165, 506560, 572114, 641319, 592669, 622534, 605486, 602922, 518466, 572362, 519082, 623182, 595978, 543272]
dfSave = pd.DataFrame(columns=['Mean', 'Max', 'Min', 'Std', 'Accuracy', 'Precision', 'f1_score', 'Recall_Score', 'First_Name', 'Last_Name'])

for i in players:
    batter = i
    df = pd.read_csv('D:baseballData_2016_use.csv')
    df2 = pd.read_csv('D:padres_2016_home.csv')  # Team to test
    dataFilter = df.loc[df['Home_Team'] == 'Orioles']  # Stadium to train the model on
    dataFilter2 = df2.loc[df2['Batter_ID'] == batter]  # Players to test in that stadium
    j = 0
    while j <= 100:
        predict = dataFilter2.iloc[:, [4, 5]]
        X = dataFilter.iloc[:, [4, 5]]
        y = dataFilter.iloc[:, 3]
        y = y.astype(int)
        X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30)
        svclassifier = SVC(C=4, cache_size=200, class_weight=None, coef0=0.0,
                           decision_function_shape='ovo', degree=3, gamma=0.001, kernel='rbf',
                           max_iter=-1, probability=False, random_state=42, shrinking=False,
                           tol=0.001, verbose=False)  # defaults
        svclassifier.fit(X_train, y_train)
        y_pred = svclassifier.predict(X_test)
        predicted = svclassifier.predict(predict)
        listDf = []
        sum = 0  # count predicted home runs
        for i in predicted:
            if i == 1:
                sum = sum + 1
        result_array.append(sum)
        print(sum)
        j = j + 1
    firstName = dataFilter2.loc[1:, 'Batter_First_Name'].values
    lastName = dataFilter2.loc[1:, 'Batter_Last_Name'].values
    listDf.append({'Mean': statistics.mean(result_array), 'Max': max(result_array), 'Min': min(result_array), 'Std': statistics.stdev(result_array),
                   'Accuracy': metrics.accuracy_score(y_test, y_pred), 'Precision': precision_score(y_test, y_pred, average="macro"), 'f1_score': f1_score(y_test, y_pred, average="macro"),
                   "Recall_Score": recall_score(y_test, y_pred, average="macro"), 'First_Name': firstName[0], 'Last_Name': lastName[0]})
    dfSave = pd.DataFrame(listDf)
    dfSave.to_csv('D:test9999.csv', mode='a')
    result_array = []
In your code, the randomness comes from train_test_split producing a different split on each run.
You can avoid this by passing random_state to train_test_split, but it is generally considered better practice to run the model multiple times (as you did), look at the distribution of the output score, compute a confidence interval on it, and report that.
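A hedged sketch of both options, using the variable names from the question (result_array holding one predicted home-run count per run):
import numpy as np
from sklearn.model_selection import train_test_split

# Option 1: make the split itself reproducible as well
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.30, random_state=42)

# Option 2: keep the repeated runs and summarize them with a 95% confidence interval
scores = np.array(result_array, dtype=float)
mean, sem = scores.mean(), scores.std(ddof=1) / np.sqrt(len(scores))
print("Predicted home runs: %.2f (95%% CI: %.2f to %.2f)" % (mean, mean - 1.96 * sem, mean + 1.96 * sem))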

Use a metric after a classifier in a Pipeline

I am continuing to investigate pipelines. My aim is to execute every machine learning step inside a single Pipeline, which makes the workflow more flexible and easier to adapt to another use case. So here is what I do:
Step 1: Fill NaN Values
Step 2: Transforming Categorical Values into Numbers
Step 3: Classifier
Step 4: GridSearch
Step 5: Add a metrics (failed)
Here is my code:
import pandas as pd
from sklearn.base import BaseEstimator, TransformerMixin
from sklearn.feature_selection import SelectKBest
from sklearn.preprocessing import LabelEncoder
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import train_test_split
from sklearn.ensemble import RandomForestClassifier
from sklearn.pipeline import Pipeline
from sklearn.metrics import roc_curve, auc
import matplotlib.pyplot as plt
from sklearn.metrics import confusion_matrix
from sklearn.metrics import f1_score
class FillNa(BaseEstimator, TransformerMixin):
    def transform(self, x, y=None):
        non_numerics_columns = x.columns.difference(
            x._get_numeric_data().columns)
        for column in x.columns:
            if column in non_numerics_columns:
                # fill categorical columns with their most frequent value
                x.loc[:, column] = x.loc[:, column].fillna(
                    x.loc[:, column].value_counts().idxmax())
            else:
                # fill numeric columns with their mean
                x.loc[:, column] = x.loc[:, column].fillna(
                    x.loc[:, column].mean())
        return x

    def fit(self, x, y=None):
        return self


class CategoricalToNumerical(BaseEstimator, TransformerMixin):
    def transform(self, x, y=None):
        non_numerics_columns = x.columns.difference(
            x._get_numeric_data().columns)
        le = LabelEncoder()
        for column in non_numerics_columns:
            x.loc[:, column] = x.loc[:, column].fillna(
                x.loc[:, column].value_counts().idxmax())
            le.fit(x.loc[:, column])
            x.loc[:, column] = le.transform(x.loc[:, column]).astype(int)
        return x

    def fit(self, x, y=None):
        return self


class Perf(BaseEstimator, TransformerMixin):
    def fit(self, clf, x, y, perf="all"):
        """Only for classifier models.

        Return AUC, ROC curve, confusion matrix and F1 score from a classifier and data.
        You can pass a comma-separated list of evaluations instead of a single value
        for the perf parameter. Example: perf='auc,roc,cm,f1' returns all four.
        """
        evals = {}
        y_pred_proba = clf.predict_proba(x)[:, 1]
        y_pred = clf.predict(x)
        perf_list = perf.split(',')
        if "all" in perf_list or "roc" in perf_list:
            fpr, tpr, _ = roc_curve(y, y_pred_proba)
            roc_auc = round(auc(fpr, tpr), 3)
            plt.style.use('bmh')
            plt.figure(figsize=(12, 9))
            plt.title('ROC Curve')
            plt.plot(fpr, tpr, 'b',
                     label='AUC = {}'.format(roc_auc))
            plt.legend(loc='lower right', borderpad=1, labelspacing=1,
                       prop={"size": 12}, facecolor='white')
            plt.plot([0, 1], [0, 1], 'r--')
            plt.xlim([-0.1, 1.])
            plt.ylim([-0.1, 1.])
            plt.ylabel('True Positive Rate')
            plt.xlabel('False Positive Rate')
            plt.show()
        if "all" in perf_list or "auc" in perf_list:
            fpr, tpr, _ = roc_curve(y, y_pred_proba)
            evals['auc'] = auc(fpr, tpr)
        if "all" in perf_list or "cm" in perf_list:
            evals['cm'] = confusion_matrix(y, y_pred)
        if "all" in perf_list or "f1" in perf_list:
            evals['f1'] = f1_score(y, y_pred)
        return evals


path = '~/proj/akd-doc/notebooks/data/'
df = pd.read_csv(path + 'titanic_tuto.csv', sep=';')
y = df.pop('Survival-Status').replace(to_replace=['dead', 'alive'],
                                      value=[0., 1.])
X = df.copy()
X_train, X_test, y_train, y_test = train_test_split(
    X.copy(), y.copy(), test_size=0.2, random_state=42)
percent = 0.50
nb_features = round(percent * df.shape[1]) + 1
clf = RandomForestClassifier()
pipeline = Pipeline([('fillna', FillNa()),
                     ('categorical_to_numerical', CategoricalToNumerical()),
                     ('features_selection', SelectKBest(k=nb_features)),
                     ('random_forest', clf),
                     ('perf', Perf())])
params = dict(random_forest__max_depth=list(range(8, 12)),
              random_forest__n_estimators=list(range(30, 110, 10)))
cv = GridSearchCV(pipeline, param_grid=params)
cv.fit(X_train, y_train)
I am aware that plotting a ROC curve inside the pipeline is not ideal, but that's not the problem right now.
So, when I execute this code I have:
TypeError: If no scoring is specified, the estimator passed should have a 'score' method. The estimator Pipeline(steps=[('fillna', FillNa()), ('categorical_to_numerical', CategoricalToNumerical()), ('features_selection', SelectKBest(k=10, score_func=<function f_classif at 0x7f4ed4c3eae8>)), ('random_forest', RandomForestClassifier(bootstrap=True, class_weight=None, criterion='gini',
max_depth=None,...=1, oob_score=False, random_state=None,
verbose=0, warm_start=False)), ('perf', Perf())]) does not.
I'm interested in all ideas...
As the error states, you need to specify the scoring parameter in GridSearchCV.
Use
GridSearchCV(pipeline, param_grid=params, scoring = 'accuracy')
Edit (Based on questions in comments):
If you need the ROC curve, AUC and F1 for the entire X_train and y_train (and not for each split of GridSearchCV), it's better to keep the Perf class out of the pipeline.
pipeline = Pipeline([('fillna', FillNa()),
                     ('categorical_to_numerical', CategoricalToNumerical()),
                     ('features_selection', SelectKBest(k=nb_features)),
                     ('random_forest', clf)])

# Fit the data in the pipeline
pipeline.fit(X_train, y_train)

performance_meas = Perf()
performance_meas.fit(pipeline, X_train, y_train)
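Once the pipeline is fitted, the same Perf instance can also be run on the held-out data (a usage sketch, assuming the X_test / y_test split created earlier in the question's code):
# Sketch: evaluate the fitted pipeline on the held-out split, skipping the ROC plot
test_evals = performance_meas.fit(pipeline, X_test, y_test, perf='auc,cm,f1')
print(test_evals)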
