Random in SVM answer - python

I am using a svm to see if I can take baseball data and classify hits and estimate home runs. I seem to get different results when I run the model multiple times, as a result, I made into a simulation and it runs the model 100 times, but I don't understand why and what is causing the variation. Can someone please explain why this would be? I did set random_state=42
import pandas as pd
from mlxtend.plotting import plot_decision_regions
import matplotlib.pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
from sklearn.metrics import accuracy_score, f1_score, precision_score, recall_score, classification_report, confusion_matrix
from sklearn import metrics
import statistics
import numpy as np
result_array = []
players = [488768, 517369, 461314, 477165, 506560, 572114, 641319, 592669, 622534, 605486, 602922, 518466, 572362, 519082, 623182, 595978, 543272]
dfSave = pd.DataFrame(columns=['Mean','Max','Min','Std', 'Accuracy', 'Precision', 'f1_score', 'Recall_Score', 'First_Name', 'Last_Name'])
for i in players:
batter = i
df = pd.read_csv('D:baseballData_2016_use.csv')
df2 = pd.read_csv('D:padres_2016_home.csv') #Team to test
dataFilter = df.loc[df['Home_Team'] == 'Orioles'] #Stadium to train model to.
dataFilter2 = df2.loc[df2['Batter_ID'] == batter] #Players to test in stadium
j = 0
while j <= 100:
predict = dataFilter2.iloc[:,[4,5]]
X =dataFilter.iloc[:,[4,5]]
y = dataFilter.iloc[:,3]
y = y.astype(np.integer)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30)
svclassifier = SVC(C=4, cache_size=200, class_weight= None, coef0=0.0,
decision_function_shape='ovo', degree=3, gamma=0.001, kernel='rbf',
max_iter=-1, probability=False, random_state=42, shrinking=False,
tol=0.001, verbose=False) #defaults
svclassifier.fit(X_train, y_train)
y_pred = svclassifier.predict(X_test)
predicted= svclassifier.predict(predict)
listDf = []
sum = 0 # print predicted home runs
for i in predicted:
if i == 1:
sum = sum + 1
result_array.append(sum)
print(sum)
j = j + 1
firstName = dataFilter2.loc[1:,'Batter_First_Name'].values
lastName = dataFilter2.loc[1:,'Batter_Last_Name'].values
listDf.append({'Mean': statistics.mean(result_array),'Max' : max(result_array),'Min' : min(result_array),'Std' : statistics.stdev(result_array),
'Accuracy' : metrics.accuracy_score(y_test, y_pred), 'Precision' : precision_score(y_test, y_pred, average="macro"), 'f1_score' :f1_score(y_test, y_pred, average="macro"),
"Recall_Score" : recall_score(y_test, y_pred, average="macro"), 'First_Name' : firstName[0],'Last_Name' : lastName[0]})
dfSave = pd.DataFrame(listDf)
dfSave.to_csv('D:test9999.csv', mode='a')
result_array = []

In your code, the randomness comes from train_test_split giving a different split at each run.
You can avoid this by fixing random_state but it is considered a better practice to run it multiple times (as you did), get the distribution of the output score, calculate the confidence interval on the score and report that.

Related

Explication cross_val_score scikit_learn parameter cv

I don't understand why i have different result in this configuration of cross_val_score
and a simple model.
from sklearn.datasets import load_iris
from sklearn.utils import shuffle
from sklearn import tree
import numpy as np
np.random.seed(1234)
iris = load_iris()
X, y = iris.data, iris.target
X,y = shuffle(X,y)
print(y)
clf = tree.DecisionTreeClassifier(max_depth=2,class_weight={2: 0.3, 1: 10,0:0.3},random_state=1234)
clf2 = clf.fit(X, y)
tree.plot_tree(clf2)
from sklearn.metrics import confusion_matrix
from sklearn.metrics import accuracy_score
predi = clf2.predict(X)
cm = confusion_matrix(y_true=y, y_pred=predi)
print(cm)
print("Accuracy = ",round(accuracy_score(y,predi)* 100.0,2))
from sklearn.model_selection import cross_val_score,cross_val_predict
max_id = len(X)
limit = round(max_id*0.6,0)
min_id=0
train = np.arange(0,limit)
test = np.arange(limit,max_id)
test = [int(x) for x in test]
train = [int(x) for x in train]
print(train)
print(test)
predi = cross_val_score(clf,X,y,cv=[(train,test)])
print(predi)
train = X[train[0]:train[-1]]
y_train = y[train[0]:train[-1]]
Xtest = X[test[0]:test[-1]]
y_test = y[test[0]:test[-1]]
clf3 = clf.fit(Xtrain,y_train)
predi = clf3.predict(Xtest)
cm = confusion_matrix(y_true=y_test, y_pred=predi)
print(cm)
print("Accuracy = ",round(accuracy_score(y_test,predi)* 100.0,2))
I don't understand why i have different accuracy whereas i have the same parameters en the same train test sample
Basically, the kind of data split you use will have an impact on your model accuracy. This is well documented in machine learning field. Secondly, your first model is strictly biased as you have used your training set for testing which will result in ~100% accuracy.
https://www.analyticsvidhya.com/blog/2021/05/4-ways-to-evaluate-your-machine-learning-model-cross-validation-techniques-with-python-code/
https://towardsdatascience.com/train-test-split-c3eed34f763b

How to evaluate the effect of different methods of handling missing values?

I am a total beginner and I am trying to compare different methods of handling missing data. In order to evaluate the effect of each method (drop raws with missing values, drop columns with missigness over 40%, impute with the mean, impute with the KNN), I compare the results of the LDA accuracy and LogReg accuracy on the training set between a dataset with 10% missing values, 20% missing values against the results of the original complete dataset. Unfortunately, I get pretty much the same results even between the complete dataset and the dataset with 20% missing-ness. I don't know what I am doing wrong.
from numpy import nan
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#dataset = read_csv('telecom_churn_rev10.csv')
dataset = read_csv('telecom_churn_rev20.csv')
dataset = dataset.replace(nan, 0)
values = dataset.values
X = values[:,1:11]
y = values[:,0]
dataset.fillna(dataset.mean(), inplace=True)
#dataset.fillna(dataset.mode(), inplace=True)
print(dataset.isnull().sum())
imputer = SimpleImputer(missing_values = nan, strategy = 'mean')
transformed_values = imputer.fit_transform(X)
print('Missing: %d' % isnan(transformed_values).sum())
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
result = cross_val_score(model, X, y, cv = cv, scoring = 'accuracy')
print('Accuracy: %.3f' % result.mean())
#print('Accuracy: %.3f' % result.mode())
print(dataset.describe())
print(dataset.head(20))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test,y_pred)
from sklearn import metrics
# make predictions on X
expected = y
predicted = classifier.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# make predictions on X test
expected = y_test
predicted = classifier.predict(X_test)
# summarize the fit of the model
print(metrics.confusion_matrix(expected, predicted))
print(metrics.classification_report(expected, predicted))
You replace all your missing values with 0 at that line : dataset = dataset.replace(nan, 0). After this line, you have a full dataset without missing values. So, the .fillna() and the SimpleImputer() are useless after that line.

modelselection.Kfold gives different results than kf.split

I am working on a dataset TelcoSigtel which has 5k observations, 21 features, and an imbalanced target with 86% non-churner and 16% churner.
Sorry, I wanted to give an extract of the dataframe but it is way too big or when I try to take a small bunch there are not enough churners.
My problem is the following those two methods below should give the same results but it is dramatically different on some algorithms and on some other they give the exact same results.
Information about the dataset:
models = [('logit',
LogisticRegression(C=1.0, class_weight=None, dual=False, fit_intercept=True,
intercept_scaling=1, l1_ratio=None, max_iter=600,
multi_class='ovr', n_jobs=1, penalty='l2', random_state=None,
solver='liblinear', tol=0.0001, verbose=0, warm_start=False)), ....]
# Method 1:
from sklearn import model_selection
from sklearn.model_selection import KFold
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
results = []
names = []
seed = 0
scoring = "roc_auc"
for name, model in models:
kfold = model_selection.KFold(n_splits = 5, random_state = seed)
cv_results = model_selection.cross_val_score(model, X, Y, cv=kfold, scoring=scoring)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
# Method 2:
from sklearn.model_selection import KFold
from imblearn.over_sampling import SMOTE
from sklearn.metrics import roc_auc_score
kf = KFold(n_splits=5, random_state=0)
X = telcom.drop("churn", axis=1)
Y = telcom["churn"]
results = []
names = []
to_store1 = list()
seed = 0
scoring = "roc_auc"
cv_results = np.array([])
for name, model in models:
for train_index, test_index in kf.split(X):
# split the data
X_train, X_test = X.loc[train_index,:].values, X.loc[test_index,:].values
y_train, y_test = np.ravel(Y[train_index]), np.ravel(Y[test_index])
model = model # Choose a model here
model.fit(X_train, y_train )
y_pred = model.predict(X_test)
to_store1.append(train_index)
# store fold results
result = roc_auc_score(y_test, y_pred)
cv_results = np.append(cv_results, result)
results.append(cv_results)
names.append(name)
msg = "%s: %f (%f)" % (name, cv_results.mean(), cv_results.std())
print(msg)
cv_results = np.array([])
# boxplot algorithm comparison
fig = plt.figure()
fig.suptitle('Algorithm Comparison-AUC')
ax = fig.add_subplot(111)
plt.boxplot(results)
ax.set_xticklabels(names)
plt.grid()
plt.show()
The short answer is that you should use model.predict_proba(X_test)[:, 1] or model.decision_function(X_test) to get identical results since roc auc scorer needs class probabilities. The long answer is that you can reproduce the same behavior with a toy example:
import numpy as np
from sklearn.model_selection import KFold, cross_val_score
from sklearn.linear_model import LogisticRegression
from sklearn.datasets import load_breast_cancer
from sklearn.metrics import roc_auc_score, make_scorer
def assert_equal_scores(rnd_seed, needs_threshold):
"""Assert two different scorings, return equal results."""
X, y, *_ = load_breast_cancer().values()
kfold = KFold(random_state=rnd_seed)
lr = LogisticRegression(random_state=rnd_seed + 10)
roc_auc_scorer = make_scorer(roc_auc_score, needs_threshold=needs_threshold)
cv_scores1 = cross_val_score(lr, X, y, cv=kfold, scoring=roc_auc_scorer)
cv_scores2 = cross_val_score(lr, X, y, cv=kfold, scoring='roc_auc')
np.testing.assert_equal(cv_scores1, cv_scores2)
Try assert_equal_scores(10, False) and assert_equal_scores(10, True) (or any other random seed). The first one raises an AssertionError. The difference is that roc auc scorer requires the needs_threshold parameter to be True.

Why can't I match LGBM's cv score?

I'm unable to match LGBM's cv score by hand.
Here's a MCVE:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
folds = KFold(5, random_state=42)
params = {'random_state': 42}
results = lgb.cv(params, lgb.Dataset(X_train, y_train), folds=folds, num_boost_round=1000, early_stopping_rounds=100, metrics=['auc'])
print('LGBM\'s cv score: ', results['auc-mean'][-1])
clf = lgb.LGBMClassifier(**params, n_estimators=len(results['auc-mean']))
val_scores = []
for train_idx, val_idx in folds.split(X_train):
clf.fit(X_train.iloc[train_idx], y_train.iloc[train_idx])
val_scores.append(roc_auc_score(y_train.iloc[val_idx], clf.predict_proba(X_train.iloc[val_idx])[:,1]))
print('Manual score: ', np.mean(np.array(val_scores)))
I was expecting the two CV scores to be identical - I have set random seeds, and done exactly the same thing. Yet they differ.
Here's the output I get:
LGBM's cv score: 0.9851513530737058
Manual score: 0.9903622177441328
Why? Am I not using LGMB's cv module correctly?
You are splitting X into X_train and X_test.
For cv you split X_train into 5 folds while manually you split X into 5 folds. i.e you use more points manually than with cv.
change results = lgb.cv(params, lgb.Dataset(X_train, y_train) to results = lgb.cv(params, lgb.Dataset(X, y)
Futhermore, there can be different parameters. For example, the number of threads used by lightgbm changes the result. During cv the models are fitted in parallel. Hence the number of threads used might differ from your manual sequential training.
EDIT after 1st correction:
You can achieve the same results using manual splitting / cv using this code:
from sklearn.datasets import load_breast_cancer
import pandas as pd
from sklearn.model_selection import train_test_split, KFold
from sklearn.metrics import roc_auc_score
import lightgbm as lgb
import numpy as np
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = pd.Series(data.target)
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=42)
folds = KFold(5, random_state=42)
params = {
'task': 'train',
'boosting_type': 'gbdt',
'objective':'binary',
'metric':'auc',
}
data_all = lgb.Dataset(X_train, y_train)
results = lgb.cv(params, data_all,
folds=folds.split(X_train),
num_boost_round=1000,
early_stopping_rounds=100)
print('LGBM\'s cv score: ', results['auc-mean'][-1])
val_scores = []
for train_idx, val_idx in folds.split(X_train):
data_trd = lgb.Dataset(X_train.iloc[train_idx],
y_train.iloc[train_idx],
reference=data_all)
gbm = lgb.train(params,
data_trd,
num_boost_round=len(results['auc-mean']),
verbose_eval=100)
val_scores.append(roc_auc_score(y_train.iloc[val_idx], gbm.predict(X_train.iloc[val_idx])))
print('Manual score: ', np.mean(np.array(val_scores)))
yields
LGBM's cv score: 0.9914524426410262
Manual score: 0.9914524426410262
What makes the difference is this line reference=data_all. During cv, the binning of the variables (refers to lightgbm doc) is constructed using the whole dataset (X_train) while in you manual for loop it was built on the training subset (X_train.iloc[train_idx]). By passing the reference to the dataset containg all the data, lightGBM will reuse the same binning, giving same results.

How to run through loop to use non-scaled and scaled data in python for loop

I have the following code running through and fitting a model on the iris data using different modeling techniques. How can I add a second step in this process so I can demonstrate the improvement between using scaled and non-scaled data?
I don't need to run the scale transform outside of the loop, i was just having a lot of issues with transforming the data type from pandas dataframe to np array and back again.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.cross_validation import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
sc = StandardScaler()
X_train_scale = sc.fit_transform(X_train)
X_test_scale = sc.transform(X_test)
numFolds = 10
kf = KFold(len(y_train), numFolds, shuffle=True)
# These are "Class objects". For each Class, find the AUC through
# 10 fold cross validation.
Models = [LogisticRegression, svm.SVC]
params = [{},{}]
for param, Model in zip(params, Models):
total = 0
for train_indices, test_indices in kf:
train_X = X_train[train_indices]; train_Y = y_train[train_indices]
test_X = X_train[test_indices]; test_Y = y_train[test_indices]
reg = Model(**param)
reg.fit(train_X, train_Y)
predictions = reg.predict(test_X)
total += accuracy_score(test_Y, predictions)
accuracy = total / numFolds
print ("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
So ideally my output would be:
CV standard accuracy score of LogisticRegression: 0.683333
CV scaled accuracy score of LogisticRegression: 0.766667
CV standard accuracy score of SVC: 0.766667
CV scaled accuracy score of SVC: 0.783333
It seems like this is unclear, I am trying to loop through scaled and unscaled data, similar to how I am looping through the different ML algorithms.
I wanted to follow up with this. I was able to do this by creating a pipeline and using gridsearchCV
pipe = Pipeline([('scale', StandardScaler()),
('clf', LogisticRegression())])
param_grid = [{
'scale':[None,StandardScaler()],
'clf':[SVC(),LogisticRegression()]}]
grid_search = GridSearchCV(pipe, param_grid=param_grid,n_jobs=-1, verbose=1 )
In the end this got me the results I wanted and was able to test easily how to work between scaling and not scaling.
try this:
from __future__ import division
for param, Model in zip(params, Models):
total = 0
for train_indices, test_indices in kf:
train_X = X_train[train_indices]; train_Y = y_train[train_indices]
test_X = X_train[test_indices]; test_Y = y_train[test_indices]
reg = Model(**param)
reg.fit(train_X, train_Y)
predictions = reg.predict(test_X)
total += accuracy_score(test_Y, predictions)
accuracy = total / numFolds
print ("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
# added to your code
if previous_accuracy:
improvement = 1 - (accuracy / previous_accuracy)
print "CV accuracy score improved by", improvement
else:
previous_accuracy = accuracy

Categories

Resources