import csv
import pandas as pd
import numpy as np
from matplotlib import pyplot
import shap
from sklearn import preprocessing
from sklearn.preprocessing import StandardScaler
df1=pd.read_csv("./wine.data",sep=",",encoding='utf_8_sig')
X_train = df1
le = preprocessing.LabelEncoder()
X_train['alc_class'] = le.fit_transform(X_train.alc_class.values)
print(X_train.columns)
print(X_train.describe())
y = X_train['alc_class']
X = X_train.drop(columns='alc_class')
import xgboost as xgb
# split X and y into training and testing sets
from sklearn.model_selection import train_test_split
from sklearn.model_selection import GridSearchCV, RandomizedSearchCV
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.30, random_state = 2100, stratify = y)
# import XGBClassifier
import xgboost as xgb
from sklearn.metrics import mean_squared_error
DM_train = xgb.DMatrix(data=X_train, label=y_train)
DM_test = xgb.DMatrix(data=X_test, label=y_test)
xgb_param_grid = {
    'colsample_bytree': np.linspace(0.5, 0.9, 2),
    'n_estimators': [30],
    'max_depth': [5],
    'learning_rate': [0.01],
    'alpha': [10],
    'objective': ['binary:logistic'],
    'tree_method': ['hist'],
    'min_child_weight': [1],
    'gamma': [0.5],
    'subsample': [0.6],
}
# instantiate the classifier
xgb_clf = xgb.XGBClassifier(use_label_encoder=False, eval_metric="auc")
# perform 5 fold cross-validation using mean square error as a scoring method
grid_mse = GridSearchCV(estimator = xgb_clf, param_grid = xgb_param_grid, scoring = 'neg_mean_squared_error', cv = 5, verbose = 1)
# Fit grid_mse to the data, get best parameters and best score (lowest RMSE)
grid_mse.fit(X_train, y_train)
print("Best parameters found: ",grid_mse.best_params_)
print("Lowest RMSE found: ", np.sqrt(np.abs(grid_mse.best_score_)))
#Predict using the test data
y_pred = grid_mse.predict(X_test)
y_pred_prob = grid_mse.predict_proba(X_test)
print("Root mean square error for test dataset: {}".format(np.round(np.sqrt(mean_squared_error(y_test, y_pred)), 2)))
from sklearn.metrics import accuracy_score, roc_curve, auc,recall_score,precision_score, precision_recall_curve,f1_score, classification_report, confusion_matrix,roc_auc_score
print('XGBoost model accuracy score: {0:0.4f}'. format(accuracy_score(y_test, y_pred)))
print('XGBoost model F1 score: {0:0.4f}'. format(f1_score(y_test, y_pred, average='weighted')))
# precision_recall_curve expects scores/probabilities, not hard class labels
precision, recall, thresholds = precision_recall_curve(y_test, y_pred_prob[:, 1])
area = auc(recall, precision)
print("----------------")
print("\n\n Evaluation Metrics \n\n")
aucroc_score = roc_auc_score(y_test, y_pred_prob[:,1])
print("Area Under ROC Curve: ",aucroc_score)
# roc curve for models
fpr, tpr, thresh = roc_curve(y_test, y_pred_prob[:,1], pos_label=1)
# roc curve for tpr = fpr
random_probs = [0 for i in range(len(y_test))]
p_fpr, p_tpr, _ = roc_curve(y_test, random_probs, pos_label=1)
print("confusion_matrix ", confusion_matrix(y_test,y_pred))
print("classification_report ", classification_report(y_test,y_pred))
explainer = shap.TreeExplainer(grid_mse.best_estimator_)
shap_values = explainer(X_train)
shap.plots.beeswarm(shap_values, plot_size = 1.8, max_display = 13)
print(grid_mse.best_estimator_.feature_importances_)
for col, score in zip(X_train.columns, grid_mse.best_estimator_.feature_importances_):
    print('%s, %0.3f' % (col, score))
I have long feature names, and when I plot the beeswarm SHAP plot the feature names get truncated. I would like the full feature names to be displayed on the y-axis. Any help would be greatly appreciated.
I have tried changing the plot size, but it did not work.
Add a flag to hide the plot, then save the figure with a tight bounding box so the long feature names are not cut off:
import matplotlib.pyplot as plt
path = 'save_path_here.png'
shap.plots.beeswarm(shap_values, plot_size=1.8, max_display=13, show=False)
plt.savefig(path, bbox_inches='tight', dpi=300)
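If you want to display the figure interactively instead of saving it, a similar effect can usually be achieved by enlarging the current figure and applying a tight layout before showing it. This is only a minimal sketch, assuming the shap_values object from the code above; the figure size is an arbitrary choice you may need to tune:
import matplotlib.pyplot as plt
# draw the beeswarm plot without showing it, so the figure can still be adjusted
shap.plots.beeswarm(shap_values, max_display=13, show=False)
fig = plt.gcf()
fig.set_size_inches(12, 8)  # widen the figure so long feature names have room
plt.tight_layout()          # shrink the margins so labels are not clipped
plt.show()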
I got an error while using the SVM and MLP classifiers from the sklearn package. The error is:
C:\Users\cse_s\anaconda3\lib\site-packages\sklearn\metrics\_classification.py:1327: UndefinedMetricWarning: Precision and F-score are ill-defined and being set to 0.0 in labels with no predicted samples. Use zero_division parameter to control this behavior.
_warn_prf(average, modifier, msg_start, len(result))
Code for splitting dataset
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3, stratify=y)
Code for SVM classifier
from sklearn import svm
from sklearn.metrics import classification_report
SVM_classifier = svm.SVC(kernel="rbf", probability = True, random_state=1)
SVM_classifier.fit(X_train, y_train)
SVM_y_pred = SVM_classifier.predict(X_test)
print(classification_report(y_test, SVM_y_pred))
Code for MLP classifier
from sklearn.neural_network import MLPClassifier
MLP = MLPClassifier(random_state=1, learning_rate = "constant", learning_rate_init=0.3, momentum = 0.2 )
MLP.fit(X_train, y_train)
R_y_pred = MLP.predict(X_test)
target_names = ['No class', 'Yes Class']
print(classification_report(y_test, R_y_pred, target_names=target_names))
The error is the same for both classifiers.
I hope this helps.
classification_report has a zero_division parameter that sets the value to return when a zero division occurs in the precision or recall formula; you can pass it 0 or 1:
classification_report(y_test, R_y_pred, target_names=target_names, zero_division=0)
I don't know what your data looks like, so here's an example.
Features of the breast cancer dataset:
import pandas as pd
import numpy as np
from sklearn import svm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from sklearn.neural_network import MLPClassifier
from sklearn.metrics import classification_report
cancer = load_breast_cancer()
df_feat = pd.DataFrame(cancer['data'],columns=cancer['feature_names'])
df_feat.head()
Target of the dataset:
df_target = pd.DataFrame(cancer['target'],columns=['Cancer'])
np.ravel(df_target) # convert it into a 1-d array
Generate classification report:
X_train, X_test, y_train, y_test = train_test_split(df_feat, np.ravel(df_target), test_size=0.3, random_state=101)
SVM_classifier = svm.SVC(kernel="rbf", probability = True, random_state=1)
SVM_classifier.fit(X_train, y_train)
SVM_y_pred = SVM_classifier.predict(X_test)
print(classification_report(y_test, SVM_y_pred))
Generate classification report for MLP Classifier:
MLP = MLPClassifier(random_state=1, learning_rate = "constant", learning_rate_init=0.3, momentum = 0.2 )
MLP.fit(X_train, y_train)
R_y_pred = MLP.predict(X_test)
target_names = ['No class', 'Yes Class']
print(classification_report(y_test, R_y_pred, target_names=target_names, zero_division=0))
I want to create a Negative Predictive Value (NPV) metric to evaluate data inside a GridSearchCV.
I prepared an example with the iris dataset.
from sklearn import svm, datasets
import numpy as np
import pandas as pd
from sklearn.metrics import make_scorer
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
from sklearn.model_selection import train_test_split
iris = datasets.load_iris()
X = iris.data
target = iris.target
names = iris.target_names
df = pd.DataFrame(X, columns=iris.feature_names)
df['species'] = iris.target
#df['species'] = df['species'].replace(to_replace= [0, 1, 2], value = ['setosa', 'versicolor', 'virginica'])
indexNames = df[ df['species'] == 2 ].index
df.drop(indexNames , inplace=True)
x_data = pd.DataFrame({'sepal length': df[df.columns[0]],
                       'sepal width': df[df.columns[1]],
                       'petal length': df[df.columns[2]],
                       'petal width': df[df.columns[3]]})
y_data = pd.DataFrame(df['species']).astype(float)
x_train, x_test, y_train, y_test = train_test_split(x_data, y_data, test_size=0.5)
parameters = {
    'kernel': ('linear', 'rbf'),
    'C': [1, 10]
}
svc = svm.SVC()
scoring = ['accuracy']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           scoring=scoring,
                           refit='accuracy',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)
grid_result = grid_search.fit(x_train, y_train.values.ravel())
grid_result
print(f'best accuracy score train set {grid_result.best_score_:.4f}')
print(f'best hyperparameters {grid_result.best_params_}')
print(f'accuracy score test set {grid_search.score(x_test, y_test):.4f}')
Unfortunately, this example code is not working as intended: the accuracy is always 1. That is one issue I ran into when I adapted my real code to an example with the iris data.
The code below works, but it is for accuracy only, and I need a binary classification setup for metrics like precision or NPV.
from sklearn import svm, datasets
import numpy as np
from sklearn.metrics import make_scorer
from sklearn.model_selection import GridSearchCV
from sklearn.model_selection import StratifiedKFold, GridSearchCV, RandomizedSearchCV, cross_val_score
iris = datasets.load_iris()
parameters = {'kernel':('linear', 'rbf'), 'C':[1, 10]}
svc = svm.SVC()
scoring = ['accuracy']
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           scoring=scoring,
                           refit='accuracy',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)
grid_result = grid_search.fit(iris.data, iris.target)
print(f'best accuracy score {grid_result.best_score_:.4f}')
print(f'best hyperparameters {grid_result.best_params_}')
print(f'accuracy score {grid_search.score(iris.data, iris.target):.4f}')
To sum up, why is the first code block not working as intended, and what would an implementation with the NPV metric look like?
Ideas are welcome.
Also, in other classification tasks I wrote out columns with my predicted values, so I could compute TP, FP, TN, and FN directly and derive my metrics from them. Inside the GridSearchCV I could not figure out how to access the predictions.
In your first code block, the data is simply very well separated. To remove any doubt, print the confusion matrix; you will see that this is the case.
from sklearn.metrics import confusion_matrix
y_pred = grid_search.best_estimator_.predict(x_test)
print(confusion_matrix(y_test, y_pred))
Output
[[22 0]
[ 0 28]]
As for custom criteria, you can use make_scorer, which wraps a custom_scorer function that computes whatever you want. I took the confusion_matrix and extracted the values from it to calculate the score. You should double-check the formula itself, because I may have it wrong.
from sklearn.metrics import confusion_matrix
from sklearn.metrics import make_scorer
def custom_scorer(y_true, y_pred, **kwargs):
    cnm = confusion_matrix(y_true, y_pred)
    return cnm[1, 1] / (cnm[1, 1] + cnm[0, 1])
mysc = make_scorer(custom_scorer, greater_is_better=True)
kfold = StratifiedKFold(n_splits=3, shuffle=True, random_state=0)
grid_search = GridSearchCV(estimator=svc,
                           param_grid=parameters,
                           scoring=mysc,
                           refit='accuracy',
                           n_jobs=-1,
                           cv=kfold,
                           verbose=0)
grid_result = grid_search.fit(x_train, y_train.values.ravel())
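For reference, the ratio returned by custom_scorer above is TP / (TP + FP), i.e. precision (PPV). If the goal is really NPV (TN / (TN + FN)), a minimal sketch of such a scorer could look like the following, assuming a binary problem with labels {0, 1} and sklearn's confusion-matrix layout [[TN, FP], [FN, TP]]:
from sklearn.metrics import confusion_matrix, make_scorer
def npv_score(y_true, y_pred, **kwargs):
    # ravel() on a 2x2 confusion matrix returns TN, FP, FN, TP in that order
    tn, fp, fn, tp = confusion_matrix(y_true, y_pred).ravel()
    # NPV: fraction of predicted negatives that are truly negative
    return tn / (tn + fn) if (tn + fn) > 0 else 0.0
npv_scorer = make_scorer(npv_score, greater_is_better=True)
# this scorer can then be passed to GridSearchCV via scoring=npv_scorer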
I am a total beginner, and I am trying to compare different methods of handling missing data. To evaluate the effect of each method (drop rows with missing values, drop columns with more than 40% missingness, impute with the mean, impute with KNN), I compare the LDA accuracy and LogReg accuracy on the training set between a dataset with 10% missing values, a dataset with 20% missing values, and the original complete dataset. Unfortunately, I get pretty much the same results even between the complete dataset and the dataset with 20% missingness. I don't know what I am doing wrong.
from numpy import nan
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#dataset = read_csv('telecom_churn_rev10.csv')
dataset = read_csv('telecom_churn_rev20.csv')
dataset = dataset.replace(nan, 0)
values = dataset.values
X = values[:,1:11]
y = values[:,0]
dataset.fillna(dataset.mean(), inplace=True)
#dataset.fillna(dataset.mode(), inplace=True)
print(dataset.isnull().sum())
imputer = SimpleImputer(missing_values = nan, strategy = 'mean')
transformed_values = imputer.fit_transform(X)
print('Missing: %d' % isnan(transformed_values).sum())
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
result = cross_val_score(model, X, y, cv = cv, scoring = 'accuracy')
print('Accuracy: %.3f' % result.mean())
#print('Accuracy: %.3f' % result.mode())
print(dataset.describe())
print(dataset.head(20))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test,y_pred)
from sklearn import metrics
# make predictions on X
expected = y
predicted = classifier.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# make predictions on X test
expected = y_test
predicted = classifier.predict(X_test)
# summarize the fit of the model
print(metrics.confusion_matrix(expected, predicted))
print(metrics.classification_report(expected, predicted))
You replace all of your missing values with 0 on this line: dataset = dataset.replace(nan, 0). After it runs, you have a full dataset with no missing values, so the .fillna() and the SimpleImputer() that follow have nothing left to do, and every variant you evaluate ends up fitting on essentially the same zero-filled data.
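A minimal sketch of how the mean-imputation variant could be evaluated instead, keeping the NaNs until the imputer sees them; this assumes the same file and column layout as in your script, with the target in column 0 and the features in columns 1 to 10:
from numpy import nan
from pandas import read_csv
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold, cross_val_score
dataset = read_csv('telecom_churn_rev20.csv')  # keep the NaNs, do not replace them with 0
values = dataset.values
X = values[:, 1:11]
y = values[:, 0]
# impute the missing feature values with the column mean
imputer = SimpleImputer(missing_values=nan, strategy='mean')
X_imputed = imputer.fit_transform(X)
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits=3, shuffle=True, random_state=1)
result = cross_val_score(model, X_imputed, y, cv=cv, scoring='accuracy')
print('Accuracy with mean imputation: %.3f' % result.mean())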
Let's take some data:
import numpy as np
import pandas as pd
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
data = load_breast_cancer()
X = data.data
y = data.target
I want to create a model using only the first principal component and calculate the AUC for it.
My work so far:
scaler = StandardScaler()
X_scaled = scaler.fit_transform(X)
pca = PCA(n_components=1)
principalComponents = pca.fit_transform(X_scaled)
principalDf = pd.DataFrame(data=principalComponents, columns=['principal component 1'])
clf = LogisticRegression()
clf = clf.fit(principalDf, y)
pred = clf.predict_proba(principalDf)
But when I try to use
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=2)
the following error occurs:
y should be a 1d array, got an array of shape (569, 2) instead.
I tried to reshape my data
fpr, tpr, thresholds = metrics.roc_curve(y.reshape(1,-1), pred, pos_label=2)
but it didn't solve the issue (it outputs):
multilabel-indicator format is not supported
Do you have any idea how I can compute the AUC using only this first principal component?
You may wish to try:
from sklearn.datasets import load_breast_cancer
from sklearn.decomposition import PCA
from sklearn import datasets
from sklearn.preprocessing import StandardScaler
from sklearn import metrics
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
X,y = load_breast_cancer(return_X_y=True)
X_train, X_test, y_train, y_test = train_test_split(X,y)
scaler = StandardScaler()
pca = PCA(2)
clf = LogisticRegression()
ppl = Pipeline([("scaler",scaler),("pca",pca),("clf",clf)])
ppl.fit(X_train, y_train)
preds = ppl.predict(X_test)
fpr, tpr, thresholds = metrics.roc_curve(y_test, preds, pos_label=1)
metrics.plot_roc_curve(ppl, X_test, y_test)
The problem is that predict_proba returns one column per class. With binary classification your classes are 0 and 1, and you usually want the probability of the positive class, so it's common to slice as follows (replacing the last line in your code block):
pred = clf.predict_proba(principalDf)[:, 1]
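With that slice in place, the ROC computation from the question should work. A quick follow-up, assuming the clf, principalDf, y and metrics names from your snippet (note pos_label=1, since the breast cancer labels are 0/1):
pred = clf.predict_proba(principalDf)[:, 1]
fpr, tpr, thresholds = metrics.roc_curve(y, pred, pos_label=1)
print('AUC:', metrics.auc(fpr, tpr))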
I want to use nested cross-validation with grid search for a 2-class classification problem, using roc_auc as the scorer. I also want to print the classification report, so I have tried to create a simple custom scorer function that prints a classification report. However, I get a different nested_score with the two functions. Here is an example using the breast cancer dataset, adapted from sklearn's example (https://scikit-learn.org/stable/auto_examples/model_selection/plot_nested_cross_validation_iris.html):
from sklearn.datasets import load_breast_cancer
from matplotlib import pyplot as plt
from sklearn.svm import SVC
from sklearn.model_selection import GridSearchCV, cross_val_score, KFold
from sklearn import metrics
from sklearn.metrics import classification_report, make_scorer
import numpy as np
def classification_report_with_roc_score(y_true, y_pred):
    print(classification_report(y_true, y_pred))  # print classification report
    fpr, tpr, thresholds = metrics.roc_curve(y_true, y_pred)
    roc_auc = metrics.auc(fpr, tpr)
    return roc_auc  # return AUC score
NUM_TRIALS = 1
breast_cancer = load_breast_cancer()
X_cancer = breast_cancer.data
y_cancer = breast_cancer.target
p_grid = {"C": [1, 10, 100],
          "gamma": [.01, .1]}
svm = SVC(kernel="rbf")
for i in range(NUM_TRIALS):
    inner_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    outer_cv = KFold(n_splits=4, shuffle=True, random_state=i)
    clf = GridSearchCV(estimator=svm, param_grid=p_grid, cv=inner_cv)
    nested_score = cross_val_score(clf, X=X_cancer, y=y_cancer, scoring='roc_auc', cv=outer_cv)
    print('nested_score', nested_score)
    custom_nested_score = cross_val_score(clf, X=X_cancer, y=y_cancer,
                                          scoring=make_scorer(classification_report_with_roc_score), cv=outer_cv)
    print('nested_score_custom', custom_nested_score)
The result is
nested_score [0.9836478 0.97074468 0.97853535 0.98266254]
nested_score_custom [0.92672956 0.92176418 0.88110269 0.89174407]
I was expecting them to be the same. Can someone please provide suggestions for why the results are different and what has gone wrong with the classification_report_with_roc_score() function?
Thank you.
As your scorer needs probabilities to compute the ROC AUC, you have to set the needs_proba argument of make_scorer to True:
custom_nested_score = cross_val_score(clf, X=X_cancer, y=y_cancer,
                                      scoring=metrics.make_scorer(classification_report_with_roc_score, needs_proba=True),
                                      cv=outer_cv)
Secondly, you also have to set probability=True when you initialize your SVC:
svm = SVC(kernel="rbf", probability=True)
Doing so, I got the following results running your code:
nested_score [0.91278826 0.94326241 0.94760101 0.94097007]
nested_score_custom [0.91278826 0.94326241 0.94760101 0.94097007]