How to convert XGBoost model SHAP values from log odds to probabilities? - python

I trained an XGBoost classifier and am trying to generate SHAP contributions as probabilities. I understand that the output of shap.TreeExplainer for XGBoost models is in log odds. I expected the expected_value of the explainer to be equal to, or close to, the average predicted probability on the dataset. However, I get an expected_value of -2.7776 (explainer.expected_value), which corresponds to a probability of 0.0585 (expit(-2.7776)). This is significantly lower than the average predicted score of 0.21. Is there a step I am missing in converting the expected value to a probability? (The reproducible example below uses sklearn's GradientBoostingClassifier, which shap.TreeExplainer also explains in log-odds space.)
# Import libraries
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import shap
from scipy.special import expit
# Generate data
X, Y = make_classification(n_samples=10000,
                           n_features=20,
                           n_redundant=0,
                           n_classes=2,
                           random_state=17,
                           weights=[0.8, 0.2])
# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=7)
# Data check
print('Target rate: {:.0%}'.format(sum(Y)/len(Y)))
print('Target rate in train dataset: {:.0%}'.format(sum(y_train)/len(y_train)))
print('Target rate in test dataset: {:.0%}'.format(sum(y_test)/len(y_test)))
print('Total observations: {:.0f}'.format(len(X)))
print('Train observations: {:.0f}'.format(len(x_train)))
print('Test observations: {:.0f}'.format(len(x_test)))
# Train gradient boosting model (sklearn's GradientBoostingClassifier stands in for XGBoost here)
model = GradientBoostingClassifier(n_estimators=50,
                                   max_depth=3,
                                   random_state=17)
model.fit(x_train, y_train)
# Get class predictions for train and test datasets (needed for the confusion matrices below)
y_pred_class_train = model.predict(x_train)
y_pred_class_test = model.predict(x_test)
# Get accuracy score and confusion matrix for train and test datasets
# There don't seem to be issues with model performance; it is pretty close for train and test
acc_train = model.score(x_train, y_train)
acc_test = model.score(x_test, y_test)
cm_train = confusion_matrix(y_train, y_pred_class_train, normalize='true')
cl_report_train = classification_report(y_train, y_pred_class_train)
cm_test = confusion_matrix(y_test, y_pred_class_test, normalize='true')
cl_report_test = classification_report(y_test, y_pred_class_test)
# Print results
print('MODEL ACCURACY:\n \
training data: {:.2%}\n \
test data: {:.2%}'.format(acc_train, acc_test))
print('\nCONFUSION MATRIX (train data):\n {}'.format(cm_train.round(3)))
print('\nCLASSIFICATION REPORT (train data):\n {}'.format(cl_report_train))
print('\nCONFUSION MATRIX (test data):\n {}'.format(cm_test.round(3)))
print('\nCLASSIFICATION REPORT (test data):\n {}'.format(cl_report_test))
# Check average predicted score
# Train
y_pred_prob_train = model.predict_proba(x_train)
print('Train: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_train[:, 1])))
# Test
y_pred_prob_test = model.predict_proba(x_test)
print('Test: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_test[:, 1])))
# Get SHAP values in log odds for test dataset
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)
# Check SHAP expected value
print('SHAP expected value: {:.4f}'.format(explainer.expected_value[0]))
print('SHAP expected value transformed: {:.4f}'.format(expit(explainer.expected_value[0])))
print('Average predicted value: {:.4f}'.format(np.mean(y_pred_prob_test[:,1])))
# Average predicted value is ~ 0.21 while shap expected value only ~ 0.06.
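There may be no missing conversion step here; the gap itself is expected. expected_value is the average raw (log-odds) prediction, and because expit is nonlinear, the sigmoid of the average log odds is not the same as the average of the sigmoid-transformed predictions (Jensen's inequality), especially with an imbalanced target. A quick check along these lines, assuming the SHAP additivity property holds for this model/shap version (model.decision_function returns the raw log-odds margin for GradientBoostingClassifier):
# Raw log-odds predictions for the test set
raw_test = model.decision_function(x_test)
# expected_value may be a scalar or a length-1 array depending on the shap version
base = float(np.ravel(explainer.expected_value)[0])
# SHAP values are additive in log-odds space:
# base value + sum of per-feature contributions should reproduce the raw prediction
reconstructed = base + shap_values.sum(axis=1)
print('Max reconstruction error: {:.6f}'.format(np.max(np.abs(reconstructed - raw_test))))
# The two averages differ because expit is nonlinear:
print('expit of mean log odds:  {:.4f}'.format(expit(raw_test.mean())))
print('mean of expit(log odds): {:.4f}'.format(expit(raw_test).mean()))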

Related

Predicting the percentage accuracy of predicted handwritten numbers

I have two arrays that disagree in places when predicting handwritten digits. How would I calculate the percentage accuracy for each digit?
Imports:
from sklearn.datasets import load_digits
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LogisticRegression
Loading data:
digits = load_digits()
X = digits.data
y = digits.target
Splitting data into test train:
#Perform test train split
X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)
Logistic regression:
#Create a logistic regression object
clf = LogisticRegression(random_state=0,penalty='none')
#Fit model to data
clf.fit(X_train,y_train)
#Print the coefficients
b0 = clf.intercept_[0]
b1 = clf.coef_[0,0]
print('beta_0 =', b0)
print('beta_1 =', b1)
#Calculate the test error rate
yp = clf.predict(X_test)
err = (yp!=y_test).mean()
print('Error rate = {}'.format(err))
I want to calculate the percentage error for each digit, since yp doesn't match y_test everywhere.
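One way to get a per-digit accuracy is to compare yp and y_test class by class. A minimal sketch (numpy is an extra import beyond the snippets above):
import numpy as np
# Per-digit accuracy: fraction of correct predictions among the samples of each digit
for digit in np.unique(y_test):
    mask = (y_test == digit)
    print('Digit {}: accuracy = {:.2%}'.format(digit, (yp[mask] == y_test[mask]).mean()))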

Cannot calculate roc_auc_score, only one class present in y_true

I want to print sklearn's roc_auc_score and I get this error:
ValueError: Only one class present in y_true. ROC AUC score is not defined in that case.
I use a random forest to predict topics in text.
My code:
# No validation data in rnd forest
x_train = np.concatenate((x_train, x_valid))
y_train = np.concatenate((y_train, y_valid))
model = RandomForestClassifier(n_estimators=int(clf_params['n_estimators']),
                               max_features=clf_params['max_features'])
model.fit(pipe_vect.transform(x_train), y_train)
x_test_vect = pipe_vect.transform(x_test)
y_pred = model.predict_proba(x_test_vect)
auc_score = roc_auc_score(y_test, y_pred)
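The error means y_test contains only one class, so the ROC curve is undefined for it. A common fix is a stratified split so both classes reach the test set, and, for a binary target, passing only the positive-class column of predict_proba. A sketch under those assumptions (x_all / y_all stand in for the full dataset):
from sklearn.model_selection import train_test_split
from sklearn.metrics import roc_auc_score
import numpy as np
print(np.unique(y_test))  # a single value here is exactly what triggers the error
# stratify preserves the class balance in both splits
x_train, x_test, y_train, y_test = train_test_split(x_all, y_all, test_size=0.2,
                                                    stratify=y_all, random_state=0)
# ... refit the model on the new split, then score the positive-class probabilities
y_pred = model.predict_proba(x_test_vect)
auc_score = roc_auc_score(y_test, y_pred[:, 1])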

DecisionTreeRegressor score not calculated

I'm trying to calculate the score of a DecisionTreeRegressor with the following code:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn import tree
# train_df comes from the Kaggle train.csv linked below
train_df = pd.read_csv('train.csv')
# some features would suit a LabelEncoder better (e.g. HouseStyle), but the chance that they
# affect the target LotFrontage is small, so we just one-hot encode and drop unwanted columns later
encoded_df = pd.get_dummies(train_df, prefix_sep="_",
                            columns=['MSZoning', 'Street', 'Alley',
                                     'LotShape', 'LandContour', 'Utilities',
                                     'LotConfig', 'LandSlope', 'Neighborhood',
                                     'Condition1', 'Condition2', 'BldgType', 'HouseStyle'])
encoded_df = encoded_df[['LotFrontage', 'LotArea', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3',
                         'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3',
                         'LotConfig_Inside']]
# impute LotFrontage with the mean value (we saw a low outlier ratio, so the mean is fine)
encoded_df['LotFrontage'].fillna(encoded_df['LotFrontage'].mean(), inplace=True)
X = encoded_df.drop('LotFrontage', axis=1)
y = encoded_df['LotFrontage'].astype('int32')
X_train, X_test, y_train, y_test = train_test_split(X, y)
classifier = DecisionTreeRegressor()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_test = y_test.values.reshape(-1, 1)
classifier.score(y_test, y_pred)
print("Accuracy is: ", accuracy_score(y_test, y_pred) * 100)
When it gets to calculating the score of the model I get the following error:
ValueError: Number of features of the model must match the input. Model n_features is 9 and input n_features is 1
Not sure why this happens, because according to the sklearn docs the test samples should have the shape (n_samples, n_features)
and y_test is indeed in this shape:
y_test.shape # (365, 1)
and the True labels should be in the shape of (n_samples) or (n_samples, n_outputs) and y_pred is indeed in this shape:
y_pred.shape # (365,)
The dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data
The first argument of the score function shouldn't be the target values of the test set; it should be the test inputs, so you should do
classifier.score(X_test, y_test)
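Note also that score on a DecisionTreeRegressor returns R², not accuracy, and accuracy_score only applies to classification, so the last print will fail on continuous predictions. A small sketch of regression metrics that would work here:
from sklearn.metrics import mean_squared_error, r2_score
print("R^2:", classifier.score(X_test, y_test))   # same quantity as r2_score below
print("R^2:", r2_score(y_test.ravel(), y_pred))
print("MSE:", mean_squared_error(y_test.ravel(), y_pred))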

How to run through loop to use non-scaled and scaled data in python for loop

I have the following code running through and fitting a model on the iris data using different modeling techniques. How can I add a second step in this process so I can demonstrate the improvement between using scaled and non-scaled data?
I don't need to run the scale transform outside of the loop; I was just having a lot of issues converting between a pandas DataFrame and a numpy array and back again.
from sklearn import datasets
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler
from sklearn.model_selection import KFold
from sklearn.linear_model import LogisticRegression
from sklearn import svm
from sklearn.metrics import accuracy_score
iris = datasets.load_iris()
X = iris.data[:, :2] # we only take the first two features.
y = iris.target
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.2)
sc = StandardScaler()
X_train_scale = sc.fit_transform(X_train)
X_test_scale = sc.transform(X_test)
numFolds = 10
kf = KFold(n_splits=numFolds, shuffle=True)
# These are "Class objects". For each Class, find the AUC through
# 10 fold cross validation.
Models = [LogisticRegression, svm.SVC]
params = [{},{}]
for param, Model in zip(params, Models):
    total = 0
    for train_indices, test_indices in kf.split(X_train):
        train_X = X_train[train_indices]; train_Y = y_train[train_indices]
        test_X = X_train[test_indices]; test_Y = y_train[test_indices]
        reg = Model(**param)
        reg.fit(train_X, train_Y)
        predictions = reg.predict(test_X)
        total += accuracy_score(test_Y, predictions)
    accuracy = total / numFolds
    print("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
So ideally my output would be:
CV standard accuracy score of LogisticRegression: 0.683333
CV scaled accuracy score of LogisticRegression: 0.766667
CV standard accuracy score of SVC: 0.766667
CV scaled accuracy score of SVC: 0.783333
To clarify: I am trying to loop through scaled and unscaled data, similar to how I am looping through the different ML algorithms.
Follow-up: I was able to do this by creating a pipeline and using GridSearchCV:
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
from sklearn.svm import SVC
pipe = Pipeline([('scale', StandardScaler()),
                 ('clf', LogisticRegression())])
param_grid = [{'scale': [None, StandardScaler()],
               'clf': [SVC(), LogisticRegression()]}]
grid_search = GridSearchCV(pipe, param_grid=param_grid, n_jobs=-1, verbose=1)
In the end this got me the results I wanted and was able to test easily how to work between scaling and not scaling.
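For completeness, fitting the grid and reading the mean CV score of each scaler/classifier combination (assuming the X_train / y_train from above):
grid_search.fit(X_train, y_train)
for params, score in zip(grid_search.cv_results_['params'],
                         grid_search.cv_results_['mean_test_score']):
    print(params, round(score, 6))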
Try this:
previous_accuracy = None
for param, Model in zip(params, Models):
    total = 0
    for train_indices, test_indices in kf.split(X_train):
        train_X = X_train[train_indices]; train_Y = y_train[train_indices]
        test_X = X_train[test_indices]; test_Y = y_train[test_indices]
        reg = Model(**param)
        reg.fit(train_X, train_Y)
        predictions = reg.predict(test_X)
        total += accuracy_score(test_Y, predictions)
    accuracy = total / numFolds
    print("CV accuracy score of {0}: {1}".format(Model.__name__, round(accuracy, 6)))
    # added to your code: previous_accuracy must be initialised before the loop,
    # and should be updated after every model, not only the first one
    if previous_accuracy is not None:
        improvement = accuracy / previous_accuracy - 1
        print("CV accuracy score improved by", improvement)
    previous_accuracy = accuracy
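Another option, staying closer to the original loop, is to nest an extra loop over the two versions of the data. A minimal sketch reusing the names defined in the question (X_train vs X_train_scale), which produces output in the desired format:
datasets = {'standard': X_train, 'scaled': X_train_scale}
for name, X_data in datasets.items():
    for param, Model in zip(params, Models):
        total = 0
        for train_indices, test_indices in kf.split(X_data):
            reg = Model(**param)
            reg.fit(X_data[train_indices], y_train[train_indices])
            total += accuracy_score(y_train[test_indices], reg.predict(X_data[test_indices]))
        accuracy = total / numFolds
        print("CV {0} accuracy score of {1}: {2}".format(name, Model.__name__, round(accuracy, 6)))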

Large mean squared error in sklearn regressors

I'm a beginner in machine learning and I want to build a model to predict house prices. I prepared a dataset by crawling a local housing website; it consists of 1,000 samples and only 4 features (latitude, longitude, area and number of rooms).
I tried the RandomForestRegressor and LinearSVR models in sklearn, but I can't train the model properly and the MSE is super high.
The MSE is almost 90,000,000 (the true prices range between 5,000,000 and 900,000,000).
Here is my code:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
df = pd.read_csv('dataset.csv', index_col=False)
X = df.drop('price', axis=1)
X_data = X.values
Y_data = df.price.values
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=5)
rgr = RandomForestRegressor(n_estimators=100)
svr = LinearSVR()
rgr.fit(X_train, Y_train)
svr.fit(X_train, Y_train)
# modern sklearn spells the scorer 'neg_mean_squared_error'
MSEs = cross_val_score(estimator=rgr, X=X_train, y=Y_train,
                       scoring='neg_mean_squared_error', cv=5)
MSEsSVR = cross_val_score(estimator=svr, X=X_train, y=Y_train,
                          scoring='neg_mean_squared_error', cv=5)
MSEs *= -1
RMSEs = np.sqrt(MSEs)
print("Root mean squared error with 95% confidence interval:")
print("{:.3f} (+/- {:.3f})".format(RMSEs.mean(), RMSEs.std()*2))
print("")
Is the problem with my dataset and the number of features? How can I build a prediction model for this type of dataset?
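One thing to check before blaming the dataset: with prices spanning 5,000,000 to 900,000,000, a huge absolute MSE is expected, so relative metrics and a log-transformed target are usually more informative. A minimal sketch along those lines (TransformedTargetRegressor needs sklearn >= 0.20):
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# fit on log(price) so errors are relative rather than absolute
log_rgr = TransformedTargetRegressor(regressor=RandomForestRegressor(n_estimators=100),
                                     func=np.log1p, inverse_func=np.expm1)
r2 = cross_val_score(log_rgr, X_train, Y_train, scoring='r2', cv=5)
print("R^2: {:.3f} (+/- {:.3f})".format(r2.mean(), r2.std() * 2))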
