confidence interval for random forest regressor

confidence interval for random forest regressor - python

I'm using a Kaggle dataset (https://www.kaggle.com/datasets/harlfoxem/housesalesprediction) to predict house prices.
This is the code I used and so far so good.
import pandas as pd
import seaborn as sns
import numpy as np
from sklearn.model_selection import train_test_split
from sklearn import linear_model
from sklearn import metrics
import matplotlib.pyplot as plt
from sklearn.metrics import r2_score
from sklearn.ensemble import RandomForestRegressor
# Load the house-sales dataset.
dataset = pd.read_csv(path_to_dataset)
dataset.head()
dataset['date'] = pd.to_datetime(dataset['date']) # parse the date column as datetime
# Derived feature: year of the `date` column minus the year the house was built.
dataset["house_age"] = dataset["date"].dt.year - dataset['yr_built']
# Drop columns that are no longer needed: `date` and `yr_built` are now
# summarised by `house_age`, and `id` is removed as well.
dataset=dataset.drop('date', axis=1)
dataset=dataset.drop('yr_built', axis=1)
dataset = dataset.drop(["id"],axis=1)
# 70/30 train/test split with a fixed seed so the split is reproducible.
train, test = train_test_split(dataset, test_size=0.3, random_state=43)
xtrain = train.drop(['price'], axis = 1) # training features (every column except price)
ytrain = train['price'] # training target (price only)
xtest = test.drop(['price'], axis = 1) # test features (every column except price)
ytest = test['price'] # test target (price only)
# Fit a random forest with default hyper-parameters and predict the test set.
reg = RandomForestRegressor()
reg.fit(xtrain,ytrain)
pred = reg.predict(xtest)
# Both lines print under the same "Score: " label: the first is R^2 on the
# training data, the second is R^2 on the held-out test data.
print("Score: ",r2_score(ytrain, reg.predict(xtrain)))
print("Score: ",r2_score(ytest, pred))
print('MSE: ', metrics.mean_squared_error(ytest, pred))
Now, however, I would like to calculate and draw a confidence interval for the predictions made with my model.
I have already tried to look at many articles and libraries for several hours but I have not yet been able to find a solution that works for my case.
These are a couple of the references I followed but with little success:
http://contrib.scikit-learn.org/forest-confidence-interval/auto_examples/plot_mpg.html#sphx-glr-auto-examples-plot-mpg-py
https://scikit-garden.github.io/examples/QuantileRegressionForests/#quantile-regression-forests_1
Does anyone know how to create a confidence interval for this situation?

To construct such intervals (strictly speaking, prediction intervals), you can use the quantile-forest package. Its RandomForestQuantileRegressor class lets you specify the quantiles to estimate when calling predict, and those quantiles can then be used to construct the intervals.
Here's an example that extends your code with the above package to do this:
import matplotlib.pyplot as plt
import numpy as np
import pandas as pd
from quantile_forest import RandomForestQuantileRegressor
from sklearn.metrics import mean_squared_error
from sklearn.metrics import r2_score
from sklearn.model_selection import train_test_split
# Load the house-sales dataset.
dataset = pd.read_csv(path_to_dataset)
dataset.head()
dataset['date'] = pd.to_datetime(dataset['date']) # parse the date column as datetime
# Derived feature: year of the `date` column minus the year the house was built.
dataset['house_age'] = dataset["date"].dt.year - dataset['yr_built']
# Drop columns that are no longer needed: `date` and `yr_built` are now
# summarised by `house_age`, and `id` is removed as well.
dataset = dataset.drop('date', axis=1)
dataset = dataset.drop('yr_built', axis=1)
dataset = dataset.drop(['id'], axis=1)
# 70/30 train/test split with a fixed seed so the split is reproducible.
train, test = train_test_split(dataset, test_size=0.3, random_state=43)
x_train = train.drop(['price'], axis=1) # training features (every column except price)
y_train = train['price'] # training target (price only)
x_test = test.drop(['price'], axis=1) # test features (every column except price)
y_test = test['price'] # test target (price only)
# Quantile regression forest with 100 trees and a fixed seed.
reg = RandomForestQuantileRegressor(n_estimators=100, random_state=0)
reg.fit(x_train, y_train)
# Request three quantiles per test sample. The code below reads them as
# columns of y_pred: 0 -> 2.5% (lower bound), 1 -> 50% (median),
# 2 -> 97.5% (upper bound), i.e. a 95% prediction interval plus the median.
y_pred = reg.predict(x_test, quantiles=[0.025, 0.5, 0.975])
def plot_intervals(y_true, y_pred_lower, y_pred_upper):
    """Plot prediction intervals, ordered by width and centred on zero.

    Samples are sorted by interval width (narrowest first). Each interval
    and its observed value are then shifted by the interval midpoint, so
    every interval is centred on 0 and an observation falls inside the
    shaded band exactly when it is covered by its interval.

    Args:
        y_true: 1-D array-like of observed target values.
        y_pred_lower: 1-D array-like of lower interval bounds.
        y_pred_upper: 1-D array-like of upper interval bounds.

    The inputs are never modified. The figure is shown with ``plt.show()``;
    nothing is returned.
    """
    # Convert to float arrays up front: integer targets cannot be centred
    # in place by a float midpoint, and a pandas Series would be indexed by
    # label rather than by position in the fancy-indexing step below.
    y_true = np.asarray(y_true, dtype=float)
    y_pred_lower = np.asarray(y_pred_lower, dtype=float)
    y_pred_upper = np.asarray(y_pred_upper, dtype=float)

    plt.figure(figsize=(10, 4))

    # Order every series by interval width, narrowest first.
    sort_idx = np.argsort(y_pred_upper - y_pred_lower)
    y_true = y_true[sort_idx]
    y_pred_lower = y_pred_lower[sort_idx]
    y_pred_upper = y_pred_upper[sort_idx]

    # Center data, with the mean of the prediction interval at 0.
    midpoint = (y_pred_lower + y_pred_upper) / 2
    y_true = y_true - midpoint
    y_pred_lower = y_pred_lower - midpoint
    y_pred_upper = y_pred_upper - midpoint

    positions = np.arange(len(y_true))
    plt.plot(y_true, marker='.', ms=5, c='r', lw=0)
    plt.fill_between(
        positions,
        y_pred_lower,
        y_pred_upper,
        alpha=0.2,
        color='gray',
    )
    plt.plot(positions, y_pred_lower, marker='_', c='0.2', lw=0)
    plt.plot(positions, y_pred_upper, marker='_', c='0.2', lw=0)
    plt.xlim([0, len(y_true)])
    plt.xlabel('Ordered Samples')
    plt.ylabel('Observed Values and Prediction Intervals (Centered)')
    plt.show()
# Column 0 of y_pred is the lower bound and column 2 the upper bound of the interval.
plot_intervals(y_test.values, y_pred[:, 0], y_pred[:, 2])
# Training R^2 uses predict() without explicit quantiles
# (presumably the package's default quantile - verify against its docs).
print('Score: ', r2_score(y_train, reg.predict(x_train)))
# Test R^2 and MSE are computed from column 1 of y_pred, the median prediction.
print('Score: ', r2_score(y_test, y_pred[:, 1]))
print('MSE: ', mean_squared_error(y_test, y_pred[:, 1]))
The code plots the generated intervals from smallest to largest along with the observed values:

Related

Problem with plotting decision regions for classification model

I have a problem with plotting decision regions for Logistic Regression classification model. Can somebody help me and explain something how to do that? I put the colab link to this project here -> https://colab.research.google.com/drive/1JqFyoAk0zithy4esfjiyo6MdB12iBndi?usp=sharing
Dataset from Kaggle -> https://www.kaggle.com/datasets/muratkokludataset/date-fruit-datasets
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
import seaborn as sns
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import StandardScaler, LabelEncoder
from sklearn.metrics import accuracy_score, confusion_matrix, classification_report
from sklearn.linear_model import LogisticRegression
from mlxtend.plotting import plot_decision_regions
# Display configuration: fixed-point floats with 8 decimals for NumPy and
# pandas, wide console output, seeded NumPy RNG, seaborn default theme.
np.set_printoptions(suppress=True, edgeitems=30, linewidth=100000, formatter=dict(float=lambda x: f'{x:.8f}'))
np.random.seed(42)
sns.set()
desired_width = 320
pd.options.display.float_format = '{:,.8f}'.format
pd.set_option('display.width', desired_width)
pd.set_option('display.max_columns', 12)
# Load the Excel sheet and work on a copy so raw_data stays untouched.
raw_data = pd.read_excel(io='/content/Date_Fruit_Datasets.xlsx',
sheet_name='Date_Fruit_Datasets')
data = raw_data.copy()
# Quick exploratory look at the data (notebook-style expressions).
data.head(n=10)
data.describe().transpose()
data.info()
data.shape
# Split into features (every column except 'Class') and target ('Class').
X = data.drop(columns='Class')
y = data['Class']
X.shape
y.shape
# Encode the class labels as integers; encoder.classes_ keeps the original names.
encoder = LabelEncoder()
y = encoder.fit_transform(y=y)
# 75/25 train/test split with a fixed seed.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25, random_state=42)
# Scale the features: fit the scaler on the training set only, then apply
# the same transformation to the test set.
scaler = StandardScaler()
X_train = scaler.fit_transform(X=X_train)
X_test = scaler.transform(X=X_test)
# Fit the classifier on the scaled training data and predict the test set.
classifier = LogisticRegression(max_iter=1000)
classifier.fit(X=X_train, y=y_train)
y_pred = classifier.predict(X=X_test)
y_pred_proba = classifier.predict_proba(X=X_test)
# Final reports and scores on the test set.
score = accuracy_score(y_true=y_test, y_pred=y_pred)
report = classification_report(y_true=y_test, y_pred=y_pred, target_names=encoder.classes_)
# NOTE(review): this rebinds the name `confusion_matrix` from the imported
# function to its result, so the function cannot be called again afterwards.
confusion_matrix = confusion_matrix(y_true=y_test, y_pred=y_pred)
# Side-by-side comparison of true and predicted (encoded) labels.
results = pd.DataFrame(data={
'y_true': y_test,
'y_pred': y_pred
})
# Per-class predicted probabilities, one column per original class name.
predict_proba = pd.DataFrame(data=classifier.predict_proba(X=X_test), columns=encoder.classes_)
# Save both tables to CSV.
results.to_csv(path_or_buf='/content/data_fruit_predictions.csv')
predict_proba.to_csv(path_or_buf='/content/data_fruit_predict_proba.csv')
# Plot decision regions. Feature indices 1..33 all get the same filler
# value (1.5) and filler range (0.75).
# NOTE(review): the classifier was fitted on standardized features, but the
# raw, unscaled `X.values` is passed here - a likely cause of the
# "No contour levels were found" warning; confirm by passing scaled data.
value = 1.5
width = 0.75
plt.figure(figsize=(10, 8))
plot_decision_regions(X=X.values, y=y, clf=classifier,
filler_feature_values={i: value for i in range(1, 34)},
filler_feature_ranges={i: width for i in range(1, 34)}, legend=2)
plt.show()
After calling plot_decision_regions, PyCharm shows me warnings like:
UserWarning: No contour levels were found within the data range.
ax.contour(xx, yy, Z, cset.levels,
and
UserWarning: You passed a edgecolor/edgecolors ('black') for an unfilled marker ('x'). Matplotlib is ignoring the edgecolor in favor of the facecolor. This behavior may change in the future.
ax.scatter(x=x_data,

How to output Shap values in probability and make force_plot from binary classifier

I need to plot how each feature impacts the predicted probability for each sample from my LightGBM binary classifier. So I need to output Shap values in probability, instead of normal Shap values. It does not appear to have any options to output in term of probability.
The example code below is what I use to generate dataframe of Shap values and do a force_plot for the first data sample. Does anyone know how I should modify the code to change the output?
I'm new to Shap value and the Shap package. Thanks a lot in advance.
import pandas as pd
import numpy as np
import shap
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset into a DataFrame with named feature columns.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
# 80/20 split; no random_state is given, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
# Explain the fitted model on the training rows.
explainer = shap.TreeExplainer(model)
shap_values = explainer(X_train)
# force plot of first row for class 1
class_idx = 1
row_idx = 0
# The code indexes expected_value and the SHAP values by class, i.e. it
# expects one set of values per class from this shap version.
expected_value = explainer.expected_value[class_idx]
shap_value = shap_values[:,:,class_idx].values[row_idx]
shap.force_plot (base_value = expected_value, shap_values = shap_value, features = X_train.iloc[row_idx, :], matplotlib=True)
# dataframe of shap values for class 1 (one row per sample, one column per feature)
shap_df = pd.DataFrame(shap_values[:,:, 1 ].values, columns = shap_values.feature_names)

TL;DR:
You can achieve plotting results in probability space with link="logit" in the force_plot method:
import pandas as pd
import numpy as np
import shap
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
from scipy.special import expit
# Load the JS visualisation code needed for the interactive force plot.
shap.initjs()
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
# 80/20 split with a fixed seed so the numbers quoted below are reproducible.
X_train, X_test, y_train, y_test = train_test_split(
X, y, test_size=0.2, random_state=42
)
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
# Explainer on the model's raw output; the check further down shows that
# applying a sigmoid to base value + SHAP values recovers predict_proba.
explainer_raw = shap.TreeExplainer(model)
shap_values = explainer_raw(X_train)
# force plot of first row for class 1
class_idx = 1
row_idx = 0
expected_value = explainer_raw.expected_value[class_idx]
shap_value = shap_values[:, :, class_idx].values[row_idx]
# link="logit" makes the plot display the raw values on the probability scale.
shap.force_plot(
base_value=expected_value,
shap_values=shap_value,
features=X_train.iloc[row_idx, :],
link="logit",
)
Expected output:
Alternatively, you may achieve the same with the following, explicitly specifying model_output="probability" you're interested in to explain:
# Explainer that attributes the predicted probability directly. This needs
# a background dataset and interventional feature perturbation.
explainer = shap.TreeExplainer(
    model,
    data=X_train,
    feature_perturbation="interventional",
    model_output="probability",
)
shap_values = explainer(X_train)
# force plot of first row for class 1
class_idx = 1
row_idx = 0
shap_value = shap_values.values[row_idx]
# The base value must come from this probability-space explanation. The
# `expected_value` computed earlier belongs to the raw explainer and is on
# a different scale, so reusing it would shift the whole plot.
shap.force_plot(
    base_value=shap_values.base_values[row_idx],
    shap_values=shap_value,
    features=X_train.iloc[row_idx, :],
)
Expected output:
However, it might be more interesting for understanding what's happening here to find out where these figures come from:
Our target proba for the point of interest:
model_proba= model.predict_proba(X_train.iloc[[row_idx]])
model_proba
# array([[0.00275887, 0.99724113]])
Base case raw from model given X_train as background (note, LightGBM outputs raw for class 1):
model.predict(X_train, raw_score=True).mean()
# 2.4839751932445577
Base case raw from SHAP (note, they are symmetric):
bv = explainer_raw(X_train).base_values[0]
bv
# array([-2.48397519, 2.48397519])
Raw SHAP values for the point of interest:
sv_0 = explainer_raw(X_train).values[row_idx].sum(0)
sv_0
# array([-3.40619584, 3.40619584])
Proba inferred from SHAP values (via sigmoid):
shap_proba = expit(bv + sv_0)
shap_proba
# array([0.00275887, 0.99724113])
Check:
assert np.allclose(model_proba, shap_proba)
Please ask questions if something is not clear.
Side notes
Proba might be misleading if you're analyzing raw size effect of different features because sigmoid is non-linear and saturates after reaching certain threshold.
Some people expect to see SHAP values in probability space as well, but this is not feasible because:
SHAP values are additive by construction (to be precise SHapley Additive exPlanations are average marginal contributions over all possible feature coalitions)
exp(a + b) != exp(a) + exp(b)
You may find useful:
Feature importance in a binary classification and extracting SHAP values for one of the classes only answer
How to interpret base_value of GBT classifier when using SHAP? answer

You can consider running your output values through a softmax() function. For reference, it is defined as :
def get_softmax_probabilities(x):
    """Return softmax probabilities computed along the last axis of ``x``.

    Args:
        x: array-like of scores. A 1-D vector is treated as a single row.

    Returns:
        A float ndarray with at least two dimensions. A vector of length
        ``n`` yields shape ``(1, n)``. For 2-D input each row is normalised
        independently, so every row sums to 1.
    """
    scores = np.atleast_2d(np.asarray(x, dtype=float))
    # Subtracting the row maximum leaves the result mathematically unchanged
    # but keeps exp() from overflowing to inf (and the ratio from becoming
    # NaN) when scores are large.
    shifted = scores - scores.max(axis=-1, keepdims=True)
    weights = np.exp(shifted)
    return weights / weights.sum(axis=-1, keepdims=True)
and there is a scipy implementation as well:
from scipy.special import softmax
The output from softmax() will be probabilities proportional to the exponentials of the values in vector x, which are your SHAP values.

import pandas as pd
import numpy as np
import shap
import lightgbm as lgbm
from sklearn.model_selection import train_test_split
from sklearn.datasets import load_breast_cancer
# Load the breast-cancer dataset into a DataFrame with named feature columns.
data = load_breast_cancer()
X = pd.DataFrame(data.data, columns=data.feature_names)
y = data.target
# 80/20 split; no random_state is given, so the split differs between runs.
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
print('X_train: ',X_train.shape)
print('X_test: ',X_test.shape)
model = lgbm.LGBMClassifier()
model.fit(X_train, y_train)
explainer = shap.TreeExplainer(model)
# NOTE(review): the code below indexes this result as
# shap_values[class_idx][row], i.e. it presumes one array per class;
# whether shap_values() returns that layout depends on the shap version - confirm.
shap_values = explainer.shap_values(X_train)
# plot
# shap.summary_plot(shap_values[class_idx], X_train, plot_type='bar')
# shap.summary_plot(shap_values[class_idx], X_train)
# shap_value = shap_values[:,:,class_idx].values[row_idx]
# shap.force_plot (base_value = expected_value, shap_values = shap_value, features = X_train.iloc[row_idx, :], matplotlib=True)
# # dataframe of shap values for class 1
# shap_df = pd.DataFrame(shap_values[:,:, 1 ].values, columns = shap_values.feature_names)
# verification
def verification(index_number, class_idx):
    """Print the SHAP additivity check for one training row and one class.

    Prints the base value for ``class_idx``, the base value plus the sum of
    that row's SHAP values, and the training label stored for the row.

    Args:
        index_number: positional index of the row in ``X_train``.
        class_idx: index of the class whose SHAP values are checked.

    Reads the module-level ``explainer``, ``shap_values``, ``X_train`` and
    ``y_train``. Returns nothing.
    """
    print('-----------------------------------')
    print('index_number: ', index_number)
    print('class_idx: ', class_idx)
    print('')
    y_base = explainer.expected_value[class_idx]
    print('y_base: ', y_base)
    # Use the row passed in as index_number, not a module-level index
    # variable, so the printed index always matches the checked row.
    player_explainer = pd.DataFrame()
    player_explainer['feature_value'] = X_train.iloc[index_number].values
    player_explainer['shap_value'] = shap_values[class_idx][index_number]
    print('verification: ')
    print('y_base + sum_of_shap_values: %.2f'%(y_base + player_explainer['shap_value'].sum()))
    # The value printed under the 'y_pred' label is taken from y_train,
    # i.e. it is the training label of the row, not a model prediction.
    print('y_pred: %.2f'%(y_train[index_number]))
j = 10 # positional index of the training row to check
# Run the check for both classes of the same row.
verification(j,0)
verification(j,1)
# show:
# X_train: (455, 30)
# X_test: (114, 30)
# -----------------------------------
# index_number: 10
# class_idx: 0
# y_base: -2.391423081639827
# verification:
# y_base + sum_of_shap_values: -9.40
# y_pred: 1.00
# -----------------------------------
# index_number: 10
# class_idx: 1
# y_base: 2.391423081639827
# verification:
# y_base + sum_of_shap_values: 9.40
# y_pred: 1.00
# -9.40,9.40 takes the maximum value（class_idx:1 = y_pred）, and the result is obviously correct.
I helped you achieve it and verified the reliability of the results.

Why is my MSE so high when the difference between test and prediction values are so close?

In Python, I have conducted a small multiple linear regression model to explain house prices in areas based on other variables (all of which are percentages multiplied by 100) such as percentage of people with bachelor degrees in an area, percentage of people who work from home. I have conducted this in R and it works fine, but I am new to Python ML. I have shown the output of y_pred = regressor.predict(X_test) and the MSE I get. I have included a sample of my data where avgincome PctSingleDetached and PctDrivetoWork are X, and AvgHousingPrice is the Y.
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.impute import SimpleImputer
sample data:
avgincome PctSingleDetached PctDrivetoWork AvgHousingPrice
0 44388.0 61.528497 81.151832 448954
1 40650.0 54.372197 77.882798 349758
2 43350.0 68.393782 79.553265 428740
# Features are every column except the last; the target is the last column.
# NOTE(review): `np` is used below but no numpy import is visible in this
# snippet - presumably imported elsewhere; confirm.
X = hamiltondata.iloc[:, :-1].values
Y = hamiltondata.iloc[:, -1].values
imputer = SimpleImputer(missing_values = np.nan, strategy = 'mean') # replaces NaN entries with the column mean
# fit() learns the per-column means; transform() fills the missing entries with them.
# The slice X[:, :] selects every column, so all features are imputed.
imputer.fit(X[:,:]) # fit on all columns of X
X[:, :] = imputer.transform(X[:,:])
# from sklearn.compose import ColumnTransformer
# from sklearn.preprocessing import OneHotEncoder
# ct = ColumnTransformer(transformers = [('encoder', OneHotEncoder(), [0])], remainder = 'passthrough')
# X = np.array(ct.fit_transform(X))
print(X)
print(Y)
## Splitting into training and testing ##
from sklearn.model_selection import train_test_split
# 80/20 split with a fixed seed.
X_train, X_test, Y_train, Y_test = train_test_split(X,Y,test_size = 0.2, random_state = 0)
### Feature Scaling ###
from sklearn.preprocessing import StandardScaler
sc = StandardScaler() # standardizes each feature (zero mean, unit variance)
X_train[:, 0:] = sc.fit_transform(X_train[:,0:])
# fit_transform learns the scaling from the training set and applies it;
# transform then reuses those training statistics on the test set.
X_test[:, 0:] = sc.transform(X_test[:, 0:])
print(X_train)
print(X_test)
## Training ##
from sklearn.linear_model import LinearRegression
regressor = LinearRegression() # ordinary least squares on all features; no feature selection is performed
regressor.fit(X_train, Y_train)
### Predicting Test Set results ###
y_pred = regressor.predict(X_test)
np.set_printoptions(precision = 2) # display numerical values with only 2 digits after the decimal point
print(np.concatenate((y_pred.reshape(len(y_pred),1), Y_test.reshape(len(Y_test),1 )), axis=1)) # two columns side by side: predicted value, then actual value
from sklearn.metrics import mean_squared_error
# MSE is the mean of the squared errors, so it is expressed in squared
# target units and grows quadratically with the size of the errors.
mse = mean_squared_error(Y_test, y_pred)
print(mse)
OUTPUT:
[[489066.76 300334. ]
[227458.2 200352. ]
[928249.59 946729. ]
[339032.27 350116. ]
[689668.21 600322. ]
[489179.58 577936. ]]
...
...
MSE = 2375985640.8102403

You can calculate mse yourself to check if there is something wrong. In my opinion the obtained result is coherent. Anyway I built a simple my_mse function to check the result output by sklearn, with your example data
from sklearn.metrics import mean_squared_error
# Rows copied from the question's printed output, where each row is
# [predicted value, actual value].
list_ = [[489066.76, 300334.],
         [227458.2, 200352.],
         [928249.59, 946729.],
         [339032.27, 350116.],
         [689668.21, 600322.],
         [489179.58, 577936.]]
# Column 0 holds the predictions and column 1 the observed values. MSE is
# symmetric in its two arguments, so the result does not depend on which
# column gets which name, but the names should match the data.
y_pred = [row[0] for row in list_]
y_true = [row[1] for row in list_]
mse = mean_squared_error(y_true, y_pred)
print(mse)
# 8779930962.14985
def my_mse(y_true, y_pred):
    """Return the mean squared error between two equal-length sequences.

    Args:
        y_true: sequence of observed values.
        y_pred: sequence of predicted values, same length as ``y_true``.

    Returns:
        The mean of the squared element-wise differences.

    Raises:
        ValueError: if the sequences differ in length or are empty.
    """
    # zip() would silently drop the surplus elements of the longer input
    # while the divisor still used len(y_true), giving a wrong mean.
    if len(y_true) != len(y_pred):
        raise ValueError(
            f"y_true and y_pred must have the same length, "
            f"got {len(y_true)} and {len(y_pred)}"
        )
    if len(y_true) == 0:
        raise ValueError("cannot compute the mean squared error of empty input")
    squared_errors = (
        (actual - predicted) ** 2 for actual, predicted in zip(y_true, y_pred)
    )
    return sum(squared_errors) / len(y_true)
print(my_mse(y_true, y_pred))
# 8779930962.14985
Remember the mse is the mean squared error. (Each error is squared in the sum)
Whether your model is good or bad depends on your main objective. That said, I think your model is performing poorly because it is a linear model; a more flexible model might capture the relationship better and produce smaller errors.

Create a ROC curve in python, but find the confusion matrix may not generate correctly

I created a model and plotted its ROC curve. I can see the curve, but it does not look correct to me.
After some debugging, I found that the confusion matrix does not equal the one I get without splitting the data into train and test sets. I used a test size of 0.25, the default value, to split the dataset.
Here is my code. Can anyone help to check? Thanks
# Import the libraries we will be using
import numpy as np
import pandas as pd
from sklearn.tree import DecisionTreeClassifier
from sklearn.linear_model import LogisticRegression
from sklearn.svm import SVC
from sklearn import metrics
from sklearn.model_selection import cross_val_score
from sklearn.model_selection import train_test_split
import matplotlib.pylab as plt
# Load the data
data = pd.read_excel("data.xlsx")
# Let's take a look at the data
data.head()
# Split our data: features are every column except 'Actual'; 'Actual' is the target.
X = data.drop(['Actual'], axis=1)
Y = data['Actual']
# 75/25 train/test split with a fixed seed.
X_data_train, X_data_test, Y_data_train, Y_data_test = train_test_split(X, Y, test_size=0.25, random_state=0)
# Make and fit a model on the training data
model_data = LogisticRegression(C=1000000, solver='liblinear')
model_data.fit(X_data_train, Y_data_train)
# Predicted probability of the second class (column 1 of predict_proba) for the test rows.
probabilities = model_data.predict_proba(X_data_test)[:, 1]
prediction = probabilities > 0.5
# Build and print a confusion matrix. It is computed on the test split
# only, so its counts add up to the number of test rows, not to the size
# of the full dataset. The .T transposes sklearn's matrix before the
# 'p'/'n' column labels and 'Y'/'N' row labels are attached.
confusion_matrix_large = pd.DataFrame(metrics.confusion_matrix(Y_data_test, prediction, labels=[1, 0]).T,
columns=['p', 'n'], index=['Y', 'N'])
print (confusion_matrix_large)
# Let's move the threshold down
prediction = probabilities > 0.05
# Build and print a confusion matrix (same layout, lower threshold)
confusion_matrix_small = pd.DataFrame(metrics.confusion_matrix(Y_data_test, prediction, labels=[1, 0]).T,
columns=['p', 'n'], index=['Y', 'N'])
print (confusion_matrix_small)
# Use the metrics.roc_curve function to get the true positive rate (tpr) and false positive rate (fpr)
fpr, tpr, thresholds = metrics.roc_curve(Y_data_test, probabilities)
# Get the area under the curve (AUC)
# NOTE(review): this is the mean 5-fold cross-validated AUC on the full X, Y;
# it is not the area under the test-split curve plotted below.
auc = np.mean(cross_val_score(model_data, X, Y, scoring="roc_auc", cv=5))
# Plot the ROC curve
# NOTE(review): str(data) puts the text of the whole DataFrame into the
# legend label; presumably the value of C was intended - confirm.
plt.plot(fpr, tpr, label="AUC (C=" + str(data) + ") = " + str(round(auc, 2)))
plt.xlabel("False positive rate (fpr)")
plt.ylabel("True positive rate (tpr)")
# Diagonal reference line for a random classifier.
plt.plot([0,1], [0,1], 'k--', label="Random")
plt.legend(loc='best')

Dataset indices for predicted values is not matching with those for actual values

I am a python novice who is trying to solve a regression problem with neural networks. I am at the stage where I want to plot the predicted vs actual followed by determining the regression coefficient.
Model training
#import statements
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from sklearn.neural_network import MLPRegressor
from sklearn.model_selection import train_test_split
%matplotlib inline
# Load the dataset: features are every column except "PPV"; "PPV" is the target.
data = pd.read_csv("PPV_dataset.csv")
X = np.array(data.drop(["PPV"],1))
y = np.array(data["PPV"])
# Model training and prediction. The split has no random_state, so it
# differs between runs.
nn = MLPRegressor(hidden_layer_sizes=(100,), activation = 'logistic', solver = 'sgd')
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.25)
nn.fit(X_train, y_train)
pred = nn.predict(X_test)
# Recover the original row indices of the test set by comparing every row
# of X against every row of X_test.
# NOTE(review): an index is appended for every (row, i) pair that matches,
# so feature rows that occur more than once in X produce extra entries -
# this is how `indices` can end up longer than `pred`. The indices are also
# collected in the row order of X, not of X_test, so they are not aligned
# with `pred`. y_test already holds the actual values in the order of X_test.
a = X_test
indices = []
for row in range(len(X)):
for i in range(len(a)):
if np.all(a[i]==X[row]):
indices.append(row)
# Collect the target values at the recovered indices.
actual_values = []
for i in range(len(indices)):
actual_values.append(y[indices[i]])
Comparing actual to predicted values
len(actual_values)
13
len(pred)
12
Image of dataset

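You should use the matplotlib and seaborn libraries for plotting your graph,
and for the coefficient of determination use r_sq = nn.score(X_test, y_test) (score expects the feature matrix and the true targets, not two arrays of target values).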
I recommend using seaborn.lmplot() in your case
for roberts particular case I suggest:
from sklearn.metrics import r2_score
r2_score(y_true, y_pred)

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

confidence interval for random forest regressor - python

Related

Problem with plotting decision regions for classification model

How to output Shap values in probability and make force_plot from binary classifier

Why is my MSE so high when the difference between test and prediction values are so close?

Create a ROC curve in python, but find the confusion matrix may not generate correctly

Dataset indices for predicted values is not matching with those for actual values

Categories

Resources