DecisionTreeRegressor score not calculated - python

I'm trying to calculate the score of a DecisionTreeRegressor with the following code:
import pandas as pd
from sklearn import preprocessing
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier, DecisionTreeRegressor
from sklearn.metrics import accuracy_score
from sklearn import tree
# some features would be better served by a LabelEncoder (e.g. HouseStyle), but the chance
# that they affect the target LotFrontage is small, so we just one-hot encode and drop unwanted columns later
encoded_df = pd.get_dummies(train_df, prefix_sep="_", columns=['MSZoning', 'Street', 'Alley',
                                                               'LotShape', 'LandContour', 'Utilities',
                                                               'LotConfig', 'LandSlope', 'Neighborhood',
                                                               'Condition1', 'Condition2', 'BldgType', 'HouseStyle'])
encoded_df = encoded_df[['LotFrontage', 'LotArea', 'LotShape_IR1', 'LotShape_IR2', 'LotShape_IR3',
                         'LotConfig_Corner', 'LotConfig_CulDSac', 'LotConfig_FR2', 'LotConfig_FR3', 'LotConfig_Inside']]
# impute LotFrontage with the mean value (we saw a low outlier ratio, so we use the mean)
encoded_df['LotFrontage'].fillna(encoded_df['LotFrontage'].mean(), inplace=True)
X = encoded_df.drop('LotFrontage', axis=1)
y = encoded_df['LotFrontage'].astype('int32')
X_train, X_test, y_train, y_test = train_test_split(X, y)
classifier = DecisionTreeRegressor()
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
y_test = y_test.values.reshape(-1, 1)
classifier.score(y_test, y_pred)
print("Accuracy is: ", accuracy_score(y_test, y_pred) * 100)
When it gets to calculating the score of the model, I get the following error:
ValueError: Number of features of the model must match the input. Model n_features is 9 and input n_features is 1
I'm not sure why this happens, because according to the sklearn docs the test samples should have the shape (n_samples, n_features),
and y_test is indeed in this shape:
y_test.shape # (365, 1)
and the true labels should have the shape (n_samples,) or (n_samples, n_outputs), and y_pred is indeed in this shape:
y_pred.shape # (365,)
The dataset: https://www.kaggle.com/c/house-prices-advanced-regression-techniques/data

The first argument of the score function shouldn't be the target values of the test set; it should be the input features of the test set, so you should do
classifier.score(X_test, y_test)
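Note also that accuracy_score is a classification metric and will raise an error on continuous targets; for a regressor, score already returns the R² coefficient, and a regression metric such as mean_absolute_error is the appropriate replacement for accuracy. A minimal sketch of the corrected evaluation, reusing the variables from the question:
from sklearn.metrics import mean_absolute_error
print("R^2:", classifier.score(X_test, y_test))  # score() is R^2 for regressors
print("MAE:", mean_absolute_error(y_test, classifier.predict(X_test)))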


How to convert XGBoost model SHAP values from log odds to probabilities?

I trained an XGBoost classifier and am trying to generate SHAP contributions as probabilities. I understand that the output of shap.TreeExplainer for XGBoost models is in log-odds ratios. I expected the expected_value of the explainer to be equal or close to the average predicted value in the dataset. However, I get an expected_value of -2.7776 (explainer.expected_value), which corresponds to a probability of 0.0585 (expit(-2.7776)). This is significantly lower than the average predicted score of 0.21. Is there a step I am missing in converting the expected value to a probability?
# Import libraries
import numpy as np
from sklearn.ensemble import GradientBoostingClassifier
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix, classification_report
import shap
from scipy.special import expit
# Generate data
X, Y = make_classification(n_samples=10000,
                           n_features=20,
                           n_redundant=0,
                           n_classes=2,
                           random_state=17,
                           weights=[0.8, 0.2])
# Split into train and test
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.1, random_state=7)
# Data check
print('Target rate: {:.0%}'.format(sum(Y)/len(Y)))
print('Target rate in train dataset: {:.0%}'.format(sum(y_train)/len(y_train)))
print('Target rate in test dataset: {:.0%}'.format(sum(y_test)/len(y_test)))
print('Total observations: {:.0f}'.format(len(X)))
print('Train observations: {:.0f}'.format(len(x_train)))
print('Test observations: {:.0f}'.format(len(x_test)))
# Train XGBoost model
model = GradientBoostingClassifier(n_estimators=50,
                                   max_depth=3,
                                   random_state=17)
model.fit(x_train, y_train)
# Get class and probability predictions for train and test datasets
# (these must be computed before the confusion matrices below use them)
y_pred_class_train = model.predict(x_train)
y_pred_prob_train = model.predict_proba(x_train)
y_pred_class_test = model.predict(x_test)
y_pred_prob_test = model.predict_proba(x_test)
# Get accuracy score and confusion matrix for train and test datasets
# There don't seem to be issues with model performance; it is pretty close for train and test datasets
acc_train = model.score(x_train, y_train)
acc_test = model.score(x_test, y_test)
cm_train = confusion_matrix(y_train, y_pred_class_train, normalize = 'true')
cl_report_train = classification_report(y_train, y_pred_class_train)
cm_test = confusion_matrix(y_test, y_pred_class_test, normalize = 'true')
cl_report_test = classification_report(y_test, y_pred_class_test)
# Print results
print('MODEL ACCURACY:\n \
training data: {:.2%}\n \
test data: {:.2%}'.format(acc_train, acc_test))
print('\nCONFUSION MATRIX (train data):\n {}'.format(cm_train.round(3)))
print('\nCLASSIFICATION REPORT (train data):\n {}'.format(cl_report_train))
print('\nCONFUSION MATRIX (test data):\n {}'.format(cm_test.round(3)))
print('\nCLASSIFICATION REPORT (test data):\n {}'.format(cl_report_test))
# Check average predicted score (using the probabilities computed above)
print('Train: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_train[:,1])))
print('Test: Average predicted score: {:.2%}'.format(np.mean(y_pred_prob_test[:,1])))
# Get SHAP values in log odds for test dataset
explainer = shap.TreeExplainer(model)
shap_values = explainer.shap_values(x_test)
# Check SHAP expected value
print('SHAP expected value: {:.4f}'.format(explainer.expected_value[0]))
print('SHAP expected value transformed: {:.4f}'.format(expit(explainer.expected_value[0])))
print('Average predicted value: {:.4f}'.format(np.mean(y_pred_prob_test[:,1])))
# Average predicted value is ~ 0.21 while shap expected value only ~ 0.06.
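A useful sanity check here: the SHAP expected value lives in log-odds space, and by Jensen's inequality the sigmoid of an average is not the average of the sigmoids, so the two numbers need not match. A minimal sketch of that gap, assuming the model and x_test from above:
# decision_function returns the model's raw log-odds per sample
raw = model.decision_function(x_test)
print('expit of mean log-odds: {:.4f}'.format(expit(raw.mean())))   # close to the transformed expected_value
print('mean of expit(log-odds): {:.4f}'.format(expit(raw).mean()))  # close to the average predicted probability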

How to evaluate the effect of different methods of handling missing values?

I am a total beginner and I am trying to compare different methods of handling missing data. To evaluate the effect of each method (dropping rows with missing values, dropping columns with over 40% missingness, imputing with the mean, imputing with KNN), I compare the LDA and LogReg accuracy on the training set for datasets with 10% and 20% missing values against the results on the original complete dataset. Unfortunately, I get pretty much the same results, even between the complete dataset and the dataset with 20% missingness. I don't know what I am doing wrong.
from numpy import nan
from numpy import isnan
from pandas import read_csv
from sklearn.impute import SimpleImputer
from sklearn.discriminant_analysis import LinearDiscriminantAnalysis
from sklearn.model_selection import KFold
from sklearn.model_selection import cross_val_score
#dataset = read_csv('telecom_churn_rev10.csv')
dataset = read_csv('telecom_churn_rev20.csv')
dataset = dataset.replace(nan, 0)
values = dataset.values
X = values[:,1:11]
y = values[:,0]
dataset.fillna(dataset.mean(), inplace=True)
#dataset.fillna(dataset.mode(), inplace=True)
print(dataset.isnull().sum())
imputer = SimpleImputer(missing_values = nan, strategy = 'mean')
transformed_values = imputer.fit_transform(X)
print('Missing: %d' % isnan(transformed_values).sum())
model = LinearDiscriminantAnalysis()
cv = KFold(n_splits = 3, shuffle = True, random_state = 1)
result = cross_val_score(model, X, y, cv = cv, scoring = 'accuracy')
print('Accuracy: %.3f' % result.mean())
#print('Accuracy: %.3f' % result.mode())
print(dataset.describe())
print(dataset.head(20))
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size = 0.2, random_state = 0)
from sklearn.preprocessing import StandardScaler
sc = StandardScaler()
X_train = sc.fit_transform(X_train)
X_test = sc.transform(X_test)
from sklearn.linear_model import LogisticRegression
classifier = LogisticRegression(random_state = 0)
classifier.fit(X_train, y_train)
y_pred = classifier.predict(X_test)
from sklearn.metrics import confusion_matrix, accuracy_score
cm = confusion_matrix(y_test, y_pred)
print(cm)
accuracy_score(y_test,y_pred)
from sklearn import metrics
# make predictions on X
expected = y
predicted = classifier.predict(X)
# summarize the fit of the model
print(metrics.classification_report(expected, predicted))
print(metrics.confusion_matrix(expected, predicted))
# make predictions on X test
expected = y_test
predicted = classifier.predict(X_test)
# summarize the fit of the model
print(metrics.confusion_matrix(expected, predicted))
print(metrics.classification_report(expected, predicted))
You replace all your missing values with 0 on this line: dataset = dataset.replace(nan, 0). After it runs you have a full dataset with no missing values, so the .fillna() and the SimpleImputer() further down have nothing left to impute. Since X and y are extracted from the already zero-filled data, every method you compare ends up evaluating exactly the same dataset, which is why the accuracies barely change.
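To actually measure the effect of each strategy, the imputer has to see the missing values. A minimal sketch of a fair comparison, assuming the same CSVs as in the question: drop the replace(nan, 0) line and let each imputer run inside a pipeline under cross-validation:
from numpy import nan
from pandas import read_csv
from sklearn.impute import SimpleImputer, KNNImputer
from sklearn.linear_model import LogisticRegression
from sklearn.model_selection import KFold, cross_val_score
from sklearn.pipeline import Pipeline
from sklearn.preprocessing import StandardScaler
dataset = read_csv('telecom_churn_rev20.csv')  # keep the NaNs intact
values = dataset.values
X, y = values[:, 1:11], values[:, 0]
cv = KFold(n_splits=3, shuffle=True, random_state=1)
for name, imputer in [('mean', SimpleImputer(missing_values=nan, strategy='mean')),
                      ('knn', KNNImputer(n_neighbors=5))]:
    # the imputer is fitted inside each CV fold, so the strategies are compared fairly
    pipe = Pipeline([('impute', imputer),
                     ('scale', StandardScaler()),
                     ('clf', LogisticRegression(random_state=0))])
    scores = cross_val_score(pipe, X, y, cv=cv, scoring='accuracy')
    print('%s imputation accuracy: %.3f' % (name, scores.mean()))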

MAE using Pipeline and GridSearchCV

I am facing a challenge finding the mean absolute error (MAE) using Pipeline and GridSearchCV
Background:
I have worked on a Data Science project (MWE below) where an MAE value is returned as a classifier's performance metric.
#Library
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import mean_absolute_error
#Data import and preparation
data = pd.read_csv("data.csv")
data_features = ['location','event_type_count','log_feature_count','total_volume','resource_type_count','severity_type']
X = data[data_features]
y = data.fault_severity
#Train Validation Split for Cross Validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
#RandomForest Modeling
RF_model = RandomForestClassifier(n_estimators=100, random_state=0)
RF_model.fit(X_train, y_train)
#RandomForest Prediction
y_predict = RF_model.predict(X_valid)
#MAE
print(mean_absolute_error(y_valid, y_predict))
#Output:
# 0.38727149627623564
Challenge:
Now I am trying to implement the same using Pipeline and GridSearchCV (MWE below). The expectation is that the same MAE value as above would be returned. Unfortunately, I could not get it right using any of the three approaches below.
#Library
import pandas as pd
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.pipeline import Pipeline
from sklearn.model_selection import GridSearchCV
#Data import and preparation
data = pd.read_csv("data.csv")
data_features = ['location','event_type_count','log_feature_count','total_volume','resource_type_count','severity_type']
X = data[data_features]
y = data.fault_severity
#Train Validation Split for Cross Validation
X_train, X_valid, y_train, y_valid = train_test_split(X, y, train_size=0.8, test_size=0.2, random_state=0)
#RandomForest Modeling via Pipeline and Hyper-parameter tuning
steps = [('rf', RandomForestClassifier(random_state=0))]
pipeline = Pipeline(steps) # define the pipeline object.
parameters = {'rf__n_estimators':[100]}
grid = GridSearchCV(pipeline, param_grid=parameters, scoring='neg_mean_squared_error', cv=None, refit=True)
grid.fit(X_train, y_train)
#Approach 1:
print(grid.best_score_)
# Output:
# -0.508130081300813
#Approach 2:
y_predict=grid.predict(X_valid)
print("score = %3.2f"%(grid.score(y_predict, y_valid)))
# Output:
# ValueError: Expected 2D array, got 1D array instead:
# array=[0. 0. 0. ... 0. 1. 0.].
# Reshape your data either using array.reshape(-1, 1) if your data has a single feature or array.reshape(1, -1) if it contains a single sample.
#Approach 3:
y_predict_df = pd.DataFrame(y_predict.reshape(len(y_predict), -1),columns=['fault_severity'])
print("score = %3.2f"%(grid.score(y_predict_df, y_valid)))
# Output:
# ValueError: Number of features of the model must match the input. Model n_features is 6 and input n_features is 1
Discussion:
Approach 1:
Since the scoring parameter in GridSearchCV() is set to neg_mean_squared_error, I tried reading grid.best_score_, but it did not match the MAE result above.
Approach 2:
I got the y_predict values using grid.predict(X_valid), then tried to get the MAE via grid.score(y_predict, y_valid), since the scoring parameter in GridSearchCV() is set to neg_mean_squared_error. It returned a ValueError complaining "Expected 2D array, got 1D array instead".
Approach 3:
I tried reshaping y_predict, which did not work either; this time it returned "ValueError: Number of features of the model must match the input."
It would be helpful if you could point out where I made the error.
If needed, data.csv is available at https://www.dropbox.com/s/t1h53jg1hy4x33b/data.csv
Thank you very much
You are comparing mean_absolute_error with neg_mean_squared_error, which are very different metrics. You should have used neg_mean_absolute_error when creating your GridSearchCV object, like this:
grid = GridSearchCV(pipeline, param_grid=parameters, scoring='neg_mean_absolute_error', cv=None, refit=True)
Also, the score method in sklearn takes (X, y) as inputs, where X is the input features of shape (n_samples, n_features) and y is the target labels; you need to change grid.score(y_predict, y_valid) to grid.score(X_valid, y_valid).
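Putting both fixes together, a sketch of the corrected cell (same pipeline and splits as in the question); note that with this scorer grid.score returns the negated MAE, so flip the sign before comparing with the 0.387 above:
grid = GridSearchCV(pipeline, param_grid=parameters,
                    scoring='neg_mean_absolute_error', cv=None, refit=True)
grid.fit(X_train, y_train)
print(-grid.best_score_)              # cross-validated MAE, averaged over folds
print(-grid.score(X_valid, y_valid))  # MAE on the held-out validation split
The cross-validated best_score_ will still differ slightly from the single hold-out MAE, because it averages over folds of the training data rather than using X_valid.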

How to compute "y_train_true, y_train_prob, y_test_true, y_test_prob"?

I have computed X_train, X_test, y_train and y_test, but I cannot compute y_train_true, y_train_prob, y_test_true and y_test_prob.
How can I compute y_train_true, y_train_prob, y_test_true and y_test_prob from the following code?
N.B.:
y_train_true: true binary labels (0 or 1) in the training dataset
y_train_prob: probability in the range [0, 1] predicted by the model for the training dataset
y_test_true: true binary labels (0 or 1) in the testing dataset
y_test_prob: probability in the range [0, 1] predicted by the model for the testing dataset
Code :
# Split test and train data
import numpy as np
from sklearn.model_selection import train_test_split
X = np.array(dataset.iloc[:, 1:10])  # .ix is deprecated; use .iloc
y = np.array(dataset['benign_malignant'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Define classifier
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
# knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
# Predicting the Test set results
y_pred = knn.predict(X_test)
Well, in your case y_train and y_test are already y_train_true and y_test_true. To get y_train_prob and y_test_prob you need a fitted model. I don't know which dataset you're using, but it seems to be a binary classification problem, so you can simply ask the KNN classifier you already trained (or any other probabilistic model, such as logistic regression) for its predicted probabilities:
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors = 5, metric = 'minkowski', p = 2)
knn.fit(X_train, y_train)
y_train_prob = knn.predict_proba(X_train)
y_test_prob = knn.predict_proba(X_test)
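One detail worth adding: predict_proba returns an array of shape (n_samples, n_classes), so if you want a single probability per sample you usually keep the positive-class column. A small sketch, assuming the positive class is encoded as 1:
# keep only the positive-class column: one probability per sample
y_train_prob = knn.predict_proba(X_train)[:, 1]
y_test_prob = knn.predict_proba(X_test)[:, 1]
# the "true" labels are simply the existing splits
y_train_true, y_test_true = y_train, y_test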

Large mean squared error in sklearn regressors

I'm a beginner in machine learning and I want to build a model to predict the price of houses. I prepared a dataset by crawling a local housing website; it consists of 1,000 samples and only 4 features (latitude, longitude, area and number of rooms).
I tried the RandomForestRegressor and LinearSVR models in sklearn, but I can't train the model properly and the MSE is super high.
The MSE is almost 90,000,000 (the true prices range between 5,000,000 and 900,000,000).
Here is my code:
import numpy as np
import pandas as pd
from sklearn.svm import LinearSVR
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import train_test_split, cross_val_score
df = pd.read_csv('dataset.csv', index_col=False)
X = df.drop('price', axis=1)
X_data = X.values
Y_data = df.price.values
X_train, X_test, Y_train, Y_test = train_test_split(X_data, Y_data, test_size=0.2, random_state=5)
rgr = RandomForestRegressor(n_estimators=100)
svr = LinearSVR()
rgr.fit(X_train, Y_train)
svr.fit(X_train, Y_train)
# 'neg_mean_squared_error' returns negated MSE, hence the sign flip below
MSEs = cross_val_score(estimator=rgr,
                       X=X_train,
                       y=Y_train,
                       scoring='neg_mean_squared_error',
                       cv=5)
MSEsSVR = cross_val_score(estimator=svr,
                          X=X_train,
                          y=Y_train,
                          scoring='neg_mean_squared_error',
                          cv=5)
MSEs *= -1
RMSEs = np.sqrt(MSEs)
print("Root mean squared error with 95% confidence interval:")
print("{:.3f} (+/- {:.3f})".format(RMSEs.mean(), RMSEs.std()*2))
print("")
Is the problem with my dataset and the number of features? How can I build a prediction model with this kind of dataset?
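With prices spanning roughly two orders of magnitude, one commonly suggested remedy is to fit on a log-transformed target and score with RMSE, which is on the same scale as the prices. A minimal sketch under that assumption, reusing X_train and Y_train from the code above (the 'neg_root_mean_squared_error' scorer requires a recent scikit-learn):
import numpy as np
from sklearn.compose import TransformedTargetRegressor
from sklearn.ensemble import RandomForestRegressor
from sklearn.model_selection import cross_val_score
# fit on log(price) but predict and score back on the original scale
log_rgr = TransformedTargetRegressor(regressor=RandomForestRegressor(n_estimators=100),
                                     func=np.log1p, inverse_func=np.expm1)
scores = cross_val_score(log_rgr, X_train, Y_train,
                         scoring='neg_root_mean_squared_error', cv=5)
print("RMSE: {:.0f} (+/- {:.0f})".format(-scores.mean(), scores.std() * 2))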
