I can't see the accuracy curve in my final graph, and I get warnings that precision/recall are ill-defined even though I don't see any 0's in the results.
I'm using this yeast data: https://archive.ics.uci.edu/ml/machine-learning-databases/yeast/yeast.data
I've tried making the whole set my training set by making train_frac=1.
import pandas as pd
import numpy as np
%matplotlib inline
from matplotlib import pyplot as plt
from sklearn.naive_bayes import GaussianNB
# Load the yeast data set. No header row is expected, so the column names
# are supplied explicitly.
# NOTE(review): the raw UCI yeast.data file appears to be whitespace-delimited;
# if "<my_dir>" points at that raw file, read_csv likely needs sep=r"\s+" — confirm.
df = pd.read_csv(
    "<my_dir>",
    names=['sample', 'mcg', 'gvh', 'alm', 'mit', 'erl', 'pox', 'vac', 'nuc', 'site'],
)
# Drop the 'sample' column so it is not used as a feature.
df = df.drop(columns=['sample'])

# Estimator, target column and training fraction used by the code below.
model_type = GaussianNB()
target = 'site'
train_frac = 0.5

Y = df[target]
df2 = df.drop(columns=[target])
# df2.columns has dtype='object' and holds every column except 'site'.
X = df[df2.columns[:]]
def naive_split(X, Y, n):
    """Split X and Y by position, without shuffling.

    The first ``n`` entries of each input become the training part and the
    remaining entries become the test part.

    Args:
        X: Sliceable collection of features (e.g. a DataFrame or list).
        Y: Sliceable collection of targets, aligned with ``X``.
        n: Number of leading entries to use for training.

    Returns:
        Tuple ``(X_train, X_test, Y_train, Y_test)``.
    """
    # Plain slicing keeps the original order, so any ordering present in the
    # data carries straight over into the split.
    return (X[:n], X[n:], Y[:n], Y[n:])
def train_model(n=int(train_frac*df.shape[0])):
    """Fit the module-level estimator on the first ``n`` rows of X and Y.

    Args:
        n: Number of leading rows used for training. The default is computed
            once, when this function is defined, from the module-level
            ``train_frac`` and ``df``; later changes to either do not affect it.

    Returns:
        Tuple ``(X_test, Y_test, clf)``: the held-out rows and the fitted
        estimator.
    """
    X_train, X_test, Y_train, Y_test = naive_split(X, Y, n)
    # model_type is a single shared estimator instance, so every call refits
    # that same object rather than creating a new one.
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    return (X_test, Y_test, clf)
X_test, Y_test, clf = train_model()

import sklearn.metrics as metrics
from sklearn import model_selection

# Test-set fractions from 0.98 downwards in steps of 0.02, so the training
# set grows from one iteration to the next.
sizes = np.arange(0.98, 0.01, -0.02)
result = {}
for size in sizes:
    X_train, X_test, Y_train, Y_test = model_selection.train_test_split(
        X, Y, test_size=size, random_state=200)
    clf = model_type
    clf = clf.fit(X_train, Y_train)
    score = clf.score(X_test, Y_test)
    # Predict once and reuse the result for both metrics.
    Y_pred = clf.predict(X_test)
    # The UndefinedMetricWarning messages are raised per label: a label that is
    # never predicted has no defined precision, and a label with no true samples
    # has no defined recall. Those per-label values are set to 0.0 before averaging.
    precision = metrics.precision_score(Y_test, Y_pred, average='weighted')
    recall = metrics.recall_score(Y_test, Y_pred, average='weighted')
    # Key by training-set size so the x-axis of the plot is the number of rows.
    result[len(Y_train)] = (score, precision, recall)

result = pd.DataFrame(result).transpose()
result.columns = ['Accuracy', 'Precision', 'Recall']
# Support-weighted recall is numerically the same as accuracy, so the Recall
# line is drawn exactly on top of the Accuracy line. Distinct line styles keep
# both curves visible.
result.plot(marker='*', figsize=(15, 5), style=['-', '--', ':'])
plt.title('Metrics measures using random train/test splitting')
plt.xlabel('Size of training set')
plt.ylabel('Value');
I get the following results when I expect it to run without error:
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1135: UndefinedMetricWarning: Precision is ill-defined and being set to 0.0 in labels with no predicted samples.'precision', 'predicted', average, warn_for)
C:\Users\<user>\AppData\Local\Continuum\anaconda3\lib\site-packages\sklearn\metrics\classification.py:1137: UndefinedMetricWarning: Recall is ill-defined and being set to 0.0 in labels with no true samples. 'recall', 'true', average, warn_for)
Related
How and where do I need to add code for setting maximal depths to 3 and 5 for this decision tree? I need to form 2 more Decision Trees with maximal depth 3 and 5.
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)

from sklearn import tree
# The depth limit is set through max_depth. To build trees of depth 3 and 5 as
# well, create further DecisionTreeClassifier instances with max_depth=3 and
# max_depth=5 and fit/evaluate each of them the same way as below.
clf = tree.DecisionTreeClassifier(max_depth=4, random_state=0)
clf = clf.fit(X_train, y_train)
y_pred = clf.predict(X_test)

from sklearn.metrics import confusion_matrix
# confusion_matrix expects (y_true, y_pred); rows are then the true classes.
cm_test = confusion_matrix(y_test, y_pred)
y_pred_train = clf.predict(X_train)
cm_train = confusion_matrix(y_train, y_pred_train)

# Accuracy is the sum of the diagonal divided by the number of samples.
# trace() sums the whole diagonal, so this also holds for more than two classes.
training_accuracy = cm_train.trace() / len(y_train)
print('Training Accuracy = ', training_accuracy)
testing_accuracy = cm_test.trace() / len(y_test)
print('Testing Accuracy = ', testing_accuracy)
I have tried to train a decision tree classifier with the dataset data.csv which contains 1500 datapoints and 107 columns with Column 107 as the target, and test the classifier on the dataset data_test.csv which contains 917 datapoints with 107 columns with Column 107 as the target. This is the code I have written
import pandas as pd
from sklearn.model_selection import train_test_split
from sklearn.tree import DecisionTreeClassifier
from sklearn.metrics import classification_report,confusion_matrix
test_data = pd.read_csv("data_test.csv")
Data = pd.read_csv("data.csv")
# Replace missing values with 0 in both files.
Data = Data.fillna(0)
test_data = test_data.fillna(0)

# Quick inspection; these bare expressions only display output in a notebook.
Data.head(10)
Data.shape
Data.describe()
Data.info()

# First 106 columns are the features, "Target (Col 107)" is the label.
X = Data.iloc[:, 0:106]
y = Data["Target (Col 107)"]
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=.30, random_state=1)

# Unrestricted tree.
dTree = DecisionTreeClassifier(criterion='gini', random_state=1)
dTree.fit(X_train, y_train)
print(dTree.score(X_train, y_train))
print(dTree.score(X_test, y_test))

# Depth-limited tree.
dTreeR = DecisionTreeClassifier(criterion='gini', max_depth=3, random_state=1)
dTreeR.fit(X_train, y_train)
print(dTreeR.score(X_train, y_train))
print(dTreeR.score(X_test, y_test))

# Evaluate on the external test file. Its predictions must be compared with
# that file's own labels: comparing them with y_test (the 30% split of
# data.csv) is what raised "inconsistent numbers of samples: [450, 917]".
# NOTE(review): assumes data_test.csv has the same column layout as data.csv,
# as described in the question — confirm.
X_external = test_data.iloc[:, 0:106]
y_external = test_data["Target (Col 107)"]
y_predict = dTreeR.predict(X_external)
cm = confusion_matrix(y_external, y_predict, labels=[0, 1])
print(cm)
print(classification_report(y_external, y_predict))
# To export the predictions, wrap y_predict in a pandas Series/DataFrame and
# call its to_csv method with the desired output path.
And after running this code, it gives the following error when executing the y_predict line
ValueError: Found input variables with inconsistent numbers of samples: [450, 917]
Please let me know where I am going wrong.
I also wanted to know how to export the prediction results of the decision tree to a csv file
Thanks!
# Part 1
decision_tree.fit(X_train, y_train)
# Y_val (capital Y) holds the model's predictions for the validation set;
# y_val (lower-case) holds the true validation labels.
Y_val = decision_tree.predict(X_val)
# score() takes features and true labels and returns the fraction predicted
# correctly, so no separate predict() call is needed beforehand.
acc_decision_tree_train = round(decision_tree.score(X_train, y_train) * 100, 2)
acc_decision_tree_train
# Part 2
acc_decision_tree_val = round(decision_tree.score(X_val, y_val) * 100, 2)
print('accuracy:', acc_decision_tree_val)
# Part 3
# Use the predictions computed in Part 1; the name Y_pred_val was never defined.
con_mat=confusion_matrix(y_val, Y_val)
sns.heatmap(con_mat,annot=True,annot_kws= {"size":20},cmap="viridis")
plt.show()
# Part 4
acc_decision_tree_test = round(decision_tree.score(X_test, y_test) * 100, 2)
print('accuracy:', acc_decision_tree_test)
Y_pred_test = decision_tree.predict(X_test)
There are 4 parts in the above code
Q1 -> Fit on train and predict on val.
In this step the model learns by fitting on the training data X_train, but we never call predict on the training set. So how can we get the accuracy score for train? (The model is only learning at that point, right?)
Q2 -> In Part 2, since we already ran "Y_val = decision_tree.predict(X_val)" above, we can calculate the validation score. Is this score the same as the accuracy derived from the confusion matrix?
Q3 -> Also, in Part 4 I only asked for the accuracy score on the test data and did not call 'predict' on the test data beforehand. How was it able to give me the score without predicting?
Please let me know if something is not clear & Thanks in advance :)
I adjusted the code snippet and I've got accuracies on iris dataset.
from sklearn.datasets import load_iris
from sklearn.model_selection import cross_val_score
from sklearn.tree import DecisionTreeClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
import matplotlib.pyplot as plt
import seaborn as sns
# Load iris, hold out 25% as the test set, then split 25% of the remainder
# off as a validation set.
iris = load_iris()
features, labels = iris.data, iris.target
X_train, X_test, y_train, y_test = train_test_split(
    features, labels, test_size=0.25, random_state=0
)
X_train, X_val, y_train, y_val = train_test_split(
    X_train, y_train, test_size=0.25, random_state=0
)

# Alternative evaluation, left disabled:
#score = cross_val_score(decision_tree, iris.data, iris.target, cv=10)
decision_tree = DecisionTreeClassifier(random_state=0)
decision_tree.fit(X_train, y_train)

# score() is given the features and the true labels, so the accuracy on the
# training data needs no explicit predict() call.
train_fraction_correct = decision_tree.score(X_train, y_train)
acc_decision_tree_train = round(train_fraction_correct * 100, 2)
print("acc_decision_tree_train ", acc_decision_tree_train)

# Validation accuracy, as a percentage.
val_fraction_correct = decision_tree.score(X_val, y_val)
acc_decision_tree_val = round(val_fraction_correct * 100, 2)
print('accuracy:', acc_decision_tree_val)

# Confusion matrix of true vs. predicted validation labels, drawn as a heatmap.
y_pred_val = decision_tree.predict(X_val)
con_mat = confusion_matrix(y_val, y_pred_val)
sns.heatmap(con_mat, annot=True, annot_kws={"size": 20}, cmap="viridis")
plt.show()

# Test accuracy, then the test predictions themselves.
test_fraction_correct = decision_tree.score(X_test, y_test)
acc_decision_tree_test = round(test_fraction_correct * 100, 2)
print('accuracy:', acc_decision_tree_test)
y_pred_test = decision_tree.predict(X_test)
Output:
acc_decision_tree_train 100.0
accuracy: 100.0
accuracy: 97.37
For example, Xs has 5 independent variables, and Ys has 5 dependent variables:
x_train, x_test, y_train, y_test = train_test_split(Xs, Ys, test_size=0.2, random_state=2)
model = lgb.LGBMRegressor()
# Ys has several target columns, so the regressor is wrapped in
# MultiOutputRegressor. The wrapper, not the bare model, must be fitted and
# scored; the original code built the wrapper and then never used it.
# NOTE(review): LGBMRegressor on its own presumably rejects a 2-D target — confirm.
wrapper = MultiOutputRegressor(model)
wrapper.fit(x_train, y_train)
wrapper.score(x_test, y_test)
Could only get the overall R2 through the code above, what if I want to check the R2 for each Y?
Is it possible?
Thanks
You can use scikit-learn r2_score with multioutput='raw_values':
from sklearn.datasets import make_regression
from sklearn.model_selection import train_test_split
from sklearn.multioutput import MultiOutputRegressor
from sklearn.metrics import r2_score
import lightgbm as lgb
# Synthetic regression problem: 1000 samples, 10 features, 5 targets.
X, Y = make_regression(n_samples=1000, n_features=10, n_targets=5, random_state=42)

# Hold out 20% of the samples for evaluation.
X_train, X_test, Y_train, Y_test = train_test_split(
    X, Y, test_size=0.2, random_state=42
)

# Wrap the LightGBM regressor so it can handle several targets, then fit.
base_regressor = lgb.LGBMRegressor()
model = MultiOutputRegressor(estimator=base_regressor)
model.fit(X_train, Y_train)

# Predictions for the held-out samples.
Y_pred = model.predict(X_test)

# R2 for each target separately, in a single call ...
per_target_r2 = r2_score(Y_test, Y_pred, multioutput='raw_values')
print(per_target_r2)
# [0.907924 0.925267 0.906492 0.939653 0.881619]

# ... and the same values computed column by column.
column_r2 = []
for i in range(Y_test.shape[1]):
    column_r2.append(r2_score(Y_test[:, i], Y_pred[:, i]))
print(column_r2)
# [0.907924, 0.925267, 0.906492, 0.939653, 0.881619]

# Overall R2: the estimator's own score ...
overall_r2 = model.score(X_test, Y_test)
print(overall_r2)
# 0.9121908184618046

# ... matches the uniform average of the per-target scores.
averaged_r2 = r2_score(Y_test, Y_pred, multioutput='uniform_average')
print(averaged_r2)
# 0.9121908184618046
I have computed X_train, X_test, y_train and y_test, but I cannot work out how to compute y_train_true, y_train_prob, y_test_true and y_test_prob.
How can I compute y_train_true, y_train_prob, y_test_true and y_test_prob from the following code?
X_train:
X_test:
y_train:
y_test:
N.B,
y_train_true: True binary labels (0 or 1) in the training dataset
y_train_prob: Probability in the range [0, 1] predicted by the model for the training dataset
y_test_true: True binary labels (0 or 1) in the testing dataset
y_test_prob: Probability in the range [0, 1] predicted by the model for the testing dataset
Code :
# Split test and train data
import numpy as np
from sklearn.model_selection import train_test_split
# DataFrame.ix has been removed from pandas; iloc selects the same columns by
# position (columns 1 through 9).
# NOTE(review): equivalent to the old .ix call as long as the column labels are
# not integers — confirm against the dataset.
X = np.array(dataset.iloc[:, 1:10])
y = np.array(dataset['benign_malignant'])
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=0)
# Define the classifier and fit it on the training split
from sklearn.neighbors import KNeighborsClassifier
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
# knn = KNeighborsClassifier(n_neighbors=11)
knn.fit(X_train, y_train)
# Predicting the Test set results (the original predicted on X_train, which
# contradicted this comment)
y_pred = knn.predict(X_test)
In your case, y_train and y_test already are y_train_true and y_test_true. To get y_train_prob and y_test_prob you need a fitted model that can output probabilities. I don't know which dataset you're using, but it looks like a binary classification problem, so you could use logistic regression or, as shown below, keep your k-nearest-neighbours classifier and call predict_proba:
from sklearn.neighbors import KNeighborsClassifier
# Build the k-nearest-neighbours classifier and fit it on the training split.
knn = KNeighborsClassifier(n_neighbors=5, metric='minkowski', p=2)
knn.fit(X_train, y_train)

# Class-probability estimates for the training and the test split.
# NOTE(review): predict_proba presumably returns one column per class; pick the
# positive-class column if a single probability per sample is needed — confirm.
y_train_prob, y_test_prob = (
    knn.predict_proba(split) for split in (X_train, X_test)
)