Why do some contour lines have two different labels? - python

Does my code have a bug or something else?
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.model_selection import train_test_split
from sklearn.linear_model import LinearRegression
file = 'https://aegis4048.github.io/downloads/notebooks/sample_data/unconv_MV_v5.csv'
myDF = pd.read_csv(file)
# Split the data into features and target
feature1 = "Brittle"
feature2 = "Por"
X = myDF[[feature1, feature2]] #.iloc[:, :-1].values # A NumPy array!
print("X.info():", X.info())
y = myDF["Prod"] #.iloc[:, -1].values
# Split the data into training and testing sets
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2)
# Create a linear regression object
reg = LinearRegression()
# Fit the model to the training data
reg.fit(X_train, y_train)
# Predict the target variable using the test data
y_pred = reg.predict(X_test)
# Evaluate the model using mean squared error (MSE)
mse = np.mean((y_test - y_pred)**2)
print("Mean Squared Error: ", mse)
print("R2 Score:", reg.score(X_test, y_test))
#define figure size in (width, height) for all plots
plt.rcParams['figure.figsize'] = [10, 7]
# Create a mesh of values for the features
print(X_train.shape) # NumPy array
x1_min, x1_max = X_train[feature1].min(), X_train[feature1].max()
x2_min, x2_max = X_train[feature2].min(), X_train[feature2].max()
x1, x2 = np.meshgrid(np.linspace(x1_min, x1_max, 100), np.linspace(x2_min, x2_max, 100))
X_mesh = np.c_[x1.ravel(), x2.ravel()]
# Compute the predictions for the mesh of values
y_pred_mesh = reg.predict(X_mesh).reshape(x1.shape)
# Plot the predictions as a surface. Request 10 contour lines.
contours = plt.contourf(x1, x2, y_pred_mesh, 10, cmap='coolwarm', alpha=0.8) # https://matplotlib.org/stable/api/_as_gen/matplotlib.pyplot.contourf.html
# Scatter plot of the training data.
# The colors of the points don't mean much except to stand out from the background
plt.scatter(X_train[feature1], X_train[feature2], c=y_train, cmap='coolwarm', s=20)
# Label the contour lines
plt.clabel(contours, inline=1, fontsize=12, colors = "black")
# Label the plot
plt.xlabel(feature1)
plt.ylabel(feature2)
plt.title('Multivariate Linear Regression Contour Plot')
# Show the plot
plt.show()
The output:

Related

How I can fix the error in: "Fit the PCA transformer on the training data and transform the data line" and "model = svm.SVC(kernel='linear')"

I have create this Support Vector machine model but I get errors in "Fit the PCA transformer on the training data and transform the data line" and "model = svm.SVC(kernel='linear')"
The first error is:
NameError: name 'x_train' is not defined
The second error is:
NameError: name 'svm' is not defined
import pandas as pd
import numpy as np
from sklearn.svm import SVC
from sklearn.model_selection import train_test_split
import seaborn as sns
import seaborn as sns # for data visualization
train_df = pd.read_csv('diabetes_data_upload.csv')
train_df.head()
# checking total of rows and columns
train_df.shape
# Transforming the Gender into 0 and 1
train_df["Gender"] = train_df["Gender"].map({"Male": 0, "Female": 1}).astype(int)
#Rounding the Age
train_df["Age"] = train_df.Age.round()
# Separating the data to predict the missing ages
X_train = train_df[train_df.Age.notnull()][['Age','Gender','weakness','Obesity', 'class']]
X_test = train_df[train_df.Age.isnull()][['Age','Gender','weakness','Obesity', 'class']]
y = train_df.Age.dropna()
# Just confirming if there is no more ages missing
train_df.Age.isnull().sum()
# Taking only the features that is important for now
X = train_df[['Gender', 'Age', 'weakness']]
# Taking the labels
Y = train_df['class']
# Spliting into 80% for training set and 20% for testing set so we can see our accuracy
X_train, x_test, Y_train, y_test = train_test_split(X, Y, test_size=0.2, random_state=0)
# Declaring the SVC with no tunning
print(X_train.shape)
print(Y_train.shape)
from sklearn.decomposition import PCA
# Create a PCA transformer with 3 components
pca = PCA(n_components=3)
# Fit the PCA transformer on the training data and transform the data
x_train_pca = pca.fit_transform(x_train)
# Transform the test data using the PCA transformer fitted on the training data
x_test_pca = pca.transform(x_test)
# Fit the classifier on the transformed training data
classifier.fit(x_train_pca, y_train)
# Predict the labels for the transformed test data
predictions = classifier.predict(x_test_pca)
# Calculate the accuracy of the model
accuracy = classifier.score(x_test_pca, y_test)
print("Accuracy:", accuracy)
#make it binary classification problem
X = X[np.logical_or(Y==0,Y==1)]
Y = Y[np.logical_or(Y==0,Y==1)]
model = svm.SVC(kernel='linear')
clf = model.fit(X, Y)
# The equation of the separating plane is given by all x so that np.dot(svc.coef_[0], x) + b = 0.
# Solve for w3 (z)
z = lambda x,y: (-clf.intercept_[0]-clf.coef_[0][0]*x -clf.coef_[0][1]*y) / clf.coef_[0][2]
tmp = np.linspace(-5,5,30)
x,y = np.meshgrid(tmp,tmp)
fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.plot3D(X[Y==0,0], X[Y==0,1], X[Y==0,2],'ob')
ax.plot3D(X[Y==1,0], X[Y==1,1], X[Y==1,2],'sr')
ax.plot_surface(x, y, z(x,y))
ax.view_init(30, 60)
plt.show()
The two errors you mention can be solved by the following -
1. NameError: name 'x_train' is not defined
This is because you are using x_train instead of the X_train variable you have defined right above. Remember, variables names are case sensitive.
x_train_pca = pca.fit_transform(x_train) # your code
x_train_pca = pca.fit_transform(X_train) # change it to this
2. NameError: name 'svm' is not defined
This is because you are already importing the SVC class using from svm import SVC. But while trying to instantiating the class the model you are using svm.SVC.
model = svm.SVC(kernel='linear') # your code
model = SVC(kernel='linear') # change it to this

How to create a precision-recall curve plot to compare 2 classifiers in Python?

My goal is to find the precision-recall curve, comparing with Logistic Regression and Random Forest and plotting them in one graph. I wanted to know if I used the right steps to create a plot to compare both classifiers.
I appreciate all the help!
Code:
from sklearn.preprocessing import MultiLabelBinarizer as mlb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn import metrics
from sklearn.linear_model import LogisticRegression
from sklearn.ensemble import RandomForestClassifier
from sklearn.model_selection import train_test_split
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import f1_score
from sklearn.metrics import auc
from matplotlib import pyplot
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, -1])
# y = pd.DataFrame(df.iloc[:, :-1])
# raw confusion matrix
df = pd.DataFrame(df, columns=["DIAGNOSIS_CD_Dummy", "TEST_RESULT_Dummy"])
confusion_matrix = pd.crosstab(
df["TEST_RESULT_Dummy"],
df["DIAGNOSIS_CD_Dummy"],
rownames=["Test Result"],
colnames=["Diagnosis"],
)
print(confusion_matrix)
# Logistic Regression Confusion Matrix
from sklearn.preprocessing import MultiLabelBinarizer as mlb
import numpy as np
import pandas as pd
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import confusion_matrix
from sklearn.metrics import classification_report
from sklearn.datasets import make_classification
from sklearn import metrics
# split into training and test using scikit
from sklearn.model_selection import train_test_split
X_train, X_test, y_train, y_test = train_test_split(
X, y.values.ravel(), test_size=0.3, random_state=1, stratify=y
)
log_model = LogisticRegression()
log_model.fit(X_train, y_train)
# use logistic regression model to make predictions
y_score = log_model.predict_proba(X_test)[:, 1]
y_pred = log_model.predict(X_test)
y_pred = np.round(y_pred)
confusion_matrix = confusion_matrix(y_test, y_pred)
print("\n")
print(confusion_matrix)
print("\n")
print(classification_report(y_test, y_pred, zero_division=0))
# calculate precision and recall
precision, recall, thresholds = precision_recall_curve(y_test, y_score)
# create precision recall curve
fig, ax = plt.subplots()
ax.plot(recall, precision, color="purple")
# add axis labels to plot
ax.set_title("Precision-Recall Curve")
ax.set_ylabel("Precision")
ax.set_xlabel("Recall")
# display plot
plt.show()
# precision-recall curve
# generate 2 class dataset
X = df[["DIAGNOSIS_CD_Dummy"]]
y = df[["TEST_RESULT_Dummy"]]
# X = pd.DataFrame(df.iloc[:, :-1])
# y = pd.DataFrame(df.iloc[:, -1])
# split into train/test sets
trainX, testX, trainy, testy = train_test_split(
X, y.values.ravel(), test_size=0.3, random_state=2
)
# fit a model
model = LogisticRegression(solver="lbfgs")
model.fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
# probs_rf = model_rf.predict_proba(testX)[:, 1]
# keep probabilities for the positive outcome only
lr_probs = lr_probs[:, 1]
# predict class values
yhat = model.predict(testX)
lr_precision, lr_recall, _ = precision_recall_curve(testy, lr_probs)
lr_f1, lr_auc = f1_score(testy, yhat), auc(lr_recall, lr_precision)
# precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
# f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
# auc_rf = auc(recall_rf, precision_rf)
# summarize scores
print("Logistic: f1=%.3f auc=%.3f" % (lr_f1, lr_auc))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Logistic")
plt.plot(lr_precision, lr_recall, label=f"AUC (Logistic Regression) = {lr_auc:.2f}")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
# Random Forest
model_rf = RandomForestClassifier()
model_rf.fit(trainX, trainy)
# model_rf = RandomForestClassifier().fit(trainX, trainy)
# predict probabilities
lr_probs = model.predict_proba(testX)
probs_rf = model_rf.predict_proba(testX)
# keep probabilities for the positive outcome only
probs_rf = probs_rf[:, 1]
# predict class values
yhat = model.predict(testX)
precision_rf, recall_rf, _ = precision_recall_curve(testy, probs_rf)
f1_rf, auc_rf = f1_score(testy, yhat), auc(recall_rf, precision_rf)
auc_rf = auc(recall_rf, precision_rf)
print("Random Forest: f1=%.3f auc=%.3f" % (f1_rf, auc_rf))
# plot the precision-recall curves
no_skill = len(testy[testy == 1]) / len(testy)
pyplot.plot([0, 1], [no_skill, no_skill], linestyle="--", label="No Skill")
pyplot.plot(lr_recall, lr_precision, marker=".", label="Random Forest")
plt.plot(recall_rf, precision_rf, label=f"AUC (Random Forests) = {auc_rf:.2f}")
# axis labels
pyplot.xlabel("Recall")
pyplot.ylabel("Precision")
# show the legend
pyplot.legend()
# show the plot
pyplot.show()
Output:
Diagnosis 0 1
Test Result
0 18385 32
1 1268 165
[[5514 11]
[ 374 56]]
precision recall f1-score support
0 0.94 1.00 0.97 5525
1 0.84 0.13 0.23 430
accuracy 0.94 5955
macro avg 0.89 0.56 0.60 5955
weighted avg 0.93 0.94 0.91 5955
Logistic: f1=0.193 auc=0.488
Random Forest: f1=0.193 auc=0.488
This is my attempt to plot it.
import pathlib
import numpy as np
import matplotlib.pyplot as plt
from matplotlib.ticker import (MultipleLocator, AutoMinorLocator)
from sklearn.metrics import PrecisionRecallDisplay
from sklearn.multiclass import OneVsRestClassifier
from sklearn.metrics import precision_recall_curve
from sklearn.metrics import average_precision_score
from itertools import cycle
from imblearn.pipeline import Pipeline
from sklearn.preprocessing import label_binarize
def __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs):
"""
Private function to be used by plot_precision_recall_curve for binary applications.
"""
if 'fig_size' in kwargs and 'dpi' in kwargs:
fig, ax = plt.subplots(figsize=kwargs['fig_size'], dpi=kwargs['dpi'])
else:
fig, ax = plt.subplots()
plt.rcParams["figure.facecolor"] = 'white'
plt.rcParams["axes.facecolor"] = 'white'
plt.rcParams["savefig.facecolor"] = 'white'
ax.xaxis.set_major_locator(MultipleLocator(0.1))
ax.xaxis.set_major_formatter('{x:.1f}')
ax.yaxis.set_major_locator(MultipleLocator(0.1))
ax.yaxis.set_major_formatter('{x:.1f}')
ax.xaxis.set_minor_locator(MultipleLocator(0.05))
ax.yaxis.set_minor_locator(MultipleLocator(0.05))
ax.tick_params(which='both', width=2)
ax.tick_params(which='major', length=7)
ax.tick_params(which='minor', length=4, color='black')
plt.grid(True, zorder=0)
plt.plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
label='Luck', alpha=.8, zorder=1) # random prediction curve
plt.plot([1, 1], [1, 0], c='k', linestyle='dashdot'), plt.plot([1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model") #perfect model prediction curve
f_scores = np.linspace(0.2, 0.8, num=4)
lines, labels = [], []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = plt.plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
plt.annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
zorder = 3
for classifier in args:
display = PrecisionRecallDisplay.from_estimator(classifier, X_test, y_test, ax=ax, zorder=zorder)
zorder +=1
# add the legend for the iso-f1 curves
handles, labels = display.ax_.get_legend_handles_labels()
handles.extend([l])
labels.extend(["iso-f1 curves"])
# set the legend and the axes
ax.set_xlim([0.0, 1.0])
ax.set_ylim([0.0, 1.05])
ax.legend(handles=handles, labels=labels, loc="best")
plt.xlabel('Recall', fontsize=18)
plt.ylabel('Precision', fontsize=18)
if 'title' in kwargs:
ax.set_title(kwargs['title'], fontsize=18)
else:
ax.set_title("Precision-Recall Curve", fontsize=18)
if 'save_fig_path' in kwargs:
path = pathlib.Path(kwargs['save_fig_path'])
path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
return fig, ax
def __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
"""
Private function designed to be used by plot_precision_recall_curve for multiclass applications.
"""
my_vals = y_test.unique().tolist()
my_vals.sort()
# binarize the y_test series
y_test = label_binarize(y_test, classes=my_vals)
n_classes = y_test.shape[1]
# setup plot details
colors = cycle(["navy", "turquoise", "darkorange", "cornflowerblue", "teal"])
if 'fig_size' in kwargs and 'dpi' in kwargs:
fig, ax = plt.subplots(len(args), figsize=kwargs['fig_size'], dpi=kwargs['dpi'], facecolor='white')
else:
fig, ax = plt.subplots(len(args), facecolor='white')
for count, clfs in enumerate(args):
ax[count].xaxis.set_major_locator(MultipleLocator(0.1))
ax[count].xaxis.set_major_formatter('{x:.1f}')
ax[count].yaxis.set_major_locator(MultipleLocator(0.1))
ax[count].yaxis.set_major_formatter('{x:.1f}')
ax[count].xaxis.set_minor_locator(MultipleLocator(0.05))
ax[count].yaxis.set_minor_locator(MultipleLocator(0.05))
ax[count].tick_params(which='both', width=2)
ax[count].tick_params(which='major', length=7)
ax[count].tick_params(which='minor', length=4, color='black')
ax[count].grid(True, zorder=0)
ax[count].plot([0, 1], [1, 0], linestyle='--', lw=1, color='k',
label='Luck', alpha=.8, zorder=1) # random prediction curve
ax[count].plot([1, 1], [1, 0], c='k', linestyle='dashdot'), ax[count].plot([1, 1], c='k', linestyle='dashdot', zorder=2, label="Perfect model") #perfect model prediction curve
# set up the model, wrapped by the OneVsRestClassifier
classifier = OneVsRestClassifier(clfs)
classifier.fit(X_train, y_train) # train the model
# produce the predictions (as probabilities)
y_score = classifier.predict_proba(X_test)
# For each class
precision = dict()
recall = dict()
average_precision = dict()
for i in range(n_classes):
precision[i], recall[i], _ = precision_recall_curve(y_test[:, i], y_score[:, i])
average_precision[i] = average_precision_score(y_test[:, i], y_score[:, i])
# A "micro-average": quantifying score on all classes jointly
precision["micro"], recall["micro"], _ = precision_recall_curve(
y_test.ravel(), y_score.ravel()
)
average_precision["micro"] = average_precision_score(y_test, y_score, average="micro")
f_scores = np.linspace(0.2, 0.8, num=4)
lines, labels = [], []
for f_score in f_scores:
x = np.linspace(0.01, 1)
y = f_score * x / (2 * x - f_score)
(l,) = ax[count].plot(x[y >= 0], y[y >= 0], color="gray", alpha=0.2)
ax[count].annotate("f1={0:0.1f}".format(f_score), xy=(0.9, y[45] + 0.02))
display = PrecisionRecallDisplay(
recall=recall["micro"],
precision=precision["micro"],
average_precision=average_precision["micro"],
)
display.plot(ax=ax[count], name="Micro-average precision-recall", color="gold")
for i, color in zip(range(n_classes), colors):
display = PrecisionRecallDisplay(
recall=recall[i],
precision=precision[i],
average_precision=average_precision[i],
)
display.plot(ax=ax[count], name=f"Precision-recall for class {i}", color=color)
# add the legend for the iso-f1 curves
handles, labels = display.ax_.get_legend_handles_labels()
handles.extend([l])
labels.extend(["iso-f1 curves"])
# set the legend and the axes
ax[count].set_xlim([0.0, 1.0])
ax[count].set_ylim([0.0, 1.05])
ax[count].legend(handles=handles, labels=labels, loc="best")
if type(clfs) == Pipeline:
estimator_name = str(type(clfs['clf'])).split(".")[-1][:-2]
else:
estimator_name = str(type(clfs)).split(".")[-1][:-2]
if 'title' in kwargs:
ax[count].set_title(kwargs['title'] + " - " + estimator_name, fontsize=18)
else:
ax[count].set_title("Precision-Recall Curve" + " - " + estimator_name, fontsize=18)
ax[count].set_xlabel('Recall', fontsize=18)
ax[count].set_ylabel('Precision', fontsize=18)
if 'save_fig_path' in kwargs:
path = pathlib.Path(kwargs['save_fig_path'])
path.parent.mkdir(parents=True, exist_ok=True)
fig.savefig(kwargs['save_fig_path'], dpi=kwargs['dpi'], facecolor=fig.get_facecolor(), edgecolor='none')
return fig, ax
def plot_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs):
"""
Plots precision recall curves for the given models
Parameters
----------
X_test : pandas.DataFrame of shape (n_samples, n_features)
Test values.
y_test : pandas.Series of shape (n_samples,)
Target values.
*args : estimators to plot precision and recall curves
estimator instance (either sklearn.Pipeline, imblearn.Pipeline or a classifier)
PRE-FITTED classifier or a PRE-FITTED Pipeline in which the last estimator is a classifier.
**kwargs : The following options are available with kwargs
fig_size : tuple
Size (inches) of the plot.
dpi : int, default = 100
Image DPI.
title : str
The title of the plot.
save_fig_path : str
Full path where to save the plot. Will generate the folders if they don't exist already.
Returns
-------
fig : Matplotlib.pyplot.Figure
Figure from matplotlib
ax : Matplotlib.pyplot.Axe
Axe object from matplotlib
Example Syntax
--------------
fig, ax = reporting.plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, catboost_classifier,
fig_size=(10,16), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")
"""
if (len(y_test.unique()) == 2):
fig, ax = __plot_binary_precision_recall_curve(X_test, y_test, *args, **kwargs)
else:
fig, ax = __plot_multiclass_precision_recall_curve(X_train, y_train, X_test, y_test, *args, **kwargs)
return fig, ax
Syntax and Output for Binary Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, xgboost_classifier,
fig_size=(10,8), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")
Syntax and Output for Multi-class Classification
fig, ax = plot_precision_recall_curve(X_train, y_train, X_test, y_test,
rf_pipe, catboost_classifier,
fig_size=(10,16), dpi=100,
title="Precision-Recall Curve",
save_fig_path="dir1/dir2/precision_recall_curve.png")

Python Machine Learning SGD Classification Error

# This set will get us started, but you will need to add
# others.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import SGDClassifier
from sklearn.model_selection import cross_val_predict
from sklearn.metrics import confusion_matrix
from sklearn.metrics import recall_score
from sklearn.metrics import precision_score
from sklearn.metrics import f1_score
from sklearn.preprocessing import label_binarize
from sklearn.metrics import plot_roc_curve
from sklearn.metrics import roc_curve, auc
from sklearn.metrics import roc_auc_score
from sklearn.datasets import load_wine
from sklearn.model_selection import train_test_split
from sklearn.metrics import confusion_matrix
from sklearn.metrics import plot_confusion_matrix
# We can now access the dataset
wine = load_wine()
X = wine.data
y = wine.target
nclasses = len(wine.target_names)
colors = "bry"
h = .02 # step size in the mesh
# Split the data into a training set and a test set - this is
# where your implementation will need to start. Maybe you
# will need to work with the train_test_split( ... ) function
X_train, X_test, y_train, y_test = train_test_split(X, y)
# Standardize
mean = X_train.mean(axis=0)
std = X_train.std(axis=0)
X_train = (X_train - mean) / std
# Next we can look at the cross validation. Remember we are
# selecting the two best features through this process.
# Take a look at tutorial 4 for an example implementation
best_performance = 0
best_mean_f1 = 0
best_f1 = 0
best_f2 = 0
# We will be testing every combination of pairs of features
for f1 in range(0,13):
for f2 in range(0,13):
# We want 2 features, not 1
if f1 == f2:
continue
features_idx_to_use = [f1,f2]
clf = SGDClassifier(alpha=0.001, max_iter=100, random_state=42)
clf.fit(X_train[:,features_idx_to_use], y_train)
# Return the predictions for the 3-Fold crossvalidation
y_predicted = cross_val_predict(clf, X_train[:,features_idx_to_use],y_train, cv=3)
# Construct the confusion matricies
conf_mat_train = confusion_matrix(y_train, y_predicted)
# Print out the recall, precision and F1 scores
# There will be a value for each class
# CV Train
print("CV Train:",f1,":",f2," - ", recall_score(y_train,y_predicted,average=None))
print("CV Train:",f1,":",f2," - ",precision_score(y_train,y_predicted,average=None))
print("CV Train:",f1,":",f2," - ",f1_score(y_train,y_predicted,average=None))
current_f1 = np.mean(f1_score(y_train,y_predicted,average=None))
if current_f1 > best_mean_f1:
best_f1 = f1
best_f2 = f2
best_mean_f1 = current_f1
best_clf = clf
fig, ax = plt.subplots()
disp = plot_confusion_matrix(best_clf, X_test[:,[best_f1, best_f2]], y_test,
display_labels=wine.target_names,
cmap=plt.cm.Blues,ax=ax)
ax.set_title('Testing')
# Once you have selected the best performing set of features
# in the cross-validation, we can test the best performing
# classifier
# Now we need to test our classifier using the test set.
# Recall that we standardised the data - we need to do the same with the stored
# Mean and standard deviation from the training set.
X_test = (X_test - mean) / std
y_test_predicted = best_clf.predict(X_test[:,[best_f1, best_f2]])
conf_mat_test = confusion_matrix(y_test, y_test_predicted)
# Test
print("Test:",recall_score(y_test,y_test_predicted,average=None))
print("Test:",precision_score(y_test,y_test_predicted,average=None))
print("Test:",f1_score(y_test,y_test_predicted,average=None))
y_score = best_clf.decision_function(X_test[:,[best_f1, best_f2]])
# Now we can plot a ROC curve and calculate the AUC
fpr = dict()
tpr = dict()
roc_auc = dict()
for i in range(nclasses):
fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
roc_auc[i] = auc(fpr, tpr)
# Once we have finished on the performance, we can plot the
# the classifier boundaries and test points. This one is
# letf to you ;)
h = .02 # step size in the mesh
# create a mesh to plot in
x_min, x_max = X_test[:, best_f1].min() - 1, X_test[:, best_f1].max() + 1
y_min, y_max = X_test[:, best_f2].min() - 1, X_test[:, best_f2].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = best_clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z,cmap=plt.get_cmap('plasma'))
fig, ax = plt.subplots(figsize=(10,10))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = best_clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
cs = ax.contourf(xx, yy, Z,cmap=plt.get_cmap('plasma'))
colors = ['cyan','orange']
# Plot also the training points
for i, color in zip(best_clf.classes_, colors):
idx = np.where(y_test == i)
ax.scatter(X_test[idx, best_f1], X_test[idx, best_f2], c=color, label=wine.target_names[i],
cmap=plt.get_cmap('plasma'), edgecolor='black', s=20)
ax.set_title("Decision surface of Binary SGD")
ax.axis('tight')
xmin, xmax = plt.xlim()
ymin, ymax = plt.ylim()
coef = best_clf.coef_
intercept = best_clf.intercept_
# Lets make a function to plot the hyperplanes used by the SVM for
# Classification.
def plot_hyperplane(c, color):
def line(x0):
return (-(x0 * coef[c, 0]) - intercept[c]) / coef[c, 1]
plt.plot([xmin, xmax], [line(xmin), line(xmax)],
ls="--", color=color)
plot_hyperplane(0, "red")
My error is
Traceback (most recent call last):
File "C:\Users\Doug.spyder-py3\temp.py", line 120, in
fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\utils\validation.py", line 63, in inner_f
return f(*args, **kwargs)
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 913, in roc_curve
fps, tps, thresholds = _binary_clf_curve(
File "C:\Users\Doug\anaconda3\lib\site-packages\sklearn\metrics_ranking.py", line 691, in _binary_clf_curve
raise ValueError("{0} format is not supported".format(y_type))
ValueError: multiclass format is not supported
And the problem line is fpr[i], tpr[i], _ = roc_curve(y_test, y_score)
Can someone explain what it is I am missing please? I know it's got something to do with the kind of data that I am inputting but I am not sure what?
First, as you do not use fpr and tpr later, you can merge roc_curve and auc into roc_auc_score which you already imported.
See the docs.
If you are performing multiclass classification (more than two classes), you need to provide the multi_class parameter.
ovo and ovr give different results; see here.
Maybe relevant:
ROC for multiclass classification
How to fix ValueError: multiclass format is not supported

Scatter Plot of predicted vs actual value with regression curve

I am trying to use scatter plots with regression curves using the following code. I am using different algorithms like Linear regression, SVM and Gaussian Process etc. I have tried different options for plotting the data mentioned below
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.preprocessing import StandardScaler
from sklearn.svm import SVR
from sklearn.gaussian_process import GaussianProcessRegressor
df=pd.read_excel(coded.xlsx)
dfnew=df[['FL','FW','TL','LL','KH']]
Y = df['KH']
X = df[['FL']]
X=X.values.reshape(len(X),1)
Y=Y.values.reshape(len(Y),1)
# Split the data into training/testing sets
X_train = X[:-270]
X_test = X[-270:]
# Split the targets into training/testing sets
Y_train = Y[:-270]
Y_test = Y[-270:]
#regressor = SVR(kernel = 'rbf')
#regressor.fit(X_train, np.ravel(Y_train))
#training the algorithm
regressor = GaussianProcessRegressor(random_state=42)
regressor.fit(X_train, Y_train)
y_pred = regressor.predict(X_test)
mse = np.sum((y_pred - Y_test)**2)
# root mean squared error
# m is the number of training examples
rmse = np.sqrt(mse/270)
print(rmse)
#X_grid = np.arange(min(X), max(X), 0.01) #this step required because data is feature scaled.
#X_grid = np.arange(0, 15, 0.01) #this step required because data is feature scaled.
#X_grid = X_grid.reshape((len(X_grid), 1))
#plt.scatter(X, Y, color = 'red')
print('size of Y_train = {0}'.format(Y_train.size))
print('size of y_pred = {0}'.format(y_pred.size))
#plt.scatter(Y_train, y_pred, color = 'red')
#plt.plot(X_grid, regressor.predict(X_grid), color = 'blue')
#plt.title('GPR')
#plt.xlabel('Measured')
#plt.ylabel('Predicted')
#plt.show()
fig, ax = plt.subplots(1, figsize=(12, 6))
plt.plot(X[:, 0], Y_train, marker='o', color='black', linewidth=0)
plt.plot(X[:, 0], y_pred, marker='x', color='steelblue')
plt.suptitle("$GaussianProcessRegressor(kernel=RBF)$ [default]", fontsize=20)
plt.axis('off')
pass
But I am getting error like:
ValueError: x and y must have same first dimension, but have shapes (540,) and (270, 1)
What is the possible solution?
This code splits X and Y into training/testing sets, but then tries to plot a column from all of X with Y_train and y_pred, which have only half as many values as X. Try creating plots with X_train and X_test instead.

Matplotlib is graphing a 'zig zag' line when trying to graph polynomial

My line in matplotlib is the correct shape, however, it is made up of zig zagging lines.
I've tried restarting and graphing the same equation on desmos. The equation on desmos looks exactly how I want it to. I think this is a matplotlib issue.
#imports
import numpy as np
import pandas as pd
import seaborn as sns; sns.set() # just makes your plots look prettier run 'pip install seaborn'
import matplotlib.pyplot as plt
from IPython.core.pylabtools import figsize
figsize(15, 7)
from sklearn.model_selection import train_test_split
x_train, x_test, y_train, y_test = train_test_split(X, Y, test_size=0.3)
noise = np.random.randn(100)
x = np.linspace(-2,2, 100)
y = x + noise + np.random.randn()*2 + x**2
plt.scatter(x, y); plt.show()
#pre processing
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3)
#initializing m and b variables
current_z_val = 0.1
current_m_val = 0.1
current_b_val = 0.1
#setting # of iterations
iterations = 5
#calculating length of examples for functions used below
n = len(x_train)
#learning rate
learning_rate = 0.01
#plot the data and estimates
plt.scatter(x_train,y_train)
plt.title("Example data and hypothesis lines")
plt.xlabel('X Axis')
plt.ylabel('Y Axis')
cost_history = []
#main graident descent loop
for i in range(iterations):
#creating the hypothesis using y=z^2 + mx+b form
y_hypothesis = (current_z_val * (x_train**2)) + (current_m_val * x_train) + current_b_val
#calculating the derivatives from the image embedded above in code
z_deriv = -(2/n)*sum(y_train-y_hypothesis)
m_deriv = -(2/n)*sum(x_train*(y_train-y_hypothesis))
b_deriv = -(2/n)*sum(y_train-y_hypothesis)
#updating m and b values
current_z_val = current_z_val - (learning_rate * z_deriv)
current_m_val = current_m_val - (learning_rate * m_deriv)
current_b_val = current_b_val - (learning_rate * b_deriv)
#calculate the cost (error) of the model
cost = (1/n)*sum(y_train-y_hypothesis)**2
cost_history.append(cost)
#print the m and b values
#print("iteration {}, cost {}, m {}, b {}".format(i,cost,current_m_val,current_b_val))
plt.plot(x_train,y_hypothesis)
plt.show()
#plot the final graph
plt.plot(range(1,len(cost_history)+1),cost_history)
plt.title("Cost at each iteration")
plt.xlabel('Iterations')
plt.ylabel('MSE')
plt.show()
This is what a graph looks like on my plot. And this is what it should look like.
matplotlib plots the point following their order in the list, not their "natural" order given by their magnitude.
I think you should sort x_train before computing y_hypothesis in order to get the function you expect to have.
Note that this is happening in both plt.scatter() and plt.plot(), but you see it only in the latter because while connecting the dots with plt.plot() you actually see the sequence.
The function train_test_split will randomly select xtrain and xtest, due to which your x will be shuffled. Matplotlib will not be able to plot a line if your x is not in order.
Use shuffle=False in the following line. That should do that plot right.
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.3, shuffle=False)

Categories

Resources