To plot a learning curve for a regression problem, we typically use RMSE as the evaluation metric, like this:
import numpy as np
import matplotlib.pyplot as plt
from sklearn import metrics

def plot_learning_curves(model, X_train, y_train, X_val, y_val):
    plt.figure(figsize=(15, 5))
    train_errors, val_errors = [], []
    for m in range(5, len(X_train)):
        # Refit on the first m training samples and record both errors
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(metrics.mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(metrics.mean_squared_error(y_val, y_val_predict))
    # RMSE is the square root of the stored MSE values
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=1, label="training data")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=1, label="validation data")
    plt.legend(loc="upper right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("RMSE", fontsize=10)
    plt.title("Learning Curves")
    plt.show()
However, I would like to have a learning curve for a classification problem, and I know that accuracy should be used as the metric instead of RMSE.
So far, I have only found code snippets that solve this with sklearn's learning_curve function, like:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.model_selection import learning_curve
from sklearn.model_selection import ShuffleSplit

def plot_learning_curve(estimator, X, y, ax=None, cv=None, n_jobs=4, train_sizes=np.linspace(.1, 1.0, 5)):
    if ax is None:
        ax = plt.gca()  # fall back to the current axes if none is passed
    train_sizes, train_scores, test_scores = learning_curve(
        estimator, X, y, cv=cv, n_jobs=n_jobs, train_sizes=train_sizes)
    train_scores_mean = np.mean(train_scores, axis=1)
    train_scores_std = np.std(train_scores, axis=1)
    test_scores_mean = np.mean(test_scores, axis=1)
    test_scores_std = np.std(test_scores, axis=1)
    # Plot learning curve
    ax.set_xlabel("Training examples")
    ax.set_ylabel("Score")
    ax.plot(train_sizes, train_scores_mean, 'o-', color="r", label="Training score")
    ax.plot(train_sizes, test_scores_mean, 'o-', color="g", label="Cross-validation score")
    ax.legend(loc="best")
    return plt
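A typical invocation of this helper (a quick sketch, assuming a synthetic dataset from make_classification and a LogisticRegression classifier, neither of which is in the original snippet):

from sklearn.datasets import make_classification
from sklearn.linear_model import LogisticRegression

X, y = make_classification(n_samples=500, random_state=0)
cv = ShuffleSplit(n_splits=5, test_size=0.2, random_state=0)  # uses the ShuffleSplit imported above
plot_learning_curve(LogisticRegression(max_iter=1000), X, y, cv=cv).show()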
However, as you can see, this solution requires passing the whole X and y, and it uses cross-validation internally to split them into train and test sets.
But I have already split my dataset and applied a lot of preprocessing to X_train and X_test.
So I intend to use my own train and test sets, just like in the code I wrote for the regression problem based on the RMSE metric, without using sklearn's learning_curve function.
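One way to do that is to adapt the regression function above, swapping RMSE for accuracy (a minimal sketch, assuming the model exposes the usual fit/predict API and that X_train, y_train, X_val, y_val are the pre-split, preprocessed arrays):

import matplotlib.pyplot as plt
from sklearn.metrics import accuracy_score

def plot_accuracy_learning_curves(model, X_train, y_train, X_val, y_val):
    train_acc, val_acc = [], []
    # Start at a slice size where every class is likely represented;
    # fitting a classifier on a single-class slice raises an error
    for m in range(10, len(X_train)):
        model.fit(X_train[:m], y_train[:m])
        train_acc.append(accuracy_score(y_train[:m], model.predict(X_train[:m])))
        val_acc.append(accuracy_score(y_val, model.predict(X_val)))
    plt.plot(train_acc, "r-+", linewidth=1, label="training data")
    plt.plot(val_acc, "b-", linewidth=1, label="validation data")
    plt.legend(loc="lower right", fontsize=10)
    plt.xlabel("Size", fontsize=10)
    plt.ylabel("Accuracy", fontsize=10)
    plt.title("Learning Curves")
    plt.show()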
Related
I have a problem: I want to plot my RMSE values, but I now use a pipeline because of cross-validation and other steps like feature selection.
My question is: is there a way to get this plot through the pipeline, without training the model a second time? How can I display the training and validation RMSE nicely in a diagram when using the pipeline?
Pipeline
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn.feature_selection import SelectFromModel
from sklearn.linear_model import LogisticRegression, Lasso
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.pipeline import Pipeline

dfListingsFeature_regression = pd.read_csv(r"https://raw.githubusercontent.com/Coderanker3/dataset4/main/listings_cleaned.csv")

d = {True: 1, False: 0, np.nan: np.nan}
dfListingsFeature_regression['host_is_superhost'] = dfListingsFeature_regression[
    'host_is_superhost'].map(d).astype('int')

X = dfListingsFeature_regression.drop(columns=['host_id', 'id', 'price'])  # Features
y = dfListingsFeature_regression['price']  # Target variable
print(dfListingsFeature_regression.shape)

steps = [('feature_selection', SelectFromModel(estimator=LogisticRegression(max_iter=10000))),
         ('lasso', Lasso(alpha=0.4))]

pipeline = Pipeline(steps)

X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.2, random_state=30)

parameters = {}
grid = GridSearchCV(pipeline, param_grid=parameters, cv=5)
grid.fit(X_train, y_train)

print("score = %3.2f" % (grid.score(X_test, y_test)))
print('Training set score: ' + str(grid.score(X_train, y_train)))
print('Test set score: ' + str(grid.score(X_test, y_test)))

y_pred = grid.predict(X_test)
print("RMSE Val:", metrics.mean_squared_error(y_test, y_pred, squared=False))

y_train_predict = grid.predict(X_train)
print("Train:", metrics.mean_squared_error(y_train, y_train_predict, squared=False))

r2 = metrics.r2_score(y_test, y_pred)
print(r2)
Plot
import numpy as np
import matplotlib.pyplot as plt
from sklearn.metrics import mean_squared_error
from sklearn.model_selection import train_test_split

def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        model.fit(X_train[:m], y_train[:m])
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
%%time
lin_reg = Lasso(alpha=0.1)
plot_learning_curves(lin_reg, X, y)
#plt.axis([0, 80, 0, 3])
plt.show()
You don't have to fit() your model again in plot_learning_curves. You can simply use your fitted pipeline to predict values for both the train and validation sets and then plot your learning curve.
Your function should look as follows, without the model.fit():
def plot_learning_curves(model, X, y):
    X_train, X_val, y_train, y_val = train_test_split(X, y, test_size=0.2, random_state=10)
    train_errors, val_errors = [], []
    for m in range(1, 500 + 1):
        y_train_predict = model.predict(X_train[:m])
        y_val_predict = model.predict(X_val)
        train_errors.append(mean_squared_error(y_train[:m], y_train_predict))
        val_errors.append(mean_squared_error(y_val, y_val_predict))
    plt.figure(figsize=(10, 10))
    plt.plot(np.sqrt(train_errors), "r-+", linewidth=2, label="train")
    plt.plot(np.sqrt(val_errors), "b-", linewidth=3, label="val")
    plt.legend(loc="upper right", fontsize=14)
    plt.xlabel("Training set size", fontsize=14)
    plt.ylabel("RMSE", fontsize=14)
Then call this function, passing your fitted pipeline as the model argument.
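For example (a sketch, assuming grid is the fitted GridSearchCV from the pipeline section above, so that grid.best_estimator_ is an already-fitted pipeline):

# grid was already fitted by grid.fit(X_train, y_train), so the function
# above only calls predict() and never refits the pipeline
plot_learning_curves(grid.best_estimator_, X, y)
plt.show()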
I don't have a software background, but I am learning regression techniques to predict motor data.
I have 3D data, for which I have used multivariate regression.
The result is fine, but now I want to visualize the best-fit plane for this data.
The following is code I copied and pasted from different sites to try to visualize my data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

# df3 is assumed to be loaded elsewhere
X_final = df3[['Ampere', 'Voltage']]
y_final = df3[['ReactivePower']].copy()  # copy column data into y_final

X_final = X_final.dropna()
y_final = y_final.dropna()

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=0)

lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# print scores
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print('lr train score %.3f, lr test score: %.3f' % (
    lr.score(X_train, y_train),
    lr.score(X_test, y_test)))

# Visualize the data for multiple linear regression
x_surf, y_surf = np.meshgrid(np.linspace(df3.Voltage.min(), df3.Voltage.max()),
                             np.linspace(df3.Ampere.min(), df3.Ampere.max()))
# Note: reshaping a random sample of training predictions onto the grid
# does not give the regression plane -- this is the source of the problem
y_train_pred_random = y_train_pred[np.random.choice(y_train_pred.shape[0], 2500, replace=False), :]
y_train_pred_random = np.array(y_train_pred_random)
y_train_pred1 = y_train_pred_random.reshape(x_surf.shape)

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df3['Voltage'], df3['Ampere'], df3['ReactivePower'], c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf, y_surf, y_train_pred1, rstride=1, cstride=1, color='b', alpha=0.3)
ax.set_xlabel('Voltage')
ax.set_ylabel('Ampere')
ax.set_zlabel('Reactive Power')
plt.show()
When I run the code for the visualization, I get the following graph.
Please help.
Yeah, I solved it myself with some references found online.
Here is the code:
# Test/train split, multivariate
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

X_final = df3[['Ampere', 'Voltage']]
y_final = df3[['ReactivePower']].copy()  # copy column data into y_final

X_final = X_final.dropna()
y_final = y_final.dropna()

X_train, X_test, y_train, y_test = train_test_split(X_final, y_final, test_size=0.33, random_state=0)

lr = LinearRegression().fit(X_train, y_train)
y_train_pred = lr.predict(X_train)
y_test_pred = lr.predict(X_test)

# print scores
print("lr.coef_: {}".format(lr.coef_))
print("lr.intercept_: {}".format(lr.intercept_))
print('lr train score %.3f, lr test score: %.3f' % (
    lr.score(X_train, y_train),
    lr.score(X_test, y_test)))

# Visualize the data for multiple linear regression:
# evaluate the fitted plane z = c0*x + c1*y + b on a grid
x_surf, y_surf = np.meshgrid(np.linspace(df3.Ampere.min(), df3.Ampere.max()),
                             np.linspace(df3.Voltage.min(), df3.Voltage.max()))
z_surf = lr.coef_[0, 0] * x_surf + lr.coef_[0, 1] * y_surf + lr.intercept_

fig = plt.figure()
ax = fig.add_subplot(111, projection='3d')
ax.scatter(df3['Ampere'], df3['Voltage'], df3['ReactivePower'], c='red', marker='o', alpha=0.5)
ax.plot_surface(x_surf, y_surf, z_surf, rstride=1, cstride=1, color='b', alpha=0.3)
ax.set_xlabel('Ampere')
ax.set_ylabel('Voltage')
ax.set_zlabel('Reactive Power')
plt.show()
Here is the plot:
Thanks!
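As a side note, an equivalent way to build the surface is to let the fitted model do the arithmetic, which stays correct even if the number of features changes. A sketch, assuming the same lr, df3, x_surf, and y_surf as above:

import pandas as pd

# Flatten the grid into feature rows (same column order as training:
# Ampere, Voltage), predict, then reshape back to the grid shape
grid_points = pd.DataFrame({'Ampere': x_surf.ravel(), 'Voltage': y_surf.ravel()})
z_surf = lr.predict(grid_points).reshape(x_surf.shape)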
I have a dataset with X = ['x', 'y'], the first two columns of my dataset, and data['class'] as the target.
But I don't know how to display a plot of the linear regression in this case, because I get the error "x and y must be the same size".
So how can I plot a linear regression and predict with this dataset?
Or should I take X as the first two columns of my dataset and the last column as the target?
Thanks so much for the help; here is my code below:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
from sklearn.model_selection import train_test_split

data = pd.read_csv('data.csv')
X = data[['x', 'y']]
data['class'] = np.where(data['class'] == 'P', 1, 0)
Y = data['class']

# This is where the error occurs: X has two columns but Y has one,
# so matplotlib complains that "x and y must be the same size"
plt.scatter(X, Y, color='blue')
plt.xlabel('x')
plt.ylabel('y')
plt.plot(X, Y, color='red', linewidth=2)
plt.show()

X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)
regressor = LinearRegression()
regressor.fit(X_train, y_train)
Based on the official documentation:
X_train, X_test, y_train, y_test = train_test_split(X, Y, test_size=0.4, random_state=0)

regressor = LinearRegression()
regressor.fit(X_train, y_train)
y_pred = regressor.predict(X_test)  # adding your prediction, this was missing

import matplotlib.pyplot as plt
import numpy as np

# Plot outputs. A 2-D plot can only show one feature on the x-axis,
# so select a single column (here 'x') instead of the full X_test
plt.scatter(X_test['x'], y_test, color='black')          # plot scatters
plt.plot(X_test['x'], y_pred, color='red', linewidth=2)  # plot line
plt.xticks(())
plt.yticks(())
plt.show()
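When X has more than one feature, another common option is to plot predicted against actual values instead of singling out one feature. A minimal sketch, reusing y_test and y_pred from above:

# Predicted vs. actual: a perfect model puts every point on the diagonal
plt.scatter(y_test, y_pred, color='blue')
plt.plot([y_test.min(), y_test.max()], [y_test.min(), y_test.max()], 'r--', linewidth=2)
plt.xlabel('Actual')
plt.ylabel('Predicted')
plt.show()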
I would greatly appreciate it if you could let me know how to plot a validation curve for class_weight. In fact, I tried the following code to do this task:
from sklearn.datasets import make_classification
from sklearn.model_selection import train_test_split, StratifiedKFold, validation_curve, GridSearchCV
from sklearn.preprocessing import StandardScaler
from sklearn.linear_model import LogisticRegression
from sklearn.pipeline import Pipeline
import numpy as np
import matplotlib.pyplot as plt

def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    train_mean = np.mean(train_scores, axis=1)
    train_std = np.std(train_scores, axis=1)
    test_mean = np.mean(test_scores, axis=1)
    test_std = np.std(test_scores, axis=1)
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Parameter value')
    plt.ylabel('F-measure')
    plt.legend(loc='best')
    plt.show()

if __name__ == '__main__':
    X, y = make_classification(n_classes=2, class_sep=2, weights=[0.9, 0.1], n_informative=3, n_redundant=1, flip_y=0,
                               n_features=20, n_clusters_per_class=1, n_samples=1000, random_state=10)
    X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

    st = StandardScaler()
    rg = LogisticRegression(class_weight={0: 1, 1: 6.5}, random_state=42, solver='saga', max_iter=100, n_jobs=-1)

    param_grid = {'clf__C': [0.001, 0.01, 0.1],
                  'clf__class_weight': [{0: 1, 1: 6}, {0: 1, 1: 5.5}]}

    pipeline = Pipeline(steps=[('scaler', st),
                               ('clf', rg)])

    # Recent scikit-learn requires shuffle=True when random_state is set
    cv = StratifiedKFold(n_splits=5, shuffle=True, random_state=42)
    rg_cv = GridSearchCV(pipeline, param_grid, cv=cv, scoring='f1')
    rg_cv.fit(X_train, y_train)

    plt.figure(figsize=(9, 6))
    param_range2 = [{0: 1, 1: 6}, {0: 1, 1: 4}, {0: 1, 1: 5.5}]
    train_scores, test_scores = validation_curve(
        estimator=rg_cv.best_estimator_, X=X_train, y=y_train, param_name="clf__class_weight",
        param_range=param_range2, cv=cv, scoring="f1", n_jobs=-1)
    plot_validation_curve(param_range2, train_scores, test_scores, title="Validation Curve for class_weight", alpha=0.1)
However, this error is reported, which is related to param_range2 in the last line:
TypeError: float() argument must be a string or a number, not 'dict'
Thanks in advance.
Best regards,
You have to decide what you want to plot on your x-axis. You do a parameter sweep over the different class weights. Since you fixed the weight for the first class to 1 and only modified the weight for class 2, I decided to plot the scores against the weight for class two.
I then sorted the weights in the plot function in ascending order so that you get nicely connected lines.
def plot_validation_curve(param_range, train_scores, test_scores, title, alpha=0.1):
    # extract the weight of class 1 from each dict and sort ascending
    param_range = [x[1] for x in param_range]
    sort_idx = np.argsort(param_range)
    param_range = np.array(param_range)[sort_idx]
    train_mean = np.mean(train_scores, axis=1)[sort_idx]
    train_std = np.std(train_scores, axis=1)[sort_idx]
    test_mean = np.mean(test_scores, axis=1)[sort_idx]
    test_std = np.std(test_scores, axis=1)[sort_idx]
    plt.plot(param_range, train_mean, label='train score', color='blue', marker='o')
    plt.fill_between(param_range, train_mean + train_std,
                     train_mean - train_std, color='blue', alpha=alpha)
    plt.plot(param_range, test_mean, label='test score', color='red', marker='o')
    plt.fill_between(param_range, test_mean + test_std, test_mean - test_std, color='red', alpha=alpha)
    plt.title(title)
    plt.grid(ls='--')
    plt.xlabel('Weight of class 2')
    plt.ylabel('Average values and standard deviation for F1-Score')
    plt.legend(loc='best')
    plt.show()
This results in the following plot
I fit a ridge regression with GridSearchCV but am having trouble using matplotlib to show the model's performance versus the regularization strength (alpha).
Could anyone please help?
My code:
from sklearn.datasets import fetch_california_housing
from sklearn.linear_model import Ridge
from sklearn.model_selection import train_test_split, GridSearchCV
import numpy as np

cal = fetch_california_housing()
X = cal.data
y = cal.target

X_train, X_test, y_train, y_test = train_test_split(X, y, random_state=0)

param_grid = {'alpha': np.logspace(-3, 3, 13)}
print(param_grid)

grid = GridSearchCV(Ridge(normalize=True), param_grid, cv=10)
grid.fit(X_train, y_train)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)

import matplotlib.pyplot as plt

alphas = np.logspace(-3, 3, 13)
# These two lines fail: grid.fit() returns the estimator, not scores
plt.semilogx(alphas, grid.fit(X_train, y_train), label='Train')
plt.semilogx(alphas, grid.fit(X_test, y_test), label='Test')
plt.legend(loc='lower left')
plt.ylim([0, 1.0])
plt.xlabel('alpha')
plt.ylabel('performance')
# the error I got was "ValueError: x and y must have same first dimension"
Basically, I want to see something like the following:
When plotting model selection performance resulting from a grid search, it is typical to plot the mean and standard deviation of the test and training scores across the cross-validation folds.
Care should also be taken to identify which scoring criterion the grid search uses to select the best model; for regression this is typically R-squared.
The grid search returns a dictionary (accessible through .cv_results_) containing the per-fold train/test scores as well as the time it took to train/test each fold, along with a summary of that data using the mean and the standard deviation.
P.S. In newer versions of scikit-learn you'll need to include return_train_score=True.
P.P.S. When using grid search, splitting the data into train/test sets is not necessary for model selection, because the grid search splits the data automatically (cv=10 means the data is split into 10 folds).
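For reference, a quick way to inspect everything the search records is to load cv_results_ into a DataFrame (a sketch, assuming pandas is installed and grid is a GridSearchCV fitted with return_train_score=True):

import pandas as pd

# Each row is one hyper-parameter combination; columns include per-fold
# scores, mean/std train and test scores, and fit/score times
results = pd.DataFrame(grid.cv_results_)
print(results[['param_alpha', 'mean_train_score', 'mean_test_score']])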
Given the above, I modified the code to:
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import Ridge
from sklearn.model_selection import GridSearchCV
from sklearn.datasets import fetch_california_housing
cal = fetch_california_housing()
X = cal.data
y = cal.target
param_grid = {'alpha': np.logspace(-3, 3, 13)}
print(param_grid)
# Note: Ridge's normalize parameter was removed in scikit-learn 1.2;
# on newer versions, use a Pipeline with StandardScaler instead
grid = GridSearchCV(Ridge(normalize=True), param_grid,
                    cv=10, return_train_score=True, scoring='r2')
grid.fit(X, y)
print("Best cross-validation score: {:.2f}".format(grid.best_score_))
print("Best parameters: ", grid.best_params_)
alphas = np.logspace(-3, 3, 13)
train_scores_mean = grid.cv_results_["mean_train_score"]
train_scores_std = grid.cv_results_["std_train_score"]
test_scores_mean = grid.cv_results_["mean_test_score"]
test_scores_std = grid.cv_results_["std_test_score"]
plt.figure()
plt.title('Model')
plt.xlabel('$\\alpha$ (alpha)')
plt.ylabel('Score')
# plot train scores
plt.semilogx(alphas, train_scores_mean, label='Mean Train score',
color='navy')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alphas,
train_scores_mean - train_scores_std,
train_scores_mean + train_scores_std,
alpha=0.2,
color='navy')
plt.semilogx(alphas, test_scores_mean,
label='Mean Test score', color='darkorange')
# create a shaded area between [mean - std, mean + std]
plt.gca().fill_between(alphas,
test_scores_mean - test_scores_std,
test_scores_mean + test_scores_std,
alpha=0.2,
color='darkorange')
plt.legend(loc='best')
plt.show()
The resulting figure is shown below
You should plot scores, not the result of grid.fit().
First of all, use return_train_score=True:
grid = GridSearchCV(Ridge(normalize=True), param_grid, cv=10, return_train_score=True)
Then, after fitting the model, plot it as follows:
plt.semilogx(alphas, grid.cv_results_['mean_train_score'], label='Train')
plt.semilogx(alphas, grid.cv_results_['mean_test_score'], label='Test')
plt.legend()
Result: