I am trying to use linear and polynomial regression for the data.
import numpy as np
import matplotlib.pyplot as plt
from sklearn import linear_model
from sklearn.model_selection import train_test_split
from sklearn.preprocessing import PolynomialFeatures
from sklearn.pipeline import make_pipeline
def f(x):
    return np.sin(2 * np.pi * x)
x = np.random.uniform(0, 1, size=100)[:, np.newaxis]
y = f(x) + np.random.normal(scale=0.3, size=100)[:, np.newaxis]
x_train, x_test, y_train, y_test = train_test_split(x, y, test_size=0.5, random_state=0)
poly_model = make_pipeline(PolynomialFeatures(degree=2), linear_model.LinearRegression())
poly_model.fit(x_train, y_train)
linear_model_1 = linear_model.LinearRegression()
linear_model_1.fit(x_train, y_train)
fig = plt.figure()
ax = plt.axes()
ax.set(xlabel='X', ylabel='Y', title='X vs Y')
ax.scatter(x,y, alpha=0.5, cmap='viridis')
ax.plot(x_test, linear_model_1.predict(x_test), color='green', label='linear')
ax.plot(x_test, poly_model.predict(x_test), color='red', label='poly')
ax.legend()
With the above code, I am receiving this image:
But as you can see, the polynomial regression is not right.
I tried different approaches (not using make_pipeline, etc.) but with no success.
If I've understood you correctly, just sort your x_test before passing it to the predict() function, and increase the degree of the polynomial to 3:
poly_model = make_pipeline(PolynomialFeatures(degree=3), linear_model.LinearRegression())
and
x_test.sort(axis=0)
With these adjustments I'm getting the following plot:
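For reference, a minimal sketch of the adjusted fitting and plotting code, reusing the variable names from the question (the sort only affects how the line is drawn, not the fit itself):

poly_model = make_pipeline(PolynomialFeatures(degree=3), linear_model.LinearRegression())
poly_model.fit(x_train, y_train)

x_test.sort(axis=0)   # sort so the line is drawn from left to right instead of zig-zagging
ax.plot(x_test, poly_model.predict(x_test), color='red', label='poly (degree 3)')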
1) You can just call plot twice; each call adds a new line to the existing plot, e.g.:
ax.plot(x_test, model1.predict(x_test), color='red', linewidth=2)
ax.plot(x_test, model2.predict(x_test), color='green', linewidth=2)
In your case I'd do something like this:
from sklearn.pipeline import Pipeline

lin_model = linear_model.LinearRegression(fit_intercept=False)
poly_model = Pipeline([('poly', PolynomialFeatures(degree=2)),
                       ('linear', linear_model.LinearRegression(fit_intercept=False))])
lin_model.fit(x_train, y_train)
poly_model.fit(x_train, y_train)
And then:
ax.plot(x_test, lin_model.predict(x_test), color='red', linewidth=2, label='linear')
ax.plot(x_test, poly_model.predict(x_test), color='green', linewidth=2, label='poly')
ax.legend()
I have an assignment in which I need to compare my own multi-class logistic regression and the built-in SKlearn one.
As part of it, I need to plot the decision boundaries of each, on the same figure (for 2, 3, and 4 classes separately).
This is my model's decision boundaries for 3 classes:
Made with this code:
x1_min, x1_max = X[:,0].min()-.5, X[:,0].max()+.5
x2_min, x2_max = X[:,1].min()-.5, X[:,1].max()+.5
xx1, xx2 = np.meshgrid(np.linspace(x1_min, x1_max), np.linspace(x2_min, x2_max))
grid = np.c_[xx1.ravel(), xx2.ravel()]
for i in range(len(ws)):
    probs = ol.predict_prob(grid, ws[i]).reshape(xx1.shape)
    plt.contour(xx1, xx2, probs, [0.5], linewidths=1, colors='green')
where
ol is my own logistic regression model
ws are the current weights
That's how I tried to plot the Sklearn boundaries:
for i in range(len(clf.coef_)):
    w = clf.coef_[i]
    a = -w[0] / w[1]
    xx = np.linspace(x1_min, x1_max)
    yy = a * xx - (clf.intercept_[0]) / w[1]
    plt.plot(xx, yy, 'k-')
Resulting
I understand that it's due to the 1dim vs 2dim grids, but I can't understand how to solve it.
I also tried to use the built-in DecisionBoundaryDisplay, but I couldn't figure out how to plot it together with my boundaries, and it doesn't plot only the lines: the whole background is painted in the corresponding class color.
A couple of fixes:
Change clf.intercept_[0] to clf.intercept_[i], so each boundary line uses its own intercept.
If the x and y limits of the plot look strange, you can constrain them:
ax.set_xlim([x1_min, x1_max])
ax.set_ylim([x2_min, x2_max])
MRE:
import matplotlib.pyplot as plt
import numpy as np
from sklearn.datasets import make_blobs
from sklearn.linear_model import LogisticRegression
X, y = make_blobs(n_features=2, centers=3, random_state=42)
fig, ax = plt.subplots(1, 2)
x1_min, x1_max = X[:,0].min()-.5, X[:,0].max()+.5
x2_min, x2_max = X[:,1].min()-.5, X[:,1].max()+.5
def draw_coef_lines(clf, X, y, ax, title):
    for i in range(len(clf.coef_)):
        w = clf.coef_[i]
        a = -w[0] / w[1]
        xx = np.linspace(x1_min, x1_max)
        yy = a * xx - (clf.intercept_[i]) / w[1]
        ax.plot(xx, yy, 'k-')
    ax.scatter(X[:, 0], X[:, 1], c=y)
    ax.set_xlim([x1_min, x1_max])
    ax.set_ylim([x2_min, x2_max])
    ax.set_title(title)
clf1 = LogisticRegression().fit(X, y)
clf2 = LogisticRegression(multi_class="ovr").fit(X, y)
draw_coef_lines(clf1, X, y, ax[0], "Multinomial")
draw_coef_lines(clf2, X, y, ax[1], "OneVsRest")
plt.show()
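Regarding DecisionBoundaryDisplay: it can draw only the boundary lines, with no filled background, by plotting the class predictions with plot_method="contour". A minimal sketch (assuming scikit-learn >= 1.1 and the clf1, X, y, ax objects from the MRE above), used in place of a draw_coef_lines call:

from sklearn.inspection import DecisionBoundaryDisplay

# Contour lines of the predicted class labels mark the boundaries;
# with plot_method="contour" no filled background is drawn.
DecisionBoundaryDisplay.from_estimator(
    clf1, X, response_method="predict",
    plot_method="contour", colors="black", ax=ax[0])
ax[0].scatter(X[:, 0], X[:, 1], c=y)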
I'm trying to plot a curve fit line in a 3D scatterplot but I keep getting an error when trying to plot the line of best fit that is predicted by the model. I have to use the following x_fit and y_fit data to compute z_fit by invoking the model's predict() method. The points are x,y,z.
Small sample of the data:
x = [-3.319120, 8.812980, -19.995425, -7.906697,-14.129764]
y = [-2.070820, 8.055306, -19.407131, -7.559838,-12.762732]
z = [-12.762732, 109.527857, 4867.408412, -149.039706,1493.048153]
This is my code:
x = df['x']
y = df['y']
z = df['z']
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
features = np.array(df[['x','y']])
response = np.array(df[['z']])
poly_features= PolynomialFeatures(degree=3, include_bias=False)
x_poly = poly_features.fit_transform(features, response)
features[0]
x_poly[0]
#Train Linear Regression Model
from sklearn.linear_model import LinearRegression
model = LinearRegression()
model.fit(x_poly, response)
# Plot Curve Fit
x_fit = np.linspace(-21,21,1000).reshape(-1,1)
y_fit = x_fit
x_fit_poly = poly_features.fit_transform(x_fit, y_fit )
z_fit = model.predict(x_fit_poly)
ax=fig.add_subplot(2,2,1, projection='3d')
ax.scatter3D(x,y,z, c=z, cmap='jet')
ax.set_xlabel('x',c='r', size=12)
ax.set_ylabel('y',c='r', size=12)
ax.set_zlabel('z',c='r', size=12)
#plot the line
ax.plot3D(x_fit,y_fit, z_fit, 'black')
ax.view_init(0,90)
Output:
ValueError Traceback (most recent call last)
Input In [119], in <cell line: 6>()
3 y_fit = x_fit
5 x_fit_poly = poly_features.fit_transform(x_fit, y_fit )
----> 6 z_fit = model.predict(x_fit_poly)
ValueError: X has 3 features, but LinearRegression is expecting 9 features as input.
Expected output:
Any suggestions would be appreciated. Thank you!
fit_transform() is for the training data (it fits the transformer on ([x, y], z) and transforms it in one step), but later you only need transform([x, y]) to apply the transformation without fitting again.
x_fit = np.linspace(-21, 21, 1000).reshape(-1,1)
y_fit = x_fit
arr = np.hstack([x_fit, y_fit])
#print('arr:', arr.shape) # (1000, 2)
x_fit_poly = poly_features.transform(arr)
z_fit = model.predict(x_fit_poly)
This should resolve the ValueError: X has 3 features problem, but you may still have problems with the next lines of code.
Full code for test:
import numpy as np
import pandas as pd
from sklearn.preprocessing import PolynomialFeatures
from sklearn.linear_model import LinearRegression
import matplotlib.pyplot as plt
df = pd.DataFrame({
    'x': [-3.319120, 8.812980, -19.995425, -7.906697, -14.129764],
    'y': [-2.070820, 8.055306, -19.407131, -7.559838, -12.762732],
    'z': [-12.762732, 109.527857, 4867.408412, -149.039706, 1493.048153],
})
features = np.array(df[['x','y']])
response = np.array(df[['z']])
print('features:', features.shape)
print('response:', response.shape)
poly_features = PolynomialFeatures(degree=3, include_bias=False)
x_poly = poly_features.fit_transform(features, response)
model = LinearRegression()
model.fit(x_poly, response)
x_fit = np.linspace(-21, 21, 1000).reshape(-1, 1)
y_fit = x_fit
arr = np.hstack([x_fit, y_fit])
print('arr:', arr.shape)
print('transform')
x_fit_poly = poly_features.transform(arr)
print('predict')
z_fit = model.predict(x_fit_poly)
print('z_fit:', z_fit.shape)
x = df['x']
y = df['y']
z = df['z']
fig = plt.figure()
ax = fig.add_subplot(1, 1, 1, projection='3d')
ax.scatter3D(x, y, z, c=z, cmap='jet')
ax.set_xlabel('x', c='r', size=12)
ax.set_ylabel('y', c='r', size=12)
ax.set_zlabel('z', c='r', size=12)
#plot the line
ax.plot3D(x_fit.flatten(), y_fit.flatten(), z_fit.flatten(), 'black')
ax.view_init(0, 90)
plt.show()
I want to build a chart similar to this
I have created a bar chart, and I have the logistic regression completed.
#imports
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
plt.bar(prob_df['diff'], prob_df['full_win_prob'])
plt.show()
#logistic regression
X = dfx['home_diff'].values
y = dfx['away_win'].values
X = X.reshape(-1, 1)
logreg = LogisticRegression()
logreg.fit(X, y)
print(logreg.intercept_, logreg.coef_)
[-0.67032214] [[0.04948131]] #results
I have the chart and I have the model, but I can't figure out how to plot the model on top of the chart. It's a bit frustrating; I'm sure the answer is simple. I would prefer an answer in matplotlib, but seaborn is also OK.
You can use yy = logreg.predict_proba(XX)[:,1] to plot the logistic curve for an array of x-values. logreg.predict_proba(XX)[:,0] gives the inverted curve, the probability of being 0. logreg.predict(XX) gives the predictions, i.e. the logistic curve rounded to 0 or 1.
Here is an example starting from some generated test data.
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
import numpy as np
np.random.seed(123)
# logistic regression
X = np.random.uniform(1, 9, 20) # dfx['home_diff'].values
y = np.random.choice([0, 1], 20, p=[0.8, 0.2])
y = np.where(X < 5, y, 1 - y) # dfx['away_win'].values
X = X.reshape(-1, 1)
plt.scatter(X[:, 0], y, color='black', label='Given values')
logreg = LogisticRegression()
logreg.fit(X, y)
XX = np.linspace(0, 10, 1000).reshape(-1, 1)
prediction = logreg.predict(XX)
probability_0, probability_1 = logreg.predict_proba(XX).T
plt.plot(XX[:, 0], prediction, color='limegreen', lw=2, alpha=0.7, label='Predicted values')
plt.plot(XX[:, 0], probability_0, color='crimson', lw=2, alpha=0.7, ls='--', label='Probability of being 0')
plt.plot(XX[:, 0], probability_1, color='deepskyblue', lw=2, alpha=0.7, label='Probability of being 1')
plt.legend()
Thanks to JohanC's help I was able to get it. Clearly my answer is heavily inspired by his posting. I adjusted the code a bit so it displays the model over the bar graph rather than the scatterplot. I tried to upvote you but I don't have enough karma.
Here is what I used and the result:
# logistic regression
X = dfx['home_diff'].values
y = dfx['away_win'].values
X = X.reshape(-1, 1)
logreg = LogisticRegression()
logreg.fit(X, y)
prediction = logreg.predict(X)
probability_0, probability_1 = logreg.predict_proba(X).T
plt.plot(X[:, 0], probability_1, color='blue', lw=2, alpha=0.6, label='Probability of being 1')
plt.bar(prob_df['diff'], prob_df['full_win_prob'], color='orange')
plt.legend()
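One note: plotting probability_1 against the raw, unsorted X can produce a jagged line, because plot connects points in the order they appear. A minimal sketch of a fix, reusing the fitted logreg and the X array from the snippet above and a dense, already-sorted grid:

import numpy as np

XX = np.linspace(X.min(), X.max(), 500).reshape(-1, 1)   # dense grid, already sorted
probability_1 = logreg.predict_proba(XX)[:, 1]
plt.plot(XX[:, 0], probability_1, color='blue', lw=2, alpha=0.6, label='Probability of being 1')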
Sorry if this is a very simple question, but I'm a newcomer to the field.
My specific question is this: I have trained an XGBoost classifier in Python. After training, how can I get the samples in my training data that are closer than a fixed value to the model's decision boundary?
Thanks
I don't think xgboost has a built-in method for that, or that there is a closed-form formula like there is for SVC. This visualization could help, though, for 2D feature spaces:
import numpy as np
import xgboost as xgb
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
from sklearn.datasets import make_moons
from sklearn.model_selection import train_test_split

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])
    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.4, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())
    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0], y=X[y == cl, 1],
                    alpha=0.8, c=[cmap(idx)],
                    marker=markers[idx], label=cl)
    # highlight test samples
    if test_idx is not None:
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    facecolors='none',
                    edgecolor='black',
                    alpha=1.0,
                    linewidths=1,
                    marker='o',
                    s=55, label='test set')
X, y = make_moons(noise=0.3, random_state=0)
X_train, X_test, y_train, y_test = train_test_split(X, y, test_size=0.3)
xgb_clf = xgb.XGBClassifier()
xgb_clf = xgb_clf.fit(X_train, y_train)
plot_decision_regions(X_test, y_test, xgb_clf)
plt.show()
The plot_decision_regions function is from the Python Machine Learning book, available in its public GitHub repository.
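As for actually selecting training samples near the boundary: one pragmatic proxy (an assumption, not an exact geometric distance) is to keep the samples whose predicted probability of the positive class is close to 0.5 in the binary case. A minimal sketch using the xgb_clf and X_train from above, with a hypothetical cutoff:

import numpy as np

# "Closeness" to the decision boundary approximated by how close the
# predicted probability of the positive class is to 0.5.
proba = xgb_clf.predict_proba(X_train)[:, 1]
margin = np.abs(proba - 0.5)

threshold = 0.1                       # hypothetical cutoff for "close to the boundary"
near_idx = np.where(margin < threshold)[0]
near_samples = X_train[near_idx]
print(near_samples.shape)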
I basically want to add a colorbar at each of the subplots in the code below (link to code). My attempts add all the colorbars to the last subplot at the end of the loop.
print(__doc__)
import matplotlib.pyplot as plt
from sklearn.datasets import fetch_openml
from sklearn.neural_network import MLPClassifier
mnist = fetch_openml("mnist_784", version=1, as_frame=False)
# rescale the data, use the traditional train/test split
X, y = mnist.data / 255., mnist.target
X_train, X_test = X[:60000], X[60000:]
y_train, y_test = y[:60000], y[60000:]
# mlp = MLPClassifier(hidden_layer_sizes=(100, 100), max_iter=400, alpha=1e-4,
# solver='sgd', verbose=10, tol=1e-4, random_state=1)
mlp = MLPClassifier(hidden_layer_sizes=(50,), max_iter=10, alpha=1e-4,
solver='sgd', verbose=10, tol=1e-4, random_state=1,
learning_rate_init=.1)
mlp.fit(X_train, y_train)
print("Training set score: %f" % mlp.score(X_train, y_train))
print("Test set score: %f" % mlp.score(X_test, y_test))
fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin,
               vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())
plt.show()
UPDATE:
Based on the link in the comments below, here is the code that adds a single colorbar to the right of the figure:
fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    im = ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin, vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())
fig.subplots_adjust(right=0.8)
cbar_ax = fig.add_axes([0.85, 0.15, 0.05, 0.7])
fig.colorbar(im, cax=cbar_ax)
plt.show()
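If the goal is still one colorbar per subplot, as in the original question, a minimal sketch (assuming the same mlp from above) is to attach a colorbar to each Axes inside the loop with fig.colorbar(im, ax=ax):

fig, axes = plt.subplots(4, 4)
# use global min / max to ensure all weights are shown on the same scale
vmin, vmax = mlp.coefs_[0].min(), mlp.coefs_[0].max()
for coef, ax in zip(mlp.coefs_[0].T, axes.ravel()):
    im = ax.matshow(coef.reshape(28, 28), cmap=plt.cm.gray, vmin=.5 * vmin, vmax=.5 * vmax)
    ax.set_xticks(())
    ax.set_yticks(())
    fig.colorbar(im, ax=ax, fraction=0.046, pad=0.04)  # one colorbar per subplot
plt.show()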