How to mask points below the line? - python

I was practicing with matplotlib.pyplot and getting used with masking data (np.ma.masked_where) point. Is there any math formula or way to mask data points below the line? Expected result:

Yes, check if the y-values are lower than the linear function of your x-values.
In your case it seems to be angle bisector of the first quadrant, so offset is 0 and slope is 1:
y < x
In general check for
y < m * x + t # with slope m and offset t
I.e. in your case simply
y.mask = y < x
plt.plot(x, y)
Example:
import numpy as np
import matplotlib.pyplot as plt
plt.style.use('ggplot')
fig = plt.figure()
np.random.seed(7) # prepare data
x = np.random.random(10)
y = np.random.random(10)
y = np.ma.masked_array(y)
# plot all values
plt.plot(x, y, 'o', ms=10, mec='k', mfc=(0,0,0,0), label = 'all points')
y.mask = y < x # mask values below angular bisector
plt.plot(x, y, 'bo', label = '$y \geq x$') # plot masked array
plt.plot((0, 1), (0, 1), 'b') # plot angular bisector
m = 3 # prepare the general case
t = -1
y.mask = y < m * x + t # mask values below linear function
plt.plot(x, y, 'rx', label = '$y \geq 3x - 1$') # plot masked array
plt.plot((0, 1), (m*0+t, m*1+t), 'r') # plot linear function
plt.ylim(0, 1)
fig.legend(ncol=3, loc='upper center')

points_X = [1,2,3,4,5,6,7,8,9,10] // your points_X data
points_Y = [1,2,3,4,5,6,7,8,9,10] // your points_Y data
new_points_X=[]
new_points_Y=[]
for i in range(len(points_X)):
if(points_Y[i] <= points_X[i]):
new_points_Y.append(points_Y[i])
new_points_X.append(points_X[i])
plot(new_points_X, new_points_Y)

Related

How to plot the equation for a semicircle

My half circle doesn't really look like how I expected.
Am I doing this right or am I missing something pretty big here ?
import math
import numpy as np
import matplotlib.pyplot as plt
coord_list = []
h = 0
k = 0
r = 6
for x in range((1 + h - r), (h + r - 1), 1):
y1 = k + math.sqrt(r**2 - (x-h)**2)
coord_list.append([x, y1])
for each in coord_list:
print(each)
data = np.array([coord_list])
x, y = data.T
figure = plt.scatter(x, y)
figure = plt.grid(color = 'green', linestyle = '--', linewidth = 0.2)
figure = plt.show()
Use np.linspace to create an array for the x values. Use many points to create the circle
Use np.sqrt to solve for an array, instead of looping through each value.
import numpy as np
import matplotlib.pyplot as plt
# function for semicircle
def semicircle(r, h, k):
x0 = h - r # determine x start
x1 = h + r # determine x finish
x = np.linspace(x0, x1, 10000) # many points to solve for y
# use numpy for array solving of the semicircle equation
y = k + np.sqrt(r**2 - (x - h)**2)
return x, y
x, y = semicircle(6, 0, 0) # function call
plt.scatter(x, y, s=3, c='turquoise') # plot
plt.gca().set_aspect('equal', adjustable='box') # set the plot aspect to be equal
Answering my own question.
Plotting the output of the code:
import math
import numpy as np
import matplotlib.pyplot as plt
coord_list = []
h = 0
k = 0
r = 6
for x in range((1 + h - r), (h + r - 1), 1):
y1 = k + math.sqrt(r**2 - (x-h)**2)
coord_list.append([x, y1])
for each in coord_list:
print(each)
data = np.array([coord_list])
x, y = data.T
figure = plt.scatter(x, y)
figure = plt.grid(color = 'green', linestyle = '--', linewidth = 0.2)
figure = plt.show()
[-5, 3.3166247903554]
[-4, 4.47213595499958]
[-3, 5.196152422706632]
[-2, 5.656854249492381]
[-1, 5.916079783099616]
[0, 6.0]
[1, 5.916079783099616]
[2, 5.656854249492381]
[3, 5.196152422706632]
[4, 4.47213595499958]
Looking at the output of the coordinates, it doesn't appear to be a circle.
But if we take our equation and our coordinates and graph them on this website we see that they are indeed a circle. It's an optical illusion that they aren't. Partially because the graph is not evenly displayed and also because plotting points in a range function with steps of 1 (line 13) doesn't plot points at equal arc length distances away from each other.

How to create a plot with a repeating color pattern?

For my report, I'm creating a special color plot in jupyter notebook. There are two parameters, x and y.
import numpy as np
x = np.arange(-1,1,0.1)
y = np.arange(1,11,1)
with which I compute a third quantity. Here is an example to demonstrate the concept:
values = []
for i in range(len(y)) :
z = y[i] * x**3
# in my case the value z represents phases of oscillators
# so I will transform the computed values to the intervall [0,2pi)
values.append(z)
values = np.array(values) % 2*np.pi
I'm plotting y vs x. For each y = 1,2,3,4... there will be a horizontal line with total length two. For example: The coordinate (0.5,8) stands for a single point on line 8 at position x = 0.5 and z(0.5,8) is its associated value.
Now I want to represent each point on all ten lines with a unique color that is determined by z(x,y). Since z(x,y) takes only values in [0,2pi) I need a color scheme that starts at zero (for example z=0 corresponds to blue). For increasing z the color continuously changes and in the end at 2pi it takes the same color again (so at z ~ 2pi it becomes blue again).
Does someone know how this can be done in python?
The kind of structure for x, y and z you need, is easier using a meshgrid. Also, to have a lot of x-values between -1 and 1, np.linspace(-1,1,N) divides the range in N even intervals.
Using meshgrid, z can be calculated in one line using numpy's vectorization. This runs much faster.
To set a repeating color, a cyclic colormap such as hsv can be used. There the last color is the same as the starting color.
import numpy as np
from matplotlib import pyplot as plt
x, y = np.meshgrid(np.linspace(-1,1,100), np.arange(1,11,1))
z = (y * x**3) % 2*np.pi
plt.scatter(x, y, c=z, s=6, cmap='hsv')
plt.yticks(range(1,11))
plt.show()
Alternatively, a symmetric colormap could be built taken the colors from and existing map and combining them with the same colors in reverse order.
import numpy as np
from matplotlib import pyplot as plt
import matplotlib.colors as mcolors
colors_orig = plt.cm.viridis_r(np.linspace(0, 1, 128))
# combine the colors with the reversed array and build a new colormap
colors = np.vstack((colors_orig, colors_orig[::-1]))
symcmap = mcolors.LinearSegmentedColormap.from_list('symcmap', colors)
x, y = np.meshgrid(np.linspace(-1,1,100), np.arange(1,11,1))
z = (y * x**3) % 2*np.pi
plt.scatter(x, y, c=z, s=6, cmap=symcmap)
plt.yticks(range(1,11))
plt.show()
Multicolored lines are somewhat more complicated than just scatter plots. The docs have an example using LineCollections. Here is the adapted code. Note that the line segments are colored using their start point, so make sure there are enough x values. Also, the x and y limits aren't set automatically any more.
The code also adds a colorbar to illustrate how the colors map to the z values. Some interesting code from Jake VanderPlas shows how to create ticks for multiples of π.
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.collections import LineCollection
# code from Jake VanderPlas
def format_func(value, tick_number):
# find number of multiples of pi/2
N = int(np.round(2 * value / np.pi))
if N == 0:
return "0"
elif N == 1:
return r"$\pi/2$"
elif N == 2:
return r"$\pi$"
elif N % 2 > 0:
return r"${0}\pi/2$".format(N)
else:
return r"${0}\pi$".format(N // 2)
x = np.linspace(-1, 1, 500)
y_max = 10
# Create a continuous norm to map from data points to colors
norm = plt.Normalize(0, 2 * np.pi)
for y in range(1, y_max + 1):
z = (y * x ** 3) % 2 * np.pi
y_array = y * np.ones_like(x)
points = np.array([x, y_array]).T.reshape(-1, 1, 2)
segments = np.concatenate([points[:-1], points[1:]], axis=1)
lc = LineCollection(segments, cmap='hsv', norm=norm)
lc.set_array(z) # Set the values used for colormapping
lc.set_linewidth(2)
line = plt.gca().add_collection(lc)
# plt.scatter(x, y_array, c=z, s=10, norm=norm, cmap='hsv')
cbar = plt.colorbar(line) # , ticks=[k*np.pi for k in np.arange(0, 2.001, 0.25)])
cbar.locator = plt.MultipleLocator(np.pi / 2)
cbar.minor_locator = plt.MultipleLocator(np.pi / 4)
cbar.formatter = plt.FuncFormatter(format_func)
cbar.ax.minorticks_on()
cbar.update_ticks()
plt.yticks(range(1, y_max + 1)) # one tick for every y
plt.xlim(x.min(), x.max()) # the LineCollection doesn't force the limits
plt.ylim(0.5, y_max + 0.5)
plt.show()

Generate Dataset from plot

Hello I have come across a problem where I need to generate dataset from a distribution given on a scatter plot where datapoints are mostly centred around the centre of the circle and also surrounded within particular radius of the circle.Any ideas of generating such datasets in python ?
One way of producing a distribution over a circular shape is to sample a one dimensional distribution and then stretch it over the 2 Pi circonference of a circle.
One could then decide to use a uniform or a normal distribution.
import matplotlib.pyplot as plt
import numpy as np
def dist(R=4., width=1., num=1000, uniform=True):
if uniform:
r = np.random.rand(num)*width+R
else:
r = np.random.normal(R, width, num)
phi = np.linspace(0,2.*np.pi, len(r))
x= r * np.sin(phi)
y = r* np.cos(phi)
return x,y
fig, ax = plt.subplots(ncols=2, figsize=(9,4))
ax[0].set_title("uniform")
x,y = dist()
ax[0].plot(x,y, linestyle="", marker="o", markersize="2")
x,y = dist(0,1.2, 400)
ax[0].plot(x,y, linestyle="", marker="o", markersize="2")
ax[1].set_title("normal")
x,y = dist(4,0.4, uniform=False)
ax[1].plot(x,y, linestyle="", marker="o", markersize="2")
x,y = dist(0,0.6, uniform=False)
ax[1].plot(x,y, linestyle="", marker="o", markersize="2")
for a in ax:
a.set_aspect("equal")
plt.show()
You can easily generalize random numbers with some distribution centered on a point, for example normal centered on the 0, 0.
x = np.random.normal(size=1000)
y = np.random.normal(size=1000)
plt.plot(x, y, 'o', alpha=0.6)
EDIT:
What we do is generate random points in polar coordinates. First we do a random for the angle (between 0 and 2 pi) and then we give the noise multiplying it by some random number.
n = 300
theta_out = np.random.uniform(low=0, high=2*np.pi, size=n)
noise_out = np.random.uniform(low=0.9, high=1.1, size=n)
x_out = np.cos(theta_out) * noise_out
y_out = np.sin(theta_out) * noise_out
theta_in = np.random.uniform(low=0, high=2*np.pi, size=n)
noise_in = np.random.uniform(low=0, high=0.5, size=n)
x_in = np.cos(theta_in) * noise_in
y_in = np.sin(theta_in) * noise_in
ax = plt.gca()
ax.set_aspect('equal')
plt.plot(x_out, y_out, 'o')
plt.plot(x_in, y_in, 'o')
Note that there is more density of points while the lower the radius.

How do you create a 3D surface plot with missing values matplotlib?

I am trying to create a 3D surface energy diagram where an x,y position on a grid contains an associated z level. The issue is that the grid is not uniform (ie, there is not a z component for every x,y position). Is there a way to refrain from plotting those values by calling them NaN in the corresponding position in the array?
Here is what I have tried so far:
import numpy as np
from matplotlib import pyplot as plt
from mpl_toolkits.mplot3d import Axes3D
import pylab
from matplotlib import cm
#Z levels
energ = np.array([0,3.5,1,-0.3,-1.5,-2,-3.4,-4.8])
#function for getting x,y associated z values?
def fun(x,y,array):
return array[x]
#arrays for grid
x = np.arange(0,7,0.5)
y = np.arange(0,7,0.5)
#create grid
X, Y = np.meshgrid(x,y)
zs = np.array([fun(x,y,energ) for x in zip(np.ravel(X))])
Z = zs.reshape(X.shape)
plt3d = plt.figure().gca(projection='3d')
#gradients now with respect to x and y, but ideally with respect to z only
Gx, Gz = np.gradient(X * Y)
G = (Gx ** 2 + Gz ** 2) ** .5 # gradient magnitude
N = G / G.max() # normalize 0..1
plt3d.plot_surface(X, Y, Z, rstride=1, cstride=1,
facecolors=cm.jet(N), edgecolor='k', linewidth=0, antialiased=False, shade=False)
plt.show()
I cannot post image here of this plot but if you run the code you will see it
But I would like to not plot certain x,y pairs, so the figure should triangle downward to the minimum. Can this be accomplished by using nan values? Also would like spacing between each level, to be connected by lines.
n = np.NAN
#energ represents the z levels, so the overall figure should look like a triangle.
energ = np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,2.6,n,2.97,n,2.6,n,2.97,n,2.6,n,3.58,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,1.09,n,1.23,n,1.09,n,1.23,n,1.7,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,-0.65,n,-0.28,n,-0.65,n,0.33,n,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,-2.16,n,-2.02,n,-1.55,n,n,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,n,-3.9,n,-2.92,n,n,n,n,n,],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,n,n,-4.8,n,n,n,n,n,n,]])
plt3d = plt.figure().gca(projection='3d')
Gx, Gz = np.gradient(X * energ) # gradients with respect to x and z
G = (Gx ** 2 + Gz ** 2) ** .5 # gradient magnitude
N = G / G.max() # normalize 0..1
x = np.arange(0,13,1)
y = np.arange(0,13,1)
X, Y = np.meshgrid(x,y)
#but the shapes don't seem to match up
plt3d.plot_surface(X, Y, energ, rstride=1, cstride=1,
facecolors=cm.jet(N), edgecolor='k',
linewidth=0, antialiased=False, shade=False
)
Using masked arrays generates the following error: local Python[7155] : void CGPathCloseSubpath(CGMutablePathRef): no current point.
n = np.NAN
energ = np.array([[0,0,0,0,0,0,0,0,0,0,0,0,0],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,2.6,n,2.97,n,2.6,n,2.97,n,2.6,n,3.58,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,1.09,n,1.23,n,1.09,n,1.23,n,1.7,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,-0.65,n,-0.28,n,-0.65,n,0.33,n,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,-2.16,n,-2.02,n,-1.55,n,n,n,n],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,n,-3.9,n,-2.92,n,n,n,n,n,],[n,n,n,n,n,n,n,n,n,n,n,n,n],[n,n,n,n,n,n,-4.8,n,n,n,n,n,n,]])
x = np.arange(0,13,1)
y = np.arange(0,13,1)
X, Y = np.meshgrid(x,y)
#create masked arrays
mX = ma.masked_array(X, mask=[[0,0,0,0,0,0,0,0,0,0,0,0,0],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,0,1,0,1,0,1,0,1,0,1,0,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,0,1,0,1,0,1,0,1,0,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,0,1,0,1,0,1,0,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,0,1,0,1,0,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,0,1,0,1,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,1,0,1,1,1,1,1,1]])
mY = ma.masked_array(Y, mask=[[0,0,0,0,0,0,0,0,0,0,0,0,0],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,0,1,0,1,0,1,0,1,0,1,0,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,0,1,0,1,0,1,0,1,0,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,0,1,0,1,0,1,0,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,0,1,0,1,0,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,0,1,0,1,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,1,0,1,1,1,1,1,1]])
m_energ = ma.masked_array(energ, mask=[[0,0,0,0,0,0,0,0,0,0,0,0,0],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,0,1,0,1,0,1,0,1,0,1,0,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,0,1,0,1,0,1,0,1,0,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,0,1,0,1,0,1,0,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,0,1,0,1,0,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,0,1,0,1,1,1,1,1],[1,1,1,1,1,1,1,1,1,1,1,1,1],[1,1,1,1,1,1,0,1,1,1,1,1,1]])
plt3d = plt.figure().gca(projection='3d')
plt3d.plot_surface(mX, mY, m_energ, rstride=1, cstride=1, edgecolor='k', linewidth=0, antialiased=False, shade=False)
plt.show()
I was playing around with the code from this forum post, and I was able to make the graph have missing values. You can try the code yourself! I got it to work using float("nan") for the missing values.
import plotly.graph_objects as go
import numpy as np
x = np.arange(0.1,1.1,0.1)
y = np.linspace(-np.pi,np.pi,10)
#print(x)
#print(y)
X,Y = np.meshgrid(x,y)
#print(X)
#print(Y)
result = []
for i,j in zip(X,Y):
result.append(np.log(i)+np.sin(j))
result[0][0] = float("nan")
upper_bound = np.array(result)+1
lower_bound = np.array(result)-1
fig = go.Figure(data=[
go.Surface(z=result),
go.Surface(z=upper_bound, showscale=False, opacity=0.3,colorscale='purp'),
go.Surface(z=lower_bound, showscale=False, opacity=0.3,colorscale='purp')])
fig.show()

Plotting a decision boundary separating 2 classes using Matplotlib's pyplot

I could really use a tip to help me plotting a decision boundary to separate to classes of data. I created some sample data (from a Gaussian distribution) via Python NumPy. In this case, every data point is a 2D coordinate, i.e., a 1 column vector consisting of 2 rows. E.g.,
[ 1
2 ]
Let's assume I have 2 classes, class1 and class2, and I created 100 data points for class1 and 100 data points for class2 via the code below (assigned to the variables x1_samples and x2_samples).
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T
When I plot the data points for each class, it would look like this:
Now, I came up with an equation for an decision boundary to separate both classes and would like to add it to the plot. However, I am not really sure how I can plot this function:
def decision_boundary(x_vec, mu_vec1, mu_vec2):
g1 = (x_vec-mu_vec1).T.dot((x_vec-mu_vec1))
g2 = 2*( (x_vec-mu_vec2).T.dot((x_vec-mu_vec2)) )
return g1 - g2
I would really appreciate any help!
EDIT:
Intuitively (If I did my math right) I would expect the decision boundary to look somewhat like this red line when I plot the function...
Your question is more complicated than a simple plot : you need to draw the contour which will maximize the inter-class distance. Fortunately it's a well-studied field, particularly for SVM machine learning.
The easiest method is to download the scikit-learn module, which provides a lot of cool methods to draw boundaries: scikit-learn: Support Vector Machines
Code :
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import scipy
from sklearn import svm
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T
fig = plt.figure()
plt.scatter(x1_samples[:,0],x1_samples[:,1], marker='+')
plt.scatter(x2_samples[:,0],x2_samples[:,1], c= 'green', marker='o')
X = np.concatenate((x1_samples,x2_samples), axis = 0)
Y = np.array([0]*100 + [1]*100)
C = 1.0 # SVM regularization parameter
clf = svm.SVC(kernel = 'linear', gamma=0.7, C=C )
clf.fit(X, Y)
Linear Plot
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
plt.plot(xx, yy, 'k-')
MultiLinear Plot
C = 1.0 # SVM regularization parameter
clf = svm.SVC(kernel = 'rbf', gamma=0.7, C=C )
clf.fit(X, Y)
h = .02 # step size in the mesh
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, m_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
Implementation
If you want to implement it yourself, you need to solve the following quadratic equation:
The Wikipedia article
Unfortunately, for non-linear boundaries like the one you draw, it's a difficult problem relying on a kernel trick but there isn't a clear cut solution.
Based on the way you've written decision_boundary you'll want to use the contour function, as Joe noted above. If you just want the boundary line, you can draw a single contour at the 0 level:
f, ax = plt.subplots(figsize=(7, 7))
c1, c2 = "#3366AA", "#AA3333"
ax.scatter(*x1_samples.T, c=c1, s=40)
ax.scatter(*x2_samples.T, c=c2, marker="D", s=40)
x_vec = np.linspace(*ax.get_xlim())
ax.contour(x_vec, x_vec,
decision_boundary(x_vec, mu_vec1, mu_vec2),
levels=[0], cmap="Greys_r")
Which makes:
Those were some great suggestions, thanks a lot for your help! I ended up solving the equation analytically and this is the solution I ended up with (I just want to post it for future reference:
# 2-category classification with random 2D-sample data
# from a multivariate normal distribution
import numpy as np
from matplotlib import pyplot as plt
def decision_boundary(x_1):
""" Calculates the x_2 value for plotting the decision boundary."""
return 4 - np.sqrt(-x_1**2 + 4*x_1 + 6 + np.log(16))
# Generating a Gaussion dataset:
# creating random vectors from the multivariate normal distribution
# given mean and covariance
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T # to 1-col vector
# Main scatter plot and plot annotation
f, ax = plt.subplots(figsize=(7, 7))
ax.scatter(x1_samples[:,0], x1_samples[:,1], marker='o', color='green', s=40, alpha=0.5)
ax.scatter(x2_samples[:,0], x2_samples[:,1], marker='^', color='blue', s=40, alpha=0.5)
plt.legend(['Class1 (w1)', 'Class2 (w2)'], loc='upper right')
plt.title('Densities of 2 classes with 25 bivariate random patterns each')
plt.ylabel('x2')
plt.xlabel('x1')
ftext = 'p(x|w1) ~ N(mu1=(0,0)^t, cov1=I)\np(x|w2) ~ N(mu2=(1,1)^t, cov2=I)'
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')
# Adding decision boundary to plot
x_1 = np.arange(-5, 5, 0.1)
bound = decision_boundary(x_1)
plt.plot(x_1, bound, 'r--', lw=3)
x_vec = np.linspace(*ax.get_xlim())
x_1 = np.arange(0, 100, 0.05)
plt.show()
And the code can be found here
EDIT:
I also have a convenience function for plotting decision regions for classifiers that implement a fit and predict method, e.g., the classifiers in scikit-learn, which is useful if the solution cannot be found analytically. A more detailed description how it works can be found here.
You can create your own equation for the boundary:
where you have to find the positions x0 and y0, as well as the constants ai and bi for the radius equation. So, you have 2*(n+1)+2 variables. Using scipy.optimize.leastsq is straightforward for this type of problem.
The code attached below builds the residual for the leastsq penalizing the points outsize the boundary. The result for your problem, obtained with:
x, y = find_boundary(x2_samples[:,0], x2_samples[:,1], n)
ax.plot(x, y, '-k', lw=2.)
x, y = find_boundary(x1_samples[:,0], x1_samples[:,1], n)
ax.plot(x, y, '--k', lw=2.)
using n=1:
using n=2:
usng n=5:
using n=7:
import numpy as np
from numpy import sin, cos, pi
from scipy.optimize import leastsq
def find_boundary(x, y, n, plot_pts=1000):
def sines(theta):
ans = np.array([sin(i*theta) for i in range(n+1)])
return ans
def cosines(theta):
ans = np.array([cos(i*theta) for i in range(n+1)])
return ans
def residual(params, x, y):
x0 = params[0]
y0 = params[1]
c = params[2:]
r_pts = ((x-x0)**2 + (y-y0)**2)**0.5
thetas = np.arctan2((y-y0), (x-x0))
m = np.vstack((sines(thetas), cosines(thetas))).T
r_bound = m.dot(c)
delta = r_pts - r_bound
delta[delta>0] *= 10
return delta
# initial guess for x0 and y0
x0 = x.mean()
y0 = y.mean()
params = np.zeros(2 + 2*(n+1))
params[0] = x0
params[1] = y0
params[2:] += 1000
popt, pcov = leastsq(residual, x0=params, args=(x, y),
ftol=1.e-12, xtol=1.e-12)
thetas = np.linspace(0, 2*pi, plot_pts)
m = np.vstack((sines(thetas), cosines(thetas))).T
c = np.array(popt[2:])
r_bound = m.dot(c)
x_bound = popt[0] + r_bound*cos(thetas)
y_bound = popt[1] + r_bound*sin(thetas)
return x_bound, y_bound
I like the mglearn library to draw decision boundaries. Here is one example from the book "Introduction to Machine Learning with Python" by A. Mueller:
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
ax.set_title("{} neighbor(s)".format(n_neighbors))
ax.set_xlabel("feature 0")
ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
If you want to use scikit learn, you can write your code like this:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# read data
data = pd.read_csv('ex2data1.txt', header=None)
X = data[[0,1]].values
y = data[2]
# use LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
# Coefficient of the features in the decision function. (from theta 1 to theta n)
parameters = log_reg.coef_[0]
# Intercept (a.k.a. bias) added to the decision function. (theta 0)
parameter0 = log_reg.intercept_
# Plotting the decision boundary
fig = plt.figure(figsize=(10,7))
x_values = [np.min(X[:, 1] -5 ), np.max(X[:, 1] +5 )]
# calcul y values
y_values = np.dot((-1./parameters[1]), (np.dot(parameters[0],x_values) + parameter0))
colors=['red' if l==0 else 'blue' for l in y]
plt.scatter(X[:, 0], X[:, 1], label='Logistics regression', color=colors)
plt.plot(x_values, y_values, label='Decision Boundary')
plt.show()
see: Building-a-Logistic-Regression-with-Scikit-learn
Just solved a very similar problem with a different approach (root finding) and wanted to post this alternative as answer here for future reference:
def discr_func(x, y, cov_mat, mu_vec):
"""
Calculates the value of the discriminant function for a dx1 dimensional
sample given covariance matrix and mean vector.
Keyword arguments:
x_vec: A dx1 dimensional numpy array representing the sample.
cov_mat: numpy array of the covariance matrix.
mu_vec: dx1 dimensional numpy array of the sample mean.
Returns a float value as result of the discriminant function.
"""
x_vec = np.array([[x],[y]])
W_i = (-1/2) * np.linalg.inv(cov_mat)
assert(W_i.shape[0] > 1 and W_i.shape[1] > 1), 'W_i must be a matrix'
w_i = np.linalg.inv(cov_mat).dot(mu_vec)
assert(w_i.shape[0] > 1 and w_i.shape[1] == 1), 'w_i must be a column vector'
omega_i_p1 = (((-1/2) * (mu_vec).T).dot(np.linalg.inv(cov_mat))).dot(mu_vec)
omega_i_p2 = (-1/2) * np.log(np.linalg.det(cov_mat))
omega_i = omega_i_p1 - omega_i_p2
assert(omega_i.shape == (1, 1)), 'omega_i must be a scalar'
g = ((x_vec.T).dot(W_i)).dot(x_vec) + (w_i.T).dot(x_vec) + omega_i
return float(g)
#g1 = discr_func(x, y, cov_mat=cov_mat1, mu_vec=mu_vec_1)
#g2 = discr_func(x, y, cov_mat=cov_mat2, mu_vec=mu_vec_2)
x_est50 = list(np.arange(-6, 6, 0.1))
y_est50 = []
for i in x_est50:
y_est50.append(scipy.optimize.bisect(lambda y: discr_func(i, y, cov_mat=cov_est_1, mu_vec=mu_est_1) - \
discr_func(i, y, cov_mat=cov_est_2, mu_vec=mu_est_2), -10,10))
y_est50 = [float(i) for i in y_est50]
Here is the result:
(blue the quadratic case, red the linear case (equal variances)
I know this question has been answered in a very thorough way analytically. I just wanted to share a possible 'hack' to the problem. It is unwieldy but gets the job done.
Start by building a mesh grid of the 2d area and then based on the classifier just build a class map of the entire space. Subsequently detect changes in the decision made row-wise and store the edges points in a list and scatter plot the points.
def disc(x): # returns the class of the point based on location x = [x,y]
temp = 0.5 + 0.5*np.sign(disc0(x)-disc1(x))
# disc0() and disc1() are the discriminant functions of the respective classes
return 0*temp + 1*(1-temp)
num = 200
a = np.linspace(-4,4,num)
b = np.linspace(-6,6,num)
X,Y = np.meshgrid(a,b)
def decColor(x,y):
temp = np.zeros((num,num))
print x.shape, np.size(x,axis=0)
for l in range(num):
for m in range(num):
p = np.array([x[l,m],y[l,m]])
#print p
temp[l,m] = disc(p)
return temp
boundColorMap = decColor(X,Y)
group = 0
boundary = []
for x in range(num):
group = boundColorMap[x,0]
for y in range(num):
if boundColorMap[x,y]!=group:
boundary.append([X[x,y],Y[x,y]])
group = boundColorMap[x,y]
boundary = np.array(boundary)
Sample Decision Boundary for a simple bivariate gaussian classifier
Given two bi-variate normal distributions, you can use Gaussian Discriminant Analysis (GDA) to come up with a decision boundary as the difference between the log of the 2 pdf's.
Here's a way to do it using scipy multivariate_normal (the code is not optimized):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from numpy.linalg import norm
from numpy.linalg import inv
from scipy.spatial.distance import mahalanobis
def normal_scatter(mean, cov, p):
size = 100
sigma_x = cov[0,0]
sigma_y = cov[1,1]
mu_x = mean[0]
mu_y = mean[1]
x_ps, y_ps = np.random.multivariate_normal(mean, cov, size).T
x,y = np.mgrid[mu_x-3*sigma_x:mu_x+3*sigma_x:1/size, mu_y-3*sigma_y:mu_y+3*sigma_y:1/size]
grid = np.empty(x.shape + (2,))
grid[:, :, 0] = x; grid[:, :, 1] = y
z = p*multivariate_normal.pdf(grid, mean, cov)
return x_ps, y_ps, x,y,z
# Dist 1
mu_1 = np.array([1, 1])
cov_1 = .5*np.array([[1, 0], [0, 1]])
p_1 = .5
x_ps, y_ps, x,y,z = normal_scatter(mu_1, cov_1, p_1)
plt.plot(x_ps,y_ps,'x')
plt.contour(x, y, z, cmap='Blues', levels=3)
# Dist 2
mu_2 = np.array([2, 1])
#cov_2 = np.array([[2, -1], [-1, 1]])
cov_2 = cov_1
p_2 = .5
x_ps, y_ps, x,y,z = normal_scatter(mu_2, cov_2, p_2)
plt.plot(x_ps,y_ps,'.')
plt.contour(x, y, z, cmap='Oranges', levels=3)
# Decision Boundary
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = np.log(p_1*multivariate_normal.pdf(X, mu_1, cov_1)) - np.log(p_2*multivariate_normal.pdf(X, mu_2, cov_2))
plt.contour(x, y, g, [0])
plt.grid()
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.plot([mu_1[0], mu_2[0]], [mu_1[1], mu_2[1]], 'k')
plt.show()
If p_1 != p_2, then you get non-linear boundary. The decision boundary is given by g above.
Then to plot the decision hyper-plane (line in 2D), you need to evaluate g for a 2D mesh, then get the contour which will give a separating line.
You can also assume to have equal co-variance matrices for both distributions, which will give a linear decision boundary. In this case, you can replace the calculation of g in the above code with the following:
W = inv(cov_1).dot(mu_1-mu_2)
x_0 = 1/2*(mu_1+mu_2) - cov_1.dot(np.log(p_1/p_2)).dot((mu_1-mu_2)/mahalanobis(mu_1, mu_2, cov_1))
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = (X-x_0).dot(W)
i use this method from this book python-machine-learning-2nd.pdf URL
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
# setup marker generator and color map
markers = ('s', 'x', 'o', '^', 'v')
colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
cmap = ListedColormap(colors[:len(np.unique(y))])
# plot the decision surface
x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
np.arange(x2_min, x2_max, resolution))
Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
Z = Z.reshape(xx1.shape)
plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
plt.xlim(xx1.min(), xx1.max())
plt.ylim(xx2.min(), xx2.max())
for idx, cl in enumerate(np.unique(y)):
plt.scatter(x=X[y == cl, 0],
y=X[y == cl, 1],
alpha=0.8,
c=colors[idx],
marker=markers[idx],
label=cl,
edgecolor='black')
# highlight test samples
if test_idx:
# plot all samples
X_test, y_test = X[test_idx, :], y[test_idx]
plt.scatter(X_test[:, 0],
X_test[:, 1],
c='',
edgecolor='black',
alpha=1.0,
linewidth=1,
marker='o',
s=100,
label='test set')
Since version 1.1, sklearn has a function for this:
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html#sklearn.inspection.DecisionBoundaryDisplay

Categories

Resources