How to determine or curve fit an unknown function with Python?

I have data with four X variables and one common Y value for all of them. Y is a function of each X (observed from graphs), but the functional form is unknown; that is, Y = f1(X1), Y = f2(X2), Y = f3(X3) and Y = f4(X4). I am trying to determine the functions f1, f2, f3 and f4 with Python. The variation of Y with each X is shown in the figure.
MWE
import numpy as np
import matplotlib.pyplot as plt

# note: np.genfromtxt takes skip_header, not skiprows
data = np.genfromtxt('unknown_function.dat', delimiter='\t', skip_header=1)
y = data[:, 0]
x1 = data[:, 1]
x2 = data[:, 2]
x3 = data[:, 3]
x4 = data[:, 4]

fig = plt.figure()
fig.clf()
plot = plt.subplot(111)
plt.plot(x1, y, color='k')
plt.plot(x2, y, color='r')
plt.plot(x3, y, color='b')
plt.plot(x4, y, color='g')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
Are there any methods to determine this unknown function with Python?

The data in each column appear to be well-represented by the function f(x) = a0 + a1/x + a2/x^2 + a3/x^3. The scipy.optimize package (good example at http://www2.mpia-hd.mpg.de/~robitaille/PY4SCI_SS_2014/_static/15.%20Fitting%20models%20to%20data.html) can do the function fitting relatively easily and return the fitted parameters. I've added the relevant code to your MWE below for the 'x1' column of data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

data = np.genfromtxt('unknown_function.dat', delimiter='\t', skip_header=1)
y = data[:, 0]
x1 = data[:, 1]
x2 = data[:, 2]
x3 = data[:, 3]
x4 = data[:, 4]

def f1(x, a0, a1, a2, a3):
    return a0 + a1/x + a2/x**2 + a3/x**3

popt, pcov = curve_fit(f1, x1, y)
print("Function fit to x1 data column: a0 + a1/x + a2/x**2 + a3/x**3")
print("Parameters from least-squares fit:")
print("a0 =", popt[0], "+/-", pcov[0, 0]**0.5)
print("a1 =", popt[1], "+/-", pcov[1, 1]**0.5)
print("a2 =", popt[2], "+/-", pcov[2, 2]**0.5)
print("a3 =", popt[3], "+/-", pcov[3, 3]**0.5)

fig = plt.figure()
fig.clf()
plot = plt.subplot(111)
plt.plot(x1, y, color='k')

# overlay the fitted curve on a fine grid
xfine = np.linspace(min(x1), max(x1), 100)
plt.plot(xfine, f1(xfine, *popt), 'r-')

plt.plot(x2, y, color='r')
plt.plot(x3, y, color='b')
plt.plot(x4, y, color='g')
plt.xlabel('X')
plt.ylabel('Y')
plt.show()
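To judge how well this functional form describes the data, it can help to look at the residuals and a coefficient of determination. A minimal sketch, reusing f1, popt, x1 and y from the fit above:

residuals = y - f1(x1, *popt)
ss_res = np.sum(residuals**2)             # residual sum of squares
ss_tot = np.sum((y - np.mean(y))**2)      # total sum of squares
r_squared = 1 - ss_res / ss_tot
print("R^2 =", r_squared)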

@davmat gave an excellent solution. This is a scikit-learn-based version of his solution, using a linear model with his basis functions.
Load the data using Pandas
import numpy as np
import matplotlib.pyplot as plt
import pandas as pd
from sklearn.linear_model import LinearRegression

df = pd.read_csv('unknown_function.dat', delimiter='\t')
Define a function to fit the data using a linear model with @davmat's basis.
def fit_function(x, y):
    linmodel = LinearRegression(fit_intercept=False)
    # basis functions: x**0, x**-1, x**-2, x**-3
    x_vectors = np.column_stack([x**-i for i in range(4)])
    linmodel.fit(x_vectors, y)
    return linmodel.coef_

coef1 = fit_function(df['X1'], df['Y'])
coef2 = fit_function(df['X2'], df['Y'])
coef3 = fit_function(df['X3'], df['Y'])
coef4 = fit_function(df['X4'], df['Y'])
Evaluate the function with each set of fitted coefficients.
def f1(x, coefs):
    return coefs[0] + coefs[1]/x + coefs[2]/x**2 + coefs[3]/x**3

%matplotlib inline
df['y1hat'] = df['X1'].apply(lambda x: f1(x, coef1))
df['y2hat'] = df['X2'].apply(lambda x: f1(x, coef2))
df['y3hat'] = df['X3'].apply(lambda x: f1(x, coef3))
df['y4hat'] = df['X4'].apply(lambda x: f1(x, coef4))  # was coef3 in the original: copy-paste bug
Plot the results
df.plot(x='X1', y=['Y','y1hat'])
df.plot(x='X2', y=['Y','y2hat'])
df.plot(x='X3', y=['Y','y3hat'])
df.plot(x='X4', y=['Y','y4hat'])

You can do it with polynomial regression:
Y = F(x) = c0 + c1 x + c2 x^2 + c3 x^3 + ... + cn x^n
The idea is to choose the order n and to compute the coefficients c0, c1, ..., cn.
But keep in mind that this is only an approximation.
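A minimal sketch with numpy, reusing the x1 and y arrays from the question (the degree n = 3 is just an illustrative choice):

import numpy as np

# fit a degree-3 polynomial to (x1, y); np.polyfit returns the
# coefficients from the highest power down to the constant term
coeffs = np.polyfit(x1, y, deg=3)
p = np.poly1d(coeffs)   # callable polynomial
yhat = p(x1)            # fitted values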

Related

Python for confidence limit

I am trying to reproduce the confidence limits of the mean value shown in the figure above, but something is wrong with my code: the result is far different from that figure. I used the confidence interval to obtain the slope and intercept. Could someone give me hints? Thanks.
Here is my code.
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t
from scipy import stats
x = np.array([3,7,11,15,18,27,29,30,30,31,31, 32,33,33,34,36,36,36,37,38,39,39,39,40,41,42,42,43,44,45,46,47,50])
y = np.array([5,11,21,16,16,28,27,25,35,30,40,32,34,32,34,37,38,34,36,38,37,36,45,39,41,40,44,37,44,46,46,49,51])
mean_x = np.mean(x)
n = len(x)
res = stats.linregress(x, y)
tinv = lambda p, df: abs(t.ppf(p/2, df))
ts = tinv(0.05, n-2)
bins = np.linspace(0,3,54)
plt.plot(x, y, 'o', label='Data Points')
plt.plot(x,res.intercept + res.slope*x, 'c', label='fitted line')
plt.plot(x,(res.intercept+ts*res.intercept_stderr)+(res.slope + ts*res.stderr)*x,'b', label='Upper Limit')
plt.plot(x,(res.intercept-ts*res.intercept_stderr)+(res.slope - ts*res.stderr)*x, 'g' ,label='Lower Limit')
plt.legend()
plt.show()
There are lots of ways to estimate the error of your model. As an example, here is a linear fit of the error:
import matplotlib.pyplot as plt
import numpy as np
from scipy.stats import t
from scipy import stats
x = np.array([3,7,11,15,18,27,29,30,30,31,31, 32,33,33,34,36,36,36,37,38,39,39,39,40,41,42,42,43,44,45,46,47,50])
y = np.array([5,11,21,16,16,28,27,25,35,30,40,32,34,32,34,37,38,34,36,38,37,36,45,39,41,40,44,37,44,46,46,49,51])
n = len(x)
res = stats.linregress(x, y)
tinv = lambda p, df: abs(t.ppf(p/2, df))
ts = tinv(0.05, n - 2)
bins = np.linspace(0, 3, 54)
pred = res.intercept + res.slope * x
errors = np.abs(y - pred)
plt.scatter(x, y, label='Data Points')
plt.plot(x, pred, 'c', label='fitted line')
error_res = stats.linregress(x, errors)
pred_err = error_res.intercept + error_res.slope * x
plt.plot(x, pred + 1 * pred_err, label='Upper confidence')
plt.plot(x, pred - 1 * pred_err, label='Lower confidence')
plt.legend()
plt.savefig('hey')
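If you want the classical confidence band for the mean response instead of a fit to the absolute errors, the textbook formula is CI(x) = t * s * sqrt(1/n + (x - xbar)^2 / sum((x - xbar)^2)). A sketch reusing the x, y, n, res and ts defined above:

# classical 95% confidence band for the mean response of the linear fit
pred = res.intercept + res.slope * x
resid = y - pred
s_err = np.sqrt(np.sum(resid**2) / (n - 2))  # residual standard error
x_mean = np.mean(x)
conf = ts * s_err * np.sqrt(1/n + (x - x_mean)**2 / np.sum((x - x_mean)**2))

order = np.argsort(x)  # sort so the band plots as smooth lines
plt.plot(x[order], (pred + conf)[order], 'b--', label='Upper limit')
plt.plot(x[order], (pred - conf)[order], 'g--', label='Lower limit')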

how to plot the decision boundary of a polynomial logistic regression in python?

I have looked into the example on this website: https://scipython.com/blog/plotting-the-decision-boundary-of-a-logistic-regression-model/
I understand how they plot the decision boundary for a linear feature vector. But how would I plot the decision boundary if I apply
from sklearn.preprocessing import PolynomialFeatures
...
poly = PolynomialFeatures(degree = 3, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
to get a curved decision boundary? (I know it doesn't make a lot of sense for the example on the website, but it may be easier to talk about it).
I have tried to plot the resulting polynomial decision boundary by overlaying the polynomial plot but only got weird results like this:
So how could I do a curved decision boundary plot?
The edited code:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
plt.rc('text', usetex=True)
plt.figure(dpi=1200)
pts = np.loadtxt(r'C:\Users\stefa\OneDrive\Desktop\linpts.txt')
X = pts[:,:2]
Y = pts[:,2].astype('int')
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
# Retrieve the model parameters.
b = clf.intercept_[0]
w1, w2,w3,w4,w5 = clf.coef_.T
# In[]
def PolyCoefficients(x, coeffs):
    """ Returns a polynomial for ``x`` values for the ``coeffs`` provided.
    The coefficients must be in ascending order (``x**0`` to ``x**o``).
    """
    o = len(coeffs)
    print(f'# This is a polynomial of order {o}.')  # was {ord}, which is a builtin, not the order
    y = 0
    for i in range(o):
        y += coeffs[i]*x**i
    return y
x = np.linspace(0, 9, 100)
coeffs = [b, w1, w2, w3, w4, w5]
plt.plot(x, PolyCoefficients(x, coeffs))
plt.show()
# In[]
# Calculate the intercept and gradient of the decision boundary.
c = -b/w2
m = -w1/w2
# Plot the data and the classification with the decision boundary.
xmin, xmax = -1, 2
ymin, ymax = -1, 2.5
xd = np.array([xmin, xmax])
yd = m*xd + c
#plt.plot(xd, yd, 'k', lw=1, ls='--')
plt.plot(x, PolyCoefficients(x, coeffs))
plt.fill_between(xd, yd, ymin, color='tab:blue', alpha=0.2)
plt.fill_between(xd, yd, ymax, color='tab:orange', alpha=0.2)
plt.scatter(*X[Y==0].T, s=8, alpha=0.5)
plt.scatter(*X[Y==1].T, s=8, alpha=0.5)
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.ylabel(r'$x_2$')
plt.xlabel(r'$x_1$')
plt.show()
Let me generate some demo data.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
X = np.random.normal(size=(1000, 2))
Y = ((X[:,0] - X[:,1] + 0.4*X[:,0]*X[:,1] + 0.7*X[:,0]**2 - 0.8*X[:,1]**2 +
np.random.normal(scale=0.1, size=(1000,))) >= 0).astype(int)
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
Apart from the randomness, the data looks something like this.
Train the model like you did.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
print(poly.powers_)
#[[1 0]
# [0 1]
# [2 0]
# [1 1]
# [0 2]]
This tells us that the features are ordered as: x1, x2, x1^2, x1*x2, x2^2.
So collect the coefficients and the intercept and give them intuitive names.
w1, w2, w11, w12, w22 = clf.coef_[0]
b = clf.intercept_[0]
By definition, the decision boundary is a set of (x1, x2) such that the probability is even between the two classes. Mathematically, they are the solutions to:
b + w1*x1 + w2*x2 + w11*x1^2 + w12*x1*x2 + w22*x2^2 = 0
If we fix x1, then this is a quadratic equation of x2, which we can solve analytically. The following function does this job.
def boundary(x1):
    # returns x2 on the boundary for a given x1
    # we solve the quadratic equation
    #   a x^2 + b x + c = 0
    #   --> x = (-b +- sqrt(b^2 - 4ac)) / 2a
    a_ = w22
    b_ = w2 + w12 * x1
    c_ = b + w1*x1 + w11*x1**2
    tmp = b_**2 - 4*a_*c_
    if tmp < 0:
        return None  # no real solution: the boundary does not cross this x1
    ans = [(-b_ + tmp**0.5) / (2*a_), (-b_ - tmp**0.5) / (2*a_)]
    ans.sort()  # smaller first
    return ans
# compute the boundaries
xs = np.linspace(X[:,0].min(), X[:,0].max(), num=100)
ys_1 = []
ys_2 = []
for x1 in xs:
    tmp = boundary(x1)
    if tmp is None:
        ys_1.append(None)
        ys_2.append(None)
    else:
        ys_1.append(tmp[0])  # smaller boundary
        ys_2.append(tmp[1])  # larger boundary
Now that we have the boundaries as data, we can visualize them easily.
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
plt.plot(xs, ys_1, c="green")
plt.plot(xs, ys_2, c="gray")
# if ys contains None, need to skip them
plt.fill_between(xs, ys_1, ys_2, color='tab:blue', alpha=0.2)
plt.fill_between(xs, min(ys_1), ys_1, color='tab:orange', alpha=0.2)
plt.fill_between(xs, ys_2, max(ys_2), color='tab:orange', alpha=0.2)
Notice that the boundaries can be explicitly computed because the model is quadratic. Different approaches are needed for more general, complex classifiers.
An easier, generally applicable approach is to create dummy data covering various combinations of the variables, let the classifier predict them, and plot the points colored by the predicted class.
xs = np.linspace(X[:,0].min(), X[:,0].max(), num=100)
ys = np.linspace(X[:,1].min(), X[:,1].max(), num=100)
newX = []
for x1 in xs:
    for x2 in ys:
        newX.append((x1, x2))
newX = np.array(newX)
p = clf.predict(poly.transform(newX))
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
flg = (p > 0)
plt.scatter(newX[flg,0], newX[flg,1], alpha=0.02, c="tab:blue", marker="s", s=20)
plt.scatter(newX[~flg,0], newX[~flg,1], alpha=0.02, c="tab:orange", marker="s", s=20)
The output of your PolyCoefficients function is a single-variable polynomial; with the six coefficients you pass in, it is of order 5:
coeffs[0]*x^0 + coeffs[1]*x^1 + coeffs[2]*x^2 + coeffs[3]*x^3 + coeffs[4]*x^4 + coeffs[5]*x^5
Instead, what you need is the 2nd-order polynomial in the two features (specified by your parameter degree = 2 inside the sklearn.preprocessing.PolynomialFeatures object), which, following the feature ordering of PolynomialFeatures (x1, x2, x1^2, x1*x2, x2^2), is really:
coeffs[0]*x1 + coeffs[1]*x2 + coeffs[2]*x1^2 + coeffs[3]*x1*x2 + coeffs[4]*x2^2
This formula is valid as long as you are using two features x1 and x2; with x1, x2, ..., xN you would need all the terms that derive from (x1 + x2 + ... + xN)^2.
You can find more details and examples here.
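A minimal sketch of evaluating that decision function on a grid and contouring its zero level, assuming the clf, poly, X and Y objects from the edited code above:

import numpy as np
import matplotlib.pyplot as plt

# evaluate the degree-2 decision function on a grid and draw its zero contour
xx, yy = np.meshgrid(np.linspace(-1, 2, 200), np.linspace(-1, 2.5, 200))
grid = np.column_stack([xx.ravel(), yy.ravel()])
z = clf.decision_function(poly.transform(grid)).reshape(xx.shape)

plt.scatter(*X[Y == 0].T, s=8, alpha=0.5)
plt.scatter(*X[Y == 1].T, s=8, alpha=0.5)
plt.contour(xx, yy, z, levels=[0], colors='k')  # the curved decision boundary
plt.show()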

Fit 3d coordinates into a parabola

I would like to predict a ball trajectory by fitting its 3d coordinates into a parabola. Below is my code. But instead of a parabola, I got a straight line. If you have any clue about it, please let me know. Thanks!
# draw scatter coordinates
fig = plt.figure()
ax = plt.axes(projection='3d')
x_list = []
y_list = []
z_list = []
for x in rm_list:
    x_list.append(x[0][0])
    y_list.append(x[0][1])
    z_list.append(x[0][2])
x = np.array(x_list)
y = np.array(y_list)
z = np.array(z_list)
ax.scatter(x, y, z)

# curve fit
def func(x, a, b, c, d):
    return a * x[0]**2 + b * x[1]**2 + c * x[0] * x[1] + d
data = np.column_stack([x_list, y_list, z_list])
popt, _ = curve_fit(func, data[:,:2].T, ydata=data[:,2])
a, b, c, d = popt
print('y= %.5f * x ^ 2 + %.5f * y ^ 2 + %.5f * x * y + %.5f' %(a, b, c, d))
x1 = np.linspace(0.3, 0.4, 100)
y1 = np.linspace(0.02, 0.06, 100)
z1 = a * x1 ** 2 + b * y1 ** 2 + c * x1 * y1 + d
ax.plot(x1, y1, z1, color='green')
plt.show()
Update 1
After changing func to ax^2 + by^2 + cxy + dx + ey + f, I got a parabola, but it does not fit the coordinates.
Having the underlying timestamp data makes the fitting procedure easier:
import matplotlib.pyplot as plt
import numpy as np
from scipy.optimize import curve_fit
from numpy.polynomial import Polynomial
# test data generation with some noise
# here read in your data
np.random.seed(123)
n = 40
x_param = [ 1, 21, -1]
y_param = [12, -3, 0]
z_param = [-3, 0, -2]
px = Polynomial(x_param)
py = Polynomial(y_param)
pz = Polynomial(z_param)
t = np.random.choice(np.linspace (-3000, 2000, 1000)/500, n)
x = px(t) + np.random.random(n)
y = py(t) + np.random.random(n)
z = pz(t) + np.random.random(n)
# here start the real calculations
# draw scatter coordinates of raw data
fig = plt.figure()
ax = plt.axes(projection = '3d')
ax.scatter(x, y, z, label="raw data")
# curve fit function
def func(t, x2, x1, x0, y2, y1, y0, z2, z1, z0):
    Px = Polynomial([x2, x1, x0])
    Py = Polynomial([y2, y1, y0])
    Pz = Polynomial([z2, z1, z0])
    return np.concatenate([Px(t), Py(t), Pz(t)])
# curve fit
# start values are not necessary for this example
# but make it your rule to always provide start values for curve_fit
start_vals = [ 1, 10,  1,
              10,  1,  1,
              -1, -1, -1]
xyz = np.concatenate([x, y, z])
popt, _ = curve_fit(func, t, xyz, p0=start_vals)
print(popt)
#[ 1.58003630e+00 2.10059868e+01 -1.00401965e+00
# 1.25895591e+01 -2.97374035e+00 -3.23358241e-03
# -2.44293562e+00 3.96407428e-02 -1.99671092e+00]
# regularly spaced fit data
t_fit = np.linspace(min(t), max(t), 100)
xyz_fit = func(t_fit, *popt).reshape(3, -1)
ax.plot(xyz_fit[0, :], xyz_fit[1, :], xyz_fit[2, :], color="green", label="fitted data")
ax.legend()
plt.show()
Sample output:

Matplotlib regression scattered plot using Python?

My Question
I tried regression using the curve_fit function from the scipy module, and now I am getting a scattered kind of plot. I can't figure out why I am getting a scattered plot here.
My Code
def scipyFunction(x, y):
    plt.plot(x, y, 'o', label='Original data', markersize=10)

    x = [float(xn) for xn in x]  # every element (xn) in x becomes a float
    y = [float(yn) for yn in y]  # every element (yn) in y becomes a float
    x = np.array(x)  # transform data into numpy array
    y = np.array(y)  # transform data into numpy array

    def functionForScipy(x, a, b, c, d):
        return a*x**3 + b*x**2 + c*x + d

    # make the curve_fit
    popt, pcov = curve_fit(functionForScipy, x, y)
    '''
    The result is:
    popt[0] = a, popt[1] = b, popt[2] = c, popt[3] = d of the function,
    so f(x) = popt[0]*x**3 + popt[1]*x**2 + popt[2]*x + popt[3].
    '''
    print(popt)

    plt.plot(x, popt[0]*x**3 + popt[1]*x**2 + popt[2]*x + popt[3], label="Fitted Curve")  # same as the line above
    plt.legend(loc='upper left')
    plt.show()
The x and y plot looks like this:
I suspect this occurs because the values in your x array are not monotonically increasing (that is to say, each subsequent number is larger than the last).
You need to sort your x values before you plot them, otherwise they will be all over the place, as shown in the example below.
import numpy as np
import matplotlib.pyplot as plt
def func(x):
return x**2
x = np.array([0, 5, 2, 1, 3, 4])
y = func(x)
plt.plot(x, y, 'b-', label='Unsorted')
x.sort()
y = func(x)
plt.plot(x, y, 'r-', label='Sorted')
plt.legend()
plt.show()
Probably your x and y are not sorted.
Don't forget to apply the same sorting you do on x to y as well. To achieve this, zip is very handy. You could add the following at the beginning of your function:
comb = sorted(zip(x, y), key=lambda pair: pair[0])  # sort according to x; zip returns an iterator in Python 3
x, y = zip(*comb)  # both x and y are now sorted according to x
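With numpy arrays, np.argsort does the same job. A small sketch, assuming x and y are numpy arrays:

import numpy as np

order = np.argsort(x)  # indices that sort x
x_sorted = x[order]
y_sorted = y[order]    # y reordered consistently with x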

Plotting a decision boundary separating 2 classes using Matplotlib's pyplot

I could really use a tip to help me plot a decision boundary to separate two classes of data. I created some sample data (from a Gaussian distribution) via Python NumPy. In this case, every data point is a 2D coordinate, i.e., a 1-column vector consisting of 2 rows. E.g.,
[ 1
2 ]
Let's assume I have 2 classes, class1 and class2, and I created 100 data points for class1 and 100 data points for class2 via the code below (assigned to the variables x1_samples and x2_samples).
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T
When I plot the data points for each class, it would look like this:
Now, I came up with an equation for a decision boundary to separate both classes and would like to add it to the plot. However, I am not really sure how I can plot this function:
def decision_boundary(x_vec, mu_vec1, mu_vec2):
g1 = (x_vec-mu_vec1).T.dot((x_vec-mu_vec1))
g2 = 2*( (x_vec-mu_vec2).T.dot((x_vec-mu_vec2)) )
return g1 - g2
I would really appreciate any help!
EDIT:
Intuitively (If I did my math right) I would expect the decision boundary to look somewhat like this red line when I plot the function...
Your question is more complicated than a simple plot: you need to draw the contour which will maximize the inter-class distance. Fortunately it's a well-studied field, particularly for SVM machine learning.
The easiest method is to download the scikit-learn module, which provides a lot of cool methods to draw boundaries: scikit-learn: Support Vector Machines
Code :
# -*- coding: utf-8 -*-
import numpy as np
import matplotlib
from matplotlib import pyplot as plt
import scipy
from sklearn import svm
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T
fig = plt.figure()
plt.scatter(x1_samples[:,0],x1_samples[:,1], marker='+')
plt.scatter(x2_samples[:,0],x2_samples[:,1], c= 'green', marker='o')
X = np.concatenate((x1_samples,x2_samples), axis = 0)
Y = np.array([0]*100 + [1]*100)
C = 1.0 # SVM regularization parameter
clf = svm.SVC(kernel = 'linear', gamma=0.7, C=C )
clf.fit(X, Y)
Linear Plot
w = clf.coef_[0]
a = -w[0] / w[1]
xx = np.linspace(-5, 5)
yy = a * xx - (clf.intercept_[0]) / w[1]
plt.plot(xx, yy, 'k-')
MultiLinear Plot
C = 1.0 # SVM regularization parameter
clf = svm.SVC(kernel = 'rbf', gamma=0.7, C=C )
clf.fit(X, Y)
h = .02 # step size in the mesh
# create a mesh to plot in
x_min, x_max = X[:, 0].min() - 1, X[:, 0].max() + 1
y_min, y_max = X[:, 1].min() - 1, X[:, 1].max() + 1
xx, yy = np.meshgrid(np.arange(x_min, x_max, h),
np.arange(y_min, y_max, h))
# Plot the decision boundary. For that, we will assign a color to each
# point in the mesh [x_min, x_max]x[y_min, y_max].
Z = clf.predict(np.c_[xx.ravel(), yy.ravel()])
# Put the result into a color plot
Z = Z.reshape(xx.shape)
plt.contour(xx, yy, Z, cmap=plt.cm.Paired)
Implementation
If you want to implement it yourself, you need to solve the SVM quadratic programming problem; the formulation is given in the Wikipedia article on support vector machines.
Unfortunately, for non-linear boundaries like the one you drew, it's a difficult problem that relies on a kernel trick, and there isn't a clear-cut solution.
Based on the way you've written decision_boundary you'll want to use the contour function, as Joe noted above. If you just want the boundary line, you can draw a single contour at the 0 level:
f, ax = plt.subplots(figsize=(7, 7))
c1, c2 = "#3366AA", "#AA3333"
ax.scatter(*x1_samples.T, c=c1, s=40)
ax.scatter(*x2_samples.T, c=c2, marker="D", s=40)
x_vec = np.linspace(*ax.get_xlim())
ax.contour(x_vec, x_vec,
           decision_boundary(x_vec, mu_vec1, mu_vec2),
           levels=[0], cmap="Greys_r")
Which makes:
Those were some great suggestions, thanks a lot for your help! I ended up solving the equation analytically, and this is the solution I ended up with (I just want to post it for future reference):
# 2-category classification with random 2D-sample data
# from a multivariate normal distribution
import numpy as np
from matplotlib import pyplot as plt
def decision_boundary(x_1):
    """ Calculates the x_2 value for plotting the decision boundary."""
    return 4 - np.sqrt(-x_1**2 + 4*x_1 + 6 + np.log(16))
# Generating a Gaussian dataset:
# creating random vectors from the multivariate normal distribution
# given mean and covariance
mu_vec1 = np.array([0,0])
cov_mat1 = np.array([[2,0],[0,2]])
x1_samples = np.random.multivariate_normal(mu_vec1, cov_mat1, 100)
mu_vec1 = mu_vec1.reshape(1,2).T # to 1-col vector
mu_vec2 = np.array([1,2])
cov_mat2 = np.array([[1,0],[0,1]])
x2_samples = np.random.multivariate_normal(mu_vec2, cov_mat2, 100)
mu_vec2 = mu_vec2.reshape(1,2).T # to 1-col vector
# Main scatter plot and plot annotation
f, ax = plt.subplots(figsize=(7, 7))
ax.scatter(x1_samples[:,0], x1_samples[:,1], marker='o', color='green', s=40, alpha=0.5)
ax.scatter(x2_samples[:,0], x2_samples[:,1], marker='^', color='blue', s=40, alpha=0.5)
plt.legend(['Class1 (w1)', 'Class2 (w2)'], loc='upper right')
plt.title('Densities of 2 classes with 25 bivariate random patterns each')
plt.ylabel('x2')
plt.xlabel('x1')
ftext = 'p(x|w1) ~ N(mu1=(0,0)^t, cov1=I)\np(x|w2) ~ N(mu2=(1,1)^t, cov2=I)'
plt.figtext(.15,.8, ftext, fontsize=11, ha='left')
# Adding decision boundary to plot
x_1 = np.arange(-5, 5, 0.1)
bound = decision_boundary(x_1)
plt.plot(x_1, bound, 'r--', lw=3)
x_vec = np.linspace(*ax.get_xlim())
x_1 = np.arange(0, 100, 0.05)
plt.show()
And the code can be found here
EDIT:
I also have a convenience function for plotting decision regions for classifiers that implement a fit and predict method, e.g., the classifiers in scikit-learn, which is useful if the solution cannot be found analytically. A more detailed description how it works can be found here.
You can create your own equation for the boundary by parametrizing it in polar coordinates around a center (x0, y0), with the radius expanded in a truncated Fourier series:
r(theta) = sum_{i=0}^{n} [ a_i*sin(i*theta) + b_i*cos(i*theta) ]
where you have to find the position x0 and y0, as well as the constants a_i and b_i for the radius equation. So you have 2*(n+1)+2 variables. Using scipy.optimize.leastsq is straightforward for this type of problem.
The code attached below builds the residual for leastsq, penalizing the points outside the boundary. The result for your problem, obtained with:
x, y = find_boundary(x2_samples[:,0], x2_samples[:,1], n)
ax.plot(x, y, '-k', lw=2.)
x, y = find_boundary(x1_samples[:,0], x1_samples[:,1], n)
ax.plot(x, y, '--k', lw=2.)
using n=1:
using n=2:
using n=5:
using n=7:
import numpy as np
from numpy import sin, cos, pi
from scipy.optimize import leastsq
def find_boundary(x, y, n, plot_pts=1000):

    def sines(theta):
        ans = np.array([sin(i*theta) for i in range(n+1)])
        return ans

    def cosines(theta):
        ans = np.array([cos(i*theta) for i in range(n+1)])
        return ans

    def residual(params, x, y):
        x0 = params[0]
        y0 = params[1]
        c = params[2:]
        r_pts = ((x-x0)**2 + (y-y0)**2)**0.5
        thetas = np.arctan2((y-y0), (x-x0))
        m = np.vstack((sines(thetas), cosines(thetas))).T
        r_bound = m.dot(c)
        delta = r_pts - r_bound
        delta[delta > 0] *= 10  # penalize points outside the boundary
        return delta

    # initial guess for x0 and y0
    x0 = x.mean()
    y0 = y.mean()
    params = np.zeros(2 + 2*(n+1))
    params[0] = x0
    params[1] = y0
    params[2:] += 1000

    popt, pcov = leastsq(residual, x0=params, args=(x, y),
                         ftol=1.e-12, xtol=1.e-12)

    thetas = np.linspace(0, 2*pi, plot_pts)
    m = np.vstack((sines(thetas), cosines(thetas))).T
    c = np.array(popt[2:])
    r_bound = m.dot(c)
    x_bound = popt[0] + r_bound*cos(thetas)
    y_bound = popt[1] + r_bound*sin(thetas)
    return x_bound, y_bound
I like the mglearn library to draw decision boundaries. Here is one example from the book "Introduction to Machine Learning with Python" by A. Mueller:
import matplotlib.pyplot as plt
import mglearn
from sklearn.neighbors import KNeighborsClassifier

# X, y: your 2D feature matrix and class labels
fig, axes = plt.subplots(1, 3, figsize=(10, 3))
for n_neighbors, ax in zip([1, 3, 9], axes):
    clf = KNeighborsClassifier(n_neighbors=n_neighbors).fit(X, y)
    mglearn.plots.plot_2d_separator(clf, X, fill=True, eps=0.5, ax=ax, alpha=.4)
    mglearn.discrete_scatter(X[:, 0], X[:, 1], y, ax=ax)
    ax.set_title("{} neighbor(s)".format(n_neighbors))
    ax.set_xlabel("feature 0")
    ax.set_ylabel("feature 1")
axes[0].legend(loc=3)
If you want to use scikit-learn, you can write your code like this:
import numpy as np
import pandas as pd
import matplotlib.pyplot as plt
from sklearn.linear_model import LogisticRegression
# read data
data = pd.read_csv('ex2data1.txt', header=None)
X = data[[0,1]].values
y = data[2]
# use LogisticRegression
log_reg = LogisticRegression()
log_reg.fit(X, y)
# Coefficient of the features in the decision function. (from theta 1 to theta n)
parameters = log_reg.coef_[0]
# Intercept (a.k.a. bias) added to the decision function. (theta 0)
parameter0 = log_reg.intercept_
# Plotting the decision boundary
fig = plt.figure(figsize=(10,7))
x_values = [np.min(X[:, 1] -5 ), np.max(X[:, 1] +5 )]
# calculate the y values
y_values = np.dot((-1./parameters[1]), (np.dot(parameters[0],x_values) + parameter0))
colors=['red' if l==0 else 'blue' for l in y]
plt.scatter(X[:, 0], X[:, 1], label='Logistics regression', color=colors)
plt.plot(x_values, y_values, label='Decision Boundary')
plt.show()
see: Building-a-Logistic-Regression-with-Scikit-learn
Just solved a very similar problem with a different approach (root finding) and wanted to post this alternative as answer here for future reference:
def discr_func(x, y, cov_mat, mu_vec):
    """
    Calculates the value of the discriminant function for a dx1-dimensional
    sample given the covariance matrix and mean vector.

    Keyword arguments:
        x_vec: A dx1-dimensional numpy array representing the sample.
        cov_mat: numpy array of the covariance matrix.
        mu_vec: dx1-dimensional numpy array of the sample mean.

    Returns a float value as the result of the discriminant function.
    """
    x_vec = np.array([[x], [y]])

    W_i = (-1/2) * np.linalg.inv(cov_mat)
    assert W_i.shape[0] > 1 and W_i.shape[1] > 1, 'W_i must be a matrix'

    w_i = np.linalg.inv(cov_mat).dot(mu_vec)
    assert w_i.shape[0] > 1 and w_i.shape[1] == 1, 'w_i must be a column vector'

    omega_i_p1 = (((-1/2) * (mu_vec).T).dot(np.linalg.inv(cov_mat))).dot(mu_vec)
    omega_i_p2 = (-1/2) * np.log(np.linalg.det(cov_mat))
    omega_i = omega_i_p1 - omega_i_p2
    assert omega_i.shape == (1, 1), 'omega_i must be a scalar'

    g = ((x_vec.T).dot(W_i)).dot(x_vec) + (w_i.T).dot(x_vec) + omega_i
    return float(g)
#g1 = discr_func(x, y, cov_mat=cov_mat1, mu_vec=mu_vec_1)
#g2 = discr_func(x, y, cov_mat=cov_mat2, mu_vec=mu_vec_2)
x_est50 = list(np.arange(-6, 6, 0.1))
y_est50 = []
for i in x_est50:
    y_est50.append(scipy.optimize.bisect(lambda y: discr_func(i, y, cov_mat=cov_est_1, mu_vec=mu_est_1) -
                                         discr_func(i, y, cov_mat=cov_est_2, mu_vec=mu_est_2), -10, 10))
y_est50 = [float(i) for i in y_est50]
Here is the result:
(blue: the quadratic case; red: the linear case with equal variances)
I know this question has been answered in a very thorough way analytically. I just wanted to share a possible 'hack' to the problem. It is unwieldy but gets the job done.
Start by building a mesh grid of the 2D area and then, based on the classifier, build a class map of the entire space. Subsequently, detect row-wise changes in the decision, store the edge points in a list, and scatter-plot those points.
def disc(x):  # returns the class of the point based on location x = [x, y]
    temp = 0.5 + 0.5*np.sign(disc0(x) - disc1(x))
    # disc0() and disc1() are the discriminant functions of the respective classes
    return 0*temp + 1*(1 - temp)

num = 200
a = np.linspace(-4, 4, num)
b = np.linspace(-6, 6, num)
X, Y = np.meshgrid(a, b)

def decColor(x, y):
    temp = np.zeros((num, num))
    print(x.shape, np.size(x, axis=0))
    for l in range(num):
        for m in range(num):
            p = np.array([x[l, m], y[l, m]])
            temp[l, m] = disc(p)
    return temp

boundColorMap = decColor(X, Y)

group = 0
boundary = []
for x in range(num):
    group = boundColorMap[x, 0]
    for y in range(num):
        if boundColorMap[x, y] != group:
            boundary.append([X[x, y], Y[x, y]])
            group = boundColorMap[x, y]
boundary = np.array(boundary)
Sample decision boundary for a simple bivariate Gaussian classifier
Given two bi-variate normal distributions, you can use Gaussian Discriminant Analysis (GDA) to come up with a decision boundary as the difference between the log of the 2 pdf's.
Here's a way to do it using scipy multivariate_normal (the code is not optimized):
import numpy as np
import matplotlib.pyplot as plt
from scipy.stats import multivariate_normal
from numpy.linalg import norm
from numpy.linalg import inv
from scipy.spatial.distance import mahalanobis
def normal_scatter(mean, cov, p):
    size = 100
    sigma_x = cov[0, 0]
    sigma_y = cov[1, 1]
    mu_x = mean[0]
    mu_y = mean[1]
    x_ps, y_ps = np.random.multivariate_normal(mean, cov, size).T
    x, y = np.mgrid[mu_x-3*sigma_x:mu_x+3*sigma_x:1/size, mu_y-3*sigma_y:mu_y+3*sigma_y:1/size]
    grid = np.empty(x.shape + (2,))
    grid[:, :, 0] = x
    grid[:, :, 1] = y
    z = p*multivariate_normal.pdf(grid, mean, cov)
    return x_ps, y_ps, x, y, z
# Dist 1
mu_1 = np.array([1, 1])
cov_1 = .5*np.array([[1, 0], [0, 1]])
p_1 = .5
x_ps, y_ps, x,y,z = normal_scatter(mu_1, cov_1, p_1)
plt.plot(x_ps,y_ps,'x')
plt.contour(x, y, z, cmap='Blues', levels=3)
# Dist 2
mu_2 = np.array([2, 1])
#cov_2 = np.array([[2, -1], [-1, 1]])
cov_2 = cov_1
p_2 = .5
x_ps, y_ps, x,y,z = normal_scatter(mu_2, cov_2, p_2)
plt.plot(x_ps,y_ps,'.')
plt.contour(x, y, z, cmap='Oranges', levels=3)
# Decision Boundary
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = np.log(p_1*multivariate_normal.pdf(X, mu_1, cov_1)) - np.log(p_2*multivariate_normal.pdf(X, mu_2, cov_2))
plt.contour(x, y, g, [0])
plt.grid()
plt.axhline(y=0, color='k')
plt.axvline(x=0, color='k')
plt.plot([mu_1[0], mu_2[0]], [mu_1[1], mu_2[1]], 'k')
plt.show()
If the two covariance matrices differ (for example, the commented-out cov_2 above), you get a non-linear (quadratic) boundary; if only the priors p_1 and p_2 differ, the boundary stays linear but is shifted. The decision boundary is given by g above.
Then to plot the decision hyper-plane (line in 2D), you need to evaluate g for a 2D mesh, then get the contour which will give a separating line.
You can also assume equal covariance matrices for both distributions, which gives a linear decision boundary. In this case, you can replace the calculation of g in the code above with the following:
W = inv(cov_1).dot(mu_1-mu_2)
x_0 = 1/2*(mu_1+mu_2) - cov_1.dot(np.log(p_1/p_2)).dot((mu_1-mu_2)/mahalanobis(mu_1, mu_2, cov_1))
X = np.empty(x.shape + (2,))
X[:, :, 0] = x; X[:, :, 1] = y
g = (X-x_0).dot(W)
I use this method from the book Python Machine Learning, 2nd edition:
from matplotlib.colors import ListedColormap
import matplotlib.pyplot as plt
import numpy as np

def plot_decision_regions(X, y, classifier, test_idx=None, resolution=0.02):
    # setup marker generator and color map
    markers = ('s', 'x', 'o', '^', 'v')
    colors = ('red', 'blue', 'lightgreen', 'gray', 'cyan')
    cmap = ListedColormap(colors[:len(np.unique(y))])

    # plot the decision surface
    x1_min, x1_max = X[:, 0].min() - 1, X[:, 0].max() + 1
    x2_min, x2_max = X[:, 1].min() - 1, X[:, 1].max() + 1
    xx1, xx2 = np.meshgrid(np.arange(x1_min, x1_max, resolution),
                           np.arange(x2_min, x2_max, resolution))
    Z = classifier.predict(np.array([xx1.ravel(), xx2.ravel()]).T)
    Z = Z.reshape(xx1.shape)
    plt.contourf(xx1, xx2, Z, alpha=0.3, cmap=cmap)
    plt.xlim(xx1.min(), xx1.max())
    plt.ylim(xx2.min(), xx2.max())

    for idx, cl in enumerate(np.unique(y)):
        plt.scatter(x=X[y == cl, 0],
                    y=X[y == cl, 1],
                    alpha=0.8,
                    c=colors[idx],
                    marker=markers[idx],
                    label=cl,
                    edgecolor='black')

    # highlight test samples
    if test_idx:
        # plot all samples
        X_test, y_test = X[test_idx, :], y[test_idx]
        plt.scatter(X_test[:, 0],
                    X_test[:, 1],
                    c='none',  # 'none' instead of '' for newer matplotlib versions
                    edgecolor='black',
                    alpha=1.0,
                    linewidth=1,
                    marker='o',
                    s=100,
                    label='test set')
Since version 1.1, sklearn has a function for this:
https://scikit-learn.org/stable/modules/generated/sklearn.inspection.DecisionBoundaryDisplay.html#sklearn.inspection.DecisionBoundaryDisplay
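A minimal sketch of using it, assuming a fitted classifier clf and a 2D feature array X with labels Y as in the answers above:

import matplotlib.pyplot as plt
from sklearn.inspection import DecisionBoundaryDisplay

# draw the regions predicted by the classifier, then overlay the data
disp = DecisionBoundaryDisplay.from_estimator(clf, X, response_method="predict", alpha=0.4)
disp.ax_.scatter(X[:, 0], X[:, 1], c=Y, edgecolor="k")
plt.show()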
