I am using the following code to smooth my data:
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

a = get_data()  # returns an (N, 2) array
y, x = a.T
t = np.linspace(0, 1, len(x))
t2 = np.linspace(0, 1, len(x))  # note: identical to t here, so the resampling below is a no-op
x2 = np.interp(t2, t, x)
y2 = np.interp(t2, t, y)
sigma = 50
x3 = gaussian_filter1d(x2, sigma)
y3 = gaussian_filter1d(y2, sigma)
x4 = np.interp(t, t2, x3)
y4 = np.interp(t, t2, y3)
plt.plot(x, y, "o-", lw=2)   # original data
plt.plot(x3, y3, "r", lw=2)  # smoothed curve on the resampled grid
plt.plot(x4, y4, "o", lw=2)  # smoothed values interpolated back onto t
plt.show()
I found this code here:
line smoothing algorithm in python?
My problem is that I need to get points from the new fit with exactly the same x values as my original x values (the points that I have smoothed).
The fit works well, but the x values of the new points are different.
How can I get points from the new fit that have the same x values but the new fit's y values? The x values for the points start at 0, and the spacing between them should be 1800.
I think what is particular to your case is that the data to smooth are like a free line in the plane, (x, y) = f(t), rather than a function y = f(x).
Maybe the trick is that the points have to be sorted before the interpolation (see numpy.interp):
import numpy as np
import matplotlib.pyplot as plt
from scipy.ndimage import gaussian_filter1d

# Generate random data:
t = np.linspace(0, 3, 20)
x = np.cos(t) + 0.1*np.random.randn(np.size(t))
y = np.sin(t) + 0.1*np.random.randn(np.size(t))
# Smooth the 2D data:
sigma = 2
x_smooth = gaussian_filter1d(x, sigma)
y_smooth = gaussian_filter1d(y, sigma)
# Sort (see: https://stackoverflow.com/a/1903579/8069403)
permutation = x_smooth.argsort()
x_smooth = x_smooth[permutation]
y_smooth = y_smooth[permutation]
x_new = np.sort(x) # not mandatory
# Interpolation on the original x points:
y_smooth_new = np.interp(x_new, x_smooth, y_smooth)
# Plot:
plt.plot(x, y, label='x, y');
plt.plot(x_smooth, y_smooth, label='x_smooth, y_smooth');
plt.plot(x_new, y_smooth_new, '-ro', label='x_new, y_smooth_new', alpha=0.7);
plt.legend(); plt.xlabel('x');
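Back on the question's own data (not the random demo above), where the x values start at 0 and are spaced 1800 apart, the final interpolation can simply target that grid. A minimal sketch, where n_points is a hypothetical count chosen to cover your x range:

n_points = 20  # hypothetical: pick enough points to span your data
x_fixed = np.arange(n_points) * 1800.0  # 0, 1800, 3600, ...
y_fixed = np.interp(x_fixed, x_smooth, y_smooth)  # values outside the smoothed range are clamped by np.interp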
Related
I have looked into the example on this website: https://scipython.com/blog/plotting-the-decision-boundary-of-a-logistic-regression-model/
I understand how they plot the decision boundary for a linear feature vector. But how would I plot the decision boundary if I apply
from sklearn.preprocessing import PolynomialFeatures
...
poly = PolynomialFeatures(degree = 3, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
to get a curved decision boundary? (I know it doesn't make a lot of sense for the example on the website, but it may be easier to talk about it.)
I have tried to plot the resulting polynomial decision boundary by overlaying the polynomial plot but only got weird results like this:
So how could I do a curved decision boundary plot?
The edited code:
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
plt.rc('text', usetex=True)
plt.figure(dpi=1200)
pts = np.loadtxt(r'C:\Users\stefa\OneDrive\Desktop\linpts.txt')
X = pts[:,:2]
Y = pts[:,2].astype('int')
poly = PolynomialFeatures(degree = 2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
# Retrieve the model parameters.
b = clf.intercept_[0]
w1, w2, w3, w4, w5 = clf.coef_.T
# In[]
def PolyCoefficients(x, coeffs):
    """Returns a polynomial for ``x`` values for the ``coeffs`` provided.

    The coefficients must be in ascending order (``x**0`` to ``x**o``).
    """
    o = len(coeffs)
    print(f'# This is a polynomial of order {o}.')
    y = 0
    for i in range(o):
        y += coeffs[i]*x**i
    return y
x = np.linspace(0, 9, 100)
coeffs = [b, w1, w2, w3, w4, w5]
plt.plot(x, PolyCoefficients(x, coeffs))
plt.show()
# In[]
# Calculate the intercept and gradient of the decision boundary.
c = -b/w2
m = -w1/w2
# Plot the data and the classification with the decision boundary.
xmin, xmax = -1, 2
ymin, ymax = -1, 2.5
xd = np.array([xmin, xmax])
yd = m*xd + c
#plt.plot(xd, yd, 'k', lw=1, ls='--')
plt.plot(x, PolyCoefficients(x, coeffs))
plt.fill_between(xd, yd, ymin, color='tab:blue', alpha=0.2)
plt.fill_between(xd, yd, ymax, color='tab:orange', alpha=0.2)
plt.scatter(*X[Y==0].T, s=8, alpha=0.5)
plt.scatter(*X[Y==1].T, s=8, alpha=0.5)
plt.xlim(xmin, xmax)
plt.ylim(ymin, ymax)
plt.ylabel(r'$x_2$')
plt.xlabel(r'$x_1$')
plt.show()
Let me generate some demo data.
from sklearn.preprocessing import PolynomialFeatures
import numpy as np
import matplotlib.pyplot as plt
import sklearn.linear_model
X = np.random.normal(size=(1000, 2))
Y = ((X[:,0] - X[:,1] + 0.4*X[:,0]*X[:,1] + 0.7*X[:,0]**2 - 0.8*X[:,1]**2 +
np.random.normal(scale=0.1, size=(1000,))) >= 0).astype(int)
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
Apart from the randomness, the data looks something like this.
Train the model like you did.
poly = PolynomialFeatures(degree=2, interaction_only=False, include_bias=False)
X_poly = poly.fit_transform(X)
# Fit the data to a logistic regression model.
clf = sklearn.linear_model.LogisticRegression()
clf.fit(X_poly, Y)
print(poly.powers_)
#[[1 0]
# [0 1]
# [2 0]
# [1 1]
# [0 2]]
This tells us that the features are ordered as: x1, x2, x1^2, x1*x2, x2^2.
So collect the coefficients and the intercept and give them intuitive names.
w1, w2, w11, w12, w22 = clf.coef_[0]
b = clf.intercept_[0]
By definition, the decision boundary is a set of (x1, x2) such that the probability is even between the two classes. Mathematically, they are the solutions to:
b + w1*x1 + w2*x2 + w11*x1^2 + w12*x1*x2 + w22*x2^2 = 0
If we fix x1, then this is a quadratic equation of x2, which we can solve analytically. The following function does this job.
def boundary(x1):
    # returns x2 on the boundary for a given x1
    # we solve the quadratic equation
    #   a x^2 + b x + c = 0
    #   --> x = (-b +- sqrt(b^2 - 4ac)) / 2a
    a_ = w22
    b_ = w2 + w12 * x1
    c_ = b + w1*x1 + w11*x1**2
    tmp = b_**2 - 4*a_*c_
    if tmp < 0:
        return None
    ans = [(-b_ + tmp**0.5) / (2*a_), (-b_ - tmp**0.5) / (2*a_)]
    ans.sort()  # smaller first
    return ans
# compute the boundaries
xs = np.linspace(X[:,0].min(), X[:,0].max(), num=100)
ys_1 = []
ys_2 = []
for x1 in xs:
    tmp = boundary(x1)
    if tmp is None:
        ys_1.append(None)
        ys_2.append(None)
    else:
        ys_1.append(tmp[0])  # smaller boundary
        ys_2.append(tmp[1])  # larger boundary
Now that we have the boundaries as data, we can visualize them easily.
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
plt.plot(xs, ys_1, c="green")
plt.plot(xs, ys_2, c="gray")
# if ys contains None, need to skip them
plt.fill_between(xs, ys_1, ys_2, color='tab:blue', alpha=0.2)
plt.fill_between(xs, min(ys_1), ys_1, color='tab:orange', alpha=0.2)
plt.fill_between(xs, ys_2, max(ys_2), color='tab:orange', alpha=0.2)
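As the comment above warns, fill_between does not handle None entries, and min/max will fail on them in Python 3. One sketch of a workaround is to cast the lists to float arrays (None becomes nan) and mask out the invalid entries:

ys_1 = np.array(ys_1, dtype=float)  # None -> nan
ys_2 = np.array(ys_2, dtype=float)  # nan at the same positions as ys_1
ok = ~np.isnan(ys_1)
plt.fill_between(xs[ok], ys_1[ok], ys_2[ok], color='tab:blue', alpha=0.2)
plt.fill_between(xs[ok], np.nanmin(ys_1), ys_1[ok], color='tab:orange', alpha=0.2)
plt.fill_between(xs[ok], ys_2[ok], np.nanmax(ys_2), color='tab:orange', alpha=0.2)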
Notice that the boundaries can be explicitly computed because the model is quadratic. Different approaches are needed for more general, complex classifiers.
An easier, generally applicable approach is to create dummy data containing various combinations of the variables, let the classifier predict on them, and plot the points with the color given by the predicted class.
xs = np.linspace(X[:,0].min(), X[:,0].max(), num=100)
ys = np.linspace(X[:,1].min(), X[:,1].max(), num=100)
newX = []
for x1 in xs:
for x2 in ys:
newX.append((x1, x2))
newX = np.array(newX)
p = clf.predict(poly.transform(newX))
flg = (Y > 0)
plt.scatter(X[flg,0], X[flg,1], alpha=0.3, marker="o")
plt.scatter(X[~flg,0], X[~flg,1], alpha=0.3, marker="x")
flg = (p > 0)
plt.scatter(newX[flg,0], newX[flg,1], alpha=0.02, c="tab:blue", marker="s", s=20)
plt.scatter(newX[~flg,0], newX[~flg,1], alpha=0.02, c="tab:orange", marker="s", s=20)
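As a variant of the same idea, the grid predictions can be reshaped and handed to plt.contour, which draws the boundary as a single curve instead of shading squares. A minimal sketch, reusing clf, poly, xs and ys from above:

xx, yy = np.meshgrid(xs, ys)
grid = np.column_stack([xx.ravel(), yy.ravel()])
zz = clf.predict(poly.transform(grid)).reshape(xx.shape)
plt.contour(xx, yy, zz, levels=[0.5], colors='k')  # the predicted class flips across this line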
The output of your PolyCoefficients function is a 5th order polynomial in the single variable x, made up of:
coeffs[0]*x^0 + coeffs[1]*x^1 + coeffs[2]*x^2 + coeffs[3]*x^3 + coeffs[4]*x^4 + coeffs[5]*x^5
Instead, what you need is the 2nd order polynomial in the two features (specified by your parameter degree = 2 inside the sklearn.preprocessing.PolynomialFeatures object), which is really the following:
b + w1*x1 + w2*x2 + w3*x1^2 + w4*x1*x2 + w5*x2^2
This formula is valid as long as you are using two features x1 and x2; otherwise you would need x1, x2, ..., xN and all the terms that derive from (x1 + x2 + ... + xN)^2.
You can find more details and examples here.
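To make the term-to-coefficient mapping concrete, here is a minimal sketch of the decision function the classifier evaluates, using b and w1..w5 as unpacked in the question's code and the poly.powers_ ordering shown in the other answer:

def decision(x1, x2):
    # feature order matches poly.powers_: x1, x2, x1^2, x1*x2, x2^2
    return b + w1*x1 + w2*x2 + w3*x1**2 + w4*x1*x2 + w5*x2**2

The boundary is the set of points where decision(x1, x2) == 0, i.e. where the predicted probability is 0.5; it is a conic section in the (x1, x2) plane, not a single-variable polynomial y(x).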
I'm plotting two surface plots in Python obtained from np.meshgrid, which I want to merge into a single surface plot. For instance:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm

fig = plt.figure(figsize=(8,4))
ax = fig.add_subplot(projection='3d')
# First surface:
x1 = np.linspace(0,1,100)
y1 = np.linspace(0,1,100)
X1,Y1 = np.meshgrid(x1,y1)
Z1 = 2*Y1
solid = ax.plot_surface(X1,Y1,Z1,cmap=cm.coolwarm,linewidth=0, antialiased=True)
# Second surface:
x2 = np.linspace(0,1,100)
y2 = np.linspace(1,2,100)
X2,Y2 = np.meshgrid(x2,y2)
Z2 = 2*Y2
solid = ax.plot_surface(X2,Y2,Z2,cmap=cm.coolwarm,linewidth=0, antialiased=True)
My first idea was to append them with np.append, such that:
X = np.append(X1,X2)
Y = np.append(Y1,Y2)
Z = np.append(Z1,Z2)
solid = ax.plot_surface(X,Y,Z,cmap=cm.coolwarm,linewidth=0, antialiased=True)
But as expected, I got an error, since Z was flattened by np.append, and moreover I'm not sure the X and Y arrays are right. In the example above it's trivial to see how I could create only one surface, since both pieces have the same slope, but I want to solve the general case where they could have different inclinations. How can I overcome this problem? Thank you in advance!
I would simply fill only one Z array, for example:
fig = plt.figure(figsize=(8,4))
ax = fig.add_subplot(projection='3d')
x = np.linspace(0,1,100)
y = np.linspace(0,2,100)
X, Y = np.meshgrid(x, y)
Z = np.empty_like(Y)
Z[Y < 1] = 2 * Y[Y < 1]
Z[Y >= 1] = 10 * Y[Y >= 1] - 8 # different slope
solid = ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=True)
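If the two patches really do come from separate meshes, another sketch (assuming both share the same x grid and their y ranges are contiguous, as in the question) is to stack the 2-D arrays row-wise instead of using np.append, which flattens them:

X = np.vstack([X1, X2])  # (200, 100) instead of a flat array
Y = np.vstack([Y1, Y2])
Z = np.vstack([Z1, Z2])
solid = ax.plot_surface(X, Y, Z, cmap=plt.cm.coolwarm, linewidth=0, antialiased=True)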
I'm working on a contour plot with matplotlib, and my data has a region with a strong gradient. The problem is that matplotlib distributes the colors of the selected colormap linearly over the whole spectrum of values. Since 90% of my data points are within one end of the spectrum, and only this small region acts completely differently, my contour plot looks kind of monochrome, as you can see in the attached picture.
Also, I've added some contour lines to make the differences in the values more visible. Since there is a huge gradient at a specific spot, there are a lot of contour lines there, and it is super hard to see the underlying colors or values.
Is there a good way to handle such "problematic" regions with matplotlib? Maybe to define another colormap there? I've tried setting some manual levels and "cutting out" the specific region, but it would be nice to find a way to display the values of this region.
Just to get a feeling: my minimal value to display is around 7, the maximum value is 145 and the average
Here is the important part of my code:
import numpy as np
import matplotlib.pyplot as plt
from matplotlib import cm
from scipy.ndimage import gaussian_filter

z = np.array(z)  # z is a list of values that I've read in before from a file
X = np.arange(0, 61, 1)
Y = np.arange(0, 151, 1)
z = z.reshape((len(Y), len(X)))
blurred = gaussian_filter(z, sigma=2)  # applies a gaussian filter to smooth the plot
xx, yy = np.meshgrid(X, Y)  # gets the grid for the plot
procent = np.arange(np.min(z), np.max(z), 5)  # levels for the contour lines
newlevels = [5, 10, 15, 20, 30, 40, 50, 80, 100, 120, 140]  # manual levels, with a stronger focus on the first part of the spectrum
plusmin = plt.contourf(xx, yy, z, origin='lower', extend='both', levels=procent)
levels = np.arange(np.min(z), np.max(z), 3)
CS = plt.contourf(xx, yy, z, levels=newlevels, extend="both", cmap=cm.viridis)
s = plt.contour(xx, yy, blurred, plusmin.levels, colors='white', linewidths=2)
cbar = plt.colorbar(CS, fraction=0.042, pad=0.04)
plt.clabel(s, fontsize=12, inline=1, colors='white')  # ax was undefined in the original snippet; plt.clabel acts on the current axes
A solution might be to scale the colormap so that each color is equally displayed.
Here is a piece of code I use to handle this kind of problem; there is certainly a more proper way to do it with matplotlib, but I do not know it.
import matplotlib.pyplot as plt
from matplotlib import colors
import numpy as np
import copy

#-----------------------
def repartition(z):
    """Compute the repartition (cumulative distribution) function of an array."""
    hist, bin_edges = np.histogram(z.flat[:], bins=1000)
    x = 0.5 * (bin_edges[1:] + bin_edges[:-1])
    y = np.cumsum(np.array(hist, float))
    y = (y - y[0]) / (y[-1] - y[0])
    return x, y

#-----------------------
def adjustcmap2data(Z, cmap, N=16384):
    """Scale the colormap so that all colors are equally displayed."""
    def cmap2xs(cmap):
        "convert cmap to matrix"
        sd = cmap._segmentdata
        xr = [tup[0] for tup in sd['red']]
        xg = [tup[0] for tup in sd['green']]
        xb = [tup[0] for tup in sd['blue']]
        return tuple(np.array(x) for x in (xr, xg, xb))
    def xs2cmap(cmap, xr, xg, xb):
        "convert matrix to cmap"
        sd = cmap._segmentdata
        for k in ('red', 'green', 'blue'):
            sd[k] = [list(tup) for tup in sd[k]]
        for i in range(len(sd['red'])):
            sd['red'][i] = (xr[i], sd['red'][i][1], sd['red'][i][2])
        for i in range(len(sd['green'])):
            sd['green'][i] = (xg[i], sd['green'][i][1], sd['green'][i][2])
        for i in range(len(sd['blue'])):
            sd['blue'][i] = (xb[i], sd['blue'][i][1], sd['blue'][i][2])
        for k in ('red', 'green', 'blue'):
            sd[k] = tuple(sd[k])
        return colors.LinearSegmentedColormap('mycmap_%010.0f' % (np.random.randn() * 1.e10), sd, N)
    x, y = repartition(Z)
    x = (x - x[0]) / (x[-1] - x[0])
    xr, xg, xb = cmap2xs(cmap)
    # remap the color anchor positions through the data's repartition function
    xrr = np.interp(xr, xp=y, fp=x)
    xgg = np.interp(xg, xp=y, fp=x)
    xbb = np.interp(xb, xp=y, fp=x)
    for x in (xrr, xgg, xbb):
        x[x < 0.] = 0.
        x[x > 1.] = 1.
        x[0], x[-1] = 0., 1.
        x.sort()
    mycmap = xs2cmap(copy.deepcopy(cmap), xrr, xgg, xbb)
    return mycmap

#---------------------
def fake_data():
    """Generate a fake dataset."""
    x = np.linspace(-1., 1., 256)
    y = np.linspace(-1., 1., 256)
    X, Y = np.meshgrid(x, y)
    Z = np.zeros_like(X)
    # create background noise
    for _ in range(100):
        x0 = np.random.randn()
        y0 = np.random.randn()
        Z += 0.05 * np.exp(-0.5 * (((X - x0) / 0.1) ** 2. + ((Y - y0) / 0.1) ** 2.))
    # add strong peak
    Z += np.exp(-0.5 * (((X - 0.5) / 0.3) ** 2. + ((Y - 0.5) / 0.02) ** 2.))
    return X, Y, Z

#---------------------
if __name__ == "__main__":
    X, Y, Z = fake_data()

    plt.figure()
    cmap = plt.cm.nipy_spectral  # plt.cm.spectral was removed in newer matplotlib versions
    plt.pcolormesh(X, Y, Z, cmap=cmap)
    plt.colorbar()
    plt.contour(X, Y, Z, colors="w")
    plt.gcf().show()

    plt.figure()
    scaledcmap = adjustcmap2data(Z, cmap=cmap)
    plt.pcolormesh(X, Y, Z, cmap=scaledcmap)
    plt.colorbar()
    plt.gcf().show()
    input('pause')  # was raw_input in the original Python 2 code
which should give you the following results
1) linear colorbar
2) scaled colorbar
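Newer matplotlib versions also ship normalization classes that achieve a similar effect without rebuilding the colormap, such as colors.LogNorm or colors.PowerNorm. A minimal sketch, assuming Z is non-negative (the small offset keeps the logarithm defined):

plt.pcolormesh(X, Y, Z + 1e-6, norm=colors.LogNorm(), cmap=plt.cm.viridis)
plt.colorbar()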
I have a set of data for which I need to identify patterns, so I tried to use plt.contour and plt.contourf for that task, and it works well: now I can plot contours and show graphically the overdensity among the data. In this step, I tried without success to get the information of the contours (I mean, save the values that define a contour in a variable to use later). Is there a way to do this?
Also, I have doubts about what the values in the color bar mean. I know it is the level of overdensity of the data, but if someone could tell me more details, that would be great.
I attach the code that I'm using so far (I generate the data in this case) and a plot of the result.
import scipy.interpolate
import numpy as np
import scipy.stats as st
import matplotlib.pyplot as plt
np.random.seed(20)
data = np.random.rand(400,2)
x = data[:,0]
y = data[:,1]
plt.figure(figsize=(12,7))
# Set up a regular grid of points
xi, yi = np.linspace(x.min(), x.max(), 100), np.linspace(y.min(), y.max(), 100)
xi, yi = np.meshgrid(xi, yi)
#contours:
n_contours = 6
positions = np.vstack([xi.ravel(), yi.ravel()])
values = np.vstack([x, y])
kernel = st.gaussian_kde(values)
f = np.reshape(kernel(positions).T, xi.shape)
cfset = plt.contourf(xi, yi, f,n_contours, cmap='Greens')
cset = plt.contour(xi, yi, f,n_contours, colors='k')
#For the points data:
positions = np.vstack([x.ravel(), y.ravel()])
values = np.vstack([x, y])
kernel = st.gaussian_kde(values)
z = np.reshape(kernel(positions).T, x.shape)
#plot:
plt.scatter(x, y, c=z)
plt.colorbar(cfset)
plt.show()
thanks!
EDIT:
I found a way to do this using the get_paths() feature. Basically, you need to choose the contour level, and then the number of the segment from which you want the (x, y) values, for example:
#contour 3, section 0
p = cset.collections[3].get_paths()[0]
v = p.vertices
x0 = v[:,0]
y0 = v[:,1]
#contour 3, section 1
p = cset.collections[3].get_paths()[1]
v = p.vertices
x1 = v[:,0]
y1 = v[:,1]
#contour 3, section 2
p = cset.collections[3].get_paths()[2]
v = p.vertices
x2 = v[:,0]
y2 = v[:,1]
plt.plot(x0,y0,'-',x1,y1,'-',x2,y2,'-')
With this, you get:
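A note for newer matplotlib: ContourSet.collections was deprecated in version 3.8, but the same vertices are exposed through allsegs, indexed as allsegs[level][segment]. A minimal sketch:

# contour 3, section 0: allsegs[3][0] is an (N, 2) array of (x, y) vertices
x0, y0 = cset.allsegs[3][0].T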
I am trying to plot profiles from interpolated data. To begin with, my data is three columns: x, y, c.
First I interpolate the data onto a regular grid using:
import numpy as np
import scipy.interpolate
import scipy.ndimage
import matplotlib.pyplot as plt

xi, yi = np.linspace(np.min(X), np.max(X), 300), np.linspace(np.min(Y), np.max(Y), 300)
xi, yi = np.meshgrid(xi, yi)
zi = scipy.interpolate.griddata((X, Y), C, (xi, yi), method='nearest')
Now, following the thread How to extract an arbitrary line of values from a numpy array?,
I want to plot the values at X = 5 for Y = -4 to 4:
x0, y0 = 5, -4 # These are in _pixel_ coordinates!!
x1, y1 = 5, 4
num = 50
x, y = np.linspace(x0, x1, num), np.linspace(y0, y1, num)
# Get the values at those locations
zi2 = scipy.ndimage.map_coordinates(np.transpose(zi), np.vstack((x, y)))
# Plot
fig, axes = plt.subplots(nrows=2)
axes[0].imshow(zi, vmin=np.min(C), vmax=110, origin='lower',extent= [np.min(X), np.max(X), np.min(Y), np.max(Y)])
axes[0].plot([x0, x1], [y0, y1], 'ro-')
axes[0].axis('image')
axes[1].plot(y,zi2)
plt.show()
I get the following plot, which does not look the same as the way the contour plot behaves.
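A likely cause, hinted at by the comment in the code above: map_coordinates indexes the array in pixel (row, column) coordinates, while x0, y0, x1, y1 here are data coordinates, so they need to be converted to indices on the 300x300 grid first. A minimal sketch under that assumption:

# map data coordinates onto array indices of the regular grid
col = (x - np.min(X)) / (np.max(X) - np.min(X)) * (zi.shape[1] - 1)
row = (y - np.min(Y)) / (np.max(Y) - np.min(Y)) * (zi.shape[0] - 1)
zi2 = scipy.ndimage.map_coordinates(zi, np.vstack((row, col)))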