My idea is to apply linear regression to a time series dataset to approximate the direction it is evolving in: first I fit the line, then I calculate the slope and check whether the plot is increasing, decreasing, or constant.
For that, I relied on this code:
import numpy as np
import matplotlib.pyplot as plt

def estimate_coef(x, y):
    # number of observations/points
    n = np.size(x)
    # mean of x and y vector
    m_x = np.mean(x)
    m_y = np.mean(y)
    # calculating cross-deviation and deviation about x
    SS_xy = np.sum(y*x) - n*m_y*m_x
    SS_xx = np.sum(x*x) - n*m_x*m_x
    # calculating regression coefficients
    b_1 = SS_xy / SS_xx
    b_0 = m_y - b_1*m_x
    return (b_0, b_1)

def plot_regression_line(x, y, b):
    # plotting the actual points as scatter plot
    plt.scatter(x, y, color="m", marker="o", s=30)
    # predicted response vector
    y_pred = b[0] + b[1]*x
    # plotting the regression line
    plt.plot(x, y_pred, color="g")
    # putting labels
    plt.xlabel('x')
    plt.ylabel('y')
    # function to show plot
    plt.show()
For that I need an X and Y array.
The data I extracted had an index in the format of a date "Y-M-D".
As you may know, for linear regression it does not make sense to have the date as the index, hence I used A.reset_index() to get numeric indexes.
Now that I have my data, I need to extract the indexes into an array X and the data to be plotted into an array Y.
Therefore my question is: how do I extract these new indexes and put them in the array X?
You can do:
x = [i + 1 for i in A.index]  # to make x start at 1 instead of 0
y = A['lift']
And then you apply your functions to those x and y.
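With those in place, a minimal sketch of how this ties into the functions above (assuming A is your DataFrame after reset_index() and 'lift' is the column to trend; the values are converted to numpy arrays so the vectorized math inside the functions works):

import numpy as np

x = np.array([i + 1 for i in A.index])  # numeric positions starting at 1
y = A['lift'].to_numpy()

b = estimate_coef(x, y)
print("intercept b_0 =", b[0], ", slope b_1 =", b[1])  # the sign of b_1 gives the trend direction
plot_regression_line(x, y, b)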
I have been trying to write a piece of software to bootstrap data where every data point has a different and unique uncertainty. I take this uncertainty as the standard deviation when sampling that point from a Gaussian distribution.
I run many samples; however, the bootstrapped result does not agree with the curve_fit best result (the only difference I can think of is that curve_fit takes the data points and assumes they have no uncertainty), but these two results should, by definition, be identical. Any ideas why?
The code is as follows, with inputs:

def f(x, a, b):
    y = a*x + b
    return y

x (array, x data points)
y (array, y data points)
x_err (array, uncertainty in each x point)
y_err (array, uncertainty in each y point)
n_samples = 10000
conf_pct = 68 (% for a 1 sigma test)
So just for clarity, x[i], y[i], x_err[i] and y_err[i] are all the information associated with the ith data point. (I did have these in a dataframe but pulled them out into arrays because I understood the processing better.)
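For context, this is roughly how I pulled the arrays out of the DataFrame, with placeholder column names (the real names differ):

# hypothetical column names, shown only to illustrate how the arrays line up per data point
x = df['x'].to_numpy()
y = df['y'].to_numpy()
x_err = df['x_err'].to_numpy()
y_err = df['y_err'].to_numpy()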
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit

def bootstrap_fit(f, x, y, x_err, y_err, n_samples, conf_pct):
    # n_samples number of draws from each data point, and then
    # take the transpose to make n_samples number of samples (because n_samples >> len(x) )
    x_sampling = []
    y_sampling = []
    a_boot = []
    b_boot = []
    # cov_boot = [] # don't think we'll need this but just in case?
    for i, this_x in enumerate(x):
        this_x_err = x_err[i]
        this_y = y[i]
        this_y_err = y_err[i]
        this_x_samp = np.random.normal(loc=this_x, scale=this_x_err, size=n_samples)
        this_y_samp = np.random.normal(loc=this_y, scale=this_y_err, size=n_samples)
        x_sampling.append(this_x_samp)
        y_sampling.append(this_y_samp)
    # convert to np arrays and take the transpose
    x_sampling = np.array(x_sampling).T
    y_sampling = np.array(y_sampling).T
    # ok, now that we have n_samples number of datasets randomly sampled within
    # the actual errorbars of the data, let's fit each of those datasets
    # notice how this_x and this_y and i, etc, are temporary variables
    # that will get overwritten from the past loop
    for i, this_x in enumerate(x_sampling):
        this_y = y_sampling[i]
        p_opt, p_cov = curve_fit(f, this_x, this_y)
        a_boot.append(p_opt[0])
        b_boot.append(p_opt[1])
        # cov_boot.append(p_cov)
    # make these into np arrays as well
    a_boot = np.array(a_boot)
    b_boot = np.array(b_boot)
    # set up an array to use to plot the lines (because each x, y random dataset
    # actually has slightly different min and max x values, and that gets messy)
    x_fit = np.linspace(np.min(x), np.max(x), num=1000, endpoint=True)
    y_fit = []
    for i, this_a in enumerate(a_boot):
        this_b = b_boot[i]
        this_y = f(x_fit, this_a, this_b)
        y_fit.append(this_y)
    y_fit = np.array(y_fit)
    # figure out from that what percentiles we actually need to identify
    conf_lo = (100. - conf_pct)/2.
    conf_hi = 100. - conf_lo
    # set up the lists that will hold the upper and lower lines
    y_upper = []
    y_lower = []
    y_median = []
    y_difference = []
    for i, this_x in enumerate(x_fit):
        # we need to extract all the y-values for every random sample that correspond
        # to this x value. We will just take the ith array of the transpose of y_boot.
        this_y = y_fit.T[i]
        # add the percentile values to each list for this value of x
        y_lower.append(np.percentile(this_y, conf_lo))
        y_upper.append(np.percentile(this_y, conf_hi))
        y_median.append(np.percentile(this_y, 50.))
    # make them numpy arrays because sometimes matplotlib doesn't like plotting lists
    y_lower = np.array(y_lower)
    y_upper = np.array(y_upper)
    y_median = np.array(y_median)
    # finding equation for the median line
    p_opt, p_cov = curve_fit(f, x_fit, y_median)
    a = float("{:.4f}".format(p_opt[0]))
    b = float("{:.4f}".format(p_opt[1]))
    for i, this_x in enumerate(x_fit):
        this_y = y_fit.T[i]
        spread_above = abs(point_line_distance(x, np.percentile(this_y, conf_hi), p_opt[0], p_opt[1]))
        spread_below = abs(point_line_distance(x, np.percentile(this_y, conf_lo), p_opt[0], p_opt[1]))
        orthog_distance = spread_above + spread_below
        y_difference.append(orthog_distance)
    spread = float("{:.4f}".format(np.amin(y_difference)))
    print("narrowest orthogonal point on bootstrap_2 "+str(spread))
    plt.fill_between(x_fit, y_lower, y_upper, alpha=0.4, label='Bootstrapped uncertainty at '+str(conf_pct)+'%')
    plt.plot(x_fit, y_median, label='Bootstrapped curve_fit: y = ('+str(a)+')x + ('+str(b)+')')
def CURVE_fit(x, y):
    # do a standard linear fitting to the data
    p_opt, p_cov = curve_fit(f, x, y)
    p_err = np.sqrt(np.diag(p_cov))
    # y=ax+b for a linear fit
    a, b = p_opt
    a_err, b_err = p_err
    x_plot = np.sort(x)
    plt.plot(x, a*x+b, label='best curve_fit: y = ('+result(a,a_err)+')x + ('+result(b,b_err)+')', color='purple', linestyle='dashed')
I've tried playing around with the number of samples and the data input, and I've even coded an entirely separate straight-line fit using ODR with a corresponding independent bootstrapping method (those two don't agree either, but that's a whole different issue), and nothing seems to reconcile these two values. Any ideas would be much appreciated.
I am trying to use Gaussian process regression on a cancer dataset using GPy, but when I fit a combination of 3 or 4 kernels the system collapses and gives LinAlgError: not positive definite, even with jitter. It does produce some output when I use a combination of two kernels. Here is the main code; the dataset image (year on the x-axis, tumor count on the y-axis) I am trying to predict is attached below:
k_rbf = GPy.kern.RBF(1, lengthscale=50,name = "rbf")
k_exp = GPy.kern.Exponential(1,lengthscale=6)
k_lin = GPy.kern.Linear(1)
k_per = GPy.kern.StdPeriodic(1, period = 5)
k = k_rbf * k_per + k_lin + k_exp
m = GPy.models.GPRegression(X, Y, k)
m.optimize()
def plot_gp(X, m, C, training_points=None):
    """ Plotting utility to plot a GP fit with 95% confidence interval """
    # Plot 95% confidence interval
    plt.fill_between(X[:,0],
                     m[:,0] - 1.96*np.sqrt(np.diag(C)),
                     m[:,0] + 1.96*np.sqrt(np.diag(C)),
                     alpha=0.5)
    # Plot GP mean and initial training points
    plt.plot(X, m, "-")
    plt.legend(labels=["GP fit"])
    plt.xlabel("x"), plt.ylabel("f")
    # Plot training points if included
    if training_points is not None:
        X_, Y_ = training_points
        plt.plot(X_, Y_, "kx", mew=2)
        plt.legend(labels=["GP fit", "sample points"])
X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
mean, Cov = m.predict(X_, full_cov=True)
plt.figure(figsize=(20, 10))
plot_gp(X_, mean, Cov)
plt.gca().set_xlim([1990,2060]), plt.gca().set_ylim([35000, 150000])
plt.plot(X, Y, "b.");
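For reference, one variation I might try (only a guess based on the code above, not a verified fix) is to rescale the targets and add an explicit white-noise term, so the covariance matrix is better conditioned before inversion:

# same kernel combination as above, plus a White kernel; normalizer=True rescales Y internally
k = k_rbf * k_per + k_lin + k_exp + GPy.kern.White(1, variance=1.0)
m = GPy.models.GPRegression(X, Y, k, normalizer=True)
m.Gaussian_noise.variance = 1.0  # start the likelihood noise away from zero
m.optimize(messages=False)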
I have a dataset that resembles the data created in the MWE below:
from matplotlib import pyplot as plt
import numpy as np
sz=100
x = np.linspace(-1, 1, sz)
mean = -np.sign(x)
noise = np.random.randn(*x.shape)
K = -2
y_true = K*x
y = y_true + mean + noise
plt.scatter(x, y, label="Data with error")
plt.plot(x, y_true, "-", label="True line")
plt.grid()
That is, the errors around the line I want are mostly negative for x>0 and mostly positive for x<0. What I'm looking for is a way to estimate the coefficient K (which in this case is -2).
Really I think the way to do it would be to minimize the error only of the points that fall above the line for x<0 and below the line for x>0, but I'm not sure how to go about it effectively in Python, since everything I can think of involves iterative processes which are slow in Python.
Basically you want to include something that can account for the mean variable in your data generating model. You can do this by modeling a discontinuity at the point x=0 by including a variable in your model that is 0 where x < 0 and 1 where x > 0.
We can even just include the "mean" variable itself and get the same model (with a different interpretation for the second coefficient). Here is a linear model that recovers the correct value for the slope of this discontinuous line. Note that this assumes the slope is the same on the right side of 0 as the left side.
from sklearn.linear_model import LinearRegression
X = np.array([x, mean]).T
reg = LinearRegression().fit(X, y)
print(reg.coef_)
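For reference, here is the same idea with the explicit 0/1 indicator described above instead of the mean variable itself (the step column is just one way to encode the discontinuity):

# indicator that is 0 for x < 0 and 1 for x > 0, i.e. the jump at x = 0
step = (x > 0).astype(float)
X_step = np.array([x, step]).T
reg_step = LinearRegression().fit(X_step, y)
print(reg_step.coef_)  # the first coefficient is again an estimate of K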
Here is my attempt, where I A) fit all data to a straight line, then B) separate the data depending on two criteria: whether x is greater than or less than zero and whether Y lies above or below that straight line, and finally C) fit the separated data. The slope here is -2.417 and will vary from run to run depending on the random data.
from matplotlib import pyplot as plt
import numpy as np
sz=100
x = np.linspace(-1, 1, sz)
mean = -np.sign(x)
noise = np.random.randn(*x.shape)
K = -2
y_true = K*x
y = y_true + mean + noise
plt.scatter(x, y, label="Data with error")
plt.plot(x, y_true, "-", label="True line")
###############################
# new section for calculating the new line
allDataFirstOrderParameters = np.polyfit(x, y, 1)
allDataFirstOrderErrors = y - np.polyval(allDataFirstOrderParameters, x)
newX = []
newY = []
for i in range(len(x)):
    if x[i] < 0 and allDataFirstOrderErrors[i] < 0:
        newX.append(x[i])
        newY.append(y[i])
    if x[i] > 0 and allDataFirstOrderErrors[i] > 0:
        newX.append(x[i])
        newY.append(y[i])
newX = np.array(newX)
newY = np.array(newY)
newFirstOrderParameters = np.polyfit(newX, newY, 1)
print("New Parameters", newFirstOrderParameters)
plotNewX = np.linspace(min(x), max(x))
plotNewY = np.polyval(newFirstOrderParameters, plotNewX)
plt.plot(plotNewX, plotNewY, label="New line")
plt.legend()
plt.show()
I have several data points in 3 dimensional space (x, y, z) and have interpolated them using scipy.interpolate.Rbf. This gives me a spline nicely representing the surface of my 3D object. I would now like to determine several x and y pairs that have the same, arbitrary z value. I would like to do that in order to compute the cross section of my 3D object at any given value of z. Does someone know how to do that? Maybe there is also a better way to do that instead of using scipy.interpolate.Rbf.
Up to now I have evaluated the cross sections by making a contour plot using matplotlib.pyplot and extracting the displayed segments.
(Figures: 3D points and interpolated spline; segments extracted using a contour plot)
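For completeness, a rough sketch of what I mean by extracting the displayed segments, on made-up data (the grid resolution, the level z0 = 0.75 and the random points are placeholders):

import numpy as np
import matplotlib.pyplot as plt
from scipy.interpolate import Rbf

# hypothetical scattered data and Rbf surface
pts_x, pts_y = np.random.random(200), np.random.random(200)
pts_z = np.exp(-2*(pts_x - .5)**2 - 4*(pts_y - .5)**2)
rbf = Rbf(pts_x, pts_y, pts_z)

# evaluate the spline on a grid and pull the contour segments at z0
xx, yy = np.meshgrid(np.linspace(0, 1, 200), np.linspace(0, 1, 200))
zz = rbf(xx, yy)
cs = plt.contour(xx, yy, zz, levels=[0.75])
segments = cs.allsegs[0]  # list of (N, 2) arrays of x, y points along the cross section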
I was able to solve the problem. I calculated the area by triangulating the x-y data and cutting the triangles with the plane z = z0 whose cross-sectional area I wanted. Specifically, I searched for the triangles whose z-values lie both above and below z0. Then I calculated the x and y values of the points where the sides of these triangles cross z0. Then I used scipy.spatial.ConvexHull to sort the intersection points, and with the shoelace formula I determined the area.
I have attached the example code here:
import numpy as np
from scipy import spatial
import matplotlib.pyplot as plt
# Generation of random test data
n = 500
x = np.random.random(n)
y = np.random.random(n)
z = np.exp(-2*(x-.5)**2-4*(y-.5)**2)
z0 = .75
# Triangulation of the test data
triang= spatial.Delaunay(np.array([x, y]).T)
# Determine all triangles where not all points are above or below z0, i.e. the triangles that intersect z0
tri_inter = np.zeros_like(triang.simplices, dtype=int)  # The triangles which intersect the plane at z0, filled below
i = 0
for tri in triang.simplices:
    if ~np.all(z[tri] > z0) and ~np.all(z[tri] < z0):
        tri_inter[i, :] = tri
        i += 1
tri_inter = tri_inter[~np.all(tri_inter == 0, axis=1)]  # Remove all rows with only 0
# The number of interpolated values for x and y has twice the length of the triangles
# Because each triangle intersects the plane at z0 twice
x_inter = np.zeros(tri_inter.shape[0]*2)
y_inter = np.zeros(tri_inter.shape[0]*2)
for j, tri in enumerate(tri_inter):
    # Determine which of the three points are above and which are below z0
    points_above = []
    points_below = []
    for i in tri:
        if z[i] > z0:
            points_above.append(i)
        else:
            points_below.append(i)
    # Calculate the intersections and put the values into x_inter and y_inter
    t = (z0 - z[points_below[0]])/(z[points_above[0]] - z[points_below[0]])
    x_new = t * (x[points_above[0]] - x[points_below[0]]) + x[points_below[0]]
    y_new = t * (y[points_above[0]] - y[points_below[0]]) + y[points_below[0]]
    x_inter[j*2] = x_new
    y_inter[j*2] = y_new
    if len(points_above) > len(points_below):
        t = (z0 - z[points_below[0]])/(z[points_above[1]] - z[points_below[0]])
        x_new = t * (x[points_above[1]] - x[points_below[0]]) + x[points_below[0]]
        y_new = t * (y[points_above[1]] - y[points_below[0]]) + y[points_below[0]]
    else:
        t = (z0 - z[points_below[1]])/(z[points_above[0]] - z[points_below[1]])
        x_new = t * (x[points_above[0]] - x[points_below[1]]) + x[points_below[1]]
        y_new = t * (y[points_above[0]] - y[points_below[1]]) + y[points_below[1]]
    x_inter[j*2 + 1] = x_new
    y_inter[j*2 + 1] = y_new
# sort points to calculate area
hull = spatial.ConvexHull(np.array([x_inter, y_inter]).T)
x_hull, y_hull = x_inter[hull.vertices], y_inter[hull.vertices]
# Calculation of area using the shoelace formula
area = 0.5*np.abs(np.dot(x_hull,np.roll(y_hull,1))-np.dot(y_hull,np.roll(x_hull,1)))
print('Area:', area)
plt.figure()
plt.plot(x_inter, y_inter, 'ro')
plt.plot(x_hull, y_hull, 'b--')
plt.triplot(x, y, triangles=tri_inter, color='k')
plt.show()
Let's say that I have the following data (measurements):
As you can see, there are a lot of sharp points (i.e. where the slope changes a lot). It would therefore be good to take some more measurements around those points. To do that I wrote a script:
I calculate the curvature of 3 consecutive points:
Menger curvature: https://en.wikipedia.org/wiki/Menger_curvature#Definition
Then I decide which values I should resample, based on the curvature.
...and I iterate until the average curvature goes down... but it does not work, because it goes up. Do you know why?
Here is the complete code (I stopped it once the length of the x values reaches 60):
import numpy as np
import matplotlib.pyplot as plt
def curvature(A, B, C):
    """Calculates the Menger curvature for three points, given as numpy arrays.
    Sources:
    Menger curvature: https://en.wikipedia.org/wiki/Menger_curvature#Definition
    Area of a triangle given 3 points: https://math.stackexchange.com/questions/516219/finding-out-the-area-of-a-triangle-if-the-coordinates-of-the-three-vertices-are
    """
    # Pre-check: Making sure that the input points are all numpy arrays
    if any(x is not np.ndarray for x in [type(A), type(B), type(C)]):
        print("The input points need to be a numpy array, currently it is a ", type(A))
    # Augment Columns
    A_aug = np.append(A, 1)
    B_aug = np.append(B, 1)
    C_aug = np.append(C, 1)
    # Calculate Area of Triangle
    matrix = np.column_stack((A_aug, B_aug, C_aug))
    area = 1/2*np.linalg.det(matrix)
    # Special case: Two or more points are equal
    if np.all(A == B) or np.all(B == C):
        curvature = 0
    else:
        curvature = 4*area/(np.linalg.norm(A-B)*np.linalg.norm(B-C)*np.linalg.norm(C-A))
    # Return Menger curvature
    return curvature
def values_to_calulate(x, curvature_list, max_curvature):
    """Calculates the new x values which need to be calculated:
    the middle point between the three points that were used to calculate the curvature"""
    i = 0
    new_x = np.empty(0)
    for curvature in curvature_list:
        if curvature > max_curvature:
            new_x = np.append(new_x, x[i] + (x[i+2] - x[i])/3)
        i = i+1
    return new_x
def plot(x, y, title, xLabel, yLabel):
    """Just to visualize"""
    # Plot
    plt.scatter(x, y)
    plt.plot(x, y, '-o')
    # Give a title for the sine wave plot
    plt.title(title)
    # Give x axis label for the sine wave plot
    plt.xlabel(xLabel)
    # Give y axis label for the sine wave plot
    plt.ylabel(yLabel)
    plt.grid(True, which='both')
    plt.axhline(y=0, color='k')
    # Display the sine wave (non-blocking so the loop below keeps running; plt.pause draws the figure)
    plt.show(block=False)
    plt.pause(0.05)
### STARTS HERE
# Get x values of the sine wave
x = np.arange(0, 10, 1)
# Amplitude of the sine wave is sine of a variable like time
def function(x):
    return 1 + np.sin(x)*np.cos(x)**2
y = function(x)
# Plot it
plot(x,y, title='Data', xLabel='Time', yLabel='Amplitude')
continue_Loop = True
while continue_Loop == True:
    curvature_list = np.empty(0)
    for i in range(len(x)-2):
        # Get the three points
        A = np.array([x[i], y[i]])
        B = np.array([x[i+1], y[i+1]])
        C = np.array([x[i+2], y[i+2]])
        # Calculate the curvature
        curvature_value = abs(curvature(A, B, C))
        curvature_list = np.append(curvature_list, curvature_value)
    print("len: ", len(x))
    print("average curvature: ", np.average(curvature_list))
    # Calculate the points that need to be added
    x_new = values_to_calulate(x, curvature_list, max_curvature=0.3)
    # Add those values to the current x list:
    x = np.sort(np.append(x, x_new))
    # STOPPED IT AFTER len(x) == 60
    if len(x) >= 60:
        continue_Loop = False
    # Amplitude of the sine wave is sine of a variable like time
    y = function(x)
    # Plot it
    plot(x, y, title='Data', xLabel='Time', yLabel='Amplitude')
This is how it should look:
EDIT:
If you let it run even further... :
So to summarize my comments above:
- You are computing the average curvature of your curve, which has no reason to go to 0. At every point, no matter how close your points get, the circle radius will converge to whatever the curvature is at that point, not 0.
- An alternative would be to use the absolute derivative change between two points: keep sampling until abs(d(df/dx)) < some_threshold, where d(df/dx) = (df/dx)[n] - (df/dx)[n-1] (see the sketch below).
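A rough sketch of that criterion (the threshold value and the rule for where to insert new points are my own choices; it reuses your function from above purely for illustration):

import numpy as np

def refine_by_slope_change(x, y_func, threshold=0.2, max_points=200):
    """Insert midpoints wherever the change in slope between neighbouring segments exceeds the threshold."""
    x = np.asarray(x, dtype=float)
    while len(x) < max_points:
        y = y_func(x)
        slopes = np.diff(y) / np.diff(x)        # (df/dx)[n] for each segment
        slope_change = np.abs(np.diff(slopes))  # |d(df/dx)| between neighbouring segments
        bad = np.where(slope_change > threshold)[0]
        if len(bad) == 0:
            break  # every pair of neighbouring slopes is smooth enough
        # insert a midpoint in the segment that follows each offending slope change
        new_x = (x[bad + 1] + x[bad + 2]) / 2
        x = np.sort(np.unique(np.append(x, new_x)))
    return x

x_refined = refine_by_slope_change(np.arange(0, 10, 1.0), function)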