I am currently trying to learn Bayesian Statistics and how to implement it with pymc for a project at work.
So far I'm just playing around with this linear regression tutorial.
Here's the code snippet to generate y_observed:
import numpy as np
import matplotlib.pyplot as plt

size = 200
true_intercept = 1
true_slope = 2
x = np.linspace(0, 1, size)

# y = a + b*x
true_y = true_intercept + true_slope * x
y_observed = true_y + np.random.normal(scale=0.5, size=size)
And here's the plot of y_observed and true_y:
Then, following the tutorial, I fit the model as follows:
import pymc as pm

with pm.Model() as model:
    sigma = pm.HalfCauchy("sigma", beta=10)
    intercept = pm.Normal("intercept", 0, sigma=20)
    slope = pm.Normal("slope", 0, sigma=20)
    likelihood = pm.Normal("y", mu=intercept + slope * x, sigma=sigma, observed=y_observed)
    idata = pm.sample(3000)
My question then is: how should I represent the posterior that I got (idata.posterior) in my graph?
Considering that the posterior was sampled from 4 different chains, I thought of calculating the mean of my intercept, slope and sigma, and plotting that line plus and minus the std of sigma... but that gave me a very bad line, which makes me believe it is wrong.
Below is my code snippet for what I just described, along with the graph that I got:
def plot(x, true_y, true_line, idata):
    intercept_mean = float(idata.posterior.intercept.mean())  # mean of the intercept (over all 4 chains)
    slope_mean = float(idata.posterior.slope.mean())          # mean of the slope (over all 4 chains)
    sigma_mean = float(idata.posterior.sigma.mean())          # mean of the error (over all 4 chains)
    sigma_std = float(idata.posterior.sigma.std())            # std of the error (over all 4 chains)
    y_pred_no_std = intercept_mean + slope_mean * x + sigma_mean
    y_pred_plus = y_pred_no_std + sigma_std
    y_pred_minus = y_pred_no_std - sigma_std
    fig = plt.figure(figsize=(7, 7))
    ax = fig.add_subplot(111, xlabel="x", ylabel="y", title="Generated data and underlying model")
    ax.plot(x, true_line, "x", label="y_observed")
    ax.plot(x, true_y, label="true regression line", lw=2.0)
    ax.plot(x, y_pred_plus, label="pymc regression mean + std")
    ax.plot(x, y_pred_no_std, label="pymc regression mean")
    ax.plot(x, y_pred_minus, label="pymc regression mean - std")
    plt.legend(loc=0)
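For reference, one common way to visualize posterior uncertainty (a hedged sketch of my own, not code from the tutorial) is to overlay regression lines drawn from individual posterior samples rather than a single mean ± std band; sigma describes the spread of the observations around the line, not the uncertainty of the line itself. Assuming idata comes from the model above:
post = idata.posterior.stack(sample=("chain", "draw"))  # flatten the 4 chains into one sample dimension
fig, ax = plt.subplots(figsize=(7, 7))
ax.plot(x, y_observed, "x", label="y_observed")
ax.plot(x, true_y, lw=2.0, label="true regression line")
for i in range(0, post.sizes["sample"], 100):  # thin the draws so the plot stays readable
    ax.plot(x, float(post["intercept"][i]) + float(post["slope"][i]) * x, color="C1", alpha=0.05)
ax.plot(x, float(post["intercept"].mean()) + float(post["slope"].mean()) * x, color="C3", label="posterior mean line")
ax.legend(loc=0)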
Related
I am currently trying to curve fit some data from an OSA (dBm vs. frequency), and I need to fit 2 peaks. I am thinking that either a Gaussian or a Lorentzian is the way to go, but either way my curve fit does not work very well. The first image is the Gaussian fit and the second is the Lorentzian. I am also thinking that I could possibly cut the data in half and fit each side separately, but I'm not entirely sure how to do it. If anyone has any ideas how to fix my problem, or has a better solution, I'd really appreciate it. (Attached images: Gaussian fit, Lorentzian fit.)
def _2gaussian(x, amp1, cen1, sigma1, amp2, cen2, sigma2):
    return amp1 * (1 / (sigma1 * (np.sqrt(2 * np.pi)))) * np.exp((-1.0 / 2.0) * (((x - cen1) / sigma1) ** 2)) + \
           amp2 * (1 / (sigma2 * (np.sqrt(2 * np.pi)))) * np.exp((-1.0 / 2.0) * (((x - cen2) / sigma2) ** 2))
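# NOTE: _1gaussian is called further down but was not included in the snippet;
# a plausible single-Gaussian helper matching _2gaussian (an assumption, not the original code) would be:
def _1gaussian(x, amp1, cen1, sigma1):
    return amp1 * (1 / (sigma1 * (np.sqrt(2 * np.pi)))) * np.exp((-1.0 / 2.0) * (((x - cen1) / sigma1) ** 2))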
def plot(x, y):
    # use the two detected peaks as initial guesses for the centres and amplitudes
    peaks, _ = find_peaks(y, height=-65.0, distance=30000)
    w = peaks[0]
    z = peaks[1]
    cen1 = x[w]
    cen2 = x[z]
    amp1 = y[w]
    amp2 = y[z]
    sigma1 = 1000
    sigma2 = 500
    popt_2gauss, pcov_2gauss = curve_fit(_2gaussian, x, y, p0=[amp1, cen1, sigma1, amp2, cen2, sigma2], maxfev=2500)
    perr_2gauss = np.sqrt(np.diag(pcov_2gauss))
    pars_1 = popt_2gauss[0:3]
    pars_2 = popt_2gauss[3:6]
    gauss_peak_1 = _1gaussian(x, *pars_1)
    gauss_peak_2 = _1gaussian(x, *pars_2)
    fig, ax = plt.subplots()
    ax.plot(x / 1e3, y, ',', label='data')
    ax.plot(x / 1e3, _2gaussian(x, *popt_2gauss), label='curve fit')
    plt.xlim([193.300, 193.460])
    plt.ylim([-100, 0])
    plt.title('Fig. 3 - Fit for Time Constant')
    plt.legend(loc='best')
    plt.show()
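Since the post also mentions a Lorentzian, a two-Lorentzian model (a hedged sketch of my own, not part of the original code) could be defined the same way and passed to curve_fit in place of _2gaussian:
def _2lorentzian(x, amp1, cen1, wid1, amp2, cen2, wid2):
    # sum of two Lorentzian peaks; wid is the half-width at half-maximum, amp the peak height
    return (amp1 * wid1**2 / ((x - cen1)**2 + wid1**2) +
            amp2 * wid2**2 / ((x - cen2)**2 + wid2**2))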
I am trying to create a Gaussian fit using scipy.optimize curve_fit.
My y data have Poisson errors, so I need to incorporate these uncertainties into my curve fit, but I don't know how.
At first I created a function fit_gauss which worked without the errors in y; now I am trying to modify this code.
That's what I've got:
import numpy as np
import matplotlib.pyplot as plt
from scipy.optimize import curve_fit
from uncertainties import unumpy as unp

x = x_data  # data are imported from a file
y = y_data
y_un = unp.uarray(y, np.sqrt(y))  # attach Poisson (sqrt(N)) uncertainties to the counts
print("DATA - Gauss")

# Define Gauss function
def f_gauss(x, a, x0, sigma):
    return a * np.exp(-(x - x0) ** 2 / (2 * sigma ** 2))

# Define fitting function
def fit_gauss(x, y, title, path):
    n = len(x)
    mean = sum(x * y) / n
    w = [0] * len(x)
    for i in range(len(x)):
        w[i] = y[i] * (x[i] - mean) ** 2
    sigma = (sum(w) / sum(y)) ** (1 / 2)
    # sigma = (sum(y * (x - mean)**2) / sum(y))**(1/2)
    gopt, gcov = curve_fit(
        f_gauss,
        x, y,
        p0=[max(y), mean, sigma]
    )  # trying to use curve_fit
    gerrors = np.sqrt(np.diag(gcov))
    unparams_gauss = unp.uarray(gopt, gerrors)
    print(f"""
{title}
Mean: {mean}
Sigma: {sigma}
a={unparams_gauss[0]}
x0={unparams_gauss[1]}
sigma={unparams_gauss[2]}
""")
    # plotting
    plt.title(title)
    plt.plot(x, y, "k", label=f"{title}")
    plt.plot(x, f_gauss(x, *gopt), "r--", label="Gauß Fit")
    plt.legend(loc="best")
    plt.savefig(path)
    plt.close()

fit_gauss(x, y_un, "Cs-137", "plots/gauss_fit.pdf")
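For what it's worth, scipy.optimize.curve_fit can take per-point uncertainties through its sigma argument (with absolute_sigma=True to treat them as absolute standard deviations), and it expects plain float arrays rather than an uncertainties uarray. A minimal sketch of how the call inside fit_gauss might be adapted (my own assumption about the intent, not the original code):
y_val = unp.nominal_values(y)  # central values, in case y arrives as a uarray
y_err = unp.std_devs(y)        # Poisson standard deviations
y_err[y_err == 0] = 1.0        # guard against zero uncertainty in empty bins
gopt, gcov = curve_fit(
    f_gauss, x, y_val,
    p0=[max(y_val), mean, sigma],
    sigma=y_err,               # per-point standard deviations
    absolute_sigma=True,       # treat them as absolute, not relative, weights
)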
I am trying to use Gaussian process regression on a cancer dataset using GPy, but when I fit a combination of 3 or 4 kernels the system collapses and gives LinAlgError: not positive definite, even with jitter. It does produce output when I use a combination of two kernels. Here is the main code; the dataset image (year on the x-axis, tumor count on the y-axis) I am trying to predict is attached below:
k_rbf = GPy.kern.RBF(1, lengthscale=50, name="rbf")
k_exp = GPy.kern.Exponential(1, lengthscale=6)
k_lin = GPy.kern.Linear(1)
k_per = GPy.kern.StdPeriodic(1, period=5)
k = k_rbf * k_per + k_lin + k_exp
m = GPy.models.GPRegression(X, Y, k)
m.optimize()

def plot_gp(X, m, C, training_points=None):
    """Plotting utility to plot a GP fit with 95% confidence interval."""
    # Plot 95% confidence interval
    plt.fill_between(X[:, 0],
                     m[:, 0] - 1.96 * np.sqrt(np.diag(C)),
                     m[:, 0] + 1.96 * np.sqrt(np.diag(C)),
                     alpha=0.5)
    # Plot GP mean and initial training points
    plt.plot(X, m, "-")
    plt.legend(labels=["GP fit"])
    plt.xlabel("x"), plt.ylabel("f")
    # Plot training points if included
    if training_points is not None:
        X_, Y_ = training_points
        plt.plot(X_, Y_, "kx", mew=2)
        plt.legend(labels=["GP fit", "sample points"])

X_ = np.linspace(X.min(), X.max() + 30, 1000)[:, np.newaxis]
mean, Cov = m.predict(X_, full_cov=True)
plt.figure(figsize=(20, 10))
plot_gp(X_, mean, Cov)
plt.gca().set_xlim([1990, 2060]), plt.gca().set_ylim([35000, 150000])
plt.plot(X, Y, "b.");
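As an aside, a common workaround for "not positive definite" errors with stacked kernels (a hedged sketch, not a guaranteed fix for this dataset) is to add a small white-noise kernel and let the model normalize the targets:
k = k_rbf * k_per + k_lin + k_exp + GPy.kern.White(1, variance=1e-3)  # extra noise term for numerical stability
m = GPy.models.GPRegression(X, Y, k, normalizer=True)  # normalizer rescales Y internally
m.optimize(messages=False)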
I have a problem fitting a custom function using scipy.optimize in Python, and I do not know why it is happening. I generate data from a centered and normalized binomial distribution (a Gaussian-like curve) and then fit a curve to it. The expected outcome is in the picture where I plot my function over the fitted data, but when I do the fitting, it fails.
I'm convinced it is a Python thing, because the fit should give the parameter a = 1 (that's how I define it), and it does, but the fit is then bad (see picture). However, if I change sigma to 0.65*sigma in:
p_halfg, p_halfg_cov = optimize.curve_fit(lambda x, a: piecewise_half_gauss(x, a, sigma=0.65*sigma_fit), x, y, p0=[1])
, it gives an almost perfect fit (a is then 5/3, as predicted by the math). Those fits should be the same, and they are not!
I give more comments below. Could you please tell me what is happening and where the problem could be?
Plot with a=1 and sigma = sigma_fit
Plot with sigma = 0.65*sigma_fit
I generate data from a normalized binomial distribution (I can provide my code, but the values are more important now). It is a distribution with N = 10 and p = 0.5; I center it and take only the right side of the curve. Then I fit it with my half-Gauss function, which should be the same distribution as the binomial if its parameter a = 1 (and sigma is equal to the sigma of the distribution, sqrt(np(1-p))). The first problem is that it does not fit the data as shown in the picture, despite getting the correct value of parameter a.
Notice the weird behaviour: if I set sigma = 3*sigma_fit, I get a = 1/3 and a very bad fit (an underestimate). If I set it to 0.2*sigma_fit, I also get a bad fit and a = 1/0.2 = 5 (an overestimate). And so on. Why? (Btw. the function depends on sigma only through the product a*sigma, so a should scale as 1/sigma and the fitting procedure should still work.)
import numpy as np
import matplotlib.pyplot as plt
import math
pi = math.pi
import scipy.optimize as optimize

# define my function
sigma_fit = 1

def piecewise_half_gauss(x, a, sigma=sigma_fit):
    """Half of a normal distribution curve, defined as a Gaussian centered at 0
    with a constant value of the pre-exponential factor for x < 0.

    Arguments: x values as an ndarray whose numbers MUST be float type
                   (use linspace or np.arange(start, end, step, dtype=float)),
               a as a parameter of the width of the distribution,
               sigma being the deviation, second moment.
    Returns: half-Gaussian curve

    Ex:
    >>> piecewise_half_gauss(5., 1)
    array(0.04839414)
    >>> x = np.linspace(0,10,11)
    ... piecewise_half_gauss(x, 2, 3)
    array([0.06649038, 0.06557329, 0.0628972 , 0.05867755, 0.05324133,
           0.04698531, 0.04032845, 0.03366645, 0.02733501, 0.02158627,
           0.01657952])
    >>> piecewise_half_gauss(np.arange(0,11,1, dtype=float), 1, 2.4)
    array([1.66225950e-01, 1.52405153e-01, 1.17463281e-01, 7.61037856e-02,
           4.14488078e-02, 1.89766470e-02, 7.30345854e-03, 2.36286717e-03,
           6.42616248e-04, 1.46914868e-04, 2.82345875e-05])
    """
    return np.piecewise(x, [x >= 0, x < 0],
                        [lambda x: np.exp(-x ** 2 / (2 * ((a * sigma) ** 2))) / (np.sqrt(2 * pi) * sigma * a),
                         lambda x: 1 / (np.sqrt(2 * pi) * sigma)])
# Create normalized data for binomial distribution Bin(N,p)
n = 10
p = 0.5
x = np.array([0., 1., 2., 3., 4., 5.])
y = np.array([0.25231325, 0.20657662, 0.11337165, 0.0417071 , 0.01028484,
0.00170007])
# Get the estimate for sigma parameter
sigma_fit = (n*p*(1-p))**0.5
# Get fitting parameters
p_halfg, p_halfg_cov = optimize.curve_fit(lambda x, a:piecewise_half_gauss(x, a, sigma = sigma_fit), x, y, p0=[1])
print(sigma_fit, p_halfg, p_halfg_cov)
## Plot the result
# unpack fitting parameters
a = np.float64(p_halfg)
# unpack uncertainties in fitting parameters from diagonal of covariance matrix
#da = [np.sqrt(p_halfg_cov[j,j]) for j in range(p_halfg.size)] # if we fit more parameters
da = np.float64(np.sqrt(p_halfg_cov[0]))
# create fitting function from fitted parameters
f_fit = np.linspace(0, 10, 50)
y_fit = piecewise_half_gauss(f_fit, a)
# Create figure window to plot data
fig = plt.figure(1, figsize=(10,10))
plt.scatter(x, y, color = 'r', label = 'Original points')
plt.plot(f_fit, y_fit, label = 'Fit')
plt.xlabel('My x values')
plt.ylabel('My y values')
plt.text(5.8, .25, 'a = {0:0.5f}$\pm${1:0.6f}'.format(a, da))
plt.legend()
However, if I plot it manually, it fits EXACTLY!
plt.scatter(x, y, c = 'r', label = 'Original points')
plt.plot(np.linspace(0,5,50), piecewise_half_gauss(np.linspace(0,5,50), 1, sigma_fit), label = 'Fit')
plt.legend()
EDIT -- solved:
It was a plotting problem; I needed to use
y_fit = piecewise_half_gauss(f_fit, a, sigma=0.6*sigma_fit)
The problem was consistency between fitting and plotting: if I fit with a different sigma, I also need to use that same sigma in the plotting section when I generate y_fit:
# Get fitting parameters
p_halfg, p_halfg_cov = optimize.curve_fit(lambda x, a: piecewise_half_gauss(x, a, sigma=0.6*sigma_fit), x, y, p0=[1])
...
y_fit = piecewise_half_gauss(f_fit, a, sigma = 0.6*sigma_fit)
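One small way to keep the two places consistent (a suggestion of my own, not part of the original edit) is to define the scaled sigma once and reuse it in both calls:
sigma_used = 0.6 * sigma_fit  # single source of truth for the scaled sigma
p_halfg, p_halfg_cov = optimize.curve_fit(
    lambda x, a: piecewise_half_gauss(x, a, sigma=sigma_used), x, y, p0=[1])
y_fit = piecewise_half_gauss(f_fit, a, sigma=sigma_used)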
How can I find and plot a LOWESS curve that looks like the following using Python?
I'm aware of the LOWESS implementation in statsmodels, but it doesn't seem to be able to give me 95% confidence interval lines that I can shade between. Seaborn has a method that calls the statsmodels implementation, but it can't plot the confidence intervals.
Other StackOverflow answers give code to draw a LOESS/LOWESS line, but none with a confidence interval. Can anyone assist with this? Is anyone aware of an existing implementation that would enable me to do this?
Thanks in advance.
I found the link here useful, and I put the code below:
def lowess(x, y, f=1./3.):
    """
    Basic LOWESS smoother with uncertainty.
    Note:
        - Not robust (so no iteration), and
          only normally distributed errors.
        - No higher-order polynomials (d=1),
          so a linear smoother.
    """
    # get some parameters
    xwidth = f * (x.max() - x.min())  # effective width after reduction factor
    N = len(x)                        # number of observations
    # Don't assume the data is sorted
    order = np.argsort(x)
    # storage
    y_sm = np.zeros_like(y)
    y_stderr = np.zeros_like(y)
    # define the weighting function -- clipping too!
    tricube = lambda d: np.clip((1 - np.abs(d) ** 3) ** 3, 0, 1)
    # run the regression for each observation i
    for i in range(N):
        dist = np.abs((x[order][i] - x[order])) / xwidth
        w = tricube(dist)
        # form linear system with the weights
        A = np.stack([w, x[order] * w]).T
        b = w * y[order]
        ATA = A.T.dot(A)
        ATb = A.T.dot(b)
        # solve the system
        sol = np.linalg.solve(ATA, ATb)
        # predict for this observation only
        yest = A[i].dot(sol)  # equivalent of A.dot(sol), just for row i
        place = order[i]
        y_sm[place] = yest
        sigma2 = np.sum((A.dot(sol) - y[order]) ** 2) / N
        # Calculate the standard error
        y_stderr[place] = np.sqrt(sigma2 *
                                  A[i].dot(np.linalg.inv(ATA)
                                           ).dot(A[i]))
    return y_sm, y_stderr
import numpy as np
import matplotlib.pyplot as plt
# make some data
x = 5*np.random.random(100)
y = np.sin(x) * 3*np.exp(-x) + np.random.normal(0, 0.2, 100)
order = np.argsort(x)
#run it
y_sm, y_std = lowess(x, y, f=1./5.)
# plot it
plt.plot(x[order], y_sm[order], color='tomato', label='LOWESS')
plt.fill_between(x[order], y_sm[order] - 1.96*y_std[order],
y_sm[order] + 1.96*y_std[order], alpha=0.3, label='LOWESS uncertainty')
plt.plot(x, y, 'k.', label='Observations')
plt.legend(loc='best')
# plot it again, this time shading a ±1 standard error band instead of the 95% interval
plt.plot(x[order], y_sm[order], color='tomato', label='LOWESS')
plt.fill_between(x[order], y_sm[order] - y_std[order],
y_sm[order] + y_std[order], alpha=0.3, label='LOWESS uncertainty')
plt.plot(x, y, 'k.', label='Observations')
plt.legend(loc='best')