Related
I have such histogram:
and I have this code that finds the maxima (-21.5 in my case):
from scipy.stats import gaussian_kde
def find_range(column):
kde = gaussian_kde(column)
no_samples = len(column)
samples = np.linspace(column.min(), column.max(), no_samples)
probs = kde.evaluate(samples)
maxima_index = probs.argmax()
maxima = samples[maxima_index]
plt.scatter(samples, probs) #, color='b',linewidths=0.05)
plt.show()
return [maxima]
But I need to find the range of the most dominant values of the histogram (in this histogram for example: -30 : -5).
Something like, the value from both sides where it's probability is equal to 20% of the maxima probability.
How can I achieve it?
I had tried the following:
t_right = list(filter(lambda tup:np.logical_and(tup[1] > maxima , probs[tup[0]] <= max(probs)*0.2), enumerate(samples)))
but getting many values, I want only one value that cut the curve
I'm not sure if that is what you are looking for but I've found this article on Towards data Science code form that article is as follow:
Link: https://towardsdatascience.com/take-your-histograms-to-the-next-level-using-matplotlib-5f093ad7b9d3
# Plot
# Plot histogram
avocado.plot(kind = "hist", density = True, alpha = 0.65, bins = 15) # change density to true, because KDE uses density
# Plot KDE
avocado.plot(kind = "kde")
# Quantile lines
quant_5, quant_25, quant_50, quant_75, quant_95 = avocado.quantile(0.05), avocado.quantile(0.25), avocado.quantile(0.5), avocado.quantile(0.75), avocado.quantile(0.95)
quants = [[quant_5, 0.6, 0.16], [quant_25, 0.8, 0.26], [quant_50, 1, 0.36], [quant_75, 0.8, 0.46], [quant_95, 0.6, 0.56]]
for i in quants:
ax.axvline(i[0], alpha = i[1], ymax = i[2], linestyle = ":")
# X
ax.set_xlabel("Average Price ($)")
# Limit x range to 0-4
x_start, x_end = 0, 4
ax.set_xlim(x_start, x_end)
# Y
ax.set_ylim(0, 1)
ax.set_yticklabels([])
ax.set_ylabel("")
# Annotations
ax.text(quant_5-.1, 0.17, "5th", size = 10, alpha = 0.8)
ax.text(quant_25-.13, 0.27, "25th", size = 11, alpha = 0.85)
ax.text(quant_50-.13, 0.37, "50th", size = 12, alpha = 1)
ax.text(quant_75-.13, 0.47, "75th", size = 11, alpha = 0.85)
ax.text(quant_95-.25, 0.57, "95th Percentile", size = 10, alpha =.8)
# Overall
ax.grid(False)
ax.set_title("Avocado Prices in U.S. Markets", size = 17, pad = 10)
# Remove ticks and spines
ax.tick_params(left = False, bottom = False)
for ax, spine in ax.spines.items():
spine.set_visible(False)
plt.show()
The output of above is something like that:
I hope that could be helpful for you! :)
This is my solution, will be glad to get other ideas:
from scipy.stats import gaussian_kde
def find_range(column):
kde = gaussian_kde(column)
no_samples = len(column)
samples = np.linspace(column.min(), column.max(), no_samples)
probs = kde.evaluate(samples)
maxima_index = probs.argmax()
maxima = samples[maxima_index]
t_right_list = list(filter(lambda tup:np.logical_and(tup[1] > maxima , math.isclose(probs[tup[0]], max(probs)*0.2, abs_tol=0.00001) ), enumerate(samples)))
t_right = np.median(list(zip(*t_right_list))[1])
t_left_list = list(filter(lambda tup:np.logical_and(tup[1] < maxima , math.isclose(probs[tup[0]], max(probs)*0.2, abs_tol=0.00001) ), enumerate(samples)))
t_left = np.median(list(zip(*t_left_list))[1])
plt.scatter(samples, probs) #, color='b',linewidths=0.05)
plt.show()
return [t_left, maxima, t_right]
In case more than one value will be retrieved in t_right/t_left (because of abs_tol param value), then median can be used (in order to get only one value)
I have the following data set where I have to estimate the joint density of 'bwt' and 'age' using kernel density estimation with a 2-dimensional Gaussian kernel and width h=5. I can't use modules such as scipy where there are ready functions to do this and I have to built functions to calculate the density. Here's what I've gotten so far.
import numpy as np
import pandas as pd
babies_full = pd.read_csv("https://www2.helsinki.fi/sites/default/files/atoms/files/babies2.txt", sep='\t')
#Getting the columns I need
babies_full1=babies_full[['gestation', 'age']]
x=np.array(babies_full1,'int')
#2d Gaussian kernel
def k_2dgauss(x):
return np.exp(-np.sum(x**2, 1)/2) / np.sqrt(2*np.pi)
#Multivariate kernel density
def mv_kernel_density(t, x, h):
d = x.shape[1]
return np.mean(k_2dgauss((t - x)/h))/h**d
t = np.linspace(1.0, 5.0, 50)
h=5
print(mv_kernel_density(t, x, h))
However, I get a value error 'ValueError: operands could not be broadcast together with shapes (50,) (1173,2)' which think is because different shape of the matrices. I also don't understand why k_2dgauss(x) for me returns an array of zeros since it should only return one value. In general, I am new to the concept of kernel density estimation I don't really know if I've written the functions right so any hints would help!
Following on from my comments on your original post, I think this is what you want to do, but if not then come back to me and we can try again.
# info supplied by OP
import numpy as np
import pandas as pdbabies_full = \
pd.read_csv("https://www2.helsinki.fi/sites/default/files/atoms/files/babies2.txt", sep='\t')
#Getting the columns I need
babies_full1=babies_full[['gestation', 'age']]
x=np.array(babies_full1,'int')
# my contributions
from math import floor, ceil
def binMaker(arr, base):
"""function I already use for this sort of thing.
arr is the arr I want to make bins for
base is the bin separation, but does require you to import floor and ceil
otherwise you can make these bins manually yourself"""
binMin = floor(arr.min() / base) * base
binMax = ceil(arr.max() / base) * base
return np.arange(binMin, binMax + base, base)
bins1 = binMaker(x[:,0], 20.) # bins from 140. to 360. spaced 20 apart
bins2 = binMaker(x[:,1], 5.) # bins from 15. to 45. spaced 5. apart
counts = np.zeros((len(bins1)-1, len(bins2)-1)) # empty array for counts to go in
for i in range(0, len(bins1)-1): # loop over the intervals, hence the -1
boo = (x[:,0] >= bins1[i]) * (x[:,0] < bins1[i+1])
for j in range(0, len(bins2)-1): # loop over the intervals, hence the -1
counts[i,j] = np.count_nonzero((x[boo,1] >= bins2[j]) *
(x[boo,1] < bins2[j+1]))
# if you want your PDF to be a fraction of the total
# rather than the number of counts, do the next line
counts /= x.shape[0]
# plotting
import matplotlib.pyplot as plt
from matplotlib.colors import BoundaryNorm
# setting the levels so that each number in counts has its own colour
levels = np.linspace(-0.5, counts.max()+0.5, int(counts.max())+2)
cmap = plt.get_cmap('viridis') # or any colormap you like
norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)
fig, ax = plt.subplots(1, 1, figsize=(6,5), dpi=150)
pcm = ax.pcolormesh(bins2, bins1, counts, ec='k', lw=1)
fig.colorbar(pcm, ax=ax, label='Counts (%)')
ax.set_xlabel('Age')
ax.set_ylabel('Gestation')
ax.set_xticks(bins2)
ax.set_yticks(bins1)
plt.title('Manually making a 2D (joint) PDF')
If this is what you wanted, then there is an easier way with np.histgoram2d, although I think you specified it had to be using your own methods, and not built in functions. I've included it anyway for completeness' sake.
pdf = np.histogram2d(x[:,0], x[:,1], bins=(bins1,bins2))[0]
pdf /= x.shape[0] # again for normalising and making a percentage
levels = np.linspace(-0.5, pdf.max()+0.5, int(pdf.max())+2)
cmap = plt.get_cmap('viridis') # or any colormap you like
norm = BoundaryNorm(levels, ncolors=cmap.N, clip=True)
fig, ax = plt.subplots(1, 1, figsize=(6,5), dpi=150)
pcm = ax.pcolormesh(bins2, bins1, pdf, ec='k', lw=1)
fig.colorbar(pcm, ax=ax, label='Counts (%)')
ax.set_xlabel('Age')
ax.set_ylabel('Gestation')
ax.set_xticks(bins2)
ax.set_yticks(bins1)
plt.title('using np.histogram2d to make a 2D (joint) PDF')
Final note - in this example, the only place where counts doesn't equal pdf is for the bin between 40 <= age < 45 and 280 <= gestation 300, which I think is due to how, in my manual case, I've used <= and <, and I'm a little unsure how np.histogram2d handles values outside the bin ranges, or on the bin edges etc. We can see the element of x that is responsible
>>> print(x[1011])
[280 45]
I've found quite a few examples of fitting a linear regression with zero intercept.
However, I would like to fit a linear regression with a fixed x-intercept. In other words, the regression will start at a specific x.
I have the following code for plotting.
import numpy as np
import matplotlib.pyplot as plt
xs = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0,
20.0, 40.0, 60.0, 80.0])
ys = np.array([0.50505332505407008, 1.1207373784533172, 2.1981844719020001,
3.1746209003398689, 4.2905482471260044, 6.2816226678076958,
11.073788414382639, 23.248479770546009, 32.120462301367183,
44.036117671229206, 54.009003143831116, 102.7077685684846,
185.72880217806673, 256.12183145545811, 301.97120103079675])
def best_fit_slope_and_intercept(xs, ys):
# m = xs.dot(ys)/xs.dot(xs)
m = (((np.average(xs)*np.average(ys)) - np.average(xs*ys)) /
((np.average(xs)*np.average(xs)) - np.average(xs*xs)))
b = np.average(ys) - m*np.average(xs)
return m, b
def rSquaredValue(ys_orig, ys_line):
def sqrdError(ys_orig, ys_line):
return np.sum((ys_line - ys_orig) * (ys_line - ys_orig))
yMeanLine = np.average(ys_orig)
sqrtErrorRegr = sqrdError(ys_orig, ys_line)
sqrtErrorYMean = sqrdError(ys_orig, yMeanLine)
return 1 - (sqrtErrorRegr/sqrtErrorYMean)
m, b = best_fit_slope_and_intercept(xs, ys)
regression_line = m*xs+b
r_squared = rSquaredValue(ys, regression_line)
print(r_squared)
plt.plot(xs, ys, 'bo')
# Normal best fit
plt.plot(xs, m*xs+b, 'r-')
# Zero intercept
plt.plot(xs, m*xs, 'g-')
plt.show()
And I want something like the follwing where the regression line starts at (5, 0).
Thank You. Any and all help is appreciated.
I been thinking for some time and I've found a possible workaround to the problem.
If I understood well, you want to find slope and intercept of the linear regression model with a fixed x-axis intercept.
Providing that's the case (imagine you want the x-axis intercept to take the value forced_intercept), it's as if you "moved" all the points -forced_intercept times in the x-axis, and then you forced scikit-learn to use y-axis intercept equal 0. You would then have the slope. To find the intercept just isolate b from y=ax+b and force the point (forced_intercept,0). When you do that, you get to b=-a*forced_intercept (where a is the slope). In code (notice xs reshaping):
import numpy as np
import matplotlib.pyplot as plt
from sklearn.linear_model import LinearRegression
xs = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0,
20.0, 40.0, 60.0, 80.0]).reshape((-1,1)) #notice you must reshape your array or you will get a ValueError error from NumPy.
ys = np.array([0.50505332505407008, 1.1207373784533172, 2.1981844719020001,
3.1746209003398689, 4.2905482471260044, 6.2816226678076958,
11.073788414382639, 23.248479770546009, 32.120462301367183,
44.036117671229206, 54.009003143831116, 102.7077685684846,
185.72880217806673, 256.12183145545811, 301.97120103079675])
forced_intercept = 5 #as you provided in your example of (5,0)
new_xs = xs - forced_intercept #here we "move" all the points
model = LinearRegression(fit_intercept=False).fit(new_xs, ys) #force an intercept of 0
r = model.score(new_xs,ys)
a = model.coef_
b = -1 * a * forced_intercept #here we find the slope so that the line contains (forced intercept,0)
print(r,a,b)
plt.plot(xs,ys,'o')
plt.plot(xs,a*xs+b)
plt.show()
Hope this is what you were looking for.
May be this approach will be useful.
import numpy as np
import matplotlib.pyplot as plt
xs = np.array([0.1, 0.2, 0.4, 0.6, 0.8, 1.0, 2.0, 4.0, 6.0, 8.0, 10.0,
20.0, 40.0, 60.0, 80.0])
ys = np.array([0.50505332505407008, 1.1207373784533172, 2.1981844719020001,
3.1746209003398689, 4.2905482471260044, 6.2816226678076958,
11.073788414382639, 23.248479770546009, 32.120462301367183,
44.036117671229206, 54.009003143831116, 102.7077685684846,
185.72880217806673, 256.12183145545811, 301.97120103079675])
# At first we add this anchor point to the points set.
xs = np.append(xs, [5.])
ys = np.append(ys, [0.])
# Then we prepare the coefficient matrix according docs
# https://docs.scipy.org/doc/numpy/reference/generated/numpy.linalg.lstsq.html
A = np.vstack([xs, np.ones(len(xs))]).T
# Then we prepare weights for these points. And we put all weights
# equal except the last one (for added anchor point).
# In this example it's weight 1000 times larger in comparison with others.
W = np.diag(np.ones([len(xs)]))
W[-1,-1] = 1000.
# And we find least-squares solution.
m, c = np.linalg.lstsq(np.dot(W, A), np.dot(W, ys), rcond=None)[0]
plt.plot(xs, ys, 'o', label='Original data', markersize=10)
plt.plot(xs, m * xs + c, 'r', label='Fitted line')
plt.show()
If you used scikit-learn for linear regression task, it's possible to define intercept(s) using intercept_ attribute.
I have the histogram of my input data (in black) given in the following graph:
I'm trying to fit the Gamma distribution but not on the whole data but just to the first curve of the histogram (the first mode). The green plot in the previous graph corresponds to when I fitted the Gamma distribution on all the samples using the following python code which makes use of scipy.stats.gamma:
img = IO.read(input_file)
data = img.flatten() + abs(np.min(img)) + 1
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins, patches = plt.hist(data, 1000, normed=True)
# slice histogram here
# estimation of the parameters of the gamma distribution
fit_alpha, fit_loc, fit_beta = gamma.fit(data, floc=0)
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, fit_loc, fit_beta)
print '(alpha, beta): (%f, %f)' % (fit_alpha, fit_beta)
# plot estimated model
plt.plot(x, y, linewidth=2, color='g')
plt.show()
How can I restrict the fitting only to the interesting subset of this data?
Update1 (slicing):
I sliced the input data by keeping only values below the max of the previous histogram, but the results were not really convincing:
This was achieved by inserting the following code below the # slice histogram here comment in the previous code:
max_data = bins[np.argmax(n)]
data = data[data < max_data]
Update2 (scipy.optimize.minimize):
The code below shows how scipy.optimize.minimize() is used to minimize an energy function to find (alpha, beta):
import matplotlib.pyplot as plt
import numpy as np
from geotiff.io import IO
from scipy.stats import gamma
from scipy.optimize import minimize
def truncated_gamma(x, max_data, alpha, beta):
gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
return np.where(x < max_data, gammapdf / norm, 0)
# read image
img = IO.read(input_file)
# calculate dB positive image
img_db = 10 * np.log10(img)
img_db_pos = img_db + abs(np.min(img_db))
data = img_db_pos.flatten() + 1
# data histogram
n, bins = np.histogram(data, 100, normed=True)
# using minimize on a slice data below max of histogram
max_data = bins[np.argmax(n)]
data = data[data < max_data]
data = np.random.choice(data, 1000)
energy = lambda p: -np.sum(np.log(truncated_gamma(data, max_data, *p)))
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
# plot data histogram and model
x = np.linspace(0, 100)
y = gamma.pdf(x, fit_alpha, 0, fit_beta)
plt.hist(data, 30, normed=True)
plt.plot(x, y, linewidth=2, color='g')
plt.show()
The algorithm above converged for a subset of data, and the output in o was:
x: array([ 16.66912781, 6.88105559])
But as can be seen on the screenshot below, the gamma plot doesn't fit the histogram:
You can use a general optimization tool such as scipy.optimize.minimize to fit a truncated version of the desired function, resulting in a nice fit:
First, the modified function:
def truncated_gamma(x, alpha, beta):
gammapdf = gamma.pdf(x, alpha, loc=0, scale=beta)
norm = gamma.cdf(max_data, alpha, loc=0, scale=beta)
return np.where(x<max_data, gammapdf/norm, 0)
This selects values from the gamma distribution where x < max_data, and zero elsewhere. The np.where part is not actually important here, because the data is exclusively to the left of max_data anyway. The key is normalization, because varying alpha and beta will change the area to the left of the truncation point in the original gamma.
The rest is just optimization technicalities.
It's common practise to work with logarithms, so I used what's sometimes called "energy", or the logarithm of the inverse of the probability density.
energy = lambda p: -np.sum(np.log(truncated_gamma(data, *p)))
Minimize:
initial_guess = [np.mean(data), 2.]
o = minimize(energy, initial_guess, method='SLSQP')
fit_alpha, fit_beta = o.x
My output is (alpha, beta): (11.595208, 824.712481). Like the original, it is a maximum likelihood estimate.
If you're not happy with the convergence rate, you may want to
Select a sample from your rather big dataset:
data = np.random.choice(data, 10000)
Try different algorithms using the method keyword argument.
Some optimization routines output a representation of the inverse hessian, which is useful for uncertainty estimation. Enforcement of nonnegativity for the parameters may also be a good idea.
A log-scaled plot without truncation shows the entire distribution:
Here's another possible approach using a manually created dataset in excel that more or less matched the plot given.
Raw Data
Outline
Imported data into a Pandas dataframe.
Mask the indices after the
max response index.
Create a mirror image of the remaining data.
Append the mirror image while leaving a buffer of empty space.
Fit the desired distribution to the modified data. Below I do a normal fit by the method of moments and adjust the amplitude and width.
Working Script
# Import data to dataframe.
df = pd.read_csv('sample.csv', header=0, index_col=0)
# Mask indices after index at max Y.
mask = df.index.values <= df.Y.argmax()
df = df.loc[mask, :]
scaled_y = 100*df.Y.values
# Create new df with mirror image of Y appended.
sep = 6
app_zeroes = np.append(scaled_y, np.zeros(sep, dtype=np.float))
mir_y = np.flipud(scaled_y)
new_y = np.append(app_zeroes, mir_y)
# Using Scipy-cookbook to fit a normal by method of moments.
idxs = np.arange(new_y.size) # idxs=[0, 1, 2,...,len(data)]
mid_idxs = idxs.mean() # len(data)/2
# idxs-mid_idxs is [-53.5, -52.5, ..., 52.5, len(data)/2]
scaling_param = np.sqrt(np.abs(np.sum((idxs-mid_idxs)**2*new_y)/np.sum(new_y)))
# adjust amplitude
fmax = new_y.max()*1.2 # adjusted function max to 120% max y.
# adjust width
scaling_param = scaling_param*.7 # adjusted by 70%.
# Fit normal.
fit = lambda t: fmax*np.exp(-(t-mid_idxs)**2/(2*scaling_param**2))
# Plot results.
plt.plot(new_y, '.')
plt.plot(fit(idxs), '--')
plt.show()
Result
See the scipy-cookbook fitting data page for more on fitting a normal using method of moments.
I am trying to fit some data with a Gaussian (and more complex) function(s). I have created a small example below.
My first question is, am I doing it right?
My second question is, how do I add an error in the x-direction, i.e. in the x-position of the observations/data?
It is very hard to find nice guides on how to do this kind of regression in pyMC. Perhaps because its easier to use some least squares, or similar approach, I however have many parameters in the end and need to see how well we can constrain them and compare different models, pyMC seemed like the good choice for that.
import pymc
import numpy as np
import matplotlib.pyplot as plt; plt.ion()
x = np.arange(5,400,10)*1e3
# Parameters for gaussian
amp_true = 0.2
size_true = 1.8
ps_true = 0.1
# Gaussian function
gauss = lambda x,amp,size,ps: amp*np.exp(-1*(np.pi**2/(3600.*180.)*size*x)**2/(4.*np.log(2.)))+ps
f_true = gauss(x=x,amp=amp_true, size=size_true, ps=ps_true )
# add noise to the data points
noise = np.random.normal(size=len(x)) * .02
f = f_true + noise
f_error = np.ones_like(f_true)*0.05*f.max()
# define the model/function to be fitted.
def model(x, f):
amp = pymc.Uniform('amp', 0.05, 0.4, value= 0.15)
size = pymc.Uniform('size', 0.5, 2.5, value= 1.0)
ps = pymc.Normal('ps', 0.13, 40, value=0.15)
#pymc.deterministic(plot=False)
def gauss(x=x, amp=amp, size=size, ps=ps):
e = -1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.))
return amp*np.exp(e)+ps
y = pymc.Normal('y', mu=gauss, tau=1.0/f_error**2, value=f, observed=True)
return locals()
MDL = pymc.MCMC(model(x,f))
MDL.sample(1e4)
# extract and plot results
y_min = MDL.stats()['gauss']['quantiles'][2.5]
y_max = MDL.stats()['gauss']['quantiles'][97.5]
y_fit = MDL.stats()['gauss']['mean']
plt.plot(x,f_true,'b', marker='None', ls='-', lw=1, label='True')
plt.errorbar(x,f,yerr=f_error, color='r', marker='.', ls='None', label='Observed')
plt.plot(x,y_fit,'k', marker='+', ls='None', ms=5, mew=2, label='Fit')
plt.fill_between(x, y_min, y_max, color='0.5', alpha=0.5)
plt.legend()
I realize that I might have to run more iterations, use burn in and thinning in the end. The figure plotting the data and the fit is seen here below.
The pymc.Matplot.plot(MDL) figures looks like this, showing nicely peaked distributions. This is good, right?
My first question is, am I doing it right?
Yes! You need to include a burn-in period, which you know. I like to throw out the first half of my samples. You don't need to do any thinning, but sometimes it will make your post-MCMC work faster to process and smaller to store.
The only other thing I advise is to set a random seed, so that your results are "reproducible": np.random.seed(12345) will do the trick.
Oh, and if I was really giving too much advice, I'd say import seaborn to make the matplotlib results a little more beautiful.
My second question is, how do I add an error in the x-direction, i.e. in the x-position of the observations/data?
One way is to include a latent variable for each error. This works in your example, but will not be feasible if you have many more observations. I'll give a little example to get you started down this road:
# add noise to observed x values
x_obs = pm.rnormal(mu=x, tau=(1e4)**-2)
# define the model/function to be fitted.
def model(x_obs, f):
amp = pm.Uniform('amp', 0.05, 0.4, value= 0.15)
size = pm.Uniform('size', 0.5, 2.5, value= 1.0)
ps = pm.Normal('ps', 0.13, 40, value=0.15)
x_pred = pm.Normal('x', mu=x_obs, tau=(1e4)**-2) # this allows error in x_obs
#pm.deterministic(plot=False)
def gauss(x=x_pred, amp=amp, size=size, ps=ps):
e = -1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.))
return amp*np.exp(e)+ps
y = pm.Normal('y', mu=gauss, tau=1.0/f_error**2, value=f, observed=True)
return locals()
MDL = pm.MCMC(model(x_obs, f))
MDL.use_step_method(pm.AdaptiveMetropolis, MDL.x_pred) # use AdaptiveMetropolis to "learn" how to step
MDL.sample(200000, 100000, 10) # run chain longer since there are more dimensions
It looks like it may be hard to get good answers if you have noise in x and y:
Here is a notebook collecting this all up.
EDIT: Important note
This has been bothering me for a while now. The answers given by myself and Abraham here are correct in the sense that they add variability to x. HOWEVER: Note that you cannot simply add uncertainty in this way to cancel out the errors you have in your x-values, so that you regress against "true x". The methods in this answer can show you how adding errors to x affects your regression if you have the true x. If you have a mismeasured x, these answers will not help you. Having errors in the x-values is a very tricky problem to solve, as it leads to "attenuation" and an "errors-in-variables effect". The short version is: having unbiased, random errors in x leads to bias in your regression estimates. If you have this problem, check out Carroll, R.J., Ruppert, D., Crainiceanu, C.M. and Stefanski, L.A., 2006. Measurement error in nonlinear models: a modern perspective. Chapman and Hall/CRC., or for a Bayesian approach, Gustafson, P., 2003. Measurement error and misclassification in statistics and epidemiology: impacts and Bayesian adjustments. CRC Press. I ended up solving my specific problem using Carroll et al.'s SIMEX method along with PyMC3. The details are in Carstens, H., Xia, X. and Yadavalli, S., 2017. Low-cost energy meter calibration method for measurement and verification. Applied energy, 188, pp.563-575. It is also available on ArXiv
I converted Abraham Flaxman's answer above into PyMC3, in case someone needs it. Some very minor changes, but can be confusing nevertheless.
The first is that the deterministic decorator #Deterministic is replaced by a distribution-like call function var=pymc3.Deterministic(). Second, when generating a vector of normally distributed random variables,
rvs = pymc2.rnormal(mu=mu, tau=tau)
is replaced by
rvs = pymc3.Normal('var_name', mu=mu, tau=tau,shape=size(var)).random()
The complete code is as follows:
import numpy as np
from pymc3 import *
import matplotlib.pyplot as plt
# set random seed for reproducibility
np.random.seed(12345)
x = np.arange(5,400,10)*1e3
# Parameters for gaussian
amp_true = 0.2
size_true = 1.8
ps_true = 0.1
#Gaussian function
gauss = lambda x,amp,size,ps: amp*np.exp(-1*(np.pi**2/(3600.*180.)*size*x)**2/(4.*np.log(2.)))+ps
f_true = gauss(x=x,amp=amp_true, size=size_true, ps=ps_true )
# add noise to the data points
noise = np.random.normal(size=len(x)) * .02
f = f_true + noise
f_error = np.ones_like(f_true)*0.05*f.max()
with Model() as model3:
amp = Uniform('amp', 0.05, 0.4, testval= 0.15)
size = Uniform('size', 0.5, 2.5, testval= 1.0)
ps = Normal('ps', 0.13, 40, testval=0.15)
gauss=Deterministic('gauss',amp*np.exp(-1*(np.pi**2*size*x/(3600.*180.))**2/(4.*np.log(2.)))+ps)
y =Normal('y', mu=gauss, tau=1.0/f_error**2, observed=f)
start=find_MAP()
step=NUTS()
trace=sample(2000,start=start)
# extract and plot results
y_min = np.percentile(trace.gauss,2.5,axis=0)
y_max = np.percentile(trace.gauss,97.5,axis=0)
y_fit = np.percentile(trace.gauss,50,axis=0)
plt.plot(x,f_true,'b', marker='None', ls='-', lw=1, label='True')
plt.errorbar(x,f,yerr=f_error, color='r', marker='.', ls='None', label='Observed')
plt.plot(x,y_fit,'k', marker='+', ls='None', ms=5, mew=1, label='Fit')
plt.fill_between(x, y_min, y_max, color='0.5', alpha=0.5)
plt.legend()
Which results in
y_error
For errors in x (note the 'x' suffix to variables):
# define the model/function to be fitted in PyMC3:
with Model() as modelx:
x_obsx = pm3.Normal('x_obsx',mu=x, tau=(1e4)**-2, shape=40)
ampx = Uniform('ampx', 0.05, 0.4, testval=0.15)
sizex = Uniform('sizex', 0.5, 2.5, testval=1.0)
psx = Normal('psx', 0.13, 40, testval=0.15)
x_pred = Normal('x_pred', mu=x_obsx, tau=(1e4)**-2*np.ones_like(x_obsx),testval=5*np.ones_like(x_obsx),shape=40) # this allows error in x_obs
gauss=Deterministic('gauss',ampx*np.exp(-1*(np.pi**2*sizex*x_pred/(3600.*180.))**2/(4.*np.log(2.)))+psx)
y = Normal('y', mu=gauss, tau=1.0/f_error**2, observed=f)
start=find_MAP()
step=NUTS()
tracex=sample(20000,start=start)
Which results in:
x_error_graph
the last observation is that when doing
traceplot(tracex[100:])
plt.tight_layout();
(result not shown), we can see that sizex seems to be suffering from 'attenuation' or 'regression dilution' due to the error in the measurement of x.