Getting the statistics of deterministic variables in PyMC - python

Say I have a random collection of (X,Y) points:
import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import scipy
# Build 50 noisy points: x = 0..49 and y = x plus uniform noise in [0, 40).
x = np.arange(50)
# Draw exactly one noise value per x. The original drew 200 values and relied
# on zip() truncating to len(x); it also left y as a lazy map object on
# Python 3, which cannot be plotted or used as observed data.
y = x + np.random.uniform(low=0.0, high=40.0, size=x.size)
plt.scatter(x,y)
and that I fit simple linear regression:
std = 20.
tau=1/(std**2)
alpha = pm.Normal('alpha', mu=0, tau=tau)
beta = pm.Normal('beta', mu=0, tau=tau)
sigma = pm.Uniform('sigma', lower=0, upper=20)
y_est = alpha + beta * x
likelihood = pm.Normal('y', mu=y_est, tau=1/(sigma**2), observed=True, value=y)
model = pm.Model([likelihood, alpha, beta, sigma, y_est])
mcmc = pm.MCMC(model)
mcmc.sample(40000, 15000)
How can I get the distribution or the statistics of y_est[0], y_est[1], y_est[2], ...? (Note that these variables correspond to the y values estimated for each input x value.)

In PyMC 2, if you are interested in the trace of a deterministic, you should wrap the deterministic in a Lambda object (or decorate a function with @deterministic). In your case, this would be:
# Lambda lives in the pymc namespace, which this example imports as `pm`.
y_est = pm.Lambda('y_est', lambda a=alpha, b=beta: a + b * x)
You should then be able to call the summary method or plot the node, just like a Stochastic.
BTW, you do not need to instantiate a Model object, as MCMC already does that for you. All you need is:
mcmc = pm.MCMC([likelihood, alpha, beta, sigma, y_est])
or even more concisely:
mcmc = pm.MCMC(vars())

Following @Chris' advice, the following works:
x = pm.Uniform('x', lower=xmin, upper=xmax)
alpha = pm.Normal('alpha', mu=0, tau=tau)
beta = pm.Normal('beta', mu=0, tau=tau)
sigma = pm.Uniform('sigma', lower=0, upper=20)
# The deterministic:
y_gen = pm.Lambda('y_gen', lambda a=alpha, x=x, b=beta: a + b * x)
And then draw samples from it as follows:
mcmc = pm.MCMC([x, y_gen])
mcmc.sample(n_total_samples, n_burn_in)
x_trace = mcmc.trace('x')[:]
y_trace = mcmc.trace('y_gen')[:]

Related

NumPyro: sampling active sites as Bernoulli RVs

I want to modify the following NumPyro model:
import jax.numpy as jnp
from jax import random, vmap
import numpy as np
import numpyro
import numpyro.distributions as dist
from numpyro.infer import MCMC, NUTS
numpyro.set_host_device_count(6)
def model(y=None, X=None):
    """NumPyro regression model with one Gamma-distributed weight per predictor.

    Args:
        y: Observed responses, passed as ``obs`` to the likelihood site 'y'.
        X: Design matrix; ``X.shape[1]`` is taken as the number of predictors.
    """
    n_predictors = X.shape[1]
    # One 'theta' sample per predictor, drawn inside the 'state' plate.
    with numpyro.plate('state', n_predictors):
        theta = numpyro.sample('theta', dist.Gamma(concentration=1, rate=1/5000))
    # Linear predictor, then a unit-scale Normal likelihood on y.
    mu = jnp.dot(X, theta)
    numpyro.sample('y', dist.Normal(loc=mu, scale=1), obs=y)
theta = np.zeros(5)  # True parameters
theta[0] = 2
theta[1] = 3
X = np.random.randn(20, theta.size)**2  # Design matrix
# Responses are the linear predictor plus unit Gaussian noise. The matrix
# product operator had been mangled into a comment marker, which made y equal
# to the whole design matrix X instead of a length-20 vector.
y = X @ theta + np.random.randn(X.shape[0])  # data
rng_key = random.PRNGKey(74674)
rng_key, rng_key_ = random.split(rng_key)
mcmc = MCMC(NUTS(model), num_warmup=500, num_samples=1000, num_chains=6)
mcmc.run(rng_key_, X=X, y=y)
mcmc.print_summary()
I want to include Bernoulli RVs z that choose which theta is active. Then I would like to make inference for these z. Basically, I am trying to do variable selection. The idea is illustrated in the following model (which fails):
def failed_model(y=None, X=None):
    """Variable-selection attempt: each weight is masked by a Bernoulli indicator.

    Args:
        y: Observed responses, passed as ``obs`` to the likelihood site 'y'.
        X: Design matrix; ``X.shape[1]`` is taken as the number of predictors.
    """
    n_predictors = X.shape[1]
    with numpyro.plate('state', n_predictors):
        theta = numpyro.sample('theta', dist.Gamma(concentration=1, rate=1/5000))
        # One on/off indicator per predictor (closing parenthesis restored).
        z = numpyro.sample('z', dist.Bernoulli(0.1))
    # Only predictors whose indicator is 1 contribute to the mean.
    mu = jnp.dot(X, theta * z)
    numpyro.sample('y', dist.Normal(loc=mu, scale=1), obs=y)
I tried to understand the second example from the [docs][1], but it does not show masking for a random array, but rather for a fixed array.
[1]: https://num.pyro.ai/en/stable/distributions.html

Scipy minimize / Scipy Curve fit / lmfit

$$\log(VA) = \gamma - \frac{1}{\eta}\,\log\!\left[\alpha L^{-\eta} + \beta K^{-\eta}\right]$$
I'm trying to estimate the above function with nonlinear least squares. I am using 3 different packages (Scipy-minimize, Scipy-curve_fit and lmfit - Model) for this but I find different parameter results in each one. I can't understand why. I would be very grateful if anyone can help with a solution or offer a different solution method.
SCIPY-MINIMIZE
import numpy as np
from scipy.optimize import minimize, curve_fit
from lmfit import Model, Parameters
# Data series used by the objective below.
L = np.array([0.299, 0.295, 0.290, 0.284, 0.279, 0.273, 0.268, 0.262, 0.256, 0.250])
K = np.array([2.954, 3.056, 3.119, 3.163, 3.215, 3.274, 3.351, 3.410, 3.446, 3.416])
VA = np.array([0.919, 0.727, 0.928, 0.629, 0.656, 0.854, 0.955, 0.981, 0.908, 0.794])


def f(param):
    """Return the sum of squared residuals of the log(VA) model.

    Args:
        param: Sequence ``(gamma, alpha, beta, eta)``.

    Returns:
        Sum over all observations of
        ``(log(VA) - [gamma - (1/eta) * log(alpha*L**-eta + beta*K**-eta)])**2``,
        evaluated on the module-level arrays L, K and VA.
    """
    gamma = param[0]
    alpha = param[1]
    beta = param[2]
    eta = param[3]
    VA_est = gamma - (1/eta)*np.log(alpha*L**-eta + beta*K**-eta)
    return np.sum((np.log(VA) - VA_est)**2)


# Box constraints in the order (gamma, alpha, beta, eta).
bnds = [(1, np.inf), (0,1),(0,1),(-1, np.inf)]
x0 = (1,0.01,0.98, 1)
# The original also built `con = {"type":"eq", "fun":c}`, but `c` was never
# defined (NameError) and `con` was never passed to minimize(), so it is dropped.
result = minimize(f, x0, bounds = bnds)
print(result.fun)
print(result.message)
print(result.x[0],result.x[1],result.x[2],result.x[3])
SCIPY-MINIMIZE - OUT
0.30666062040617503
CONVERGENCE: NORM_OF_PROJECTED_GRADIENT_<=_PGTOL
1.0 0.5587147011643757 0.9371430857380681 5.873041615873815
SCIPY-CURVE_FIT
def f(X, gamma, alpha, beta, eta):
L,K = X
return gamma - (1/eta) * np.log(alpha*L**-eta + beta*K**-eta)
p0 = 1,0.01,0.98, 1
res, cov = curve_fit(f, (L, K), np.log(VA), p0, bounds = ((1,0,0,-1),(np.inf,1,1,np.inf)) )
gamma, alpha, beta, eta = res[0],res[1],res[2],res[3]
gamma, alpha, beta, eta
SCIPY-CURVE_FIT - OUT
(1.000000000062141,
0.26366547263939205,
0.9804436474926481,
13.449747863921704)
LMFIT-MODEL
def f(x, gamma, alpha, beta, eta):
L = x[0]
K = x[1]
return gamma - (1/eta)*np.log(alpha*L**-eta + beta*K**-eta)
fmodel = Model(f)
params = Parameters()
params.add('gamma', value = 1, vary=True, min = 1)
params.add('alpha', value = 0.01, vary=True, max = 1, min = 0)
params.add('beta', value = 0.98, vary=True, max = 1, min = 0)
params.add('eta', value = 1, vary=True, min = -1)
result = fmodel.fit(np.log(VA), params, x=(L,K))
print(result.fit_report())
LMFIT-MODEL - OUT
[[Model]]
Model(f)
[[Fit Statistics]]
# fitting method = leastsq
# function evals = 103
# data points = 10
# variables = 4
chi-square = 0.31749840
reduced chi-square = 0.05291640
Akaike info crit = -26.4986758
Bayesian info crit = -25.2883354
## Warning: uncertainties could not be estimated:
gamma: at initial value
gamma: at boundary
alpha: at boundary
[[Variables]]
gamma: 1.00000000 (init = 1)
alpha: 1.3245e-13 (init = 0.01)
beta: 0.20130064 (init = 0.98)
eta: 447.960413 (init = 1)
A fitting algorithm always seeks a local minimizer of the underlying least-squares problem. Note that your problem is convex but not strictly convex. Consequently, there's no unique global minimizer. But each local minimizer is a global one. By evaluating the first function f for each found solution, we can observe that they all have the same objective function value. Hence, each solution is a global minimizer.
Why does each method find a different minimizer? The reason is simple. Each one uses a different algorithm to solve the underlying nonlinear optimization problem, e.g. scipy.optimize.minimize uses the 'L-BFGS-B' algorithm while scipy.optimize.curve_fit uses scipy.optimize.least_squares with the Trust Region Reflective algorithm ('TRF'). In short, you can only expect the same solution for different algorithms for a strictly convex problem.

Incremental Bayesian updates with multi-dimensional parameters

I am trying to use PyMC3 for a Bayesian model that I would like to retrain repeatedly on new, unseen data. I think I need to replace the priors with the posterior of the previously trained model every time I see new data, similar to what is done here: https://docs.pymc.io/notebooks/updating_priors.html. That notebook uses the following function, which computes a KDE from the samples, and replaces each of the original parameter definitions in the model with a call to from_posterior.
def from_posterior(param, samples):
    """Build an Interpolated prior named `param` from posterior `samples`.

    A Gaussian KDE of `samples` is evaluated on 100 evenly spaced points
    between the sample minimum and maximum, padded on both sides, and the
    resulting (x, y) grid is handed to Interpolated.

    NOTE(review): stats.gaussian_kde treats a 2-D input as (n_dims, n_points),
    so this presumably expects `samples` to be the 1-D trace of a scalar
    parameter -- confirm before passing multi-dimensional traces.
    """
    smin, smax = np.min(samples), np.max(samples)
    width = smax - smin
    x = np.linspace(smin, smax, 100)
    y = stats.gaussian_kde(samples)(x)
    # what was never sampled should have a small probability but not 0,
    # so we'll extend the domain and use linear approximation of density on it
    x = np.concatenate([[x[0] - 3 * width], x, [x[-1] + 3 * width]])
    y = np.concatenate([[0], y, [0]])
    return Interpolated(param, x, y)
And here is my original model.
def create_model(batsmen, bowlers, id1, id2, X):
testval = [[-5,0,1,2,3.5,5] for i in range(0, 9)]
l = [i for i in range(9)]
model = pm.Model()
with model:
delta_1 = pm.Uniform("delta_1", lower=0, upper=1)
delta_2 = pm.Uniform("delta_2", lower=0, upper=1)
inv_sigma_sqr = pm.Gamma("sigma^-2", alpha=1.0, beta=1.0)
inv_tau_sqr = pm.Gamma("tau^-2", alpha=1.0, beta=1.0)
mu_1 = pm.Normal("mu_1", mu=0, sigma=1/pm.math.sqrt(inv_tau_sqr), shape=len(batsmen))
mu_2 = pm.Normal("mu_2", mu=0, sigma=1/pm.math.sqrt(inv_tau_sqr), shape=len(bowlers))
delta = pm.math.ge(l, 3) * delta_1 + pm.math.ge(l, 6) * delta_2
eta = [pm.Deterministic("eta_" + str(i), delta[i] + mu_1[id1[i]] - mu_2[id2[i]]) for i in range(9)]
cutpoints = pm.Normal("cutpoints", mu=0, sigma=1/pm.math.sqrt(inv_sigma_sqr), transform=pm.distributions.transforms.ordered, shape=(9,6), testval=testval)
X_ = [pm.OrderedLogistic("X_" + str(i), cutpoints=cutpoints[i], eta=eta[i], observed=X[i]-1) for i in range(9)]
return model
Here, the problem is that some of my parameters such as mu_1, are multidimensional. This is why I get the following error:
ValueError: points have dimension 1, dataset has dimension 1500
because of the line y = stats.gaussian_kde(samples)(x).
Can someone please help me make this work for multi-dimensional parameters? I don't properly understand what KDE is and how the code computes it.
Thank you in advance!!

PyMC3 select data within model for switchpoint analysis

I am generating a time series that has a drastic change in the middle.
import numpy as np
size = 120
x1 = np.random.randn(size)
x2 = np.random.randn(size) * 4
x = np.hstack([x1, x2])
This series of x looks like this:
The goal is now to use PyMC3 to estimate the posterior distribution of the time when the change occurred (switchpoint). This should occur around the index 120. I've used the following code;
from pymc3 import Model, Normal, HalfNormal, DiscreteUniform
basic_model = Model()
with basic_model:
mu1 = Normal('mu1', mu=0, sd=10)
mu2 = Normal('mu2', mu=0, sd=10)
sigma1 = HalfNormal('sigma1', sd=2)
sigma2 = HalfNormal('sigma2', sd=2)
tau = DiscreteUniform('tau', 0, 240)
# get likelihoods
y1 = Normal('y1', mu=mu1, sd=sigma1, observed=x[:tau])
y2 = Normal('y2', mu=mu2, sd=sigma2, observed=x[tau:])
Doing this gives an error that I cannot use tau to slice the array. What would be the approach to solve this in PyMC? It seems like I'll need the slicing to be done by something stochastic in PyMC.
Turns out PyMC3 has a switch model. Let t be the variable for time.
import pymc3 as pm
basic_model = pm.Model()
with basic_model:
    mu1 = pm.Normal('mu1', mu=0, sd=10)
    mu2 = pm.Normal('mu2', mu=0, sd=10)
    sigma1 = pm.HalfNormal('sigma1', sd=2)
    sigma2 = pm.HalfNormal('sigma2', sd=2)
    switchpoint = pm.DiscreteUniform('switchpoint', t.min(), t.max())
    # switch() is exposed under pm.math in PyMC3, not at the top level.
    # switch(cond, a, b) yields a where cond holds and b elsewhere, so
    # mu1/sigma1 apply where t >= switchpoint and mu2/sigma2 where t < switchpoint.
    tau_mu = pm.math.switch(t >= switchpoint, mu1, mu2)
    tau_sigma = pm.math.switch(t >= switchpoint, sigma1, sigma2)
    y = pm.Normal('y1', mu=tau_mu, sd=tau_sigma, observed=x)

Python: two-curve gaussian fitting with non-linear least-squares

My knowledge of maths is limited, which is probably why I am stuck. I have a spectrum to which I am trying to fit two Gaussian peaks. I can fit the largest peak, but I cannot fit the smallest peak. I understand that I need to sum the Gaussian functions for the two peaks, but I do not know where I have gone wrong. An image of my current output is shown:
The blue line is my data and the green line is my current fit. There is a shoulder to the left of the main peak in my data which I am currently trying to fit, using the following code:
import matplotlib.pyplot as pt
import numpy as np
from scipy.optimize import leastsq
from pylab import *
# Read two whitespace-separated columns (time, counts) from the data file.
time = []
counts = []
# The context manager closes the file even if a line fails to parse.
with open('/some/folder/to/file.txt', 'r') as data_file:
    for line in data_file:
        segs = line.split()
        time.append(float(segs[0]))
        # Parse counts as numbers too; the original appended the raw strings.
        counts.append(float(segs[1]))
# Build float arrays directly rather than filling arange() buffers, whose
# integer dtype could not hold non-integer counts.
time_array = np.array(time, dtype=float)
counts_array = np.array(counts, dtype=float)
def model(time_array0, coeffs0):
    """Evaluate the sum of two Gaussian-shaped peaks at `time_array0`.

    Args:
        time_array0: Points at which to evaluate the model.
        coeffs0: Eight coefficients, four per peak in the order
            (baseline, amplitude, centre, width); indices 0-3 describe the
            first peak and 4-7 the second.

    Returns:
        ``peak_1 + peak_2``, where each peak is
        ``baseline + amplitude * exp(-((t - centre) / width)**2)``.
    """
    a = coeffs0[0] + coeffs0[1] * np.exp( - ((time_array0-coeffs0[2])/coeffs0[3])**2 )
    b = coeffs0[4] + coeffs0[5] * np.exp( - ((time_array0-coeffs0[6])/coeffs0[7])**2 )
    c = a+b
    return c
def residuals(coeffs, counts_array, time_array):
    """Return the pointwise misfit: observed counts minus model(time_array, coeffs)."""
    return counts_array - model(time_array, coeffs)
# 0 = baseline, 1 = amplitude, 2 = centre, 3 = width
peak1 = np.array([0,6337,16.2,4.47,0,2300,13.5,2], dtype=float)
#peak2 = np.array([0,2300,13.5,2], dtype=float)
x, flag = leastsq(residuals, peak1, args=(counts_array, time_array))
#z, flag = leastsq(residuals, peak2, args=(counts_array, time_array))
plt.plot(time_array, counts_array)
plt.plot(time_array, model(time_array, x), color = 'g')
#plt.plot(time_array, model(time_array, z), color = 'r')
plt.show()
This code worked for me providing that you are only fitting a function that is a combination of two Gaussian distributions.
I just made a residuals function that adds two Gaussian functions and then subtracts them from the real data.
The parameters (p) that I passed to SciPy's least squares function (scipy.optimize.leastsq) include: the mean of the first Gaussian function (m), the difference in the mean from the first and second Gaussian functions (dm, i.e. the horizontal shift), the standard deviation of the first (sd1), and the standard deviation of the second (sd2).
import numpy as np
from scipy.optimize import leastsq
import matplotlib.pyplot as plt
######################################
# Setting up test data
def norm(x, mean, sd):
    """Return the normal probability density at each element of `x`.

    Args:
        x: 1-D array of evaluation points.
        mean: Mean of the distribution.
        sd: Standard deviation of the distribution.

    Returns:
        NumPy array with the same shape as `x` holding
        ``exp(-(x - mean)**2 / (2*sd**2)) / (sd * sqrt(2*pi))``.
    """
    x = np.asarray(x, dtype=float)
    # Vectorized evaluation replaces the per-element Python loop, and avoids
    # the original local list that shadowed this function's own name.
    return 1.0/(sd*np.sqrt(2*np.pi))*np.exp(-(x - mean)**2/(2*sd**2))
mean1, mean2 = 0, -2
std1, std2 = 0.5, 1
x = np.linspace(-20, 20, 500)
y_real = norm(x, mean1, std1) + norm(x, mean2, std2)
######################################
# Solving
m, dm, sd1, sd2 = [5, 10, 1, 1]
p = [m, dm, sd1, sd2] # Initial guesses for leastsq
y_init = norm(x, m, sd1) + norm(x, m + dm, sd2) # For final comparison plot
def res(p, y, x):
m, dm, sd1, sd2 = p
m1 = m
m2 = m1 + dm
y_fit = norm(x, m1, sd1) + norm(x, m2, sd2)
err = y - y_fit
return err
plsq = leastsq(res, p, args = (y_real, x))
y_est = norm(x, plsq[0][0], plsq[0][2]) + norm(x, plsq[0][0] + plsq[0][1], plsq[0][3])
plt.plot(x, y_real, label='Real Data')
plt.plot(x, y_init, 'r.', label='Starting Guess')
plt.plot(x, y_est, 'g.', label='Fitted')
plt.legend()
plt.show()
You can use Gaussian mixture models from scikit-learn:
from sklearn import mixture
import matplotlib.pyplot
import matplotlib.mlab
import numpy as np
clf = mixture.GMM(n_components=2, covariance_type='full')
clf.fit(yourdata)
m1, m2 = clf.means_
w1, w2 = clf.weights_
c1, c2 = clf.covars_
histdist = matplotlib.pyplot.hist(yourdata, 100, normed=True)
plotgauss1 = lambda x: plot(x,w1*matplotlib.mlab.normpdf(x,m1,np.sqrt(c1))[0], linewidth=3)
plotgauss2 = lambda x: plot(x,w2*matplotlib.mlab.normpdf(x,m2,np.sqrt(c2))[0], linewidth=3)
plotgauss1(histdist[1])
plotgauss2(histdist[1])
You can also use the function below to fit the number of Gaussian you want with ncomp parameter:
from sklearn import mixture
%pylab
def fit_mixture(data, ncomp=2, doplot=False):
    """Fit an `ncomp`-component Gaussian mixture to `data`.

    Args:
        data: Samples passed straight to ``clf.fit``.
        ncomp: Number of mixture components.
        doplot: When true, draw a normalized histogram of `data` and overlay
            each weighted component density.

    Returns:
        Tuple ``(ms, cs, ws)``: per-component means, standard deviations
        (square root of the first covariance entry) and weights.
    """
    clf = mixture.GMM(n_components=ncomp, covariance_type='full')
    clf.fit(data)
    ml = clf.means_
    wl = clf.weights_
    cl = clf.covars_
    ms = [m[0] for m in ml]
    # cs already holds standard deviations, not variances.
    cs = [numpy.sqrt(c[0][0]) for c in cl]
    ws = [w for w in wl]
    if doplot:
        histo = hist(data, 200, normed=True)
        for w, m, c in zip(ws, ms, cs):
            # normpdf expects a standard deviation; the original took the
            # square root of `c` a second time here, narrowing every curve.
            plot(histo[1],w*matplotlib.mlab.normpdf(histo[1],m,c), linewidth=3)
    return ms, cs, ws
Coeffs 0 and 4 are degenerate - there is absolutely nothing in the data that can decide between them. You should use a single zero-level parameter instead of two (i.e. remove one of them from your code). This is probably what is stopping your fit (ignore the comments here saying this is not possible - there are clearly at least two peaks in that data and you should certainly be able to fit to that).
(It may not be clear why I am suggesting this, but what is happening is that coeffs 0 and 4 can cancel each other out. They can both be zero, or one could be 100 and the other -100 - either way, the fit is just as good. This "confuses" the fitting routine, which spends its time trying to work out what they should be, when there is no single right answer, because whatever value one is, the other can just be the negative of that, and the fit will be the same.)
In fact, from the plot, it looks like there may be no need for a zero level at all. I would try dropping both of those and seeing how the fit looks.
Also, there is no need to fit coeffs 1 and 5 (or the zero point) in the least squares. Instead, because the model is linear in those, you could calculate their values each loop. This will make things faster, but is not critical. I just noticed you say your maths is not so good, so probably ignore this one.

Categories

Resources