PyMC3 select data within model for switchpoint analysis

I am generating a time series that has a drastic change in the middle.
import numpy as np
size = 120
x1 = np.random.randn(size)
x2 = np.random.randn(size) * 4
x = np.hstack([x1, x2])
The first 120 points have unit standard deviation and the remaining 120 have standard deviation 4. The goal is now to use PyMC3 to estimate the posterior distribution of the time when the change occurred (the switchpoint), which should be around index 120. I've used the following code:
from pymc3 import Model, Normal, HalfNormal, DiscreteUniform

basic_model = Model()
with basic_model:
    mu1 = Normal('mu1', mu=0, sd=10)
    mu2 = Normal('mu2', mu=0, sd=10)
    sigma1 = HalfNormal('sigma1', sd=2)
    sigma2 = HalfNormal('sigma2', sd=2)
    tau = DiscreteUniform('tau', 0, 240)
    # get likelihoods
    y1 = Normal('y1', mu=mu1, sd=sigma1, observed=x[:tau])
    y2 = Normal('y2', mu=mu2, sd=sigma2, observed=x[tau:])
Doing this gives an error saying that tau cannot be used to slice the array. What would be the approach to solve this in PyMC3? It seems the slicing needs to be driven by something stochastic inside the model.

It turns out PyMC3 has a switch function, pm.math.switch. Let t be an index variable for time; instead of slicing the data, the segment parameters are selected elementwise:
import numpy as np
import pymc3 as pm

t = np.arange(x.shape[0])  # time index, one entry per observation

basic_model = pm.Model()
with basic_model:
    mu1 = pm.Normal('mu1', mu=0, sd=10)
    mu2 = pm.Normal('mu2', mu=0, sd=10)
    sigma1 = pm.HalfNormal('sigma1', sd=2)
    sigma2 = pm.HalfNormal('sigma2', sd=2)
    switchpoint = pm.DiscreteUniform('switchpoint', t.min(), t.max())
    # mu1/sigma1 apply before the switchpoint, mu2/sigma2 from it onwards
    tau_mu = pm.math.switch(t >= switchpoint, mu2, mu1)
    tau_sigma = pm.math.switch(t >= switchpoint, sigma2, sigma1)
    y = pm.Normal('y', mu=tau_mu, sd=tau_sigma, observed=x)
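For reference, a minimal sampling sketch (the draw count here is arbitrary); pm.sample automatically assigns NUTS to the continuous parameters and a Metropolis step to the discrete switchpoint:
with basic_model:
    trace = pm.sample(2000)

print(trace['switchpoint'].mean())  # should land near index 120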

Related

Incremental Bayesian updates with multi-dimensional parameters

I am trying to use PyMC3 for a Bayesian model where I would like to repeatedly train my model on new unseen data. I am thinking I would need to update the priors with the posterior of the previously trained model every time I see new data, similar to what is done here: https://docs.pymc.io/notebooks/updating_priors.html. They use the following function, which builds a KDE from the samples; each of the original parameter definitions in the model is then replaced with a call to from_posterior.
import numpy as np
from scipy import stats
from pymc3 import Interpolated

def from_posterior(param, samples):
    smin, smax = np.min(samples), np.max(samples)
    width = smax - smin
    x = np.linspace(smin, smax, 100)
    y = stats.gaussian_kde(samples)(x)
    # what was never sampled should have a small probability but not 0,
    # so we extend the domain and use a linear approximation of the density on it
    x = np.concatenate([[x[0] - 3 * width], x, [x[-1] + 3 * width]])
    y = np.concatenate([[0], y, [0]])
    return Interpolated(param, x, y)
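In the linked notebook the replacement happens inside a new model context, roughly like this (the parameter name 'alpha' here is illustrative):
with pm.Model() as updated_model:
    # instead of the original prior, e.g. pm.Normal('alpha', mu=0, sd=1),
    # use the KDE of the previous posterior samples
    alpha = from_posterior('alpha', trace['alpha'])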
And here is my original model.
def create_model(batsmen, bowlers, id1, id2, X):
    testval = [[-5, 0, 1, 2, 3.5, 5] for i in range(0, 9)]
    l = [i for i in range(9)]
    model = pm.Model()
    with model:
        delta_1 = pm.Uniform("delta_1", lower=0, upper=1)
        delta_2 = pm.Uniform("delta_2", lower=0, upper=1)
        inv_sigma_sqr = pm.Gamma("sigma^-2", alpha=1.0, beta=1.0)
        inv_tau_sqr = pm.Gamma("tau^-2", alpha=1.0, beta=1.0)
        mu_1 = pm.Normal("mu_1", mu=0, sigma=1/pm.math.sqrt(inv_tau_sqr), shape=len(batsmen))
        mu_2 = pm.Normal("mu_2", mu=0, sigma=1/pm.math.sqrt(inv_tau_sqr), shape=len(bowlers))
        delta = pm.math.ge(l, 3) * delta_1 + pm.math.ge(l, 6) * delta_2
        eta = [pm.Deterministic("eta_" + str(i), delta[i] + mu_1[id1[i]] - mu_2[id2[i]]) for i in range(9)]
        cutpoints = pm.Normal("cutpoints", mu=0, sigma=1/pm.math.sqrt(inv_sigma_sqr), transform=pm.distributions.transforms.ordered, shape=(9, 6), testval=testval)
        X_ = [pm.OrderedLogistic("X_" + str(i), cutpoints=cutpoints[i], eta=eta[i], observed=X[i]-1) for i in range(9)]
    return model
Here, the problem is that some of my parameters, such as mu_1, are multi-dimensional. This is why I get the following error:
ValueError: points have dimension 1, dataset has dimension 1500
because of the line y = stats.gaussian_kde(samples)(x).
Can someone please help me make this work for multi-dimensional parameters? I don't properly understand what KDE is and how the code computes it.
Thank you in advance!!
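One possible workaround, assuming the posterior correlations between the components can be ignored, is to build a separate one-dimensional Interpolated prior per component. The helper from_posterior_nd below is a hypothetical sketch, not from the linked notebook:
def from_posterior_nd(param, samples):
    # samples: array of shape (n_draws, n_dims), e.g. trace['mu_1']
    # returns a list with one univariate Interpolated prior per component
    return [from_posterior('%s_%d' % (param, i), samples[:, i])
            for i in range(samples.shape[1])]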

PyMC to PyMC3 for Bayesian statistics model

I am starting on Bayesian statistics using the book Probabilistic Programming and Bayesian Methods for Hackers. I realized that the code examples there are based on pymc, which has been deprecated in favor of pymc3. I am trying to figure out how to port the code to pymc3, but am running into issues.
I was wondering if someone could help port the following code snippet to pymc3:
import numpy as np
import pymc as pm

count_data = np.loadtxt("txtdata.csv")
n_count_data = len(count_data)
alpha = 1.0 / count_data.mean()

lambda_1 = pm.Exponential("lambda_1", alpha)
lambda_2 = pm.Exponential("lambda_2", alpha)
tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

@pm.deterministic
def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
    out = np.zeros(n_count_data)
    out[:tau] = lambda_1  # lambda before tau is lambda_1
    out[tau:] = lambda_2  # lambda after (and including) tau is lambda_2
    return out

observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
model = pm.Model([observation, lambda_1, lambda_2, tau])
mcmc = pm.MCMC(model)
mcmc.sample(40000, 10000, 1)
lambda_1_samples = mcmc.trace('lambda_1')[:]
lambda_2_samples = mcmc.trace('lambda_2')[:]
tau_samples = mcmc.trace('tau')[:]
I was able to do it partially, but then I ran into trouble at the @pm.deterministic line.
My partial code (for what it is worth):
import numpy as np
import pymc3 as pm
from matplotlib import pyplot as plt

count_data = np.loadtxt("txtdata.csv")
n_count_data = len(count_data)
alpha = 1.0 / count_data.mean()

basic_model = pm.Model()
with basic_model:
    lambda_1 = pm.Exponential("lambda_1", alpha)
    lambda_2 = pm.Exponential("lambda_2", alpha)
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)

    @pm.deterministic  # this decorator does not exist in pymc3
    def lambda_(tau=tau, lambda_1=lambda_1, lambda_2=lambda_2):
        out = np.zeros(n_count_data)
        out[:tau] = lambda_1  # lambda before tau is lambda_1
        out[tau:] = lambda_2  # lambda after (and including) tau is lambda_2
        return out

    observation = pm.Poisson("obs", lambda_, value=count_data, observed=True)
How do I get this to work in pymc3?
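Mirroring the switch-based answer to the first question above, the usual port replaces the decorated function with a symbolic pm.math.switch over an index array, and passes the data via observed= instead of value=/observed=True. A sketch:
import numpy as np
import pymc3 as pm

count_data = np.loadtxt("txtdata.csv")
n_count_data = len(count_data)
alpha = 1.0 / count_data.mean()

with pm.Model() as basic_model:
    lambda_1 = pm.Exponential("lambda_1", alpha)
    lambda_2 = pm.Exponential("lambda_2", alpha)
    tau = pm.DiscreteUniform("tau", lower=0, upper=n_count_data)
    idx = np.arange(n_count_data)
    # lambda_1 before tau, lambda_2 from tau onwards
    lambda_ = pm.math.switch(tau > idx, lambda_1, lambda_2)
    observation = pm.Poisson("obs", lambda_, observed=count_data)
    trace = pm.sample(10000)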

pymc determine sum of random variables

I have two independent Normal distributed random variables a, b. In pymc, it's something like:
from pymc import Normal

def model():
    a = Normal('a', tau=0.01)
    b = Normal('b', tau=0.1)
I'd like to know what a+b is, treating it as a normal distribution; that is:
from pymc import Normal, Uniform

def model():
    a = Normal('a', tau=0.01)
    b = Normal('b', tau=0.1)
    tau_c = Uniform("tau_c", lower=0.0, upper=1.0)
    c = Normal("a+b", tau=tau_c, observed=True, value=a+b)
Then I'd like to estimate tau_c, but this doesn't work in pymc because a and b are stochastic (if they were arrays of observations it would be possible, but I don't have observations of a or b, I just know their distributions).
A way I think I could do it is to generate random values from the distributions of a and b and then do this:
def model(a, b):
    tau_c = Uniform("tau_c", lower=0.0, upper=1.0)
    c = Normal("a+b", tau=tau_c, observed=True, value=a+b)
But I think there's a better way of doing this with pymc.
Thanks!
If I understood your question and code correctly, you should be doing something simpler. If you want to estimate the parameters of the distribution given by the sum of a and b, use only the first block in the following example. If you also want to estimate the parameters for variable a independently of the parameters for variable b, add the other two blocks:
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=10)
    sd = pm.HalfNormal('sd', 10)
    alpha = pm.Normal('alpha', mu=0, sd=10)
    ab = pm.SkewNormal('ab', mu=mu, sd=sd, alpha=alpha, observed=a+b)

    mu_a = pm.Normal('mu_a', mu=0, sd=10)
    sd_a = pm.HalfNormal('sd_a', 10)
    alpha_a = pm.Normal('alpha_a', mu=0, sd=10)
    a = pm.SkewNormal('a', mu=mu_a, sd=sd_a, alpha=alpha_a, observed=a)

    mu_b = pm.Normal('mu_b', mu=0, sd=10)
    sd_b = pm.HalfNormal('sd_b', 10)
    alpha_b = pm.Normal('alpha_b', mu=0, sd=10)
    b = pm.SkewNormal('b', mu=mu_b, sd=sd_b, alpha=alpha_b, observed=b)

    trace = pm.sample(1000)
Be sure to use the latest version of PyMC3, since previous versions did not include the SkewNormal distribution.
Update:
Given that you changed your question: if a and b are independent random variables and both are normally distributed, then their sum is also normally distributed:
a ~ N(mu_a, sd_a²)
b ~ N(mu_b, sd_b²)
a+b ~ N(mu_a+mu_b, sd_a²+sd_b²)
That is, you sum their means and you sum their variances (not their standard deviations). You don't need PyMC3 for this.
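As a quick Monte Carlo sanity check with the question's precisions (tau=0.01 gives sd=10, tau=0.1 gives sd=sqrt(10)):
import numpy as np

sd_a = 1 / np.sqrt(0.01)  # 10.0
sd_b = 1 / np.sqrt(0.1)   # ~3.16

samples = np.random.normal(0, sd_a, 100000) + np.random.normal(0, sd_b, 100000)
print(samples.var())  # close to sd_a**2 + sd_b**2 = 110.0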
If you still want to use PyMC3 (maybe your distributions are not Gaussian and you do not know how to compute their sum analytically), you can generate synthetic data from your a and b distributions and then use PyMC3 to estimate the parameters, something along the lines of:
with pm.Model() as model:
    mu = pm.Normal('mu', mu=0, sd=10)
    sd = pm.HalfNormal('sd', 10)
    ab = pm.Normal('ab', mu=mu, sd=sd, observed=a+b)
    trace = pm.sample(1000)

Getting the statistics of deterministic variables in PyMC

Say I have a random collection of (X,Y) points:
import pymc as pm
import numpy as np
import matplotlib.pyplot as plt
import scipy
x = np.array(range(0, 50))
y = np.random.uniform(low=0.0, high=40.0, size=len(x))
y = [a + b for a, b in zip(x, y)]  # add uniform noise to the linear trend
plt.scatter(x, y)
and that I fit a simple linear regression:
std = 20.
tau=1/(std**2)
alpha = pm.Normal('alpha', mu=0, tau=tau)
beta = pm.Normal('beta', mu=0, tau=tau)
sigma = pm.Uniform('sigma', lower=0, upper=20)
y_est = alpha + beta * x
likelihood = pm.Normal('y', mu=y_est, tau=1/(sigma**2), observed=True, value=y)
model = pm.Model([likelihood, alpha, beta, sigma, y_est])
mcmc = pm.MCMC(model)
mcmc.sample(40000, 15000)
How can I get the distribution or the statistics of y_est[0], y_est[1], y_est[2], ...? (Note that these variables correspond to the y values estimated for each input x value.)
In PyMC 2, if you are interested in the trace of a deterministic, you should wrap the deterministic in a Lambda object (or decorate a function with @deterministic). In your case, this would be:
y_est = Lambda('y_est', lambda a=alpha, b=beta: a + b * x)
You should then be able to call the summary method or plot the node, just like a Stochastic.
BTW, you do not need to instantiate a Model object, as MCMC already does that for you. All you need is:
mcmc = pm.MCMC([likelihood, alpha, beta, sigma, y_est])
or even more concisely:
mcmc = pm.MCMC(vars())
Following @Chris's advice, the following works:
x = pm.Uniform('x', lower=xmin, upper=xmax)  # xmin, xmax: bounds of the input range
alpha = pm.Normal('alpha', mu=0, tau=tau)
beta = pm.Normal('beta', mu=0, tau=tau)
sigma = pm.Uniform('sigma', lower=0, upper=20)
# The deterministic:
y_gen = pm.Lambda('y_gen', lambda a=alpha, x=x, b=beta: a + b * x)
And then draw samples from it as follows:
mcmc = pm.MCMC([x, y_gen])
mcmc.sample(n_total_samples, n_burn_in)
x_trace = mcmc.trace('x')[:]
y_trace = mcmc.trace('y_gen')[:]
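From these traces, summary statistics can then be computed directly with numpy (a short sketch):
import numpy as np

y_mean = y_trace.mean(axis=0)                       # posterior mean of y_gen
y_ci = np.percentile(y_trace, [2.5, 97.5], axis=0)  # central 95% interval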

PyMC modeling hierarchical regression with unknown means and covariances

Model
I have the following statistical model:
r_i ~ N(r | mu_i, sigma)
mu_i = w . Q_i
w ~ N(w | phi, Sigma)
prior(phi, Sigma) = NormalInvWishart(0, 1, k+1, I_k)
Where sigma is known.
Q_i and r_i (reward) are observed.
In this case, r_i and mu_i are scalars, w is 40x1, Q_i is 1x40, phi is 40x1, and Sigma is 40x40.
LaTeX formatted version: http://mathurl.com/m2utrz4
Python code
I'm trying to create a PyMC model that generates some samples and then approximates phi and Sigma.
import pymc as pm
import numpy as np

SAMPLE_SIZE = 100
ndims = 40
q_samples = ... # Q created elsewhere
reward_sigma = np.identity(SAMPLE_SIZE) * 0.1
phi_true = (np.random.rand(40) + 1) * -2
sigma_true = np.random.rand(40, 40) * 2. - 1.
weights_true = np.random.multivariate_normal(phi_true, sigma_true)
reward_true = np.random.multivariate_normal(np.dot(q_samples, weights_true), reward_sigma)
with pm.Model() as model:
    phi = pm.MvNormal('phi', np.zeros(ndims), np.identity(ndims) * 2)
    sigma = pm.InverseWishart('sigma', ndims + 1, np.identity(ndims))
    weights = pm.MvNormal('weights', phi, sigma)
    rewards = pm.Normal('rewards', np.dot(weights, q_samples), reward_sigma, observed=reward_true)

with model:
    start = pm.find_MAP()
    step = pm.NUTS()
    trace = pm.sample(3000, step, start)
    pm.traceplot(trace)
However, when I run this, I get the following error:
Traceback (most recent call last):
File "test_pymc.py", line 46, in <module>
phi = pm.MvNormal('phi', np.zeros((ndims)), np.identity((ndims)) * 2)
TypeError: Wrong number of dimensions: expected 0, got 1 with shape (40,).
Am I setting up my model wrong somehow?
I think you're missing the shape parameter for MvNormal; MvNormal(..., shape=ndims) should fix the issue. We should probably figure out a way to infer that better.
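Applied to the model above, the suggested fix would look roughly like this (a sketch; the argument order is kept as in the question's code):
with pm.Model() as model:
    phi = pm.MvNormal('phi', np.zeros(ndims), np.identity(ndims) * 2, shape=ndims)
    sigma = pm.InverseWishart('sigma', ndims + 1, np.identity(ndims))
    weights = pm.MvNormal('weights', phi, sigma, shape=ndims)
    rewards = pm.Normal('rewards', np.dot(weights, q_samples), reward_sigma, observed=reward_true)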
