Multiprocess not recognizing defined function - python

I'm trying to fit a simple Gaussian model to some data using the Markov chain Monte Carlo package emcee while utilizing parallel processing. I am using multiprocess instead of multiprocessing because I am working in a Jupyter notebook, where multiprocessing often runs into problems. Currently, the code says that one of my functions is not defined even though it was defined earlier; it isn't being carried over once the parallelization starts. Specifically, the error occurs on the very last line, saying that "log_prior" is not defined. Any thoughts on how to fix this issue would be much appreciated.
import numpy as np
import emcee
from scipy.optimize import minimize
import matplotlib.pyplot as plt
np.random.seed(123)
# Choose the "true" parameters.
mu_true = 5
sig_true = 0.5
f_true = 0.534
# Generate some synthetic data from the model.
N = 500
x = np.sort(10 * np.random.rand(N))
yerr = 0.03 + 0.05 * np.random.rand(N)
y = np.exp(-np.power(x - mu_true, 2.) / (2 * np.power(sig_true, 2.)))
y+= 0.5*np.abs(f_true * y) * np.random.randn(N)
y += yerr * np.random.randn(N)
def log_likelihood(theta, x, y, yerr):
    mu, sig, log_f = theta
    model = np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
    sigma2 = yerr**2 + model**2 * np.exp(2 * log_f)
    return -0.5 * np.sum((y - model) ** 2 / sigma2 + np.log(sigma2))
np.random.seed(42)
nll = lambda *args: -log_likelihood(*args)
initial = np.array([mu_true, sig_true, np.log(f_true)]) + 0.1 * np.random.randn(3)
soln = minimize(nll, initial, args=(x, y, yerr))
mu_ml, sig_ml, log_f_ml = soln.x
def log_prior(theta):
    mu, sig, log_f = theta
    if 2 < mu < 10 and 0.0 < sig < 5 and -10.0 < log_f < 1.0:
        return 0.0
    return -np.inf

def log_probability(theta, x, y, yerr):
    lp = log_prior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp + log_likelihood(theta, x, y, yerr)
import time
from multiprocess import Pool
pos = soln.x + 0.2 * np.random.randn(32, 3)
nwalkers, ndim = pos.shape
nsteps = 10000
if __name__ == "__main__":
    with Pool() as pool:
        sampler = emcee.EnsembleSampler(
            nwalkers, ndim, log_probability, args=(x, y, yerr), pool=pool)
        sampler.run_mcmc(pos, nsteps, progress=True)
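A common workaround for this kind of error, untested on this exact setup, is to move the model functions into a small importable module so that worker processes resolve them by name rather than from notebook state; a minimal sketch, where the file name model_defs.py is just a placeholder:

# model_defs.py -- hypothetical helper module holding the model functions
import numpy as np

def log_likelihood(theta, x, y, yerr):
    mu, sig, log_f = theta
    model = np.exp(-np.power(x - mu, 2.) / (2 * np.power(sig, 2.)))
    sigma2 = yerr**2 + model**2 * np.exp(2 * log_f)
    return -0.5 * np.sum((y - model) ** 2 / sigma2 + np.log(sigma2))

def log_prior(theta):
    mu, sig, log_f = theta
    if 2 < mu < 10 and 0.0 < sig < 5 and -10.0 < log_f < 1.0:
        return 0.0
    return -np.inf

def log_probability(theta, x, y, yerr):
    lp = log_prior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp + log_likelihood(theta, x, y, yerr)

In the notebook one would then use from model_defs import log_probability and pass that function to EnsembleSampler as before.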

Related

Fit a curve with a SIR model and predict

I have searched for and referenced code for solving the SIR model from others on this website, but the fit is very poor. Is there something wrong with my data, or is it something else? And how should I make predictions for new data with this SIR model?
import numpy as np
import matplotlib.pyplot as plt
from scipy import integrate, optimize
import pandas as pd
y_total = [0.0, 0.0010131712259371835, 0.0035460992907801418, 0.00911854103343465,
0.008611955420466059, 0.021783181357649443, 0.00911854103343465, 0.07852077001013172, 0.4397163120567376,
0.21681864235055726, 0.232016210739615, 0.5278622087132725, 0.13576494427558258, 0.2988855116514691, 0.37436676798378926,
0.4209726443768997, 0.544579533941236, 0.7254305977710233, 1.0, 0.7740628166160081, 0.43617021276595747, 0.48226950354609927]
x_total = range(0,22)
ydata = np.array(y_total, dtype=float)
xdata = np.array(x_total, dtype=float)
# I0 + S0 + R0 is always 1 regardless of "value"
I0 = 0.3
S0 = 1 - I0
R0 = 0
def sir_model(y, x, beta, gamma):
    S = -beta * y[0] * y[1] / N
    R = gamma * y[1]
    I = -(S + R)
    return S, I, R

def fit_odeint(x, beta, gamma):
    return integrate.odeint(sir_model, (S0, I0, R0), x, args=(beta, gamma))[:,1]
N = 1.0
I0 = ydata[0]
S0 = N - I0
R0 = 0.0
popt, pcov = optimize.curve_fit(fit_odeint, xdata, ydata)
fitted = fit_odeint(xdata, *popt)
plt.plot(xdata, ydata, 'o')
plt.plot(xdata, fitted)
plt.show()
Make the initial infected number also a variable, which is easily done by shifting the computation of the initial state into the target function:
def sir_model(y, x, beta, gamma):
    N = sum(y)
    S = -beta * y[0] * y[1] / N
    R = gamma * y[1]
    I = -(S + R)
    return S, I, R

def fit_odeint(x, beta, gamma, I0):
    # I0 + S0 + R0 is always 1 regardless of "value"
    S0 = 1 - I0
    R0 = 0
    return integrate.odeint(sir_model, (S0, I0, R0), x, args=(beta, gamma))[:,1]

popt, pcov = optimize.curve_fit(fit_odeint, xdata, ydata, (1/5, 1/8, 0.1))
There were some other changes, especially adding an initial guess to curve_fit. This still produces a warning about some difficulty in odeint, but a result is reached anyway, with popt = [0.36714402, 0.04176973, 0.01311579] and 1/popt = [2.72372678, 23.94078424, 76.2439491]. Changing the initial guess to values close to this, (1/3, 1/24, 0.05), eliminates the warning.
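For the prediction part of the question, one rough approach (not in the original answer) is to reuse fit_odeint with the fitted parameters on an extended time grid, assuming popt from the fit above:

x_future = np.linspace(0, 40, 200)      # extend beyond the 22 observed time points
I_pred = fit_odeint(x_future, *popt)    # predicted infected fraction from the fitted SIR model
plt.plot(xdata, ydata, 'o', label='data')
plt.plot(x_future, I_pred, label='SIR prediction')
plt.legend()
plt.show()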

checking convergence using python integrator

I am looking to integrate the difference between my numerical and exact solutions to the heat equation, though I am not sure what would be the best way to tackle this. Is there a specific integrator that would allow me to do this?
I would like to integrate it with respect to $x$.
I have the following code so far:
import numpy as np
from matplotlib import pyplot
from mpl_toolkits.mplot3d import Axes3D
from scipy import linalg
import matplotlib.pyplot as plt
import math
def initial_data(x):
    y = np.zeros_like(x)
    for i in range(len(x)):
        if (x[i] < 0.25):
            y[i] = 0.0
        elif (x[i] < 0.5):
            y[i] = 4.0 * (x[i] - 0.25)
        elif (x[i] < 0.75):
            y[i] = 4.0 * (0.75 - x[i])
        else:
            y[i] = 0.0
    return y
def heat_exact(x, t, kmax=150):
    """Exact solution from separation of variables"""
    yexact = np.zeros_like(x)
    for k in range(1, kmax):
        d = -8.0 * (np.sin(k*np.pi/4.0) - 2.0*np.sin(k*np.pi/2.0)
                    + np.sin(3.0*k*np.pi/4.0)) / ((np.pi*k)**2)
        yexact += d * np.exp(-(k*np.pi)**2 * t) * np.sin(k*np.pi*x)
    return yexact
def ftcs_heat(y, ynew, s):
    ynew[1:-1] = (1 - 2.0 * s) * y[1:-1] + s * (y[2:] + y[:-2])
    # Trivial boundary conditions
    ynew[0] = 0.0
    ynew[-1] = 0.0
Nx = 198
h = 1.0 / (Nx + 1.0)
t_end = 0.25
s = 1.0 / 3.0 # s = delta / h^2
delta = s * h**2
Nt = int(t_end / delta)+1
x = np.linspace(0.0, 1.0, Nx+2)
y = initial_data(x)
ynew = np.zeros_like(y)
for n in range(Nt):
    ftcs_heat(y, ynew, s)
    y = ynew
fig = plt.figure(figsize=(8,6))
ax = fig.add_subplot(111)
x_exact = np.linspace(0.0, 1.0, 200)
ax.plot(x, y, 'kx', label = 'FTCS')
ax.plot(x_exact, heat_exact(x_exact, t_end), 'b-', label='Exact solution')
ax.legend()
plt.show()
diff = (y - heat_exact(x_exact,t_end))**2 # squared difference between numerical and exact solutions
x1 = np.trapz(diff, x=x) #(works!)
import scipy.integrate as integrate
x1 = integrate.RK45(diff,diff[0],0,t_end) #(preferred but does not work)
What I am looking to integrate is the variable diff (the squared difference). Any suggestions are welcomed, thanks.
Edit: I would like to use the RK45 method; however, I am not sure what my y0 should be. I have tried x1 = integrate.RK45(diff, diff[0], 0, t_end) but get the following error:
raise ValueError("`y0` must be 1-dimensional.")
ValueError: `y0` must be 1-dimensional.
By integration, do you mean you want to find the area between y and heat_exact? Or do you want to know whether they are the same within a specific tolerance? The latter can be checked with numpy.isclose. For the former, you can use one of the integration functions built into numpy.
For example:
np.trapz(diff, x=x)
Oh, shouldn't the last line be diff = (y - heat_exact(x_exact,t_end))**2? My integration of this diff gave 8.32E-12, which looks right judging by the plots you gave me.
Check out also scipy.integrate
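For example, a short sketch of computing the integrated squared error with scipy.integrate (the function is called simpson in recent SciPy versions, simps in older ones):

from scipy.integrate import simpson

diff = (y - heat_exact(x, t_end))**2   # squared pointwise error on the FTCS grid
l2_error = simpson(diff, x=x)          # integral of the squared error over x
print(l2_error)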

Why is tempered MCMC fit not converging well?

I am trying to fit a simple straight line, y = mx + c, to some synthetic data using parallel-tempered MCMC. My goal is just to understand how to use it, so that I can apply it to some more complex models later. The example I am trying replicates what has already been done with simple emcee code:
http://dfm.io/emcee/current/user/line/
but instead of plain MCMC, I want to use parallel-tempered MCMC:
http://dfm.io/emcee/current/user/pt/
Here is a working code:
import numpy as np
from emcee import PTSampler
import emcee
# Choose the "true" parameters.
m_true = -0.9594
b_true = 4.294
f_true = 0.534
# Generate some synthetic data from the model.
N = 50
x = np.sort(10*np.random.rand(N))
yerr = 0.1+0.5*np.random.rand(N)
y = m_true*x+b_true
y += np.abs(f_true*y) * np.random.randn(N)
y += yerr * np.random.randn(N)
def lnlike(theta, x, y, yerr):
    m, b, lnf = theta
    model = m * x + b
    inv_sigma2 = 1.0/(yerr**2 + model**2*np.exp(2*lnf))
    return -0.5*(np.sum((y-model)**2*inv_sigma2 - np.log(inv_sigma2)))

def lnprior(theta):
    m, b, lnf = theta
    if -5.0 < m < 0.5 and 0.0 < b < 10.0 and -10.0 < lnf < 1.0:
        return 0.0
    return -np.inf

def lnprob(theta, x, y, yerr):
    lp = lnprior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp + lnlike(theta, x, y, yerr)
import scipy.optimize as op
nll = lambda *args: -lnlike(*args)
result = op.minimize(nll, [m_true, b_true, np.log(f_true)], args=(x, y, yerr))
m_ml, b_ml, lnf_ml = result["x"]
init = [0.5, m_ml, b_ml, lnf_ml]
ntemps = 10
nwalkers = 100
ndim = 3
from multiprocessing import Pool
pos = np.random.uniform(low=-1, high=1, size=(ntemps, nwalkers, ndim))
for i in range(ntemps):
    # initialize parameters near scipy optima
    pos[i:,] = np.array([result["x"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)])
pool = Pool(processes=4)
sampler=PTSampler(ntemps,nwalkers, ndim, lnlike, lnprior, loglargs=(x, y, yerr), pool=pool)# args=(x, y, yerr))
#burn-in
sampler.run_mcmc(pos, 1000)
sampler.reset()
sampler.run_mcmc(pos, 10000, thin=10)
samples = sampler.chain.reshape((-1, ndim))
print('Number of posterior samples is {}'.format(samples.shape[0]))
#print best fit value together with errors
print(list(map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]),  # list() so the values print under Python 3
               zip(*np.percentile(samples, [16, 50, 84], axis=0)))))
import corner
fig = corner.corner(samples, labels=["$m$", "$b$", r"$\ln\,f$"],
                    truths=[m_true, b_true, np.log(f_true)])
fig.savefig("triangle.png")
The only problem is that when I run this code I get parameter values that are way off the true values, and increasing the number of walkers or samples does not help at all. Can anyone please advise on why tempered MCMC is not working here?
Update:
I found a useful-looking package called ptemcee (https://pypi.org/project/ptemcee/#description), although its documentation is essentially non-existent. It seems that this package might be useful; any help on how to implement the same linear fit with this package would also be highly appreciated.
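Something like the following is what I have in mind, though it is only a rough sketch: ptemcee appears to keep a PTSampler-style interface, so the class and argument names below are assumptions and should be checked against the installed version.

import ptemcee

# assumed interface, mirroring the old emcee PTSampler
sampler = ptemcee.Sampler(nwalkers, ndim, lnlike, lnprior,
                          ntemps=ntemps, loglargs=(x, y, yerr))
sampler.run_mcmc(pos, 1000)                     # burn-in; pos has shape (ntemps, nwalkers, ndim)
sampler.reset()
sampler.run_mcmc(pos, 10000)
samples = sampler.chain[0].reshape((-1, ndim))  # keep only the cold (zero-temperature) chain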
I have modified some lines
import time
import numpy as np
from emcee import PTSampler
import corner
import matplotlib.pyplot as plt
import scipy.optimize as op
t1 = time.time()
np.random.seed(6) # To reproduce results
# Choose the "true" parameters.
m_true = -0.9594
b_true = 4.294
f_true = 0.534
# Generate some synthetic data from the model.
N = 50
x = np.sort(10 * np.random.rand(N))
yerr = 0.1 + 0.5 * np.random.rand(N)
y_1 = m_true * x + b_true
y = np.abs(f_true * y_1) * np.random.randn(N) + y_1
y += yerr * np.random.randn(N)
plt.plot(x, y, 'o')
# With emcee
def lnlike(theta, x, y, yerr):
    m, b, lnf = theta
    model = m * x + b
    inv_sigma2 = 1.0/(yerr**2 + model**2*np.exp(2*lnf))
    return -0.5*(np.sum((y-model)**2*inv_sigma2 - np.log(inv_sigma2)))

def lnprior(theta):
    m, b, lnf = theta
    if -5.0 < m < 0.5 and 0.0 < b < 10.0 and -10.0 < lnf < 1.0:
        return 0.0
    return -np.inf

def lnprob(theta, x, y, yerr):
    lp = lnprior(theta)
    if not np.isfinite(lp):
        return -np.inf
    return lp + lnlike(theta, x, y, yerr)
nll = lambda *args: -lnlike(*args)
result = op.minimize(nll, [m_true, b_true, np.log(f_true)], args=(x, y, yerr))
m_ml, b_ml, lnf_ml = result["x"]
init = [0.5, m_ml, b_ml, lnf_ml]
ntemps = 10
nwalkers = 100
ndim = 3
pos = np.random.uniform(low=-1, high=1, size=(ntemps, nwalkers, ndim))
for i in range(ntemps):
    pos[i:, :] = np.array([result["x"] + 1e-4*np.random.randn(ndim) for i in range(nwalkers)])
sampler = PTSampler(ntemps, nwalkers, ndim, lnlike, lnprior, loglargs=(x, y, yerr), threads=4) # args=(x, y, yerr))
#burn-in
print(pos.shape)
sampler.run_mcmc(pos, 100)
sampler.reset()
sampler.run_mcmc(pos, 5000, thin=10)
samples = sampler.chain.reshape((-1, ndim))
print('Number of posterior samples is {}'.format(samples.shape[0]))
#print best fit value together with errors
p1, p2, p3 = map(lambda v: (v[1], v[2]-v[1], v[1]-v[0]),
                 zip(*np.percentile(samples, [16, 50, 84], axis=0)))
print(p1, '\n', p2, '\n', p3)
fig = corner.corner(samples, labels=["$m$", "$b$", r"$\ln\,f$"],
                    truths=[m_true, b_true, np.log(f_true)])
t2 = time.time()
print('It took {:.3f} s'.format(t2 - t1))
plt.show()
The figure I get with corner is:
The important line is
sampler = PTSampler(ntemps, nwalkers, ndim, lnlike, lnprior, loglargs=(x, y, yerr), threads=4)
I have used threads=4 instead of Pool.
Look closely at the line print(p1, '\n', p2, '\n', p3); it prints the values you get for m, b and ln f (the median and the upper/lower errors):
(-1.277782877669762, 0.5745273177144817, 2.0813620981463297)
(4.800481378230051, 3.1747356851201163, 2.245189235990341)
(-0.9391847529845194, 1.1196053087321716, 3.6017609114364273)
For f, you need np.exp(-0.93918), which is 0.3909, reasonably close to 0.534. The values you get are close to the truth (-1.277 compared to -0.9594, and 4.8 compared to 4.294), and the errors are not bad (except for f). I mean, are you expecting to recover the exact numbers? With this method, on my computer, it takes 111 s to complete; is that normal?
Let's try something different. To be clear: the problem is not easy when f_true is added. I will use pymc3 (you don't need to know how to use pymc3; I just want to cross-check the results found by emcee).
import time
import numpy as np
import corner
import matplotlib.pyplot as plt
import pymc3 as pm
t1 = time.time()
np.random.seed(6)
# Choose the "true" parameters.
m_true = -0.9594
b_true = 4.294
f_true = 0.534
# Generate some synthetic data from the model.
N = 50
x = np.sort(10 * np.random.rand(N))
yerr = 0.1 + 0.5 * np.random.rand(N)
y_1 = m_true * x + b_true
y = np.abs(f_true * y_1) * np.random.randn(N) + y_1
y += yerr * np.random.randn(N)
plt.plot(x, y, 'o')
with pm.Model() as model:  # model specifications in PyMC3 are wrapped in a with-statement
    # Define priors
    f = pm.HalfCauchy('f', beta=5)
    m = pm.Normal('m', 0, sd=20)
    b = pm.Normal('b', 0, sd=20)
    mu2 = b + m * x
    sigma2 = yerr**2 + f**2 * (y_1)**2
    post = pm.Normal('y', mu=mu2, sd=pm.math.sqrt(sigma2), observed=y)

with model:
    trace = pm.sample(2000, tune=2000)
print(pm.summary(trace))
pm.traceplot(trace)
all_values = np.stack([trace.get_values('b'), trace.get_values('m'), trace.get_values('f')], axis=1)
fig2 = corner.corner(all_values, labels=["$b$", "$m$", "$f$"],
                     truths=[b_true, m_true, f_true])
t2 = time.time()
print('It took {:.3f} s'.format(t2 - t1))
plt.show()
The summary is
       mean        sd  mc_error   hpd_2.5  hpd_97.5        n_eff      Rhat
m -0.995545  0.067818  0.001174 -1.123187 -0.857653  2685.610018  1.000121
b  4.398158  0.332526  0.005585  3.767336  5.057909  2746.736563  1.000201
f  0.425442  0.063884  0.000904  0.311037  0.554446  4195.591204  1.000309
The important part is the mean column; you can see that the values found by pymc3 are close to the true values. The hpd_2.5 and hpd_97.5 columns give the uncertainty intervals for m, b and f. And it took 14 s.
The figure I get with corner is
You may say that the results of emcee are not that good, but if you really want more accuracy, you have to modify this function:
def lnprior(theta):
    m, b, lnf = theta
    if -5.0 < m < 0.5 and 0.0 < b < 10.0 and -10.0 < lnf < 1.0:
        return 0.0
    return -np.inf
The famous prior. In this case it is flat, and such a wide, uninformative range does little to constrain the parameters; a narrower or more informative prior is one place to gain accuracy.
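For example, a minimal sketch (not from the original answer) of a more informative prior, using Gaussian priors centred on rough guesses for m and b; the centres and widths here are made up and would need tuning:

def lnprior(theta):
    m, b, lnf = theta
    if not (-10.0 < lnf < 1.0):
        return -np.inf
    # Gaussian log-priors (up to constants); hypothetical means and widths
    lp = -0.5 * ((m + 1.0) / 0.5) ** 2   # prior belief: m near -1
    lp += -0.5 * ((b - 4.0) / 1.0) ** 2  # prior belief: b near 4
    return lp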

Solving ODE with solve_ivp gets incredibly slow or freezes completely

I use solve_ivp to solve an ODE:
import numpy as np
from scipy.integrate import solve_ivp

def test_ode(t, y):
    dydt = C - y + (y ** 8 / (1 + y ** 8))
    return dydt

steady_state = []
for C in np.linspace(0, 1, 1001):
    sol = solve_ivp(test_ode, [0, 1e06], [0], method='BDF')
    steady_state.append(sol.y[0][-1])
This gives me a RuntimeWarning:
ETA: --:--:--/anaconda3/lib/python3.6/site-packages/scipy/integrate/_ivp/bdf.py:418:
RuntimeWarning: divide by zero encountered in power
factors = error_norms ** (-1 / np.arange(order, order + 3))
But even worse, the run basically freezes (or at least gets incredibly slow). Replacing the initial value [0] by [1e-08] does not solve the problem. How can I fix this?
You can use NumbaLSODA: https://github.com/Nicholaswogan/NumbaLSODA . It's like solve_ivp, but all the code can be compiled, so it is very fast:
from NumbaLSODA import lsoda_sig, lsoda
import numpy as np
import numba as nb
import time

@nb.cfunc(lsoda_sig)
def test_ode(t, y_, dydt, p):
    y = y_[0]
    C = p[0]
    dydt[0] = C - y + (y ** 8 / (1 + y ** 8))

funcptr = test_ode.address

@nb.njit()
def main():
    steady_state = np.empty((1001,), np.float64)
    CC = np.linspace(0, 1, 1001)
    y0 = np.array([0.0])
    for i in range(len(CC)):
        data = np.array([CC[i]], np.float64)
        t_eval = np.array([0.0, 1.0e6])
        sol, success = lsoda(funcptr, y0, t_eval, data=data)
        steady_state[i] = sol[-1, 0]
    return steady_state

main()  # first call triggers the numba compilation
start = time.time()
steady_state = main()
end = time.time()
print(end - start)
result is 0.013 seconds
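As a quick sanity check (not part of the original answer), the computed steady state can be plotted against C:

import matplotlib.pyplot as plt

CC = np.linspace(0, 1, 1001)
plt.plot(CC, steady_state)        # steady_state as returned by main() above
plt.xlabel("C")
plt.ylabel("steady-state y")
plt.show()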

Python: Gibbs sampler for regression model

I am trying to write a function for a Gibbs sampler in the Bayesian framework. I got the code from this website, which implements a straightforward regression model. However, I am tackling a more complicated model: y = beta0 + beta1*x + x^gamma * sigma * epsilon, where sigma is the variance of the model. That means I need to estimate p(beta0 | y, x, beta1, sigma, gamma) and so on (in the Gibbs sampler method). My question is how I should modify the code to sample beta0, beta1 and the other variables, given that there are extra variables to condition on.
My code is:
import numpy as np
import pymc as pm
import seaborn as sns
import matplotlib.pyplot as plt
import pandas as pd
plt.rcParams['figure.figsize'] = (10, 5)
# conda install -c conda-forge pymc3=3.0   (shell command; run in a terminal, not inside the script)
def sample_beta_0(y, x, beta_1, sigma, gamma, mu_0, tau_0):
    N = len(y)
    assert len(x) == N
    tau_i = 1/((x**gamma)*sigma)**2
    precision = tau_0 + sum(tau_i)
    mean = tau_0 * mu_0 + np.sum((y - beta_1 * x)*tau_i)
    mean /= precision
    return np.random.normal(mean, 1 / np.sqrt(precision))

def sample_beta_1(y, x, beta_0, sigma, mu_1, sigma_1):
    N = len(y)
    assert len(x) == N
    precision = sigma_1 + sigma * np.sum(x * x)
    mean = sigma_1 * mu_1 + sigma * np.sum((y - beta_0) * x)
    mean /= precision
    return np.random.normal(mean, 1 / np.sqrt(precision))

def sample_sigma(y, x, beta_0, beta_1, alpha, beta):
    N = len(y)
    alpha_new = alpha + N / 2
    resid = y - beta_0 - beta_1 * x
    beta_new = beta + np.sum(resid * resid) / 2
    return np.random.gamma(alpha_new, 1 / beta_new)
beta_0_true = -1
beta_1_true = 2
sigma_true = 1
N = 50
x = np.random.uniform(low=0, high=4, size=N)
y = np.random.normal(beta_0_true + beta_1_true * x, 1 / np.sqrt(sigma_true))
synth_plot = plt.plot(x, y, "o")
plt.xlabel("x")
plt.ylabel("y")
# print('Y are', y)
# print('X are', x)
plt.show()
"""GIBBS Sampler"""
# specify initial values
init = {"beta_0": 0,
"beta_1": 0,
"sigma": 2}
# specify hyper parameters
hypers = {"mu_0": 0,
"sigma_0": 1,
"mu_1": 0,
"sigma_1": 1,
"alpha": 2,
"beta": 1}
def gibbs(y, x, iters, init, hypers):
    assert len(y) == len(x)
    beta_0 = init["beta_0"]
    beta_1 = init["beta_1"]
    sigma = init["sigma"]
    trace = np.zeros((iters, 3))  # trace to store values of beta_0, beta_1, sigma
    for it in range(iters):
        beta_0 = sample_beta_0(y, x, beta_1, sigma, hypers["mu_0"], hypers["sigma_0"])
        beta_1 = sample_beta_1(y, x, beta_0, sigma, hypers["mu_1"], hypers["sigma_1"])
        sigma = sample_sigma(y, x, beta_0, beta_1, hypers["alpha"], hypers["beta"])
        trace[it, :] = np.array((beta_0, beta_1, sigma))
    trace = pd.DataFrame(trace)
    trace.columns = ['beta_0', 'beta_1', 'sigma']
    print(trace)
    return trace
iters = 1000
trace = gibbs(y, x, iters, init, hypers)
traceplot = trace.plot()
traceplot.set_xlabel("Iteration")
traceplot.set_ylabel("Parameter value")
trace_burnt = trace[500:999]
hist_plot = trace_burnt.hist(bins = 30, layout = (1,3))
print(trace_burnt.median())
print(trace_burnt.std())
I know this is really long, but please help!
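One standard way to handle the extra parameter, sketched here rather than taken from an answer: gamma has no conjugate conditional, so it can be updated with a Metropolis-within-Gibbs step while the other parameters keep their Gibbs updates. The sketch below assumes sigma is the noise standard deviation in y = beta0 + beta1*x + x**gamma * sigma * epsilon, a flat prior on gamma, and a made-up proposal width:

def log_lik(y, x, beta_0, beta_1, sigma, gamma):
    # log-likelihood under y_i ~ Normal(beta_0 + beta_1*x_i, (x_i**gamma * sigma)**2)
    var = (x ** gamma * sigma) ** 2
    resid = y - beta_0 - beta_1 * x
    return -0.5 * np.sum(resid ** 2 / var + np.log(var))

def sample_gamma(y, x, beta_0, beta_1, sigma, gamma, step=0.1):
    # random-walk Metropolis step for gamma (flat prior assumed)
    prop = gamma + step * np.random.randn()
    log_ratio = (log_lik(y, x, beta_0, beta_1, sigma, prop)
                 - log_lik(y, x, beta_0, beta_1, sigma, gamma))
    if np.log(np.random.rand()) < log_ratio:
        return prop
    return gamma

Inside the gibbs loop, gamma would then be updated with gamma = sample_gamma(y, x, beta_0, beta_1, sigma, gamma) and passed into the other conditionals, which also need to account for the x**gamma weighting (as sample_beta_0 above already starts to do).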
