Random number generators in C++ and Python

I have programmed a model in both C++ and Python. The model has a noisy-input component, implemented in C++ as:
double doubleRand() {
    thread_local std::mt19937 generator(std::random_device{}());
    std::normal_distribution<double> distribution(0.0, 1.0);
    return distribution(generator);
}
and in Python as:
Inoise = (np.random.normal(0, 1) * knoise * np.sqrt(gNa * A))
IIon = ((iNa + iK + iL) * A) + Inoise
# Compute change of voltage
v[i + 1] = (vT + ((-IIon + IStim) / C) * dt)[0]
The following is very strange: if I omit the noisy component (Inoise = 0), both models (C++ as well as Python) give exactly the same result. If I use only the noisy component (IStim = 0), both models again agree, showing natural fluctuations that hardly differ over 1000 runs. However, if I set IStim = 0.000001 and add the noise, the results differ by about 30%. How is that possible?
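One way to sanity-check the noise term in isolation (a sketch of mine, not from the original post, using the knoise, gNa and membrane area A values from the full code below; the two generators can only ever be compared distributionally, never sample-by-sample):

import numpy as np

knoise, gNa, A = 0.0005, 1200.0, 1.0e-8   # values as used by HH_model(I, 1)
samples = np.random.normal(0, 1, 1_000_000) * knoise * np.sqrt(gNa * A)
print(samples.mean())                               # should be ~0
print(samples.std(), knoise * np.sqrt(gNa * A))     # both ~1.73e-6

If the C++ samples (written to a file) show the same mean and standard deviation, the generators themselves are equivalent and any discrepancy must come from the surrounding model code.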
Here is the full code. C++:
#include <math.h>
#include <iostream>
#include <random>
#include <vector>
#include <algorithm>
#include <fstream>
#include <omp.h>
#include <iomanip>
#include <assert.h>

// parameters
constexpr double v_Rest = -65.0;
constexpr double gNa = 1200.0;
constexpr double gK = 360.0;
constexpr double gL = 3.0;
constexpr double vNa = 115.0;
constexpr double vK = -12.0;
constexpr double vL = 10.6;
constexpr double c = 1.0;
constexpr double knoise = 0.0005;

bool print = false;
bool bisection = false;
bool test = true;

// stepsize PFs
constexpr int steps = 5;
double store[steps];
int prob[steps];
double step[steps];

// time constants
constexpr double t_end = 1.0;
constexpr double delay = 0.1;
constexpr double duration = 0.1;
constexpr double dt = 0.0025;
constexpr int t_steps = t_end/dt;
constexpr int runs = 1000;

double voltage[t_steps];

double doubleRand() {
    thread_local std::mt19937 engine(std::random_device{}());
    std::normal_distribution<double> distribution(0.0, 1.0);
    return distribution(engine);
}

double alphaM(const double v){ return 12.0 * ((2.5 - 0.1 * (v)) / (exp(2.5 - 0.1 * (v)) - 1.0)); }
double betaM(const double v){ return 12.0 * (4.0 * exp(-(v) / 18.0)); }
double betaH(const double v){ return 12.0 * (1.0 / (exp(3.0 - 0.1 * (v)) + 1.0)); }
double alphaH(const double v){ return 12.0 * (0.07 * exp(-(v) / 20.0)); }
double alphaN(const double v){ return 12.0 * ((1.0 - 0.1 * (v)) / (10.0 * (exp(1.0 - 0.1 * (v)) - 1.0))); }
double betaN(const double v){ return 12.0 * (0.125 * exp(-(v) / 80.0)); }

double HH_model(const double I, const double area_factor){
    const double A = 1.0e-8 * area_factor;
    const double C = c*A;
    const double v0 = 0.0;
    const double m0 = alphaM(v0)/(alphaM(v0)+betaM(v0));
    const double h0 = alphaH(v0)/(alphaH(v0)+betaH(v0));
    const double n0 = alphaN(v0)/(alphaN(v0)+betaN(v0));
    int count = 0;
    for(int j=0; j<runs; j++){
        double vT = v0;
        double mT = m0;
        double hT = h0;
        double nT = n0;
        for(int i=0; i<t_steps; i++){
            double IStim = 0.0;
            if ((delay / dt <= (double)i) && ((double)i <= (delay + duration) / dt))
                IStim = I;
            mT = (mT + dt * alphaM(vT)) / (1.0 + dt * (alphaM(vT) + betaM(vT)));
            hT = (hT + dt * alphaH(vT)) / (1.0 + dt * (alphaH(vT) + betaH(vT)));
            nT = (nT + dt * alphaN(vT)) / (1.0 + dt * (alphaN(vT) + betaN(vT)));
            const double iNa = gNa * pow(mT, 3.0) * hT * (vT - vNa);
            const double iK = gK * pow(nT, 4.0) * (vT - vK);
            const double iL = gL * (vT - vL);
            const double Inoise = (doubleRand() * knoise * sqrt(gNa * A));
            const double IIon = ((iNa + iK + iL) * A) + Inoise;
            vT += ((-IIon + IStim) / C) * dt;
            voltage[i] = vT;
            if(vT > 60.0) {
                count++;
                break;
            }
        }
    }
    return count;
}

int main(){
    std::cout << HH_model(1.0e-6, 1) << std::endl;
}
Python:
import matplotlib.pyplot as py
import numpy as np
import scipy.optimize as optimize
from tqdm import tqdm
import warnings
warnings.simplefilter(action='ignore', category=FutureWarning)

# HH parameters
v_Rest = -65    # in mV
gNa = 1200      # in mS/cm^2
gK = 360        # in mS/cm^2
gL = 0.3*10     # in mS/cm^2
vNa = 115       # in mV
vK = -12        # in mV
vL = 10.6       # in mV

# Number of runs
runs = 1000

c = 1           # in uF/cm^2

def alphaM(v): return 12 * ((2.5 - 0.1 * (v)) / (np.exp(2.5 - 0.1 * (v)) - 1))
def betaM(v): return 12 * (4 * np.exp(-(v) / 18))
def betaH(v): return 12 * (1 / (np.exp(3 - 0.1 * (v)) + 1))
def alphaH(v): return 12 * (0.07 * np.exp(-(v) / 20))
def alphaN(v): return 12 * ((1 - 0.1 * (v)) / (10 * (np.exp(1 - 0.1 * (v)) - 1)))
def betaN(v): return 12 * (0.125 * np.exp(-(v) / 80))

def HH_model(I, area_factor):
    count = 0
    t_end = 1       # in ms
    delay = 0.1     # in ms
    duration = 0.1  # in ms
    dt = 0.0025     # in ms
    A = 1.0e-8 * area_factor  # membrane area in cm^2, as in the C++ version
    C = c*A         # uF
    for j in tqdm(range(0, runs), total=runs):
        # Introduction of equations and channels
        # compute the timesteps
        t_steps = t_end/dt + 1
        # Compute the initial values
        v0 = 0
        m0 = alphaM(v0)/(alphaM(v0)+betaM(v0))
        h0 = alphaH(v0)/(alphaH(v0)+betaH(v0))
        n0 = alphaN(v0)/(alphaN(v0)+betaN(v0))
        # Allocate memory for v, m, h, n
        v = np.zeros((int(t_steps), 1))
        m = np.zeros((int(t_steps), 1))
        h = np.zeros((int(t_steps), 1))
        n = np.zeros((int(t_steps), 1))
        # Set initial values
        v[:, 0] = v0
        m[:, 0] = m0
        h[:, 0] = h0
        n[:, 0] = n0
        ### Noise component
        knoise = 0.0005  # uA/(mS)^1/2
        ### --------- Step 3: SOLVE
        for i in range(0, int(t_steps)-1, 1):
            # Get current states
            vT = v[i]
            mT = m[i]
            hT = h[i]
            nT = n[i]
            # Stimulus current
            IStim = 0
            if delay / dt <= i <= (delay + duration) / dt:
                IStim = I  # in uA
            else:
                IStim = 0
                # Compute change of m, h and n
                m[i + 1] = (mT + dt * alphaM(vT)) / (1 + dt * (alphaM(vT) + betaM(vT)))
                h[i + 1] = (hT + dt * alphaH(vT)) / (1 + dt * (alphaH(vT) + betaH(vT)))
                n[i + 1] = (nT + dt * alphaN(vT)) / (1 + dt * (alphaN(vT) + betaN(vT)))
            # Ionic currents
            iNa = gNa * m[i + 1] ** 3. * h[i + 1] * (vT - vNa)
            iK = gK * n[i + 1] ** 4. * (vT - vK)
            iL = gL * (vT - vL)
            Inoise = (np.random.normal(0, 1) * knoise * np.sqrt(gNa * A))
            IIon = ((iNa + iK + iL) * A) + Inoise
            # Compute change of voltage
            v[i + 1] = (vT + ((-IIon + IStim) / C) * dt)[0]  # in ((uA/cm^2) / (uF/cm^2)) * ms == mV
        # adjust the voltage to the resting potential
        v = v + v_Rest
        # test if there was a spike
        if max(v[:] - v_Rest) > 60:
            count += 1
    return count

You've messed up the indents in the Python code. These lines

m[i + 1] = (mT + dt * alphaM(vT)) / (1 + dt * (alphaM(vT) + betaM(vT)))
h[i + 1] = (hT + dt * alphaH(vT)) / (1 + dt * (alphaH(vT) + betaH(vT)))
n[i + 1] = (nT + dt * alphaN(vT)) / (1 + dt * (alphaN(vT) + betaN(vT)))

do not execute when the condition delay / dt <= i <= (delay + duration) / dt is True, because they sit inside the else: branch.
After the indentation is fixed, the Python code produces 866, which nearly matches 876, the result of the C++ code.
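A minimal sketch of the fix (the gating updates moved out of the else: branch so they run on every timestep):

IStim = I if delay / dt <= i <= (delay + duration) / dt else 0
# gating variables must advance on every timestep, stimulus or not
m[i + 1] = (mT + dt * alphaM(vT)) / (1 + dt * (alphaM(vT) + betaM(vT)))
h[i + 1] = (hT + dt * alphaH(vT)) / (1 + dt * (alphaH(vT) + betaH(vT)))
n[i + 1] = (nT + dt * alphaN(vT)) / (1 + dt * (alphaN(vT) + betaN(vT)))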

Related

How do I convert a C program to Python

I have been trying to import C code into a Python file, but it's not working, so I've decided to convert the C program to Python so that importing the function will be easier.
The C code I want to convert is given below. (I got this from GitHub.)
#include "Python.h"
#include "numpy/arrayobject.h"
#include <math.h>
# define CUBE(x) ((x) * (x) * (x))
# define SQR(x) ((x) * (x))
static PyObject *interp3_tricubic(PyObject *self, PyObject *args);
float TriCubic (float px, float py, float pz, float *volume, int xDim, int yDim, int zDim);
// what function are exported
static PyMethodDef tricubicmethods[] = {
{"_interp3_tricubic", interp3_tricubic, METH_VARARGS},
{NULL, NULL}
};
// This function is essential for an extension for Numpy created in C
void inittricubic() {
(void) Py_InitModule("tricubic", tricubicmethods);
import_array();
}
// the data should be FLOAT32 and should be ensured in the wrapper
static PyObject *interp3_tricubic(PyObject *self, PyObject *args)
{
PyArrayObject *volume, *result, *C, *R, *S;
float *pr, *pc, *ps;
float *pvol, *pvc;
int xdim, ydim, zdim;
// We expect 4 arguments of the PyArray_Type
if(!PyArg_ParseTuple(args, "O!O!O!O!",
&PyArray_Type, &volume,
&PyArray_Type, &C,
&PyArray_Type, &R,
&PyArray_Type, &S)) return NULL;
if ( NULL == volume ) return NULL;
if ( NULL == C ) return NULL;
if ( NULL == R ) return NULL;
if ( NULL == S ) return NULL;
// result matrix is the same size as C and is float
result = (PyArrayObject*) PyArray_ZEROS(PyArray_NDIM(C), C->dimensions, NPY_FLOAT, 0);
// This is for reference counting ( I think )
PyArray_FLAGS(result) |= NPY_OWNDATA;
// massive use of iterators to progress through the data
PyArrayIterObject *itr_v, *itr_r, *itr_c, *itr_s;
itr_v = (PyArrayIterObject *) PyArray_IterNew(result);
itr_r = (PyArrayIterObject *) PyArray_IterNew(R);
itr_c = (PyArrayIterObject *) PyArray_IterNew(C);
itr_s = (PyArrayIterObject *) PyArray_IterNew(S);
pvol = (float *)PyArray_DATA(volume);
xdim = PyArray_DIM(volume, 0);
ydim = PyArray_DIM(volume, 1);
zdim = PyArray_DIM(volume, 2);
while(PyArray_ITER_NOTDONE(itr_v)) {
pvc = (float *) PyArray_ITER_DATA(itr_v);
pr = (float *) PyArray_ITE R_DATA(itr_r);
pc = (float *) PyArray_ITER_DATA(itr_c);
ps = (float *) PyArray_ITER_DATA(itr_s);
*pvc = TriCubic(*pc, *pr, *ps, pvol, xdim, ydim, zdim);
PyArray_ITER_NEXT(itr_v);
PyArray_ITER_NEXT(itr_r);
PyArray_ITER_NEXT(itr_c);
PyArray_ITER_NEXT(itr_s);
}
return result;
}
/*
* TriCubic - tri-cubic interpolation at point, p.
* inputs:
* px, py, pz - the interpolation point.
* volume - a pointer to the float volume data, stored in x,
* y, then z order (x index increasing fastest).
* xDim, yDim, zDim - dimensions of the array of volume data.
* returns:
* the interpolated value at p.
* note:
* rudimentary range checking is done in this function.
*/
float TriCubic (float px, float py, float pz, float *volume, int xDim, int yDim, int zDim)
{
int x, y, z;
int i, j, k;
float dx, dy, dz;
float *pv;
float u[4], v[4], w[4];
float r[4], q[4];
float vox = 0;
int xyDim;
xyDim = xDim * yDim;
x = (int) px, y = (int) py, z = (int) pz;
// necessary evil truncating at dim-2 because tricubic needs 2 more values
// which is criminal near edges
// future work includes doing trilinear for edge cases
// range checking is extremely important here
if (x < 3 || x > xDim-3 || y < 3 || y > yDim-3 || z < 3 || z > zDim-3)
return (0);
dx = px - (float) x, dy = py - (float) y, dz = pz - (float) z;
pv = volume + (x - 1) + (y - 1) * xDim + (z - 1) * xyDim;
/* factors for Catmull-Rom interpolation */
u[0] = -0.5 * CUBE (dx) + SQR (dx) - 0.5 * dx;
u[1] = 1.5 * CUBE (dx) - 2.5 * SQR (dx) + 1;
u[2] = -1.5 * CUBE (dx) + 2 * SQR (dx) + 0.5 * dx;
u[3] = 0.5 * CUBE (dx) - 0.5 * SQR (dx);
v[0] = -0.5 * CUBE (dy) + SQR (dy) - 0.5 * dy;
v[1] = 1.5 * CUBE (dy) - 2.5 * SQR (dy) + 1;
v[2] = -1.5 * CUBE (dy) + 2 * SQR (dy) + 0.5 * dy;
v[3] = 0.5 * CUBE (dy) - 0.5 * SQR (dy);
w[0] = -0.5 * CUBE (dz) + SQR (dz) - 0.5 * dz;
w[1] = 1.5 * CUBE (dz) - 2.5 * SQR (dz) + 1;
w[2] = -1.5 * CUBE (dz) + 2 * SQR (dz) + 0.5 * dz;
w[3] = 0.5 * CUBE (dz) - 0.5 * SQR (dz);
for (k = 0; k < 4; k++)
{
q[k] = 0;
for (j = 0; j < 4; j++)
{
r[j] = 0;
for (i = 0; i < 4; i++)
{
r[j] += u[i] * *pv;
pv++;
}
q[k] += v[j] * r[j];
pv += xDim - 4;
}
vox += w[k] * q[k];
pv += xyDim - 4 * xDim;
}
return vox;
}
I have tried to convert this code to Python, but the output I get is wrong. My Python attempt is below.
import numpy as N
import math
import scipy

global result

def interp3_tricubic(volume, C, R, S):
    if volume is None:
        result = 0
    elif C is None:
        result = 0
    elif R is None:
        result = 0
    elif S is None:
        result = 0
    else:
        result = N.zeros(len(C), dtype=('float'))
        tri_v = N.array(volume, dtype=("float"))
        tri_r = N.array(R, dtype=("float"))
        tri_c = N.array(C, dtype=("float"))
        tri_s = N.array(S, dtype=("float"))
        tri_vol = N.array(volume, dtype=("float"))
        xDim = volume.shape[0]
        yDim = volume.shape[1]
        zDim = volume.shape[2]
        for i in range(len(C)):
            tri_v = TriCubic(tri_c[i], tri_r[i], tri_s[i], volume, xDim, yDim, zDim)
            i = i + 1
            # print(tri_v, "tri_v")
    return tri_v

def TriCubic(px, py, pz, volume, xDim, yDim, zDim):
    xyDim = xDim * yDim
    x = px.astype(int)
    y = py.astype(int)
    z = pz.astype(int)
    dx = px - x
    dy = py - y
    dz = pz - z
    pv = volume + (x - 1) + (y - 1) * xDim + (z - 1) * xyDim

    def cube(num):
        return num * num * num

    def sqrt(num):  # note: this squares its argument, mirroring the C SQR macro
        return num * num

    u = N.array([0, 0, 0, 0], dtype=('float'))
    v = N.array([0, 0, 0, 0], dtype=('float'))
    w = N.array([0, 0, 0, 0], dtype=('float'))
    vox = N.zeros_like(volume, dtype=('float'))
    u[0] = -0.5 * cube(dx) + sqrt(dx) - 0.5 * dx
    u[1] = 1.5 * cube(dx) - 2.5 * sqrt(dx) + 1
    u[2] = -1.5 * cube(dx) + 2 * sqrt(dx) + 0.5 * dx
    u[3] = 0.5 * cube(dx) - 0.5 * sqrt(dx)
    v[0] = -0.5 * cube(dy) + sqrt(dy) - 0.5 * dy
    v[1] = 1.5 * cube(dy) - 2.5 * sqrt(dy) + 1
    v[2] = -1.5 * cube(dy) + 2 * sqrt(dy) + 0.5 * dy
    v[3] = 0.5 * cube(dy) - 0.5 * sqrt(dy)
    w[0] = -0.5 * cube(dz) + sqrt(dz) - 0.5 * dz
    w[1] = 1.5 * cube(dz) - 2.5 * sqrt(dz) + 1
    w[2] = -1.5 * cube(dz) + 2 * sqrt(dz) + 0.5 * dz
    w[3] = 0.5 * cube(dz) - 0.5 * sqrt(dz)
    k = 0
    j = 0
    i = 0
    q = [0, 0, 0, 0]
    r = [0, 0, 0, 0]
    for k in range(4):
        for j in range(4):
            for i in range(4):
                r[j] += u[i] * pv[i]
                i = i + 1
            q[k] += v[j] * r[j]
            pv += xDim - 4
            j = j + 1
        vox += w[k] * q[k]
        pv += xyDim - 4 * xDim
        k = k + 1
    return vox
I am confused about the meaning of some lines, like these:

static PyObject *interp3_tricubic(PyObject *self, PyObject *args);
itr_v = (PyArrayIterObject *) PyArray_IterNew(result);
r[j] += u[i] * *pv;
Please help me correct the code. I am stuck!
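For orientation, here is what those lines do, plus a sketch of a faithful NumPy translation of TriCubic (my own illustration, not a tested replacement): the forward declaration merely announces interp3_tricubic before its definition; PyArray_IterNew creates an iterator that visits an array element by element; and r[j] += u[i] * *pv accumulates a dot product along x while the pointer pv walks through the volume. The sketch assumes volume is a 3-D float array laid out [z, y, x] so that x varies fastest, matching the C memory order:

import numpy as np

def tricubic(px, py, pz, volume):
    zdim, ydim, xdim = volume.shape
    x, y, z = int(px), int(py), int(pz)
    # same rudimentary range check as the C code
    if x < 3 or x > xdim - 3 or y < 3 or y > ydim - 3 or z < 3 or z > zdim - 3:
        return 0.0
    dx, dy, dz = px - x, py - y, pz - z

    def catmull_rom(d):
        # the u/v/w factor vectors of the C code
        return np.array([-0.5*d**3 + d**2 - 0.5*d,
                          1.5*d**3 - 2.5*d**2 + 1.0,
                         -1.5*d**3 + 2.0*d**2 + 0.5*d,
                          0.5*d**3 - 0.5*d**2])

    u, v, w = catmull_rom(dx), catmull_rom(dy), catmull_rom(dz)
    # the pointer walk (pv++, pv += xDim - 4, pv += xyDim - 4*xDim)
    # visits exactly this 4x4x4 neighbourhood around the point
    block = volume[z-1:z+3, y-1:y+3, x-1:x+3]
    # r[j] += u[i] * *pv is the innermost dot product along x;
    # the j and k loops then weight by v and w, which einsum does in one step
    return float(np.einsum('k,j,i,kji->', w, v, u, block))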

Can Cython make things slower?

def htc_gnielinski_calc(
    self, MFR_ref_in, rho_ref_in, mu_ref_in, cp_ref_in, k_ref_in
):
    """
    Gnielinski Refrigerant Heat transfer model: Calculate HTC
    """
    V_ref_in = MFR_ref_in / (rho_ref_in * pi * self.ID ** 2 / 4)
    Re = (rho_ref_in * V_ref_in * self.ID) / mu_ref_in
    if Re < 3000:
        HTC_Gnielinski = (k_ref_in / self.ID) * 3.66
        return HTC_Gnielinski
    f = (1.58 * log(Re) - 3.28) ** (-2)
    Pr_ref = mu_ref_in * cp_ref_in / k_ref_in
    HTC_Gnielinski = (
        (k_ref_in / self.ID)
        * ((f / 2) * (Re - 1000) * Pr_ref)
        / (1 + 12.7 * (f / 2) ** 0.5 * (Pr_ref ** (2 / 3) - 1))
    )
    return HTC_Gnielinski
This is an equation for the heat transfer coefficient of a refrigerant. I brought in Cython to make my project run faster, but it actually takes more time. Here's my Cython code:
cpdef htc_gnielinski_calc(double MFR_ref_in, double rho_ref_in, double mu_ref_in, double cp_ref_in, double k_ref_in, double ID):
    """
    Gnielinski Refrigerant Heat transfer model: Calculate HTC
    """
    cdef double V_ref_in = MFR_ref_in / (rho_ref_in * pi * ID ** 2 / 4)
    cdef double Re = (rho_ref_in * V_ref_in * ID) / mu_ref_in
    cdef double HTC_Gnielinski, f, Pr_ref
    if Re < 3000:
        HTC_Gnielinski = (k_ref_in / ID) * 3.66
        return HTC_Gnielinski
    f = (1.58 * log(Re) - 3.28) ** (-2)
    Pr_ref = mu_ref_in * cp_ref_in / k_ref_in
    HTC_Gnielinski = (
        (k_ref_in / ID)
        * ((f / 2) * (Re - 1000) * Pr_ref)
        / (1 + 12.7 * (f / 2) ** 0.5 * (Pr_ref ** (2 / 3) - 1))
    )
    return HTC_Gnielinski
However, the code below (a different equation) does make my project run faster:
cpdef htc_shah_cond_calc(
    double MFR_ref_in,
    double P_ref_in,
    double x_ref_in,
    double mu_ref_l_in,
    double mu_ref_g_in,
    double cp_ref_l_in,
    double k_ref_l_in,
    double rho_ref_l_in,
    double rho_ref_g_in,
    double ID,
    double P_critical_ref
):
    "Two-phase HTC"
    cdef int theta = 0
    cdef double G = MFR_ref_in / ((pi * ID ** 2) / 4)
    cdef double P_r = P_ref_in / P_critical_ref
    cdef double Z = (1 / x_ref_in - 1) ** 0.8 * P_r ** 0.4
    cdef double Re_LS = (G * (1 - x_ref_in) * ID) / mu_ref_l_in  # Reynolds number assuming liquid phase flowing alone
    cdef double Pr_l = mu_ref_l_in * cp_ref_l_in / k_ref_l_in
    cdef double h_LS = 0.023 * Re_LS ** 0.8 * Pr_l ** 0.4 * (k_ref_l_in / ID)
    cdef double h_I = (
        h_LS * (1 + 3.8 / (Z ** 0.95)) * (mu_ref_l_in / (14 * mu_ref_g_in)) ** (0.0058 + 0.557 * P_r)
    )
    cdef double h_Nu = (
        1.32 * Re_LS ** (-1 / 3)
        * (rho_ref_l_in * (rho_ref_l_in - rho_ref_g_in) * 9.80665 * k_ref_l_in ** 3 / (mu_ref_l_in ** 2)) ** (1 / 3)
    )
    cdef double J_g = (x_ref_in * G) / (
        9.80665 * ID * rho_ref_g_in * (rho_ref_l_in - rho_ref_g_in)
    ) ** 0.5
    cdef double HTC_Shah_Cond
    if theta == 0:
        if J_g >= (0.98 * (Z + 0.263) ** (-0.62)):
            HTC_Shah_Cond = h_I  # Regime 1 in horizontal tube
        elif J_g <= (0.95 * (1.254 + 2.27 * Z ** (1.249)) ** (-1)):
            HTC_Shah_Cond = h_Nu  # Regime 3 in horizontal tube
        elif J_g > (0.95 * (1.254 + 2.27 * Z ** (1.249)) ** (-1)) and J_g < (
            0.98 * (Z + 0.263) ** (-0.62)
        ):
            HTC_Shah_Cond = h_I + h_Nu  # Regime 2 in horizontal tube
    elif theta == 90:
        if J_g >= (1 / (2.4 * Z + 0.73)):
            HTC_Shah_Cond = h_I  # Regime 1 in vertical tube
        elif J_g <= (0.89 - 0.93 * exp(-0.087 * Z ** (-1.17))):
            HTC_Shah_Cond = h_Nu  # Regime 3 in vertical tube
        elif J_g > (0.89 - 0.93 * exp(-0.087 * Z ** (-1.17))) and J_g < (
            1 / (2.4 * Z + 0.73)
        ):
            HTC_Shah_Cond = h_I + h_Nu  # Regime 2 in vertical tube
    return HTC_Shah_Cond
What explains the difference? Are there any criteria for deciding when Cython will help during optimization?
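One practical criterion: for a function this small, the cost of crossing the Python/C boundary on every call (argument unboxing, packing the return value) can exceed the arithmetic itself, so Cython only pays off once the function is bigger or is called from other compiled code. A measurement sketch; htc_py and htc_cy are hypothetical names for the pure-Python and compiled versions, and the argument values are placeholders:

import timeit

setup = "from htc_models import htc_py, htc_cy"   # hypothetical module
call = "(0.01, 1200.0, 1.5e-4, 1400.0, 0.09, 0.008)"
for name in ("htc_py", "htc_cy"):
    t = timeit.timeit(name + call, setup=setup, number=100_000)
    print(name, t / 100_000, "seconds per call")

If the compiled version only wins when invoked from other Cython code, call overhead is the explanation; a larger function like htc_shah_cond_calc amortizes that overhead over much more work, which is consistent with it getting faster.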

Epitrocoid in Python is not giving a proper plot

I wrote the following code in C#:
using System;
using System.Drawing;
using ZedGraph;

namespace _1_12_Epitrocoid
{
    class Epitrocoid
    {
        double A = 1.0;
        double a = 0.4;
        double λ = 1.4;

        public double X(double ϕ)
        {
            return (A + a) * Math.Cos(ϕ) - λ * a * Math.Cos(((A + a) / a) * ϕ);
        }

        public double Y(double ϕ)
        {
            return (A + a) * Math.Sin(ϕ) - λ * a * Math.Sin(((A + a) / a) * ϕ);
        }
    }

    class Program
    {
        static void Main(string[] args)
        {
            Epitrocoid e = new Epitrocoid();
            PointPairList list = new PointPairList();
            for (double ϕ = 0; ϕ < 10; ϕ += 0.01)
            {
                double x = e.X(ϕ);
                double y = e.Y(ϕ);
                list.Add(x, y);
            }
            PlotForm f = new PlotForm("Epitrocoid");
            f.Add(list, "Epitrocoid", Color.Black);
            f.AxisChange();
            f.ShowDialog();
            Console.ReadLine();
        }
    }
}
I converted this source code to Python as follows:
import math
import matplotlib.pyplot as plt

A = 1.0
a = 0.4
λ = 1.4

def X(ϕ):
    return (A + a) * math.cos(ϕ) - λ * a * math.cos(((A + a) / a) * ϕ)

def Y(ϕ):
    return (A + a) * math.sin(ϕ) - λ * a * math.sin(((A + a) / a) * ϕ)

x_list = []
y_list = []
for i in range(0, 1001, 1):
    ϕ = i / 1000.0
    x_list.append(X(ϕ))
    y_list.append(Y(ϕ))

print(len(x_list))
print(len(y_list))
plt.plot(x_list, y_list)
Can someone tell me what is going wrong here?
You didn't construct your iterator the same in Python (up to phi = 1) as you did in C# (up to phi = 10).
Using the right phi gets you there with Python. Also, using numpy simplifies things quite a bit.
import numpy
from matplotlib import pyplot

A = 1.0
a = 0.4
λ = 1.4

def X(ϕ):
    return (A + a) * numpy.cos(ϕ) - λ * a * numpy.cos(((A + a) / a) * ϕ)

def Y(ϕ):
    return (A + a) * numpy.sin(ϕ) - λ * a * numpy.sin(((A + a) / a) * ϕ)

ϕ = numpy.arange(0, 10, 0.01)
x = X(ϕ)
y = Y(ϕ)

fig, ax = pyplot.subplots()
ax.plot(x, y)
ax.set_aspect('equal')
pyplot.show()  # needed outside interactive sessions

Understanding timesteps in scipy.integrate.odeint

I am trying to solve a PDE using odeint and the method of lines. My code is definitely wrong, and I'm trying to figure out where.
I am calling the ODE solver using odeint(odefunc, y0, tspan), where tspan = np.linspace(0.0, 0.5, 5) and y0 = 1.0*np.ones(3).
I tried printing t within odefunc and am confused by the output. Despite the fact that I am solving up to t = 0.5, the last t-value to print is 0.015081203121127767. The number of outputs matches tspan, but I cannot see how it could possibly be solving up to t = 0.5 when the last time printed in the function is 0.015. What am I missing?
My DE is time dependent, which makes it very hard to figure out where things are going wrong, because I don't seem to see the times where everything fails.
ETA: this is failing, but running it without some of the irrelevant stuff I get the warning ODEintWarning: Excess work done on this call (perhaps wrong Dfun type). Run with full_output = 1 to get quantitative information. I'm assuming this is part of the issue, but it doesn't appear to halt the code.
MWE
import numpy as np
from scipy.integrate import odeint
import matplotlib.pyplot as plt
import math
import sys

plt.interactive(False)

sigma = 2320
rho = 1000
gravity = 9.81                # [m/s^2]
g = gravity*3600*3600         # [m/hour^2]
S = 0.01
settlingVelocity = 0.02       # [m/s]
ws = settlingVelocity*3600    # [m/hour]
n = 0.04                      # [SI]
J = 400                       # [Ws/m]
k = 0.02
Cstar = 0.2 * sigma           # [kg/m^3]
W = 2                         # [m]
D0 = 1.2
Lw = 20
L = 100

tend = 0.5                    # in hours
tspan = np.linspace(0.0, tend, 5)

def d(t):  # metres
    if t < 50:  # hours
        return 0.5
    else:
        return 0.05

def Q(t):
    return 3600 * (math.sqrt(S)/n)*((W*d(t))**(5/3))/((2*d(t) + W)**(2/3))

def h(t):
    return d(t)/2

def beta(t):
    return (sigma - rho) * g * h(t)/sigma

def Omega(t):
    return rho * g * S * Q(t)  # [W/m]

def PsiTime(t):
    return rho * g * Q(t) * (D0 - d(t))/(Lw)

N = 10
X = np.linspace(0, L, N)
delX = L / (N-1)

def odefunc(y, t):
    def zetaEh(t):
        return k * (PsiTime(t) + Omega(t)) / (J + beta(t))
    def zetaEW(t):
        return (2*d(t)/(W + 2*d(t))) * k * Omega(t)/(J + beta(t))
    def zetaR(t):
        return (W/(W + 2*d(t))) * k*Omega(t)/(beta(t))
    def zetaEF(t, i):
        return (W/(W + 2*d(t))) * k * Omega(t) / (J + beta(t))

    C = y[:N]
    M = y[N:]
    print("time: ", t)

    dCdt = np.zeros(X.shape)
    dMdt = np.zeros(X.shape)

    dCdt[0] = (  # forward difference for dCdx
        -Q(t) / (W*d(t)) * (C[1] - C[0]) / delX
        + (zetaEh(t) / (W * d(t))) * ((Cstar - C[0]) / Cstar)
        - (ws * C[0] * (beta(t))) / (d(t) * (J + beta(t)))
    )
    dMdt[0] = 0

    # gully channel
    for i in range(1, N-1):  # central difference
        if M[i] + W * C[i] * ws - zetaR(t) * (Cstar - C[i]) / Cstar < 0:
            reMass = M[i] + W * C[i] * ws
            dCdt[i] = (
                -Q(t) / (W*d(t)) * (C[i+1] - C[i - 1]) / (2*delX)
                + 1 / (W * d(t)) * ((zetaEW(t) + zetaEF(t, i)) * (Cstar - C[i]) / Cstar
                + reMass * (1 - (beta(t)) / (J + beta(t))))
                - C[i] * ws/d(t)
            )
            dMdt[i] = -M[i]
        else:
            dCdt[i] = (
                -Q(t) / (W*d(t)) * (C[i+1] - C[i - 1]) / (2*delX)
                + 1 / (W * d(t)) * (zetaEW(t) + zetaR(t)) * (Cstar - C[i]) / Cstar
                - C[i] * ws / d(t)
            )
            dMdt[i] = W * C[i] * ws - zetaR(t) * (Cstar - C[i]) / Cstar

    # Final node - backward difference
    if M[N-1] + W * C[N-1] * ws - zetaR(t) * (Cstar - C[N-1]) / Cstar < 0:
        reMass = M[N-1] + W * C[N-1] * ws
        dCdt[N-1] = (
            -Q(t) / (W * d(t)) * (C[N-1] - C[N-2]) / delX
            + 1 / (W * d(t)) * ((zetaEW(t) + zetaEF(t, i)) * (Cstar - C[N-1]) / Cstar
            + reMass * (1 - (beta(t)) / (J + beta(t))))
            - C[i] * ws / d(t)
        )
        dMdt[N-1] = -M[N-1]
    else:
        dCdt[N-1] = (
            -Q(t) / (W * d(t)) * (C[N-2] - C[N - 1]) / delX
            + 1 / (W * d(t)) * (zetaEW(t) + zetaR(t)) * (Cstar - C[N-1]) / Cstar
            - C[N-1] * ws / d(t)
        )
        dMdt[N-1] = W * C[N-1] * ws - zetaR(t) * (Cstar - C[N-1]) / Cstar

    dydt = np.ravel([dCdt, dMdt])
    return dydt

init_C = 0.0 * np.ones(X.shape)
init_M = 0.0 * np.ones(X.shape)
init = np.ravel([init_C, init_M])

sol = odeint(odefunc, init, tspan)
conc = sol[:, :N]
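One detail that may explain the printed times (a sketch on a well-behaved toy problem, not the model above): odeint evaluates odefunc at internal, adaptively chosen times unrelated to tspan and then interpolates the solution onto tspan, so the t values printed inside odefunc are not the output grid:

import numpy as np
from scipy.integrate import odeint

seen = []
def decay(y, t):
    seen.append(t)   # record every internal evaluation time
    return -y

tspan = np.linspace(0.0, 0.5, 5)
sol = odeint(decay, [1.0], tspan)
print(len(tspan), len(seen))   # many more internal evaluations than outputs
print(max(seen))               # internal times need not coincide with tspan points

When the solver warns Excess work done, it has given up refining after a fixed step budget, which is why the last printed time can be stuck near t = 0.015 even though tspan runs to 0.5.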

Speed up an integration function in Python

I have a function that is the inner loop of some larger problem. SO it will be called millions of time. I have tried to optimize it. But since it is my first numerical project, I am wondering if there are other ways that can improve the speed.
cython does not seem to help. Maybe numpy is close to c already.
or I don't write cython code efficiently.
import numpy as np
import math
import numexpr as ne

par_mu_rho = 0.8
par_alpha_rho = 0.7
# the first two are the means of the mus and the
# last two are the means of the alphas.
cov_epsilon = [[1, par_mu_rho], [par_mu_rho, 1]]
cov_nu = [[1, par_alpha_rho], [par_alpha_rho, 1]]
nrows = 10000
np.random.seed(123)
epsilon_sim = np.random.multivariate_normal([0, 0], cov_epsilon, nrows)
nu_sim = np.random.multivariate_normal([0, 0], cov_nu, nrows)
errors = np.concatenate((epsilon_sim, nu_sim), axis=1)
errors = np.exp(errors)

### the function to be optimized
def mktout(mean_mu_alpha, errors, par_gamma):
    mu10 = errors[:, 0] * math.exp(mean_mu_alpha[0])
    mu11 = math.exp(par_gamma) * mu10  # mu with gamma
    mu20 = errors[:, 1] * math.exp(mean_mu_alpha[1])
    mu21 = math.exp(par_gamma) * mu20
    alpha1 = errors[:, 2] * math.exp(mean_mu_alpha[2])
    alpha2 = errors[:, 3] * math.exp(mean_mu_alpha[3])

    j_is_larger = (mu10 > mu20)
    # useneither1 = (mu10 < 1/168)
    threshold2 = (1 + mu10 * alpha1) / (168 + alpha1)
    # useboth1 = (mu21 >= threshold2)
    j_is_smaller = ~j_is_larger
    # useneither2 = (mu20 < 1/168)
    threshold3 = (1 + mu20 * alpha2) / (168 + alpha2)
    # useboth2 = (mu11 >= threshold3)
    case1 = j_is_larger * (mu10 < 1 / 168)
    case2 = j_is_larger * (mu21 >= threshold2)
    # case3 = j_is_larger * (1 - (useneither1 | useboth1))
    case3 = j_is_larger ^ (case1 | case2)
    case4 = j_is_smaller * (mu20 < 1 / 168)
    case5 = j_is_smaller * (mu11 >= threshold3)
    # case6 = j_is_smaller * (1 - (useneither2 | useboth2))
    case6 = j_is_smaller ^ (case4 | case5)

    t0 = ne.evaluate(
        "case1*168 + case2 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)"
        " + case3 / threshold2 + case4 * 168"
        " + case5 * (168 + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)"
        " + case6 / threshold3")
    # for some cases, t1 would be 0 anyway, so they are omitted here.
    t1 = ne.evaluate(
        "case2 * (t0 * alpha1 * mu11 - alpha1)"
        " + case3 * (t0 * alpha1 * mu10 - alpha1)"
        " + case5 * (t0 * alpha1 * mu11 - alpha1)")
    # t2 = (j_is_larger*useboth1*(t0*alpha2*mu21 - alpha2) +
    #       j_is_smaller*useboth2*(t0*alpha2*mu21 - alpha2) +
    #       j_is_smaller*(1 - (useneither2|useboth2))*(t0*alpha2*mu20 - alpha2))
    t2 = 168 - t0 - t1
    p12 = case2 + case5
    p1 = case3 + p12
    p2 = case6 + p12
    return t1.sum()/10000, t2.sum()/10000, p1.sum()/10000, p2.sum()/10000

%timeit mktout([-6, -6, -1, -1], errors, -0.7)
On my old Mac with a 2.2 GHz i7, the function runs in about 200 µs.
Updates:
Based on suggestions and code from @CodeSurgeon and @GZ0, I settled on the following code:
def mktout_full(double[:] mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        double threshold2, threshold3
        double t0, t1, t2
        double t1_sum, t2_sum, p1_sum, p2_sum, p12_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0
    p12_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        # mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        # mu21 = exp_par_gamma * mu20
        # alpha1 = errors[i, 2] * exp[2]
        # alpha2 = errors[i, 3] * exp[3]
        # j_is_larger = mu10 > mu20
        # j_is_smaller = not j_is_larger
        if (mu10 >= mu20):
            if (mu10 >= 1/c):
                mu21 = exp_par_gamma * mu20
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
                if (mu21 >= threshold2):
                    mu11 = exp_par_gamma * mu10
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else:
                    t1_sum += ((1/threshold2) * alpha1 * mu10 - alpha1)
                    p1_sum += 1
        else:
            if (mu20 >= 1/c):
                mu11 = exp_par_gamma * mu10
                alpha1 = errors[i, 2] * exp[2]
                alpha2 = errors[i, 3] * exp[3]
                threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
                if (mu11 >= threshold3):
                    mu21 = exp_par_gamma * mu20
                    t0 = (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                    t1 = (t0 * alpha1 * mu11 - alpha1)
                    t1_sum += t1
                    t2_sum += c - t0 - t1
                    p1_sum += 1
                    p2_sum += 1
                    p12_sum += 1
                else:
                    t2_sum += ((1/threshold3) * alpha2 * mu20 - alpha2)
                    p2_sum += 1

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n, p12_sum/n
My original code runs in 650 µs. mktout and mktout_if by @CodeSurgeon run in about 220 µs and 120 µs.
The mktout_full above runs in about 68 µs. What I do in mktout_full is optimize the if-else logic of mktout_if.
Perhaps surprisingly, the parallelized outer_loop by @CodeSurgeon combined with the if-else logic of mktout_full is far slower (121 ms).
Briefly looking at the code and attempting to cythonize it, simply adding ndarray types to all of the parameters and variables does not change performance meaningfully. If you are fighting to shave microseconds off this function in a tight inner loop, I would consider making the following modifications:

1. The reason why this code is so tough to cythonize is that your code is vectorized. All of the operations go through numpy or numexpr. While each of these operations alone is efficient, they all add some Python overhead (which can be seen if you look at the annotated .html files Cython can produce).
2. If you are calling this function many times (as it appears based on your comments), you can save some time by making mktout a cdef function instead. Python function calls have significant overhead.
3. Minor, but you can try avoiding any functions from Python's math module. You can replace it with from libc cimport math as cmath and use cmath.exp instead.
4. Your mktout function takes in a Python list mean_mu_alpha. You could consider using a cdef class object to replace this parameter and type it instead. If you choose to make mktout a cdef function, this can become just a struct or double * array. Either way, indexing into a Python list (which can contain arbitrary Python objects that need to be unboxed into corresponding c-types) is going to be slow.
5. This is probably the most important part. For each call to mktout, you allocate memory for lots of arrays (for each mu, alpha, threshold, case, t- and p- array). You then free all of this memory at the end of the function (through Python's gc), only to likely use all of this space again on the next call. If you can change the signature of mktout, you can pass in all of these arrays as parameters so that the memory can be reused and overwritten across function calls. Another option, which is better for this case, is to iterate through the array and do all of the calculations one element at a time.
6. You can multithread the code using Cython's prange function. I would reach for this after you have made all of the above changes, and I would do the multithreading outside of the mktout function itself; that is, you would multithread calls to mktout rather than mktout itself.

Making the above changes will be a lot of work, and you will likely have to reimplement many of the functions provided by numpy and numexpr yourself to avoid the Python overhead associated with each of them. Please let me know if any part of this is unclear.
Update #1: Implementing points #1, #3, and #5, I get about an 11x speed-up. Here is what this code looks like. I am sure it can go faster if you ditch the def function, the list mean_mu_alpha input, and the tuple output. Note: I get slightly different results in the last decimal place compared to the original code, likely due to some floating point rules I do not understand.
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = (case1*c
              + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case3 / threshold2
              + case4 * c
              + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n
Update #2: Implemented the cdef (#2), Python object elimination (#4) and multithreading (#6) ideas. #2 and #4 alone had negligible benefit, but were necessary for #6 since the GIL cannot be held in OpenMP prange loops. With the multithreading, you get an additional 2.5x speed boost on my quad-core laptop, amounting to code that is about 27.5x faster than the original. My outer_loop function is not wholly accurate though, since it just recalculates the same result over and over, but it should be enough for a test case. The complete code is below:
from libc cimport math as cmath
from libc.stdint cimport *
from libc.stdlib cimport *
from cython.parallel cimport prange

def mktout(list mean_mu_alpha, double[:, ::1] errors, double par_gamma):
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(<double>mean_mu_alpha[0])
    exp[1] = cmath.exp(<double>mean_mu_alpha[1])
    exp[2] = cmath.exp(<double>mean_mu_alpha[2])
    exp[3] = cmath.exp(<double>mean_mu_alpha[3])
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = (case1*c
              + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case3 / threshold2
              + case4 * c
              + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    return t1_sum/n, t2_sum/n, p1_sum/n, p2_sum/n

ctypedef struct Vec4:
    double a
    double b
    double c
    double d

def outer_loop(list mean_mu_alpha, double[:, ::1] errors, double par_gamma, size_t n):
    cdef:
        size_t i
        Vec4 mean_vec
        Vec4 out
    mean_vec.a = <double>(mean_mu_alpha[0])
    mean_vec.b = <double>(mean_mu_alpha[1])
    mean_vec.c = <double>(mean_mu_alpha[2])
    mean_vec.d = <double>(mean_mu_alpha[3])
    with nogil:
        for i in prange(n):
            cy_mktout(&out, &mean_vec, errors, par_gamma)
    return out

cdef void cy_mktout(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        case1 = j_is_larger * (mu10 < 1 / c)
        case2 = j_is_larger * (mu21 >= threshold2)
        case3 = j_is_larger ^ (case1 | case2)
        case4 = j_is_smaller * (mu20 < 1 / c)
        case5 = j_is_smaller * (mu11 >= threshold3)
        case6 = j_is_smaller ^ (case4 | case5)
        t0 = (case1*c
              + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case3 / threshold2
              + case4 * c
              + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
              + case6 / threshold3)
        t1 = (case2 * (t0 * alpha1 * mu11 - alpha1)
              + case3 * (t0 * alpha1 * mu10 - alpha1)
              + case5 * (t0 * alpha1 * mu11 - alpha1))
        t2 = c - t0 - t1
        p12 = case2 + case5
        p1 = case3 + p12
        p2 = case6 + p12
        t1_sum += t1
        t2_sum += t2
        p1_sum += p1
        p2_sum += p2

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n
And the setup.py file that I use is as follows (has all of the optimization and OpenMP flags):
from distutils.core import setup
from Cython.Build import cythonize
from distutils.core import Extension
import numpy as np
import os
import shutil
import platform

libraries = {
    "Linux": [],
    "Windows": [],
}
language = "c"
args = ["-w", "-std=c11", "-O3", "-ffast-math", "-march=native", "-fopenmp"]
link_args = ["-std=c11", "-fopenmp"]
annotate = True
directives = {
    "binding": True,
    "boundscheck": False,
    "wraparound": False,
    "initializedcheck": False,
    "cdivision": True,
    "nonecheck": False,
    "language_level": "3",
    #"c_string_type": "unicode",
    #"c_string_encoding": "utf-8",
}

if __name__ == "__main__":
    system = platform.system()
    libs = libraries[system]
    extensions = []
    ext_modules = []
    # create extensions
    for path, dirs, file_names in os.walk("."):
        for file_name in file_names:
            if file_name.endswith("pyx"):
                ext_path = "{0}/{1}".format(path, file_name)
                ext_name = ext_path \
                    .replace("./", "") \
                    .replace("/", ".") \
                    .replace(".pyx", "")
                ext = Extension(
                    name=ext_name,
                    sources=[ext_path],
                    libraries=libs,
                    language=language,
                    extra_compile_args=args,
                    extra_link_args=link_args,
                    include_dirs=[np.get_include()],
                )
                extensions.append(ext)
    # setup all extensions
    ext_modules = cythonize(
        extensions,
        annotate=annotate,
        compiler_directives=directives,
    )
    setup(ext_modules=ext_modules)

"""
# immediately remove build directory
build_dir = "./build"
if os.path.exists(build_dir):
    shutil.rmtree(build_dir)
"""
Update #3: Per the advice of @GZ0, there were lots of conditions under which expressions in the code evaluate to zero and are wastefully calculated. I have attempted to eliminate these areas with the following code (after fixing both the case3 and case6 statements):
cdef void cy_mktout_if(Vec4 *out, Vec4 *mean_mu_alpha, double[:, ::1] errors, double par_gamma) nogil:
    cdef:
        size_t i, n
        double[4] exp
        double exp_par_gamma
        double mu10, mu11, mu20, mu21
        double alpha1, alpha2
        bint j_is_larger, j_is_smaller
        double threshold2, threshold3
        bint case1, case2, case3, case4, case5, case6
        double t0, t1, t2
        double p12, p1, p2
        double t1_sum, t2_sum, p1_sum, p2_sum
        double c

    # compute the exp outside of the loop
    n = errors.shape[0]
    exp[0] = cmath.exp(mean_mu_alpha.a)
    exp[1] = cmath.exp(mean_mu_alpha.b)
    exp[2] = cmath.exp(mean_mu_alpha.c)
    exp[3] = cmath.exp(mean_mu_alpha.d)
    exp_par_gamma = cmath.exp(par_gamma)
    c = 168.0

    t1_sum = 0.0
    t2_sum = 0.0
    p1_sum = 0.0
    p2_sum = 0.0

    for i in range(n):
        mu10 = errors[i, 0] * exp[0]
        mu11 = exp_par_gamma * mu10
        mu20 = errors[i, 1] * exp[1]
        mu21 = exp_par_gamma * mu20
        alpha1 = errors[i, 2] * exp[2]
        alpha2 = errors[i, 3] * exp[3]
        j_is_larger = mu10 > mu20
        j_is_smaller = not j_is_larger
        threshold2 = (1 + mu10 * alpha1) / (c + alpha1)
        threshold3 = (1 + mu20 * alpha2) / (c + alpha2)
        if j_is_larger:
            case1 = mu10 < 1 / c
            case2 = mu21 >= threshold2
            case3 = not (case1 | case2)
            t0 = (case1*c
                  + case2 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                  + case3 / threshold2)
            t1 = case2 * (t0 * alpha1 * mu11 - alpha1) + case3 * (t0 * alpha1 * mu10 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case2 + case3
            p2_sum += case2
        else:
            case4 = mu20 < 1 / c
            case5 = mu11 >= threshold3
            case6 = not (case4 | case5)
            t0 = (case4 * c
                  + case5 * (c + alpha1 + alpha2) / (1 + mu11 * alpha1 + mu21 * alpha2)
                  + case6 / threshold3)
            t1 = case5 * (t0 * alpha1 * mu11 - alpha1)
            t2 = c - t0 - t1
            t1_sum += t1
            t2_sum += t2
            p1_sum += case5
            p2_sum += case5 + case6

    out.a = t1_sum/n
    out.b = t2_sum/n
    out.c = p1_sum/n
    out.d = p2_sum/n
For 10000 iterations, the current code performs as follows:
    outer_loop:     0.5116949229995953 seconds
    outer_loop_if:  0.617649456995423 seconds
    mktout:         0.9221872320049442 seconds
    mktout_if:      1.430276553001022 seconds
    python:         10.116664300003322 seconds
I think the cost of the conditional and the resulting branch misprediction is making the function surprisingly slower, but I would appreciate any help clearing this up for certain.
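One way to test the branch-misprediction hypothesis (a sketch reusing the errors array defined at the top of the question, and assuming mktout_if is the def wrapper timed above): present the rows pre-sorted so the mu10 > mu20 branch becomes perfectly predictable, and compare timings. With equal means for the two mus, as in the [-6, -6, -1, -1] call, the predicate reduces to errors[:, 0] > errors[:, 1]:

import numpy as np

order = np.argsort(errors[:, 0] > errors[:, 1])   # all False rows first, then all True rows
errors_sorted = np.ascontiguousarray(errors[order])

# %timeit mktout_if([-6, -6, -1, -1], errors, -0.7)          # unpredictable branch pattern
# %timeit mktout_if([-6, -6, -1, -1], errors_sorted, -0.7)   # predictable branch pattern

If the sorted input closes most of the gap, mispredicted branches are indeed the cost; if not, look elsewhere (for example, the compiler may auto-vectorize the branch-free version but not the branchy one).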
