cython times 10 slower

cython times 10 slower - python

I am trying to boost up some calculations in python by using cython ...
In my calculations I will be doing double loops or more plus I can't always use numpy vectorization so I need to boost up the python loops with cython.
here I benchmark some simple calculation and it shows so far that cython is 10 times slower than using numpy. I am sure that numpy is optimized to the max and I doubt I could be able to beat its performance but still factor of 10 slower means I am doing something wrong. Suggestions ?
test.py
import numpy as np
from histogram import distances
import time
REPEAT = 10
def printTime(message, t):
print "%s total: %.7f(s) --> average: %.7f(s) %.7f(Ms)"%(message, t, t/REPEAT, 1000000*t/REPEAT)
DATA = np.array( np.random.random((10000, 3)), dtype=np.float32)
POINT = np.array( np.random.random((1,3)), dtype=np.float32)
# numpy histogram
r = REPEAT
startTime = time.clock()
while r:
diff = (DATA-POINT)%1
diffNumpy = np.where(diff<0, diff+1, diff)
distNumpy = np.sqrt( np.add.reduce(diff**2,1) )
r-=1
printTime("numpy", time.clock()-startTime)
# cython test
r = REPEAT
startTime = time.clock()
while r:
distCython = distances(POINT, DATA)
r-=1
printTime("cython", time.clock()-startTime)
histogram.pyx
import numpy as np
import cython
cimport cython
cimport numpy as np
DTYPE=np.float32
ctypedef np.float32_t DTYPE_C
#cython.nonecheck(False)
#cython.boundscheck(False)
#cython.wraparound(False)
def distances(np.ndarray[DTYPE_C, ndim=2] point, np.ndarray[DTYPE_C, ndim=2] data):
# declare variables
cdef int i
cdef float x,y,z
cdef np.ndarray[DTYPE_C, mode="c", ndim=1] dist = np.empty((data.shape[0]), dtype=DTYPE)
# loop
for i from 0 <= i < data.shape[0]:
# calculate distance
x = (data[i,0]-point[0,0])%1
y = (data[i,1]-point[0,1])%1
z = (data[i,2]-point[0,2])%1
# fold between 0 and 1
if x<0: x+=1
if y<0: y+=1
if z<0: z+=1
# assign to array
dist[i] = np.sqrt(x**2+y**2+z**2)
return dist
setup.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy as np
setup(
ext_modules = cythonize("histogram.pyx"),
include_dirs=[np.get_include()]
)
to compile do the following
python setup.py build_ext --inplace
to launch benchmarch
python test.py
My results are
numpy total: 0.0153390(s) --> average: 0.0015339(s) 1533.9000000(Ms)
cython total: 0.1509920(s) --> average: 0.0150992(s) 15099.2000000(Ms)

Your problem is almost definitely
np.sqrt(x**2+y**2+z**2)
You should use the C sqrt function. It will look something like
from libc.math cimport sqrt
sqrt(x*x + y*y + z*z)

Related

No real gains using cython

I am trying to use a modified version of the RANSAC regressor algorithm, but instead of changing the sklearn's function (to avoid the problems of be using sklearn 0.24.1 and to avoid some warnings from the function), I've found an algorithm in github and was trying to cythonize it to improve speed before I can make my modifications. To my surprise the speed gain was very poor, was it because the code is full of numpy calls or did I make something wrong? The following codes are the python version, cython version (with proper modifications to avoid errors):
##Python version
import numpy as np
def ransac_polyfit(x,y,order, t, n=0.8,k=100,f=0.9):
besterr = np.inf
bestfit = np.array([None])
for kk in range(k):
maybeinliers = np.random.randint(len(x), size=int(n*len(x)))
maybemodel = np.polyfit(x[maybeinliers], y[maybeinliers], order)
alsoinliers = np.abs(np.polyval(maybemodel, x)-y) < t
if sum(alsoinliers) > len(x)*f:
bettermodel = np.polyfit(x[alsoinliers], y[alsoinliers], order)
thiserr = np.sum(np.abs(np.polyval(bettermodel, x[alsoinliers])-y[alsoinliers]))
if thiserr < besterr:
bestfit = bettermodel
besterr = thiserr
return bestfit
##Cython version
cimport cython
cimport numpy as np
import numpy as np
np.import_array()
#cython.boundscheck(False)
#cython.wraparound(False)
cdef ransac_polyfit(np.ndarray[np.npy_int64, ndim=1] x,
np.ndarray[np.npy_int64, ndim=1] y,
np.npy_intp order,
np.npy_float32 t,
np.npy_float32 n=0.8,
np.npy_intp k=100,
np.npy_float32 f=0.9):
cdef np.npy_intp kk, i, ransac_control
cdef np.npy_float64 thiserr, besterr = -1.0
cdef np.ndarray[np.npy_int64, ndim=1] maybeinliers
cdef np.ndarray[np.npy_bool, ndim=1] alsoinliers
cdef np.ndarray[np.npy_float64, ndim=1] bestfit, maybemodel, bettermodel
bestfit = np.zeros(1, dtype = np.float64)
for kk in range(k):
maybeinliers = np.random.randint(len(x), size=int(n*len(x)))
maybemodel = np.polyfit(x[maybeinliers], y[maybeinliers], order)
alsoinliers = np.abs(polyval(maybemodel, x)-y) < t
if sum(alsoinliers) > len(x)*f:
bettermodel = np.polyfit(x[alsoinliers], y[alsoinliers], order)
thiserr = np.sum(np.abs(polyval(bettermodel, x[alsoinliers])-y[alsoinliers]))
if (thiserr < besterr) or (besterr == -1.0):
bestfit = bettermodel
besterr = thiserr
#Since I can't return an empty array, I had to set it as an array with a zero and the ransac_control variable will tell if the function was able or not to find a good model
if (besterr == -1.0):
ransac_control = 0
return ransac_control, bestfit
else:
ransac_control = 1
return ransac_control, bestfit
**PS: Couldn't send the image of the HTML page of the cython code because it's my first question

Comparing scipy.stats.t.sf vs GSL using Cython

I would like to calculate the p values of a large 2D numpy t values array. However, this takes long time and I would like to improve its speed. I tried using GSL.
Although a single gsl_cdf_tdist_P is much much faster than scipy.stats.t.sf, when iterating over the ndarray, the process is very slow. I would like help to improve this.
See the code below.
GSL_Test.pyx
import cython
cimport cython
import numpy
cimport numpy
from cython_gsl cimport *
DTYPE = numpy.float32
ctypedef numpy.float32_t DTYPE_t
cdef get_gsl_p(double t, double nu):
return (1 - gsl_cdf_tdist_P(t, nu)) * 2
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.nonecheck(False)
cdef get_gsl_p_for_2D_matrix(numpy.ndarray[DTYPE_t, ndim=2] t_matrix, int n):
cdef unsigned int rows = t_matrix.shape[0]
cdef numpy.ndarray[DTYPE_t, ndim=2] out = numpy.zeros((rows, rows), dtype='float32')
cdef unsigned int row, col
for row in range(rows):
for col in range(rows):
out[row, col] = get_gsl_p(t_matrix[row, col], n-2)
return out
def get_gsl_p_for_2D_matrix_def(numpy.ndarray[DTYPE_t, ndim=2] t_matrix, int n):
return get_gsl_p_for_2D_matrix(t_matrix, n)
ipython
import GSL_Test
import numpy
import scipy.stats
a = numpy.random.rand(3544, 3544).astype('float32')
%timeit -n 1 GSL_Test.get_gsl_p_for_2D_matrix(a, 25)
1 loop, best of 3: 7.87 s per loop
%timeit -n 1 scipy.stats.t.sf(a, 25)*2
1 loop, best of 3: 4.66 s per loop
UPDATE: Adding cdef declarations I was able to reduce the computational time but not lower than scipy still. I modified the code to have the cdef declarations.
%timeit -n 1 GSL_Test.get_gsl_p_for_2D_matrix_def(a, 25)
1 loop, best of 3: 6.73 s per loop

You can get some small gain in raw performance by using a raw special function instead of stats.t.sf. Looking at the source, you find (https://github.com/scipy/scipy/blob/master/scipy/stats/_continuous_distns.py#L3849)
def _sf(self, x, df):
return sc.stdtr(df, -x)
So that you can use stdtr directly:
np.random.seed(1234)
x = np.random.random((3740, 374))
t1 = stats.t.sf(x, 25)
t2 = stdtr(25, -x)
1 loop, best of 3: 653 ms per loop
1 loop, best of 3: 562 ms per loop
If you do reach out for cython, the typed memoryview syntax often gives you faster code than the old ndarray syntax:
from scipy.special.cython_special cimport stdtr
from numpy cimport npy_intp
import numpy as np
def tsf(double [:, ::1] x, int df=25):
cdef double[:, ::1] out = np.empty_like(x)
cdef npy_intp i, j
cdef double tmp, xx
for i in range(x.shape[0]):
for j in range(x.shape[1]):
xx = x[i, j]
out[i, j] = stdtr(df, -xx)
return np.asarray(out)
Here I'm also using the cython_special interface, which is only avaialble in the dev version of scipy (http://scipy.github.io/devdocs/special.cython_special.html#module-scipy.special.cython_special), but you can use GSL if you want.
Finally, if you suspect a bottleneck in iterations, don't forget to inspect the output of cython -a to see if there's some python overhead in the hot loops.

Cython's prange not improving performance

I'm trying to improve the performance of some metric computations with Cython's prange. Here are my codes:
def shausdorff(float64_t[:,::1] XA not None, float64_t[:,:,::1] XB not None):
cdef:
Py_ssize_t i
Py_ssize_t n = XB.shape[2]
float64_t[::1] hdist = np.zeros(n)
#arrangement to fix contiguity
XB = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])
for i in range(n):
hdist[i] = _hausdorff(XA, XB[i])
return hdist
def phausdorff(float64_t[:,::1] XA not None, float64_t[:,:,::1] XB not None):
cdef:
Py_ssize_t i
Py_ssize_t n = XB.shape[2]
float64_t[::1] hdist = np.zeros(n)
#arrangement to fix contiguity (EDITED)
cdef float64_t[:,:,::1] XC = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])
with nogil, parallel(num_threads=4):
for i in prange(n, schedule='static', chunksize=1):
hdist[i] = _hausdorff(XA, XC[i])
return hdist
Basically, in each iteration the hausdorff metric is computed between XA and each XB[i]. Here is the signature of the _hausdorff function:
cdef inline float64_t _hausdorff(float64_t[:,::1] XA, float64_t[:,::1] XB) nogil:
...
my problem is that both the sequential shausdorff and the parallel phausdorff have the same timings. Furthermore, it seems that phausdorff is not creating any thread at all.
So my question is what is wrong with my code, and how can I fix it to get threading working.
Here is my setup.py:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext
ext_modules=[
Extension("custom_metric",
["custom_metric.pyx"],
libraries=["m"],
extra_compile_args = ["-O3", "-ffast-math", "-march=native", "-fopenmp" ],
extra_link_args=['-fopenmp']
)
]
setup(
name = "custom_metric",
cmdclass = {"build_ext": build_ext},
ext_modules = ext_modules
)
EDIT 1: Here is a link to the html generated by cython -a: custom_metric.html
EDIT 2: Here is an example on how to call the corresponding functions (you need to compile the Cython file first)
import custom_metric as cm
import numpy as np
XA = np.random.random((9000, 210))
XB = np.random.random((1000, 210, 9))
#timing 'parallel' version
%timeit cm.phausdorff(XA, XB)
#timing sequential version
%timeit cm.shausdorff(XA, XB)

I think this the parallelization is working, but the extra overhead of the parallelization is eating up the time it would have saved. If I try with different sized arrays then I do begin to see a speed up in the parallel version
XA = np.random.random((900, 2100))
XB = np.random.random((100, 2100, 90))
Here the parallel version takes ~2/3 of the time of the serial version for me, which certainly isn't the 1/4 you'd expect, but does at least show some benefit.
One improvement I can offer is to replace the code that fixes contiguity:
XB = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])
with
XB = np.ascontiguousarray(np.transpose(XB,[2,0,1]))
This speeds up both the parallel and non-parallel functions fairly significantly (a factor of 2 with the arrays you originally gave). It does make it slightly more obvious that you're being slowed down by overhead in the prange - the serial version is actually faster for the arrays in your example.

Speed up Python/Cython loops.

I've trying to get a loop in python to run as fast as possible. So I've dived into NumPy and Cython.
Here's the original Python code:
def calculate_bsf_u_loop(uvel,dy,dz):
"""
Calculate barotropic stream function from zonal velocity
uvel (t,z,y,x)
dy (y,x)
dz (t,z,y,x)
bsf (t,y,x)
"""
nt = uvel.shape[0]
nz = uvel.shape[1]
ny = uvel.shape[2]
nx = uvel.shape[3]
bsf = np.zeros((nt,ny,nx))
for jn in range(0,nt):
for jk in range(0,nz):
for jj in range(0,ny):
for ji in range(0,nx):
bsf[jn,jj,ji] = bsf[jn,jj,ji] + uvel[jn,jk,jj,ji] * dz[jn,jk,jj,ji] * dy[jj,ji]
return bsf
It's just a sum over k indices. Array sizes are nt=12, nz=75, ny=559, nx=1442, so ~725 million elements.
That took 68 seconds. Now, I've done it in cython as
import numpy as np
cimport numpy as np
cimport cython
#cython.boundscheck(False) # turn off bounds-checking for entire function
#cython.wraparound(False) # turn off negative index wrapping for entire function
## Use cpdef instead of def
## Define types for arrays
cpdef calculate_bsf_u_loop(np.ndarray[np.float64_t, ndim=4] uvel, np.ndarray[np.float64_t, ndim=2] dy, np.ndarray[np.float64_t, ndim=4] dz):
"""
Calculate barotropic stream function from zonal velocity
uvel (t,z,y,x)
dy (y,x)
dz (t,z,y,x)
bsf (t,y,x)
"""
## cdef the constants
cdef int nt = uvel.shape[0]
cdef int nz = uvel.shape[1]
cdef int ny = uvel.shape[2]
cdef int nx = uvel.shape[3]
## cdef loop indices
cdef ji,jj,jk,jn
## cdef. Note that the cdef is followed by cython type
## but the np.zeros function as python (numpy) type
cdef np.ndarray[np.float64_t, ndim=3] bsf = np.zeros([nt,ny,nx], dtype=np.float64)
for jn in xrange(0,nt):
for jk in xrange(0,nz):
for jj in xrange(0,ny):
for ji in xrange(0,nx):
bsf[jn,jj,ji] += uvel[jn,jk,jj,ji] * dz[jn,jk,jj,ji] * dy[jj,ji]
return bsf
and that took 49 seconds.
However, swapping the loop for
for jn in range(0,nt):
for jk in range(0,nz):
bsf[jn,:,:] = bsf[jn,:,:] + uvel[jn,jk,:,:] * dz[jn,jk,:,:] * dy[:,:]
only takes 0.29 seconds! Unfortunately, I can't do this in my full code.
Why is NumPy slicing so much faster than the Cython loop?
I thought NumPy was fast because it is Cython under the hood. So shouldn't they be of similar speed?
As you can see, I've disabled boundary checks in cython, and I've also compiled using "fast math". However, this only gives a tiny speedup.
Is there anyway to get a loop to be of similar speed as NumPy slicing, or is looping always slower than slicing?
Any help is greatly appreciated!
/Joakim

That code is screaming for numpy.einsum's's intervention, given that you are doing elementwise-multiplication and then sum-reduction on the second axis of the 4D product array, which essenti
ally numpy.einsum does in a highly efficient manner. To solve your case, you can use numpy.einsum in two ways -
bsf = np.einsum('ijkl,ijkl,kl->ikl',uvel,dz,dy)
bsf = np.einsum('ijkl,ijkl->ikl',uvel,dz)*dy
Runtime tests & Verify outputs -
In [100]: # Take a (1/5)th of original input shapes
...: original_shape = [12,75, 559,1442]
...: m,n,p,q = (np.array(original_shape)/5).astype(int)
...:
...: # Generate random arrays with given shapes
...: uvel = np.random.rand(m,n,p,q)
...: dy = np.random.rand(p,q)
...: dz = np.random.rand(m,n,p,q)
...:
In [101]: bsf = calculate_bsf_u_loop(uvel,dy,dz)
In [102]: print(np.allclose(bsf,np.einsum('ijkl,ijkl,kl->ikl',uvel,dz,dy)))
True
In [103]: print(np.allclose(bsf,np.einsum('ijkl,ijkl->ikl',uvel,dz)*dy))
True
In [104]: %timeit calculate_bsf_u_loop(uvel,dy,dz)
1 loops, best of 3: 2.16 s per loop
In [105]: %timeit np.einsum('ijkl,ijkl,kl->ikl',uvel,dz,dy)
100 loops, best of 3: 3.94 ms per loop
In [106]: %timeit np.einsum('ijkl,ijkl->ikl',uvel,dz)*dy
100 loops, best of 3: 3.96 ms per loo

Cythonize a partial differential equation integrator

I am trying to speed up a finite differences integrator for a partial differential equation using Cython. I am not sure what I need to do in order for Cython to work correctly with the numpy arrays.
The diffusion term function that I use is
def laplacian(var, dh2):
""" (1D array, dx^2) -> laplacian(1D array)
periodic_laplacian_1D_4th_order
Implementing the 4th order 1D laplacian with periodic condition
"""
lap = numpy.zeros_like(var)
lap[1:] = (4.0/3.0)*var[:-1]
lap[0] = (4.0/3.0)*var[1]
lap[:-1] += (4.0/3.0)*var[1:]
lap[-1] += (4.0/3.0)*var[0]
lap += (-5.0/2.0)*var
lap[2:] += (-1.0/12.0)*var[:-2]
lap[:2] += (-1.0/12.0)*var[-2:]
lap[:-2] += (-1.0/12.0)*var[2:]
lap[-2:] += (-1.0/12.0)*var[:2]
return lap / dh2
And the rhs of the equations of the model are
from derivatives import laplacian
def dbdt(b,w,p,m,d,dx2):
""" db/dt of Modified Klausmeier """
return w*b**2 - m*b + laplacian(b,dx2)
def dwdt(b,w,p,m,d,dx2):
""" dw/dt of Modified Klausmeier """
return p - w - w*b**2 + d*laplacian(b,dx2)
How can I optimize those functions using Cython?
I have a repository on Github for my working code, that integrates the Gray-Scott model - Gray-Scott model integrator.

To use Cython efficiently, you should make all loops explicit and make sure cython -a shows as few Python calls as possible. A first try would be:
import numpy as np
cimport numpy as np
cimport cython
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.cdivision(True)
def laplacian(double [::1] var, double dh2):
""" (1D array, dx^2) -> laplacian(1D array)
periodic_laplacian_1D_4th_order
Implementing the 4th order 1D laplacian with periodic condition
"""
cdef int n = var.shape[0]
cdef double[::1] lap = np.zeros(n)
cdef int i
for i in range(0, n-1):
lap[1+i] = (4.0/3.0)*var[i]
lap[0] = (4.0/3.0)*var[1]
for i in range(0, n-1):
lap[i] += (4.0/3.0)*var[1+i]
lap[n-1] += (4.0/3.0)*var[0]
for i in range(0, n):
lap[i] += (-5.0/2.0)*var[i]
for i in range(0, n-2):
lap[2+i] += (-1.0/12.0)*var[i]
for i in range(0, 2):
lap[i] += (-1.0/12.0)*var[n - 2 + i]
for i in range(0, n-2):
lap[i] += (-1.0/12.0)*var[i+2]
for i in range(0, 2):
lap[n-2+i] += (-1.0/12.0)*var[i]
for i in range(0, n):
lap[i] /= dh2
return lap
Now this gives you:
$ python -m timeit -s 'import numpy as np; from lap import laplacian; var = np.random.rand(1000000); dh2 = .01' 'laplacian(var, dh2)'
100 loops, best of 3: 11.5 msec per loop
while the NumPy code gave:
100 loops, best of 3: 18.5 msec per loop
Note that the Cython could be further optimized by merging loops etc.
I also tried with a customized (i.e. not committed in master) version of Pythran and without changing the original Python code, I had the same speedup as the Cython version, without the hassle of converting the code:
#pythran export laplacian(float [], float)
import numpy
def laplacian(var, dh2):
""" (1D array, dx^2) -> laplacian(1D array)
periodic_laplacian_1D_4th_order
Implementing the 4th order 1D laplacian with periodic condition
"""
lap = numpy.zeros_like(var)
lap[1:] = (4.0/3.0)*var[:-1]
lap[0] = (4.0/3.0)*var[1]
lap[:-1] += (4.0/3.0)*var[1:]
lap[-1] += (4.0/3.0)*var[0]
lap += (-5.0/2.0)*var
lap[2:] += (-1.0/12.0)*var[:-2]
lap[:2] += (-1.0/12.0)*var[-2:]
lap[:-2] += (-1.0/12.0)*var[2:]
lap[-2:] += (-1.0/12.0)*var[:2]
return lap / dh2
Converted with:
$ pythran lap.py -O3
And I get:
100 loops, best of 3: 11.6 msec per loop

So I guess I've figured it out, though I am not sure this is the most optimized way to do it:
import numpy as np
cimport numpy as np
cdef laplacian(np.ndarray[np.float64_t, ndim=1] var,np.float64_t dh2):
""" (1D array, dx^2) -> laplacian(1D array)
periodic_laplacian_1D_4th_order
Implementing the 4th order 1D laplacian with periodic condition
"""
lap = np.zeros_like(var)
lap[1:] = (4.0/3.0)*var[:-1]
lap[0] = (4.0/3.0)*var[1]
lap[:-1] += (4.0/3.0)*var[1:]
lap[-1] += (4.0/3.0)*var[0]
lap += (-5.0/2.0)*var
lap[2:] += (-1.0/12.0)*var[:-2]
lap[:2] += (-1.0/12.0)*var[-2:]
lap[:-2] += (-1.0/12.0)*var[2:]
lap[-2:] += (-1.0/12.0)*var[:2]
return lap / dh2
I have used the following setup.py
from distutils.core import setup
from Cython.Build import cythonize
setup(
ext_modules = cythonize("derivatives_c.pyx")
)
Any advice on improving it is welcome..

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

cython times 10 slower - python

Your problem is almost definitely np.sqrt(x2+y2+z**2) You should use the C sqrt function. It will look something like from libc.math cimport sqrt sqrt(xx + yy + z*z)

Related

No real gains using cython

Comparing scipy.stats.t.sf vs GSL using Cython

Cython's prange not improving performance

Speed up Python/Cython loops.

Cythonize a partial differential equation integrator

Categories

Resources

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

cython times 10 slower - python

Your problem is almost definitely np.sqrt(x**2+y**2+z**2) You should use the C sqrt function. It will look something like from libc.math cimport sqrt sqrt(x*x + y*y + z*z)

Related

No real gains using cython

Comparing scipy.stats.t.sf vs GSL using Cython

Cython's prange not improving performance

Speed up Python/Cython loops.

Cythonize a partial differential equation integrator

Categories

Resources

Your problem is almost definitely np.sqrt(x2+y2+z**2) You should use the C sqrt function. It will look something like from libc.math cimport sqrt sqrt(xx + yy + z*z)