I'm trying to improve the performance of some metric computations with Cython's prange. Here is my code:
def shausdorff(float64_t[:,::1] XA not None, float64_t[:,:,::1] XB not None):
    cdef:
        Py_ssize_t i
        Py_ssize_t n = XB.shape[2]
        float64_t[::1] hdist = np.zeros(n)

    #arrangement to fix contiguity
    XB = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])

    for i in range(n):
        hdist[i] = _hausdorff(XA, XB[i])

    return hdist
def phausdorff(float64_t[:,::1] XA not None, float64_t[:,:,::1] XB not None):
    cdef:
        Py_ssize_t i
        Py_ssize_t n = XB.shape[2]
        float64_t[::1] hdist = np.zeros(n)

    #arrangement to fix contiguity (EDITED)
    cdef float64_t[:,:,::1] XC = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])

    with nogil, parallel(num_threads=4):
        for i in prange(n, schedule='static', chunksize=1):
            hdist[i] = _hausdorff(XA, XC[i])

    return hdist
Basically, in each iteration the hausdorff metric is computed between XA and each XB[i]. Here is the signature of the _hausdorff function:
cdef inline float64_t _hausdorff(float64_t[:,::1] XA, float64_t[:,::1] XB) nogil:
    ...
My problem is that both the sequential shausdorff and the parallel phausdorff have the same timings. Furthermore, it seems that phausdorff is not creating any threads at all.
So my question is: what is wrong with my code, and how can I fix it to get threading working?
Here is my setup.py:
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
from Cython.Distutils import build_ext

ext_modules = [
    Extension("custom_metric",
              ["custom_metric.pyx"],
              libraries=["m"],
              extra_compile_args=["-O3", "-ffast-math", "-march=native", "-fopenmp"],
              extra_link_args=["-fopenmp"]
              )
]

setup(
    name="custom_metric",
    cmdclass={"build_ext": build_ext},
    ext_modules=ext_modules
)
EDIT 1: Here is a link to the html generated by cython -a: custom_metric.html
EDIT 2: Here is an example of how to call the corresponding functions (you need to compile the Cython file first):
import custom_metric as cm
import numpy as np
XA = np.random.random((9000, 210))
XB = np.random.random((1000, 210, 9))
#timing 'parallel' version
%timeit cm.phausdorff(XA, XB)
#timing sequential version
%timeit cm.shausdorff(XA, XB)
I think the parallelization is working, but the extra overhead of the parallelization is eating up the time it would have saved. If I try with differently sized arrays then I do begin to see a speed-up in the parallel version:
XA = np.random.random((900, 2100))
XB = np.random.random((100, 2100, 90))
Here the parallel version takes ~2/3 of the time of the serial version for me, which certainly isn't the 1/4 you'd expect, but does at least show some benefit.
One improvement I can offer is to replace the code that fixes contiguity:
XB = np.asanyarray([np.ascontiguousarray(XB[:,:,i]) for i in range(n)])
with
XB = np.ascontiguousarray(np.transpose(XB,[2,0,1]))
This speeds up both the parallel and non-parallel functions fairly significantly (a factor of 2 with the arrays you originally gave). It does make it slightly more obvious that you're being slowed down by overhead in the prange - the serial version is actually faster for the arrays in your example.
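For reference, here is a rough sketch of how that replacement might slot into the parallel function (variable names taken from the question; the thread count and schedule are simply the ones used above):

def phausdorff(float64_t[:,::1] XA not None, float64_t[:,:,::1] XB not None):
    cdef:
        Py_ssize_t i
        Py_ssize_t n = XB.shape[2]
        float64_t[::1] hdist = np.zeros(n)

    # one transpose + copy instead of a Python-level list comprehension
    cdef float64_t[:,:,::1] XC = np.ascontiguousarray(np.transpose(XB, [2, 0, 1]))

    with nogil, parallel(num_threads=4):
        for i in prange(n, schedule='static', chunksize=1):
            hdist[i] = _hausdorff(XA, XC[i])

    return hdist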
I am experimenting with Cython and OpenCV and trying to benchmark the performance of image manipulation. I have tried optimizing my Cython code as much as I could, but I still get slower performance with it. I understand most of the code is executed in C because of OpenCV, yet I expected better performance for the Python loops using Cython. Can anyone tell me if there is anything I can do to improve it? Following is my code:
# load_images.py
import cv2
from random import randint
import numpy as np


def fetch_images(n):
    def get_img():
        x = randint(640, 6144)
        y = randint(640, 6144)
        return np.random.rand(x, y, 3).astype(np.uint8)

    return [get_img() for _ in range(n)]


def resize_img(img):
    img = cv2.resize(img, (640, 640))
    return cv2.cvtColor(img, cv2.COLOR_BGR2GRAY)


def preprocess(images):
    return [resize_img(img) for img in images]
# load_images_cy.pyx
import cv2
from random import randint
import numpy as np
cimport numpy as np
cimport cython

ctypedef np.uint8_t DTYPE_t


@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[DTYPE_t, ndim=3] get_img():
    cdef int x = randint(640, 6144)
    cdef int y = randint(640, 6144)
    return np.random.rand(x, y, 3).astype(np.uint8)


cpdef list fetch_images(int n):
    cdef int _
    return [get_img() for _ in range(n)]


cdef np.ndarray[DTYPE_t, ndim=2] resize_img(np.ndarray[DTYPE_t, ndim=3] img):
    cdef np.ndarray[DTYPE_t, ndim=3] im
    im = cv2.resize(img, (640, 640))
    return cv2.cvtColor(im, cv2.COLOR_BGR2GRAY)


cpdef np.ndarray[DTYPE_t, ndim=3] preprocess(list images):
    cdef np.ndarray[DTYPE_t, ndim=3] img
    cdef np.ndarray[DTYPE_t, ndim=3] collection = np.empty((len(images), 640, 640), dtype=np.uint8)
    cdef int i

    for i, img in enumerate(images):
        collection[i] = resize_img(img)

    return collection
# main.py
import load_images_cy
import load_images
import timeit
images = load_images.fetch_images(20)
result_cy = timeit.timeit(lambda: load_images_cy.preprocess(images), number=20)
result_py = timeit.timeit(lambda: load_images.preprocess(images), number=20)
print(f'{result_py/result_cy} times faster')
Output:
0.9192241989059127 times faster
Cython is primarily meant for interfacing with C code and for writing Python extension modules more easily. While performance improvements can be obtained through Cython, it is not intended to be a drop-in speed-up for Python code.
PyPy, however, is intended to be a more-or-less drop-in speed-up for Python code. It provides an alternate interpreter which is generally faster than CPython, the reference/default Python implementation.
Also, your decorators here:

@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[DTYPE_t, ndim=3] get_img():
    ...

only apply to get_img, not to any of the other functions below. Not sure if that was intentional or not. Also note that there should not be a blank line between the decorators and the function they apply to.
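If the goal is to disable those checks everywhere, a sketch of two common options (this is a general Cython pattern, not something from the question): either repeat the decorators above each function, or set the directives once for the whole module in a header comment:

# load_images_cy.pyx
# cython: boundscheck=False
# cython: wraparound=False
# The two comment lines above are module-level directives and apply to every
# function in this file; alternatively, decorate each function individually:

cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
cdef np.ndarray[DTYPE_t, ndim=2] resize_img(np.ndarray[DTYPE_t, ndim=3] img):
    ...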
If you want to stick with Cython and gain performance improvements through it, consider altering the compilation options, such as providing -O2 or -O3.
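For instance, with the distutils-style setup.py used elsewhere in this thread, that would look roughly like this (the module name is taken from the question; the exact flags are only an example):

from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize
import numpy as np

ext_modules = [
    Extension("load_images_cy",
              ["load_images_cy.pyx"],
              extra_compile_args=["-O3"],   # or "-O2"; with MSVC use "/O2" instead
              include_dirs=[np.get_include()])
]

setup(ext_modules=cythonize(ext_modules))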
After intensive use of Numba, I am coming back to Cython to parallelize some time-consuming functions. Here is a basic example:
import numpy as np
cimport numpy as np
from cython import boundscheck, wraparound
from cython.parallel import parallel, prange


@boundscheck(False)
@wraparound(False)
def cytest1(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):
    cdef int ix
    cdef int iz

    for ix in range(ix1, ix2):
        for iz in range(iz1, iz2):
            b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])

    return b


@boundscheck(False)
@wraparound(False)
def cytest2(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):
    cdef int ix
    cdef int iz

    with nogil, parallel():
        for ix in prange(ix1, ix2):
            for iz in range(iz1, iz2):
                b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])

    return b
When compiling these two functions (with the OpenMP flag) and calling them as follows:
import time
import numpy as np
from stencil import cytest1, cytest2   # the extension module built by the setup.py below

nx, nz = 1024, 1024
a = np.random.rand(nx, nz)
b = np.zeros_like(a)
Nit = 1000

ti = time.time()
for i in range(Nit):
    cytest1(a, b, 5, nx-5, 0, nz)
print('cytest1 : {:.3f} s.'.format(time.time() - ti))

ti = time.time()
for i in range(Nit):
    cytest2(a, b, 5, nx-5, 0, nz)
print('cytest2 : {:.3f} s.'.format(time.time() - ti))
I obtain these execution times :
cytest1 : 1.757 s.
cytest2 : 1.861 s.
When the parallel function is executed, I can see my 4 CPUs in action, but the execution time is nearly the same as the one obtained with the serial function. I tried moving prange to the inner loop, but with worse results. I also tried some different schedule options, without success.
I am clearly missing something, but what? Is prange unable to chunk the loop when the code accesses n+X/n-X elements?
EDIT:
My setup:

model name : Intel(R) Core(TM) i7-6600U CPU @ 2.60GHz
MemTotal : 8052556 kB
Python : 3.5.2
cython : 0.28.2
Numpy : 1.14.2
Numba : 0.37.0
The setup.py:

from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext

ext_modules = [
    Extension("stencil",
              ["stencil.pyx"],
              libraries=["m"],
              extra_compile_args=["-O3", "-ffast-math", "-march=native", "-fopenmp"],
              extra_link_args=['-fopenmp'],
              )
]

setup(
    name="stencil",
    cmdclass={"build_ext": build_ext},
    ext_modules=ext_modules
)
This answer involves a lot of guesswork, but as we will see, a lot depends on the hardware, so it is not easy to explain without having the same hardware at hand.
The first question is: what is the bottleneck? Looking at the code, I would assume that this is a memory-bound task.
To make it more clear-cut, let's do only the following operation in the loop:
b[ix, iz] = (a[ix+1, iz])
So there is no calculation, only memory accesses.
I use an Intel Xeon E5-2620 @ 2.1 GHz with 2 processors, and the %timeit magic reports:
>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
100 loops, best of 3: 1.99 ms per loop
>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
The slowest run took 234.48 times longer than the fastest. This could mean that an intermediate result is being cached.
1000 loops, best of 3: 324 µs per loop
As we can see, some caching is going on. We have 2 arrays of 1024 x 1024 doubles, each 8 MB, which means 16 MB of data has to be "touched". Every processor on my machine has a 15 MB cache, so for a single thread the data is evicted from the cache before it can be reused, but if both processors are used there is 30 MB of fast cache, which is big enough to keep all of the data.
That means the speed-up we see is due to the larger amount of fast memory (cache) that can be utilized by the parallelized version.
Let's increase the size of the arrays, so the cache isn't big enough even for the parallelized version:
....
>>> nx, nz = 10240, 10240 #100 times bigger
....
>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 238 ms per loop
>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
10 loops, best of 3: 99.3 ms per loop
Now it is about 2 times faster, which is easy to explain: two processors have twice the memory-bandwidth compared to one processor and both are utilized by the parallel version.
We get very similar results for your formula
b[ix, iz] = 0.5*(a[ix+1, iz] - a[ix-1, iz])
which is not surprising - there are not enough calculations to make it CPU-bound.
sin and cos are pretty CPU-intensive operations, so using them will make the calculation CPU-bound (see appendix for the whole code):
...
b[ix, iz] = sin(a[ix+1, iz])
...
>>> %timeit cytest1(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 1.6 s per loop
>>> %timeit cytest2(a,b,5, nx-5, 0, nz)
1 loop, best of 3: 217 ms per loop
This yields a speed-up of about 8, which is quite reasonable for my machine.
Obviously, for other machines/architectures different behavior can be observed. But in a nutshell:
I would not expect much speed-up for your formula - the task is memory-bound, so the question is whether you can achieve higher memory bandwidth or not.
For more CPU-intensive calculations you should be able to see at least some speed-up, though how much depends on your hardware.
Listing (compiled on Windows; use -fopenmp instead of /openmp on Linux):
%%cython --compile-args=/openmp --link-args=/openmp
from cython.parallel import parallel, prange
from cython import boundscheck, wraparound
from libc.math cimport sin


@boundscheck(False)
@wraparound(False)
def cytest1(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):
    cdef int ix
    cdef int iz

    for ix in range(ix1, ix2):
        for iz in range(iz1, iz2):
            b[ix, iz] = sin(a[ix+1, iz])

    return b


@boundscheck(False)
@wraparound(False)
def cytest2(double[:,::1] a, double[:,::1] b, int ix1, int ix2, int iz1, int iz2):
    cdef int ix
    cdef int iz

    with nogil, parallel():
        for ix in prange(ix1, ix2):
            for iz in range(iz1, iz2):
                b[ix, iz] = sin(a[ix+1, iz])

    return b
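For completeness, a sketch of the driver used for the timings above (array sizes as in the question; this setup code is assumed rather than copied from the original listing):

import numpy as np

nx, nz = 1024, 1024          # or 10240, 10240 for the larger test
a = np.random.rand(nx, nz)
b = np.zeros_like(a)

%timeit cytest1(a, b, 5, nx-5, 0, nz)
%timeit cytest2(a, b, 5, nx-5, 0, nz)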
I am new to Numba and am trying to speed up some calculations that have proved too unwieldy for numpy. The example below compares a vectorized/numpy version of a function containing a subset of my calculations with a Numba version; the latter was also tested as pure Python by commenting out the @autojit decorator.
I find that the Numba and numpy versions give similar speed-ups relative to the pure Python, both about a factor of 10.
The numpy version was actually slightly faster than my Numba function, but because of the 4D nature of this calculation I quickly run out of memory when the arrays in the numpy function are sized much larger than this toy example.
This speed-up is nice, but I have often seen speed-ups of >100x on the web when moving from pure Python to Numba.
I would like to know if there is a general expected speed increase when moving to numba in nopython mode. I would also like to know if there are any components of my numba-ized function that would be limiting further speed increases.
import numpy as np
from timeit import default_timer as timer
from numba import autojit
import math


def vecRadCalcs(slope, skyz, solz, skya, sola):
    nloc = len(slope)
    ntime = len(solz)
    [lenz, lena] = skyz.shape
    asolz = np.tile(np.reshape(solz, [ntime,1,1,1]), [1,nloc,lenz,lena])
    asola = np.tile(np.reshape(sola, [ntime,1,1,1]), [1,nloc,lenz,lena])
    askyz = np.tile(np.reshape(skyz, [1,1,lenz,lena]), [ntime,nloc,1,1])
    askya = np.tile(np.reshape(skya, [1,1,lenz,lena]), [ntime,nloc,1,1])
    phi1 = np.cos(asolz)*np.cos(askyz)
    phi2 = np.sin(asolz)*np.sin(askyz)*np.cos(askya - asola)
    phi12 = phi1 + phi2
    phi12[phi12 > 1.0] = 1.0
    phi = np.arccos(phi12)
    return phi


@autojit
def RadCalcs(slope, skyz, solz, skya, sola, phi):
    nloc = len(slope)
    ntime = len(solz)
    pop = 0.0
    [lenz, lena] = skyz.shape
    for iiT in range(ntime):
        asolz = solz[iiT]
        asola = sola[iiT]
        for iL in range(nloc):
            for iz in range(lenz):
                for ia in range(lena):
                    askyz = skyz[iz,ia]
                    askya = skya[iz,ia]
                    phi1 = math.cos(asolz)*math.cos(askyz)
                    phi2 = math.sin(asolz)*math.sin(askyz)*math.cos(askya - asola)
                    phi12 = phi1 + phi2
                    if phi12 > 1.0:
                        phi12 = 1.0
                    phi[iz,ia] = math.acos(phi12)
                    pop = pop + 1
    return pop


zenith_cells = 90
azim_cells = 360
nloc = 10    # nominally ~ 700
ntim = 10    # nominally ~ 200000

slope = np.random.rand(nloc) * 10.0
solz = np.random.rand(ntim) * np.pi/2.0
sola = np.random.rand(ntim) * 1.0*np.pi

base = np.ones([zenith_cells, azim_cells])
skya = np.deg2rad(np.cumsum(base, axis=1))
skyz = np.deg2rad(np.cumsum(base, axis=0)*90/zenith_cells)
phi = np.zeros(skyz.shape)

start = timer()
outcalc = RadCalcs(slope, skyz, solz, skya, sola, phi)
stop = timer()
outcalc2 = vecRadCalcs(slope, skyz, solz, skya, sola)
stopvec = timer()
print(outcalc)
print(stop - start)
print(stopvec - stop)
On my machine running Numba 0.31.0, the Numba version is 2x faster than the vectorized solution. When timing Numba functions, you need to run the function more than once, because the first time you are seeing the time to JIT-compile the code plus the run time. Subsequent runs will not include the jitting overhead, since Numba caches the compiled code in memory.
Also, please note that your functions are not calculating the same thing -- you want to be careful that you're comparing the same things using something like np.allclose on the results.
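A minimal sketch of that timing pattern, using the names from the question (the warm-up call is the important part; this snippet is illustrative, not from the original answer):

from timeit import default_timer as timer

# warm-up call: this run includes JIT compilation and is deliberately not timed
RadCalcs(slope, skyz, solz, skya, sola, phi)

start = timer()
outcalc = RadCalcs(slope, skyz, solz, skya, sola, phi)
print("jitted run time:", timer() - start)

# once both functions compute the same quantity, check agreement with
# something like np.allclose(result_a, result_b)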
Here is the Black (Black-Scholes less the dividend) option pricing model for options on futures, written in Cython with actual multi-threading, but I can't run it. (NOW FIXED, SEE LATER POST BELOW FOR THE ANSWER.) I am using Python 3.5 with the Microsoft Visual Studio 2015 compiler. Here is the serial version, which takes 3.5 s for 10M options: Cython program is slower than plain Python (10M options 3.5s vs 3.25s Black Scholes) - what am I missing?
I attempted to make this parallel by using nogil, but after compiling, I cannot access the internal function CyBlackP. There are several issues with this (at least on Windows).
1) When generating the OpenMP code, Cython assumes you are beyond OpenMP v2.0, but Microsoft Visual Studio 2015 is stuck on the old version, which requires signed iterators. The workaround I have: after first attempting to build the code it will error out; then open the output CyBlackP.cpp file in Microsoft Visual Studio 2015, search for size_t __pyx_t_2 (line 1430), change it to ssize_t __pyx_t_2, change the next line from size_t __pyx_t_3 to ssize_t __pyx_t_3 to get rid of the signed/unsigned errors, and compile again.
2) You can't pass NumPy arrays directly into the function, as nogil only works on pure C/C++ functions, so I have several helper functions to convert the NumPy array inputs into C++ vector format, pass those to a C++ function, then convert the returned vector back to a NumPy array.
I'm posting the parallel code here for others to use, and I'm sure someone out there can figure out why I can't access the parallel function from Python - the non-parallel version was accessed like this: from CyBlackP.CyBlackP import CyBlackP.
The code is below, with steps on how to build. Save the first file as CyBlackP.pyx.
[Note: the function exposed to Python here is CyBlackP, which converts the NumPy input arrays into C++ vectors through the helper functions, then passes those vectors to the C function CyBlackParallel, which runs with nogil and OpenMP. The results are then converted back to a NumPy array and returned from CyBlackP to Python.]
import numpy as np
cimport numpy as np
cimport cython
from cython.parallel cimport prange
from libcpp.vector cimport vector

cdef extern from "math.h" nogil:
    double exp(double)
    double log(double)
    double erf(double)
    double sqrt(double)

cdef double std_norm_cdf(double x) nogil:
    return 0.5*(1 + erf(x/sqrt(2.0)))

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cdef CyBlackParallel(vector[double] Black_PnL, vector[double] Black_S, vector[double] Black_Texpiry, vector[double] Black_strike, vector[double] Black_volatility, vector[double] Black_IR, vector[int] Black_callput):
    cdef int i
    N = Black_PnL.size()
    cdef double d1, d2

    for i in prange(N, nogil=True, num_threads=4, schedule='static'):
        d1 = ((log(Black_S[i] / Black_strike[i]) + Black_Texpiry[i] * (Black_volatility[i] * Black_volatility[i]) / 2)) / (Black_volatility[i] * sqrt(Black_Texpiry[i]))
        d2 = d1 - Black_volatility[i] * sqrt(Black_Texpiry[i])
        Black_PnL[i] = exp(-Black_IR[i] * Black_Texpiry[i]) * (Black_callput[i] * Black_S[i] * std_norm_cdf(Black_callput[i] * d1) - Black_callput[i] * Black_strike[i] * std_norm_cdf(Black_callput[i] * d2))

    return Black_PnL

cdef vector[double] arrayToVector(np.ndarray[np.float64_t, ndim=1] array):
    cdef long size = array.size
    cdef vector[double] vec
    cdef long i
    for i in range(size):
        vec.push_back(array[i])
    return vec

cdef vector[int] INTarrayToVector(np.ndarray[np.int64_t, ndim=1] array):
    cdef long size = array.size
    cdef vector[int] vec
    cdef long i
    for i in range(size):
        vec.push_back(array[i])
    return vec

cdef np.ndarray[np.float64_t, ndim=1] vectorToArray(vector[double] vec):
    cdef np.ndarray[np.float64_t, ndim=1] arr = np.zeros(vec.size())
    cdef long i
    for i in range(vec.size()):
        arr[i] = vec[i]
    return arr

@cython.boundscheck(False)
@cython.wraparound(False)
@cython.cdivision(True)
cpdef CyBlackP(np.ndarray[np.float64_t, ndim=1] PnL, np.ndarray[np.float64_t, ndim=1] S0, np.ndarray[np.float64_t, ndim=1] Texpiry, np.ndarray[np.float64_t, ndim=1] strike, np.ndarray[np.float64_t, ndim=1] volatility, np.ndarray[np.float64_t, ndim=1] IR, np.ndarray[np.int64_t, ndim=1] callput):
    cdef vector[double] Black_PnL, Black_S, Black_Texpiry, Black_strike, Black_volatility, Black_IR
    cdef np.ndarray[np.float64_t, ndim=1] Results
    cdef vector[int] Black_callput

    Black_PnL = arrayToVector(PnL)
    Black_S = arrayToVector(S0)
    Black_Texpiry = arrayToVector(Texpiry)
    Black_strike = arrayToVector(strike)
    Black_volatility = arrayToVector(volatility)
    Black_IR = arrayToVector(IR)
    Black_callput = INTarrayToVector(callput)

    Black_PnL = CyBlackParallel(Black_PnL, Black_S, Black_Texpiry, Black_strike, Black_volatility, Black_IR, Black_callput)

    Results = vectorToArray(Black_PnL)
    return Results
Save the next piece of code as setup.py for use by Cython:
try:
    from setuptools import setup
    from setuptools import Extension
except ImportError:
    from distutils.core import setup
    from distutils.extension import Extension

from Cython.Distutils import build_ext
import numpy as np

ext_modules = [Extension("CyBlackP", sources=["CyBlackP.pyx"],
                         extra_compile_args=['/Ot', '/openmp', '/favor:INTEL64', '/EHsc', '/GA'],
                         language='c++')]

setup(
    name='Generic model class',
    cmdclass={'build_ext': build_ext},
    include_dirs=[np.get_include()],
    ext_modules=ext_modules)
Then from a command prompt, type: python setup.py build_ext --inplace --compiler=msvc to build.
Any help on getting access to this function is appreciated; I'm not sure why I can't seem to locate it after compiling. I can import CyBlackP or from CyBlackP import *, but I can't get to the actual function to calculate the option values.
Here is a realistic NumPy test script to use if you want to test this Cython function:
BlackPnL = np.zeros(10000000)
Black_S=np.random.randint(200, 10000, 10000000)*0.01
Black_Texpiry=np.random.randint(1,500,10000000)*0.01
Black_strike=np.random.randint(1,100,10000000)*0.1
Black_volatility=np.random.rand(10000000)*1.2
Black_IR=np.random.rand(10000000)*0.1
Black_callput=np.sign(np.random.randn(10000000))
Black_callput=Black_callput.astype(np.int64)
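Once the module builds and imports correctly (see the answer below), the call itself would look something like this (a sketch; the argument order follows the cpdef signature above):

from CyBlackP import CyBlackP

Black_PnL = CyBlackP(BlackPnL, Black_S, Black_Texpiry, Black_strike,
                     Black_volatility, Black_IR, Black_callput)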
Okay, I figured out what was wrong by using Dependency Walker (http://www.dependencywalker.com/) on the CyBlackP.cp35-win_amd64.pyd file generated by Cython. It showed that 2 DLLs were not found: msvcp140_app.dll and vcomp140_app.dll, which are just the x64 versions of the MSVC OpenMP and CRT runtimes, C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\redist\x64\Microsoft.VC140.OpenMP\vcomp140.dll and C:\Program Files (x86)\Microsoft Visual Studio 14.0\VC\redist\x64\Microsoft.VC140.CRT\msvcp140.dll, renamed with _app inserted and copied to the \CyBlackP\ project directory. I also updated my setup.py like this, which gets rid of the annoying import statement (now just from CyBlackP import CyBlackP):
try:
    from setuptools import setup
    from setuptools import Extension
except ImportError:
    from distutils.core import setup
    from distutils.extension import Extension

from Cython.Distutils import build_ext
import numpy as np
import os

module = 'CyBlackP'

ext_modules = [Extension(module, sources=[module + ".pyx"],
                         extra_compile_args=['/Ot', '/favor:INTEL64', '/EHsc', '/GA', '/openmp'],
                         language='c++')]

setup(
    name=module,
    cmdclass={'build_ext': build_ext},
    include_dirs=[np.get_include(), os.path.join(np.get_include(), 'numpy')],
    ext_modules=ext_modules)
I am trying to speed up some calculations in Python by using Cython.
In my calculations I will be doing double loops or more, and I can't always use numpy vectorization, so I need to speed up the Python loops with Cython.
Here I benchmark a simple calculation, and so far it shows that Cython is 10 times slower than numpy. I am sure that numpy is optimized to the max, and I doubt I can beat its performance, but a factor of 10 slower means I am doing something wrong. Suggestions?
test.py
import numpy as np
from histogram import distances
import time

REPEAT = 10

def printTime(message, t):
    print "%s total: %.7f(s) --> average: %.7f(s) %.7f(Ms)" % (message, t, t/REPEAT, 1000000*t/REPEAT)

DATA = np.array(np.random.random((10000, 3)), dtype=np.float32)
POINT = np.array(np.random.random((1, 3)), dtype=np.float32)

# numpy histogram
r = REPEAT
startTime = time.clock()
while r:
    diff = (DATA-POINT) % 1
    diffNumpy = np.where(diff < 0, diff+1, diff)
    distNumpy = np.sqrt(np.add.reduce(diff**2, 1))
    r -= 1
printTime("numpy", time.clock()-startTime)

# cython test
r = REPEAT
startTime = time.clock()
while r:
    distCython = distances(POINT, DATA)
    r -= 1
printTime("cython", time.clock()-startTime)
histogram.pyx
import numpy as np
import cython
cimport cython
cimport numpy as np

DTYPE = np.float32
ctypedef np.float32_t DTYPE_C

@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
def distances(np.ndarray[DTYPE_C, ndim=2] point, np.ndarray[DTYPE_C, ndim=2] data):
    # declare variables
    cdef int i
    cdef float x, y, z
    cdef np.ndarray[DTYPE_C, mode="c", ndim=1] dist = np.empty((data.shape[0]), dtype=DTYPE)

    # loop
    for i from 0 <= i < data.shape[0]:
        # calculate distance
        x = (data[i,0]-point[0,0]) % 1
        y = (data[i,1]-point[0,1]) % 1
        z = (data[i,2]-point[0,2]) % 1

        # fold between 0 and 1
        if x < 0: x += 1
        if y < 0: y += 1
        if z < 0: z += 1

        # assign to array
        dist[i] = np.sqrt(x**2 + y**2 + z**2)

    return dist
setup.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy as np
setup(
ext_modules = cythonize("histogram.pyx"),
include_dirs=[np.get_include()]
)
To compile, do the following:

python setup.py build_ext --inplace

To launch the benchmark:

python test.py

My results are:
numpy total: 0.0153390(s) --> average: 0.0015339(s) 1533.9000000(Ms)
cython total: 0.1509920(s) --> average: 0.0150992(s) 15099.2000000(Ms)
Your problem is almost definitely
np.sqrt(x**2+y**2+z**2)
You should use the C sqrt function. It will look something like
from libc.math cimport sqrt
sqrt(x*x + y*y + z*z)
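Putting that together, the loop body from the question might end up looking like this (a sketch; only the sqrt call is changed relative to the original function):

from libc.math cimport sqrt

@cython.nonecheck(False)
@cython.boundscheck(False)
@cython.wraparound(False)
def distances(np.ndarray[DTYPE_C, ndim=2] point, np.ndarray[DTYPE_C, ndim=2] data):
    cdef int i
    cdef float x, y, z
    cdef np.ndarray[DTYPE_C, mode="c", ndim=1] dist = np.empty((data.shape[0]), dtype=DTYPE)

    for i in range(data.shape[0]):
        x = (data[i,0] - point[0,0]) % 1
        y = (data[i,1] - point[0,1]) % 1
        z = (data[i,2] - point[0,2]) % 1
        if x < 0: x += 1
        if y < 0: y += 1
        if z < 0: z += 1
        # C-level sqrt instead of the Python-level np.sqrt call
        dist[i] = sqrt(x*x + y*y + z*z)

    return dist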