What is going on in this malloc'ed array in Cython? - python

%%cython -f -c=-O3 -c=-fopenmp --link-args=-fopenmp
from cython.parallel import parallel, prange
from libc.stdlib cimport abort, malloc, free
# Module-level C variables for the demo. `idx` is declared here but is not
# used anywhere in the code shown.
cdef int idx, i, n = 100
cdef int k
cdef int * local_buf
cdef int size = 10


# Subtracts j from lb[j] for every j in [0, size), in place. Reads the
# module-level `size`, so the caller must pass a buffer of at least `size`
# ints. The update is a plain read-modify-write with no synchronization.
cdef void func(int* lb) nogil:
    cdef int j
    for j in xrange(size):
        lb[j] += -1*j


# NOTE(review): the buffer is allocated exactly once, before the parallel
# block is entered, so every thread works with this one pointer value.
local_buf = <int *> malloc(sizeof(int) * size)

with nogil, parallel():
    if local_buf == NULL:
        abort()

    # populate our local buffer in a sequential loop
    # (a plain xrange loop is not work-shared; as laid out here, each thread
    # of the team executes all `size` iterations on the same buffer)
    for i in xrange(size):
        local_buf[i] = i * 2

    # share the work using the thread-local buffer(s)
    # NOTE(review): despite the wording above, there is a single buffer in
    # this version. Different prange iterations may run func() on it at the
    # same time, and one thread may still be re-populating it while another
    # is already here -- presumably the source of the run-to-run variation;
    # confirm by allocating one buffer per thread inside the block.
    for k in prange(n, schedule='guided'):
        func(local_buf)

# Printing needs the GIL, so the buffer is dumped after the nogil block.
for i in xrange(size):
    print local_buf[i]
free(local_buf)
0
-98
-196
-294
-392
-490
-588
-686
-784
-882
edit:
The above block shows the output after one run, but the contents of local_buf seem to change every other re-run or so. What's going on?

The result there seems reasonable with the code given, do you actually get different result each run?
This should be regular python equivalent:
# Sequential reference for the Cython snippet: same sizes, same arithmetic,
# no threads involved.
size = 10
n = 100

# Same starting values the Cython code writes: local_buf[i] = i * 2.
lst = [i * 2 for i in range(size)]

# Each of the n passes subtracts j from slot j, so slot j ends at 2*j - n*j.
for _ in range(n):
    for j in range(size):
        lst[j] -= j

# The call form of print works on both Python 2 and Python 3.
print(lst)
# [0, -98, -196, -294, -392, -490, -588, -686, -784, -882]

Related

Can this problem be implemented in parallel in Cython with OpenMP?

I have parallelised some Cython code with OpenMP. Once in a while, the code computes wrong results.
I created a nearly minimal working example of my problem. "Nearly", because the frequency of the wrong results seems to depend on even the tiniest changes in the code; thus, for example, I kept the function pointers in.
The Cython code is
#cython: language_level=3, boundscheck=False, wraparound=False, cdivision=True
# distutils: language = c++
import numpy as np
cimport cython
from cython.parallel import prange, parallel
from libcpp.vector cimport vector
cimport numpy as np
cdef inline double estimator_matheron(const double f_diff) nogil:
    """Return the square of ``f_diff``.

    Callable without the GIL; it has no side effects.
    """
    cdef double squared = f_diff * f_diff
    return squared
ctypedef double (*_estimator_func)(const double) nogil
cdef inline void normalization_matheron(
    vector[double]& variogram,
    vector[long]& counts,
    const int variogram_len
):
    """Divide each of the first ``variogram_len`` sums by twice its count.

    Both vectors are modified in place: a count of 0 is first replaced
    by 1, then ``variogram[k]`` is divided by ``2 * counts[k]``.
    """
    cdef int lag
    for lag in range(variogram_len):
        # An empty bin gets a count of 1 so the division below is defined.
        if counts[lag] == 0:
            counts[lag] = 1
        variogram[lag] /= (2. * counts[lag])
ctypedef void (*_normalization_func)(vector[double]&, vector[long]&, const int)
def test(const double[:] f):
    # For every lag j, sums estimator_func(f[i] - f[i+j]) over all valid i
    # and counts the pairs, then normalizes and returns the per-lag values
    # as a NumPy array.
    cdef _estimator_func estimator_func = estimator_matheron
    cdef _normalization_func normalization_func = normalization_matheron
    cdef int i_max = f.shape[0] - 1
    cdef int j_max = i_max + 1
    # `variogram` and `counts` are declared but never used in this version.
    cdef vector[double] variogram_local, variogram
    cdef vector[long] counts_local, counts
    cdef int i, j
    with nogil, parallel():
        # NOTE(review): the "_local" vectors are declared at function scope
        # and both loops are plain range loops, so no iterations are divided
        # among threads: every thread resizes and accumulates into the same
        # two vectors, without synchronization.
        variogram_local.resize(j_max, 0.0)
        counts_local.resize(j_max, 0)
        for i in range(i_max):
            for j in range(1, j_max-i):
                counts_local[j] += 1
                variogram_local[j] += estimator_func(f[i] - f[i+j])
    # The normalization function type is not declared nogil, so this call is
    # placed after the nogil block.
    normalization_func(variogram_local, counts_local, j_max)
    return np.asarray(variogram_local)
To test the code, I used this script:
import numpy as np
from cython_parallel import test
# Ten-sample input series used to reproduce the intermittent wrong result.
z = np.array(
    [41.2, 40.2, 39.7, 39.2, 40.1, 38.3, 39.1, 40.0, 41.1, 40.3],
    dtype=np.double,
)
print(test(z))
The result should be
[0. 0.49166667 0.7625 1.09071429 0.90166667 1.336
0.9525 0.435 0.005 0.405 ]
This is what a wrong result typically looks like
[0. 0.44319444 0.75483871 1.09053571 0.90166667 1.336
0.9525 0.435 0.005 0.405 ]
This code mainly sums up numbers into the vector variogram_local. Most of the time this code works but, although I have not gathered enough runs for solid statistics, wrong results are produced roughly every 30th time. It always works if I change the line `with nogil, parallel():` to `with nogil:`. It also always works if I don't use the function pointers at all, like this:
# Variant of the body of test() with the estimator and the normalization
# written inline instead of being called through function pointers.
with nogil, parallel():
    variogram_local.resize(j_max, 0.0)
    counts_local.resize(j_max, 0)
    for i in range(i_max):
        for j in range(1, j_max-i):
            counts_local[j] += 1
            variogram_local[j] += (f[i] - f[i+j]) * (f[i] - f[i+j])
# NOTE(review): the normalization loop is assumed to sit after the parallel
# block, mirroring the function-pointer version -- confirm against the
# original post, whose indentation was lost.
for j in range(j_max):
    # A zero count is replaced by 1 so the division is defined.
    if counts_local[j] == 0:
        counts_local[j] = 1
    variogram_local[j] /= (2. * counts_local[j])
return np.asarray(variogram_local)
The full code is tested on different platforms and these problems mainly occur on macOS with clang, e.g.:
https://ci.appveyor.com/project/conda-forge/staged-recipes/builds/29018878
EDIT
Thanks to your input, I modified the code and with num_threads=2 it works. But as soon as num_threads>2 I get wrong results again. Do you think that, if Cython support for OpenMP would be perfect, my new code should work or am I still getting something wrong?
If this should be on Cython's side, I guess I will indeed implement the code in pure C++.
def test(const double[:] f):
    """Return, per lag j, the sum of squared differences over 2 * pair count.

    For every pair (i, i + j) inside ``f`` the squared difference
    ``(f[i] - f[i+j]) ** 2`` is added to the bin for lag j and the bin's
    counter is incremented. Bins with a zero count get a count of 1 before
    the division. The result is returned as a NumPy array of length
    ``f.shape[0]``.
    """
    cdef int i_max = f.shape[0] - 1
    cdef int j_max = i_max + 1
    cdef vector[double] variogram_local, variogram
    cdef vector[long] counts_local, counts
    cdef int i, j, k

    # Shared accumulators, sized and zeroed before the threads start.
    variogram.resize(j_max, 0.0)
    counts.resize(j_max, 0)

    with nogil, parallel(num_threads=2):
        # Assigned inside the parallel block so that each thread is meant
        # to get its own scratch vectors.
        variogram_local = vector[double](j_max, 0.0)
        counts_local = vector[long](j_max, 0)

        # The outer loop is the one divided among the threads.
        for i in prange(i_max):
            for j in range(1, j_max-i):
                counts_local[j] += 1
                variogram_local[j] += (f[i] - f[i+j]) * (f[i] - f[i+j])

        # NOTE(review): this merge runs in every thread and updates the
        # shared vectors with a plain +=; nothing serializes the threads
        # here, so concurrent updates to the same element can be lost.
        for k in range(j_max):
            counts[k] += counts_local[k]
            variogram[k] += variogram_local[k]

    for i in range(j_max):
        # A zero count is replaced by 1 so the division is defined.
        if counts[i] == 0:
            counts[i] = 1
        variogram[i] /= (2. * counts[i])
    return np.asarray(variogram)
Contrary to their name, variogram_local and counts_local are not actually local. They are shared and all threads mess around with them in parallel, hence the undefined result.
Note that you don't actually share any work. It's just all threads doing the same thing - the whole serial task.
A somewhat sensible parallel version would look more like this:
# Size and zero the shared result vectors once, before the parallel block.
variogram.resize(j_max, 0.0)
counts.resize(j_max, 0)
with nogil, parallel():
    # Every thread walks the full outer loop ...
    for i in range(i_max):
        # ... while prange divides the j iterations among the threads, so
        # within one pass a given element j is updated by a single thread.
        for j in prange(1, j_max-i):
            counts[j] += 1
            variogram[j] += estimator_func(f[i] - f[i+j])
The shared arrays are initialized outside and then the threads share the inner j-loop. Since no two threads will ever work on the same j, this is safe to do.
Now it may not be ideal to parallelize the inner loop. If you were to actually parallelize the outer loop, you would have to in fact make actual local variables and merge/reduce them afterwards.
The problem with your modified code is that you have a race condition in the section which adds up counts_local and variogram_local. You want this in the parallel block (so that you still have access to the thread-local variables) but you only want one thread at a time to be working on it. The easiest way is to put it in a with gil: block so that Python enforces the "one thread at a time":
# Holding the GIL for the merge lets only one thread at a time add its
# per-thread partial sums into the shared vectors.
with gil:
    for k in range(j_max):
        counts[k] += counts_local[k]
        variogram[k] += variogram_local[k]
This bit should hopefully be a quick task at the end, so shouldn't take too long.
If it were in C/C++ you'd probably use #pragma omp atomic or #pragma omp critical instead for the block. It's difficult to do this in Cython since its OpenMP support is quite basic, but you probably could abuse wrapped C macros to make the addition atomic.
Cython's OpenMP support is really geared around simple loops and scalar reductions. If you're doing more than that then it doesn't have the syntax to give you fine control of OpenMP and for this reason I'd tend to recommend writing your critical OpenMP functions in C or C++ (whichever you're more comfortable with).

How to use parallelism in cython

I'm trying to apply parallelism to the following algorithm. This should be easily parallelizable, as the calculations are independent for the first three dimensions (b, i, j).
def nb_forward(np.ndarray[FLOAT64, ndim=4] inputv, np.ndarray[FLOAT64, ndim=4] kernels, np.ndarray[FLOAT64, ndim=1] bias, tuple stride):
    # Strided sliding-window sum of products: for each batch entry b, output
    # position (i, j) and filter n, accumulates inputv * kernels over the
    # kernel window and the m axis, then adds bias[n].
    # As indexed below: inputv is [b, row, col, m], kernels is [m, kw, kh, n],
    # and the result is [b, i, j, n].
    cdef unsigned int kernel_size0 = kernels.shape[1], kernel_size1 = kernels.shape[2], \
        stride0 = stride[0], stride1 = stride[1], \
        num_dim = kernels.shape[0], \
        num_filters = kernels.shape[3], \
        batch_size = inputv.shape[0]
    # Number of window positions along each of the two strided axes.
    cdef unsigned int out_size0 = (inputv.shape[1] - kernel_size0) / stride0 + 1, \
        out_size1 = (inputv.shape[2] - kernel_size1) / stride1 + 1
    cdef double[:, :, :, :] out = np.empty(shape=[batch_size, out_size0, out_size1, num_filters], dtype=np.float64)
    cdef unsigned int b, i, j, m, kw, kh, n
    cdef unsigned int iin, jin
    cdef double acc
    with nogil, parallel():
        # Only the batch axis is divided among the threads.
        for b in prange(batch_size):
            for i in range(out_size0):
                for j in range(out_size1):
                    # Top-left corner of the current window in the input.
                    iin = i*stride0
                    jin = j*stride1
                    for n in range(num_filters):
                        acc = 0.
                        for kw in range(kernel_size0):
                            for kh in range(kernel_size1):
                                for m in range(num_dim):
                                    # NOTE(review): the in-place += inside
                                    # the prange body is presumably what
                                    # makes Cython treat `acc` as a
                                    # reduction variable -- confirm.
                                    acc += inputv[b, iin + kw, jin + kh, m] * kernels[m, kw, kh, n]
                        # Reading `acc` here is the read that the reported
                        # "Cannot read reduction variable in loop body"
                        # error refers to.
                        out[b, i, j, n] = acc + bias[n]
    return out
Error:
Cannot read reduction variable in loop body
Initially I tried to parallelize only at the level of b, since parallelizing at level b, i, j is at the pixel level and I do not know if it is worth generating as many threads. But I have not succeeded.
I tried to use a temporary array out_batch, but being a numpy array, it is giving me a lot of problems and
Error: malloc problems
I have also tried instead of using numpy array using double arrays (double [:,:,:]) but it gives:
Error: Memoryview slices can only be shared in parallel sections
Does anyone have an idea? Is there any way to apply nogil at the level of b, i, j (or only b) and then compact the data?
Obviously the variable acc is shared between all threads, and thus race conditions can occur - Cython rightly refuses to compile this code.
The variable acc shouldn't be shared between the threads, but be private to a thread. However, to my limited knowledge, there is no way to do it with cython yet (not sure what happened with this proposal).
A usual workaround is to allocate a large enough working array tmp and to accumulate the value for the i-th thread in tmp[i]. Often enough (but not always) arrays that already exist can be used for this purpose, as in your case - by replacing acc with out[b,i,j,n]:
# Replacement for the body of the (b, i, j) loops: the output element itself
# serves as the accumulator, so no separate scalar is needed.
for n in range(num_filters):
    out[b, i, j, n] = 0.
    for kw in range(kernel_size0):
        for kh in range(kernel_size1):
            for m in range(num_dim):
                out[b, i, j, n] += inputv[b, iin + kw, jin + kh, m] * kernels[m, kw, kh, n]
    # The bias is added once, after the whole window has been summed.
    out[b, i, j, n] += bias[n]

Cython No Performance Increase with prange/parallel

I'm using Cython version 0.27.3 to compile the following source for a simple primality testing module that contains both python and cython implementations of the same algorithm. When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. Is there something that's preventing this from running in parallel?
The function in question is the cdef void _getprimes which accepts a memoryview slice as a parameter and should set all non-prime values to 0 in that slice.
primes.pyx
#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math
# =====================
# Python implementation
# =====================
def pyisprime(n):
    """Return True if *n* is prime, using trial division.

    Values below 2 and even values other than 2 are rejected up front.
    Every remaining candidate is tested against each integer from 2 up to
    and including ``int(math.sqrt(n))``.
    """
    if n == 2:
        return True
    if n < 2 or n & 1 == 0:
        return False
    for divisor in range(2, int(math.sqrt(n)) + 1):
        if n % divisor == 0:
            return False
    return True
def pygetprimes(nums):
    """Return a list of the entries of *nums* that pyisprime accepts, in order."""
    return list(filter(pyisprime, nums))
# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Return 1 if ``n`` is prime and 0 otherwise, by trial division.

    2 is accepted directly; values below 2 and other even values are
    rejected. Odd candidates are divided by every integer from 3 up to
    and including ``ceil(sqrt(n))``.
    """
    cdef unsigned long long limit
    cdef unsigned long long divisor = 3
    if n == 2:
        return 1
    if n < 2 or n & 1 == 0:
        return 0
    limit = <unsigned long long>ceil(sqrt(<double>n))
    while divisor <= limit:
        if n % divisor == 0:
            return 0
        divisor += 1
    return 1
def isprime(unsigned long long n):
    """Python-callable entry point for ``_isprime``.

    Releases the GIL around the check and returns its int result
    (1 for prime, 0 otherwise).
    """
    cdef int verdict
    with nogil:
        verdict = _isprime(n)
    return verdict
cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    """Overwrite with 0, in place, every entry of ``nums`` that is not prime.

    The index range is divided among ``threads`` threads with a dynamic
    schedule. Each iteration reads and writes only its own element, so
    iterations do not touch each other's data.
    """
    # Py_ssize_t matches the type of nums.shape[0]; an int index could not
    # address a very large view.
    cdef Py_ssize_t idx
    with parallel(num_threads=threads):
        for idx in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[idx]) == 0:
                nums[idx] = 0
def getprimes(nums, int threads = 1):
    """Return the primes found in ``nums``, in order, as a list.

    The values are copied into an unsigned 64-bit array, non-primes are
    zeroed by ``_getprimes`` with the GIL released, and the non-zero
    survivors are collected.
    """
    cdef unsigned long long value
    cdef unsigned long long[:] sieve_buf = array.array("Q", nums)
    with nogil:
        _getprimes(sieve_buf, threads)
    return [value for value in sieve_buf if value != 0]
setup.py
#!/usr/bin/env python3
from distutils.core import setup
from Cython.Build import cythonize
# Builds primes.pyx with default settings: cythonize() receives only the
# source path, and no extra compile or link arguments are passed.
setup(
    name="primes",
    ext_modules=cythonize('primes.pyx'),
)
test.py
#!/usr/bin/env python3
import functools
import random
import time
import primes
def timed(func):
    """Wrap *func* so that every call prints its name and elapsed seconds.

    The wrapper forwards all positional and keyword arguments, returns
    whatever *func* returns, and carries *func*'s metadata through
    ``functools.wraps``.
    """
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        # perf_counter is a monotonic, high-resolution clock meant for
        # measuring intervals; time.time() can jump if the system clock is
        # adjusted during the call.
        start = time.perf_counter()
        val = func(*args, **kwargs)
        elapsed = time.perf_counter() - start
        print(func.__name__, elapsed)
        return val
    return wrapped
def main():
    """Time the Python and Cython prime filters on one random input.

    Runs the pure-Python filter once and the Cython filter with 1 and
    with 4 threads, then asserts that all three results are equal.
    """
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]
    py_filter = timed(primes.pygetprimes)
    cy_filter = timed(primes.getprimes)
    reference = py_filter(nums)
    one_thread = cy_filter(nums, 1)
    four_threads = cy_filter(nums, 4)
    assert reference == one_thread == four_threads


if __name__ == "__main__":
    main()
When I run cyfoo, I expected that increasing the number of threads from 1 to 4 would show some type of speed increase, but this is not the case:
[aarcher@Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295
It seems you need to enable compiler flags for OpenMP for the parallel statements to actually do anything.
See cython docs here
http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling
# setup.py
# ... omitted ...
ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        # The OpenMP flag is passed to the compile step and to the link step.
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]

Cython function memory leak

I have converted a python function to a cython one. Now the function works as it is supposed to. But I am getting a lot of memory leak when the main program calls this function multiple times. I have freed the memory that I allocated dynamically, but it does not seem to work.
What am I doing wrong here?
from cpython.mem cimport PyMem_Malloc, PyMem_Free
def longest_common_substring(refWord, stemWord):
    # Returns the longest run of characters of refWord that also occurs
    # contiguously in stemWord, as a slice of refWord. Uses a
    # (len(refWord)+1) x (len(stemWord)+1) table where m[x][y] is the length
    # of the common run ending at refWord[x-1] and stemWord[y-1].
    cdef:
        int longest, x_longest
        int x, y, k
        Py_ssize_t lengthRefWord
        Py_ssize_t lengthStemWord
        # NOTE(review): these two buffers are obtained here, but nothing in
        # this function passes them to PyMem_Free; the cleanup at the end
        # only covers `m` and its rows.
        wchar_t *referenceWord = PyUnicode_AsWideCharString(refWord, &lengthRefWord)
        wchar_t *stemmableWord = PyUnicode_AsWideCharString(stemWord, &lengthStemWord)
        int t1 = lengthRefWord+1
        int t2 = lengthStemWord+1
        # NOTE(review): no allocation result in this function is checked
        # for NULL before use.
        int **m = <int **> PyMem_Malloc(t1 * sizeof(int *))
        wchar_t tempChar1;
        wchar_t tempChar2;
    longest = 0
    x_longest = 0
    # Allocate the t1 rows of the table, then zero every cell.
    for k in range(t1):
        m[k] = <int *> PyMem_Malloc(t2 * sizeof(int))
    for x in range(0, t1):
        for y in range(0, t2):
            m[x][y] = 0
    # Row 0 and column 0 stay zero and act as the border for the
    # m[x - 1][y - 1] lookup.
    for x in range(1, t1):
        for y in range(1, t2):
            tempChar1 = referenceWord[x - 1]
            tempChar2 = stemmableWord[y - 1]
            if tempChar1 == tempChar2:
                # Extend the common run that ended one character earlier.
                m[x][y] = m[x - 1][y - 1] + 1
                if m[x][y] > longest:
                    longest = m[x][y]
                    x_longest = x
            else:
                m[x][y] = 0
    # Release each row, then the array of row pointers.
    for k in range(t1):
        PyMem_Free(m[k])
    PyMem_Free(m)
    return refWord[x_longest - longest: x_longest]
PyUnicode_AsWideCharString allocates memory that you have to free. The documentation says
Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) on success.
You get two strings from this function but free neither of them.

Cython numpy array indexing

I am trying to speed up some python code with cython, and I'm making use of cython's -a option to see where I can improve things. My understanding is that in the generated html file, the highlighted lines are ones where python functions are called - is that correct?
In the following trivial function, I have declared the numpy array argument arr using the buffer syntax. I thought that this allows indexing operations to take place purely in C without having to call python functions. However, cython -a (version 0.15) highlights the line where I set the value of an element of arr, though not the one where i read one of its elements. Why does this happen? Is there a more efficient way of accessing numpy array elements?
import numpy
cimport numpy
def foo(numpy.ndarray[double, ndim=1] arr not None):
    # Adds 1.0 to each of the first ten elements of `arr`, in place.
    # The read and the write are on separate lines so that the annotated
    # output shows which of the two gets highlighted.
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i] #not highlighted
        arr[i] = 1.0 + elem #highlighted
EDIT: Also, how does the mode buffer argument interact with numpy? Assuming I haven't changed the order argument of numpy.array from the default, is it always safe to use mode='c'? Does this actually make a difference to performance?
EDIT after delnan's comment: arr[i] += 1 also gets highlighted (that is why I split it up in the first place, to see which part of the operation was causing the issue). If I turn off bounds checking to simplify things (this makes no difference to what gets highlighted), the generated c code is:
/* "ct.pyx":11
* cdef int i
* cdef double elem
* for i in xrange(10): # <<<<<<<<<<<<<<
* elem = arr[i]
* arr[i] = 1.0 + elem
*/
for (__pyx_t_1 = 0; __pyx_t_1 < 10; __pyx_t_1+=1) {
__pyx_v_i = __pyx_t_1;
/* "ct.pyx":12
* cdef double elem
* for i in xrange(10):
* elem = arr[i] # <<<<<<<<<<<<<<
* arr[i] = 1.0 + elem
*/
__pyx_t_2 = __pyx_v_i;
__pyx_v_elem = (*__Pyx_BufPtrStrided1d(double *, __pyx_bstruct_arr.buf, __pyx_t_2, __pyx_bstride_0_arr));
/* "ct.pyx":13
* for i in xrange(10):
* elem = arr[i]
* arr[i] = 1.0 + elem # <<<<<<<<<<<<<<
*/
__pyx_t_3 = __pyx_v_i;
*__Pyx_BufPtrStrided1d(double *, __pyx_bstruct_arr.buf, __pyx_t_3, __pyx_bstride_0_arr) = (1.0 + __pyx_v_elem);
}
The answer is that the highlighter fools the reader.
I compiled your code and the instructions generated under the highlight are those needed
to handle the error cases and the return value, they are not related to the array assignment.
Indeed if you change the code to read :
def foo(numpy.ndarray[double, ndim=1] arr not None):
    # Same loop as in the question: adds 1.0 to each of the first ten
    # elements of `arr`, in place. The only change is the explicit return
    # on the last line.
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i]
        arr[i] = 1.0 + elem
    return # + add this
The highlight would then be on the last line and no longer on the assignment.
You can further speed up your code by using the @cython.boundscheck(False) decorator:
import numpy
cimport numpy
cimport cython
@cython.boundscheck(False)
def foo(numpy.ndarray[double, ndim=1] arr not None):
    """Add 1.0 to each of the first ten elements of ``arr``, in place.

    Bounds checking is switched off for this function by the decorator,
    so the caller must pass an array with at least ten elements.
    """
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i]
        arr[i] = 1.0 + elem
    return

Categories

Resources