Cython: No Performance Increase with prange/parallel
I'm using Cython version 0.27.3 to compile the following source for a simple primality-testing module that contains both Python and Cython implementations of the same algorithm. When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. Is something preventing this from running in parallel?
The function in question is _getprimes, a cdef void function that accepts a memoryview slice as a parameter and should set all non-prime values in that slice to 0.
primes.pyx
#cython: boundscheck=False, wraparound=False, nonecheck=False

cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf

import math


# =====================
# Python implementation
# =====================
def pyisprime(n):
    """Python implementation"""
    if n < 2 or n & 1 == 0:
        if n == 2:
            return True
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True


def pygetprimes(nums):
    return [num for num in nums if pyisprime(num)]


# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Cython implementation of a simple primality check"""
    cdef unsigned long long upper
    cdef unsigned long long i = 3
    cdef int prime = 1
    if n < 2 or n & 1 == 0:
        if n == 2:
            return 1
        return 0

    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            prime = 0
            break
        i += 1
    return prime


def isprime(unsigned long long n):
    """Wrapper for _isprime"""
    cdef int result
    with nogil:
        result = _isprime(n)
    return result


cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    cdef unsigned long num
    cdef int i = 0
    with parallel(num_threads=threads):
        for i in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[i]) == 0:
                nums[i] = 0


def getprimes(nums, int threads = 1):
    """Wrapper for _getprimes"""
    cdef unsigned long long num
    cdef unsigned long long[:] primes = array.array("Q", nums)
    with nogil:
        _getprimes(primes, threads)
    return [num for num in primes if num != 0]
setup.py
#!/usr/bin/env python3

from distutils.core import setup
from Cython.Build import cythonize

setup(
    name="primes",
    ext_modules=cythonize('primes.pyx'),
)
test.py
#!/usr/bin/env python3

import functools
import random
import time

import primes


def timed(func):
    def wrapped(*args, **kwargs):
        start = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(func.__name__, end - start)
        return val
    return functools.wraps(func)(wrapped)


def main():
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]
    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)

    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    assert x == y == z


if __name__ == "__main__":
    main()
When I run cyfoo, I expected that increasing the number of threads from 1 to 4 would show some kind of speedup, but this is not the case:
[aarcher#Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295
It seems you need to enable the OpenMP compiler and linker flags for the parallel statements to actually do anything.
See the Cython docs here:
http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling
# setup.py
# ... omitted ...

ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]
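Adapted to the setup.py from the question, a minimal sketch could look like the following (my adaptation, not from the linked docs; it assumes a GCC/Clang-style toolchain, since MSVC spells the flag /openmp rather than -fopenmp):

#!/usr/bin/env python3

from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize

ext_modules = [
    Extension(
        "primes",
        ["primes.pyx"],
        extra_compile_args=["-fopenmp"],  # let the C compiler generate OpenMP code
        extra_link_args=["-fopenmp"],     # link against the OpenMP runtime
    )
]

setup(
    name="primes",
    ext_modules=cythonize(ext_modules),
)

After rebuilding with these flags, the threads argument should actually fan the prange loop out over multiple OpenMP threads.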
Related
Cython problem: cimport libcpp.vector not compiled
I'm trying to use Cython to speed up my code. Since I'm working with an array of strings, I want to use string and vector from C++, but I have problems compiling as soon as I cimport the C++ libraries. As an example, I tried to implement the primes example from here: https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html. So, my code is:

from libcpp.vector cimport vector

def primes(unsigned int nb_primes):
    cdef int n, i
    cdef vector[int] p
    p.reserve(nb_primes)  # allocate memory for 'nb_primes' elements.

    n = 2
    while p.size() < nb_primes:  # size() for vectors is similar to len()
        for i in p:
            if n % i == 0:
                break
        else:
            p.push_back(n)  # push_back is similar to append()
        n += 1

    # Vectors are automatically converted to Python
    # lists when converted to Python objects.
    return p

I save this code as test_char.pyx. For compilation I use this:

from Cython.Build import cythonize
setup(name='test_char',
      ext_modules = cythonize('test_char.pyx')
)

After that I get test_char.c, but I don't get a compiled test_char module. If I use this code instead (without the cimport):

def primes(int nb_primes):
    cdef int n, i, len_p
    cdef int p[1000]
    if nb_primes > 1000:
        nb_primes = 1000

    len_p = 0  # The current number of elements in p.
    n = 2
    while len_p < nb_primes:
        # Is n prime?
        for i in p[:len_p]:
            if n % i == 0:
                break
        # If no break occurred in the loop, we have a prime.
        else:
            p[len_p] = n
            len_p += 1
        n += 1

    # Let's return the result in a python list:
    result_as_list = [prime for prime in p[:len_p]]
    return result_as_list

everything works. So, please, any ideas?
Using an Extension with language="c++" and passing it to cythonize solves the problem:

from distutils.extension import Extension

extensions = [
    Extension(
        "test_char",
        ["test_char.pyx"],
        language="c++",
    )
]

setup(
    name="test_char",
    ext_modules=cythonize(extensions),
)
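For completeness (my own fill-in, not part of the answer): the snippet assumes the usual imports at the top of setup.py, and the extension then has to be built before it can be imported.

# additional lines assumed at the top of setup.py
from distutils.core import setup
from Cython.Build import cythonize

# build the importable test_char extension next to the source with:
#   python setup.py build_ext --inplace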
Wrong calculation when using Cython [duplicate]
This question already has answers here: C left shift on 64 bits fail (2 answers). Closed 2 years ago.
I implemented the Lucas-Lehmer primality test to check Mersenne primes in Python, then used Cython to speed up the calculation.
Original Python code:

def lucas_lehmer(p):
    if p == 2:
        return True
    s = 4
    M = (1 << p) - 1
    for i in range(p-2):
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100*i//(p-2)))
    if s == 0:
        return True
    else:
        return False

Cython code:

cpdef lucas_lehmer(int p):
    if p == 2:
        return True
    cdef unsigned long long int M
    M = (1 << p) - 1
    cdef unsigned long long int s
    s = 4
    cdef int i
    for i in range(p-2):
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100*i//(p-2)))
    if s == 0:
        return True
    else:
        return False

Running the original Python code, it works correctly. But the Cython version is only correct for p = 31 and lower; testing with p = 61 and bigger (all tested p values are values for which 2^p - 1 is prime), it returns False (not a prime number), except for p = 86243. For some p like 97, even though 2^97 - 1 is not a prime number, the program actually returns True (is a prime number), which is a contradiction. Why does this happen? Without using cdef for the variables M and s, the calculation is correct, but the performance doesn't improve at all.
Running a few tests on your code I found that M was always equal to 1, so I declared p with cdef and got the required result. I'm not sure exactly what the issue is, but it has something to do with that bit operation on p: p needs to be of the same type as M for the shift to make sense, and if one is a cdef variable and the other a Python int it somehow doesn't work.

cpdef lucas_lehmer(int py):
    cdef p
    p = py
    if p == 2:
        return True
    cdef M
    M = (1 << p) - 1
    cdef s
    s = 4
    cdef int i
    for i in range(p-2):
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100*i//(p-2)))
    if s == 0:
        return True
    else:
        return False
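For reference, the duplicate this question was closed against (C left shift on 64 bits fail) points at the immediate bug: once the operands are C-typed, 1 << p is evaluated as a 32-bit C shift, which overflows for p >= 32. A minimal sketch of that fix (my own, not from the answer above) is to force a 64-bit shift while keeping the C types; note that s * s itself still overflows 64 bits once M exceeds 2**32, so for really large exponents you still need Python integers or a wider type.

cpdef lucas_lehmer_64(int p):
    # Hypothetical variant, not from the original post: casting the literal
    # makes the shift 64-bit, avoiding the 32-bit overflow in (1 << p).
    if p == 2:
        return True
    cdef unsigned long long M = (<unsigned long long>1 << p) - 1
    cdef unsigned long long s = 4
    cdef int i
    for i in range(p - 2):
        # NOTE: (s * s) overflows 64 bits once M > 2**32, so this sketch is
        # only exact for small p.
        s = ((s * s) - 2) % M
    return s == 0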
Performance drop using Cython
I wanted to make some code faster using Cython's capability to use efficient indexing: http://docs.cython.org/src/tutorial/numpy.html
Basically the code represents the dependency of buttons on a game board of the game http://www.hacker.org/cross/index.php

# file test_so_cy.pyx
import time
import numpy as np
cimport numpy as np

DTYPE = np.uint8
ctypedef np.uint8_t DTYPE_t

def time_fmt(td):
    return "{:.2f} s".format(td)

def derive_equations(np.ndarray[DTYPE_t, ndim=2] field not None):
    cdef unsigned int n, m, i, j, x, y
    t1 = time.time()
    n, m = len(field), len(field[0])

    # generate equations for dimensions n and m
    eqs = []
    block = 2  # as soon as a 2 is hit there isnt any influence
    for i in xrange(n):
        for j in xrange(m):
            eq = 0L
            if field[i][j] == block:
                eqs.append([i*m+j, field[i][j], eq])
                continue

            # rows upwards
            for x in xrange(i-1, -1, -1):
                if field[x][j] == block:
                    break
                eq ^= 1L << (x*m+j)

            # rows downwards
            for x in xrange(i, n):
                if field[x][j] == block:
                    break
                eq ^= 1L << (x*m+j)

            # cols left
            for y in xrange(j-1, -1, -1):
                if field[i][y] == block:
                    break
                eq ^= 1L << (i*m+y)

            # cols right
            # j+1 to avoid resetting the influence of itself
            for y in xrange(j+1, m):
                if field[i][y] == block:
                    break
                eq ^= 1L << (i*m+y)

            eqs.append([i*m+j, field[i][j], eq])

    t2 = time.time()
    print 'preprocess time:', time_fmt(t2 - t1)
    return n, m, eqs

def main():
    field = np.array(
        [[0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,2,1,0,0,2,1,0,1,1,0,0,0,0,0],
         [0,1,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,1,1,0,1,1,1],
         [1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,0,2],
         [0,0,0,0,1,0,1,1,0,1,1,1,0,1,0,1,1,0,0,0,1,1,0,0,2,1,1,0,1],
         [0,1,0,1,1,1,1,1,2,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,1],
         [0,1,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1],
         [0,0,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,1],
         [1,0,1,0,1,1,0,0,0,0,0,1,0,0,2,0,1,1,0,0,0,0,1,0,0,2,1,0,0],
         [1,0,1,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1],
         [0,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,2],
         [1,0,1,1,0,0,1,0,1,1,1,0,1,2,1,1,1,2,1,0,1,1,1,0,0,0,0,0,0],
         [0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1],
         [1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0],
         [1,1,1,0,1,1,1,1,0,0,1,0,1,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,1],
         [1,0,0,0,1,1,0,0,2,0,1,1,2,0,0,1,0,1,0,1,0,2,1,1,1,1,0,0,2],
         [1,0,1,1,1,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,2,1,0,1,0,1,0,1,1],
         [0,0,1,1,1,0,0,0,0,0,2,1,0,1,0,1,0,1,1,1,1,0,0,1,1,1,1,0,1],
         [0,1,0,1,2,0,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0],
         [0,1,0,0,2,0,0,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,1],
         [1,0,0,1,0,0,1,0,1,0,0,2,0,1,1,1,1,1,0,0,1,0,1,0,1,1,0,1,1],
         [0,0,1,0,1,1,0,0,1,0,0,0,1,1,1,0,0,1,0,0,1,0,1,2,0,1,1,0,2],
         [0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,1,0,1,0,1,1,1,1,1,2,0,1,2,0],
         [0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,1,1,2,0,0,1,0,0,1,1,0],
         [0,0,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,0,0,0,1,1,1,0,1],
         [0,2,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,1,1,1,0,1,0,0,0,1,1],
         [0,2,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,0,1,1],
         [0,1,1,1,0,1,0,0,0,1,0,2,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0,1,0],
         [0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0],
         [1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,2,1,1]],
        dtype=DTYPE)
    derive_equations(field)

if __name__ == '__main__':
    main()

# file setup_so.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name = "test_so",
    ext_modules = cythonize('test_so_cy.pyx'),
    include_dirs=[numpy.get_include()]
)

# usage: python setup_so.py build_ext --inplace
#        import test_so_cy
#        test_so_cy.main()

The problem is that the Cython code runs ~3 times slower than the pure Python version. (I am using the time module to measure execution time, which is OK for matrices this big.) cython -a tells me that the if field[x][j] == block: break lines are still using much Python, so it seems that fast indexing still cannot be used. Any ideas what I am doing wrong?
Original speed: 0.14 s
14X speedup (0.01 s): field[i][j] will evaluate field[i] first and then try to index the resulting Python object. Use the field[i,j] notation for a HUGE boost in speed.
5X speedup (0.0018 s): type the eq variable:

cdef long eq

12X speedup (0.00012 s): replace the list with a stack made of an np array:

cdef np.ndarray[long, ndim=2] eqs = np.zeros((n*m, 3), np.long)
cdef int curr_eqn = 0

# append-to-list code becomes
if field[i,j] == block:
    eqs[curr_eqn,0] = i*m+j
    eqs[curr_eqn,1] = field[i,j]
    eqs[curr_eqn,2] = eq
    curr_eqn += 1
    continue

Total speedup: 1100x
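As a small, self-contained illustration of the first tip (my own sketch; count_blocks is a hypothetical function, not part of the original post): typed 2-D indexing with field[i, j] compiles down to a direct buffer access, whereas field[i][j] creates an intermediate Python object on every access.

# toy_indexing.pyx -- illustrative only
import numpy as np
cimport numpy as np

ctypedef np.uint8_t DTYPE_t

def count_blocks(np.ndarray[DTYPE_t, ndim=2] field not None, int block=2):
    """Count cells equal to `block` using typed 2-D indexing."""
    cdef unsigned int i, j
    cdef long total = 0
    for i in range(field.shape[0]):
        for j in range(field.shape[1]):
            if field[i, j] == block:   # stays in C; field[i][j] would not
                total += 1
    return total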
How to do struct.pack and struct.unpack in Cython?
I'm trying to convert a Python module to Cython; it does a lot of serialize and deserialize work. Currently I have to do this:

import struct

from libc.stdint cimport (
    int32_t,
    int64_t,
)

cpdef bytes write_int(int32_t i):
    return struct.pack("!i", i)

cpdef bytes write_long(int64_t i):
    return struct.pack("!q", i)

cdef bytes write_double(double val):
    return struct.pack("!d", val)

cdef bytes write_string(bytes val):
    cdef int32_t length = len(val)
    cdef str fmt
    fmt = "!i%ds" % length
    return struct.pack(fmt, length, val)

Is there an equivalent of struct.pack and struct.unpack in a C library? What's the best way to do things like this in Cython?
I looked at the modules (this and this) and just translated the code to Cython and removed the PyObject parts. In theory this should work, but some parts (like the float parts) I have no way of rigorously testing.
Some imports:

from cpython.array cimport array, clone
from libc.string cimport memcmp, memcpy
from libc.math cimport frexp, ldexp
from libc.stdint cimport int32_t, int64_t

Save some code with a fused type. It's technically not a stable feature, but it works flawlessly for me:

ctypedef fused integer:
    int32_t
    int64_t

This part tests the machine's endianness. It works for me, but that's hardly a complete suite. OTOH, it looks about right:

cdef enum float_format_type:
    unknown_format,
    ieee_big_endian_format,
    ieee_little_endian_format

# Set-up
cdef array stringtemplate = array('B')
cdef float_format_type double_format

cdef double x = 9006104071832581.0

if sizeof(double) == 8:
    if memcmp(&x, b"\x43\x3f\xff\x01\x02\x03\x04\x05", 8) == 0:
        double_format = ieee_big_endian_format
    elif memcmp(&x, b"\x05\x04\x03\x02\x01\xff\x3f\x43", 8) == 0:
        double_format = ieee_little_endian_format
    else:
        double_format = unknown_format
else:
    double_format = unknown_format;

(The stringtemplate is used to be able to make bytes objects quickly.)
This part's simple:

cdef void _write_integer(integer x, char* output):
    cdef int i
    for i in range(sizeof(integer)-1, -1, -1):
        output[i] = <char>x
        x >>= 8

cpdef bytes write_int(int32_t i):
    cdef array output = clone(stringtemplate, sizeof(int32_t), False)
    _write_integer(i, output.data.as_chars)
    return output.data.as_chars[:sizeof(int32_t)]

cpdef bytes write_long(int64_t i):
    cdef array output = clone(stringtemplate, sizeof(int64_t), False)
    _write_integer(i, output.data.as_chars)
    return output.data.as_chars[:sizeof(int64_t)]

The array is similar to malloc but it's garbage collected :).
This part I mostly have no idea about. My "tests" passed, but it's mostly hope:

cdef void _write_double(double x, char* output):
    cdef:
        unsigned char sign
        int e
        double f
        unsigned int fhi, flo, i
        char *s

    if double_format == unknown_format or True:
        if x < 0:
            sign = 1
            x = -x
        else:
            sign = 0

        f = frexp(x, &e)

        # Normalize f to be in the range [1.0, 2.0)
        if 0.5 <= f < 1.0:
            f *= 2.0
            e -= 1
        elif f == 0.0:
            e = 0
        else:
            raise SystemError("frexp() result out of range")

        if e >= 1024:
            raise OverflowError("float too large to pack with d format")
        elif e < -1022:
            # Gradual underflow
            f = ldexp(f, 1022 + e)
            e = 0;
        elif not (e == 0 and f == 0.0):
            e += 1023
            f -= 1.0  # Get rid of leading 1

        # fhi receives the high 28 bits; flo the low 24 bits (== 52 bits)
        f *= 2.0 ** 28
        fhi = <unsigned int>f  # Truncate
        assert fhi < 268435456

        f -= <double>fhi
        f *= 2.0 ** 24
        flo = <unsigned int>(f + 0.5)  # Round
        assert(flo <= 16777216);

        if flo >> 24:
            # The carry propagated out of a string of 24 1 bits.
            flo = 0
            fhi += 1

            if fhi >> 28:
                # And it also propagated out of the next 28 bits.
                fhi = 0
                e += 1

                if e >= 2047:
                    raise OverflowError("float too large to pack with d format")

        output[0] = (sign << 7) | (e >> 4)
        output[1] = <unsigned char> (((e & 0xF) << 4) | (fhi >> 24))
        output[2] = 0xFF & (fhi >> 16)
        output[3] = 0xFF & (fhi >> 8)
        output[4] = 0xFF & fhi
        output[5] = 0xFF & (flo >> 16)
        output[6] = 0xFF & (flo >> 8)
        output[7] = 0xFF & flo

    else:
        s = <char*>&x;

        if double_format == ieee_little_endian_format:
            for i in range(8):
                output[i] = s[7-i]
        else:
            for i in range(8):
                output[i] = s[i]

If you can understand how it works, be sure to check it yourself.
Then we wrap it as before:

cdef bytes write_double(double x):
    cdef array output = clone(stringtemplate, sizeof(double), False)
    _write_double(x, output.data.as_chars)
    return output.data.as_chars[:sizeof(double)]

The string one is actually really simple, and explains why I set it up as I did above:

cdef bytes write_string(bytes val):
    cdef:
        int32_t int_length = sizeof(int32_t)
        int32_t input_length = len(val)
        array output = clone(stringtemplate, int_length + input_length, True)

    _write_integer(input_length, output.data.as_chars)
    memcpy(output.data.as_chars + int_length, <char*>val, input_length)

    return output.data.as_chars[:int_length + input_length]
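A quick way to sanity-check a translation like this (my suggestion, not part of the answer) is to compare it against struct.pack itself from a small test script. Here fastpack is a hypothetical name for the compiled .pyx above; only the cpdef functions (write_int, write_long) are callable from Python, so the cdef ones would need to be changed to cpdef to test them the same way.

# test_fastpack.py -- sketch, assuming the .pyx is compiled as `fastpack`
import struct
import fastpack

for value in (0, 1, -1, 2**31 - 1):
    assert fastpack.write_int(value) == struct.pack("!i", value)

for value in (0, 1, -1, 2**63 - 1):
    assert fastpack.write_long(value) == struct.pack("!q", value)

print("all packing checks passed")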
If you're only packing one type of data per command (e.g. a group of ints, then a group of floats, etc.), you can use array.array() for faster results, either via Python or Cython. Source: Serialize a group of integers using Cython
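As a rough sketch of that approach (my own example, not code from the linked post): array.array hands you the raw bytes of a whole group of integers at once, and byteswap() gives you the network byte order that struct's "!" prefix would provide on a little-endian machine.

import sys
from array import array

def write_ints(values):
    """Pack a sequence of 32-bit ints as big-endian bytes,
    roughly like struct.pack("!%di" % len(values), *values)."""
    a = array("i", values)   # "i" is a 32-bit signed int on common platforms
    if sys.byteorder == "little":
        a.byteswap()         # convert to network (big-endian) byte order
    return a.tobytes()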
What is going on in this malloc'ed array in Cython?
%%cython -f -c=-O3 -c=-fopenmp --link-args=-fopenmp

from cython.parallel import parallel, prange
from libc.stdlib cimport abort, malloc, free

cdef int idx, i, n = 100
cdef int k
cdef int * local_buf
cdef int size = 10

cdef void func(int* lb) nogil:
    cdef int j
    for j in xrange(size):
        lb[j] += -1*j

local_buf = <int *> malloc(sizeof(int) * size)
with nogil, parallel():
    if local_buf == NULL:
        abort()

    # populate our local buffer in a sequential loop
    for i in xrange(size):
        local_buf[i] = i * 2

    # share the work using the thread-local buffer(s)
    for k in prange(n, schedule='guided'):
        func(local_buf)

for i in xrange(size):
    print local_buf[i]
free(local_buf)

Output:

0
-98
-196
-294
-392
-490
-588
-686
-784
-882

edit: The block above shows the output after one run, but the contents of local_buf seem to change every re-run or so. What's going on?
The result there seems reasonable given the code; do you actually get a different result each run? This should be the regular Python equivalent:

size = 10
n = 100

lst = [i*2 for i in range(size)]
for i in range(n):
    for j in range(size):
        lst[j] += -1*j
print lst
#[0, -98, -196, -294, -392, -490, -588, -686, -784, -882]
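For what it's worth (my addition, not part of the original answer): the run-to-run variation is what you'd expect from a data race, since every thread in the prange increments the same shared buffer with no synchronization. The pattern in the Cython parallelism docs allocates the buffer inside the parallel() block, so assignment makes it thread-private and each thread works on its own copy. A sketch of that variant:

# %%cython -f -c=-O3 -c=-fopenmp --link-args=-fopenmp
from cython.parallel import parallel, prange
from libc.stdlib cimport abort, malloc, free

cdef int i, k, n = 100
cdef int size = 10
cdef int * local_buf

cdef void func(int* lb) nogil:
    cdef int j
    for j in range(size):
        lb[j] += -1*j

with nogil, parallel():
    # assigned inside parallel() => private to each OpenMP thread,
    # so the increments in func() no longer race with other threads
    local_buf = <int *> malloc(sizeof(int) * size)
    if local_buf == NULL:
        abort()
    for i in range(size):
        local_buf[i] = i * 2
    for k in prange(n, schedule='guided'):
        func(local_buf)
    free(local_buf)
    # note: each thread frees its own buffer before the block ends,
    # so there is nothing left to print afterwards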