Cython No Performance Increase with prange/parallel - python

I'm using Cython version 0.27.3 to compile the following source for a simple primality testing module that contains both python and cython implementations of the same algorithm. When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. Is there something that's preventing this from running in parallel?
The function in question is the cdef void _getprimes which accepts a memoryview slice as a parameter and should set all non-prime values to 0 in that slice.
primes.pyx
#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math
# =====================
# Python implementation
# =====================
def pyisprime(n):
    """Return True if ``n`` is prime, using pure-Python trial division.

    Values below 2 and even values are rejected up front (2 is the only
    even prime).  Every remaining candidate is tested against each integer
    from 2 up to ``int(math.sqrt(n))`` inclusive.
    """
    if n < 2 or n & 1 == 0:
        # Of everything caught here, only 2 itself is prime.
        return n == 2
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True
def pygetprimes(nums):
    """Return the members of ``nums`` that ``pyisprime`` accepts, in input order."""
    return [num for num in nums if pyisprime(num)]
# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Trial-division primality check; returns 1 if ``n`` is prime, else 0."""
    cdef unsigned long long upper
    cdef unsigned long long i = 3

    if n < 2 or n & 1 == 0:
        # 2 is the only even prime.
        return 1 if n == 2 else 0

    # n is odd from here on, so candidate divisors start at 3.
    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            return 0
        i += 1
    return 1
def isprime(unsigned long long n):
    """Python-callable wrapper that runs ``_isprime`` with the GIL released.

    Returns the C int result of ``_isprime`` (1 for prime, 0 otherwise).
    """
    cdef int result
    with nogil:
        result = _isprime(n)
    return result
cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    """Overwrite with 0, in place, every element of ``nums`` that is not prime.

    The loop is a dynamically scheduled ``prange`` that requests ``threads``
    threads.  NOTE: ``prange`` only runs in parallel when the extension is
    compiled and linked with OpenMP enabled (e.g. ``-fopenmp`` with GCC).
    """
    # Py_ssize_t is the index type matching a memoryview's shape.
    cdef Py_ssize_t i
    for i in prange(nums.shape[0], num_threads=threads, schedule="dynamic"):
        if _isprime(nums[i]) == 0:
            nums[i] = 0
def getprimes(nums, int threads = 1):
    """Return the primes found in ``nums``, in input order.

    ``nums`` is copied into an ``array.array`` of type ``"Q"``; non-primes
    are zeroed by ``_getprimes`` with the GIL released, and the non-zero
    survivors are returned as a Python list.
    """
    cdef unsigned long long num
    cdef unsigned long long[:] primes = array.array("Q", nums)
    with nogil:
        _getprimes(primes, threads)
    return [num for num in primes if num != 0]
setup.py
#!/usr/bin/env python3
"""Build script for the ``primes`` Cython extension."""
from distutils.core import setup

from Cython.Build import cythonize

# cythonize() is called with the .pyx path only; no extra compiler or
# linker arguments are passed.
extensions = cythonize('primes.pyx')

setup(name="primes", ext_modules=extensions)
test.py
#!/usr/bin/env python3
import functools
import random
import time
import primes
def timed(func):
    """Decorator that prints ``func``'s name and elapsed run time per call.

    The wrapper forwards all positional and keyword arguments, returns the
    wrapped function's result unchanged, and keeps its metadata via
    ``functools.wraps``.
    """
    @functools.wraps(func)
    def wrapped(*args, **kwargs):
        # perf_counter is monotonic and high resolution, so the measured
        # interval cannot be skewed by system clock adjustments.
        start = time.perf_counter()
        val = func(*args, **kwargs)
        end = time.perf_counter()
        print(func.__name__, end - start)
        return val
    return wrapped
def main():
    """Time the Python and Cython prime filters on the same random input.

    The Cython version is run with 1 and with 4 threads, and all three
    results are checked for equality.
    """
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]
    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)
    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    assert x == y == z


if __name__ == "__main__":
    main()
When I run cyfoo, I expected that increasing the number of threads from 1 to 4 would show some type of speed increase, but this is not the case:
[aarcher#Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295

It seems you need to enable compiler flags for OpenMP for the parallel statements to actually do anything.
See the "Compiling" section of the Cython parallelism docs:
http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling
# setup.py
# ... omitted ...
ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        # The OpenMP flag is passed to both the compile and the link step.
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]

Related

Cython problem: cimport libcpp.vector not compiled

I'm trying to use cython to speed up my code. Since I'm working with an array of strings, I want to use string and vector from c++. But I have problems compiling if I import c libraries. For an example, I tried to implement an example from here: https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html.
So, my code is
from libcpp.vector cimport vector
def primes(unsigned int nb_primes):
    """Return the first ``nb_primes`` primes, collected in a C++ ``vector[int]``."""
    cdef int n, i
    cdef vector[int] p
    p.reserve(nb_primes)  # allocate memory for 'nb_primes' elements.
    n = 2
    while p.size() < nb_primes:  # size() for vectors is similar to len()
        for i in p:
            if n % i == 0:
                break
        else:
            # No earlier prime divides n, so n is prime.
            p.push_back(n)  # push_back is similar to append()
        n += 1
    # Vectors are automatically converted to Python
    # lists when converted to Python objects.
    return p
I saved this code as 'test_char.pyx'. To compile it, I use:
# setup() was called without being imported; import it explicitly.
from distutils.core import setup

from Cython.Build import cythonize

setup(
    name='test_char',
    ext_modules=cythonize('test_char.pyx'),
)
After that I get test_char.c, but I don't get test_char.py.
If I use this code instead (without the cimport):
def primes(int nb_primes):
    """Return the first ``nb_primes`` primes (capped at 1000) as a Python list."""
    cdef int n, i, len_p
    cdef int p[1000]
    # The C array holds at most 1000 entries, so clamp the request.
    if nb_primes > 1000:
        nb_primes = 1000
    len_p = 0  # The current number of elements in p.
    n = 2
    while len_p < nb_primes:
        # Is n prime?
        for i in p[:len_p]:
            if n % i == 0:
                break
        # If no break occurred in the loop, we have a prime.
        else:
            p[len_p] = n
            len_p += 1
        n += 1
    # Let's return the result in a python list:
    result_as_list = [prime for prime in p[:len_p]]
    return result_as_list
everything works fine. So, please, any ideas?
# setup() and cythonize() were used without being imported; import both.
from distutils.core import setup
from distutils.extension import Extension

from Cython.Build import cythonize

extensions = [
    Extension(
        "test_char",
        ["test_char.pyx"],
        language="c++",
    )
]

setup(
    name="test_char",
    ext_modules=cythonize(extensions),
)
This solves the problem: declaring the extension with language="c++" makes Cython build it as C++.

Wrong calculation when using Cython [duplicate]

This question already has answers here:
C left shift on 64 bits fail
(2 answers)
Closed 2 years ago.
I implemented Lucas-Lehmer primality test to check Mersenne prime in python. Then I use Cython to speed up the calculation.
Original Python code:
def lucas_lehmer(p):
    """Lucas-Lehmer test: return True if the Mersenne number 2**p - 1 is prime.

    A progress percentage is printed after each of the ``p - 2`` iterations.
    """
    if p == 2:
        return True
    s = 4
    M = (1 << p) - 1
    for i in range(p - 2):
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100 * i // (p - 2)))
    # 2**p - 1 is prime exactly when the final residue is zero.
    return s == 0
Cython code:
cpdef lucas_lehmer(int p):
    """Lucas-Lehmer test for 2**p - 1 using fixed-width C integers.

    NOTE(review): ``M`` and ``s`` are 64-bit unsigned C integers, so this
    version cannot represent the intermediate values needed for large ``p``.
    See the inline notes below.
    """
    if p == 2:
        return True
    cdef unsigned long long int M
    # NOTE(review): both shift operands look like plain C ints here, so the
    # shift is presumably evaluated at int width before being widened to M.
    # Confirm against the generated C.
    M = (1 << p) - 1
    cdef unsigned long long int s
    s = 4
    cdef int i
    for i in range(p-2):
        # NOTE(review): s * s is 64-bit unsigned arithmetic and wraps once
        # the product no longer fits in 64 bits.
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100*i//(p-2)))
    if s == 0:
        return True
    else:
        return False
Running the original Python code, it works correctly. But the Cython version is only correct for p = 31 and lower; testing with p = 61 and bigger (all tested p values are ones for which 2^p-1 is prime), it returns False (not a prime number), except for p = 86243.
For some p like 97, even though 2^97-1 is not a prime number, the program actually returns True (is a prime number), which is a contradiction.
Why does this happen? Without using cdef for the variables M and s, the calculation is correct, but the performance doesn't improve at all.
Running a few tests on your code, I found that M was always equal to 1,
so I declared p with cdef and got the required result.
I'm not sure exactly what the issue is, but it has something to do with the bit-shift operation on p. p needs to be of the same type as M for it to make sense, and if one is a cdef variable and the other is a Python int, somehow it doesn't work?
cpdef lucas_lehmer(int py):
    """Lucas-Lehmer test for 2**py - 1.

    ``p``, ``M`` and ``s`` are declared with ``cdef`` but without a C type,
    which makes them Python objects.
    """
    cdef p
    p = py
    if p == 2:
        return True
    cdef M
    M = (1 << p) - 1
    cdef s
    s = 4
    cdef int i
    for i in range(p-2):
        s = ((s * s) - 2) % M
        print("Processed: {}%".format(100*i//(p-2)))
    if s == 0:
        return True
    else:
        return False

Performance drop using cython

I wanted to make some code faster using cython's capability to use efficient indexing: http://docs.cython.org/src/tutorial/numpy.html
Basically the code represents the dependency of buttons on a game board of the game http://www.hacker.org/cross/index.php
# file test_so_cy.pyx
import time
import numpy as np
cimport numpy as np
DTYPE = np.uint8
ctypedef np.uint8_t DTYPE_t
def time_fmt(td):
    """Format the number ``td`` with two decimals followed by ``" s"``."""
    return "{:.2f} s".format(td)
def derive_equations(np.ndarray[DTYPE_t, ndim=2] field not None):
    """Build one bitmask equation per cell of ``field``.

    For each cell the mask ``eq`` gets bit ``x*m+j`` or ``i*m+y`` toggled
    for every cell reached by walking up, down, left and right until a
    blocking cell (value 2) is hit.  Blocking cells get an empty mask.

    Returns ``(n, m, eqs)``, where each entry of ``eqs`` is
    ``[cell_index, cell_value, eq]``.  The elapsed time is printed.
    """
    cdef unsigned int n, m, i, j, x, y
    t1 = time.time()
    n, m = len(field), len(field[0])
    # generate equations for dimensions n and m
    eqs = []
    block = 2  # as soon as a 2 is hit there isnt any influence
    for i in xrange(n):
        for j in xrange(m):
            eq = 0L
            if field[i][j] == block:
                eqs.append([i*m+j ,field[i][j], eq])
                continue
            # rows upwards
            for x in xrange(i-1, -1, -1):
                if field[x][j] == block: break
                eq ^= 1L << (x*m+j)
            # rows downwards
            for x in xrange(i, n):
                if field[x][j] == block: break
                eq ^= 1L << (x*m+j)
            # cols left
            for y in xrange(j-1, -1, -1):
                if field[i][y] == block: break
                eq ^= 1L << (i*m+y)
            # cols right
            # j+1 to avoid resetting the influence of itself
            for y in xrange(j+1, m):
                if field[i][y] == block: break
                eq ^= 1L << (i*m+y)
            eqs.append([i*m+j, field[i][j], eq])
    t2 = time.time()
    print 'preprocess time:', time_fmt(t2 - t1)
    return n, m, eqs
def main():
    """Run ``derive_equations`` on a fixed 29x29 sample board."""
    field = np.array(
        [[0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,2,1,0,0,2,1,0,1,1,0,0,0,0,0],
         [0,1,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,1,1,0,1,1,1],
         [1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,0,2],
         [0,0,0,0,1,0,1,1,0,1,1,1,0,1,0,1,1,0,0,0,1,1,0,0,2,1,1,0,1],
         [0,1,0,1,1,1,1,1,2,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,1],
         [0,1,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1],
         [0,0,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,1],
         [1,0,1,0,1,1,0,0,0,0,0,1,0,0,2,0,1,1,0,0,0,0,1,0,0,2,1,0,0],
         [1,0,1,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1],
         [0,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,2],
         [1,0,1,1,0,0,1,0,1,1,1,0,1,2,1,1,1,2,1,0,1,1,1,0,0,0,0,0,0],
         [0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1],
         [1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0],
         [1,1,1,0,1,1,1,1,0,0,1,0,1,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,1],
         [1,0,0,0,1,1,0,0,2,0,1,1,2,0,0,1,0,1,0,1,0,2,1,1,1,1,0,0,2],
         [1,0,1,1,1,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,2,1,0,1,0,1,0,1,1],
         [0,0,1,1,1,0,0,0,0,0,2,1,0,1,0,1,0,1,1,1,1,0,0,1,1,1,1,0,1],
         [0,1,0,1,2,0,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0],
         [0,1,0,0,2,0,0,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,1],
         [1,0,0,1,0,0,1,0,1,0,0,2,0,1,1,1,1,1,0,0,1,0,1,0,1,1,0,1,1],
         [0,0,1,0,1,1,0,0,1,0,0,0,1,1,1,0,0,1,0,0,1,0,1,2,0,1,1,0,2],
         [0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,1,0,1,0,1,1,1,1,1,2,0,1,2,0],
         [0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,1,1,2,0,0,1,0,0,1,1,0],
         [0,0,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,0,0,0,1,1,1,0,1],
         [0,2,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,1,1,1,0,1,0,0,0,1,1],
         [0,2,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,0,1,1],
         [0,1,1,1,0,1,0,0,0,1,0,2,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0,1,0],
         [0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0],
         [1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,2,1,1]], dtype=DTYPE)
    derive_equations(field)


if __name__ == '__main__':
    main()
# file setup_so.py
from distutils.core import setup

import numpy
from Cython.Build import cythonize

# The NumPy headers are needed because the .pyx file cimports numpy.
numpy_headers = [numpy.get_include()]

setup(
    name="test_so",
    ext_modules=cythonize('test_so_cy.pyx'),
    include_dirs=numpy_headers,
)
# usage: python setup_so.py build_ext --inplace
# import test_so_cy
# test_so_cy.main()
The problem is that the Cython code runs ~3 times slower than the pure Python version. (I am using the time module to measure execution time because for bigger matrices it's OK.)
cython -a tells me that the
if field[x][j] == block: break
lines still involve a lot of Python calls. So it seems that fast indexing still cannot be used.
Any ideas what I am doing wrong?
Original speed: 0.14s
14X speedup (0.01s): field[i][j] evaluates field[i] first and then indexes the resulting Python object. Use the field[i,j] notation for a HUGE boost in speed.
5X speedup (0.0018s): type the eq variable: cdef long eq
12X speedup (0.00012s): replace the list with a stack made of an np array:
cdef np.ndarray[long, ndim=2] eqs=np.zeros((n*m,3),np.long)
cdef int curr_eqn=0
#append to list code
if field[i,j] == block:
    # Write the three fields into the next free row instead of appending.
    eqs[curr_eqn,0]=i*m+j
    eqs[curr_eqn,1]=field[i,j]
    eqs[curr_eqn,2]=eq
    curr_eqn+=1
    continue
total speedup: 1100x

How to do struct.pack and struct.unpack in cython?

I'm trying to convert a python module to cython, it does a lot of serialize and deserialize work.
Currently I have to do this:
import struct
from libc.stdint cimport (
int32_t,
int64_t,
)
cpdef bytes write_int(int32_t i):
    """Pack ``i`` with ``struct`` format ``"!i"``."""
    return struct.pack("!i", i)
cpdef bytes write_long(int64_t i):
    """Pack ``i`` with ``struct`` format ``"!q"``."""
    return struct.pack("!q", i)
cdef bytes write_double(double val):
    """Pack ``val`` with ``struct`` format ``"!d"``."""
    return struct.pack("!d", val)
cdef bytes write_string(bytes val):
    """Pack ``val`` as its length (``"!i"``) followed by the raw bytes."""
    cdef int32_t length = len(val)
    cdef str fmt
    # The format embeds the payload length, e.g. "!i5s" for five bytes.
    fmt = "!i%ds" % length
    return struct.pack(fmt, length, val)
Is there an equal in c lib to struct.pack and struct.unpack? What's the best way to do things like this in cython?
I looked at the modules (this and this) and just translated the code to Cython and removed the PyObject parts. In theory this should work, but some parts (like the float parts) I have no way of rigorously testing:
Some imports:
from cpython.array cimport array, clone
from libc.string cimport memcmp, memcpy
from libc.math cimport frexp, ldexp
from libc.stdint cimport int32_t, int64_t
Save some code with a fused type. It's technically not a stable feature, but it works flawlessly for me:
# Fused type so one helper can be specialized for both integer widths.
ctypedef fused integer:
    int32_t
    int64_t
This part tests the machine's endianness. It works for me, but that's hardly a complete suite. OTOH, it looks about right
# Result of the byte-layout probe for C doubles.
cdef enum float_format_type:
    unknown_format,
    ieee_big_endian_format,
    ieee_little_endian_format
# Set-up
cdef array stringtemplate = array('B')
cdef float_format_type double_format
# Probe value whose 8-byte representation is compared against the two
# byte strings below to detect the byte order of doubles.
cdef double x = 9006104071832581.0
if sizeof(double) == 8:
    if memcmp(&x, b"\x43\x3f\xff\x01\x02\x03\x04\x05", 8) == 0:
        double_format = ieee_big_endian_format
    elif memcmp(&x, b"\x05\x04\x03\x02\x01\xff\x3f\x43", 8) == 0:
        double_format = ieee_little_endian_format
    else:
        double_format = unknown_format
else:
    double_format = unknown_format
(The stringtemplate is used to be able to make bytes objects quickly)
This part's simple:
cdef void _write_integer(integer x, char* output):
    """Write ``x`` into ``output`` most-significant byte first.

    Exactly ``sizeof(integer)`` bytes are written.
    """
    cdef int i
    # Fill from the last byte backwards, peeling off the low byte each time.
    for i in range(sizeof(integer)-1, -1, -1):
        output[i] = <char>x
        x >>= 8
cpdef bytes write_int(int32_t i):
    """Return ``i`` as ``sizeof(int32_t)`` bytes, most-significant byte first."""
    cdef array output = clone(stringtemplate, sizeof(int32_t), False)
    _write_integer(i, output.data.as_chars)
    # Slicing the char pointer copies the bytes into a new bytes object.
    return output.data.as_chars[:sizeof(int32_t)]
cpdef bytes write_long(int64_t i):
    """Return ``i`` as ``sizeof(int64_t)`` bytes, most-significant byte first."""
    cdef array output = clone(stringtemplate, sizeof(int64_t), False)
    _write_integer(i, output.data.as_chars)
    # Slicing the char pointer copies the bytes into a new bytes object.
    return output.data.as_chars[:sizeof(int64_t)]
The array is similar to malloc but it's garbage collected :).
This part I mostly have no idea about. My "tests" passed, but it's mostly hope:
cdef void _write_double(double x, char* output) except *:
    """Pack ``x`` into the 8 bytes at ``output``, sign and exponent first.

    The portable path splits ``x`` with ``frexp`` into sign, exponent and a
    52-bit fraction and writes them byte by byte.  The alternative path
    copies the in-memory bytes of ``x``, reversed on little-endian hosts.

    ``except *`` is declared so the exceptions raised below propagate to
    the caller; a plain ``cdef void`` function cannot report them.

    Raises:
        SystemError: ``frexp`` returned a fraction outside [0.5, 1.0)
            that is not zero.
        OverflowError: the exponent is too large for the format.
    """
    cdef:
        unsigned char sign
        int e
        double f
        unsigned int fhi, flo, i
        char *s

    # NOTE: "or True" forces the portable path on every call, so the else
    # branch below is currently unreachable.
    if double_format == unknown_format or True:
        if x < 0:
            sign = 1
            x = -x
        else:
            sign = 0
        f = frexp(x, &e)
        # Normalize f to be in the range [1.0, 2.0)
        if 0.5 <= f < 1.0:
            f *= 2.0
            e -= 1
        elif f == 0.0:
            e = 0
        else:
            raise SystemError("frexp() result out of range")
        if e >= 1024:
            raise OverflowError("float too large to pack with d format")
        elif e < -1022:
            # Gradual underflow
            f = ldexp(f, 1022 + e)
            e = 0
        elif not (e == 0 and f == 0.0):
            e += 1023
            f -= 1.0  # Get rid of leading 1
        # fhi receives the high 28 bits; flo the low 24 bits (== 52 bits)
        f *= 2.0 ** 28
        fhi = <unsigned int>f  # Truncate
        assert fhi < 268435456
        f -= <double>fhi
        f *= 2.0 ** 24
        flo = <unsigned int>(f + 0.5)  # Round
        assert flo <= 16777216
        if flo >> 24:
            # The carry propagated out of a string of 24 1 bits.
            flo = 0
            fhi += 1
            if fhi >> 28:
                # And it also propagated out of the next 28 bits.
                fhi = 0
                e += 1
                if e >= 2047:
                    raise OverflowError("float too large to pack with d format")
        output[0] = (sign << 7) | (e >> 4)
        output[1] = <unsigned char> (((e & 0xF) << 4) | (fhi >> 24))
        output[2] = 0xFF & (fhi >> 16)
        output[3] = 0xFF & (fhi >> 8)
        output[4] = 0xFF & fhi
        output[5] = 0xFF & (flo >> 16)
        output[6] = 0xFF & (flo >> 8)
        output[7] = 0xFF & flo
    else:
        s = <char*>&x
        if double_format == ieee_little_endian_format:
            # Reverse the in-memory byte order on little-endian hosts.
            for i in range(8):
                output[i] = s[7-i]
        else:
            for i in range(8):
                output[i] = s[i]
If you can understand how it works, be sure to check it yourself.
Then we wrap it as before:
cdef bytes write_double(double x):
    """Return the ``sizeof(double)`` bytes produced by ``_write_double`` for ``x``."""
    cdef array output = clone(stringtemplate, sizeof(double), False)
    _write_double(x, output.data.as_chars)
    return output.data.as_chars[:sizeof(double)]
The string one is actually really simple, and explains why I set it up as I did above:
cdef bytes write_string(bytes val):
    """Return ``val`` prefixed with its length written as an ``int32_t``."""
    cdef:
        int32_t int_length = sizeof(int32_t)
        int32_t input_length = len(val)
        array output = clone(stringtemplate, int_length + input_length, True)
    # Length header first, then the payload directly after it.
    _write_integer(input_length, output.data.as_chars)
    memcpy(output.data.as_chars + int_length, <char*>val, input_length)
    return output.data.as_chars[:int_length + input_length]
If you're only packing one type of data per command (eg. a group of ints, then a group of floats etc), you can use array.array() for faster results, either via Python or Cython.
Source:
Serialize a group of integers using Cython

What is going on in this malloc'ed array in Cython?

%%cython -f -c=-O3 -c=-fopenmp --link-args=-fopenmp
from cython.parallel import parallel, prange
from libc.stdlib cimport abort, malloc, free
cdef int idx, i, n = 100
cdef int k
cdef int * local_buf
cdef int size = 10

cdef void func(int* lb) nogil:
    """Subtract ``j`` from ``lb[j]`` for every index ``j`` below ``size``."""
    cdef int j
    for j in xrange(size):
        lb[j] += -1*j

# NOTE(review): the buffer is allocated once, before the parallel section
# starts, so it is presumably shared by all threads rather than being
# thread-local. Confirm against Cython's parallel() semantics.
local_buf = <int *> malloc(sizeof(int) * size)
with nogil, parallel():
    if local_buf == NULL:
        abort()
    # populate our local buffer in a sequential loop
    for i in xrange(size):
        local_buf[i] = i * 2
    # share the work using the thread-local buffer(s)
    for k in prange(n, schedule='guided'):
        func(local_buf)
for i in xrange(size):
    print local_buf[i]
free(local_buf)
0
-98
-196
-294
-392
-490
-588
-686
-784
-882
edit:
The above block shows the output after one run, but the contents of local_buf seem to change every re-run or so. What's going on?
The result there seems reasonable for the code given. Do you actually get a different result on each run?
This should be regular python equivalent:
# Sequential reference: start from i*2, then subtract j from slot j, n times.
size = 10
n = 100
lst = [i*2 for i in range(size)]
for i in range(n):
    for j in range(size):
        lst[j] += -1*j
# The call form of print gives the same output on Python 2 and Python 3.
print(lst)
#[0, -98, -196, -294, -392, -490, -588, -686, -784, -882]

Categories

Resources