Performance drop using cython - python

I wanted to make some code faster using Cython's efficient indexing capability: http://docs.cython.org/src/tutorial/numpy.html
Basically the code represents the dependencies between buttons on a game board of the game http://www.hacker.org/cross/index.php
# file test_so_cy.pyx
import time
import numpy as np
cimport numpy as np

DTYPE = np.uint8
ctypedef np.uint8_t DTYPE_t

def time_fmt(td):
    return "{:.2f} s".format(td)

def derive_equations(np.ndarray[DTYPE_t, ndim=2] field not None):
    cdef unsigned int n, m, i, j, x, y
    t1 = time.time()
    n, m = len(field), len(field[0])
    # generate equations for dimensions n and m
    eqs = []
    block = 2  # as soon as a 2 is hit there isn't any influence
    for i in xrange(n):
        for j in xrange(m):
            eq = 0L
            if field[i][j] == block:
                eqs.append([i*m+j, field[i][j], eq])
                continue
            # rows upwards
            for x in xrange(i-1, -1, -1):
                if field[x][j] == block: break
                eq ^= 1L << (x*m+j)
            # rows downwards
            for x in xrange(i, n):
                if field[x][j] == block: break
                eq ^= 1L << (x*m+j)
            # cols left
            for y in xrange(j-1, -1, -1):
                if field[i][y] == block: break
                eq ^= 1L << (i*m+y)
            # cols right
            # j+1 to avoid resetting the influence of itself
            for y in xrange(j+1, m):
                if field[i][y] == block: break
                eq ^= 1L << (i*m+y)
            eqs.append([i*m+j, field[i][j], eq])
    t2 = time.time()
    print 'preprocess time:', time_fmt(t2 - t1)
    return n, m, eqs
def main():
    field = np.array(
[[0,1,0,0,0,0,0,0,0,0,0,1,1,1,0,2,1,0,0,2,1,0,1,1,0,0,0,0,0],
[0,1,0,0,1,1,0,1,0,0,0,1,1,0,0,1,0,1,0,0,1,0,1,1,1,0,1,1,1],
[1,1,0,1,0,0,0,0,0,0,0,1,1,0,1,0,1,0,0,1,0,1,1,0,0,1,0,0,2],
[0,0,0,0,1,0,1,1,0,1,1,1,0,1,0,1,1,0,0,0,1,1,0,0,2,1,1,0,1],
[0,1,0,1,1,1,1,1,2,1,1,0,1,0,0,0,0,0,0,1,0,0,1,0,2,0,1,0,1],
[0,1,1,0,0,1,1,0,1,0,0,1,1,1,0,1,1,1,0,0,1,1,1,0,1,0,1,1,1],
[0,0,0,1,0,1,1,0,1,0,0,1,1,1,0,1,0,0,0,0,0,0,0,1,0,1,0,1,1],
[1,0,1,0,1,1,0,0,0,0,0,1,0,0,2,0,1,1,0,0,0,0,1,0,0,2,1,0,0],
[1,0,1,0,1,0,1,0,1,1,1,0,1,0,1,1,0,1,1,0,1,0,1,0,1,0,1,1,1],
[0,0,1,0,0,1,1,0,1,0,0,1,0,0,1,0,0,0,1,0,0,0,0,1,0,1,1,1,2],
[1,0,1,1,0,0,1,0,1,1,1,0,1,2,1,1,1,2,1,0,1,1,1,0,0,0,0,0,0],
[0,0,1,0,1,0,0,1,0,1,1,1,1,1,1,0,0,1,1,0,0,1,0,0,0,1,0,0,1],
[1,1,0,0,0,1,0,0,1,0,0,1,0,1,1,0,0,0,0,1,1,0,0,1,0,0,0,0,0],
[1,1,1,0,1,1,1,1,0,0,1,0,1,1,0,0,0,0,1,1,1,1,1,0,1,0,1,0,1],
[1,0,0,0,1,1,0,0,2,0,1,1,2,0,0,1,0,1,0,1,0,2,1,1,1,1,0,0,2],
[1,0,1,1,1,1,1,0,0,1,1,0,1,1,0,0,1,0,0,0,2,1,0,1,0,1,0,1,1],
[0,0,1,1,1,0,0,0,0,0,2,1,0,1,0,1,0,1,1,1,1,0,0,1,1,1,1,0,1],
[0,1,0,1,2,0,0,0,0,0,1,1,0,1,0,1,0,1,0,1,0,0,1,0,1,0,1,1,0],
[0,1,0,0,2,0,0,0,1,0,1,0,0,1,0,1,1,0,0,1,0,1,1,1,0,1,1,1,1],
[1,0,0,1,0,0,1,0,1,0,0,2,0,1,1,1,1,1,0,0,1,0,1,0,1,1,0,1,1],
[0,0,1,0,1,1,0,0,1,0,0,0,1,1,1,0,0,1,0,0,1,0,1,2,0,1,1,0,2],
[0,1,1,0,1,0,1,1,0,0,1,0,0,0,1,1,0,1,0,1,1,1,1,1,2,0,1,2,0],
[0,0,0,0,1,0,1,0,0,0,0,1,0,0,1,0,0,1,1,1,2,0,0,1,0,0,1,1,0],
[0,0,1,1,0,1,1,0,0,1,1,1,1,0,0,1,0,0,1,1,1,0,0,0,1,1,1,0,1],
[0,2,0,1,1,1,1,0,1,0,0,0,0,0,1,1,1,0,1,1,1,1,0,1,0,0,0,1,1],
[0,2,1,1,1,1,1,1,1,1,0,1,1,0,0,0,0,0,1,0,1,1,1,1,1,1,0,1,1],
[0,1,1,1,0,1,0,0,0,1,0,2,0,1,1,1,1,1,0,1,0,1,0,0,1,1,0,1,0],
[0,1,1,1,1,1,0,1,0,1,0,1,0,0,0,0,0,1,0,0,0,0,1,1,0,0,0,0,0],
[1,0,0,0,0,1,0,1,0,1,0,1,0,0,1,0,0,1,0,0,1,0,1,0,0,1,2,1,1]], dtype=DTYPE)
    derive_equations(field)

if __name__ == '__main__':
    main()
# file setup_so.py
from distutils.core import setup
from Cython.Build import cythonize
import numpy

setup(
    name = "test_so",
    ext_modules = cythonize('test_so_cy.pyx'),
    include_dirs=[numpy.get_include()]
)
# usage: python setup_so.py build_ext --inplace
# import test_so_cy
# test_so_cy.main()
The problem is that the Cython code runs about 3 times slower than the pure Python version. (I am using the time module to measure execution time, which is fine for matrices this size.)
cython -a tells me that the
if field[x][j] == block: break
lines still use a lot of Python, so it seems that fast indexing still cannot be used.
Any ideas what I am doing wrong?

Original speed: 0.14 s.
14X speedup (0.01 s): field[i][j] evaluates field[i] first and then indexes the resulting Python object. Use the field[i, j] notation for a HUGE boost in speed.
5X speedup (0.0018 s): type the eq variable: cdef long eq.
12X speedup (0.00012 s): replace the list with a stack made of an np array:
cdef np.ndarray[long, ndim=2] eqs = np.zeros((n*m, 3), np.long)
cdef int curr_eqn = 0

# the append-to-list code becomes
if field[i, j] == block:
    eqs[curr_eqn, 0] = i*m+j
    eqs[curr_eqn, 1] = field[i, j]
    eqs[curr_eqn, 2] = eq
    curr_eqn += 1
    continue
Total speedup: ~1100x
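For reference, the indexing difference alone looks like this: field[i][j] first builds a Python object for row i and then indexes it, while field[i, j] compiles to a direct buffer lookup. A minimal sketch (count_blocks is just a hypothetical helper, assuming it is added to test_so_cy.pyx so the cimports and DTYPE_t above are in scope):
def count_blocks(np.ndarray[DTYPE_t, ndim=2] field not None):
    # count the 2-cells using typed 2-D indexing
    cdef unsigned int i, j, n, m
    cdef int total = 0
    n, m = field.shape[0], field.shape[1]
    for i in xrange(n):
        for j in xrange(m):
            if field[i, j] == 2:  # field[i, j] stays in C; field[i][j] would not
                total += 1
    return total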

Related

Cython problem: cimport libcpp.vector not compiled

I'm trying to use Cython to speed up my code. Since I'm working with an array of strings, I want to use string and vector from C++. But I have problems compiling when I cimport the C++ libraries. As an example, I tried to implement an example from here: https://cython.readthedocs.io/en/latest/src/tutorial/cython_tutorial.html.
So, my code is
from libcpp.vector cimport vector

def primes(unsigned int nb_primes):
    cdef int n, i
    cdef vector[int] p
    p.reserve(nb_primes)  # allocate memory for 'nb_primes' elements.
    n = 2
    while p.size() < nb_primes:  # size() for vectors is similar to len()
        for i in p:
            if n % i == 0:
                break
        else:
            p.push_back(n)  # push_back is similar to append()
        n += 1
    # Vectors are automatically converted to Python
    # lists when converted to Python objects.
    return p
I save this code as 'test_char.pyx'. For compilation I use this:
from distutils.core import setup
from Cython.Build import cythonize

setup(name='test_char',
      ext_modules = cythonize('test_char.pyx')
      )
After that I get test_char.c, but I don't get the compiled test_char module.
If I use this code (without the cimport):
def primes(int nb_primes):
    cdef int n, i, len_p
    cdef int p[1000]
    if nb_primes > 1000:
        nb_primes = 1000
    len_p = 0  # The current number of elements in p.
    n = 2
    while len_p < nb_primes:
        # Is n prime?
        for i in p[:len_p]:
            if n % i == 0:
                break
        # If no break occurred in the loop, we have a prime.
        else:
            p[len_p] = n
            len_p += 1
        n += 1
    # Let's return the result in a python list:
    result_as_list = [prime for prime in p[:len_p]]
    return result_as_list
everything works fine. So, please, any ideas?
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize

extensions = [
    Extension("test_char", ["test_char.pyx"],
              language="c++",
              )
]

setup(
    name="test_char",
    ext_modules = cythonize(extensions),
)
Telling setup to build the extension as C++ (language="c++") solves the problem.
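After rebuilding with python setup.py build_ext --inplace, a quick way to check the module (a minimal sketch, assuming the build succeeded):
import test_char
print(test_char.primes(10))  # -> [2, 3, 5, 7, 11, 13, 17, 19, 23, 29]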

Python complex numbers from C++ strange output

I'm working on translating some code from C++ to Python, and some values in the Python output are wrong. The expected output is pairs of numbers stored in the array. In Python I first get a lot of 1-0j pairs and then the good ones. In C++ the greatest value is around 1.3 and in Python over 9. How do I have to modify my Python code to get the same output as the C++ code?
Main function in C++; I know that S does nothing yet, but I'll use it later:
int X = 1000;
int N = X;
complex<double> S;
for (int n=0; n<X; n++)
{
    S = complex<double>(0,0);
    for (int x=0; x<X; x++)
    {
        double r = cos(((2*M_PI)/X)*n*x);
        double i = sin(((2*M_PI)/X)*n*x);
        complex<double> t (r, -i);
        cout << t << endl;
    }
}
Python:
import numpy as np
from math import pi
import sys
np.set_printoptions(threshold=sys.maxsize)

X = 1000
N = X
S = np.zeros(0, dtype = complex)
T = np.zeros(0, dtype = complex)
n = 0
x = 0
for n in range(0, 1000, 1):
    # S = np.append(S, np.complex(0, 0))
    for x in range(0, 1000, 1):
        r = np.cos(((2*pi)/X)*n*x)
        i = np.sin(((2*pi)/X)*n*x)
        T = np.append(T, np.complex(r, -i))
print(T)
print('\n')
It's not clear why you think your Python code is equivalent to your C++ code. T = np.append(T, np.complex(r, -i)) is not equivalent to complex<double> t (r, -i);. The actual equivalent Python code to your C++, which produces the same output, is:
from numpy import cos, sin, pi  # could also import from math instead of numpy, might affect speed though

X = 1000

for n in range(X):
    for x in range(X):
        r = cos(((2*pi)/X)*n*x)
        i = sin(((2*pi)/X)*n*x)
        t = complex(r, -i)
        print(t)
The way I tested this was by setting X to 5 in both sets of code and comparing the output. They were the same (with Python just showing more digits of precision).
This is my conversion of the C++ code. I maintained the array creation, but pre-allocated the arrays rather than using np.append (which is unnecessarily slow when you already know the array length). I also added formatting to the print statement. Note that printing a million values takes a terribly long time, so I suggest commenting out the print line, or reducing the value of X to 50 or less.
import numpy as np
import sys
np.set_printoptions(threshold=sys.maxsize)

X = 1000
N = X
S = np.zeros((N*X,), dtype=complex)
T = np.zeros((N*X,), dtype=complex)

k = 0
for n in range(0, N):
    # Not sure what you were trying to do with "S"
    for x in range(0, X):
        r = np.cos((2*np.pi/X)*n*x)
        i = np.sin((2*np.pi/X)*n*x)
        T[k] = np.complex(r, -i)
        print('{val:14.3f}'.format(val=T[k]))
        k += 1
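As a side note, the same table of values can be produced without the explicit Python loops. This is a vectorized alternative rather than a line-by-line translation of the C++ (a sketch, using the identity exp(-i*theta) = cos(theta) - i*sin(theta) and the flat N*X layout from above):
import numpy as np

X = 1000
N = X

# outer product of the n and x index vectors, then one complex exponential
n = np.arange(N).reshape(-1, 1)
x = np.arange(X).reshape(1, -1)
T = np.exp(-2j * np.pi * n * x / X).ravel()  # shape (N*X,), same values as T above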

Cython No Performance Increase with prange/parallel

I'm using Cython version 0.27.3 to compile the following source for a simple primality testing module that contains both python and cython implementations of the same algorithm. When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. Is there something that's preventing this from running in parallel?
The function in question is cdef void _getprimes, which accepts a memoryview slice as a parameter and should set all non-prime values to 0 in that slice.
primes.pyx
#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math

# =====================
# Python implementation
# =====================
def pyisprime(n):
    """Python implementation"""
    if n < 2 or n & 1 == 0:
        if n == 2:
            return True
        return False
    for i in range(2, int(math.sqrt(n)) + 1):
        if n % i == 0:
            return False
    return True

def pygetprimes(nums):
    return [num for num in nums if pyisprime(num)]

# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
    """Cython implementation of a simple primality check"""
    cdef unsigned long long upper
    cdef unsigned long long i = 3
    cdef int prime = 1
    if n < 2 or n & 1 == 0:
        if n == 2:
            return 1
        return 0
    upper = <unsigned long long>ceil(sqrt(<double>n))
    while i <= upper:
        if n % i == 0:
            prime = 0
            break
        i += 1
    return prime

def isprime(unsigned long long n):
    """Wrapper for _isprime"""
    cdef int result
    with nogil:
        result = _isprime(n)
    return result

cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
    cdef unsigned long num
    cdef int i = 0
    with parallel(num_threads=threads):
        for i in prange(nums.shape[0], schedule="dynamic"):
            if _isprime(nums[i]) == 0:
                nums[i] = 0

def getprimes(nums, int threads = 1):
    """Wrapper for _getprimes"""
    cdef unsigned long long num
    cdef unsigned long long[:] primes = array.array("Q", nums)
    with nogil:
        _getprimes(primes, threads)
    return [num for num in primes if num != 0]
setup.py
#!/usr/bin/env python3
from distutils.core import setup
from Cython.Build import cythonize

setup(
    name="primes",
    ext_modules=cythonize('primes.pyx'),
)
test.py
#!/usr/bin/env python3
import functools
import random
import time

import primes

def timed(func):
    def wrapped(*args, **kwargs):
        start = time.time()
        val = func(*args, **kwargs)
        end = time.time()
        print(func.__name__, end - start)
        return val
    return functools.wraps(func)(wrapped)

def main():
    nums = [random.randint(0, 0xffffff) for _ in range(500000)]
    pyfoo = timed(primes.pygetprimes)
    cyfoo = timed(primes.getprimes)

    x = pyfoo(nums)
    y = cyfoo(nums, 1)
    z = cyfoo(nums, 4)
    assert x == y == z

if __name__ == "__main__":
    main()
When I run cyfoo, I expected that increasing the number of threads from 1 to 4 would show some type of speed increase, but this is not the case:
[aarcher#Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295
It seems you need to enable the OpenMP compiler flags for the parallel statements to actually do anything.
See the Cython docs here:
http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling
# setup.py
# ... omitted ...
ext_modules = [
    Extension(
        "hello",
        ["hello.pyx"],
        extra_compile_args=['-fopenmp'],
        extra_link_args=['-fopenmp'],
    )
]
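Applied to this question's module, a complete setup.py might look roughly like this (a sketch, assuming GCC or Clang, where the OpenMP flag is -fopenmp; MSVC uses /openmp instead):
#!/usr/bin/env python3
from distutils.core import setup
from distutils.extension import Extension
from Cython.Build import cythonize

ext_modules = [
    Extension(
        "primes",
        ["primes.pyx"],
        extra_compile_args=['-fopenmp'],  # enable OpenMP when compiling
        extra_link_args=['-fopenmp'],     # and when linking
    )
]

setup(
    name="primes",
    ext_modules=cythonize(ext_modules),
)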

Parallelize these nested for loops in python

I have a multidimensional array (result) that should be filled by some nested loops. Function fun() is a complex and time-consuming function. I want to fill my array elements in a parallel manner, so I can use all my system's processing power.
Here's the code:
import numpy as np

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

dim1 = 10
dim2 = 20
dim3 = 30

result = np.zeros([dim1, dim2, dim3])

for i in xrange(dim1):
    for j in xrange(dim2):
        for k in xrange(dim3):
            result[i, j, k] = fun(i, j, k)
My question is: can I parallelize this code, and if so, how?
I'm using Windows 10 64-bit and Python 2.7.
Please provide your solution by changing my code if you can.
Thanks!
If you want a more general solution, taking advantage of fully parallel execution, then why not use something like this:
>>> import multiprocess as mp
>>> p = mp.Pool()
>>>
>>> # a time consuming function taking x,y,z,...
>>> def fun(*args):
...     import time
...     time.sleep(.1)
...     return sum(*args)
...
>>> dim1, dim2, dim3 = 10, 20, 30
>>> import itertools
>>> input = ((i,j,k) for i,j,k in itertools.combinations_with_replacement(xrange(dim3), 3) if i < dim1 and j < dim2)
>>> results = p.map(fun, input)
>>> p.close()
>>> p.join()
>>>
>>> results[:2]
[0, 1]
>>> results[-2:]
[56, 57]
Note I'm using multiprocess instead of multiprocessing, but that's only to get the ability to work in the interpreter.
I didn't use a numpy.array, but if you had to... you could just dump the output from p.map directly into a numpy.array and then modify the shape attribute to be shape = (dim1, dim2, dim3), or you could do something like this:
>>> input = ((i,j,k) for i,j,k in itertools.combinations_with_replacement(xrange(dim3), 3) if i < dim1 and j < dim2)
>>> import numpy as np
>>> results = np.empty(dim1*dim2*dim3)
>>> res = p.imap(fun, input)
>>> for i,r in enumerate(res):
...     results[i] = r
...
>>> results.shape = (dim1,dim2,dim3)
Here is a version of the code that runs fun(i, j, k) in parallel for different k indices. This is done by running fun in different processes using https://docs.python.org/2/library/multiprocessing.html
import numpy as np
from multiprocessing import Pool

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

def fun_wrapper(indices):
    return fun(*indices)

if __name__ == '__main__':
    dim1 = 10
    dim2 = 20
    dim3 = 30

    result = np.zeros([dim1, dim2, dim3])

    pool = Pool(processes=8)
    for i in xrange(dim1):
        for j in xrange(dim2):
            result[i, j] = pool.map(fun_wrapper, [(i, j, k) for k in xrange(dim3)])
This is not the most elegant solution, but you may start with it. And you will only get a speed-up if fun contains time-consuming computation.
A simple approach could be to divide the array into sections and create some threads to operate on these sections. For example, one section from (0,0,0) to (5,10,15) and another one from (5,10,16) to (10,20,30).
You can use the threading module and do something like this:
import numpy as np
import threading as t

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

dim1 = 10
dim2 = 20
dim3 = 30

result = np.zeros([dim1, dim2, dim3])

# b - beginning index, e - end index
def work(ib, jb, kb, ie, je, ke):
    for i in xrange(ib, ie):
        for j in xrange(jb, je):
            for k in xrange(kb, ke):
                result[i, j, k] = fun(i, j, k)

threads = list()
threads.append(t.Thread(target=work, args=(0, 0, 0, dim1/2, dim2/2, dim3/2)))
threads.append(t.Thread(target=work, args=(dim1/2, dim2/2, dim3/2 + 1, dim1, dim2, dim3)))

for thread in threads:
    thread.start()
You can define these sections through some algorithm and determine the number of threads dynamically. Hope it helps you or at least gives you some ideas.
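For completeness, the full Cartesian product of indices can also be handed to a single Pool call and the flat result reshaped afterwards. A sketch, assuming fun depends only on its arguments and is defined at module level so it can be pickled (the stand-in body here is hypothetical):
import itertools
import numpy as np
from multiprocessing import Pool

def fun(x, y, z):
    # stand-in for the real time-consuming computation
    return x + y + z

def fun_wrapper(indices):
    return fun(*indices)

if __name__ == '__main__':
    dim1, dim2, dim3 = 10, 20, 30
    # itertools.product iterates k fastest, matching a C-order reshape
    indices = list(itertools.product(range(dim1), range(dim2), range(dim3)))
    pool = Pool()  # one worker per CPU core by default
    flat = pool.map(fun_wrapper, indices)
    pool.close()
    pool.join()
    result = np.array(flat).reshape(dim1, dim2, dim3)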

Cython numpy array indexing

I am trying to speed up some python code with cython, and I'm making use of cython's -a option to see where I can improve things. My understanding is that in the generated html file, the highlighted lines are ones where python functions are called - is that correct?
In the following trivial function, I have declared the numpy array argument arr using the buffer syntax. I thought that this allows indexing operations to take place purely in C without having to call python functions. However, cython -a (version 0.15) highlights the line where I set the value of an element of arr, though not the one where I read one of its elements. Why does this happen? Is there a more efficient way of accessing numpy array elements?
import numpy
cimport numpy

def foo(numpy.ndarray[double, ndim=1] arr not None):
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i]        # not highlighted
        arr[i] = 1.0 + elem  # highlighted
EDIT: Also, how does the mode buffer argument interact with numpy? Assuming I haven't changed the order argument of numpy.array from the default, is it always safe to use mode='c'? Does this actually make a difference to performance?
EDIT after delnan's comment: arr[i] += 1 also gets highlighted (that is why I split it up in the first place, to see which part of the operation was causing the issue). If I turn off bounds checking to simplify things (this makes no difference to what gets highlighted), the generated C code is:
/* "ct.pyx":11
* cdef int i
* cdef double elem
* for i in xrange(10): # <<<<<<<<<<<<<<
* elem = arr[i]
* arr[i] = 1.0 + elem
*/
for (__pyx_t_1 = 0; __pyx_t_1 < 10; __pyx_t_1+=1) {
__pyx_v_i = __pyx_t_1;
/* "ct.pyx":12
* cdef double elem
* for i in xrange(10):
* elem = arr[i] # <<<<<<<<<<<<<<
* arr[i] = 1.0 + elem
*/
__pyx_t_2 = __pyx_v_i;
__pyx_v_elem = (*__Pyx_BufPtrStrided1d(double *, __pyx_bstruct_arr.buf, __pyx_t_2, __pyx_bstride_0_arr));
/* "ct.pyx":13
* for i in xrange(10):
* elem = arr[i]
* arr[i] = 1.0 + elem # <<<<<<<<<<<<<<
*/
__pyx_t_3 = __pyx_v_i;
*__Pyx_BufPtrStrided1d(double *, __pyx_bstruct_arr.buf, __pyx_t_3, __pyx_bstride_0_arr) = (1.0 + __pyx_v_elem);
}
The answer is that the highlighter fools the reader.
I compiled your code, and the instructions generated under the highlight are those needed to handle the error cases and the return value; they are not related to the array assignment.
Indeed, if you change the code to read:
def foo(numpy.ndarray[double, ndim=1] arr not None):
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i]
        arr[i] = 1.0 + elem
    return  # + add this
The highlight would be on the last line and no longer on the assignment.
You can further speed up your code by using the @cython.boundscheck decorator:
import numpy
cimport numpy
cimport cython

@cython.boundscheck(False)
def foo(numpy.ndarray[double, ndim=1] arr not None):
    cdef int i
    cdef double elem
    for i in xrange(10):
        elem = arr[i]
        arr[i] = 1.0 + elem
    return
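On more recent Cython versions, typed memoryviews give the same fast element access as the buffer syntax. A minimal sketch (not part of the original answer; foo_mv is a hypothetical name):
cimport cython

@cython.boundscheck(False)
@cython.wraparound(False)
def foo_mv(double[:] arr):
    # typed memoryview: arr[i] compiles to a direct buffer access
    cdef Py_ssize_t i
    cdef double elem
    for i in range(arr.shape[0]):
        elem = arr[i]
        arr[i] = 1.0 + elem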
