Cython boolean indexing optimization

Cython boolean indexing optimization - python

What is the best way to convert the following code to cython
Given the following example:
#setup example data Z and A
Z = np.random.randn(10,10)
A = np.random.randn(10,10)
A[0,1] = np.nan
A[1,3] = np.nan
A[5,3] = np.nan
A[3,5] = np.nan
B = np.isnan(A).transpose()
C = Z[B * B.transpose()]
I want to optimize the type definition of np.ndarray B in the above example and optimize the creation of ndarray C.
I tried using setting B to uint8 and python and c++ bools.
cdef np.ndarray[np.uint8_t, ndim=2, cast=True] however this yields little or no speedup
and
cdef np.ndarray[bool, ndim=2, cast=True]
where bool is either from cpython cimport bool or from libcpp cimport bool in both cases the above code will throw an error.

The right way to create a buffer which will take np.nan values is using np.float_t or np.double_t. If you try using a integer buffer the following error will raise:
ValueError: cannot convert float Nan to integer
Then, you could use something like:
cdef np.ndarray[np.double_t, ndim=2] A, Z
Z = np.random.randn(10,10)
A = np.random.randn(10,10)
A[0,1] = np.nan
A[1,3] = np.nan
A[5,3] = np.nan
A[3,5] = np.nan
B = np.isnan(A).transpose()
C = Z[B * B.transpose()]

Related

No real gains using cython

I am trying to use a modified version of the RANSAC regressor algorithm, but instead of changing the sklearn's function (to avoid the problems of be using sklearn 0.24.1 and to avoid some warnings from the function), I've found an algorithm in github and was trying to cythonize it to improve speed before I can make my modifications. To my surprise the speed gain was very poor, was it because the code is full of numpy calls or did I make something wrong? The following codes are the python version, cython version (with proper modifications to avoid errors):
##Python version
import numpy as np
def ransac_polyfit(x,y,order, t, n=0.8,k=100,f=0.9):
besterr = np.inf
bestfit = np.array([None])
for kk in range(k):
maybeinliers = np.random.randint(len(x), size=int(n*len(x)))
maybemodel = np.polyfit(x[maybeinliers], y[maybeinliers], order)
alsoinliers = np.abs(np.polyval(maybemodel, x)-y) < t
if sum(alsoinliers) > len(x)*f:
bettermodel = np.polyfit(x[alsoinliers], y[alsoinliers], order)
thiserr = np.sum(np.abs(np.polyval(bettermodel, x[alsoinliers])-y[alsoinliers]))
if thiserr < besterr:
bestfit = bettermodel
besterr = thiserr
return bestfit
##Cython version
cimport cython
cimport numpy as np
import numpy as np
np.import_array()
#cython.boundscheck(False)
#cython.wraparound(False)
cdef ransac_polyfit(np.ndarray[np.npy_int64, ndim=1] x,
np.ndarray[np.npy_int64, ndim=1] y,
np.npy_intp order,
np.npy_float32 t,
np.npy_float32 n=0.8,
np.npy_intp k=100,
np.npy_float32 f=0.9):
cdef np.npy_intp kk, i, ransac_control
cdef np.npy_float64 thiserr, besterr = -1.0
cdef np.ndarray[np.npy_int64, ndim=1] maybeinliers
cdef np.ndarray[np.npy_bool, ndim=1] alsoinliers
cdef np.ndarray[np.npy_float64, ndim=1] bestfit, maybemodel, bettermodel
bestfit = np.zeros(1, dtype = np.float64)
for kk in range(k):
maybeinliers = np.random.randint(len(x), size=int(n*len(x)))
maybemodel = np.polyfit(x[maybeinliers], y[maybeinliers], order)
alsoinliers = np.abs(polyval(maybemodel, x)-y) < t
if sum(alsoinliers) > len(x)*f:
bettermodel = np.polyfit(x[alsoinliers], y[alsoinliers], order)
thiserr = np.sum(np.abs(polyval(bettermodel, x[alsoinliers])-y[alsoinliers]))
if (thiserr < besterr) or (besterr == -1.0):
bestfit = bettermodel
besterr = thiserr
#Since I can't return an empty array, I had to set it as an array with a zero and the ransac_control variable will tell if the function was able or not to find a good model
if (besterr == -1.0):
ransac_control = 0
return ransac_control, bestfit
else:
ransac_control = 1
return ransac_control, bestfit
**PS: Couldn't send the image of the HTML page of the cython code because it's my first question

Comparing scipy.stats.t.sf vs GSL using Cython

I would like to calculate the p values of a large 2D numpy t values array. However, this takes long time and I would like to improve its speed. I tried using GSL.
Although a single gsl_cdf_tdist_P is much much faster than scipy.stats.t.sf, when iterating over the ndarray, the process is very slow. I would like help to improve this.
See the code below.
GSL_Test.pyx
import cython
cimport cython
import numpy
cimport numpy
from cython_gsl cimport *
DTYPE = numpy.float32
ctypedef numpy.float32_t DTYPE_t
cdef get_gsl_p(double t, double nu):
return (1 - gsl_cdf_tdist_P(t, nu)) * 2
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.nonecheck(False)
cdef get_gsl_p_for_2D_matrix(numpy.ndarray[DTYPE_t, ndim=2] t_matrix, int n):
cdef unsigned int rows = t_matrix.shape[0]
cdef numpy.ndarray[DTYPE_t, ndim=2] out = numpy.zeros((rows, rows), dtype='float32')
cdef unsigned int row, col
for row in range(rows):
for col in range(rows):
out[row, col] = get_gsl_p(t_matrix[row, col], n-2)
return out
def get_gsl_p_for_2D_matrix_def(numpy.ndarray[DTYPE_t, ndim=2] t_matrix, int n):
return get_gsl_p_for_2D_matrix(t_matrix, n)
ipython
import GSL_Test
import numpy
import scipy.stats
a = numpy.random.rand(3544, 3544).astype('float32')
%timeit -n 1 GSL_Test.get_gsl_p_for_2D_matrix(a, 25)
1 loop, best of 3: 7.87 s per loop
%timeit -n 1 scipy.stats.t.sf(a, 25)*2
1 loop, best of 3: 4.66 s per loop
UPDATE: Adding cdef declarations I was able to reduce the computational time but not lower than scipy still. I modified the code to have the cdef declarations.
%timeit -n 1 GSL_Test.get_gsl_p_for_2D_matrix_def(a, 25)
1 loop, best of 3: 6.73 s per loop

You can get some small gain in raw performance by using a raw special function instead of stats.t.sf. Looking at the source, you find (https://github.com/scipy/scipy/blob/master/scipy/stats/_continuous_distns.py#L3849)
def _sf(self, x, df):
return sc.stdtr(df, -x)
So that you can use stdtr directly:
np.random.seed(1234)
x = np.random.random((3740, 374))
t1 = stats.t.sf(x, 25)
t2 = stdtr(25, -x)
1 loop, best of 3: 653 ms per loop
1 loop, best of 3: 562 ms per loop
If you do reach out for cython, the typed memoryview syntax often gives you faster code than the old ndarray syntax:
from scipy.special.cython_special cimport stdtr
from numpy cimport npy_intp
import numpy as np
def tsf(double [:, ::1] x, int df=25):
cdef double[:, ::1] out = np.empty_like(x)
cdef npy_intp i, j
cdef double tmp, xx
for i in range(x.shape[0]):
for j in range(x.shape[1]):
xx = x[i, j]
out[i, j] = stdtr(df, -xx)
return np.asarray(out)
Here I'm also using the cython_special interface, which is only avaialble in the dev version of scipy (http://scipy.github.io/devdocs/special.cython_special.html#module-scipy.special.cython_special), but you can use GSL if you want.
Finally, if you suspect a bottleneck in iterations, don't forget to inspect the output of cython -a to see if there's some python overhead in the hot loops.

How to expose a numpy array from c array in cython?

cpdef myf():
# pd has to be a c array.
# Because it will then be consumed by some c function.
cdef double pd[8000]
# Do something with pd
...
# Get a memoryview.
cdef double[:] pd_view = pd
# Coercion the memoryview to numpy array. Not working.
ret = np.asarray(pd)
return ret
I would like it to return a numpy array. How can I do it?
For the moment I have to do
pd_np = np.zeros(8000, dtype=np.double)
cdef int i
for i in range(8000):
pd_np[i] = pd[i]

If you are just declaring your array in your function why not make it a numpy array to begin with, then when you need the c array you can just grab the data pointer.
cimport numpy as np
import numpy as np
def myf():
cdef np.ndarray[double, ndim=1, mode="c"] pd_numpy = np.empty(8000)
cdef double *pd = &pd_numpy[0]
# Do something to fill pd with values
for i in range(8000):
pd[i] = i
return pd_numpy

I made a typo,
ret = np.asarray(pd_view) works

In the memview example here http://docs.cython.org/src/userguide/memoryviews.html
# Memoryview on a C array
cdef int carr[3][3][3]
cdef int [:, :, :] carr_view = carr
carr_view[...] = narr_view # np.arange(27, dtype=np.dtype("i")).reshape((3, 3, 3))
carr_view[0, 0, 0] = 100
I can create a numpy array from carr_view, the memory view on carr, a C array.
# print np.array(carr) # cython error
print 'numpy array on carr_view'
print np.array(carr_view)
print np.array(carr_view).sum() # match sum3d(carr)
# or np.asarray(carr_view)
print 'numpy copy from carr_view'
carr_copy = np.empty((3,3,3))
carr_copy[...] = carr_view[...] # don't need indexed copy
print carr_copy
print carr_copy.sum() # match sum3d(carr)

Speed up cython code

I wrote a python code that manages a lot of data and thus it takes a lot of time. So, I found out Cython and I began to change my code.
Basically, all I did is to change functions' declarations (cdef type name(arguments with variable type) ), to declare cdef variables with its type, and to declare cdef classes.
I'm writing all the .pyx with eclipse, and I'm compiling with the command python setup.py build_ext --inplace and running it with eclipse.
My issue is that comparing python with cython speed, there isn't any difference.
I run the command cython -a <file> to generate a html file and there are a lot of yellow lines.
I don't know if I'm doing something wrong, I should include something else, and I don't know how to delete these yellow lines.
I just paste some code lines, that's the part that I'd like to speed up and because the code is very long.
main.pyx
'''there are a lot of ndarray objects stored in a file and in this step I get each of them until there are no more items '''
cdef ReadWavePoints (WavePointManagement wavePointManagement, ColumnManagement columnManagement):
cdef int runReadWavePoints
wavePointManagement.OpenWavePointFileLoad(wavePointsFile)
runReadWavePoints = 1
while runReadWavePoints == 1:
try:
wavePointManagement.LoadWavePointFile()
wavePointManagement.RoundCoordinates()
wavePointManagement.SortWavePointList()
GroupColumnsVoxels(wavePointManagement.GetWavePointList(), columnManagement)
except:
wavePointManagement.CloseWavePointFile()
columnManagement.CloseWriteColumnFile()
break
'''I check which points are in the same XYZ (voxel) and in the same XY (column)'''
cdef GroupColumnsVoxels (object wavePointList, ColumnManagement columnManagement):
cdef int indexWavePointRef, indexWavePoint
cdef int saved
cdef double voxelValue
cdef int sizeWavePointList
sizeWavePointList = len(wavePointList)
indexWavePointRef = 0
while indexWavePointRef < sizeWavePointList - 1:
saved = 0
voxelValue = (wavePointList[indexWavePointRef]).GetValue()
for indexWavePoint in xrange(indexWavePointRef + 1, len(wavePointList)):
if (wavePointList[indexWavePointRef]).GetX() == (wavePointList[indexWavePoint]).GetX() and (wavePointList[indexWavePointRef]).GetY() == (wavePointList[indexWavePoint]).GetY():
if (wavePointList[indexWavePointRef]).GetZ() == (wavePointList[indexWavePoint]).GetZ():
if voxelValue < (wavePointList[indexWavePoint]).GetValue():
voxelValue = (wavePointList[indexWavePoint]).GetValue()
else:
saved = 1
CheckVoxel((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ(), voxelValue)
indexWavePointRef = indexWavePoint
if indexWavePointRef == sizeWavePointList - 1:
CheckVoxel((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ(), (wavePointList[indexWavePointRef]).GetValue())
break
else:
saved = 1
CheckVoxel((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ(), voxelValue)
columnObject = columnInstance.Column((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY())
columnManagement.AddColumn(columnObject)
MaximumHeightColumn((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ())
indexWavePointRef = indexWavePoint
break
if saved == 0:
CheckVoxel((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ(), voxelValue)
indexWavePointRef = indexWavePoint
columnObject = columnInstance.Column((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY())
columnManagement.AddColumn(columnObject)
MaximumHeightColumn((wavePointList[indexWavePointRef]).GetX(), (wavePointList[indexWavePointRef]).GetY(), (wavePointList[indexWavePointRef]).GetZ())
'''I check if the data stored in a voxel is lower than the new one; if its the case, I store it'''
cdef CheckVoxel (double X, double Y, double Z, double newValue):
cdef object bandVoxel, structvalCheckVoxel, out_str
cdef tuple valueCheckVoxel
bandVoxel = datasetVoxels.GetRasterBand(int(math.floor(Z/0.3))+1)
structvalCheckVoxel = bandVoxel.ReadRaster(int(math.floor((X-Xmin)/0.25)), int(math.floor((Ymax-Y)/0.25)), 1, 1, buf_type=gdal.GDT_Float32)
valueCheckVoxel = struct.unpack('f', structvalCheckVoxel)
if newValue > valueCheckVoxel[0]:
out_str = struct.pack('f', newValue)
bandVoxel.WriteRaster(int(math.floor((X-Xmin)/0.25)), int(math.floor((Ymax-Y)/0.25)), 1, 1, out_str)
'''I check if this point has the highest Z and I store this information'''
cdef MaximumHeightColumn(double X, double Y, double newZ):
cdef object bandMetricMaximumHeightColumn, structvalMaximumHeightColumn, out_strMaximumHeightColumn
cdef tuple valueMaximumHeightColumn
bandMetricMaximumHeightColumn = datasetMetrics.GetRasterBand(10)
structvalMaximumHeightColumn = bandMetricMaximumHeightColumn.ReadRaster(int(math.floor((X-Xmin)/0.25)), int(math.floor((Ymax-Y)/0.25)), 1, 1, buf_type=gdal.GDT_Float32)
valueMaximumHeightColumn = struct.unpack('f', structvalMaximumHeightColumn)
if newZ > round(valueMaximumHeightColumn[0], 1):
out_strMaximumHeightColumn = struct.pack('f', newZ)
bandMetricMaximumHeightColumn.WriteRaster(int(math.floor((X-Xmin)/0.25)), int(math.floor((Ymax-Y)/0.25)), 1, 1, out_strMaximumHeightColumn)
WavePointManagement.pyx
'''this class serializes, rounds and sorts the points of each ndarray'''
import cPickle as pickle
import numpy as np
cimport numpy as np
import math
cdef class WavePointManagement(object):
'''
This class manages all the points extracted from the waveform
'''
cdef object fileObject, wavePointList
__slots__ = ('wavePointList', 'fileObject')
def __cinit__(self):
'''
Constructor
'''
self.fileObject = None
self.wavePointList = np.array([])
cdef object GetWavePointList(self):
return self.wavePointList
cdef void OpenWavePointFileLoad (self, object fileName):
self.fileObject = file(fileName, 'rb')
cdef void LoadWavePointFile (self):
self.wavePointList = None
self.wavePointList = pickle.load(self.fileObject)
cdef void SortWavePointList (self):
self.wavePointList = sorted(self.wavePointList, key=lambda k: (k.x, k.y, k.z))
cdef void RoundCoordinates (self):
cdef int indexPointObject, sizeWavePointList
for pointObject in self.GetWavePointList():
pointObject.SetX(round(math.floor(pointObject.GetX()/0.25)*0.25, 2))
pointObject.SetY(round(math.ceil(pointObject.GetY()/0.25)*0.25, 2))
pointObject.SetZ(round(math.floor(pointObject.GetZ()/0.3)*0.3, 1))
cdef void CloseWavePointFile(self):
self.fileObject.close()
setup.py
from distutils.core import setup
from distutils.extension import Extension
from Cython.Distutils import build_ext
import numpy
ext = Extension("main", ["main.pyx"], include_dirs = [numpy.get_include()])
setup (ext_modules=[ext],
cmdclass = {'build_ext' : build_ext}
)
test_cython.py
'''this is the file I run with eclipse after compiling'''
from main import main
main()
How could I speed up this code?

Your code jumps back and forth between using numpy arrays and lists. As such there is virtually no difference between the code that cython will produce.
The following code produces a python list, and the key function is a pure python function as well.
self.wavePointList = sorted(self.wavePointList, key=lambda k: (k.x, k.y, k.z))
You will want to use ndarray.sort (or numpy.sort if you don't want to sort inplace). To do this you will also need to change how your objects are stored in the array. That is, you will need to use a structured array. See numpy.sort for examples on how to sort structured arrays -- particularly the last two examples on the page.
Once you have your data stored in a numpy array then you need to tell cython about how the data is stored in the array. This includes providing type information and the dimensions of the array. This page provides more information how to work efficiently with numpy arrays.
An example of show to create and sort structured arrays:
import numpy as np
cimport numpy as np
DTYPE = [('name', 'S10'), ('height', np.float64), ('age', np.int32)]
cdef packed struct Person:
char name[10]
np.float64_t height
np.int32_t age
ctypedef Person DTYPE_t
def create_array():
values = [('Arthur', 1.8, 41), ('Lancelot', 1.9, 38),
('Galahad', 1.7, 38)]
return np.array(values, dtype=DTYPE)
cpdef sort_by_age_then_height(np.ndarray[DTYPE_t, ndim=1] arr):
arr.sort(order=['age', 'height'])
Finally, you will need to convert your code from using python methods to using the standard c library methods for a further speed up. Below is an example using RoundCoordinates. ``cpdef` means the function is also exposed to python by a wrapper function.
cimport cython
cimport numpy as np
from libc.math cimport floor, ceil, round
import numpy as np
DTYPE = [('x', np.float64), ('y', np.float64), ('z', np.float64)]
cdef packed struct Point3D:
np.float64_t x, y, z
ctypedef Point3D DTYPE_t
# Caution should be used when turning the bounds check off as it can lead to undefined
# behaviour if you use an invalid index.
#cython.boundscheck(False)
cpdef RoundCoordinates_cy(np.ndarray[DTYPE_t] pointlist):
cdef int i
cdef DTYPE_t point
for i in range(len(pointlist)): # this line is optimised into a c loop
point = pointlist[i] # creates a copy of the point
point.x = round(floor(point.x/0.25)*2.5) / 10
point.y = round(ceil(point.y/0.25)*2.5) / 10
point.z = round(floor(point.z/0.3)*3) / 10
pointlist[i] = point # overwrites the old point data with the new data
Finally, before rewriting your entire code base, you should profile your code to see which functions the program spends most of its time and optimise those functions before bothering about optimising other functions.

In-place shuffling of multidimensional arrays

I am trying to implement a NaN-safe shuffling procedure in Cython that can shuffle along several axis of a multidimensional matrix of arbitrary dimension.
In the simple case of a 1D matrix, one can simply shuffle over all indices with non-NaN values using the Fisher–Yates algorithm:
def shuffle1D(np.ndarray[double, ndim=1] x):
cdef np.ndarray[long, ndim=1] idx = np.where(~np.isnan(x))[0]
cdef unsigned int i,j,n,m
randint = np.random.randint
for i in xrange(len(idx)-1, 0, -1):
j = randint(i+1)
n,m = idx[i], idx[j]
x[n], x[m] = x[m], x[n]
I would like to extend this algorithm to handle large multidimensional arrays without reshape (which triggers a copy for more complicated cases not considered here). To this end, I would need to get rid of the fixed input dimension, which seems neither possible with numpy arrays nor memoryviews in Cython. Is there a workaround?
Many thanks in advance!

Thanks to the comments of #Veedrac this answer uses more of Cython capabilities.
A pointer array stores the memory address of the values along axis
Your algorithm is used with a modification that checks for nan values, preventing them of being sorted
It won't create a copy for C ordered arrays. In case of Fortran ordered arrays the ravel() command will return a copy. This could be improved by creating another array of double pointers to carry the values of x, probably with some cache penalty...
This code is at least one order of magnitude faster than the other based on slices.
from libc.stdlib cimport malloc, free
cimport numpy as np
import numpy as np
from numpy.random import randint
cdef extern from "numpy/npy_math.h":
bint npy_isnan(double x)
def shuffleND(x, int axis=-1):
cdef np.ndarray[double, ndim=1] v # view of x
cdef np.ndarray[int, ndim=1] strides
cdef int i, j
cdef int num_axis, pos, stride
cdef double tmp
cdef double **v_axis
if axis==-1:
axis = x.ndim-1
shape = list(x.shape)
num_axis = shape.pop(axis)
v_axis = <double **>malloc(num_axis*sizeof(double *))
for i in range(num_axis):
v_axis[i] = <double *>malloc(1*sizeof(double))
try:
tmp_strides = [s//x.itemsize for s in x.strides]
stride = tmp_strides.pop(axis)
strides = np.array(tmp_strides, dtype=np.int32)
v = x.ravel()
for indices in np.ndindex(*shape):
pos = (strides*indices).sum()
for i in range(num_axis):
v_axis[i] = &v[pos + i*stride]
for i in range(num_axis-1, 0, -1):
j = randint(i+1)
if npy_isnan(v_axis[i][0]) or npy_isnan(v_axis[j][0]):
continue
tmp = v_axis[i][0]
v_axis[i][0] = v_axis[j][0]
v_axis[j][0] = tmp
finally:
free(v_axis)
return x

The following algorithm is based on slices, where no copy is made and it should work for any np.ndarray. The main steps are:
np.ndindex() is used to run throught the different multidimensional indices, excluding the one belonging to the axis you want to shuffle
the shuffle already developed by you for the 1-D case is applied.
Code:
def shuffleND(np.ndarray x, axis=-1):
cdef np.ndarray[long long, ndim=1] idx
cdef unsigned int i, j, n, m
if axis==-1:
axis = x.ndim-1
all_shape = list(np.shape(x))
shape = all_shape[:]
shape.pop(axis)
for slices in np.ndindex(*shape):
slices = list(slices)
axis_slice = slices[:]
axis_slice.insert(axis, slice(None))
idx = np.where(~np.isnan(x[tuple(axis_slice)]))[0]
for i in range(idx.shape[0]-1, 0, -1):
j = randint(i+1)
n, m = idx[i], idx[j]
slice1 = slices[:]
slice1.insert(axis, n)
slice2 = slices[:]
slice2.insert(axis, m)
slice1 = tuple(slice1)
slice2 = tuple(slice2)
x[slice1], x[slice2] = x[slice2], x[slice1]
return x

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Cython boolean indexing optimization - python

Related

No real gains using cython

Comparing scipy.stats.t.sf vs GSL using Cython

How to expose a numpy array from c array in cython?

Speed up cython code

In-place shuffling of multidimensional arrays

Categories

Resources