I am transferring a double-array from a c-function to a python-function. My code for that is:
C-Code:
/*
 * Return a newly allocated array holding 3 * a[i] for each of the first
 * `size` elements of `a`.
 *
 * The caller owns the returned buffer and must release it with free().
 * Returns NULL when `size` is negative or when the allocation fails.
 */
double *compute(int size, const double a[])
{
    double *array;

    /* A negative size would wrap around to a huge size_t inside malloc(). */
    if (size < 0)
        return NULL;

    array = malloc(sizeof(double) * (size_t)size);
    /* Never write through a failed allocation. */
    if (array == NULL)
        return NULL;

    for (int i = 0; i < size; i++)
    {
        array[i] = 3 * a[i];
    }
    return array;
}
pyx-code:
cdef class ArrayWrapper:
    """Holds a C-allocated buffer and exposes it to numpy through __array__.

    The buffer is released with free() when the wrapper is deallocated.
    """
    cdef void* data_ptr
    cdef int size

    cdef set_data(self, int size, void* data_ptr):
        """Attach a C buffer to this wrapper.

        The arguments are C-level values, so they are supplied here rather
        than through the constructor.

        Parameters:
        -----------
        size: int
            Number of elements in the buffer.
        data_ptr: void*
            Address of the first element.
        """
        self.size = size
        self.data_ptr = data_ptr

    def __array__(self):
        """Build the ndarray numpy asks for when it converts this object."""
        cdef np.npy_intp shape[1]
        shape[0] = <np.npy_intp> self.size
        # One-dimensional array of 'size' elements on top of the stored buffer.
        # NOTE(review): the type code passed here has to match the element
        # type of the C buffer.
        ndarray = np.PyArray_SimpleNewFromData(1, shape,
                                               np.NPY_INT, self.data_ptr)
        return ndarray

    def __dealloc__(self):
        """Release the buffer; runs once no reference to the object is left."""
        free(<void*>self.data_ptr)
def py_compute(int size, np.ndarray[np.double_t,ndim=1] a):
    """Call the C function 'compute' and return its result as an ndarray.

    The buffer allocated in C is wrapped, not copied; an ArrayWrapper set as
    the array's base frees it once the array is no longer referenced.
    """
    cdef double *array
    cdef np.ndarray ndarray

    # Run the C routine on the input array's data
    array = compute(size, <double*> a.data)

    # Hand the returned buffer to a wrapper object and let numpy view it
    array_wrapper = ArrayWrapper()
    array_wrapper.set_data(size, <void*> array)
    ndarray = np.array(array_wrapper, copy=False)

    # The 'base' slot is filled in at C level below, so Python is unaware of
    # that extra reference: account for it by hand.
    Py_INCREF(array_wrapper)
    ndarray.base = <PyObject*> array_wrapper
    return ndarray
python-code:
for i in xrange(10):
x[i] = i;
a = cython_wrapper.py_compute(10, x)
print a
But my result is
[ 0 0 0 1074266112 0 1075314688 0 1075970048 0 1076363264]
instead of the expected
[ 0. 3. 6. 9. 12. 15. 18. 21. 24. 27.]
Where is my mistake? I assume it has something to do with a problematic pointer transfer, but I am not sure.
The mistake here is that in the line
ndarray = np.PyArray_SimpleNewFromData(1, shape,
np.NPY_INT, self.data_ptr)
you are telling numpy that self.data_ptr is pointing to an array of ints and not one of doubles.
You can fix your code by telling numpy the correct datatype like so:
ndarray = np.PyArray_SimpleNewFromData(1, shape,
np.NPY_DOUBLE, self.data_ptr)
and it should work as expected.
In addition to this you can simplify your wrapper code slightly as well by not having to pass in the size of the input array as it is already contained in the np.ndarray that you pass to py_compute
def py_compute(np.ndarray[np.double_t,ndim=1] a):
    """ Python binding of the 'compute' function in 'GNLSE_RHS.c' that does
    not copy the data allocated in C.

    The length is taken from the input array itself. An empty input returns
    an empty double array without calling into C.
    """
    cdef double *array
    cdef np.ndarray ndarray
    # Typed as a C int: a bare ``cdef size`` declares a Python object and
    # forces an object-to-int conversion at every use below.
    cdef int size = <int> a.shape[0]
    # There is no first element to take the address of in an empty array.
    if size == 0:
        return np.empty(0, dtype=np.double)
    # Call the C function
    array = compute(size, &a[0])
    array_wrapper = ArrayWrapper()
    array_wrapper.set_data(size, <void*> array)
    ndarray = np.array(array_wrapper, copy=False)
    # Assign our object to the 'base' of the ndarray object
    ndarray.base = <PyObject*> array_wrapper
    # Increment the reference count, as the above assignment was done in
    # C, and Python does not know that there is this additional reference
    Py_INCREF(array_wrapper)
    return ndarray
Related
I have a C++ function that I want to call from Python 2.7.12. It looks like this:
// C-linkage wrapper around cpp_function. Results are reported through the
// reference out-parameters (y, z, n_y, n_z) and through the return value.
extern "C" {
double* myfunction(double* &y, double* &z, int &n_y, int &n_z, int a, int b)
{
// Function-local vectors that receive the results
vector<double> _x;
vector<double> _y;
vector<double> _z;
// Call some external C++ function
cpp_function(_x, _y, _z, a, b);
// Convert vectors back to arrays
// NOTE(review): these pointers refer to the storage of the local vectors
// declared above, which go out of scope when this function returns.
double* x = &_x[0]; // or x = _x.data();
y = &_y[0];
z = &_z[0];
// Element counts of y and z
n_y = static_cast<int>(_y.size());
n_z = static_cast<int>(_z.size());
return x;
}
}
Basically, this function takes two integers a and b as input (plus some other data that I omitted for clarity) and does some calculations. It then puts the results into two arrays y and z, stores their respective sizes in n_y and n_z, and returns an array x of size a*b.
After building this function to a shared library myfunction.so, I call it in Python as follows:
# Load the shared library and look up the exported function.
from ctypes import *
libc = CDLL('myfunction.so')
myfunction = libc.myfunction
# Declare the return type and the parameter types ctypes will enforce.
myfunction.restype = POINTER(c_double)
myfunction.argtypes = [POINTER(c_double), POINTER(c_double),
c_int, c_int,
c_int, c_int]
# Variables meant to receive the outputs: two double* and two ints.
y = POINTER(c_double)()
z = POINTER(c_double)()
n_y = c_int()
n_z = c_int()
a = 18
b = 18
# NOTE(review): every output is passed with byref(), while argtypes above
# declares arguments 3 and 4 as plain c_int and 1 and 2 as a single-level
# POINTER(c_double).
x = myfunction(byref(y), byref(z),
byref(n_y), byref(n_z),
c_int(a), c_int(b))
Running this script I obtained an error:
ctypes.ArgumentError: argument 3: <type 'exceptions.TypeError'>: wrong type
So the c_int type of n_y is not correct. What should I put instead?
Thank you very much for your help!
UPDATE
Following the suggestion by #GiacomoAlzetta and #CristiFati, I have changed my code to use pointers instead of pass by reference, as follows.
(y and z are similar so let me omit z)
// Pointer-based variant of the wrapper: y and n_y are out-parameters that
// the caller passes by address.
extern "C" {
double* myfunction(double** y, int* n_y, int a, int b)
{
// Function-local vectors that receive the results
vector<double> _x;
vector<double> _y;
// Call some external C++ function
cpp_function(_x, _y, a, b);
// Convert vectors back to arrays
// NOTE(review): these pointers refer to the storage of the local vectors
// declared above, which go out of scope when this function returns.
double* x = &_x[0]; // or x = _x.data();
// Writes through the caller-supplied addresses; both must be valid.
*y = &_y[0];
*n_y = static_cast<int>(_y.size());
return x;
}
}
Now in C++, I call the above function as follows:
double* y;
int n_y;
int a = 18;
int b = 18;
double* x = myfunction(&y, &n_y, a, b);
which works. And in Python:
# Load the shared library and look up the exported function.
from ctypes import *
libc = CDLL('myfunction.so')
myfunction = libc.myfunction
# Declare the return type and the parameter types ctypes will enforce.
myfunction.restype = POINTER(c_double)
myfunction.argtypes = [POINTER(POINTER(c_double)), POINTER(c_int),
c_int, c_int]
# NOTE(review): calling a ctypes pointer type with no argument creates a NULL
# pointer, so y and n_y do not point at any storage the C function can write
# to.
y = POINTER(POINTER(c_double))()
n_y = POINTER(c_int)()
a = 18
b = 18
x = myfunction(y, n_y, c_int(a), c_int(b))
which produced a Segmentation fault error, which happened at the line
*y = &_y[0];
Thank you for your help!
You're almost there. In the meantime, stay close to [Python 3.Docs]: ctypes - A foreign function library for Python.
Remember that you should handle pointer arguments (actually it applies to all of them, but for non pointer ones things are straightforward) the same way, no matter where you are.
In other words, what you do in C (instantiate a variable and pass a pointer to it to the function) is what you should also do in Python, instead of instantiating a pointer variable and passing that to the function.
Translated into code, you should modify the way you initialize y, n_y, and the function (myfunction) call:
from ctypes import *  # Anti-pattern. Don't ever use it

# Create the variables themselves (a double* and an int) ...
y = POINTER(c_double)()
n_y = c_int()
a = 18
b = 18
# ... and pass pointers to them, exactly as the C++ caller passes &y and &n_y.
x = myfunction(pointer(y), pointer(n_y), a, b)
Notes:
What I stated in a comment (Undefined Behavior because vectors are living on the stack and will be destroyed when exiting the function) still stands. To fix it either:
Allocate the data on heap (malloc / new) before returning it (when done with it, you'll also need to deallocate it (free / delete), to avoid memory leaks)
Make them static
Some remotely connected examples:
[SO]: Python ctypes cdll.LoadLibrary, instantiate an object, execute its method, private variable address truncated (#CristiFati's answer)
[SO]: python ctypes issue on different OSes (#CristiFati's answer)
You can use references, since references are just syntax for yet another level of pointer.
Your vectors are local variables and are freed when your function returns, so you need to keep the memory around.
Here's your C++ code reworked to keep the memory around. I just created some local variables with some data since your example wasn't complete:
// Export macro: __declspec(dllexport) on Windows, default symbol visibility
// with other toolchains.
#if defined(_WIN32)
#define API __declspec(dllexport)
#else
#define API __attribute__((visibility("default")))
#endif
#include <cstdlib>
#include <cstring>   // memcpy
#include <vector>
using namespace std;
extern "C" {
// Fills three heap-allocated arrays with sample data.
//   returns : pointer to the x array (n_x elements)
//   y, z    : set to newly allocated arrays of n_y and n_z elements
//   n_x, n_y, n_z : set to the element counts
// All three arrays stay valid after the call and must be released with
// myfree().
API double* myfunction(double* &y, double* &z, int &n_x, int &n_y, int &n_z)
{
    vector<double> _x {1.1,2.2,3.3};
    vector<double> _y {4.4,5.5};
    vector<double> _z {6.6,7.7,8.8,9.9};
    // Allocate some arrays to store the vectors.
    double* x = new double[_x.size()];
    y = new double[_y.size()];
    z = new double[_z.size()];
    // Copy the contents out of the vectors, which are destroyed on return.
    memcpy(x,_x.data(),_x.size() * sizeof(double));
    memcpy(y,_y.data(),_y.size() * sizeof(double));
    memcpy(z,_z.data(),_z.size() * sizeof(double));
    n_x = static_cast<int>(_x.size());
    n_y = static_cast<int>(_y.size());
    n_z = static_cast<int>(_z.size());
    return x;
}
// A function to free up the memory allocated by myfunction().
API void myfree(double* x, double* y, double* z)
{
    delete [] x;
    delete [] y;
    delete [] z;
}
}
Python:
from ctypes import *
dll = CDLL('test')
dll.myfunction.argtypes = (POINTER(POINTER(c_double)),
POINTER(POINTER(c_double)),
POINTER(c_int),
POINTER(c_int),
POINTER(c_int))
dll.myfunction.restype = POINTER(c_double)
dll.myfree.argtypes = POINTER(c_double),POINTER(c_double),POINTER(c_double)
dll.myfree.restype = None
# Helper function to allocate storage for return arrays
def myfunction():
    """Call dll.myfunction and return its three arrays as Python lists.

    The C arrays are copied into lists and then released with dll.myfree,
    so nothing allocated on the C side outlives this call.
    """
    # Output slots: two C double* and three C ints
    y_ptr = POINTER(c_double)()
    z_ptr = POINTER(c_double)()
    len_x, len_y, len_z = c_int(), c_int(), c_int()

    # Everything goes in by reference so the callee can fill it in
    x_ptr = dll.myfunction(
        byref(y_ptr), byref(z_ptr),
        byref(len_x), byref(len_y), byref(len_z),
    )

    # Slicing a ctypes pointer copies the elements into a Python list
    xs = x_ptr[:len_x.value]
    ys = y_ptr[:len_y.value]
    zs = z_ptr[:len_z.value]

    # The copies are complete, so the C arrays can go
    dll.myfree(x_ptr, y_ptr, z_ptr)
    return xs, ys, zs
x,y,z = myfunction()
print(x,y,z)
Output:
[1.1, 2.2, 3.3] [4.4, 5.5] [6.6, 7.7, 8.8, 9.9]
Note there is a lot of copying going on. Look into numpy, which creates arrays in a format that can be directly accessed by C and has a built-in ctypes interface.
I use cython and I need to store the data as shown below. Earlier I used for loops to store the data from pus_image[0] into a 3D array but when running for n frames it created a bottleneck in performance. Hence I used PyArray_NewFromDescr to store which solves the bottleneck issue earlier faced. But the displayed images look different from the previous method, as I am not able to do increment _puc_image += aoiStride. Could anyone please help me solve this issue.
Code 1 :
# Acquire `nframes` frames through the AT_* camera calls and copy their
# 16-bit pixels element by element into `data`, which is returned.
# NOTE(review): indentation was lost in this listing, so the nesting of the
# statements below (in particular `_puc_image += aoiStride` and `free(pBuf)`)
# cannot be confirmed from the text.
def LiveAquisition(self,nframes,np.ndarray[np.uint16_t,ndim = 3,mode = 'c']data):
cdef:
int available
# NOTE(review): sizeInBytes and aoiStride are declared here but no assignment
# to either is visible before they are used below.
AT_64 sizeInBytes
AT_64 aoiStride
AT_WC string[20]
AT_WC string1[20]
AT_WC string2[20]
AT_WC string3[20]
unsigned char * pBuf
unsigned char * _puc_image
int BufSize
# Only l is initialised here; it is never changed in the visible code, so
# data[l] below always addresses index 0.
unsigned int i, j, k, l = 0
for i in range(nframes):
# Allocate a zeroed byte buffer and queue it with the camera
pBuf = <unsigned char *>calloc(sizeInBytes, sizeof(unsigned char))
AT_QueueBuffer(<AT_H>self.cameraHandle, pBuf, sizeInBytes)
print "Frame number is :",
print i
# pBuf and BufSize are passed by address and may be updated by the call
response_code = AT_WaitBuffer(<AT_H>self.cameraHandle, &pBuf, &BufSize, 500)
# _puc_image is the byte cursor, pus_image the 16-bit element cursor
_puc_image = pBuf
pus_image = <unsigned short*>pBuf
for j in range(self.aoiWidth/self.hbin):
# Restart the element cursor at the current byte cursor
pus_image = <unsigned short*>(_puc_image)
for k in range(self.aoiHeight/self.vbin):
data[l][j][k] = pus_image[0]
pus_image += 1
# Advance the byte cursor by aoiStride bytes
_puc_image += aoiStride
free(pBuf)
return data
Code 2 : Using PyArray_NewFromDescr
Prior to which its defined as :
from cpython.ref cimport PyTypeObject
from python_ref cimport Py_INCREF
cdef extern from "<numpy/arrayobject.h>":
object PyArray_NewFromDescr(PyTypeObject *subtype, np.dtype descr,int nd, np.npy_intp* dims,np.npy_intp*strides,void* data, int flags, object obj)
# Acquire `nframes` frames through the AT_* camera calls and assign each
# buffer to data[i,:,:] through an array built with PyArray_NewFromDescr.
# NOTE(review): indentation was lost in this listing, so whether
# `free(pBuf)` sits inside or after the loop cannot be confirmed from the text.
def LiveAquisition(self,nframes,np.ndarray[np.uint16_t,ndim = 3,mode = 'c']data):
cdef:
int available
# NOTE(review): sizeInBytes is declared here but no assignment to it is
# visible before it is used below.
AT_64 sizeInBytes
AT_64 aoiStride
AT_WC string[20]
AT_WC string1[20]
AT_WC string2[20]
AT_WC string3[20]
unsigned char * pBuf
unsigned char * _puc_image
int BufSize
unsigned int i, j, k, l = 0
np.npy_intp dims[2]
np.dtype dtype = np.dtype('<B')
for i in range(nframes):
# Allocate a zeroed byte buffer and queue it with the camera
pBuf = <unsigned char *>calloc(sizeInBytes, sizeof(unsigned char))
AT_QueueBuffer(<AT_H>self.cameraHandle, pBuf, sizeInBytes)
print "Frame number is :",
print i
# pBuf and BufSize are passed by address and may be updated by the call
response_code = AT_WaitBuffer(<AT_H>self.cameraHandle, &pBuf, &BufSize, 500)
# NOTE(review): the reference count of `dtype` is raised here, but the call
# below passes a separate np.dtype('<B') object instead of `dtype`.
Py_INCREF(dtype)
dims[0] = self.aoiWidth
dims[1] = self.aoiHeight
# Wrap pBuf as a 2-D array of single bytes and assign it into frame i
data[i,:,:] = PyArray_NewFromDescr(<PyTypeObject *> np.ndarray, np.dtype('<B'), 2,dims, NULL,pBuf, np.NPY_C_CONTIGUOUS, None)
free(pBuf)
return data
There are a few large errors in the way you're doing this. However, what you're doing is totally unnecessary, and there's a much simpler approach.
You can simply allocate the data using Numpy, and get the address of the first element of that array:
# earlier
cdef unsigned char[:,::1] p
# in loop
# np.zeros allocates a zero-filled (width, height) byte buffer, matching the
# calloc it replaces. np.array((w, h)) would instead build a 2-element array
# holding the two numbers.
p = np.zeros((self.aoiWidth, self.aoiHeight), dtype=np.uint8)
pBuf = &p[0,0] # address of first element of p
# code goes here
data[i,:,:] = p
Errors in what you're doing:
pBuf = <unsigned char *>calloc(sizeInBytes, sizeof(unsigned char))
Here, sizeInBytes is uninitialized, and therefore the size you allocate will be arbitrary.
PyArray_NewFromDescr steals a reference to the descr argument. This means that it does not increment the reference count of the argument. The line
PyArray_NewFromDescr(<PyTypeObject *> np.ndarray, np.dtype('<B'), ...)
will be translated by Cython to something like
temp_dtype = np.dtype('<B') # refcount 1
PyArray_NewFromDescr(<PyTypeObject *> np.ndarray, temp_dtype, ...)
# temp_dtype refcount is still 1
Py_DECREF(temp_dtype) # Cython's own cleanup
# temp_dtype has now been destroyed, but is still being used by your array
It looks like you copied some code that dealt with this correctly (Py_INCREF(dtype), which was then passed to PyArray_NewFromDescr), but chose to ignore that and create your own temporary object.
PyArray_NewFromDescr does not own the data. Therefore you are responsible for deallocating it once it has been used (and only when you're sure it's no longer needed). You only do one free, after the loop, so you are leaking almost all the memory you allocated. Either put the free in the loop, or modify the OWNDATA flag to give your new array ownership of your array.
In summary, unless you have a good understanding of the Python C API, I recommend not using PyArray_NewFromDescr and using numpy arrays to allocate your data instead.
I am working on a project involving object detection through deep learning, with the underlying detection code written in C. Due to the requirements of the project, this code has a Python wrapper around it, which interfaces with the required C functions through ctypes. Images are read from Python, and then transferred into C to be processed as a batch.
In its current state, the code is very unoptimized: the images (640x360x3 each) are read using cv2.imread then stacked into a numpy array. For example, for a batch size of 16, the dimensions of this array are (16,360,640,3). Once this is done, a pointer to this array is passed through ctypes into C where the array is parsed, pixel values are normalized and rearranged into a 2D array. The dimensions of the 2D array are 16x691200 (16x(640*360*3)), arranged as follows.
row [0]: Image 0: (B)r0(B)r1(B)r2.... (G)r0(G)r1(G)r2.... (R)r0(R)r1(R)r2....
row [1]: Image 1: (B)r0(B)r1(B)r2.... (G)r0(G)r1(G)r2.... (R)r0(R)r1(R)r2....
.
.
row [15]: Image 15: (B)r0(B)r1(B)r2.... (G)r0(G)r1(G)r2.... (R)r0(R)r1(R)r2....
`
The C code for doing this currently looks like this, where the pixel values are accessed through strides and arranged sequentially per image. nb is the total number of images in the batch (usually 16); h, w, c are 360,640 and 3 respectively.
/*
 * Copy a 4-D byte array (batch, height, width, channel), addressed through
 * its strides, into a matrix with one row per batch entry. Within a row the
 * values are ordered channel by channel, then row by row, then column by
 * column, and each byte is divided by 255.
 */
matrix ndarray_to_matrix(unsigned char* src, long* shape, long* strides)
{
    /* Extents of the four axes */
    int batch = shape[0];
    int rows = shape[1];
    int cols = shape[2];
    int chans = shape[3];

    /* Offsets between consecutive elements along each axis of src */
    int stride_b = strides[0];
    int stride_r = strides[1];
    int stride_col = strides[2];
    int stride_ch = strides[3];

    int n, r, col, ch;
    int dst, from = 0;

    matrix X = make_matrix(batch, rows*cols*chans);

    for (n = 0; n < batch; ++n) {
        for (r = 0; r < rows; ++r) {
            for (ch = 0; ch < chans; ++ch) {
                for (col = 0; col < cols; ++col) {
                    /* Destination is planar: all of channel 0, then 1, ... */
                    dst = ch*cols*rows + r*cols + col;
                    from = stride_b*n + stride_r*r + stride_col*col + stride_ch*ch;
                    X.vals[n][dst] = src[from]/255.;
                }
            }
        }
    }
    return X;
}
And the corresponding Python code that calls this function: (array is the original numpy array)
for i in range(start, end):
imgName = imgDir + '/' + allImageName[i]
img = cv2.imread(imgName, 1)
batchImageData[i-start,:,:] = img[:,:]
data = batchImageData.ctypes.data_as(POINTER(c_ubyte))
resmatrix = self.ndarray_to_matrix(data, batchImageData.ctypes.shape, batchImageData.ctypes.strides)
As of now, this ctypes implementation takes about 35 ms for a batch of 16 images. I'm working on a very FPS critical image processing pipeline, so is there a more efficient way of doing these operations? Specifically:
Can I read the image directly as a 'strided' one dimensional array in Python from disk, thus avoiding the iterative access and copying?
I have looked into numpy operations such as:
np.ascontiguousarray(img.transpose(2,0,1).flat, dtype=float)/255. which should achieve something similar, but this is actually taking more time possibly because of it being called in Python.
Would Cython help anywhere during the read operation?
Regarding the ascontiguousarray method, I'm assuming that it's pretty slow because Python has to do some memory work to return a C-like contiguous array.
EDIT 1:
I saw this answer, apparently openCV's imread function should already return a contiguous array.
I am not very familiar with ctypes, but happen to use the PyBind library and can only recommend using it. It implements Python's buffer protocol hence allowing you to interact with python data with almost no overhead.
I've answered a question explaining how to pass a numpy array from Python to C/C++, do something dummy to it in C++ and return a dynamically created array back to Python.
EDIT 2 : I've added a simple example that receives a Numpy array, send it to C and prints it from C. You can find it here. Hope it helps!
EDIT 3 :
To answer your last comment, yes you can definitely do that.
You could modify your code to (1) instantiate a 2D numpy array in C++, (2) pass its reference to the data to your C function that will modify it instead of declaring a Matrix and (3) return that instance to Python by reference.
Your function would become:
/*
 * Copy a 4-D byte array (batch, height, width, channel), addressed through
 * its strides, into the caller-provided buffer `x`.
 *
 * `x` must hold shape[0] * shape[1] * shape[2] * shape[3] doubles. It is
 * filled as a row-major 2-D array with one row per batch entry. Within a
 * row the values are ordered channel by channel, and each byte is divided
 * by 255.
 */
void ndarray_to_matrix(unsigned char* src, double * x, long* shape, long* strides)
{
    int nb = shape[0];
    int h = shape[1];
    int w = shape[2];
    int c = shape[3];
    int step_b = strides[0];
    int step_h = strides[1];
    int step_w = strides[2];
    int step_c = strides[3];
    /* Number of doubles in one output row */
    long row_len = (long)h * w * c;
    int b, i, j, k;
    int index1, index2 = 0;
    for(b = 0; b < nb ; ++b) {
        for(i = 0; i < h; ++i) {
            for(k= 0; k < c; ++k) {
                for(j = 0; j < w; ++j) {
                    index1 = k*w*h + i*w + j;
                    index2 = step_b*b + step_h*i + step_w*j + step_c*k;
                    /* Write into the flat output buffer, not a matrix struct */
                    x[b*row_len + index1] = src[index2]/255.;
                }
            }
        }
    }
}
And you'd add, in your C++ wrapper code
// Instantiate the output array with its final 2-D shape, assuming we know
// b, h, c, w. Allocating it 2-D up front avoids a later reshape() call, whose
// result is a new array object rather than an in-place change to x.
py::array_t<double> x({b, h*c*w});
py::buffer_info bufx = x.request();
double*ptrx = (double *) bufx.ptr;
// Call to your C function with ptrx as input
ndarray_to_matrix(src, ptrx, shape, strides);
Do not forget to modify the prototype of the C++ wrapper function to return a numpy array like:
py::array_t<double> read_matrix(...){}...
This should work, I didn't test it though :)
Is there a way to use AES-NI instructions within Cython code?
Closest I could find is how someone accessed SIMD instructions:
https://groups.google.com/forum/#!msg/cython-users/nTnyI7A6sMc/a6_GnOOsLuQJ
AES-NI in Python thread was not answered:
Python support for AES-NI
You should be able to just define the intrinsics as if they're normal C functions in Cython. Something like
# Header locations follow the Microsoft documentation
cdef extern from "emmintrin.h":
    # Cython only needs to know the type exists, so it is declared opaque
    ctypedef struct __m128i:
        pass
    __m128i _mm_set_epi32 (int i3, int i2, int i1, int i0)

cdef extern from "wmmintrin.h":
    __m128i _mm_aesdec_si128(__m128i v,__m128i rkey)

# then in some Cython function
def f():
    """Build a value and a key from constants and call the AES intrinsic."""
    cdef __m128i key = _mm_set_epi32(5,6,7,8)
    cdef __m128i v = _mm_set_epi32(1,2,3,4)
    cdef __m128i result = _mm_aesdec_si128(v,key)
As for the question "how do I apply this over a bytes array?": first, you get a char* of the bytes array. Then just iterate over it with range (being careful not to run off the end).
# assuming you already have an __m128i key
cdef __m128i v
# Declared up front: Cython does not allow cdef statements inside a loop,
# and the AES intrinsic returns the integer vector type __m128i.
cdef __m128i result
cdef char* array = python_bytes_array # auto conversion
cdef int i, j
# you NEED to ensure that the byte array has a length divisible by
# 16, otherwise you'll probably get a segmentation fault.
for i in range(0,len(python_bytes_array),16):
    # go over in chunks of 16
    v = _mm_set_epi8(array[i+15],array[i+14],array[i+13],
                     # etc... fill in the rest
                     array[i+1], array[i])
    result = _mm_aesdec_si128(v,key)
    # write back to the same place?
    # NOTE(review): _mm_extract_epi8 is documented as taking a compile-time
    # constant index; confirm the C compiler accepts the loop variable here.
    for j in range(16):
        array[i+j] = _mm_extract_epi8(result,j)
I'd like to call my C function from Python, in order to manipulate some NumPy arrays. The function is like this:
void c_func(int *in_array, int n, int *out_array);
where the results are supplied in out_array, whose size I know in advance (not my function, actually). I try to do in the corresponding .pyx file the following, in order to able to pass the input to the function from a NumPy array, and store the result in a NumPy array:
def pyfunc(np.ndarray[np.int32_t, ndim=1] in_array):
n = len(in_array)
out_array = np.zeros((512,), dtype = np.int32)
mymodule.c_func(<int *> in_array.data, n, <int *> out_array.data)
return out_array
But I get
"Python objects cannot be cast to pointers of primitive types" error for the output assignment. How do I accomplish this?
(If I require that the Python caller allocates the proper output array, then I can do
def pyfunc(np.ndarray[np.int32_t, ndim=1] in_array, np.ndarray[np.int32_t, ndim=1] out_array):
n = len(in_array)
mymodule.cfunc(<int *> in_array.data, n, <int*> out_array.data)
But can I do this in a way that the caller doesn't have to pre-allocate the appropriately sized output array?
You should add cdef np.ndarray before the out_array assignment:
def pyfunc(np.ndarray[np.int32_t, ndim=1] in_array):
    """Run c_func on in_array and return its 512-element int32 output."""
    # Declaring the output as np.ndarray gives Cython access to its .data
    # pointer for the cast below.
    cdef np.ndarray result = np.zeros((512,), dtype = np.int32)
    count = len(in_array)
    mymodule.c_func(<int *> in_array.data, count, <int *> result.data)
    return result
Here is an example how to manipulate NumPy arrays using code written in C/C++ through ctypes.
I wrote a small function in C, taking the square of numbers from a first array and writing the result to a second array. The number of elements is given by a third parameter. This code is compiled as shared object.
squares.c compiled to squares.so:
/* Store the square of each of the first n elements of pin in pout. */
void square(double* pin, double* pout, int n) {
    int idx = 0;
    while (idx < n) {
        const double value = pin[idx];
        pout[idx] = value * value;
        ++idx;
    }
}
In python, you just load the library using ctypes and call the function. The array pointers are obtained from the NumPy ctypes interface.
import numpy as np
import ctypes

n = 5
a = np.arange(n, dtype=np.double)  # input values 0..n-1
b = np.zeros(n, dtype=np.double)   # output buffer the C function fills

# The shared object is built from squares.c as squares.so
square = ctypes.cdll.LoadLibrary("./squares.so")

# Declare the C signature so ctypes checks and converts the arguments
# instead of guessing: void square(double*, double*, int)
square.square.argtypes = [
    ctypes.POINTER(ctypes.c_double),
    ctypes.POINTER(ctypes.c_double),
    ctypes.c_int,
]
square.square.restype = None

# NumPy's ctypes interface exposes each array's data as a typed pointer
aptr = a.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
bptr = b.ctypes.data_as(ctypes.POINTER(ctypes.c_double))
square.square(aptr, bptr, n)
print(b)
This will work for any c-library, you just have to know which argument types to pass, possibly rebuilding c-structs in python using ctypes.