I'm trying to convert a python module to cython, it does a lot of serialize and deserialize work.
Currently I have to do this:
import struct
from libc.stdint cimport (
int32_t,
int64_t,
)
cpdef bytes write_int(int32_t i):
return struct.pack("!i", i)
cpdef bytes write_long(int64_t i):
return struct.pack("!q", i)
cdef bytes write_double(double val):
return struct.pack("!d", val)
cdef bytes write_string(bytes val):
cdef int32_t length = len(val)
cdef str fmt
fmt = "!i%ds" % length
return struct.pack(fmt, length, val)
Is there an equal in c lib to struct.pack and struct.unpack? What's the best way to do things like this in cython?
I looked at the modules (this and this) and just translated the code to Cython and removed the PyObject parts. In theory this should work, but some parts (like the float parts) I have no way of rigorously testing:
Some imports:
from cpython.array cimport array, clone
from libc.string cimport memcmp, memcpy
from libc.math cimport frexp, ldexp
from libc.stdint cimport int32_t, int64_t
Save some code with a fused type. It's technically not a stable feature, but it works flawlessly for me:
ctypedef fused integer:
int32_t
int64_t
This part tests the machine's endianness. It works for me, but that's hardly a complete suite. OTOH, it looks about right
cdef enum float_format_type:
unknown_format,
ieee_big_endian_format,
ieee_little_endian_format
# Set-up
cdef array stringtemplate = array('B')
cdef float_format_type double_format
cdef double x = 9006104071832581.0
if sizeof(double) == 8:
if memcmp(&x, b"\x43\x3f\xff\x01\x02\x03\x04\x05", 8) == 0:
double_format = ieee_big_endian_format
elif memcmp(&x, b"\x05\x04\x03\x02\x01\xff\x3f\x43", 8) == 0:
double_format = ieee_little_endian_format
else:
double_format = unknown_format
else:
double_format = unknown_format;
(The stringtemplate is used to be able to make bytes objects quickly)
This part's simple:
cdef void _write_integer(integer x, char* output):
cdef int i
for i in range(sizeof(integer)-1, -1, -1):
output[i] = <char>x
x >>= 8
cpdef bytes write_int(int32_t i):
cdef array output = clone(stringtemplate, sizeof(int32_t), False)
_write_integer(i, output.data.as_chars)
return output.data.as_chars[:sizeof(int32_t)]
cpdef bytes write_long(int64_t i):
cdef array output = clone(stringtemplate, sizeof(int64_t), False)
_write_integer(i, output.data.as_chars)
return output.data.as_chars[:sizeof(int64_t)]
The array is similar to malloc but it's garbage collected :).
This part I mostly have no idea about. My "tests" passed, but it's mostly hope:
cdef void _write_double(double x, char* output):
cdef:
unsigned char sign
int e
double f
unsigned int fhi, flo, i
char *s
if double_format == unknown_format or True:
if x < 0:
sign = 1
x = -x
else:
sign = 0
f = frexp(x, &e)
# Normalize f to be in the range [1.0, 2.0)
if 0.5 <= f < 1.0:
f *= 2.0
e -= 1
elif f == 0.0:
e = 0
else:
raise SystemError("frexp() result out of range")
if e >= 1024:
raise OverflowError("float too large to pack with d format")
elif e < -1022:
# Gradual underflow
f = ldexp(f, 1022 + e)
e = 0;
elif not (e == 0 and f == 0.0):
e += 1023
f -= 1.0 # Get rid of leading 1
# fhi receives the high 28 bits; flo the low 24 bits (== 52 bits)
f *= 2.0 ** 28
fhi = <unsigned int>f # Truncate
assert fhi < 268435456
f -= <double>fhi
f *= 2.0 ** 24
flo = <unsigned int>(f + 0.5) # Round
assert(flo <= 16777216);
if flo >> 24:
# The carry propagated out of a string of 24 1 bits.
flo = 0
fhi += 1
if fhi >> 28:
# And it also progagated out of the next 28 bits.
fhi = 0
e += 1
if e >= 2047:
raise OverflowError("float too large to pack with d format")
output[0] = (sign << 7) | (e >> 4)
output[1] = <unsigned char> (((e & 0xF) << 4) | (fhi >> 24))
output[2] = 0xFF & (fhi >> 16)
output[3] = 0xFF & (fhi >> 8)
output[4] = 0xFF & fhi
output[5] = 0xFF & (flo >> 16)
output[6] = 0xFF & (flo >> 8)
output[7] = 0xFF & flo
else:
s = <char*>&x;
if double_format == ieee_little_endian_format:
for i in range(8):
output[i] = s[7-i]
else:
for i in range(8):
output[i] = s[i]
If you can understand how it works, be sure to check it yourself.
Then we wrap it as before:
cdef bytes write_double(double x):
cdef array output = clone(stringtemplate, sizeof(double), False)
_write_double(x, output.data.as_chars)
return output.data.as_chars[:sizeof(double)]
The string one is actually really simple, and explains why I set it up as I did above:
cdef bytes write_string(bytes val):
cdef:
int32_t int_length = sizeof(int32_t)
int32_t input_length = len(val)
array output = clone(stringtemplate, int_length + input_length, True)
_write_integer(input_length, output.data.as_chars)
memcpy(output.data.as_chars + int_length, <char*>val, input_length)
return output.data.as_chars[:int_length + input_length]
If you're only packing one type of data per command (eg. a group of ints, then a group of floats etc), you can use array.array() for faster results, either via Python or Cython.
Source:
Serialize a group of integers using Cython
Related
This question already has answers here:
C left shift on 64 bits fail
(2 answers)
Closed 2 years ago.
I implemented Lucas-Lehmer primality test to check Mersenne prime in python. Then I use Cython to speed up the calculation.
Original Python code:
def lucas_lehmer(p):
if p == 2:
return True
s = 4
M = (1 << p) - 1
for i in range(p-2):
s = ((s * s) - 2) % M
print("Processed: {}%".format(100*i//(p-2)))
if s == 0:
return True
else:
return False
Cython code:
cpdef lucas_lehmer(int p):
if p == 2:
return True
cdef unsigned long long int M
M = (1 << p) - 1
cdef unsigned long long int s
s = 4
cdef int i
for i in range(p-2):
s = ((s * s) - 2) % M
print("Processed: {}%".format(100*i//(p-2)))
if s == 0:
return True
else:
return False
Running the original Python code, it works correctly. But for Cython, it's only correct with p = 31 and lower, testing with p = 61 and bigger (all tested p values are values that 2^p-1 is prime), it returns False (not a prime number), except for p = 86243.
For some p like 97, even though 2^97-1 is not a prime number, the program actually return True (is a prime number), which is a contradiction.
Why does this happen? Without using cdef for variable M and s, the calculation will be correct, but the performance won't get any improved.
Running a few tests on your code I found that M was always equal to 1
so I defined p as a cdef and got the required result.
Not sure exactly what the issue is but it's something to do with that bit operation on p. p needs to be of the same type as M for it to make sense and if one is cdef and one is python int somehow it doesn't work?
cpdef lucas_lehmer(int py):
cdef p
p = py
if p == 2:
return True
cdef M
M = (1 << p) - 1
cdef s
s = 4
cdef int i
for i in range(p-2):
s = ((s * s) - 2) % M
print("Processed: {}%".format(100*i//(p-2)))
if s == 0:
return True
else:
return False
I'm using Cython version 0.27.3 to compile the following source for a simple primality testing module that contains both python and cython implementations of the same algorithm. When I set the threads parameter to different values, I see no performance increase, despite the GIL being released. Is there something that's preventing this from running in parallel?
The function in question is the cdef void _getprimes which accepts a memoryview slice as a parameter and should set all non-prime values to 0 in that slice.
primes.pyx
#cython: boundscheck=False, wraparound=False, nonecheck=False
cimport cython
from cpython cimport array
from cython.parallel cimport parallel, prange
from libc.math cimport sqrt, ceil
from libc.stdlib cimport malloc, free
from libc.stdio cimport printf
import math
# =====================
# Python implementation
# =====================
def pyisprime(n):
"""Python implementation"""
if n < 2 or n & 1 == 0:
if n == 2:
return True
return False
for i in range(2, int(math.sqrt(n)) + 1):
if n % i == 0:
return False
return True
def pygetprimes(nums):
return [num for num in nums if pyisprime(num)]
# =====================
# Cython implementation
# =====================
cdef int _isprime(unsigned long long n) nogil:
"""Cython implementation of a simple primality check"""
cdef unsigned long long upper
cdef unsigned long long i = 3
cdef int prime = 1
if n < 2 or n & 1 == 0:
if n == 2:
return 1
return 0
upper = <unsigned long long>ceil(sqrt(<double>n))
while i <= upper:
if n % i == 0:
prime = 0
break
i += 1
return prime
def isprime(unsigned long long n):
"""Wrapper for _isprime"""
cdef int result
with nogil:
result = _isprime(n)
return result
cdef void _getprimes(unsigned long long[:] nums, int threads) nogil:
cdef unsigned long num
cdef int i = 0
with parallel(num_threads=threads):
for i in prange(nums.shape[0], schedule="dynamic"):
if _isprime(nums[i]) == 0:
nums[i] = 0
def getprimes(nums, int threads = 1):
"""Wrapper for _getprimes"""
cdef unsigned long long num
cdef unsigned long long[:] primes = array.array("Q", nums)
with nogil:
_getprimes(primes, threads)
return [num for num in primes if num != 0]
setup.py
#!/usr/bin/env python3
from distutils.core import setup
from Cython.Build import cythonize
setup(
name="primes",
ext_modules=cythonize('primes.pyx'),
)
test.py
#!/usr/bin/env python3
import functools
import random
import time
import primes
def timed(func):
def wrapped(*args, **kwargs):
start = time.time()
val = func(*args, **kwargs)
end = time.time()
print(func.__name__, end - start)
return val
return functools.wraps(func)(wrapped)
def main():
nums = [random.randint(0, 0xffffff) for _ in range(500000)]
pyfoo = timed(primes.pygetprimes)
cyfoo = timed(primes.getprimes)
x = pyfoo(nums)
y = cyfoo(nums, 1)
z = cyfoo(nums, 4)
assert x == y == z
if __name__ == "__main__":
main()
When I run cyfoo, I expected that increasing the number of threads from 1 to 4 would show some type of speed increase, but this is not the case:
[aarcher#Arch]: ~/Programming/Cython/build/lib.linux-x86_64-3.6>$ ./test.py
pygetprimes 5.11554741859436
getprimes 1.1129701137542725
getprimes 1.1306445598602295
It seems you need to enable compiler flags for OpenMP for the parallel statements to actually do anything.
See cython docs here
http://cython.readthedocs.io/en/latest/src/userguide/parallelism.html#compiling
# setup.py
# ... omitted ...
ext_modules = [
Extension(
"hello",
["hello.pyx"],
extra_compile_args=['-fopenmp'],
extra_link_args=['-fopenmp'],
)
]
I have converted a python function to a cython one. Now the function works as it is supposed to. But I am getting a lot of memory leak when the main program calls this function multiple times. I have freed the memory that I allocated dynamically, but it does not seem to work.
What am I doing wrong here?
from cpython.mem cimport PyMem_Malloc, PyMem_Free
def longest_common_substring(refWord, stemWord):
cdef:
int longest, x_longest
int x, y, k
Py_ssize_t lengthRefWord
Py_ssize_t lengthStemWord
wchar_t *referenceWord = PyUnicode_AsWideCharString(refWord, &lengthRefWord)
wchar_t *stemmableWord = PyUnicode_AsWideCharString(stemWord, &lengthStemWord)
int t1 = lengthRefWord+1
int t2 = lengthStemWord+1
int **m = <int **> PyMem_Malloc(t1 * sizeof(int *))
wchar_t tempChar1;
wchar_t tempChar2;
longest = 0
x_longest = 0
for k in range(t1):
m[k] = <int *> PyMem_Malloc(t2 * sizeof(int))
for x in range(0, t1):
for y in range(0, t2):
m[x][y] = 0
for x in range(1, t1):
for y in range(1, t2):
tempChar1 = referenceWord[x - 1]
tempChar2 = stemmableWord[y - 1]
if tempChar1 == tempChar2:
m[x][y] = m[x - 1][y - 1] + 1
if m[x][y] > longest:
longest = m[x][y]
x_longest = x
else:
m[x][y] = 0
for k in range(t1):
PyMem_Free(m[k])
PyMem_Free(m)
return refWord[x_longest - longest: x_longest]
PyUnicode_AsWideCharString allocates memory that you have to free. The documentation says
Returns a buffer allocated by PyMem_Alloc() (use PyMem_Free() to free it) on success.
You get two strings from this function but free neither of them.
I am trying to convert this C function into Python;
typedef unsigned long var;
/* Bit rotate rightwards */
var ror(var v,unsigned int bits) {
return (v>>bits)|(v<<(8*sizeof(var)-bits));
}
I have tried Googling for some solutions, but I can't seem to get any of them to give the same results as the one here.
This is one solution I have found from another program;
def mask1(n):
"""Return a bitmask of length n (suitable for masking against an
int to coerce the size to a given length)
"""
if n >= 0:
return 2**n - 1
else:
return 0
def ror(n, rotations=1, width=8):
"""Return a given number of bitwise right rotations of an integer n,
for a given bit field width.
"""
rotations %= width
if rotations < 1:
return n
n &= mask1(width)
return (n >> rotations) | ((n << (8 * width - rotations)))
I am trying to btishift key = 0xf0f0f0f0f123456. The C code gives 000000000f0f0f12 when it is called with; ror(key, 8 << 1) and Python gives; 0x0f0f0f0f0f123456 (the original input!)
Your C output doesn't match the function that you provided. That is presumably because you are not printing it correctly. This program:
#include <stdio.h>
#include <stdint.h>
uint64_t ror(uint64_t v, unsigned int bits)
{
return (v>>bits) | (v<<(8*sizeof(uint64_t)-bits));
}
int main(void)
{
printf("%llx\n", ror(0x0123456789abcdef, 4));
printf("%llx\n", ror(0x0123456789abcdef, 8));
printf("%llx\n", ror(0x0123456789abcdef, 12));
printf("%llx\n", ror(0x0123456789abcdef, 16));
return 0;
}
produces the following output:
f0123456789abcde
ef0123456789abcd
def0123456789abc
cdef0123456789ab
To produce an ror function in Python I refer you to this excellent article: http://www.falatic.com/index.php/108/python-and-bitwise-rotation
This Python 2 code produces the same output as the C program above:
ror = lambda val, r_bits, max_bits: \
((val & (2**max_bits-1)) >> r_bits%max_bits) | \
(val << (max_bits-(r_bits%max_bits)) & (2**max_bits-1))
print "%x" % ror(0x0123456789abcdef, 4, 64)
print "%x" % ror(0x0123456789abcdef, 8, 64)
print "%x" % ror(0x0123456789abcdef, 12, 64)
print "%x" % ror(0x0123456789abcdef, 16, 64)
The shortest way I've found in Python:
(note this works only with integers as inputs)
def ror(n,rotations,width):
return (2**width-1)&(n>>rotations|n<<(width-rotations))
There are different problems in your question.
C part :
You use a value of key that is a 64 bits value (0x0f0f0f0f0f123456), but the output shows that for you compiler unsigned long is only 32 bits wide. So what C code does is rotating the 32 bits value 0x0f123456 16 times giving 0x34560f12
If you had used unsigned long long (assuming it is 64 bits on your architecture as it is on mine), you would have got 0x34560f0f0f0f0f12 (rotation 16 times of a 64 bits)
Python part :
The definition of width between mask1 and ror is not consistent. mask1 takes a width in bits, where ror takes a width in bytes and one byte = 8 bits.
The ror function should be :
def ror(n, rotations=1, width=8):
"""Return a given number of bitwise right rotations of an integer n,
for a given bit field width.
"""
rotations %= width * 8 # width bytes give 8*bytes bits
if rotations < 1:
return n
mask = mask1(8 * width) # store the mask
n &= mask
return (n >> rotations) | ((n << (8 * width - rotations)) & mask) # apply the mask to result
That way with key = 0x0f0f0f0f0f123456, you get :
>>> hex(ror(key, 16))
'0x34560f0f0f0f0f12L'
>>> hex(ror(key, 16, 4))
'0x34560f12L'
exactly the same as C output
i know its nearly 6 years old
I always find it easier to use string slices than bitwise operations.
def rotate_left(x, n):
return int(f"{x:032b}"[n:] + f"{x:032b}"[:n], 2)
def rotate_right(x, n):
return int(f"{x:032b}"[-n:] + f"{x:032b}"[:-n], 2)
def rotation_value(value, rotations, widht=32):
""" Return a given number of bitwise left or right rotations of an interger
value,
for a given bit field widht.
if rotations == -rotations:
left
else:
right
"""
if int(rotations) != abs(int(rotations)):
rotations = widht + int(rotations)
return (int(value)<<(widht-(rotations%widht)) | (int(value)>>(rotations%widht))) & ((1<<widht)-1)
%%cython -f -c=-O3 -c=-fopenmp --link-args=-fopenmp
from cython.parallel import parallel, prange
from libc.stdlib cimport abort, malloc, free
cdef int idx, i, n = 100
cdef int k
cdef int * local_buf
cdef int size = 10
cdef void func(int* lb) nogil:
cdef int j
for j in xrange(size):
lb[j] += -1*j
local_buf = <int *> malloc(sizeof(int) * size)
with nogil, parallel():
if local_buf == NULL:
abort()
# populate our local buffer in a sequential loop
for i in xrange(size):
local_buf[i] = i * 2
# share the work using the thread-local buffer(s)
for k in prange(n, schedule='guided'):
func(local_buf)
for i in xrange(size):
print local_buf[i]
free(local_buf)
0
-98
-196
-294
-392
-490
-588
-686
-784
-882
edit:
The above block shows the output after one run, but the contents in local_buf seems to change every or so re-run. What's going on?
The result there seems reasonable with the code given, do you actually get different result each run?
This should be regular python equivalent:
size = 10
n = 100
lst = [i*2 for i in range(size)]
for i in range(n):
for j in range(size):
lst[j] += -1*j
print lst
#[0, -98, -196, -294, -392, -490, -588, -686, -784, -882]