Fastest way to sort in Python (no cython) - python

I have a problem where I have to sort a very big array (shape 7900000x4x4) with a custom function. I used sorted, but it took more than 1 hour to sort. My code was something like this:
def compare(x, y):
    """Report whether item ``x`` sorts before item ``y``.

    ``x`` and ``y`` are indexable pairs; element 0 is only printed, element 1
    is the array that is compared.  The arrays are flattened and the first
    position where they differ decides the result.  At that position a
    non-negative value sorts before a negative one; two values of the same
    sign sort in ascending order.

    Returns 1 if ``x`` sorts before ``y``, 0 if it does not, and -1 if both
    arrays are equal.
    """
    print('DD '+str(x[0]))  # trace output, emitted once per comparison
    if np.array_equal(x[1], y[1]):
        # NOTE(review): -1 is truthy, so a caller that uses the result as a
        # boolean "less than" treats equal items as "x < y" -- confirm intent.
        return -1
    a = x[1].flatten()
    b = y[1].flatten()
    idx = np.where((a > b) != (a < b))[0][0]
    lhs, rhs = a[idx], b[idx]
    if (lhs < 0) != (rhs < 0):
        # Signs differ: the non-negative side comes first.
        return 1 if rhs < 0 else 0
    # Same sign (and the values differ): plain ascending order.
    return 1 if lhs < rhs else 0
def cmp_to_key(mycmp):
    """Build a sort-key wrapper class whose ``<`` is decided by ``mycmp``.

    The value of ``mycmp(a, b)`` is returned from ``__lt__`` unchanged, so
    ``mycmp`` must answer "does a sort before b?" with a truthy/falsy value.
    """
    class K:
        # One wrapper is created per sorted item, so avoid a per-instance dict.
        __slots__ = ("obj",)

        def __init__(self, obj, *args):
            self.obj = obj

        def __lt__(self, other):
            return mycmp(self.obj, other.obj)

    return K
# Sort the (key, value) pairs of tblocks; every pair is wrapped in the class
# returned by cmp_to_key, so sorted() orders them through compare().
tblocks = sorted(tblocks.items(),key=cmp_to_key(compare))
This worked but I want it to complete in seconds. I don't think any direct implementation in python can give me the performance I need, so I tried cython. My Cython code is this, which is pretty simple.
# Module-level state read by compare(): the 2-D int data whose rows are
# compared, and the loop bound compare() applies within a row.
cdef int[:,:] arrr
cdef int size
cdef bool compare(int a,int b):
    # "Less than" predicate over row indices: compares rows arrr[a] and
    # arrr[b] entry by entry and decides on the first entry that differs.
    global arrr,size
    cdef int[:] x = arrr[a]
    cdef int[:] y = arrr[b]
    cdef int k
    # The two counters of the original always advanced together, so a single
    # index is enough.
    for k in range(size):
        # NOTE(review): reaching index size-1 returns 0 before that entry is
        # looked at, so the last entry never takes part in the comparison --
        # confirm this is intended.
        if k == size - 1 or y[k] < x[k]:
            return 0
        elif x[k] < y[k]:
            return 1
    # Only reachable when the loop body never runs (size <= 0).
    return 1
def sorted(np.ndarray boxes,int total_blocks,int s):
    # Returns the indices 0..total_blocks-1 ordered by compare().
    #   boxes        -- data stored in the module-level ``arrr``
    #   total_blocks -- number of indices to sort
    #   s            -- stored in the module-level ``size`` used by compare()
    # NOTE(review): this name shadows the built-in sorted() inside the module.
    global arrr,size
    cdef vector[int] index = xrange(total_blocks)
    arrr = boxes
    size = s
    sort(index.begin(),index.end(),compare)
    return index
This code in cython took 33 seconds! Cython is the solution, but I am looking for some alternate solutions which can run directly on python. For example numba. I tried Numba, but I didn't get satisfying results. Kindly help!

It is hard to give an answer without a working example. I assume, that arrr in your Cython code was a 2D-array and I assume that size was size=arrr.shape[0]
Numba Implementation
import numpy as np
import numba as nb
from numba.targets import quicksort
def custom_sorting(compare_fkt):
    """Return the indices ``0 .. size-1`` ordered by ``compare_fkt``.

    ``compare_fkt(a, b)`` is handed two index values and must return True
    when ``a`` sorts before ``b``.  The number of indices is taken from the
    module-level ``size``.
    """
    # is_argsort=False: the index values themselves are sorted, and the
    # comparison function decides what each index stands for.
    sorter = quicksort.make_jit_quicksort(lt=compare_fkt, is_argsort=False)
    run_sort = nb.njit(sorter.run_quicksort)
    return run_sort(np.arange(size))
def compare(a, b):
    """Return True when row ``arrr[a]`` sorts before row ``arrr[b]``.

    Rows are compared entry by entry; the first differing entry decides.
    Reads the module-level ``arrr`` and ``size``.
    """
    x = arrr[a]
    y = arrr[b]
    # The original's two counters always moved in lockstep; one index suffices.
    for k in range(size):
        # NOTE(review): index size-1 returns False without comparing that
        # entry, and ``size`` (not the row length) bounds the loop -- confirm
        # both are intended.
        if k == size - 1 or y[k] < x[k]:
            return False
        if x[k] < y[k]:
            return True
    # Only reachable when the loop body never runs (size <= 0).
    return True
# Test data: 7,900,000 rows of 8 random integers in [-9, 9].
arrr=np.random.randint(-9,10,(7900000,8))
# NOTE(review): size is the number of rows; custom_sorting() uses it as the
# number of indices, while compare() uses it as the loop bound inside a single
# 8-entry row -- confirm this double use is intended.
size=arrr.shape[0]
index=custom_sorting(compare)
This gives 3.85s for the generated testdata. But the speed of a sorting algorithm heavily depends on the data....
Simple Example
import numpy as np
import numba as nb
from numba.targets import quicksort
# Simple reverse sort: an element comes first when it is the larger one.
def compare(a, b):
    return a > b
# Create some test data: 7,900,000 int32 values obtained by truncating
# uniform random floats scaled to [0, 10000).
arrr=np.array(np.random.rand(7900000)*10000,dtype=np.int32)
# Build a quicksort around our comparison function (argsort variant).
quicksort_func=quicksort.make_jit_quicksort(lt=compare,is_argsort=True)
# Compile the sorting function with Numba.
jit_sort_func=nb.njit(quicksort_func.run_quicksort)
# Run it on the test data and keep the result.
ind_sorted=jit_sort_func(arrr)
This implementation is about 35% slower than np.argsort, but this is also common in using np.argsort in compiled code.

If I understand your code correctly then the order you have in mind is the standard order, only that it starts at 0 wraps around at +/-infinity and maxes out at -0. On top of that we have simple left-to-right lexicographic order.
Now, if your array dtype is integer, observe the following: Because of complement representation of negatives view-casting to unsigned int makes your order the standard order. On top of that, if we use big endian encoding, efficient lexicographic ordering can be achieved by view-casting to void dtype.
The code below shows, using a 10000x4x4 example, that this method gives the same result as your Python code.
It also benchmarks it on a 7,900,000x4x4 example (using array, not dict). On my modest laptop this method takes 8 seconds.
import numpy as np
def compare(x, y):
    """Report whether item ``x`` sorts before item ``y``.

    ``x`` and ``y`` are indexable pairs; only the array at index 1 is
    compared.  The arrays are flattened and the first position where they
    differ decides the result.  At that position a non-negative value sorts
    before a negative one; two values of the same sign sort in ascending
    order.

    Returns 1 if ``x`` sorts before ``y``, 0 if it does not, and -1 if both
    arrays are equal.
    """
    if np.array_equal(x[1], y[1]):
        # NOTE(review): -1 is truthy, so a caller that uses the result as a
        # boolean "less than" treats equal items as "x < y" -- confirm intent.
        return -1
    a = x[1].flatten()
    b = y[1].flatten()
    idx = np.where((a > b) != (a < b))[0][0]
    lhs, rhs = a[idx], b[idx]
    if (lhs < 0) != (rhs < 0):
        # Signs differ: the non-negative side comes first.
        return 1 if rhs < 0 else 0
    # Same sign (and the values differ): plain ascending order.
    return 1 if lhs < rhs else 0
def cmp_to_key(mycmp):
    """Build a sort-key wrapper class whose ``<`` is decided by ``mycmp``.

    The value of ``mycmp(a, b)`` is returned from ``__lt__`` unchanged, so
    ``mycmp`` must answer "does a sort before b?" with a truthy/falsy value.
    """
    class K:
        # One wrapper is created per sorted item, so avoid a per-instance dict.
        __slots__ = ("obj",)

        def __init__(self, obj, *args):
            self.obj = obj

        def __lt__(self, other):
            return mycmp(self.obj, other.obj)

    return K
def custom_sort(a):
    """Return the argsort of the rows of 2-D integer array ``a``.

    Each row is re-encoded as big-endian integers and then viewed as one
    opaque byte string, so sorting those byte strings orders the rows
    lexicographically by the unsigned bit pattern of their entries
    (non-negative values first, then negative values, each ascending).

    Raises:
        TypeError: if ``a`` does not have an integer dtype.
    """
    if a.dtype.kind not in "iu":
        raise TypeError(f"custom_sort needs an integer array, got {a.dtype}")
    # The byte view below needs C-contiguous rows; this copies only if needed.
    big_endian = np.ascontiguousarray(a, dtype=a.dtype.newbyteorder(">"))
    row_bytes = big_endian.dtype.itemsize * big_endian.shape[1]
    return big_endian.view(f"V{row_bytes}").ravel().argsort()
# Small random data set: 10,000 blocks of shape 4x4 with values in [-9, 9],
# stored as a dict {position: block}.
tblocks = np.random.randint(-9,10, (10000, 4, 4))
tblocks = dict(enumerate(tblocks))
# Reference ordering: built-in sorted() driven by compare().
tblocks_s = sorted(tblocks.items(),key=cmp_to_key(compare))
# Same blocks as one 2-D array (one flattened block per row), ordered with
# custom_sort().
tblocksa = np.array(list(tblocks.values()))
tblocksa = tblocksa.reshape(tblocksa.shape[0], -1)
order = custom_sort(tblocksa)
tblocks_s2 = list(tblocks.items())
tblocks_s2 = [tblocks_s2[o] for o in order]
# Print whether both approaches produced the same sequence of items.
print(tblocks_s == tblocks_s2)
from timeit import timeit
# Benchmark on 7,900,000 blocks; prints the mean time of 5 runs in seconds.
data = np.random.randint(-9_999, 10_000, (7_900_000, 4, 4))
print(timeit(lambda: data[custom_sort(data.reshape(data.shape[0], -1))],
number=5) / 5)
Sample output:
True
7.8328493310138585

Related

How to properly specify a signature for numba JIT functions?

I created a @jit-compiled function, which should be faster than a normal loop.
However, it isn't, since just-in-time compilation takes minutes. What am I doing wrong?
What is the most efficient way to specify signatures for this function?
''' reproducable sample data '''
import numpy as np, time, numba as nb
# LEN arrays per list; array lengths are drawn from
# [0.7 * Amount_Of_Elements, Amount_Of_Elements).
LEN, Amount_Of_Elements = 10000,4000
# One random length per position, reused for all four lists so that arrays at
# the same position have the same size.
temp = np.random.randint(Amount_Of_Elements*0.7,high=Amount_Of_Elements, size=LEN)
RatiosUp = [np.random.uniform(size=rand) for rand in temp]
RatiosDown = [np.random.uniform(size=rand) for rand in temp]
UpPointsSlices = [np.random.uniform(size=rand) for rand in temp]
DownPointsSlices = [np.random.uniform(size=rand) for rand in temp]
''' function '''
# Decorator restored: its "@" had been turned into "#", leaving a comment.
# NOTE(review): the name shadows the built-in filter().
@nb.jit
def filter(a, b):
    # True when a is strictly greater than b.
    return a > b
@nb.jit
def func(RatiosUp, RatiosDown, UpPointsSlices, DownPointsSlices, result):
    # Fills result[i][j] with 0 or 1 for every element of every array:
    #   1 if RatiosUp > RatiosDown,
    #   0 if RatiosDown > RatiosUp,
    #   otherwise (equal ratios) 0 if UpPointsSlices > DownPointsSlices, else 1.
    # The original called an undefined ``filter3``; the helper defined in this
    # snippet is ``filter``.
    for i in range(len(RatiosUp)):
        up = RatiosUp[i]
        down = RatiosDown[i]
        up_pts = UpPointsSlices[i]
        down_pts = DownPointsSlices[i]
        out = result[i]
        for j in range(up.size):
            if filter(up[j], down[j]):
                out[j] = 1
            elif filter(down[j], up[j]):
                out[j] = 0
            elif filter(up_pts[j], down_pts[j]):
                out[j] = 0
            else:
                out[j] = 1

How can I create a numba callable that has adjustable parameters?

I would like to create a numba-compiled python callable (a function that I can use in another Numba-compiled function) that has an internal array that I can adjust to influence the result of the function call. In pure python, this would correspond to a class with a __call__ method:
class Test:
    """Callable that sums the entries of ``arr`` selected by a list of indices."""

    def __init__(self, arr):
        self.arr = arr

    def __call__(self, idx):
        # ``self.arr`` is read on every call, so replacing it changes the
        # result of later calls.
        return sum(self.arr[i] for i in idx)
# Demo: call the object, swap its internal list, call it again.
t = Test([0, 1, 2])
print(t([1, 2]))
t.arr = [1, 2, 3]
print(t([1, 2]))
which prints 3 and 5, respectively, so the result was different after I modified the internal array arr.
A literal translation to Numba using jitclass and numpy arrays looks like this
import numpy as np
import numba as nb
# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.jitclass([('arr', nb.double[:])])
class Test:
    def __init__(self, arr):
        # Stored as double to match the declared field type.
        self.arr = arr.astype(np.double)

    def __call__(self, idx):
        # Sum of the entries of self.arr selected by idx.
        res = 0
        for i in idx:
            res += self.arr[i]
        return res
# Demo: call the object, replace its internal array, call it again.
t = Test(np.arange(3))
print(t(np.array([1, 2])))
t.arr = np.arange(3) + 1
print(t(np.array([1, 2])))
Unfortunately, this fails with TypeError: 'Test' object is not callable, since Numba does not seem to support __call__, yet.
I then tried to solve the problem using closures
import numpy as np
import numba as nb
# Module-level array read by call().
arr = np.arange(5)


# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.jit
def call(idx):
    # Sum of the entries of the module-level ``arr`` selected by idx.
    res = 0
    for i in idx:
        res += arr[i]
    return res
# Demo: call once, modify the module-level array in place, call again.
print(call(np.array([1, 2])))
arr += 1
print(call(np.array([1, 2])))
but this prints 3 twice, since closures copy the data in arr into an internal representation, which I then cannot (easily?) change from the outside.
I even tried to trick Numba by using ctypes pointers on NumPy arrays in combination with numba.carray, but Numba still seems to copy the data, so I cannot manipulate it.
I understand that Numba wants to control the memory and avoid access to memory regions that might not be used anymore. However, I have a specific use case where I would like to avoid passing around the extra array arr and rather adjust the internal copy somehow. Is there any way to achieve this?
EDIT:
I tried the suggestion by Daniel in the comments to use a method different than __call__, but this also does not work. Here is what I thought might work:
# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.jitclass([('arr', nb.double[:])])
class Test:
    def __init__(self, arr):
        self.arr = arr

    def call(self, idx):
        # Element of the stored array at position idx.
        return self.arr[idx]
# Demo: read one element, increment the stored array in place, read again.
a = Test(np.arange(5).astype(np.double))
print(a.call(3))
a.arr += 1
print(a.call(3))
# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.njit
def rhs(idx):
    # Uses the module-level instance ``a`` from inside compiled code.
    return a.call(idx)


rhs(3)
This prints 3 and 4, so the array arr can indeed be manipulated. However, using the instance a in a compiled method fails with a NotImplementedError, so I suspect this use case is not (yet) supported by Numba.
Divide the problem in two parts, a numba function and a pure python class:
import numpy as np
import numba
# Decorator restored: its "@" had been turned into "#", leaving a comment.
@numba.jit
def calc(arr, idx):
    # Sum of the entries of arr selected by idx; the array is passed in
    # explicitly instead of being read from a global.
    res = 0
    for i in idx:
        res += arr[i]
    return res
class Test:
    """Plain-Python callable that keeps the array and delegates to ``calc``."""

    def __init__(self, arr):
        self.arr = arr.astype(np.double)

    def __call__(self, idx):
        # The current self.arr is handed over on every call, so replacing the
        # attribute changes the result of later calls.
        return calc(self.arr, idx)
# Demo: call the object, replace its internal array, call it again.
t = Test(np.arange(3))
print(t(np.array([1, 2])))
t.arr = np.arange(3) + 1
print(t(np.array([1, 2])))
I believe you need @property before the methods of the class, but this may not be the only issue.
# Decorators restored: their "@" had been turned into "#", leaving comments.
@nb.jitclass([('arr', nb.double[:])])
class Test:
    def __init__(self, arr):
        self.arr = arr

    # NOTE(review): a property getter is invoked with ``self`` only, so the
    # ``idx`` parameter cannot be supplied through attribute access -- confirm
    # that this suggestion works as written.
    @property
    def call(self, idx):
        return self.arr[idx]
# Demo: read one element, increment the stored array in place, read again.
a = Test(np.arange(5).astype(np.double))
print(a.call(3))
a.arr += 1
print(a.call(3))
# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.njit
def rhs(idx):
    # Uses the module-level instance ``a`` from inside compiled code.
    return a.call(idx)


rhs(3)
This effect is the result of nopython compilation. If your goal is to create such a callable at any cost, even without benefiting from JIT compilation, object mode is a simple solution to your problem. In your closure example this is achieved simply by passing the forceobj=True parameter to the @nb.jit decorator.
This code prints 3 and 5 respectively:
import numpy as np
import numba as nb
# Module-level array read by call().
arr = np.arange(5)


# Decorator restored: its "@" had been turned into "#", leaving a comment.
@nb.jit(forceobj=True)
def call(idx):
    # Sum of the entries of the module-level ``arr`` selected by idx.
    res = 0
    for i in idx:
        res += arr[i]
    return res
# Demo: call once, modify the module-level array in place, call again.
print(call(np.array([1, 2])))
arr += 1
print(call(np.array([1, 2])))

Indexing multidimensional numpy array inside numba's jitclass

I'm trying to insert a small multidimensional array into a larger one inside a numba jitclass. The small array is written to specific positions of the larger array, defined by an index list.
The following MWE shows the problem without numba - everything works as expected
import numpy as np
class NumbaClass(object):
    """Holds an ``n`` x ``m`` zero matrix ``A`` and writes sub-blocks into it."""

    def __init__(self, n, m):
        self.A = np.zeros((n, m))

    # solution 1 using pure python
    def nonNumbaFunction1(self, idx, values):
        # Broadcasting idx as a column against idx as a row addresses every
        # (idx[i], idx[j]) position of A.
        self.A[idx[:, None], idx] = values

    # solution 2 using pure python
    def nonNumbaFunction2(self, idx, values):
        # np.ix_ builds the same open mesh of row and column indices.
        self.A[np.ix_(idx, idx)] = values
if __name__ == "__main__":
    # Demo: write a 3x3 block into a 6x8 zero matrix with both methods and
    # print the matrix before and after each step.
    n = 6
    m = 8
    obj = NumbaClass(n, m)
    print(f'A =\n{obj.A}')
    idx = np.array([0, 2, 5])
    values = np.arange(len(idx)**2).reshape(len(idx), len(idx))
    print(f'values =\n{values}')
    obj.nonNumbaFunction1(idx, values)
    print(f'A =\n{obj.A}')
    obj.nonNumbaFunction2(idx, values)
    print(f'A =\n{obj.A}')
Both functions nonNumbaFunction1 and nonNumbaFunction2 do not work inside a numba class. So my current solution looks like this which is not really nice in my opinion
import numpy as np
from numba import jitclass
from numba import int64, float64
from collections import OrderedDict
# Field declaration for the jitclass: A is a 2-D float64 array.
specs = OrderedDict()
specs['A'] = float64[:, :]


# Decorator restored: its "@" had been turned into "#", leaving a comment.
@jitclass(specs)
class NumbaClass(object):
    def __init__(self, n, m):
        self.A = np.zeros((n, m))

    # solution for numba jitclass
    def numbaFunction(self, idx, values):
        # Writes values[i, j] to A[idx[i], idx[j]] with explicit loops.  The
        # inner bound is the column count of values, so a non-square values
        # array is also traversed completely.
        for i in range(values.shape[0]):
            row = idx[i]
            for j in range(values.shape[1]):
                self.A[row, idx[j]] = values[i, j]
if __name__ == "__main__":
    # Demo: write a 3x3 block into a 6x8 zero matrix and print the matrix
    # before and after.
    n = 6
    m = 8
    obj = NumbaClass(n, m)
    print(f'A =\n{obj.A}')
    idx = np.array([0, 2, 5])
    values = np.arange(len(idx)**2).reshape(len(idx), len(idx))
    print(f'values =\n{values}')
    obj.numbaFunction(idx, values)
    print(f'A =\n{obj.A}')
So my questions are:
Does anyone know a solution to this indexing in numba or is there another vectorized solution?
Is there a faster solution for nonNumbaFunction1?
It might be useful to know that the inserted array is small (4x4 to 10x10), but this indexing appears in nested loops, so it has to be quite fast as well! Later I need similar indexing for three-dimensional objects too.
Because of limitations on numba's indexing support, I don't think you can do any better than writing out the for loops yourself. To make it generic across dimensions, you could use the generated_jit decorator to specialize. Something like this:
def set_2d(target, values, idx):
    """Write ``values[i, j]`` to ``target[idx[i], idx[j]]`` for all ``i``, ``j``.

    ``target`` is modified in place; nothing is returned.
    """
    n_rows, n_cols = values.shape[0], values.shape[1]
    for i in range(n_rows):
        for j in range(n_cols):
            target[idx[i], idx[j]] = values[i, j]
def set_3d(target, values, idx):
    """Write ``values[i, j, k]`` to ``target[idx[i], idx[j], idx[k]]``.

    ``target`` is modified in place; nothing is returned.
    """
    for i in range(values.shape[0]):
        for j in range(values.shape[1]):
            for k in range(values.shape[2]):
                # The original read values[i, j, l]; ``l`` is not defined,
                # the innermost loop variable is ``k``.
                target[idx[i], idx[j], idx[k]] = values[i, j, k]
# Decorator restored: its "@" had been turned into "#", leaving a comment.
# NOTE(review): generated_jit is deprecated in recent Numba releases -- check
# the installed version.
@numba.generated_jit
def set_nd(target, values, idx):
    # Picks the implementation by the dimensionality of ``target``; nothing is
    # returned for any other number of dimensions.
    if target.ndim == 2:
        return set_2d
    elif target.ndim == 3:
        return set_3d
Then, this could be used in your jitclass
# Field declaration for the jitclass: A is a 2-D float64 array.
specs = OrderedDict()
specs['A'] = float64[:, :]


# Decorator restored: its "@" had been turned into "#", leaving a comment.
@jitclass(specs)
class NumbaClass(object):
    def __init__(self, n, m):
        self.A = np.zeros((n, m))

    def numbaFunction(self, idx, values):
        # Delegates to set_nd, which writes values into A at the idx positions.
        set_nd(self.A, values, idx)

Why is my solution on LeetCode not working (recursion, python)?

class Solution:
    def climbStairs(self, n):
        """
        :type n: int
        :rtype: int
        """
        # Base cases: 1 way to climb one step, 2 ways to climb two steps.
        if n == 1:
            return 1
        if n == 2:
            return 2
        # Every call spawns two further calls, so sub-results are recomputed
        # many times; n < 1 never reaches a base case.
        return self.climbStairs(n - 1) + self.climbStairs(n - 2)
My solution however is not working on Leetcode. Do you know why this is? Thank you so much!
I am trying to solve the Fibonacci sequence.
You've chosen a poor algorithm. It's not that it's recursive, just horribly inefficient. Here's a more efficient recursive approach:
def climbStairs(self, n, res=1, nxt=1):
    """Return the number of ways to climb ``n`` steps.

    ``res`` and ``nxt`` carry two consecutive results through the recursion,
    so each call makes exactly one further call (``n`` calls in total).
    Callers pass only ``n``.
    """
    if n == 0:
        return res
    return self.climbStairs(n - 1, nxt, res + nxt)
This runs much faster than your code and can go a lot higher before getting a stack overflow. But it will eventually get a stack overflow. We might do better adding memoization but ultimately, a simple iterative solution would be easier:
def climbStairs(self, n):  # doesn't use 'self', could be class method
    """Return the number of ways to climb ``n`` steps, computed iteratively."""
    if 0 <= n <= 1:
        return 1
    prev = cur = 1
    # After the k-th pass ``cur`` holds the answer for k + 1 steps.
    for _ in range(n - 1):
        prev, cur = cur, prev + cur
    return cur
Your solution looks good but it's inefficient. You don't have to address if n==2 part. Keep it simple, like this:
class Solution:
    def climbStairs(self, n: int) -> int:
        """Return the number of ways to climb ``n`` steps (plain recursion)."""
        # Both 0 and 1 steps can be climbed in exactly one way.
        if n in (0, 1):
            return 1
        # Each call spawns two further calls, so sub-results are recomputed.
        return self.climbStairs(n - 1) + self.climbStairs(n - 2)

Computing stats on generators in single pass. Python

When working with generators you can only pull out items in a single pass. An alternative is to load the generator into a list and do multiple passes, but this involves a hit on performance and memory allocation.
Can anyone think of a better way of computing the following metrics from a generator in a single pass. Ideally the code computes the count, sum, average, sd, max, min and any other stats you can think of.
UPDATE
Initial horrid code in this gist.
See the gist here: https://gist.github.com/3038746
Using the great suggestions from @larsmans, here is the final solution I went with. Using the named tuple really helped.
import random
from math import sqrt
from collections import namedtuple
def stat(gen):
    """Returns the namedtuple Stat as below.

    Consumes ``gen`` in a single pass and reports ``total`` (number of items),
    ``sum``, ``avg``, ``sd`` (population standard deviation), ``max`` and
    ``min``.  An empty input raises ``StopIteration`` from the first
    ``next()`` call.
    """
    Stat = namedtuple('Stat', 'total, sum, avg, sd, max, min')
    it = iter(gen)
    first = next(it)
    count = 1
    total = largest = smallest = first
    # The spread is tracked with Welford's running update.  The textbook
    # "mean of squares minus square of mean" subtracts two nearly equal large
    # numbers, which loses precision and can even turn negative.
    mean = float(first)
    m2 = 0.0
    for x in it:
        count += 1
        total += x
        largest = max(largest, x)
        smallest = min(smallest, x)
        delta = float(x) - mean
        mean += delta / count
        m2 += delta * (float(x) - mean)
    variance = max(m2, 0.0) / count
    return Stat(count, total, total / count, sqrt(variance), largest, smallest)
def random_int_list(size=100, start=0, end=1000):
    """Return a generator yielding ``size`` random integers from ``[start, end)``."""
    # range() instead of xrange(): xrange does not exist on Python 3.
    return (random.randrange(start, end, 1) for _ in range(size))
if __name__ == '__main__':
    r = stat(random_int_list())
    # print() call form works on Python 3 as well as Python 2.
    print(r)  # e.g. Stat(total=100, sum=56295, avg=562, sd=294.82537204250247, max=994, min=10)
def statistics(it):
    """Returns number of elements, sum, max, min

    The input is consumed in a single pass.  An empty input raises
    ``StopIteration`` from the first ``next()`` call.
    """
    values = iter(it)
    first = next(values)
    count = 1
    total = highest = lowest = first
    for x in values:
        count += 1
        total += x
        highest = max(highest, x)
        lowest = min(lowest, x)
    return count, total, highest, lowest
Add other statistics as you please. Consider using a namedtuple when the number of statistics to compute grows large.
If you want to get really fancy, you can build an OO hierarchy of statistics collectors (untested):
class Summer(object):
    """Collector that keeps a running sum, starting from ``x0``."""

    def __init__(self, x0=0):
        self.value = x0

    def add(self, x):
        self.value += x


class SquareSummer(Summer):
    """Collector that keeps a running sum of squares."""

    def __init__(self, x0=0):
        # The seed value is an observation too and must be squared like the
        # values passed to add(); the inherited __init__ stored it unsquared.
        super(SquareSummer, self).__init__(x0 ** 2)

    def add(self, x):
        super(SquareSummer, self).add(x ** 2)
class Maxer(object):
    """Collector that keeps the largest value seen, starting from ``x0``."""

    def __init__(self, x0):
        self.value = x0

    def add(self, x):
        self.value = max(self.value, x)
# example usage: collect([Maxer, Summer], iterable)
def collect(collectors, it):
    """Run several collectors over ``it`` in one pass and return their values.

    Each entry of ``collectors`` is called with the first item to create a
    collector; every later item is passed to each collector's ``add``.  The
    result lists each collector's ``value`` in the given order.  An empty
    ``it`` raises ``StopIteration`` from the first ``next()`` call.
    """
    items = iter(it)
    first = next(items)
    active = [make(first) for make in collectors]
    for x in items:
        for c in active:
            c.add(x)
    return [c.value for c in active]

Categories

Resources