How to properly specify a signature for numba JIT functions?

I created a @jit-compiled function, which should be faster than a normal loop. However it isn't, since the just-in-time compilation takes minutes. What am I doing wrong?
What is the most efficient way to specify signatures for this function?
''' reproducible sample data '''
import numpy as np, time, numba as nb
LEN, Amount_Of_Elements = 10000, 4000
temp = np.random.randint(int(Amount_Of_Elements * 0.7), high=Amount_Of_Elements, size=LEN)
RatiosUp = [np.random.uniform(size=rand) for rand in temp]
RatiosDown = [np.random.uniform(size=rand) for rand in temp]
UpPointsSlices = [np.random.uniform(size=rand) for rand in temp]
DownPointsSlices = [np.random.uniform(size=rand) for rand in temp]
''' function '''
@nb.jit
def filter3(a, b):
    return a > b

@nb.jit
def func(RatiosUp, RatiosDown, UpPointsSlices, DownPointsSlices, result):
    for i in range(len(RatiosUp)):
        for j in range(RatiosUp[i].size):
            if filter3(RatiosUp[i][j], RatiosDown[i][j]):
                result[i][j] = 1
            elif filter3(RatiosDown[i][j], RatiosUp[i][j]):
                result[i][j] = 0
            elif filter3(UpPointsSlices[i][j], DownPointsSlices[i][j]):
                result[i][j] = 0
            else:
                result[i][j] = 1
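One way to avoid minutes of compilation is to stop passing reflected Python lists of arrays and give Numba concretely typed arguments instead. Below is a minimal sketch, assuming each slice is a contiguous 1D float64 array and result is a 2D float64 array; the signature syntax is standard Numba, but I have not timed this against the original, so treat it as a starting point rather than the definitive answer:

import numpy as np
import numba as nb
from numba.typed import List

@nb.njit(nb.boolean(nb.float64, nb.float64))
def filter3(a, b):
    return a > b

# Explicit signature: four typed lists of contiguous 1D float64 arrays,
# plus a contiguous 2D float64 result array; no return value.
list_ty = nb.types.ListType(nb.types.float64[::1])
sig = nb.void(list_ty, list_ty, list_ty, list_ty, nb.float64[:, ::1])

@nb.njit(sig)
def func(RatiosUp, RatiosDown, UpPointsSlices, DownPointsSlices, result):
    for i in range(len(RatiosUp)):
        for j in range(RatiosUp[i].size):
            if filter3(RatiosUp[i][j], RatiosDown[i][j]):
                result[i, j] = 1
            elif filter3(RatiosDown[i][j], RatiosUp[i][j]):
                result[i, j] = 0
            elif filter3(UpPointsSlices[i][j], DownPointsSlices[i][j]):
                result[i, j] = 0
            else:
                result[i, j] = 1

def to_typed(arrays):
    # Reflected Python lists do not match ListType; copy into a typed List.
    tl = List()
    for a in arrays:
        tl.append(a)
    return tl

result = np.empty((LEN, Amount_Of_Elements))  # each row is only filled up to temp[i]
func(to_typed(RatiosUp), to_typed(RatiosDown),
     to_typed(UpPointsSlices), to_typed(DownPointsSlices), result)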

Related

Why is this async matrix multiplication code slower than synchronous?

I'm trying to implement a matrix multiplication function that is faster than my synchronous multiplication function by using concurrent.futures.ThreadPoolExecutor() threads. Here is the code:
import concurrent.futures
import numpy as np

def asyncmult(m1, m2):
    threads = []
    prod = np.zeros((m1.shape[0], m2.shape[1]))
    def multvecs(inp):
        # Dot product of one row of m1 with one column of m2.
        vec1, vec2, index = inp
        i, j = index
        total = 0
        for k in range(vec1.shape[0]):
            total += vec1[k] * vec2[k]
        prod[i][j] = total
    with concurrent.futures.ThreadPoolExecutor() as executor:
        for i in range(m1.shape[0]):
            for j in range(m2.shape[1]):
                f = executor.submit(multvecs, (m1[i], m2[:, j], (i, j)))
                threads.append(f)
    return prod
Plotting time taken against increasing input matrix size, this function performs worse than the synchronous one.
Why is this happening? How can I fix this?
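A likely explanation, assuming CPython: the inner loop in multvecs is pure Python, so every task holds the GIL while it runs and the threads never actually overlap; on top of that you pay executor-scheduling overhead per output element. A minimal sketch of a coarser-grained variant for comparison, where each task is a single NumPy call (NumPy releases the GIL inside its compiled loops):

import concurrent.futures
import numpy as np

def threaded_rows(m1, m2):
    prod = np.zeros((m1.shape[0], m2.shape[1]))
    def one_row(i):
        # One BLAS-backed call per output row instead of a Python loop
        # per output element.
        prod[i] = m1[i] @ m2
    with concurrent.futures.ThreadPoolExecutor() as executor:
        list(executor.map(one_row, range(m1.shape[0])))
    return prod

In practice a plain m1 @ m2 will still beat both versions, since BLAS already parallelises internally.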

How to parallelise using JIT (Numba) parallel?

I wanted to share how I solved the parallelisation problem I had with jit. Initially I had the code below and got this error:
Code:
import numpy as np
from numba import jit

@jit(nopython=True, parallel=True)
def mandelbrot_2(cArray, iterations):
    count = 0
    for c in cArray:
        z = 0
        for n in range(iterations):
            z = np.square(z) + c
            if np.abs(z) > 2:
                count += 1
                break
    return cArray.shape[0] - count
Error:
/Users/alexander/opt/anaconda3/lib/python3.8/site-packages/numba/core/typed_passes.py:326: NumbaPerformanceWarning:
The keyword argument 'parallel=True' was specified but no transformation for parallel execution was possible.
To find out why, try turning on parallel diagnostics, see https://numba.pydata.org/numba-doc/latest/user/parallel.html#diagnostics for help.

File "<ipython-input-56-9fcfc0a9fe03>", line 17:
@jit(nopython=True, parallel=True)
def mandelbrot_2(cArray, iterations):
^

warnings.warn(errors.NumbaPerformanceWarning(msg,
Import prange:
from numba import jit, prange
New Code:
Converted the first loop to index into the array instead of iterating over its elements, and converted the range in both loops to prange, which I imported from Numba.
@jit(nopython=True, parallel=True)
def mandelbrot_2(cArray, iterations):
    count = 0
    for c in prange(cArray.shape[0]):
        z = 0
        for n in prange(iterations):
            z = np.square(z) + cArray[c]
            if np.abs(z) > 2:
                count += 1
                break
    return cArray.shape[0] - count
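Note that each inner iteration depends on the previous z, so the inner prange cannot actually run in parallel; only the outer loop benefits. A hypothetical driver for testing (the grid and sizes below are my own, not from the original post):

import numpy as np

# Flat array of complex sample points covering the usual Mandelbrot window.
xs = np.linspace(-2.0, 1.0, 1000)
ys = np.linspace(-1.5, 1.5, 1000)
cArray = (xs[None, :] + 1j * ys[:, None]).ravel()

print(mandelbrot_2(cArray, 100))  # count of points that never escaped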

Is there a pythonic way to sample N consecutive elements from a list or numpy array

Is there a pythonic way to select N consecutive elements from a list or numpy array?
Suppose:
Choice = [1,2,3,4,5,6]
I would like to create a new list of length N by randomly selecting an index X into Choice along with the N-1 consecutive elements following it, wrapping around if necessary.
So if:
X = 4
N = 4
The resulting list would be:
Selection = [5,6,1,2]
I think something similar to the following would work.
S = []
for i in range(X, X+N):
    S.append(Choice[i % 6])
But I was wondering if there is a Python or numpy function that can select the elements at once, more efficiently.
Use itertools, specifically islice and cycle:
import random
from itertools import cycle, islice

start = random.randint(0, len(Choice) - 1)
list(islice(cycle(Choice), start, start + N))
cycle(Choice) is an infinite sequence that repeats your original list, so the slice start:start + N will wrap around if necessary.
You could use a list comprehension, using modulo operations on the index to keep it in range of the list:
Choice = [1,2,3,4,5,6]
X = 4
N = 4
L = len(Choice)
Selection = [Choice[i % L] for i in range(X, X+N)]
print(Selection)
Output
[5, 6, 1, 2]
Note that if N is less than or equal to len(Choice), you can greatly simplify the code:
Choice = [1,2,3,4,5,6]
X = 4
N = 4
L = len(Choice)
Selection = Choice[X:X+N] if X+N <= L else Choice[X:] + Choice[:X+N-L]
print(Selection)
Since you are asking for the most efficient way, I created a little benchmark to test the solutions proposed in this thread.
I rewrote your current solution as:
def op(choice, x):
    n = len(choice)
    selection = []
    for i in range(x, x + n):
        selection.append(choice[i % n])
    return selection
Where choice is the input list and x is the random index.
These are the results if choice contains 1_000_000 random numbers:
chepner: 0.10840400000000017 s
nick: 0.2066781999999998 s
op: 0.25887470000000024 s
fountainhead: 0.3679908000000003 s
Full code
import random
from itertools import cycle, islice
from time import perf_counter as pc

import numpy as np

def op(choice, x):
    n = len(choice)
    selection = []
    for i in range(x, x + n):
        selection.append(choice[i % n])
    return selection

def nick(choice, x):
    n = len(choice)
    return [choice[i % n] for i in range(x, x + n)]

def fountainhead(choice, x):
    n = len(choice)
    return np.take(choice, range(x, x + n), mode='wrap')

def chepner(choice, x):
    n = len(choice)
    return list(islice(cycle(choice), x, x + n))

results = []
n = 1_000_000
choice = random.sample(range(n), n)
x = random.randint(0, n - 1)

# Correctness
assert op(choice, x) == nick(choice, x) == chepner(choice, x) == list(fountainhead(choice, x))

# Benchmark
for f in op, nick, chepner, fountainhead:
    t0 = pc()
    f(choice, x)
    t1 = pc()
    results.append((t1 - t0, f))

for t, f in sorted(results):
    print(f'{f.__name__}: {t} s')
If using a numpy array as the source, we could of course use numpy "fancy indexing".
So, if ChoiceArray is the numpy array equivalent of the list Choice, and if L is len(Choice) or len(ChoiceArray):
Selection = ChoiceArray[np.arange(X, N+X) % L]
Here's a numpy approach:
import numpy as np
Selection = np.take(Choice, range(X,N+X), mode='wrap')
Works even if Choice is a Python list rather than a numpy array.
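Putting the two numpy variants side by side on the question's example (a quick sketch using the names from the question):

import numpy as np

Choice = [1, 2, 3, 4, 5, 6]
X, N = 4, 4
L = len(Choice)

ChoiceArray = np.asarray(Choice)
print(ChoiceArray[np.arange(X, X + N) % L])           # fancy indexing: [5 6 1 2]
print(np.take(Choice, range(X, X + N), mode='wrap'))  # same result, works on the list too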

Nested Numba function performance

Currently I am trying to improve the performance of my Python code, and to do so I use Numba successfully. In order to improve the structure of my code I create functions. Now I have noticed, to my surprise, that if I split the code into different Numba functions, it is significantly slower than if I put the whole code in one function with a Numba decorator.
An example would be:
import numba as nb

@nb.njit
def fct_4(a, b):
    x = a ^ b
    setBits = 0
    while x > 0:
        setBits += x & 1
        x >>= 1
    return setBits

@nb.njit
def fct_3(c, set_1, set_2):
    h = 2
    if c not in set_1 and c not in set_2:
        if fct_4(0, c) <= h:
            set_1.add(c)
        else:
            set_2.add(c)

@nb.njit
def fct_2(c, set_1, set_2):
    fct_3(c, set_1, set_2)

@nb.njit
def fct_1(set_1, set_2):
    for x1 in range(1000):
        c = 2
        fct_2(c, set_1, set_2)
is slower than
@nb.njit
def fct_1(set_1, set_2):
    for x1 in range(1000):
        c = 2
        h = 2
        if c not in set_1 and c not in set_2:
            if fct_4(0, c) <= h:
                set_1.add(c)
            else:
                set_2.add(c)
with
import timeit

@nb.njit
def main_fct(set_1, set_2):
    for i in range(50):
        for x in range(1000):
            fct_1(set_1, set_2)

set_1 = {0}
set_2 = {47}

start = timeit.default_timer()
main_fct(set_1, set_2)
stop = timeit.default_timer()
(2.70 seconds vs 0.46 seconds). I thought this shouldn't make a difference. Could you enlighten me?
Since Python is a dynamically typed language, its function call overhead is quite high.
On top of that, you are looping over the function calls, so the cost of calling the function and checking the arguments is multiplied 1000 times.
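If the split structure is worth keeping, one option worth benchmarking is Numba's inline keyword, which asks the compiler to inline the callee at the Numba-IR level and can remove the per-call overhead. A sketch (how much of the gap this closes depends on the Numba version and the code):

import numba as nb

# inline='always' splices the helper's IR into each caller, so the
# compiled result can resemble the hand-merged single function.
@nb.njit(inline='always')
def fct_4(a, b):
    x = a ^ b
    setBits = 0
    while x > 0:
        setBits += x & 1
        x >>= 1
    return setBits

@nb.njit(inline='always')
def fct_3(c, set_1, set_2):
    h = 2
    if c not in set_1 and c not in set_2:
        if fct_4(0, c) <= h:
            set_1.add(c)
        else:
            set_2.add(c)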

Fastest way to sort in Python (no cython)

I have a problem where I have to sort a very big array (shape 7900000x4x4) with a custom function. I used sorted, but it took more than 1 hour to sort. My code was something like this:
def compare(x, y):
    print('DD ' + str(x[0]))
    if np.array_equal(x[1], y[1]):
        return -1
    a = x[1].flatten()
    b = y[1].flatten()
    idx = np.where((a > b) != (a < b))[0][0]
    if a[idx] < 0 and b[idx] >= 0:
        return 0
    elif b[idx] < 0 and a[idx] >= 0:
        return 1
    elif a[idx] < 0 and b[idx] < 0:
        if a[idx] > b[idx]:
            return 0
        elif a[idx] < b[idx]:
            return 1
    elif a[idx] < b[idx]:
        return 1
    else:
        return 0

# Note: this comparator is used as a truthy __lt__ below (it returns
# 0/1/-1), not the negative/zero/positive protocol that
# functools.cmp_to_key expects, so the stdlib helper is not a drop-in.
def cmp_to_key(mycmp):
    class K:
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj)
    return K

tblocks = sorted(tblocks.items(), key=cmp_to_key(compare))
This worked, but I want it to complete in seconds. I don't think any direct implementation in Python can give me the performance I need, so I tried Cython. My Cython code is this, which is pretty simple:
# cimports the original snippet omitted (assumed):
from libcpp cimport bool
from libcpp.vector cimport vector
from libcpp.algorithm cimport sort
cimport numpy as np

cdef int[:, :] arrr
cdef int size

cdef bool compare(int a, int b):
    global arrr, size
    cdef int[:] x = arrr[a]
    cdef int[:] y = arrr[b]
    cdef int i, j
    i = 0
    j = 0
    while i < size:
        if (j == size - 1) or (y[j] < x[i]):
            return 0
        elif x[i] < y[j]:
            return 1
        i += 1
        j += 1
    return j != size - 1

def sorted(np.ndarray boxes, int total_blocks, int s):
    global arrr, size
    cdef int i
    cdef vector[int] index = range(total_blocks)
    arrr = boxes
    size = s
    sort(index.begin(), index.end(), compare)
    return index
This Cython code took 33 seconds! Cython is a solution, but I am looking for alternative solutions that can run directly in Python, for example Numba. I tried Numba, but I didn't get satisfying results. Kindly help!
It is hard to give an answer without a working example. I assume that arrr in your Cython code was a 2D array and that size was the row length, arrr.shape[1].
Numba Implementation
import numpy as np
import numba as nb
from numba.targets import quicksort  # internal module; relocated in newer Numba releases

def custom_sorting(compare_fkt):
    index_arange = np.arange(arrr.shape[0])  # one index per row of arrr
    quicksort_func = quicksort.make_jit_quicksort(lt=compare_fkt, is_argsort=False)
    jit_sort_func = nb.njit(quicksort_func.run_quicksort)
    index = jit_sort_func(index_arange)
    return index
def compare(a, b):
    x = arrr[a]
    y = arrr[b]
    i = 0
    j = 0
    while i < size:
        if (j == size - 1) or (y[j] < x[i]):
            return False
        elif x[i] < y[j]:
            return True
        i += 1
        j += 1
    return j != size - 1
arrr = np.random.randint(-9, 10, (7_900_000, 8))
size = arrr.shape[1]  # row length, as used by compare
index = custom_sorting(compare)
This gives 3.85 s for the generated test data, but the speed of a sorting algorithm heavily depends on the data....
Simple Example
import numpy as np
import numba as nb
from numba.targets import quicksort

# simple reverse sort
def compare(a, b):
    return a > b

# create some test data
arrr = np.array(np.random.rand(7_900_000) * 10000, dtype=np.int32)

# we can pass the comparison function
quicksort_func = quicksort.make_jit_quicksort(lt=compare, is_argsort=True)

# compile the sorting function
jit_sort_func = nb.njit(quicksort_func.run_quicksort)

# get the result
ind_sorted = jit_sort_func(arrr)
This implementation is about 35% slower than np.argsort, but a comparable slowdown also occurs when calling np.argsort from within compiled code.
If I understand your code correctly, the order you have in mind is the standard order, except that it starts at 0, wraps around at +/-infinity, and maxes out at -0. On top of that we have simple left-to-right lexicographic order.
Now, if your array dtype is integer, observe the following: because of the two's complement representation of negatives, view-casting to unsigned int turns your order into the standard order. On top of that, if we use big-endian encoding, efficient lexicographic ordering can be achieved by view-casting to void dtype.
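A tiny demonstration of the unsigned-view observation (my own toy example, not from the answer):

import numpy as np

a = np.array([0, 1, -2, -1, 5], dtype=np.int64)
# Two's complement: viewed as unsigned, negatives land above all positives,
# with -1 (all bits set) largest -- exactly the 0 ... +max, -min ... -0 order.
print(a[np.argsort(a.view(np.uint64))])  # [ 0  1  5 -2 -1]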
The code below uses a 10000x4x4 example to show that this method gives the same result as your Python code.
It also benchmarks it on a 7,900,000x4x4 example (using an array, not a dict). On my modest laptop this method takes 8 seconds.
import numpy as np

def compare(x, y):
    # print('DD ' + str(x[0]))
    if np.array_equal(x[1], y[1]):
        return -1
    a = x[1].flatten()
    b = y[1].flatten()
    idx = np.where((a > b) != (a < b))[0][0]
    if a[idx] < 0 and b[idx] >= 0:
        return 0
    elif b[idx] < 0 and a[idx] >= 0:
        return 1
    elif a[idx] < 0 and b[idx] < 0:
        if a[idx] > b[idx]:
            return 0
        elif a[idx] < b[idx]:
            return 1
    elif a[idx] < b[idx]:
        return 1
    else:
        return 0

def cmp_to_key(mycmp):
    class K:
        def __init__(self, obj, *args):
            self.obj = obj
        def __lt__(self, other):
            return mycmp(self.obj, other.obj)
    return K

def custom_sort(a):
    assert a.dtype == np.int64
    # Big-endian bytes compare in order of numeric significance; viewing
    # each row as a single void scalar lets argsort compare rows at once.
    b = a.astype('>i8', copy=False)
    return b.view(f'V{a.dtype.itemsize * a.shape[1]}').ravel().argsort()

tblocks = np.random.randint(-9, 10, (10000, 4, 4))
tblocks = dict(enumerate(tblocks))
tblocks_s = sorted(tblocks.items(), key=cmp_to_key(compare))

tblocksa = np.array(list(tblocks.values()))
tblocksa = tblocksa.reshape(tblocksa.shape[0], -1)
order = custom_sort(tblocksa)
tblocks_s2 = list(tblocks.items())
tblocks_s2 = [tblocks_s2[o] for o in order]
print(tblocks_s == tblocks_s2)

from timeit import timeit
data = np.random.randint(-9_999, 10_000, (7_900_000, 4, 4))
print(timeit(lambda: data[custom_sort(data.reshape(data.shape[0], -1))],
             number=5) / 5)
Sample output:
True
7.8328493310138585
