What is the fastest possible way to run:
reduce(lambda x, y: x @ y, ls)
in python?
for a list of matrices ls? I don't have an Nvidia GPU, but I do have a lot of CPU cores to work with. I thought I could make the process run in parallel (splitting it into log(n) rounds of pairwise products), but it seems that for small (1000x1000) matrices this is actually worse. Here is the code I tried:
from multiprocessing import Pool
import numpy as np
from itertools import zip_longest
def matmul(x):
    if x[1] is None:
        return x[0]
    return x[1] @ x[0]

def fast_mul(ls):
    while True:
        n = len(ls)
        if n == 0:
            raise Exception("Splitting Error")
        if n == 1:
            return ls[0]
        if n == 2:
            return ls[1] @ ls[0]
        with Pool(processes=(n//2+1)) as pool:
            ls = pool.map(matmul, list(zip_longest(*[iter(ls)]*2)))
There is a function to do this: np.linalg.multi_dot, supposedly optimized for the best evaluation order:
np.linalg.multi_dot(ls)
In fact the docs say something very close to your original phrasing:
Think of multi_dot as:
def multi_dot(arrays): return functools.reduce(np.dot, arrays)
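As a rough, hypothetical illustration of why the evaluation order matters (the shapes here are made up for the arithmetic, not taken from the question):

# A: (10, 1000), B: (1000, 1000), C: (1000, 1000)
cost_left = 10 * 1000 * 1000 + 10 * 1000 * 1000      # (A @ B) @ C -> ~2.0e7 scalar multiplications
cost_right = 1000 * 1000 * 1000 + 10 * 1000 * 1000   # A @ (B @ C) -> ~1.01e9 scalar multiplications
print(cost_left, cost_right)

multi_dot picks the cheaper parenthesization automatically, which is where its advantage in the non-square benchmark below comes from.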
You could also try np.einsum, which will allow you to multiply up to 25 matrices:
from string import ascii_lowercase
ls = [...]
index = ','.join(ascii_lowercase[x:x + 2] for x in range(len(ls)))
index += f'->{index[0]}{index[-1]}'
np.einsum(index, *ls)
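For instance, running the snippet above with len(ls) == 3 would generate these subscripts:

from string import ascii_lowercase
index = ','.join(ascii_lowercase[x:x + 2] for x in range(3))
index += f'->{index[0]}{index[-1]}'
print(index)
# ab,bc,cd->ad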
Timing
Simple case:
ls = np.random.rand(100, 1000, 1000) - 0.5
%timeit reduce(lambda x, y: x @ y, ls)
4.3 s ± 76.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
4.35 s ± 84.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
4.86 s ± 68.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
5.24 s ± 66.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
More complicated case:
ls = [x.T if i % 2 else x for i, x in enumerate(np.random.rand(100, 2000, 500) - 0.5)]
%timeit reduce(lambda x, y: x @ y, ls)
7.94 s ± 96.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
7.91 s ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
9.38 s ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
2.03 s ± 52.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Notice that the up-front work done by multi_dot has a negative benefit in the straightforward case (and, more surprisingly, the lambda runs faster than the raw operator), but saves 75% of the time in the less straightforward case.
So, just for completeness, here is a smaller non-square case:
ls = [x.T if i % 2 else x for i, x in enumerate(np.random.rand(100, 400, 300) - 0.5)]
%timeit reduce(lambda x, y: x @ y, ls)
245 ms ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
245 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
284 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
638 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
So really it seems that for most general cases, your original reduce call is actually about as good as you need to get. My only suggestion would be to use operator.matmul instead of the lambda.
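A minimal sketch of that suggestion (the same reduction, just without the lambda call overhead):

from functools import reduce
from operator import matmul
result = reduce(matmul, ls)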
EDIT: Threw in yet another possible function
EDIT: I added the results with np.linalg.multi_dot, expecting it to be faster than the rest, but it is actually much slower somehow. I suppose it is designed with other kinds of use cases in mind.
I'm not sure you will be able to get much faster than that. Here are a few different implementations of the reduction for the case where the data is a 3D array of square matrices:
from multiprocessing import Pool
from functools import reduce
import numpy as np
import numba as nb
def matmul_n_naive(data):
    return reduce(np.matmul, data)

# If you don't care about modifying data pass copy=False
def matmul_n_binary(data, copy=True):
    if len(data) < 1:
        raise ValueError
    data = np.array(data, copy=copy)
    n, r, c = data.shape
    dt = data.dtype
    s = 1
    while (n + s - 1) // s > 1:
        a = data[:n - s:2 * s]
        b = data[s:n:2 * s]
        np.matmul(a, b, out=a)
        s *= 2
    return np.array(a[0])

def matmul_n_pool(data):
    if len(data) < 1:
        raise ValueError
    lst = data
    with Pool() as pool:
        while len(lst) > 1:
            lst_next = pool.starmap(np.matmul, zip(lst[::2], lst[1::2]))
            if len(lst) % 2 != 0:
                lst_next.append(lst[-1])
            lst = lst_next
    return lst[0]

@nb.njit(parallel=False)
def matmul_n_numba_nopar(data):
    res = np.eye(data.shape[1], data.shape[2], dtype=data.dtype)
    for i in nb.prange(len(data)):
        res = res @ data[i]
    return res

@nb.njit(parallel=True)
def matmul_n_numba_par(data):
    res = np.eye(data.shape[1], data.shape[2], dtype=data.dtype)
    for i in nb.prange(len(data)):  # Numba knows how to do parallel reductions correctly
        res = res @ data[i]
    return res

def matmul_n_multidot(data):
    return np.linalg.multi_dot(data)
And a test:
# Test
import numpy as np
np.random.seed(0)
a = np.random.rand(10, 100, 100) * 2 - 1
b1 = matmul_n_naive(a)
b2 = matmul_n_binary(a)
b3 = matmul_n_pool(a)
b4 = matmul_n_numba_nopar(a)
b5 = matmul_n_numba_par(a)
b6 = matmul_n_multidot(a)
print(np.allclose(b1, b2))
# True
print(np.allclose(b1, b3))
# True
print(np.allclose(b1, b4))
# True
print(np.allclose(b1, b5))
# True
print(np.allclose(b1, b6))
# True
Here are some benchmarks. It seems there is no consistent winner, but the "naive" solution is pretty good all around; binary and Numba vary, the process pool is not really good, and np.linalg.multi_dot does not seem to be very advantageous with square matrices.
import numpy as np
# 10 matrices 1000x1000
np.random.seed(0)
a = np.random.rand(10, 1000, 1000) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 121 ms ± 6.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_binary(a)
# 165 ms ± 3.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_numba_nopar(a)
# 108 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_numba_par(a)
# 244 ms ± 7.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_multidot(a)
# 132 ms ± 2.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 200 matrices 100x100
np.random.seed(0)
a = np.random.rand(200, 100, 100) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 4.4 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_binary(a)
# 13.4 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_numba_nopar(a)
# 9.51 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_numba_par(a)
# 4.93 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_multidot(a)
# 1.14 s ± 22.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 300 matrices 10x10
np.random.seed(0)
a = np.random.rand(300, 10, 10) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 526 µs ± 953 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_binary(a)
# 152 µs ± 508 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit matmul_n_pool(a)
# 610 ms ± 5.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_numba_nopar(a)
# 239 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_numba_par(a)
# 175 µs ± 422 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit matmul_n_multidot(a)
# 3.68 s ± 87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 1000 matrices 10x10
np.random.seed(0)
a = np.random.rand(1000, 10, 10) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 1.56 ms ± 4.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_binary(a)
# 392 µs ± 790 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_pool(a)
# 727 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_numba_nopar(a)
# 589 µs ± 356 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_numba_par(a)
# 451 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_multidot(a)
# Never finished...
Related
Conjugating a complex number appears to be about 30 times faster if the type() of the complex number is complex rather than numpy.complex128; see the minimal example below. However, taking the absolute value takes about the same time, and taking the real and the imaginary parts is only about 3 times faster.
Why is the conjugate slower by that much? When I take a single value from a large complex-valued array, it seems I should cast it to complex first (the complex conjugation is part of a larger code path with many (> 10^6) iterations).
import numpy as np
np.random.seed(100)
a = (np.random.rand(1) + 1j*np.random.rand(1))[0]
b = complex(a)
%timeit a.conjugate() # 2.95 µs ± 24 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit a.conj() # 2.86 µs ± 14.2 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit b.conjugate() # 82.8 ns ± 1.28 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit abs(a) # 112 ns ± 1.7 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit abs(b) # 99.6 ns ± 0.623 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit a.real # 145 ns ± 0.259 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit b.real # 54.8 ns ± 0.121 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit a.imag # 144 ns ± 0.771 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
%timeit b.imag # 55.4 ns ± 0.297 ns per loop (mean ± std. dev. of 7 runs, 10000000 loops each)
Calling NumPy routines always comes at a fixed cost, which in this case is more expensive than the cost of the Python-native routine.
As soon as you start processing more than one number (possibly millions) at once, NumPy will be much faster:
import numpy as np
N = 10
a = np.random.rand(N) + 1j*np.random.rand(N)
b = [complex(x) for x in a]
%timeit a.conjugate() # 481 ns ± 1.39 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
%timeit [x.conjugate() for x in b] # 605 ns ± 6.11 ns per loop (mean ± std. dev. of 7 runs, 1,000,000 loops each)
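For instance, a rough sketch of the fully vectorized path (the array size here is arbitrary), where the fixed per-call overhead is amortized over the whole array instead of being paid per element:

big = np.random.rand(10**6) + 1j * np.random.rand(10**6)
conj_all = big.conjugate()  # one vectorized call, no per-element Python overhead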
What is the best way to count the rows in a 2D numpy array that include all values of another 1D numpy array? The 2D array can have more columns than the length of the 1D array.
elements = np.arange(4).reshape((2, 2))
test_elements = [2, 3]
somefunction(elements, test_elements)
I would expect the function to return 1.
elements = np.arange(15).reshape((5, 3))
# array([[ 0, 1, 2],
# [ 3, 4, 5],
# [ 6, 7, 8],
# [ 9, 10, 11],
# [12, 13, 14]])
test_elements = [4, 3]
somefunction(elements, test_elements)
Should also return 1.
All elements of the 1d array must be included. If only a few elements are found in a row, it doesn't count. Hence:
elements = np.arange(15).reshape((5, 3))
# array([[ 0, 1, 2],
# [ 3, 4, 5],
# [ 6, 7, 8],
# [ 9, 10, 11],
# [12, 13, 14]])
test_elements = [3, 4, 10]
somefunction(elements, test_elements)
Should also return 0.
Create a boolean array of the elements found, then use any row-wise (this avoids counting multiple matches in the same row), and finally count the rows using sum:
np.any(np.isin(elements, test), axis=1).sum()
Output
>>> elements
array([[ 0, 1, 2],
[ 3, 4, 5],
[ 6, 7, 8],
[ 9, 10, 11],
[12, 13, 14]])
>>> test = [1, 6, 7, 4]
>>> np.any(np.isin(elements, test), axis=1).sum()
3
There's probably a more efficient solution, but if you want the rows where "all" elements of test_elements are present, you can reverse np.isin and apply it along each row, with the following:
np.apply_along_axis(lambda x: np.isin(test_elements, x), 1, elements).all(1).sum()
A slightly more efficient (but less readable) variant of @norok2's solution is the following.
sum(map(set(test_elements).issubset, elements))
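For instance, checking it against the second example from the question (expected result 0, since no row contains 3, 4 and 10 together):

import numpy as np
elements = np.arange(15).reshape((5, 3))
test_elements = [3, 4, 10]
print(sum(map(set(test_elements).issubset, elements)))
# 0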
(EDIT: OK, now I actually had a bit more time to figure out what is going on.)
There are two issues here:
the computational complexity depends on the sizes of both inputs and it is not captured well by a 1D benchmark plot
the actual timings are dominated by variations in the inputs
The problem can be separated in two parts:
looping through the rows
performing the subset check, which is basically a nested-loop quadratic operation (in the worst-case scenario)
We know that, for sufficiently large inputs, looping through the rows is faster in NumPy and slower in pure Python.
For reference, let's consider these two approaches:
# pure Python approach
def all_in_by_row_flt(arr, elems=ELEMS):
    return sum(1 for row in arr if all(e in row for e in elems))

# NumPy approach (based on @Mstaino's answer)
def all_in_by_row_np(arr, elems=ELEMS):
    def _aaa_helper(row, e=elems):
        return np.isin(e, row)
    return np.sum(np.all(np.apply_along_axis(_aaa_helper, 1, arr), 1))
Then, considering the subset check operation: if the inputs are such that the check short-circuits within a few iterations, pure Python looping is faster than NumPy. Conversely, if a sufficiently large number of iterations is required, then NumPy can actually be faster.
On top of this, there is the looping through the rows, but because the subset check operation is quadratic AND the two have different constant coefficients, there are situations where, despite the row looping being faster in NumPy (because the number of rows is sufficiently large), the overall operation is faster in pure Python.
This was the situation I was running into in the earlier benchmarks, and it corresponds to the case where the subset check is always (or almost always) False and fails within a few iterations.
As soon as the subset check starts requiring more iterations, the Python-only approach begins to lag behind, and for the case where the subset check is actually True for most (if not all) of the rows, the NumPy approach is actually faster.
Another key difference between the NumPy and the pure Python approach is that pure Python uses lazy evaluation while NumPy does not, and NumPy actually requires the creation of potentially large intermediate objects that slow down the computation.
On top of this, NumPy iterates over the rows twice (once in sum() and once in np.apply_along_axis()), while the pure Python approach iterates only once.
Other approaches using set().issubset(), e.g. from @GZ0's answer:
def all_in_by_row_set(arr, elems=ELEMS):
    elems = set(elems)
    return sum(map(elems.issubset, arr))
have different timing than explicitly writing the nested loop when it comes to the subset check, but they still suffer from the slower outer looping.
So, what's next?
The answer is to use Cython or Numba.
The idea is to get NumPy-like (read: C) speed all the time (not only for sufficiently large inputs), with lazy evaluation and a minimal number of loops over the rows.
An example of a Cython approach (as implemented in IPython, using the %load_ext Cython magic) is:
%%cython --cplus -c-O3 -c-march=native -a
#cython: language_level=3, boundscheck=False, wraparound=False, initializedcheck=False, cdivision=True

cdef long all_in_by_row_c(long[:, :] arr, long[:] elems) nogil:
    cdef long result = 0
    I = arr.shape[0]
    J = arr.shape[1]
    K = elems.shape[0]
    for i in range(I):
        is_subset = True
        for k in range(K):
            is_contained = False
            for j in range(J):
                if elems[k] == arr[i, j]:
                    is_contained = True
                    break
            if not is_contained:
                is_subset = False
                break
        result += 1 if is_subset else 0
    return result

def all_in_by_row_cy(long[:, :] arr, long[:] elems):
    return all_in_by_row_c(arr, elems)
While a similar Numba code reads:
import numba as nb

@nb.jit(nopython=True, nogil=True)
def all_in_by_row_jit(arr, elems=ELEMS):
    result = 0
    n_rows, n_cols = arr.shape
    for i in range(n_rows):
        is_subset = True
        for e in elems:
            is_contained = False
            for r in arr[i, :]:
                if e == r:
                    is_contained = True
                    break
            if not is_contained:
                is_subset = False
                break
        result += 1 if is_subset else 0
    return result
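A quick usage sketch (the arrays here are made up; ELEMS in the signatures above is whatever the benchmark defines elsewhere). Note that the Cython version takes long memoryviews, so the inputs must have a matching integer dtype:

import numpy as np
arr = np.random.randint(0, 10_000, size=(100, 1000))
elems = np.random.randint(0, 10_000, size=1000)
print(all_in_by_row_jit(arr, elems))
# print(all_in_by_row_cy(arr.astype(np.int64), elems.astype(np.int64)))  # dtype must match C long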
Now, time-wise we get to the following (for a relatively small number of rows):
arr.shape=(100, 1000) elems.shape=(1000,) result=0
Func: all_in_by_row_cy 120 µs ± 1.07 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Func: all_in_by_row_jit 129 µs ± 131 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Func: all_in_by_row_flt 2.44 ms ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_set 9.98 ms ± 52.9 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_np 13.7 ms ± 52.2 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
arr.shape=(100, 2000) elems.shape=(1000,) result=0
Func: all_in_by_row_cy 1.45 ms ± 24.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Func: all_in_by_row_jit 1.52 ms ± 4.16 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Func: all_in_by_row_flt 30.1 ms ± 452 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_set 19.8 ms ± 56.1 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_np 18 ms ± 28.4 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
arr.shape=(100, 3000) elems.shape=(1000,) result=37
Func: all_in_by_row_cy 10.4 ms ± 31.7 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_jit 10.9 ms ± 13.8 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_flt 226 ms ± 2.67 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 30.5 ms ± 92.9 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_np 21.9 ms ± 87.4 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
arr.shape=(100, 4000) elems.shape=(1000,) result=86
Func: all_in_by_row_cy 16.8 ms ± 32.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_jit 17.7 ms ± 42 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
Func: all_in_by_row_flt 385 ms ± 2.33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 39.5 ms ± 588 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_np 25.7 ms ± 128 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Note that the slowdown in the last block cannot be explained by the increase in input size along the second dimension alone.
Actually, if the short-circuit rate is increased (e.g. by changing the value range of the random arrays), for the last block (same input sizes) one gets:
arr.shape=(100, 4000) elems.shape=(1000,) result=0
Func: all_in_by_row_cy 152 µs ± 1.89 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Func: all_in_by_row_jit 173 µs ± 4.72 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Func: all_in_by_row_flt 556 µs ± 8.56 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Func: all_in_by_row_set 39.7 ms ± 287 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_np 31.5 ms ± 315 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Note that the set()-based method is roughly independent of the short-circuit rate (because of the hash-based implementation, which has ~O(1) membership-check complexity; but this comes at the expense of the hashing pre-computation, and these results indicate it might not be faster than the direct nested-looping approach).
Finally, for larger row counts:
arr.shape=(100000, 1000) elems.shape=(1000,) result=0
Func: all_in_by_row_cy 141 ms ± 2.08 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_jit 150 ms ± 1.73 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
Func: all_in_by_row_flt 2.6 s ± 28.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 10.1 s ± 216 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_np 13.7 s ± 15.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
arr.shape=(100000, 2000) elems.shape=(1000,) result=34
Func: all_in_by_row_cy 1.2 s ± 753 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_jit 1.27 s ± 7.32 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_flt 24.1 s ± 119 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 19.5 s ± 270 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_np 18 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
arr.shape=(100000, 3000) elems.shape=(1000,) result=33859
Func: all_in_by_row_cy 9.79 s ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_jit 10.3 s ± 5.55 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_flt 3min 30s ± 1.13 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 30 s ± 57.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_np 21.9 s ± 59.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
arr.shape=(100000, 4000) elems.shape=(1000,) result=86376
Func: all_in_by_row_cy 17 s ± 30.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_jit 17.9 s ± 13 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_flt 6min 29s ± 293 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_set 38.9 s ± 33 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Func: all_in_by_row_np 25.7 s ± 29.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Finally, note that the Cython/Numba code may be algorithmically optimized.
I'm trying to find the best way to compute the minimum element-wise products between two sets of vectors. The usual matrix multiplication C = A @ B computes C_ij as the sum of the pairwise products of the elements of the row vector A_i and the column vector (B^T)_j. I would like to take the minimum of those pairwise products instead. I can't find an efficient way to do this between two matrices with numpy.
One way to achieve this would be to generate the 3D array of the pairwise products between A and B (before the sum) and then take the minimum over the third dimension, but this would have a huge memory footprint (and I actually don't know how to do it).
Do you have any idea how I could achieve this operation ?
Example:
A = [[1,1],[1,1]]
B = [[0,2],[2,1]]
matrix matmul:
C = [[1*0+1*2, 1*2+1*1], [1*0+1*2, 1*2+1*1]] = [[2,3],[2,3]]
minimum matmul:
C = [[min(1*0,1*2), min(1*2,1*1)], [min(1*0,1*2), min(1*2,1*1)]] = [[0,1],[0,1]]
Use broadcasting after extending A to 3D -
A = np.asarray(A)
B = np.asarray(B)
C_out = np.min(A[:,None]*B,axis=2)
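As a quick sanity check against the example from the question:

A = np.asarray([[1, 1], [1, 1]])
B = np.asarray([[0, 2], [2, 1]])
print(np.min(A[:, None] * B, axis=2))
# [[0 1]
#  [0 1]]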
If you care about memory footprint, use the numexpr module to be efficient about it -
import numexpr as ne
C_out = ne.evaluate('min(A3D*B,2)',{'A3D':A[:,None]})
Timings on large arrays -
In [12]: A = np.random.rand(200,200)
In [13]: B = np.random.rand(200,200)
In [14]: %timeit np.min(A[:,None]*B,axis=2)
34.4 ms ± 614 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [15]: %timeit ne.evaluate('min(A3D*B,2)',{'A3D':A[:,None]})
29.3 ms ± 316 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [16]: A = np.random.rand(300,300)
In [17]: B = np.random.rand(300,300)
In [18]: %timeit np.min(A[:,None]*B,axis=2)
113 ms ± 2.27 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [19]: %timeit ne.evaluate('min(A3D*B,2)',{'A3D':A[:,None]})
102 ms ± 691 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
So, there's some improvement with numexpr, but maybe not as much as I was expecting.
Numba can also be an option
I was a bit surprised by the not particularly good numexpr timings, so I tried a Numba version. For large arrays this can be optimized further (much the same principles as for a dgemm apply).
import numpy as np
import numba as nb
import numexpr as ne
@nb.njit(fastmath=True, parallel=True)
def min_pairwise_prod(A, B):
    assert A.shape[1] == B.shape[1]
    res = np.empty((A.shape[0], B.shape[0]))
    for i in nb.prange(A.shape[0]):
        for j in range(B.shape[0]):
            min_prod = A[i, 0] * B[j, 0]
            for k in range(B.shape[1]):
                prod = A[i, k] * B[j, k]
                if prod < min_prod:
                    min_prod = prod
            res[i, j] = min_prod
    return res
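A quick sanity check against the example from the question (using float inputs, since res is allocated as a float array):

A = np.array([[1., 1.], [1., 1.]])
B = np.array([[0., 2.], [2., 1.]])
print(min_pairwise_prod(A, B))
# [[0. 1.]
#  [0. 1.]]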
Timings
A=np.random.rand(300,300)
B=np.random.rand(300,300)
%timeit res_1=min_pairwise_prod(A,B) #parallel=True
5.56 ms ± 1.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res_1=min_pairwise_prod(A,B) #parallel=False
26 ms ± 163 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res_2 = ne.evaluate('min(A3D*B,2)',{'A3D':A[:,None]})
87.7 ms ± 265 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit res_3=np.min(A[:,None]*B,axis=2)
110 ms ± 214 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
A=np.random.rand(1000,300)
B=np.random.rand(1000,300)
%timeit res_1=min_pairwise_prod(A,B) #parallel=True
50.6 ms ± 401 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res_1=min_pairwise_prod(A,B) #parallel=False
296 ms ± 5.02 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res_2 = ne.evaluate('min(A3D*B,2)',{'A3D':A[:,None]})
992 ms ± 7.59 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit res_3=np.min(A[:,None]*B,axis=2)
1.27 s ± 15.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
I want to combine two int columns to create a new dot-separated str column. I've got one way that works, but if there is a faster way, it would help. I've also tried a suggestion I found in another answer on SO, but it produces an error.
This works:
df3 = pd.DataFrame({'job_number': [3913291, 3887250, 3913041],
'task_number': [38544, 0, 1]})
df3['filename'] = df3['job_number'].astype(str) + '.' + df3['task_number'].astype(str)
0 3913291.38544
1 3887250.0
2 3913041.1
This answer to a similar question suggests a "numpy" way, using .values.astype(str), but I haven't gotten it to work yet. Here I run it without including the dot separator:
df3['job_number'].values.astype(int).astype(str) + df3['task_number'].astype(int).astype(str)
0 391329138544
1 38872500
2 39130411
But when I include the dot separator I get an error:
df3['job_number'].values.astype(int).astype(str) + '.' + df3['task_number'].astype(int).astype(str)
TypeError: ufunc 'add' did not contain a loop with signature matching types dtype('<U11') dtype('<U11') dtype('<U11')
The result I want is:
0 3913291.38544
1 3887250.0
2 3913041.1
For a comparison of these methods with other available methods, refer to @jezrael's answer.
Method 1
Add a dummy column containing '.', use it in the concatenation, and drop it afterwards:
%%timeit
df3['dummy'] ='.'
res = df3['job_number'].values.astype(str) + df3['dummy'] + df3['task_number'].values.astype(str)
df3.drop(columns=['dummy'], inplace=True)
1.31 ms ± 41.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
As an extension of method 1, if you exclude the time spent creating and dropping the dummy column, this is the best you get -
%%timeit
df3['job_number'].values.astype(str) + df3['dummy'] + df3['task_number'].values.astype(str)
286 µs ± 15.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
Method 2
Use apply
%timeit df3.T.apply(lambda x: str(x[0]) + '.' + str(x[1]))
883 µs ± 22 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
You can use a list comprehension:
df3["filename"] = ['.'.join(i) for i in
zip(df3["job_number"].map(str),df3["task_number"].map(str))]
If you use Python 3.6+, the fastest solution is with f-strings:
df3["filename2"] = [f'{i}.{j}' for i,j in zip(df3["job_number"],df3["task_number"])]
Performance on 30k rows:
df3 = pd.DataFrame({'job_number': [3913291, 3887250, 3913041],
'task_number': [38544, 0, 1]})
df3 = pd.concat([df3] * 10000, ignore_index=True)
In [64]: %%timeit
...: df3["filename2"] = [f'{i}.{j}' for i,j in zip(df3["job_number"],df3["task_number"])]
...:
20.5 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [65]: %%timeit
...: df3["filename3"] = ['.'.join(i) for i in zip(df3["job_number"].map(str),df3["task_number"].map(str))]
...:
30.9 ms ± 189 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
In [66]: %%timeit
...: df3["filename4"] = df3.T.apply(lambda x: str(x[0]) + '.' + str(x[1]))
...:
1.7 s ± 31.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
In [67]: %%timeit
...: df3['dummy'] ='.'
...: res = df3['job_number'].values.astype(str) + df3['dummy'] + df3['task_number'].values.astype(str)
...: df3.drop(columns=['dummy'], inplace=True)
...:
73.6 ms ± 1.23 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
But the original solution is also very fast:
In [73]: %%timeit
...: df3['filename'] = df3['job_number'].astype(str) + '.' + df3['task_number'].astype(str)
48.3 ms ± 872 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
With a small modification - using map instead of astype:
In [76]: %%timeit
...: df3['filename'] = df3['job_number'].map(str) + '.' + df3['task_number'].map(str)
...:
26 ms ± 676 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Methods in order of %%timeit results
I timed all the suggested methods and a few more on two DataFrames. Here are the timed results for the suggested methods (thank you @meW and @jezrael). If I missed any, or you have another, let me know and I'll add it.
Two timings are shown for each method: first for processing the 3 rows in the example df and then for processing 57K rows in another df. Timings may vary on another system. Solutions that include TEST['dot'] in the concatenation string require this column in the df: add it with TEST['dot'] = '.'.
Original method (still the fastest):
.astype(str), +, '.'
%%timeit
TEST['filename'] = TEST['job_number'].astype(str) + '.' + TEST['task_number'].astype(str)
# 553 µs ± 6.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 69.6 ms ± 876 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) on 57K rows
Proposed methods and a few permutations on them:
.astype(int).astype(str), +, '.'
%%timeit
TEST['filename'] = TEST['job_number'].astype(int).astype(str) + '.' + TEST['task_number'].astype(int).astype(str)
# 553 µs ± 6.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 70.2 ms ± 739 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) on 57K rows
.values.astype(int).astype(str), +, TEST['dot']
%%timeit
TEST['filename'] = TEST['job_number'].values.astype(int).astype(str) + TEST['dot'] + TEST['task_number'].values.astype(int).astype(str)
# 221 µs ± 5.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 82.3 ms ± 743 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) on 57K rows
.values.astype(str), +, TEST['dot']
%%timeit
TEST["filename"] = TEST['job_number'].values.astype(str) + TEST['dot'] + TEST['task_number'].values.astype(str)
# 221 µs ± 5.93 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 92.8 ms ± 1.21 ms per loop (mean ± std. dev. of 7 runs, 10 loops each) on 57K rows
'.'.join(), list comprehension, .values.astype(str)
%%timeit
TEST["filename"] = ['.'.join(i) for i in TEST[["job_number",'task_number']].values.astype(str)]
# 743 µs ± 19.7 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 147 ms ± 532 µs per loop (mean ± std. dev. of 7 runs, 10 loops each) on 57K rows
f-string, list comprehension, .values.astype(str)
%%timeit
TEST["filename2"] = [f'{i}.{j}' for i,j in TEST[["job_number",'task_number']].values.astype(str)]
# 642 µs ± 27.9 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 167 ms ± 3.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) on 57K rows
'.'.join(), zip, list comprehension, .map(str)
%%timeit
TEST["filename"] = ['.'.join(i) for i in
zip(TEST["job_number"].map(str), TEST["task_number"].map(str))]
# 512 µs ± 5.74 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 181 ms ± 4.17 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) on 57K rows
apply(lambda, str(x[2]), +, '.')
%%timeit
TEST['filename'] = TEST.T.apply(lambda x: str(x[2]) + '.' + str(x[10]))
# 735 µs ± 13.6 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each) on 3 rows
# 2.69 s ± 18.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each) on 57K rows
If you see a way to improve on any of these, please let me know and I'll add to the list!
For example
>>> two_powers(42)
(2, 8, 32)
My current naive implementation (taken from here) looks like this:
def two_powers(num):
    return tuple(2 ** i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')
But I hope there're faster ways to do this.
Try this:
def two_powers(num):
    powers = []
    while num != 0:
        powers.append(num & -num)
        num = num & (num - 1)
    return powers
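A quick walkthrough for num = 42 (0b101010), just to make the two bit tricks explicit:

print(42 & -42)       # 2  -> two's complement trick: isolates the lowest set bit
print(42 & (42 - 1))  # 40 -> clears that lowest set bit (0b101000)
print(two_powers(42))
# [2, 8, 32]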
Your solution is actually good, with one small (but significant!) efficiency detail:
Use
1<<i
(bitwise shift) instead of
2**i
So, mirroring your approach, consider the following:
def two_powers(num):
    return set(1 << i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

print(two_powers(42))
You can make use of log2 and a generator, yielding until you run out of powers of two.
import math
def two_powers(num):
    while num > 0:
        power = int(math.log(num, 2))
        yield 2**power
        num = num - 2**power
Sample run:
>>> tuple(two_powers(42))
(32, 8, 2)
>>> tuple(two_powers(43))
(32, 8, 2, 1)
You can do this:
import math
def two_powers(num):
    # Compute number of bits for big numbers
    num_bits = math.floor(math.log2(num)) + 1 if num >= (1 << 32) else 32
    # Take those bits where there is a "one" in the number
    return [1 << p for p in range(num_bits) if num & (1 << p)]

print(two_powers(42))
# [2, 8, 32]
EDIT: Wrt the number of bits, you can make more splits if you are really concerned about performance, either down to save iterations or up to avoid computing the logarithm (or if you know your input numbers are going to be in some particular range):
import math
def two_powers(num):
    # Compute number of bits for big numbers
    if num < (1 << 8):
        num_bits = 8
    elif num < (1 << 16):
        num_bits = 16
    elif num < (1 << 24):
        num_bits = 24
    elif num < (1 << 32):
        num_bits = 32
    else:
        num_bits = math.floor(math.log2(num)) + 1
    # Take those bits where there is a "one" in the number
    return [1 << p for p in range(num_bits) if num & (1 << p)]

print(two_powers(42))
# [2, 8, 32]
You can use a generator with a shifting bit mask:
def two_powers(n):
    m = 1
    while n >= m:
        if n & m:
            yield m
        m <<= 1
So that:
tuple(two_powers(42))
would be:
(2, 8, 32)
>>> n=42
>>> {1<<i for i,d in enumerate(reversed(bin(n)[2:])) if d=='1'}
{8, 2, 32}
Attn: late posters: please feel free to run this small benchmark including your code, and amend the results accordingly. You will need to re-run the tests for everyone, since hardware differences matter.
Gentlemen, here are your scores:
The winner is @tsionyx
timeit:
Benchmark <=10^2
[97, 48, 31, 39, 33, 69, 71, 21, 50, 17]
two_powers_op_____ 17.8 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_jdehesa 15.1 µs ± 888 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_matina_ 14.6 µs ± 755 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_agile_e 7.87 µs ± 524 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 25.8 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 12 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 11.5 µs ± 566 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 5.77 µs ± 57.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^3
[682, 124, 42, 275, 743, 837, 474, 186, 739, 290]
two_powers_op_____ 22.1 µs ± 710 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 17.9 µs ± 829 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_matina_ 17.6 µs ± 881 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_agile_e 12.7 µs ± 763 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 49.2 µs ± 3.85 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 18.1 µs ± 2.56 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 19.2 µs ± 2.79 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 10.4 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^4
[4641, 5675, 3355, 4746, 9948, 5192, 3446, 7174, 1683, 7611]
two_powers_op_____ 30.8 µs ± 3.36 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 22.2 µs ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 21.7 µs ± 1.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 17.5 µs ± 2.46 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 64.3 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 18.5 µs ± 1.24 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 19.2 µs ± 193 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 11.6 µs ± 43.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^5
[20885, 23810, 25330, 32967, 34183, 16847, 54905, 85767, 37069, 32379]
two_powers_op_____ 32.8 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 24.2 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 27.1 µs ± 2.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 18.7 µs ± 246 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 68.9 µs ± 3.16 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 20.6 µs ± 486 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_blhsing 22.7 µs ± 883 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_tsionyx 14.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^6
[182928, 93105, 710309, 926572, 859733, 818327, 654197, 829750, 358363, 946684]
two_powers_op_____ 40.6 µs ± 236 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 28.2 µs ± 310 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 27.9 µs ± 936 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 23.8 µs ± 364 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_taras__ 89.9 µs ± 406 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 24.4 µs ± 493 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_blhsing 26.6 µs ± 366 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_tsionyx 19.3 µs ± 95.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
code used:
import functools
import math
import random
from collections import defaultdict
def two_powers_op_____(num):
    return tuple(2 ** i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

def two_powers_jdehesa(num):
    num_bits = math.floor(math.log2(num)) + 1
    return [1 << p for p in range(num_bits) if num & (1 << p)]

def two_powers_matina_(num):
    return set(1 << i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

def two_powers_agile_e(num):
    powers = []
    while num != 0:
        powers.append(num & -num)
        num = num & (num - 1)
    return powers

def _two_powers_taras(num):
    while num > 0:
        power = int(math.log(num, 2))
        yield 2 ** power
        num = num - 2 ** power

def two_powers_taras__(num):
    return tuple(_two_powers_taras(num))

def two_powers_sunitha(num):
    return {1 << i for i, d in enumerate(reversed(bin(num)[2:])) if d == '1'}

def _two_powers_blhsing(n):
    m = 1
    while n >= m:
        if n & m:
            yield m
        m <<= 1

def two_powers_blhsing(n):
    return tuple(_two_powers_blhsing(n))

def two_powers_tsionyx(num):
    powers = []
    while num > 0:
        rest = num & (num - 1)
        powers.append(num - rest)
        num = rest
    return powers

funcs = [
    two_powers_op_____,
    two_powers_jdehesa,
    two_powers_matina_,
    two_powers_agile_e,
    two_powers_taras__,
    two_powers_sunitha,
    two_powers_blhsing,
    two_powers_tsionyx,
]
# ================== UTILITY FUNCTIONS ======================= #
def _partial_map(f, vals):
    """Run function on a range of inputs as a single function"""
    p = functools.partial(map, f, vals)
    p.__name__ = f.__name__
    return p

def _sanity_check(f, n):
    factors = f(n)
    assert len(factors) > 0
    # factors are unique
    assert len(set(factors)) == len(factors)
    assert sum(factors) == n
    for f in factors:
        b = bin(f)
        assert b == '0b1' + '0' * (len(b) - 3)

def benchmark(fs, inputs):
    for f in fs:
        for n in inputs:
            _sanity_check(f, n)
    aggr_funcs = [_partial_map(f, inputs) for f in fs]
    res = dict()
    print(inputs)
    for f in aggr_funcs:
        print(f.__name__, end=' ')
        tres = %timeit -o tuple(f())
        res[f.__name__] = tres.average
    return res
def plot(results):
    %matplotlib inline
    import matplotlib.pyplot as plt
    import matplotlib
    plt.figure(figsize=(10, 10))
    matplotlib.rcParams.update({'font.size': 18})
    leg = []
    for k, v in results.items():
        x, y = zip(*sorted(v.items()))
        plt.plot(x, [i * 10 ** 6 for i in y])
        leg.append(k)
    plt.legend(leg, loc='upper left')
    plt.ylabel('μs')
    plt.show()
full_res = defaultdict(dict)
for degree in range(2, 7):
    print('Benchmark <=10^%i' % degree)
    for name, t in benchmark(funcs, [random.randint(1, 10 ** degree) for _ in range(10)]).items():
        full_res[name][degree] = t

# you can view the results if you run it inside a jupyter notebook
# just uncomment the following line
# plot(full_res)
measured on Lenovo ThinkPad E480
Inspired by Agile_Eagle's answer
def two_powers(num):
    powers = []
    while num > 0:
        rest = num & (num - 1)
        powers.append(num - rest)
        num = rest
    return powers
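Here num & (num - 1) clears the lowest set bit and num - rest recovers exactly that bit, so the powers come out in ascending order:

print(two_powers(42))
# [2, 8, 32]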