What is the fastest possible way to run:
reduce(lambda x, y: x @ y, ls)
in Python?
for a list of matrices ls. I don't have an Nvidia GPU, but I do have a lot of CPU cores to work with. I thought I could make the process work in parallel (split it into log iterations), but it seems that for small (1000x1000) matrices this is actually worse. Here is the code I tried:
from multiprocessing import Pool
import numpy as np
from itertools import zip_longest
def matmul(x):
    # x is a pair (a, b); b may be None when the list has odd length
    if x[1] is None:
        return x[0]
    return x[1] @ x[0]

def fast_mul(ls):
    while True:
        n = len(ls)
        if n == 0:
            raise Exception("Splitting Error")
        if n == 1:
            return ls[0]
        if n == 2:
            return ls[1] @ ls[0]
        # multiply adjacent pairs in parallel, halving the list each pass
        with Pool(processes=(n // 2 + 1)) as pool:
            ls = pool.map(matmul, list(zip_longest(*[iter(ls)] * 2)))
There is a function to do this: np.linalg.multi_dot, supposedly optimized for the best evaluation order:
np.linalg.multi_dot(ls)
In fact the docs say something very close to your original phrasing:
Think of multi_dot as:
def multi_dot(arrays):
    return functools.reduce(np.dot, arrays)
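For intuition about why the evaluation order matters, here is a minimal sketch (shapes picked arbitrarily for illustration):
import numpy as np
A = np.random.rand(10, 100)
B = np.random.rand(100, 1000)
C = np.random.rand(1000, 5)
# (A @ B) @ C costs about 10*100*1000 + 10*1000*5 = 1,050,000 scalar multiplications,
# while A @ (B @ C) costs about 100*1000*5 + 10*100*5 = 505,000.
# multi_dot picks the cheaper parenthesization automatically:
np.linalg.multi_dot([A, B, C])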
You could also try np.einsum, which will allow you to multiply up to 25 matrices:
from string import ascii_lowercase
ls = [...]
index = ','.join(ascii_lowercase[x:x + 2] for x in range(len(ls)))
index += f'->{index[0]}{index[-1]}'
np.einsum(index, *ls)
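For instance, with three matrices the code above builds the subscripts string:
# index == 'ab,bc,cd->ad'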
Timing
Simple case:
ls = np.random.rand(100, 1000, 1000) - 0.5
%timeit reduce(lambda x, y: x @ y, ls)
4.3 s ± 76.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
4.35 s ± 84.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
4.86 s ± 68.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
5.24 s ± 66.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
More complicated case:
ls = [x.T if i % 2 else x for i, x in enumerate(np.random.rand(100, 2000, 500) - 0.5)]
%timeit reduce(lambda x, y: x @ y, ls)
7.94 s ± 96.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
7.91 s ± 33.7 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
9.38 s ± 111 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
2.03 s ± 52.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Notice that the up-front work done by multi_dot has negative benefit in the straightforward case (and, more surprisingly, the lambda works faster than the raw operator), but saves 75% of the time in the less straightforward case.
So just for completeness, here is a less extreme (and smaller) non-square case:
ls = [x.T if i % 2 else x for i, x in enumerate(np.random.rand(100, 400, 300) - 0.5)]
%timeit reduce(lambda x, y: x @ y, ls)
245 ms ± 8.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.matmul, ls)
245 ms ± 12.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit reduce(np.dot, ls)
284 ms ± 12.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit np.linalg.multi_dot(ls)
638 ms ± 12.4 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
So really, it seems that for most general cases your original reduce call is about as good as you are going to get. My only suggestion would be to use operator.matmul instead of the lambda.
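For example (a minimal sketch):
from functools import reduce
from operator import matmul

result = reduce(matmul, ls)  # same as reduce(lambda x, y: x @ y, ls), without the per-call lambda overhead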
EDIT: Threw in yet another possible function
EDIT: I added the results with np.linalg.multi_dot, expecting it would be faster than the rest, but actually it is much slower somehow. I suppose it is designed with other kinds of use cases in mind.
I'm not sure you will be able to get much faster than that. Here are a few different implementations of the reduction for the case where the data is a 3D array of square matrices:
from multiprocessing import Pool
from functools import reduce
import numpy as np
import numba as nb
def matmul_n_naive(data):
    return reduce(np.matmul, data)

# Pairwise ("binary tree") reduction done in place over strided views.
# If you don't care about modifying data pass copy=False
def matmul_n_binary(data, copy=True):
    if len(data) < 1:
        raise ValueError
    data = np.array(data, copy=copy)
    n, r, c = data.shape
    dt = data.dtype
    s = 1
    while (n + s - 1) // s > 1:
        a = data[:n - s:2 * s]
        b = data[s:n:2 * s]
        np.matmul(a, b, out=a)
        s *= 2
    return np.array(a[0])

# Pairwise reduction where each level of the tree is farmed out to a process pool
def matmul_n_pool(data):
    if len(data) < 1:
        raise ValueError
    lst = data
    with Pool() as pool:
        while len(lst) > 1:
            lst_next = pool.starmap(np.matmul, zip(lst[::2], lst[1::2]))
            if len(lst) % 2 != 0:
                lst_next.append(lst[-1])
            lst = lst_next
    return lst[0]

@nb.njit(parallel=False)
def matmul_n_numba_nopar(data):
    res = np.eye(data.shape[1], data.shape[2], dtype=data.dtype)
    for i in nb.prange(len(data)):
        res = res @ data[i]
    return res

@nb.njit(parallel=True)
def matmul_n_numba_par(data):
    res = np.eye(data.shape[1], data.shape[2], dtype=data.dtype)
    for i in nb.prange(len(data)):  # Numba knows how to do parallel reductions correctly
        res = res @ data[i]
    return res

def matmul_n_multidot(data):
    return np.linalg.multi_dot(data)
And a test:
# Test
import numpy as np
np.random.seed(0)
a = np.random.rand(10, 100, 100) * 2 - 1
b1 = matmul_n_naive(a)
b2 = matmul_n_binary(a)
b3 = matmul_n_pool(a)
b4 = matmul_n_numba_nopar(a)
b5 = matmul_n_numba_par(a)
b6 = matmul_n_multidot(a)
print(np.allclose(b1, b2))
# True
print(np.allclose(b1, b3))
# True
print(np.allclose(b1, b4))
# True
print(np.allclose(b1, b5))
# True
print(np.allclose(b1, b6))
# True
Here are some benchmarks. It seems there is no consistent winner, but the "naive" solution is pretty good all around; the binary and Numba variants vary, the process pool is not really good, and np.linalg.multi_dot does not seem to be very advantageous with square matrices.
import numpy as np
# 10 matrices 1000x1000
np.random.seed(0)
a = np.random.rand(10, 1000, 1000) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 121 ms ± 6.09 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_binary(a)
# 165 ms ± 3.68 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_numba_nopar(a)
# 108 ms ± 510 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit matmul_n_numba_par(a)
# 244 ms ± 7.66 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_multidot(a)
# 132 ms ± 2.41 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
# 200 matrices 100x100
np.random.seed(0)
a = np.random.rand(200, 100, 100) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 4.4 ms ± 226 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_binary(a)
# 13.4 ms ± 299 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_numba_nopar(a)
# 9.51 ms ± 126 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_numba_par(a)
# 4.93 ms ± 146 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit matmul_n_multidot(a)
# 1.14 s ± 22.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 300 matrices 10x10
np.random.seed(0)
a = np.random.rand(300, 10, 10) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 526 µs ± 953 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_binary(a)
# 152 µs ± 508 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit matmul_n_pool(a)
# 610 ms ± 5.93 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_numba_nopar(a)
# 239 µs ± 1.1 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_numba_par(a)
# 175 µs ± 422 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
%timeit matmul_n_multidot(a)
# 3.68 s ± 87 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# 1000 matrices 10x10
np.random.seed(0)
a = np.random.rand(1000, 10, 10) * 0.1 - 0.05
%timeit matmul_n_naive(a)
# 1.56 ms ± 4.49 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_binary(a)
# 392 µs ± 790 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_pool(a)
# 727 ms ± 12.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit matmul_n_numba_nopar(a)
# 589 µs ± 356 ns per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_numba_par(a)
# 451 µs ± 1.68 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
%timeit matmul_n_multidot(a)
# Never finished...
I'm learning Python by myself. I made a function for filling a list, but I have two variants, and I want to find out which one is better and why, or whether they are both awful. Either way, I want to know the truth.
def foo(x):
    l = [0] * x
    for i in range(x):
        l[i] = i
    return l

def foo1(x):
    l = []
    for i in range(x):
        l.append(i)
    return l
From a performance perspective, the first version, foo, is better:
%timeit foo(1000000)
# 52.4 ms ± 1.99 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit foo1(1000000)
# 67.2 ms ± 916 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
but the Pythonic way to unpack an iterator into a list is:
list(range(x))
which is also faster:
%timeit list(range(1000000))
# 26.7 ms ± 661 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
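For completeness, a list comprehension is another common idiom (a sketch with a hypothetical name foo2, not timed above); on CPython it is usually at least as fast as the explicit loops, though still slower than list(range(x)):
def foo2(x):
    return [i for i in range(x)]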
When I run this parallel dask.bag code below, I seem to get much slower computation than the sequential Python code. Any insights into why?
import dask.bag as db
def is_even(x):
    return not x % 2
Dask code:
%%timeit
b = db.from_sequence(range(2000000))
c = b.filter(is_even).map(lambda x: x ** 2)
c.compute()
>>> 12.8 s ± 1.15 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
# With n = 8000000
>>> 50.7 s ± 2.76 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
Python code:
%%timeit
b = list(range(2000000))
b = list(filter(is_even, b))
b = list(map(lambda x: x ** 2, b))
>>> 547 ms ± 8.8 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
# With n = 8000000
>>> 2.25 s ± 102 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Thanks to @abarnert for the suggestion to look at the overhead by using a longer task length.
It seems like the length of each task was too short, and the overhead made Dask slower. I changed the exponent from 2 to 10000 to make each task longer. This example produces what I was expecting:
Python code:
%%timeit
b = list(range(50000))
b = list(filter(is_even, b))
b = list(map(lambda x: x ** 10000, b))
>>> 34.8 s ± 2.19 s per loop (mean ± std. dev. of 7 runs, 1 loop each)
Dask code:
%%timeit
b = db.from_sequence(range(50000))
c = b.filter(is_even).map(lambda x: x ** 10000)
c.compute()
>>> 26.4 s ± 409 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
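Another knob that may help in the small-task case (a sketch, not benchmarked here; the value npartitions=8 is an arbitrary choice) is to give each Dask partition more elements so the per-task overhead is amortized; db.from_sequence accepts npartitions / partition_size for this:
import dask.bag as db

b = db.from_sequence(range(2000000), npartitions=8)  # fewer, larger partitions -> less scheduling overhead
c = b.filter(is_even).map(lambda x: x ** 2)
c.compute()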
I have the following code in which I have given i as an argument to count(), but I am still getting TypeError: count() takes at least 1 argument (0 given).
def is_isogram(s: str):
    for i in s:
        print(i)
        if (str.count(i) > 1):  # specified argument, still getting error
            return False
    return True
str.count(i) > 1 should be s.count(i) > 1; this will fix your error.
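In full, a minimal corrected version of your function:
def is_isogram(s: str):
    for i in s:
        if s.count(i) > 1:
            return False
    return True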
You could also use for i in set(s): or collections.Counter to solve your task more efficiently:
>>> from collections import Counter
>>> s = 'abbc'
>>> Counter(s).most_common(1)[0][1] == 1
False
Timings:
s = 'abcdefghijklmnopqrstuvwxyzz' # a worst case?
%timeit Counter(s).most_common(1)[0][1] == 1
13.2 µs ± 27.7 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
%timeit len(set(s)) == len(s)
1.33 µs ± 3.46 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%%timeit
for i in set(s):
if s.count(i) > 1:
break
1.72 µs ± 17.5 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%%timeit
for i in s:
if s.count(i) > 1:
break
6.78 µs ± 14.1 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
For example
>>> two_powers(42)
(2, 8, 32)
My current naive implementation (taken from here) looks like this:
def two_powers(num):
    return tuple(2 ** i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')
But I hope there are faster ways to do this.
Try this:
def two_powers(num):
    powers = []
    while num != 0:
        powers.append(num & -num)
        num = num & (num - 1)
    return powers
Your solution is actually good, with one small (but big!) efficiency detail:
Use
1 << i
(bitwise shift) instead of
2 ** i
So, following your approach, consider the following:
def two_powers(num):
    return set(1 << i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

print(two_powers(42))
You can make use of log2 and a generator, yielding the largest remaining power of two until the number runs out.
import math
def two_powers(num):
    while num > 0:
        power = int(math.log(num, 2))
        yield 2 ** power
        num = num - 2 ** power
Sample run:
>>> tuple(two_powers(42))
(32, 8, 2)
>>> tuple(two_powers(43))
(32, 8, 2, 1)
You can do this:
import math
def two_powers(num):
    # Compute number of bits for big numbers
    num_bits = math.floor(math.log2(num)) + 1 if num >= (1 << 32) else 32
    # Take those bits where there is a "one" in the number
    return [1 << p for p in range(num_bits) if num & (1 << p)]

print(two_powers(42))
# [2, 8, 32]
EDIT: Regarding the number of bits, you can make more splits if you are really concerned about performance, either down to save iterations or up to avoid computing the logarithm (or if you know your input numbers are going to be in some particular range):
import math
def two_powers(num):
    # Compute number of bits for big numbers
    if num < (1 << 8):
        num_bits = 8
    elif num < (1 << 16):
        num_bits = 16
    elif num < (1 << 24):
        num_bits = 24
    elif num < (1 << 32):
        num_bits = 32
    else:
        num_bits = math.floor(math.log2(num)) + 1
    # Take those bits where there is a "one" in the number
    return [1 << p for p in range(num_bits) if num & (1 << p)]

print(two_powers(42))
# [2, 8, 32]
You can use a generator with a shifting bit mask:
def two_powers(n):
    m = 1
    while n >= m:
        if n & m:
            yield m
        m <<= 1
So that:
tuple(two_powers(42))
would be:
(2, 8, 32)
>>> n=42
>>> {1<<i for i,d in enumerate(reversed(bin(n)[2:])) if d=='1'}
{8, 2, 32}
Attn: late posters: please feel free to run this small benchmark
including your code, and amend the results accordingly. You will need
to re-run the tests for everyone, since hardware differences will affect the numbers.
Gentlemen, here are your scores:
The winner is @tsionyx
timeit:
Benchmark <=10^2
[97, 48, 31, 39, 33, 69, 71, 21, 50, 17]
two_powers_op_____ 17.8 µs ± 1.2 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_jdehesa 15.1 µs ± 888 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_matina_ 14.6 µs ± 755 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_agile_e 7.87 µs ± 524 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 25.8 µs ± 1.29 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 12 µs ± 1.19 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 11.5 µs ± 566 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 5.77 µs ± 57.9 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^3
[682, 124, 42, 275, 743, 837, 474, 186, 739, 290]
two_powers_op_____ 22.1 µs ± 710 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 17.9 µs ± 829 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_matina_ 17.6 µs ± 881 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_agile_e 12.7 µs ± 763 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 49.2 µs ± 3.85 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 18.1 µs ± 2.56 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 19.2 µs ± 2.79 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 10.4 µs ± 1.14 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^4
[4641, 5675, 3355, 4746, 9948, 5192, 3446, 7174, 1683, 7611]
two_powers_op_____ 30.8 µs ± 3.36 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 22.2 µs ± 2.37 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 21.7 µs ± 1.13 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 17.5 µs ± 2.46 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 64.3 µs ± 12.1 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 18.5 µs ± 1.24 µs per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_blhsing 19.2 µs ± 193 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_tsionyx 11.6 µs ± 43.4 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^5
[20885, 23810, 25330, 32967, 34183, 16847, 54905, 85767, 37069, 32379]
two_powers_op_____ 32.8 µs ± 1.76 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 24.2 µs ± 534 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 27.1 µs ± 2.99 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 18.7 µs ± 246 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
two_powers_taras__ 68.9 µs ± 3.16 µs per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 20.6 µs ± 486 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_blhsing 22.7 µs ± 883 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_tsionyx 14.2 µs ± 244 ns per loop (mean ± std. dev. of 7 runs, 100000 loops each)
Benchmark <=10^6
[182928, 93105, 710309, 926572, 859733, 818327, 654197, 829750, 358363, 946684]
two_powers_op_____ 40.6 µs ± 236 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_jdehesa 28.2 µs ± 310 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_matina_ 27.9 µs ± 936 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_agile_e 23.8 µs ± 364 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_taras__ 89.9 µs ± 406 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_sunitha 24.4 µs ± 493 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_blhsing 26.6 µs ± 366 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
two_powers_tsionyx 19.3 µs ± 95.2 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
code used:
import functools
import math
import random
from collections import defaultdict
def two_powers_op_____(num):
    return tuple(2 ** i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

def two_powers_jdehesa(num):
    num_bits = math.floor(math.log2(num)) + 1
    return [1 << p for p in range(num_bits) if num & (1 << p)]

def two_powers_matina_(num):
    return set(1 << i for i, j in enumerate(bin(num)[-1: 1: -1]) if j == '1')

def two_powers_agile_e(num):
    powers = []
    while num != 0:
        powers.append(num & -num)
        num = num & (num - 1)
    return powers

def _two_powers_taras(num):
    while num > 0:
        power = int(math.log(num, 2))
        yield 2 ** power
        num = num - 2 ** power

def two_powers_taras__(num):
    return tuple(_two_powers_taras(num))

def two_powers_sunitha(num):
    return {1 << i for i, d in enumerate(reversed(bin(num)[2:])) if d == '1'}

def _two_powers_blhsing(n):
    m = 1
    while n >= m:
        if n & m:
            yield m
        m <<= 1

def two_powers_blhsing(n):
    return tuple(_two_powers_blhsing(n))

def two_powers_tsionyx(num):
    powers = []
    while num > 0:
        rest = num & (num - 1)
        powers.append(num - rest)
        num = rest
    return powers

funcs = [
    two_powers_op_____,
    two_powers_jdehesa,
    two_powers_matina_,
    two_powers_agile_e,
    two_powers_taras__,
    two_powers_sunitha,
    two_powers_blhsing,
    two_powers_tsionyx,
]

# ================== UTILITY FUNCTIONS ======================= #

def _partial_map(f, vals):
    """Run function on a range of inputs as a single function"""
    p = functools.partial(map, f, vals)
    p.__name__ = f.__name__
    return p

def _sanity_check(f, n):
    factors = f(n)
    assert len(factors) > 0
    # factors are unique
    assert len(set(factors)) == len(factors)
    assert sum(factors) == n
    for f in factors:
        b = bin(f)
        assert b == '0b1' + '0' * (len(b) - 3)

def benchmark(fs, inputs):
    for f in fs:
        for n in inputs:
            _sanity_check(f, n)
    aggr_funcs = [_partial_map(f, inputs) for f in fs]
    res = dict()
    print(inputs)
    for f in aggr_funcs:
        print(f.__name__, end=' ')
        tres = %timeit -o tuple(f())
        res[f.__name__] = tres.average
    return res

def plot(results):
    %matplotlib inline
    import matplotlib.pyplot as plt
    import matplotlib
    plt.figure(figsize=(10, 10))
    matplotlib.rcParams.update({'font.size': 18})
    leg = []
    for k, v in results.items():
        x, y = zip(*sorted(v.items()))
        plt.plot(x, [i * 10 ** 6 for i in y])
        leg.append(k)
    plt.legend(leg, loc='upper left')
    plt.ylabel('μs')
    plt.show()

full_res = defaultdict(dict)
for degree in range(2, 7):
    print('Benchmark <=10^%i' % degree)
    for name, t in benchmark(funcs, [random.randint(1, 10 ** degree) for _ in range(10)]).items():
        full_res[name][degree] = t

# you can view the results if you run it inside a jupyter notebook
# just uncomment the following line
# plot(full_res)
measured on Lenovo ThinkPad E480
Inspired by Agile_Eagle's answer
def two_powers(num):
    powers = []
    while num > 0:
        rest = num & (num - 1)
        powers.append(num - rest)
        num = rest
    return powers