I was told that using lambda functions saves computation time, but it's not so clear. Look at this example:
import numpy as np
import timeit

def f_with_lambda():
    a = np.array(range(5))
    b = np.array(range(5))
    A, B = np.meshgrid(a, b)
    rst = list(map(lambda x, y: x + y, A, B))
    return np.array(rst)

def f_with_for():
    a = range(5)
    b = np.array(range(5))
    rst = [b + x for x in a]
    return np.array(rst)

lambda_rst = f_with_lambda()
for_rst = f_with_for()

if __name__ == '__main__':
    print(timeit.timeit("f_with_lambda()", setup="from __main__ import f_with_lambda", number=10000))
    print(timeit.timeit("f_with_for()", setup="from __main__ import f_with_for", number=10000))
The result is simple:
- lambda function with timeit: 0.3514268280014221 s
- for loop: 0.10633227700236603 s
How do I write my lambda version to be competitive? I noticed that the list() call needed to pull the results out of the map object costs a lot of time. Is there another way to proceed? The meshgrid call is certainly not optimal either...
Every tip is welcome!
Considering the remark about the list:
import numpy as np
import timeit

def f_with_lambda():
    A, B = np.meshgrid(range(150), range(150))
    return np.array(map(lambda x, y: x + y, A, B))

def f_with_for():
    return np.array([np.array(range(150)) + x for x in range(150)])

if __name__ == '__main__':
    print(timeit.timeit("f_with_lambda()", setup="from __main__ import f_with_lambda", number=10000))
    print(timeit.timeit("f_with_for()", setup="from __main__ import f_with_for", number=10000))
It changes a lot of things. This time (lambda vs. for):
for 5:
0.30227499100146815 vs 0.2510572589999356 (quite similar)
for 150:
0.6687559890015109 vs 20.31807473200024 ( :) :) :) ) !! Great job! Thank you!
Memory allocation takes time (it may involve a call into the OS, which can be delayed).
In the lambda version you allocate a, b, the meshgrid arrays, and rst (both the list and the array), plus the returned array.
In the for version you allocate b and rst, plus the returned array; a is a range object, so it takes almost no time or memory to create.
This is why your function using lambda is slower.
Also, don't build a list from the result of numpy-array operations just to cast it back to a numpy array.
Just removing the list() makes it faster (from 0.9 to 0.4):
def f_with_lambda():
    a = np.array(range(SIZE))
    b = np.array(range(SIZE))
    A, B = np.meshgrid(a, b)
    rst = map(lambda x, y: x + y, A, B)
    return np.array(rst)
See https://stackoverflow.com/a/46470401/9453926 for speed comparison.
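A caveat worth flagging here (an editor's note, not part of the original answer): on Python 3, np.array does not consume a map object at all; it wraps the iterator in a 0-dimensional object array, so most of the measured speedup comes from the addition never actually being evaluated. A minimal sketch:

import numpy as np

A, B = np.meshgrid(range(150), range(150))

m = np.array(map(lambda x, y: x + y, A, B))
print(m.shape, m.dtype)   # () object -- the map was never evaluated

m2 = np.array(list(map(lambda x, y: x + y, A, B)))
print(m2.shape)           # (150, 150) -- list() forces evaluation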
I compacted the code:
import numpy as np
import timeit

def f_with_lambda():
    A, B = np.meshgrid(range(150), range(150))
    return np.array(list(map(lambda x, y: x + y, A, B)))

def f_with_for():
    return np.array([np.array(range(150)) + x for x in range(150)])

if __name__ == '__main__':
    print(timeit.timeit("f_with_lambda()", setup="from __main__ import f_with_lambda", number=10000))
    print(timeit.timeit("f_with_for()", setup="from __main__ import f_with_for", number=10000))
This time, for a 5x5 grid, the result is (lambda vs. for):
0.38113487999726203 vs 0.24913009200099623
and with 150 it's better:
2.680842614001449 vs 20.176408246999927
But I found no way to fold the meshgrid into the lambda function, and the list conversion before building the array is a pity as well.
I took time to integrate the last remark from politinsa:
import numpy as np
import timeit

def f_with_lambda():
    A, B = np.meshgrid(range(150), range(150))
    return np.array(list(map(lambda x, y: x + y, A, B)))

def f_with_for():
    return np.array([np.array(range(150)) + x for x in range(150)])

def f_with_lambda_nolist():
    A, B = np.meshgrid(range(150), range(150))
    return np.array(map(lambda x, y: x + y, A, B))

if __name__ == '__main__':
    print(timeit.timeit("f_with_lambda()", setup="from __main__ import f_with_lambda", number=10000))
    print(timeit.timeit("f_with_for()", setup="from __main__ import f_with_for", number=10000))
    print(timeit.timeit("f_with_lambda_nolist()", setup="from __main__ import f_with_lambda_nolist", number=10000))
The results are:
- f_with_lambda: 2.4421722999977646 s
- f_with_for: 18.75847979998798 s
- f_with_lambda_nolist: 0.6800016999914078 s -> the list conversion has (as explained) a real impact on memory allocation
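As an aside (my addition, not from the original thread): every variant above can be beaten by letting numpy do the whole computation with broadcasting, with no Python-level loop or map at all, which also sidesteps the 0-d object-array pitfall noted earlier:

import numpy as np

def f_with_broadcast():
    a = np.arange(150)
    return a[None, :] + a[:, None]  # (150, 150) table of pairwise sums

The addition then runs entirely in compiled code, so it should be substantially faster than any of the timed versions.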
I want to take the logarithm multiple times. We know how to do it once:
import numpy as np
np.log(x)
Now the second logarithm would be:
np.log(np.log(x))
What if one wants to take n logs? Surely it would not be Pythonic to nest the call n times as above.
As per @eugenhu's suggestion, one way is to use a generic function which applies f iteratively:
import numpy as np

def repeater(f, n):
    def fn(i):
        result = i
        for _ in range(n):
            result = f(result)
        return result
    return fn

repeater(np.log, 5)(x)
You could use the following little trick:
>>> from functools import reduce
>>>
>>> k = 4
>>> x = 1e12
>>>
>>> y = np.array(x)
>>> reduce(np.log, (k+1) * (y,))[()]
0.1820258315495139
This works because the second positional argument of a unary ufunc like np.log is its out parameter, so each step of the reduce writes the result in place into y; the trailing [()] just extracts the scalar from the 0-d array. And back:
>>> reduce(np.exp, (k+1) * (y,))[()]
999999999999.9813
On my machine this is slightly faster than @jp_data_analysis's approach:
>>> def f_pp(ufunc, x, k):
... y = np.array(x)
... return reduce(ufunc, (k+1) * (y,))[()]
...
>>> x = 1e12
>>> k = 5
>>>
>>> from timeit import repeat
>>> kwds = dict(globals=globals(), number=100000)
>>>
>>> repeat('repeater(np.log, 5)(x)', **kwds)
[0.5353733809897676, 0.5327484680456109, 0.5363518510130234]
>>> repeat('f_pp(np.log, x, 5)', **kwds)
[0.4512511100037955, 0.4380568229826167, 0.45331112697022036]
To be fair, their approach is more flexible. Mine uses quite specific properties of unary ufuncs and numpy arrays.
Larger k is also possible. For that we need to make sure that x is complex, because np.log will not switch to complex automatically.
>>> x = 1e12+0j
>>> k = 50
>>>
>>> f_pp(np.log, x, 50)
(0.3181323483680859+1.3372351153002153j)
>>> f_pp(np.exp, _, 50)
(1000000007040.9696+6522.577629950761j)
# not that bad, all things considered ...
>>>
>>> repeat('f_pp(np.log, x, 50)', **kwds)
[4.272890724008903, 4.266964592039585, 4.270542044949252]
>>> repeat('repeater(np.log, 50)(x)', **kwds)
[5.799160094989929, 5.796761817007791, 5.80835147597827]
From this post, you can compose functions:
Code
import itertools as it
import functools as ft
import numpy as np
def compose(f, g):
    return lambda x: f(g(x))
identity = lambda x: x
Demo
ft.reduce(compose, it.repeat(np.log, times=2), identity)(10)
# 0.83403244524795594
ft.reduce(compose, it.repeat(np.log, times=3), identity)(10)
# -0.18148297420509205
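A quick sanity check (my addition) that the composed function matches the nested calls:

import functools as ft
import itertools as it
import numpy as np

def compose(f, g):
    return lambda x: f(g(x))

log3 = ft.reduce(compose, it.repeat(np.log, times=3), lambda x: x)
assert np.isclose(log3(10), np.log(np.log(np.log(10))))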
Assume that you have a list with an arbitrary number of items, and you wish to count the items that match a specific condition. I thought of two sensible ways to do this, but I am not sure which one is best (more Pythonic), or whether there is a better option (without sacrificing too much readability).
import numpy.random as nprnd
import timeit

my = nprnd.randint(1000, size=1000000)

def with_len(my_list):
    much = len([t for t in my_list if t >= 500])

def with_sum(my_list):
    many = sum(1 for t in my_list if t >= 500)
t1 = timeit.Timer('with_len(my)', 'from __main__ import with_len, my')
t2 = timeit.Timer('with_sum(my)', 'from __main__ import with_sum, my')
print("with len:",t1.timeit(1000)/1000)
print("with sum:",t2.timeit(1000)/1000)
Performance is almost identical between these two cases. However, which of these is more pythonic? Or is there a better alternative?
For those who are curious, I tested the proposed solutions (from comments and answers) and these are the results:
import numpy as np
import timeit
import functools

my = np.random.randint(1000, size=100000)

def with_len(my_list):
    return len([t for t in my_list if t >= 500])

def with_sum(my_list):
    return sum(1 for t in my_list if t >= 500)

def with_sum_alt(my_list):
    return sum(t >= 500 for t in my_list)

def with_lambda(my_list):
    return functools.reduce(lambda a, b: a + (1 if b >= 500 else 0), my_list, 0)

def with_np(my_list):
    return len(np.where(my_list >= 500)[0])
t1 = timeit.Timer('with_len(my)', 'from __main__ import with_len, my')
t2 = timeit.Timer('with_sum(my)', 'from __main__ import with_sum, my')
t3 = timeit.Timer('with_sum_alt(my)', 'from __main__ import with_sum_alt, my')
t4 = timeit.Timer('with_lambda(my)', 'from __main__ import with_lambda, my')
t5 = timeit.Timer('with_np(my)', 'from __main__ import with_np, my')
print("with len:", t1.timeit(1000)/1000)
print("with sum:", t2.timeit(1000)/1000)
print("with sum_alt:", t3.timeit(1000)/1000)
print("with lambda:", t4.timeit(1000)/1000)
print("with np:", t5.timeit(1000)/1000)
Python 2.7
('with len:', 0.02201753337348283)
('with sum:', 0.022727363518455238)
('with sum_alt:', 0.2370256687439941) # <-- very slow!
('with lambda:', 0.026367264818657078)
('with np:', 0.0005811764306089913) # <-- very fast!
Python 3.6
with len: 0.017649643657480736
with sum: 0.0182978007766851
with sum_alt: 0.19659815740239048
with lambda: 0.02691670741400111
with np: 0.000534095418615152
The second one, with_sum, is more Pythonic in the sense that it uses much less memory: it doesn't build the whole list, because the generator expression is fed directly to sum().
I'm with @Chris_Rands. But as far as performance is concerned, there is a faster way using numpy:
import numpy as np

def with_np(my_list):
    return len(np.where(my_list >= 500)[0])
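As an aside (an editor's note, not in the original answer): numpy also has more direct counting idioms that avoid materializing the index array that np.where builds. A small sketch:

import numpy as np

my = np.random.randint(1000, size=100000)

print(np.count_nonzero(my >= 500))  # counts True values directly
print((my >= 500).sum())            # sums the boolean mask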
I have a multidimensional array (result) that should be filled by some nested loops. The function fun() is complex and time-consuming. I want to fill the array elements in parallel, so I can use all of my system's processing power.
Here's the code:
import numpy as np

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

dim1 = 10
dim2 = 20
dim3 = 30

result = np.zeros([dim1, dim2, dim3])

for i in xrange(dim1):
    for j in xrange(dim2):
        for k in xrange(dim3):
            result[i, j, k] = fun(i, j, k)
My question is: can I parallelize this code, and if so, how?
I'm using Windows 10 64-bit and Python 2.7.
Please provide your solution by changing my code if you can.
Thanks!
If you want a more general solution, taking advantage of fully parallel execution, then why not use something like this:
>>> import multiprocess as mp
>>> p = mp.Pool()
>>>
>>> # a time consuming function taking x,y,z,...
>>> def fun(*args):
... import time
... time.sleep(.1)
... return sum(*args)
...
>>> dim1, dim2, dim3 = 10, 20, 30
>>> import itertools
>>> input = itertools.product(xrange(dim1), xrange(dim2), xrange(dim3))  # full (i,j,k) grid
>>> results = p.map(fun, input)
>>> p.close()
>>> p.join()
>>>
>>> results[:2]
[0, 1]
>>> results[-2:]
[56, 57]
Note I'm using multiprocess instead of multiprocessing, but that's only to get the ability to work in the interpreter.
I didn't use a numpy.array, but if you had to... you could dump the output from p.map directly into a numpy.array and then set the shape attribute to (dim1, dim2, dim3), or you could do something like this:
>>> input = itertools.product(xrange(dim1), xrange(dim2), xrange(dim3))
>>> import numpy as np
>>> results = np.empty(dim1*dim2*dim3)
>>> res = p.imap(fun, input)
>>> for i,r in enumerate(res):
... results[i] = r
...
>>> results.shape = (dim1,dim2,dim3)
Here is a version of the code that runs fun(i, j, k) in parallel for different k indices. This is done by running fun in different processes using the multiprocessing module (https://docs.python.org/2/library/multiprocessing.html):
import numpy as np
from multiprocessing import Pool

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

def fun_wrapper(indices):
    # unpack the index tuple and return fun's value (the original was missing the return)
    return fun(*indices)

if __name__ == '__main__':
    dim1 = 10
    dim2 = 20
    dim3 = 30

    result = np.zeros([dim1, dim2, dim3])

    pool = Pool(processes=8)
    for i in xrange(dim1):
        for j in xrange(dim2):
            result[i, j] = pool.map(fun_wrapper, [(i, j, k) for k in xrange(dim3)])
This is not the most elegant solution, but you may start with it. And you will get a speed-up only if fun contains time-consuming computation.
A simple approach could be to divide the array into sections and create some threads to operate through those sections. For example, one section from (0,0,0) to (5,10,15) and another from (5,10,16) to (10,20,30). You can use the threading module and do something like this:
import numpy as np
import threading as t

def fun(x, y, z):
    # time-consuming computation...
    # ...
    return output

dim1 = 10
dim2 = 20
dim3 = 30

result = np.zeros([dim1, dim2, dim3])

# b - beginning index, e - end index
def work(ib, jb, kb, ie, je, ke):
    for i in xrange(ib, ie):
        for j in xrange(jb, je):
            for k in xrange(kb, ke):
                result[i, j, k] = fun(i, j, k)

threads = list()
threads.append(t.Thread(target=work, args=(0, 0, 0, dim1/2, dim2/2, dim3/2)))
threads.append(t.Thread(target=work, args=(dim1/2, dim2/2, dim3/2 + 1, dim1, dim2, dim3)))

for thread in threads:
    thread.start()
for thread in threads:
    thread.join()  # wait for both sections to finish
You can define these sections through some algorithm and determine the number of threads dynamically. Hope it helps you, or at least gives you some ideas.
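An editor's aside, not part of the original answer: for a CPU-bound, pure-Python fun, CPython's GIL prevents threads from executing bytecode on several cores at once, so the section-splitting idea usually pays off only with processes. A minimal hedged sketch of the same idea using multiprocessing, splitting along the first axis (it assumes fun is picklable; the fun body below is a placeholder):

import numpy as np
from multiprocessing import Pool

dim1, dim2, dim3 = 10, 20, 30

def fun(x, y, z):
    return x + y + z  # placeholder for the real time-consuming computation

def work(i):
    # compute the whole (dim2, dim3) slice belonging to one i
    return [[fun(i, j, k) for k in range(dim3)] for j in range(dim2)]

if __name__ == '__main__':
    pool = Pool()
    result = np.array(pool.map(work, range(dim1)))  # shape: (dim1, dim2, dim3)
    pool.close()
    pool.join()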
Using Python's timeit module, I want to check how much time it takes to print the following. How do I do so?
import timeit
x = [x for x in range(10000)]
timeit.timeit("print x[9999]")
d=[{i:i} for i in x]
timeit.timeit("print d[9999]")
NameError: global name 'x' is not defined
NameError: global name 'd' is not defined
Per the docs:
To give the timeit module access to functions you define, you can pass a setup parameter which contains an import statement
In your case, that would be e.g.:
timeit.timeit('print d[9999]',
setup='from __main__ import d')
Here is an example of how you can do it:
import timeit

x = [x for x in range(10000)]
d = [{i: i} for i in x]

for i in [x, d]:
    t = timeit.timeit(stmt="print(i[9999])", number=100, globals=globals())
    print(f"took: {t:.4f}")
Output:
took: 0.0776
took: 0.0788
Please notice I added number=100, so each test runs 100 times; by default it's 1,000,000 times. (The globals parameter requires Python 3.5+.)
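For completeness, here is a sketch (my combination of the two answers above) that fixes the question's original snippet via the setup-import route, which also works on Pythons older than 3.5:

import timeit

x = [i for i in range(10000)]
d = [{i: i} for i in x]

print(timeit.timeit('print(x[9999])', setup='from __main__ import x', number=100))
print(timeit.timeit('print(d[9999])', setup='from __main__ import d', number=100))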
Is it possible to map a NumPy array in place? If yes, how?
Given a_values, a 2D array, this is the bit of code that does the trick for me at the moment:
for row in range(len(a_values)):
    for col in range(len(a_values[0])):
        a_values[row][col] = dim(a_values[row][col])
But it's so ugly that I suspect that somewhere within NumPy there must be a function that does the same with something looking like:
a_values.map_in_place(dim)
but if something like the above exists, I've been unable to find it.
It's only worth trying to do this in-place if you are under significant space constraints. If that's the case, it is possible to speed up your code a little bit by iterating over a flattened view of the array. Since reshape returns a new view when possible, the data itself isn't copied (unless the original has unusual structure).
I don't know of a better way to achieve bona fide in-place application of an arbitrary Python function.
>>> def flat_for(a, f):
... a = a.reshape(-1)
... for i, v in enumerate(a):
... a[i] = f(v)
...
>>> a = numpy.arange(25).reshape(5, 5)
>>> flat_for(a, lambda x: x + 5)
>>> a
array([[ 5, 6, 7, 8, 9],
[10, 11, 12, 13, 14],
[15, 16, 17, 18, 19],
[20, 21, 22, 23, 24],
[25, 26, 27, 28, 29]])
Some timings:
>>> a = numpy.arange(2500).reshape(50, 50)
>>> f = lambda x: x + 5
>>> %timeit flat_for(a, f)
1000 loops, best of 3: 1.86 ms per loop
It's about twice as fast as the nested loop version:
>>> a = numpy.arange(2500).reshape(50, 50)
>>> def nested_for(a, f):
... for i in range(len(a)):
... for j in range(len(a[0])):
... a[i][j] = f(a[i][j])
...
>>> %timeit nested_for(a, f)
100 loops, best of 3: 3.79 ms per loop
Of course vectorize is still faster, so if you can make a copy, use that:
>>> a = numpy.arange(2500).reshape(50, 50)
>>> g = numpy.vectorize(lambda x: x + 5)
>>> %timeit g(a)
1000 loops, best of 3: 584 us per loop
And if you can rewrite dim using built-in ufuncs, then please, please, don't vectorize:
>>> a = numpy.arange(2500).reshape(50, 50)
>>> %timeit a + 5
100000 loops, best of 3: 4.66 us per loop
numpy does operations like += in place, just as you might expect -- so you can get the speed of a ufunc with in-place application at no cost. Sometimes it's even faster! See here for an example.
By the way, my original answer to this question, which can be viewed in its edit history, is ridiculous, and involved vectorizing over indices into a. Not only did it have to do some funky stuff to bypass vectorize's type-detection mechanism, it turned out to be just as slow as the nested loop version. So much for cleverness!
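To make the in-place-ufunc point concrete, a tiny sketch (my addition):

import numpy as np

a = np.arange(25).reshape(5, 5)
a += 5                    # in-place: no temporary array replaces a
np.multiply(a, 2, out=a)  # an explicit out= writes the product back into a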
This is a write-up of contributions scattered in answers and comments, which I wrote after accepting the answer to the question. Upvotes are always welcome, but if you upvote this answer, please don't forget to also upvote those of senderle and (if (s)he writes one) eryksun, who suggested the methods below.
Q: Is it possible to map a numpy array in place?
A: Yes but not with a single array method. You have to write your own code.
Below is a script that compares the various implementations discussed in the thread:
import timeit
from numpy import array, arange, vectorize, rint

# SETUP
get_array = lambda side : arange(side**2).reshape(side, side) * 30
dim = lambda x : int(round(x * 0.67328))

# TIMER
def best(fname, reps, side):
    global a
    a = get_array(side)
    t = timeit.Timer('%s(a)' % fname,
                     setup='from __main__ import %s, a' % fname)
    return min(t.repeat(reps, 3))  # low number because in-place --> converges to 1

# FUNCTIONS
def mac(array_):
    for row in range(len(array_)):
        for col in range(len(array_[0])):
            array_[row][col] = dim(array_[row][col])

def mac_two(array_):
    li = range(len(array_[0]))
    for row in range(len(array_)):
        for col in li:
            array_[row][col] = int(round(array_[row][col] * 0.67328))

def mac_three(array_):
    for i, row in enumerate(array_):
        array_[i][:] = [int(round(v * 0.67328)) for v in row]

def senderle(array_):
    array_ = array_.reshape(-1)
    for i, v in enumerate(array_):
        array_[i] = dim(v)

def eryksun(array_):
    array_[:] = vectorize(dim)(array_)

def ufunc_ed(array_):
    multiplied = array_ * 0.67328
    array_[:] = rint(multiplied)

# MAIN
r = []
for fname in ('mac', 'mac_two', 'mac_three', 'senderle', 'eryksun', 'ufunc_ed'):
    print('\nTesting `%s`...' % fname)
    r.append(best(fname, reps=50, side=50))
    # The following is for visually checking that the functions return the same results
    tmp = get_array(3)
    eval('%s(tmp)' % fname)
    print(tmp)

tmp = min(r) / 100
print('\n===== ...AND THE WINNER IS... =========================')
print(' mac (as in question)      : %.4fms [%.0f%%]' % (r[0]*1000, r[0]/tmp))
print(' mac (optimised)           : %.4fms [%.0f%%]' % (r[1]*1000, r[1]/tmp))
print(' mac (slice-assignment)    : %.4fms [%.0f%%]' % (r[2]*1000, r[2]/tmp))
print(' senderle                  : %.4fms [%.0f%%]' % (r[3]*1000, r[3]/tmp))
print(' eryksun                   : %.4fms [%.0f%%]' % (r[4]*1000, r[4]/tmp))
print(' slice-assignment w/ ufunc : %.4fms [%.0f%%]' % (r[5]*1000, r[5]/tmp))
print('=======================================================\n')
The output of the above script - at least on my system - is:
mac (as in question) : 88.7411ms [74591%]
mac (optimised) : 86.4639ms [72677%]
mac (slice-assignment) : 79.8671ms [67132%]
senderle : 85.4590ms [71832%]
eryksun : 13.8662ms [11655%]
slice-assignment w/ ufunc : 0.1190ms [100%]
As you can observe, using numpy's ufunc increases speed by more than two and almost three orders of magnitude compared with the second-best and worst alternatives respectively.
If using ufunc is not an option, here's a comparison of the other alternatives only:
mac (as in question) : 91.5761ms [672%]
mac (optimised) : 88.9449ms [653%]
mac (slice-assignment) : 80.1032ms [588%]
senderle : 86.3919ms [634%]
eryksun : 13.6259ms [100%]
HTH!
Why not use numpy's own implementation, with the out= trick?
from numpy import multiply, round as np_round

def fmilo(array_):
    # note: writing the float product into the integer input relies on unsafe
    # casting, which old numpy allowed by default; recent numpy raises a
    # casting error here unless the array is float to begin with
    np_round(multiply(array_, 0.67328, array_), out=array_)
I got:
===== ...AND THE WINNER IS... =========================
mac (as in question) : 80.8470ms [130422%]
mac (optimised) : 80.2400ms [129443%]
mac (slice-assignment) : 75.5181ms [121825%]
senderle : 78.9380ms [127342%]
eryksun : 11.0800ms [17874%]
slice-assignment w/ ufunc : 0.0899ms [145%]
fmilo : 0.0620ms [100%]
=======================================================
If ufuncs are not possible, you should maybe consider using Cython. It is easy to integrate and gives big speedups on specific uses of numpy arrays.
This is an updated version of mac's write-up, ported to Python 3.x, with numba and numpy.frompyfunc added.
numpy.frompyfunc takes an arbitrary Python function and returns a function which, applied to a numpy.array, applies the original function elementwise.
However, it changes the datatype of the array to object, so it is not in place, and future calculations on this array will be slower.
To avoid this drawback, the test calls numpy.ndarray.astype, returning the datatype to int.
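A small sketch (my addition) of the behavior just described:

import numpy as np

dim = lambda x: int(round(x * 0.67328))
udim = np.frompyfunc(dim, 1, 1)  # 1 input, 1 output

a = np.arange(9).reshape(3, 3) * 30
out = udim(a)
print(out.dtype)              # object
print(out.astype(int).dtype)  # back to an integer dtype, as the test below does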
As a side note: Numba isn't included in Python's standard library and has to be installed separately if you want to test it. In this test it actually does nothing, and had it been invoked with @jit(nopython=True), it would have raised an error saying that it can't optimize anything there. Since numba can often speed up code written in a functional style, though, it is included for completeness.
import timeit
from numpy import array, arange, vectorize, rint, frompyfunc
from numba import autojit

# SETUP
get_array = lambda side : arange(side**2).reshape(side, side) * 30
dim = lambda x : int(round(x * 0.67328))

# TIMER
def best(fname, reps, side):
    global a
    a = get_array(side)
    t = timeit.Timer('%s(a)' % fname,
                     setup='from __main__ import %s, a' % fname)
    return min(t.repeat(reps, 3))  # low number because in-place --> converges to 1

# FUNCTIONS
def mac(array_):
    for row in range(len(array_)):
        for col in range(len(array_[0])):
            array_[row][col] = dim(array_[row][col])

def mac_two(array_):
    li = range(len(array_[0]))
    for row in range(len(array_)):
        for col in li:
            array_[row][col] = int(round(array_[row][col] * 0.67328))

def mac_three(array_):
    for i, row in enumerate(array_):
        array_[i][:] = [int(round(v * 0.67328)) for v in row]

def senderle(array_):
    array_ = array_.reshape(-1)
    for i, v in enumerate(array_):
        array_[i] = dim(v)

def eryksun(array_):
    array_[:] = vectorize(dim)(array_)

@autojit
def numba(array_):
    for row in range(len(array_)):
        for col in range(len(array_[0])):
            array_[row][col] = dim(array_[row][col])

def ufunc_ed(array_):
    multiplied = array_ * 0.67328
    array_[:] = rint(multiplied)

def ufunc_frompyfunc(array_):
    udim = frompyfunc(dim, 1, 1)
    array_ = udim(array_)
    array_.astype("int")

# MAIN
r = []
totest = ('mac', 'mac_two', 'mac_three', 'senderle', 'eryksun', 'numba', 'ufunc_ed', 'ufunc_frompyfunc')
for fname in totest:
    print('\nTesting `%s`...' % fname)
    r.append(best(fname, reps=50, side=50))
    # The following is for visually checking that the functions return the same results
    tmp = get_array(3)
    eval('%s(tmp)' % fname)
    print(tmp)

tmp = min(r) / 100
results = list(zip(totest, r))
results.sort(key=lambda x: x[1])

print('\n===== ...AND THE WINNER IS... =========================')
for name, time in results:
    print('{:<34}: {:8.4f}ms [{:5.0f}%]'.format(name, time * 1000, time / tmp))
print('=======================================================\n')
And finally, the results:
===== ...AND THE WINNER IS... =========================
ufunc_ed : 0.3205ms [ 100%]
ufunc_frompyfunc : 3.8280ms [ 1194%]
eryksun : 3.8989ms [ 1217%]
mac_three : 21.4538ms [ 6694%]
senderle : 22.6421ms [ 7065%]
mac_two : 24.6230ms [ 7683%]
mac : 26.1463ms [ 8158%]
numba : 27.5041ms [ 8582%]
=======================================================