How to vectorize multiple levels of recursion? - python

I am a newbie to Python and NumPy (and programming in general). I am trying to speed up my code as much as possible. The math involves several summations over multiple axes of a few arrays. I've attained one level of vectorization, but I can't seem to get any deeper than that and have to resort to for loops (I believe there are three levels of recursion, M, N, and I, one of which I've eliminated, I). Here's my code for the relevant section (this code works, but I'd like to speed it up):
def B1(n, i):
    return np.pi * n * dmaxi * (-1)**(n+1) * np.sin(qi[i]*dmaxi) * ((np.pi*n)**2 - (qi[i]*dmaxi)**2)**(-1)

for n in N:
    B[n, :] = B1(n, I)

for m in M:
    for n in N:
        C[m, n] = np.dot((1/np.square(qi*Iq[0, :, 2]))*B[m, :], B[n, :])
    Y[m] = np.dot((1/np.square(qi*Iq[0, :, 2]))*U[0, :, 1], B[m, :])

A = np.linalg.solve(C[1:, 1:], (0.25)*Y[1:])
dmaxi is just a float and m, n and i are integers. The arrays have the following shapes:
>>> qi.shape
(551,)
>>> N.shape
(18,)
>>> M.shape
(18,)
>>> I.shape
(551,)
>>> Iq.shape
(1, 551, 3)
>>> U.shape
(1, 551, 3)
As you can see I've vectorized the calculation of the 2nd axis of B, but I can't seem to do it for the 1st axis, C, and Y, which still require the for loops. It seems that when I try to do the same form of vectorization that I did for the 1st axis of B (define a function, then give the array as the argument), I get a broadcasting error since it appears to be trying to calculate both axes simultaneously, rather than the 1st, then the 2nd, which is why I had to force it into a for loop instead. The same problem occurs for both C and Y which is why they're both in for loops also. In case that's confusing, essentially what I tried was:
>>> B[:, :] = B1(N, I)
Traceback (most recent call last):
File "<stdin>", line 1, in <module>
File "sasrec_v6.py", line 155, in B1
return np.pi * n * dmaxi * (-1)**(n+1) * np.sin(qi[i]*dmaxi) * ((np.pi*n)**2 - (qi[i]*dmaxi)**2)**(-1)
ValueError: operands could not be broadcast together with shapes (18) (551)
Vectorizing the 2nd axis of B made a substantial improvement to the speed of my code, so I'm assuming that the same will apply for further vectorization (I hope I'm using that term correctly by the way).

You can use broadcasting to make 2d arrays from your 1d index vectors. I haven't tested these yet, but they should work:
If you reshape N to be a column vector, then B1 will return a 2d array:
B[N] = B1(N[:, None], I)
For Y and C, I'd use np.einsum to have better control over which axes are multiplied (this could probably be done with np.dot as well, but I'm not sure how).
C[M[:, None], N] = np.einsum('ij,kj->ik',
                             B[M]/np.square(qi*Iq[0, :, 2]),
                             B[N])
Y[M] = np.einsum('i, ki->k',
                 U[0, :, 1]/np.square(qi*Iq[0, :, 2]),
                 B[M])
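As an aside (not part of the original answer), the 'ij,kj->ik' pattern is just a matrix product against a transpose, so the same two assignments can be written with np.dot; w is a helper name I introduce here for the weights:

w = np.square(qi * Iq[0, :, 2])              # per-point weights used above
C[M[:, None], N] = np.dot(B[M] / w, B[N].T)  # same as einsum('ij,kj->ik', B[M]/w, B[N])
Y[M] = np.dot(B[M], U[0, :, 1] / w)          # same as einsum('i, ki->k', U[0,:,1]/w, B[M])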
To see what that indexing trick does:
In [1]: a = np.arange(3)
In [2]: a
Out[2]: array([0, 1, 2])
In [3]: a[:, None]
Out[3]:
array([[0],
       [1],
       [2]])
In [4]: b = np.arange(4,1,-1)
In [5]: b
Out[5]: array([4, 3, 2])
In [6]: a[:, None] * b
Out[6]:
array([[0, 0, 0],
       [4, 3, 2],
       [8, 6, 4]])
It's roughly an order of magnitude faster:
In [92]: %%timeit
   ....: B = np.zeros((18, 551))
   ....: C = np.zeros((18, 18))
   ....: Y = np.zeros((18))
   ....: for n in N:
   ....:     B[n, :] = B1(n, I)
   ....: for m in M:
   ....:     for n in N:
   ....:         C[m, n] = np.dot((1/np.square(qi*Iq[0, :, 2]))*B[m, :], B[n, :])
   ....:     Y[m] = np.dot((1/np.square(qi*Iq[0, :, 2]))*U[0, :, 1], B[m, :])
   ....:
100 loops, best of 3: 15.8 ms per loop

In [93]: %%timeit
   ....: Bv = np.zeros((18, 551))
   ....: Cv = np.zeros((18, 18))
   ....: Yv = np.zeros((18))
   ....: Bv[N] = B1(N[:, None], I)
   ....: Cv[M[:, None], N] = np.einsum('ij,kj->ik', B[M]/np.square(qi*Iq[0, :, 2]), B[N])
   ....: Yv[M] = np.einsum('i, ki->k', U[0, :, 1]/np.square(qi*Iq[0, :, 2]), B[M])
   ....:
1000 loops, best of 3: 1.34 ms per loop
Here's my test:
import numpy as np

# make fake data:
np.random.seed(5)
qi = np.random.rand(551)
N = np.random.randint(0, 18, 18)  # np.arange(18)
M = np.random.randint(0, 18, 18)  # np.arange(18)
I = np.arange(551)
Iq = np.random.rand(1, 551, 3)
U = np.random.rand(1, 551, 3)

B = np.zeros((18, 551))
C = np.zeros((18, 18))
Y = np.zeros((18))
Bv = np.zeros((18, 551))
Cv = np.zeros((18, 18))
Yv = np.zeros((18))

dmaxi = 1.

def B1(n, i):
    return np.pi * n * dmaxi * (-1)**(n+1) * np.sin(qi[i]*dmaxi) * ((np.pi*n)**2 - (qi[i]*dmaxi)**2)**(-1)

for n in N:
    B[n, :] = B1(n, I)

for m in M:
    for n in N:
        C[m, n] = np.dot((1/np.square(qi*Iq[0, :, 2]))*B[m, :], B[n, :])
    Y[m] = np.dot((1/np.square(qi*Iq[0, :, 2]))*U[0, :, 1], B[m, :])

Bv[N] = B1(N[:, None], I)
print "B correct?", np.allclose(Bv, B)

# np.einsum test case:
n, m = 2, 3
a = np.arange(n*m).reshape(n, m)*8 + 2
b = np.arange(n*m)[::-1].reshape(n, m)
c = np.empty((n, n))
for i in range(n):
    for j in range(n):
        c[i, j] = np.dot(a[i], b[j])
cv = np.einsum('ij,kj->ik', a, b)
print "einsum test successful?", np.allclose(c, cv)

Cv[M[:, None], N] = np.einsum('ij,kj->ik',
                              B[M]/np.square(qi*Iq[0, :, 2]),
                              B[N])
print "C correct?", np.allclose(Cv, C)

Yv[M] = np.einsum('i, ki->k',
                  U[0, :, 1]/np.square(qi*Iq[0, :, 2]),
                  B[M])
print "Y correct?", np.allclose(Yv, Y)
output :D
B correct? True
einsum test successful? True
C correct? True
Y correct? True

Related

How to vectorize a moving Numpy slice window

I have two numpy ndarrays, array1 and array2, with array1.shape = array2.shape = (n, l, m).
A 3rd ndarray is initialized as array3 = np.nan * np.zeros((n-1, l, m + 1)) and is then computed using the following for loop:
for i in range(m):
    array3[:n - i - 1, :, i] = array1[i + 1:, :, i] - array2[:n - i - 1, :, i]
Is there a simple way to vectorize this and avoid the for loop?
Here is a simple example:
import numpy as np
a = np.ones((6, 4, 4)) * np.arange(1, 5)
b = np.ones((6, 4, 4))
c = np.nan * np.zeros((5, 4, 4))
n = a.shape[0]
m = a.shape[2]
for i in range(m):
    c[:n - i - 1, :, i] = a[i + 1:, :, i] - b[:n - i - 1, :, i]
I tried rewriting array a the following way:
a = np.concatenate((a, np.nan * np.zeros((4, 4, 4))), axis=0)
row_idx, column_idx, slice_idx = np.ogrid[:a.shape[0], :a.shape[1], :a.shape[2]]
r = -1.0 * np.arange(1, 5) + a.shape[0]
row_idx = (row_idx - r[np.newaxis, np.newaxis, :]).astype(int)
a = a[row_idx, column_idx, slice_idx]
a = a[:6, :, :]
and then subtracting array b directly, but it was only marginally faster for larger arrays.
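For what it's worth, here is a minimal sketch of one way the loop could be vectorized with advanced indexing and a validity mask (my own sketch using the example arrays above, not a tested answer from the thread; the helper names rows/cols/valid/src are mine):

import numpy as np

a = np.ones((6, 4, 4)) * np.arange(1, 5)
b = np.ones((6, 4, 4))
n, l, m = a.shape

# loop version from the question, for reference
c = np.nan * np.zeros((n - 1, l, m + 1))
for i in range(m):
    c[:n - i - 1, :, i] = a[i + 1:, :, i] - b[:n - i - 1, :, i]

# vectorized sketch: gather the shifted rows with advanced indexing, mask the rest
rows = np.arange(n - 1)[:, None]            # target row index, shape (n-1, 1)
cols = np.arange(m)[None, :]                # slice index, shape (1, m)
valid = rows < n - 1 - cols                 # positions the loop actually fills
src = np.where(valid, rows + cols + 1, 0)   # shifted row index into a, clipped where unused

diff = a[src, :, cols] - b[rows, :, cols]   # advanced indexing -> shape (n-1, m, l)
out = np.full((n - 1, l, m + 1), np.nan)
out[:, :, :m] = np.where(valid[:, :, None], diff, np.nan).transpose(0, 2, 1)

print(np.allclose(out, c, equal_nan=True))  # True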

Hi, I have tried to write this code in Python, but I get an error; I hope someone can help me.

This is my code in Python; the dimensions of sx should be 100x4 and sy 100x1 for the multiplication (sx)(B)(sy).
import numpy as np
B= [[-6.08066634428988e-10, -8.61023850910464e-11, 5.48222828615260e-12, -9.49229025004441e-14],
[-3.38148313553674e-11, 6.47759097087283e-12, 1.14900158474371e-13, -5.70078947874486e-15],
[-2.55893304237669e-13, -1.40941560399352e-13, 5.76510238931847e-15, -5.52980385181738e-17],
[3.39795122177475e-15, 7.95704191204353e-16, -5.31260642039813e-17, 7.83532802015832e-19]]
[X, Y] = np.meshgrid(np.arange(0, 3, 0.01*3),np.arange(0, 15, 0.01*(15)))
sx=[]
sy=[]
F=[]
for i in range(len(X)):
    for j in range(len(X)):
        for k in range(len(B)):
            sx[i,k].append(X[i,j]**k)
        for l in range(len(B)):
            sy[l].append((Y[i,j]**l))
        F[i,j] = sx*B*sy
The error:
sx[i,k].append(X[i,j]**k)
TypeError: list indices must be integers or slices, not tuple
MATLAB code copied from comment (guess as to formatting)
[x,y]=meshgrid(0:0.01*3:3,0:0.01*15:15);
for i=1:size(x)
    for j=1:size(x)
        for k=0:size(B)-1
            sx(1,k+1)=(x(i,j)^k);
        end
        for k=0:size(B)-1
            sy(k+1,1)=(y(i,j)^k);
        end
        G(i,j)=sx*B*sy;
    end
end
If sx or X is a 2D list then indices must be [i][j]. If you're trying to append to two indices i and j then it should be separate calls to append.
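For example, a minimal direct translation of the MATLAB loops that preallocates NumPy arrays instead of appending to lists might look like this (a sketch, assuming the B list defined in the question):

import numpy as np

B = np.asarray(B)            # the 4x4 coefficient list from the question
X, Y = np.meshgrid(np.arange(0, 3, 0.01*3), np.arange(0, 15, 0.01*15))
F = np.zeros(X.shape)
sx = np.zeros((1, 4))        # row vector, preallocated instead of a list
sy = np.zeros((4, 1))        # column vector
for i in range(X.shape[0]):
    for j in range(X.shape[1]):
        for k in range(4):
            sx[0, k] = X[i, j]**k
            sy[k, 0] = Y[i, j]**k
        F[i, j] = (sx @ B @ sy)[0, 0]   # (1,4) @ (4,4) @ (4,1) -> scalar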
In an Octave session:
B =
-6.0807e-10 -8.6102e-11 5.4822e-12 -9.4923e-14
-3.3815e-11 6.4776e-12 1.1490e-13 -5.7008e-15
-2.5589e-13 -1.4094e-13 5.7651e-15 -5.5298e-17
3.3980e-15 7.9570e-16 -5.3126e-17 7.8353e-19
>>
>> [x,y]=meshgrid(0:0.01*3:3,0:0.01*15:15);
>> for i=1:size(x)
     for j=1:size(x)
       for k=0:size(B)-1
         sx(1,k+1)=(x(i,j)^k);
       end
       for k=0:size(B)-1
         sy(k+1,1)=(y(i,j)^k);
       end
       G(i,j)=sx*B*sy;
     end
   end
produces
x, y, G (101 x 101)
>> sx (1,4)
sx =
1 3 9 27
>> sy (4,1)
sy =
1
15
225
3375
So the G element is (1,4) * (4,4) * (4,1) => (1,1)
Looks like I should be able to make a NumPy equivalent:
In [100]: B= [[-6.08066634428988e-10, -8.61023850910464e-11, 5.48222828615260e-12, -9.49229025004441e-14],
...: [-3.38148313553674e-11, 6.47759097087283e-12, 1.14900158474371e-13, -5.70078947874486e-15],
...: [-2.55893304237669e-13, -1.40941560399352e-13, 5.76510238931847e-15, -5.52980385181738e-17],
...: [3.39795122177475e-15, 7.95704191204353e-16, -5.31260642039813e-17, 7.83532802015832e-19]]
...:
In [101]: B = np.array(B)
In [106]: [X, Y] = np.meshgrid(np.linspace(0, 3, 101),np.linspace(0, 15, 101),indexing='ij')
In [107]: X.shape
Out[107]: (101, 101)
In [108]: k = np.arange(0,4)
In [109]: k
Out[109]: array([0, 1, 2, 3])
In [110]: SX = X[:,:,None]**k # (101,101,4)
In [111]: SY = Y[:,:,None]**k
In [114]: G = np.einsum('ijk,kl,ijl->ij',SX,B,SY)
In [115]: G.shape
Out[115]: (101, 101)
Allowing for the "F" order of MATLAB (i.e. transpose), it looks like these results match:
>> G(1,1)
ans = -0.00000000060807
In [118]: G[0,0]
Out[118]: -6.08066634428988e-10
>> G(50,23)
ans = -0.00000000097117
In [119]: G[22,49]
Out[119]: -9.71172989297259e-10
With broadcasting I don't need to make the meshgrid arrays:
In [121]: x, y = np.linspace(0,3,101), np.linspace(0,15,101)
In [124]: sx = x[:,None]**k
In [125]: sy = y[:,None]**k
In [126]: sx.shape
Out[126]: (101, 4)
In [129]: g = sx@B@sy.T
In [130]: g.shape
Out[130]: (101, 101)
In [131]: np.allclose(G,g)
Out[131]: True
Here I'm doing a matrix product of
(101,4) (4,4) (4,101) => (101,101)
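As a side note (not part of the answer above), np.vander can build the same power matrices; a sketch assuming the x, y, B and g from the session above:

sx = np.vander(x, 4, increasing=True)   # columns are x**0, x**1, x**2, x**3; shape (101, 4)
sy = np.vander(y, 4, increasing=True)
g2 = sx @ B @ sy.T
print(np.allclose(g, g2))               # True, same result as above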

numpy - tensor multiplication product

I have a 4 x 4 matrix
import numpy as np
c = np.random.rand(4, 4)
I want to create a 100 x 4 x 4 x 100 tensor such that when the first and last indices are equal, I get back my matrix, and zeros otherwise.
I can do this in a loop as
Z = np.zeros((100, 4, 4, 100))
for i in range(100):
    Z[i, :, :, i] = c
is there a better way to do this? I tried looking at np.tensordot and np.einsum but could not figure it out.
Thanks,
Sahil
Use advanced-indexing -
n = 100
Zout = np.zeros((n, 4, 4, n))
I = np.arange(n)
Zout[I,:,:,I] = c
With eye-masking -
n = 100
mask = np.eye(n, dtype=bool)
Zout = np.zeros((n, 4, 4, n))
Zout.transpose(0,3,1,2)[mask] = c
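The transpose there returns a view, so assigning through the boolean mask writes straight into Zout. A quick sanity check (my sketch, with a small n just to eyeball it):

import numpy as np

n = 4
c = np.random.rand(4, 4)
mask = np.eye(n, dtype=bool)
Zout = np.zeros((n, 4, 4, n))
Zout.transpose(0, 3, 1, 2)[mask] = c      # (n, n, 4, 4) view; diagonal entries get c
print(np.allclose(Zout[2, :, :, 2], c))   # True
print(Zout[0, :, :, 1].any())             # False, off-diagonal blocks stay zero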
Timings -
In [72]: c = np.random.rand(4,4)
In [73]: %%timeit
...: n = 100
...: Zout = np.zeros((n, 4, 4, n))
...: I = np.arange(n)
...: Zout[I,:,:,I] = c
10000 loops, best of 3: 47.5 µs per loop
In [74]: %%timeit
...: n = 100
...: mask = np.eye(n, dtype=bool)
...: Zout = np.zeros((n, 4, 4, n))
...: Zout.transpose(0,3,1,2)[mask] = c
10000 loops, best of 3: 73.1 µs per loop
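Since the question mentions np.einsum, the same tensor can also be written that way (my sketch, not benchmarked; it multiplies against an identity matrix, so it is expected to be slower than the indexing approaches above):

import numpy as np

n = 100
c = np.random.rand(4, 4)

# Z[i, a, b, j] = eye[i, j] * c[a, b]
Z_einsum = np.einsum('ij,ab->iabj', np.eye(n), c)

# check against the advanced-indexing version
Zout = np.zeros((n, 4, 4, n))
I = np.arange(n)
Zout[I, :, :, I] = c
print(np.allclose(Z_einsum, Zout))  # True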

How to rotate a square numpy array with different times efficiently by `np.rot90`?

I have a 2d numpy array, for example:
a = np.array([
    [0, 1, 2],
    [3, 4, 5],
    [6, 7, 8]])
and another 1d array:
I = np.array([0, 2, 3, 1, 0, 2, 0, 1])
I want to rotate a by np.rot90 function like following:
b = np.zeros((len(I), 3, 3))
for i, k in enumerate(I):
    b[i] = np.rot90(a, k=k)
Can I do it more efficiently without the for loop?
Approach #1
Generate a 3D array of all 4 possible rotations and simply index into it with I, giving a vectorized solution -
P = np.empty((4,) + a.shape, dtype=a.dtype)
P[0] = a # For np.rot90(a, k=0)
P[1] = a.T[::-1] # For np.rot90(a, k=1)
P[2] = a[::-1,::-1] # For np.rot90(a, k=2)
P[3] = a.T[:,::-1] # For np.rot90(a, k=3)
out = P[I]
Approach #2
Another way to create P would be with -
P = np.array([np.rot90(a, k=i) for i in range(4)])
and as with the previous method simply index into P with I for final output.
Runtime test
Approaches -
def org_app(a, I):
    m, n = a.shape
    b = np.zeros((len(I), m, n), dtype=a.dtype)
    for i, k in enumerate(I):
        b[i] = np.rot90(a, k=k)
    return b

def app1(a, I):
    P = np.empty((4,) + a.shape, dtype=a.dtype)
    P[0] = a
    P[1] = a.T[::-1]
    P[2] = a[::-1,::-1]
    P[3] = a.T[:,::-1]
    return P[I]

def app2(a, I):
    P = np.array([np.rot90(a, k=i) for i in range(4)])
    return P[I]
Timings -
In [54]: a = np.random.randint(0,9,(10,10))
In [55]: I = np.random.randint(0,4,(10000))
In [56]: %timeit org_app(a, I)
10 loops, best of 3: 51 ms per loop
In [57]: %timeit app1(a, I)
1000 loops, best of 3: 469 µs per loop
In [58]: %timeit app2(a, I)
1000 loops, best of 3: 549 µs per loop
100x+ speedup!
One more efficient way that I can think of (still not vectorized) is using a list comprehension, in one line:
np.array([np.rot90(a, k=i) for i in I])

How to repeat elements of an array along two axes?

I want to repeat elements of an array along axis 0 and axis 1 for M and N times respectively:
import numpy as np
a = np.arange(12).reshape(3, 4)
b = a.repeat(2, 0).repeat(2, 1)
print(b)
[[ 0  0  1  1  2  2  3  3]
 [ 0  0  1  1  2  2  3  3]
 [ 4  4  5  5  6  6  7  7]
 [ 4  4  5  5  6  6  7  7]
 [ 8  8  9  9 10 10 11 11]
 [ 8  8  9  9 10 10 11 11]]
This works, but I want to know whether there are better methods that avoid creating a temporary array.
You could use the Kronecker product, see numpy.kron:
>>> a = np.arange(12).reshape(3,4)
>>> print(np.kron(a, np.ones((2,2), dtype=a.dtype)))
[[ 0  0  1  1  2  2  3  3]
 [ 0  0  1  1  2  2  3  3]
 [ 4  4  5  5  6  6  7  7]
 [ 4  4  5  5  6  6  7  7]
 [ 8  8  9  9 10 10 11 11]
 [ 8  8  9  9 10 10 11 11]]
Your original method is OK too, though!
You can make use of np.broadcast_to here:
def broadcast_tile(a, h, w):
    x, y = a.shape
    m, n = x * h, y * w
    return np.broadcast_to(
        a.reshape(x, 1, y, 1), (x, h, y, w)
    ).reshape(m, n)

broadcast_tile(a, 2, 2)
array([[ 0,  0,  1,  1,  2,  2,  3,  3],
       [ 0,  0,  1,  1,  2,  2,  3,  3],
       [ 4,  4,  5,  5,  6,  6,  7,  7],
       [ 4,  4,  5,  5,  6,  6,  7,  7],
       [ 8,  8,  9,  9, 10, 10, 11, 11],
       [ 8,  8,  9,  9, 10, 10, 11, 11]])
Performance
Functions
def chris(a, h, w):
    x, y = a.shape
    m, n = x * h, y * w
    return np.broadcast_to(
        a.reshape(x, 1, y, 1), (x, h, y, w)
    ).reshape(m, n)

def alex_riley(a, b0, b1):
    r, c = a.shape
    rs, cs = a.strides
    x = np.lib.stride_tricks.as_strided(a, (r, b0, c, b1), (rs, 0, cs, 0))
    return x.reshape(r*b0, c*b1)

def paul_panzer(a, b0, b1):
    r, c = a.shape
    out = np.empty((r, b0, c, b1), a.dtype)
    out[...] = a[:, None, :, None]
    return out.reshape(r*b0, c*b1)

def wim(a, h, w):
    return np.kron(a, np.ones((h, w), dtype=a.dtype))
Setup
import numpy as np
import pandas as pd
from timeit import timeit
res = pd.DataFrame(
    index=['chris', 'alex_riley', 'paul_panzer', 'wim'],
    columns=[5, 10, 20, 50, 100, 500, 1000],
    dtype=float
)

a = np.arange(100).reshape((10, 10))

for f in res.index:
    for c in res.columns:
        h = w = c
        stmt = '{}(a, h, w)'.format(f)
        setp = 'from __main__ import h, w, a, {}'.format(f)
        res.at[f, c] = timeit(stmt, setp, number=50)
Output
(benchmark plot from the original answer not reproduced here)
Since the result cannot be implemented as a view, as_strided offers no benefits over simple preallocation and broadcasting. Because of its overhead as_strided seems in fact a bit slower (I did no proper benchmarking, though).
The as_strided code is taken from @AlexRiley's post.
from numpy.lib.stride_tricks import as_strided
import numpy as np
def tile_array(a, b0, b1):
    r, c = a.shape                                     # number of rows/columns
    rs, cs = a.strides                                 # row/column strides
    x = as_strided(a, (r, b0, c, b1), (rs, 0, cs, 0))  # view a as larger 4D array
    return x.reshape(r*b0, c*b1)                       # create new 2D array

def tile_array_pp(a, b0, b1):
    r, c = a.shape
    out = np.empty((r, b0, c, b1), a.dtype)
    out[...] = a[:, None, :, None]
    return out.reshape(r*b0, c*b1)

a = np.arange(9).reshape(3, 3)

kwds = {'globals': {'f_ar': tile_array, 'f_pp': tile_array_pp, 'a': a},
        'number': 1000}
from timeit import timeit
print('as_strided', timeit('f_ar(a, 100, 100)', **kwds))
print('broadcast ', timeit('f_pp(a, 100, 100)', **kwds))
Sample run:
as_strided 0.048387714981799945
broadcast 0.04324757700669579
Another solution is to use as_strided. kron is much slower than using repeat twice. I have found that as_strided is much faster than a double repeat in many cases (for small arrays [< 250x250] with only a doubling in each dimension, as_strided was slower). The as_strided trick is as follows:
import numpy as np
from numpy.lib.stride_tricks import as_strided

a = np.arange(1000000).reshape((1000, 1000))  # dummy data
N, M = 4, 3  # number of times to replicate each point in each dimension
H, W = a.shape
b = as_strided(a, (H, N, W, M), (a.strides[0], 0, a.strides[1], 0)).reshape((H*N, W*M))
This works by using 0-length strides which causes numpy to read the same value multiple times (until it gets to the next dimension). The final reshape does copy the data, but only once unlike using a double repeat which will copy the data twice.
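To see the zero strides at work on a tiny array (my sketch; the exact byte counts assume a default 64-bit integer dtype):

import numpy as np
from numpy.lib.stride_tricks import as_strided

a = np.arange(6).reshape(2, 3)
v = as_strided(a, (2, 2, 3, 2), (a.strides[0], 0, a.strides[1], 0))
print(a.strides)        # e.g. (24, 8): 8-byte ints, 3 per row
print(v.strides)        # e.g. (24, 0, 8, 0): the zeros re-read the same element
print(v.reshape(4, 6))  # the reshape copies once, producing the 2x-tiled array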
Errata: I'm only taking 2x upsampling into account.
TL;DR It turns out that after the OpenCV version,
np.repeat(np.repeat(a, 2, axis=1), 2, axis=0)
is the fastest. So the answer is: there is no faster way in numpy today, but you can get a slight improvement by changing the order of axes.
And if you don't mind OpenCV -
cv.resize(a, None, fx=2, fy=2, interpolation=cv.INTER_NEAREST)
Here is the test.
import timeit
import numpy as np
import cv2 as cv
test = np.zeros((16, 16, 3), dtype=np.float32)
def measure(f):
    t = timeit.timeit("f(test)", number=1000, globals={"test": test, "f": f})
    print("%s - %f" % (f.__name__, t))
    return f, t

def fastest(c):
    print(c.__name__)
    winner, t = min((measure(getattr(c, ve)) for ve in dir(c) if ve.startswith("alg_")), key=lambda x: x[1])
    print("%s winner: %s - %f" % (c.__name__, winner.__name__, t))
    return winner

@fastest
class nn:
    def alg_01(a):
        return np.repeat(np.repeat(a, 2, axis=0), 2, axis=1)

    def alg_02(a):
        return np.repeat(np.repeat(a, 2, axis=1), 2, axis=0)

    def alg_03(a):
        b = a[:, None, :, None]
        b = np.concatenate((b, b), axis=1)
        b = np.concatenate((b, b), axis=3)
        return b.reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_04(a):
        b = a[:, None, :, None]
        b = np.concatenate((b, b), axis=3)
        b = np.concatenate((b, b), axis=1)
        return b.reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_05(a):
        return (a[:, None, :, None]*np.ones((1, 2, 1, 2)+((1,)*len(a.shape[2:])), dtype=np.float32)).reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_06(a):
        return cv.resize(a, None, fx=2, fy=2, interpolation=cv.INTER_NEAREST)

    def alg_07(a):
        return a[:, None, :, None][:, (0, 0)][:, :, :, (0, 0)].reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_08(a):
        return a[:, None, :, None][:, :, :, (0, 0)][:, (0, 0)].reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_09(a):
        return np.kron(a, np.ones((2, 2), dtype=np.float32))

    def alg_10(a):
        return np.broadcast_to(a[:, None, :, None], (a.shape[0], 2, a.shape[1], 2)+a.shape[2:]).reshape(a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:])

    def alg_11(a):
        ret = np.empty((a.shape[0], 2, a.shape[1], 2, *a.shape[2:]), dtype=np.float32)
        ret[...] = a[:, None, :, None]
        ret.resize((a.shape[0]<<1, a.shape[1]<<1, *a.shape[2:]), refcheck=False)
        return ret
The result is:
nn
alg_01 - 0.040967
alg_02 - 0.033744
alg_03 - 0.057969
alg_04 - 0.048739
alg_05 - 0.076595
alg_06 - 0.078638
alg_07 - 0.084692
alg_08 - 0.084539
alg_09 - 0.344339
alg_10 - 0.078707
alg_11 - 0.049424
nn winner: alg_02 - 0.033744
