Speed-up computation on sub-matrices of a 4D tensor - python

I have an array of shape 1028 x 24 x 24 x 16. When I run the code below, it is very slow. How can I speed it up? Thanks.
(I want to extract 3 x 3 sub-matrices from the large array.)
import itertools, math, time, random
import numpy as np

start = time.time()

def ls(x):
    x_p = x / np.sum(x)
    return np.max(x_p)

inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b * ch, r, c))
s = 2
r = 3
num_r = 9
num_c = 9
ke = np.zeros((b * ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
    for j in range(num_c):
        outs = np.array(list(map(ls, inputs[:, i*s:i*s+r, j*s:j*s+r])))
        ke[:, i, j] = outs
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)

The original code can be wrapped (with some polishing) into the following function:
import numpy as np


def foo0(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, h, w, ch = arr.shape  # do not unpack into `r`: that would shadow the window size
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, h, w))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j] = np.fromiter(
                map(
                    lambda x: np.max(x / np.sum(x)),
                    arr[:, i * s:i * s + r, j * s:j * s + r]),
                dtype=np.float32)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
While the explicit looping suggests that Numba could deliver some low-hanging fruit here, unfortunately the function cannot be readily decorated with @nb.njit: the map/lambda construct interacts with Python objects, which would greatly reduce the speed-up.
Fortunately, the core computation can be readily vectorized and, as long as nr and nc are small enough, this optimization is sufficient:
def foo1(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, h, w, ch = arr.shape  # again, avoid shadowing the window size `r`
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, h, w))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s + r, j * s:j * s + r]
            result[:, i, j] = np.max(x, (-1, -2)) / np.sum(x, (-1, -2))
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
(The above foo1() is essentially equivalent to @ymmx's answer, with some additional optimizations.)
Note that max(x / k) equals max(x) / k for any k > 0 (here k = np.sum(x)), but the number of divisions is greatly reduced.
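For instance:

x = np.random.rand(3, 3)
k = np.sum(x)  # positive, since x is positive
print(np.isclose(np.max(x / k), np.max(x) / k))  # True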
Actually, transposing and reshaping, while they may help with computation speed, are not really necessary:
def foo2(arr, s=2, r=3, nr=9, nc=9):
    b, h, w, ch = arr.shape
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s + r, j * s:j * s + r, :]
            result[:, i, j, :] = np.max(x, (1, 2)) / np.sum(x, (1, 2))
    return result
The above is simpler to translate to Numba, but the speed gain for small nr/nc would be minimal (compared to the partially vectorized approach):
import numba as nb


@nb.njit
def sum_nb(arr):
    result = 0
    for x in arr:
        result += x
    return result


@nb.njit
def max_nb(arr):
    result = arr[0]
    for x in arr[1:]:
        if x > result:
            result = x
    return result


@nb.njit
def _sum_max(arr):
    b, h, w, ch = arr.shape
    res = np.empty((b, ch), dtype=arr.dtype)
    for i in range(b):
        for j in range(ch):
            x = arr[i, :, :, j].ravel()
            res[i, j] = max_nb(x) / sum_nb(x)
    return res


@nb.njit
def foo3(arr, s=2, r=3, nr=9, nc=9):
    b, h, w, ch = arr.shape  # do not unpack into `r`
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j, :] = _sum_max(arr[:, i * s:i * s + r, j * s:j * s + r, :])
    return result
Another option would be to keep the Numba-incompatible code outside of the main loop:
@nb.njit(fastmath=True)
def _foo4(arr, result, s, r, nr, nc):
    bch, nr, nc = result.shape
    for i in range(nr):
        for j in range(nc):
            for k in range(bch):
                x = arr[k, i * s:i * s + r, j * s:j * s + r].ravel()
                result[k, i, j] = max_nb(x) / sum_nb(x)
    return result


def foo4(arr, s=2, r=3, nr=9, nc=9):
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    b, h, w, ch = arr.shape
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, h, w))
    result = np.empty((b * ch, nr, nc))
    result = _foo4(arr, result, s, r, nr, nc)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
but again the speed gain would be minimal.
Note that a fully vectorized approach is unlikely to be efficient, because the objects inside the main loops are jagged.
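That said, since the 3 x 3 windows here are uniform (fixed r and s), a fully vectorized variant can be sketched with np.lib.stride_tricks.sliding_window_view (NumPy >= 1.20). It computes every window and then subsamples by the stride, so whether it actually beats foo1() depends on s; treat it as a sketch (not included in the timings below):

def foo5(arr, s=2, r=3, nr=9, nc=9):
    # all r x r windows over the two spatial axes: (b, H-r+1, W-r+1, ch, r, r)
    win = np.lib.stride_tricks.sliding_window_view(arr, (r, r), axis=(1, 2))
    win = win[:, ::s, ::s][:, :nr, :nc]  # keep every s-th window, first nr x nc of them
    res = win.max(axis=(-1, -2)) / win.sum(axis=(-1, -2))  # (b, nr, nc, ch)
    return res.astype(np.float32)

It should match foo2(arr) up to float32 rounding.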
To get an idea of the relative speeds (timed with IPython's %timeit):

funcs = foo0, foo1, foo2, foo3, foo4

timeds_n = {}
for p in range(1):
    n = 10 ** p
    k = 3
    arr = np.random.rand(100, 24, 24, 16)
    print(f"N = {arr.size}")
    base = funcs[0](arr)
    timeds_n[n] = []
    for func in funcs:
        res = func(arr)
        timed = %timeit -r 1 -n 1 -q -o func(arr)
        timeds_n[n].append(timed.best)
        print(f"{func.__name__:>24}  {np.allclose(base, res)}  {timed.best:.9f}")
N = 921600
                    foo0  True  1.757508748
                    foo1  True  0.095540081
                    foo2  True  0.179208341
                    foo3  True  0.160671403
                    foo4  True  0.155691721

I think the issue is mainly the function ls, which should be vectorized, and the list(map(...)) call, which takes most of the time:
import itertools, math, time, random
import numpy as np

start = time.time()

def ls(x):
    x_p = x / np.sum(np.sum(x, axis=1), axis=1)[:, None, None]
    return np.max(np.max(x_p, axis=1), axis=1)

inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b * ch, r, c))
s = 2
r = 3
num_r = 9
num_c = 9
ke = np.zeros((b * ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
    print(i)
    for j in range(num_c):
        # outs = np.array(list(map(ls, inputs[:, i*s:i*s+r, j*s:j*s+r])))
        outs = ls(inputs[:, i*s:i*s+r, j*s:j*s+r])
        ke[:, i, j] = outs
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)
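Incidentally, the nested reductions in the vectorized ls() can also be written with axis tuples (a small sketch, equivalent for the (N, 3, 3) windows above):

def ls(x):
    x_p = x / np.sum(x, axis=(1, 2))[:, None, None]
    return np.max(x_p, axis=(1, 2))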

Related

How to efficiently calculate score = dot(a, LeakyReLU(x_i+y_j)) for each i, j in [N]?

I have to compute score = dot(a, LeakyReLU(x_i + y_j)) for each i, j in [N], where a, x_i, y_j are D-dimensional vectors and dot() is the dot product, which outputs a scalar. So in the end I have to produce an N x N score matrix.
In Keras, I implemented it as:

# given X (N x D), Y (N x D), A (D x 1)
X = tf.expand_dims(X, axis=1)  # (N x 1 x D)
Y = tf.expand_dims(Y, axis=0)  # (1 x N x D)
feature_sum = X + Y  # (N x N x D), broadcast automatically
dense = K.dot(LeakyReLU(alpha=0.1)(feature_sum), A)  # (N x N x 1)

The problem is that feature_sum is GPU-memory expensive when N, D > 1000. Is there a more efficient implementation?
The dot product is distributive with respect to the sum. Therefore:
dot(LRelu(X + Y), A) = dot(LRelu(X), A) + dot(LRelu(Y), A)
So, you can do:
dense_x = K.dot(LRelu(X), A)
dense_y = K.dot(LRelu(Y), A)
dense_x = tf.expand_dims(dense_x, axis=1)
dense_y = tf.expand_dims(dense_y, axis=0)
dense = dense_x + dense_y
In this way, all operations are done at most on N x D elements and you only have to store a maximum of N x N elements (assuming N > D).
Quantitative comparisons. N=1000, D=500
import time
import numpy as np


def timeit(func):
    def run(*args, **kwargs):
        start = time.time()
        out = func(*args, **kwargs)
        end = time.time()
        print(f"Exec: {(end - start) * 1000:.4f}ms")
        return out
    return run


@timeit
def slow(X, Y, A, N, D):
    X = X.reshape(N, 1, D)
    Y = Y.reshape(1, N, D)
    feature_sum = X + Y
    dense = feature_sum @ A
    return dense


@timeit
def fast(X, Y, A, N, D):
    dense_x = X @ A
    dense_y = Y @ A
    dense_x = dense_x.reshape(N, 1, 1)
    dense_y = dense_y.reshape(1, N, 1)
    dense = dense_x + dense_y
    return dense


def main():
    N = 1000
    D = 500
    X = np.random.rand(N, D)
    Y = np.random.rand(N, D)
    A = np.random.rand(D, 1)
    dense1 = slow(X, Y, A, N, D)
    dense2 = fast(X, Y, A, N, D)
    print("Same result: ", np.allclose(dense1, dense2))
Output:
Exec: 1547.9290ms # slow
Exec: 2.9860ms # fast
Same result: True

vectorized implementation without using for loops

I'm trying to implement this code, but it's quite slow because of the two for loops. Can anyone suggest a vectorized version of it, please?

import numpy as np

P, Q = 1000, 1000
thresh = 100
H = np.zeros((P, Q))

def dist(u, v, p, q):
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

for u in range(P):
    for v in range(Q):
        if dist(u, v, P, Q) <= thresh:
            H[u, v] = 1
Try this.
import numpy as np

P, Q = 1000, 1000
thresh = 100

u = np.arange(P)
v = np.arange(Q)
dist_mat = np.sqrt(((u - P/2)**2)[:, None] + ((v - Q/2)**2)[None, :])

H = np.zeros((P, Q))
H[dist_mat <= thresh] = 1
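As a quick sanity check (a sketch): the ones in H form a disk of radius thresh around the center, so the count should be close to pi * thresh**2, and the center/corner values are easy to verify by hand:

print(np.count_nonzero(H))   # ~31400 (approximately pi * 100**2)
print(H[500, 500], H[0, 0])  # 1.0 at the center, 0.0 at the corner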
I think numba can speed up your code:

import numpy as np
import numba

P, Q = 1000, 1000
thresh = 100
H = np.zeros((P, Q))

@numba.jit(nopython=True)
def dist(u, v, p, q):
    # dist must be jitted too for nopython mode to work
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

@numba.jit(nopython=True)
def function(P, Q, thresh, H):
    for u in range(P):
        for v in range(Q):
            if dist(u, v, P, Q) <= thresh:
                H[u, v] = 1
import numpy as np

P, Q = 1000, 1000
thresh = 100

def dist(u, v, p, q):
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

idx_H = np.stack(np.indices((P, Q)), axis=-1)
H_LP = dist(idx_H[..., 0], idx_H[..., 1], P, Q) <= thresh

Vectorizing softmax cross-entropy gradient

I'm trying to implement my own neural network with (almost) fully vectorized operations. There are lots of posts out there but I can't seem to find one that fits all three of these:
separate cross-entropy and softmax terms in the gradient calculation (so I can interchange the last activation and loss)
multi-class classification (y is one-hot encoded)
all operations are fully vectorized
My main question is: how do I get dE/dz (N x K) given dE/da (N x K) and da/dz (N x K x K) using a fully vectorized operation? That is, how do I vectorize dE_dz_test2?
My second question: is there a better way to write softmax_derivative?
I used this as a reference for calculating the gradient one sample at a time:
http://saitcelebi.com/tut/output/part2.html
and this for figuring out how to do backprop
https://peterroelants.github.io/posts/neural-network-implementation-part04/
import numpy as np


def one_hot_encode(y, n_classes):
    y_onehot = np.zeros((len(y), n_classes))
    for i, y_i in enumerate(y):
        y_onehot[i, y_i] = 1
    return y_onehot


def cross_entropy_derivative(y_true, y_pred):
    # dE/da
    # input: N x K
    # output: N x K array
    N = len(y_true)
    return -(y_true / y_pred) / N


def softmax(x):
    # activation (a)
    # input: N x K array
    # output: N x K array
    # https://eli.thegreenplace.net/2016/the-softmax-function-and-its-derivative/
    exp = np.exp(x - np.max(x))
    return exp / np.sum(exp, axis=1)[:, None]


def softmax_derivative(Z):
    # da/dz
    # input: N x K array
    # output: N x K x K array
    # http://saitcelebi.com/tut/output/part2.html
    N, K = Z.shape
    s = softmax(Z)[:, :, np.newaxis]
    a = np.tensordot(s, np.ones((1, K)), axes=([-1], [0]))
    I = np.repeat(np.eye(K, K)[np.newaxis, :, :], N, axis=0)
    b = I - np.tensordot(np.ones((K, 1)), s.T, axes=([-1], [0])).T
    return a * np.swapaxes(b, 1, 2)


def softmax_derivative_test(Z):
    # da/dz
    # non-vectorized softmax gradient calculation
    # http://saitcelebi.com/tut/output/part2.html
    N, K = Z.shape
    da_dz = np.zeros((N, K, K))
    kron_delta = np.eye(K)
    s = softmax(Z)
    for n in range(N):
        for i in range(K):
            for j in range(K):
                da_dz[n, i, j] = s[n, i] * (kron_delta[i, j] - s[n, j])
    return da_dz


def dE_dz_test2(dE_da, da_dz):
    # input: array (N x K), array (N x K x K)
    # output: array (N x K)
    N, K = dE_da.shape
    dE_dz = np.zeros((N, K))
    for n in range(N):
        dE_dz[n, :] = np.matmul(da_dz[n], dE_da[n, :, np.newaxis]).T
    return dE_dz


def some_type_of_matrix_multiplication_(dE_da, da_dz):
    # how do I get dE/dz from dE_da and da_dz?
    pass


X = np.random.rand(100, 2)
W = np.random.rand(2, 4)
y = np.random.randint(0, 4, size=100)
y = one_hot_encode(y, 4)
Z = X @ W
S = softmax(Z)
N, K = Z.shape

# da/dz for softmax
da_dz = softmax_derivative(Z)            # (100, 4, 4)
da_dz_test = softmax_derivative_test(Z)  # (100, 4, 4) - non-vectorized implementation
print(np.isclose(da_dz, da_dz_test).all())  # equivalence test

dE_da = cross_entropy_derivative(y, S)  # (100, 4)
dE_dz = some_type_of_matrix_multiplication_(dE_da, da_dz)  # what do I do here? *****
dE_dz_test = (S - y) / N  # (100, 4) if you combine the dE/da and da/dz terms
dE_dz2 = dE_dz_test2(dE_da, da_dz)  # renamed so it doesn't shadow the function
print(np.isclose(dE_dz_test, dE_dz2).all())  # equivalence test
True
True
Here is an approach using np.einsum:
def da_dz_pp(z, sm=None):
    if sm is None:
        sm = softmax(z)
    res = np.einsum('ij,ik->ijk', sm, -sm)
    np.einsum('ijj->ij', res)[...] += sm
    return res


def dE_dz_pp(y, z, sm=None):
    if sm is None:
        sm = softmax(z)
    dE_da = cross_entropy_derivative(y, sm)
    da_dz = da_dz_pp(z, sm)
    return np.einsum('ij,ijk->ik', dE_da, da_dz)
It seems to reproduce what your code outputs and is a bit faster.
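For instance (a sketch; it assumes the question's softmax, cross_entropy_derivative, softmax_derivative_test, dE_dz_test2 and the variables y, Z, dE_da, da_dz are in scope):

print(np.allclose(da_dz_pp(Z), softmax_derivative_test(Z)))    # True
print(np.allclose(dE_dz_pp(y, Z), dE_dz_test2(dE_da, da_dz)))  # True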

Wiki example for Arnoldi iteration only works for real matrices?

The Wikipedia entry for the Arnoldi method provides a Python example that produces a basis of the Krylov subspace of a matrix A. Supposedly, if A is Hermitian (i.e. if A == A.conj().T), then the Hessenberg matrix h generated by this algorithm is tridiagonal (source). However, when I use the Wikipedia code on a real-world Hermitian matrix, the Hessenberg matrix is not at all tridiagonal. When I perform the computation on the real part of A (so that A == A.T), I do get a tridiagonal Hessenberg matrix, so there seems to be a problem with the imaginary components of A. Does anybody know why the Wikipedia code doesn't produce the expected results?
Working example:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import circulant


def arnoldi_iteration(A, b, n):
    m = A.shape[0]
    h = np.zeros((n + 1, n), dtype=complex)
    Q = np.zeros((m, n + 1), dtype=complex)
    q = b / np.linalg.norm(b)  # Normalize the input vector
    Q[:, 0] = q                # Use it as the first Krylov vector
    for k in range(n):
        v = A.dot(q)                      # Generate a new candidate vector
        for j in range(k + 1):            # Subtract the projections on previous vectors
            h[j, k] = np.dot(Q[:, j], v)
            v = v - h[j, k] * Q[:, j]
        h[k + 1, k] = np.linalg.norm(v)
        eps = 1e-12  # If v is shorter than this threshold it is the zero vector
        if np.abs(h[k + 1, k]) > eps:  # Add the produced vector to the list, unless
            q = v / h[k + 1, k]        # the zero vector is produced.
            Q[:, k + 1] = q
        else:  # If that happens, stop iterating.
            return Q, h
    return Q, h


# Construct matrix A
N = 2**4
I = np.eye(N)
k = np.fft.fftfreq(N, 1.0 / N) + 0.5
alpha = np.linspace(0.1, 1.0, N) * 2e2
c = np.fft.fft(alpha) / N
C = circulant(c)
A = np.einsum("i, ij, j->ij", k, C, k)

# Show that A is Hermitian
print(np.allclose(A, A.conj().T))

# Arbitrary (random) initial vector
np.random.seed(0)
v = np.random.rand(N)

# Perform Arnoldi iteration with complex A
_, h = arnoldi_iteration(A, v, N)

# Perform Arnoldi iteration with real A
_, h2 = arnoldi_iteration(np.real(A), v, N)

# Plot results
plt.subplot(121)
plt.imshow(np.abs(h))
plt.title("Complex A")
plt.subplot(122)
plt.imshow(np.abs(h2))
plt.title("Real A")
plt.tight_layout()
plt.show()
Result: [plot omitted] |h| for the complex A is clearly not tridiagonal, while |h2| for the real part of A is.
After browsing through some conference presentation slides, I realised that at some point Q had to be conjugated when A is complex. The correct algorithm is posted below for reference, with the code change marked (note that this correction has also been submitted to the Wikipedia entry):
import numpy as np


def arnoldi_iteration(A, b, n):
    m = A.shape[0]
    h = np.zeros((n + 1, n), dtype=complex)
    Q = np.zeros((m, n + 1), dtype=complex)
    q = b / np.linalg.norm(b)
    Q[:, 0] = q
    for k in range(n):
        v = A.dot(q)
        for j in range(k + 1):
            h[j, k] = np.dot(Q[:, j].conj(), v)  # <-- Q needs conjugation!
            v = v - h[j, k] * Q[:, j]
        h[k + 1, k] = np.linalg.norm(v)
        eps = 1e-12
        if np.abs(h[k + 1, k]) > eps:
            q = v / h[k + 1, k]
            Q[:, k + 1] = q
        else:
            return Q, h
    return Q, h
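A quick way to confirm the fix (a sketch, reusing A, v and N from the script above): for the Hermitian A, everything above the first superdiagonal of h should now vanish, i.e. h is numerically tridiagonal:

_, h_fixed = arnoldi_iteration(A, v, N)
tol = 1e-8 * np.abs(h_fixed).max()  # tolerance relative to the matrix scale
print(np.allclose(np.triu(h_fixed, k=2), 0, atol=tol))  # True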

General Minimum RESidual (GMRES) with ILU preconditioner

I'm trying to implement an ILU preconditioner in this GMRES code I wrote (in order to solve the linear system Ax = b). I'm testing with an easy tridiagonal SPD matrix of dimension 25x25. As you can see, I'm computing the preconditioner with the spilu method. The code runs without errors, but the solution is clearly wrong: at the end of the code I print the norm of b and the norm of the product A*x, and they are not nearly the same.
The code runs fine without the preconditioner and converges in 13 iterations for the same matrix.
This is the code I followed
import numpy as np
import scipy as sp
import scipy.linalg
import scipy.sparse
import scipy.sparse.linalg
import matplotlib.pyplot as plt

'Size controller'
matrixSize = 25

'Building a tri-diagonal matrix'
def Atridiag(val_0, val_sup, val_inf, mSize):
    cen = np.ones((1, mSize)) * val_0
    sup = np.ones((1, mSize - 1)) * val_sup
    inf = np.ones((1, mSize - 1)) * val_inf
    diag_cen = np.diagflat(cen, 0)
    diag_sup = np.diagflat(sup, 1)
    diag_inf = np.diagflat(inf, -1)
    return diag_cen + diag_sup + diag_inf

A = Atridiag(2, -1, -1, matrixSize)
A = sp.sparse.csc_matrix(A)

'Plot matrix sparsity'
plt.clf()
plt.spy(A, marker='.', markersize=2)
plt.show()

'random b and x0 vectors'
b = np.matrix(np.ones((matrixSize, 1)))
x = np.matrix(np.ones((matrixSize, 1)))

'Incomplete LU'
M = sp.sparse.linalg.spilu(A)
M1 = lambda x: M.solve(x)
M2 = sp.sparse.linalg.LinearOperator((matrixSize, matrixSize), M1)

'Initial Data'
nmax_iter = 30
rstart = 2
tol = 1e-7
e = np.zeros((nmax_iter + 1, 1))
rr = 1

'Starting GMRES'
for rs in range(0, rstart + 1):
    'first check on residual'
    if rr < tol:
        break
    else:
        r0 = (b - A.dot(x))
        betha = np.linalg.norm(r0)
        e[0] = betha
        H = np.zeros((nmax_iter + 1, nmax_iter))
        V = np.zeros((matrixSize, nmax_iter + 1))
        V[:, 0:1] = r0 / betha
    for k in range(1, nmax_iter + 1):
        'Applying the preconditioner'
        t = A.dot(V[:, k - 1])
        V[:, k] = M2.matvec(t)
        'Gram-Schmidt orthogonalization'
        for j in range(k):
            H[j, k - 1] = np.dot(V[:, k].T, V[:, j])
            V[:, k] = V[:, k] - (np.dot(H[j, k - 1], V[:, j]))
        H[k, k - 1] = np.linalg.norm(V[:, k])
        V[:, k] = V[:, k] / H[k, k - 1]
        'QR decomposition'
        n = k
        Q = np.zeros((n + 1, n))
        R = np.zeros((n, n))
        R[0, 0] = np.linalg.norm(H[0:n + 2, 0])
        Q[:, 0] = H[0:n + 1, 0] / R[0, 0]
        for j in range(0, n + 1):
            t = H[0:n + 1, j - 1]
            for i in range(0, j - 1):
                R[i, j - 1] = np.dot(Q[:, i], t)
                t = t - np.dot(R[i, j - 1], Q[:, i])
            R[j - 1, j - 1] = np.linalg.norm(t)
            Q[:, j - 1] = t / R[j - 1, j - 1]
        g = np.dot(Q.T, e[0:k + 1])
        Z = np.dot(np.linalg.inv(R), g)
        Res = e[0:n] - np.dot(H[0:n, 0:n], Z[0:n])
        rr = np.linalg.norm(Res)
        'second check on residual'
        if rr < tol:
            break
    'Updating the solution'
    x = x + np.dot(V[:, 0:k], Z)

print(sp.linalg.norm(b))
print(sp.linalg.norm(np.dot(A.todense(), x)))
Really hope somebody can figure it out!
Maybe it's too late, but for future reference:
You forgot to multiply by the preconditioner when updating x:
x = x + M2.dot(np.dot(V[:, 0:k], Z))  # M2.matvec() works the same
See here
With that fix, the algorithm converges in 1 iteration.
Other comments:
You can directly do: M2 = sp.sparse.linalg.LinearOperator((matrixSize, matrixSize), M.solve)
At the end, to compare Ax and b, it's better to print the difference (residual), because you will get a much more precise result: print(sp.linalg.norm(b - np.dot(A.todense(), x)))
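As an extra sanity check, one could compare against SciPy's built-in GMRES with the same preconditioner (a sketch; it assumes A, b, M2 and nmax_iter as defined in the question's script):

from scipy.sparse.linalg import gmres

x_ref, info = gmres(A, np.asarray(b).ravel(), M=M2, restart=nmax_iter)
print(info)  # 0 means it converged
print(np.linalg.norm(np.asarray(b).ravel() - A.dot(x_ref)))  # residual, ~0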
