Is there an elegant, NumPy way to apply the dot product elementwise? Or how can the code below be translated into a nicer version?
m0  # shape (5, 3, 2, 2)
m1  # shape (5, 2, 2)
r = np.empty((5, 3, 2, 2))
for i in range(5):
    for j in range(3):
        r[i, j] = np.dot(m0[i, j], m1[i])
Thanks in advance!
Approach #1
Use np.einsum -
np.einsum('ijkl,ilm->ijkm',m0,m1)
Steps involved:
Keep the first axes from the inputs aligned.
Sum-reduce the last axis of m0 against the second axis of m1.
Let the remaining axes from m0 and m1 spread out with elementwise multiplications in an outer-product fashion.
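If your NumPy supports broadcasting with np.matmul (an assumption about your setup, not something stated in the question), the same contraction can also be written without einsum by adding a singleton axis to m1 -

# Sketch assuming broadcasting np.matmul is available (NumPy >= 1.10);
# m1[:, None] has shape (5, 1, 2, 2), which broadcasts against m0's (5, 3, 2, 2)
r = np.matmul(m0, m1[:, None])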
Approach #2
If you are looking for performance and the axis of sum-reduction is short, you are better off with one loop and matrix multiplication via np.tensordot, like so -
s0,s1,s2,s3 = m0.shape
s4 = m1.shape[-1]
r = np.empty((s0,s1,s2,s4))
for i in range(s0):
    r[i] = np.tensordot(m0[i],m1[i],axes=([2],[0]))
Approach #3
Now, np.dot can be used efficiently on 2D inputs for a further performance boost. So, the modified version below is a bit longer, but hopefully the most performant one -
s0,s1,s2,s3 = m0.shape
s4 = m1.shape[-1]
m0.shape = s0,s1*s2,s3 # Get m0 as 3D for temporary usage
r = np.empty((s0,s1*s2,s4))
for i in range(s0):
    r[i] = m0[i].dot(m1[i])
r.shape = s0,s1,s2,s4
m0.shape = s0,s1,s2,s3 # Put m0 back to 4D
Runtime test
Function definitions -
def original_app(m0, m1):
    s0,s1,s2,s3 = m0.shape
    s4 = m1.shape[-1]
    r = np.empty((s0,s1,s2,s4))
    for i in range(s0):
        for j in range(s1):
            r[i, j] = np.dot(m0[i, j], m1[i])
    return r

def einsum_app(m0, m1):
    return np.einsum('ijkl,ilm->ijkm',m0,m1)

def tensordot_app(m0, m1):
    s0,s1,s2,s3 = m0.shape
    s4 = m1.shape[-1]
    r = np.empty((s0,s1,s2,s4))
    for i in range(s0):
        r[i] = np.tensordot(m0[i],m1[i],axes=([2],[0]))
    return r

def dot_app(m0, m1):
    s0,s1,s2,s3 = m0.shape
    s4 = m1.shape[-1]
    m0.shape = s0,s1*s2,s3  # Get m0 as 3D for temporary usage
    r = np.empty((s0,s1*s2,s4))
    for i in range(s0):
        r[i] = m0[i].dot(m1[i])
    r.shape = s0,s1,s2,s4
    m0.shape = s0,s1,s2,s3  # Put m0 back to 4D
    return r
Timings and verification -
In [291]: # Inputs
...: m0 = np.random.rand(50,30,20,20)
...: m1 = np.random.rand(50,20,20)
...:
In [292]: out1 = original_app(m0, m1)
...: out2 = einsum_app(m0, m1)
...: out3 = tensordot_app(m0, m1)
...: out4 = dot_app(m0, m1)
...:
...: print np.allclose(out1, out2)
...: print np.allclose(out1, out3)
...: print np.allclose(out1, out4)
...:
True
True
True
In [293]: %timeit original_app(m0, m1)
...: %timeit einsum_app(m0, m1)
...: %timeit tensordot_app(m0, m1)
...: %timeit dot_app(m0, m1)
...:
100 loops, best of 3: 10.3 ms per loop
10 loops, best of 3: 31.3 ms per loop
100 loops, best of 3: 5.12 ms per loop
100 loops, best of 3: 4.06 ms per loop
I think numpy.inner() is what you really want?
I have a NumPy array full of indices:
size = 100000
idx = np.random.randint(0, size, size=size)
And I have a simple function that loops over the indices and does:
out = np.zeros(size, dtype=np.int)
for i in range(size):
    j = idx[i]
    out[min(i, j)] = out[min(i, j)] + 1
    out[max(i, j)] = out[max(i, j)] - 1
return np.cumsum(out)
This is quite slow when size is large and I am hoping to find a faster way to accomplish this. I've tried this but it isn't quite right:
out = np.zeros(size, dtype=np.int)
i = np.arange(size)
j = idx[i]
mini = np.minimum(i, j)
maxi = np.maximum(i, j)
out[mini] = out[mini] + 1
out[maxi] = out[maxi] - 1
return np.cumsum(out)
We can make use of np.bincount -
R = np.arange(size)
out = np.bincount(np.minimum(R,idx),minlength=size)
out -= np.bincount(np.maximum(R,idx),minlength=size)
final_out = out.cumsum()
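As a quick sanity check (a small, self-contained sketch; the seed and size here are arbitrary, not from the question), the bincount version can be compared against the loopy one -

import numpy as np

np.random.seed(0)                       # arbitrary seed, just for reproducibility
small_size = 8
small_idx = np.random.randint(0, small_size, size=small_size)

# Loopy reference from the question
ref = np.zeros(small_size, dtype=int)
for i in range(small_size):
    j = small_idx[i]
    ref[min(i, j)] += 1
    ref[max(i, j)] -= 1
ref = np.cumsum(ref)

# bincount-based version from above
R = np.arange(small_size)
chk = np.bincount(np.minimum(R, small_idx), minlength=small_size)
chk -= np.bincount(np.maximum(R, small_idx), minlength=small_size)
print(np.array_equal(ref, chk.cumsum()))   # True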
Timings -
All posted solutions use cumsum at the end. So, let's time these skipping that last step -
In [25]: np.random.seed(0)
...: size = 100000
...: idx = np.random.randint(0, size, size=size)
# From this post
In [27]: %%timeit
...: R = np.arange(size)
...: out = np.bincount(np.minimum(R,idx),minlength=size)
...: out -= np.bincount(np.maximum(R,idx),minlength=size)
1000 loops, best of 3: 643 µs per loop
# #slaw's solution
In [28]: %%timeit
...: i = np.arange(size)
...: j = idx[i]
...: mini = np.minimum(i, j)
...: maxi = np.maximum(i, j)
...:
...: unique_mini, mini_counts = np.unique(mini, return_counts=True)
...: unique_maxi, maxi_counts = np.unique(maxi, return_counts=True)
...:
...: out = np.zeros(size, dtype=np.int)
...: out[unique_mini] = out[unique_mini] + mini_counts
...: out[unique_maxi] = out[unique_maxi] - maxi_counts
100 loops, best of 3: 13.3 ms per loop
# Loopy one from question
In [29]: %%timeit
...: out = np.zeros(size, dtype=np.int)
...:
...: for i in range(size):
...:     j = idx[i]
...:     out[min(i, j)] = out[min(i, j)] + 1
...:     out[max(i, j)] = out[max(i, j)] - 1
10 loops, best of 3: 141 ms per loop
This seems to give the same answer as the for-loop
i = np.arange(size)
j = idx[i]
mini = np.minimum(i, j)
maxi = np.maximum(i, j)
unique_mini, mini_counts = np.unique(mini, return_counts=True)
unique_maxi, maxi_counts = np.unique(maxi, return_counts=True)
out = np.zeros(size, dtype=np.int)
out[unique_mini] = out[unique_mini] + mini_counts
out[unique_maxi] = out[unique_maxi] - maxi_counts
return np.cumsum(out)
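Another option (not from the original answers, just a sketch of an alternative): np.add.at accumulates over repeated indices, which is exactly what the plain fancy-indexing assignment in the question's attempt misses -

# Alternative sketch: unbuffered in-place addition accumulates repeated indices,
# which plain fancy-indexing assignment (as in the question's attempt) does not
out = np.zeros(size, dtype=int)
i = np.arange(size)
np.add.at(out, np.minimum(i, idx), 1)
np.add.at(out, np.maximum(i, idx), -1)
result = np.cumsum(out)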
Given two NumPy arrays, say:
import numpy as np
import numpy.random as rand
n = 1000
x = rand.binomial(n=1, p=.5, size=(n, 10))
y = rand.binomial(n=1, p=.5, size=(n, 10))
Is there a more efficient way to compute X in the following:
X = np.zeros((n, n))
for i in range(n):
    for j in range(n):
        X[i, j] = 1 * np.all(x[i] == y[j])
Approach #1 : Input arrays with 0s & 1s
For input arrays with 0s and 1s only, we can pack each row down to a scalar, hence reducing the input arrays to 1D, and then leverage broadcasting, like so -
n = x.shape[1]
s = 2**np.arange(n)
x1D = x.dot(s)
y1D = y.dot(s)
Xout = (x1D[:,None] == y1D).astype(float)
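To see why the packing works (a tiny illustrative example, not from the original answer): each 0/1 row maps to a distinct integer via the powers-of-2 dot product, so comparing the packed scalars is equivalent to comparing whole rows. This assumes the row length (here 10 bits) stays small enough to fit the integer dtype -

a = np.array([[1, 0, 1],
              [0, 1, 1]])
s = 2**np.arange(a.shape[1])   # [1, 2, 4]
print(a.dot(s))                # [5 6] -> one distinct integer per distinct row pattern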
Approach #2 : Generic case
For a generic case, we can use views -
# https://stackoverflow.com/a/45313353/ #Divakar
def view1D(a, b):  # a, b are arrays
    a = np.ascontiguousarray(a)
    b = np.ascontiguousarray(b)
    void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
    return a.view(void_dt).ravel(), b.view(void_dt).ravel()
x1D, y1D = view1D(x, y)
Xout = (x1D[:,None] == y1D).astype(float)
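A note on what view1D does (my reading of the trick, stated as an assumption rather than taken from the post): each row is reinterpreted as a single np.void element spanning the row's bytes, so the broadcasted == compares whole rows byte-for-byte; the np.ascontiguousarray calls make sure those bytes are laid out contiguously. A tiny illustration -

a = np.array([[1, 0], [1, 0], [0, 1]], dtype=np.uint8)
void_dt = np.dtype((np.void, a.dtype.itemsize * a.shape[1]))
a1D = a.view(void_dt).ravel()
print(a1D.shape)                          # (3,) -- one opaque element per row
print((a1D[:, None] == a1D).astype(int))  # 3x3 row-equality matrix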
Runtime test
# Setup
In [287]: np.random.seed(0)
...: n = 1000
...: x = rand.binomial(n=1, p=.5, size=(n, 10))
...: y = rand.binomial(n=1, p=.5, size=(n, 10))
# Original approach
In [288]: %%timeit
...: X = np.zeros((n, n))
...: for i in range(n):
...:     for j in range(n):
...:         X[i, j] = 1 * np.all(x[i] == y[j])
1 loop, best of 3: 4.69 s per loop
# Approach #1
In [290]: %%timeit
...: n = x.shape[1]
...: s = 2**np.arange(n)
...: x1D = x.dot(s)
...: y1D = y.dot(s)
...: Xout = (x1D[:,None] == y1D).astype(float)
1000 loops, best of 3: 1.42 ms per loop
# Approach #2
In [291]: %%timeit
...: x1D, y1D = view1D(x, y)
...: Xout = (x1D[:,None] == y1D).astype(float)
100 loops, best of 3: 18.5 ms per loop
Here is the functionality demonstrated on a fixed number of matrices:
x = np.matrix('0.5')
y = np.matrix('0.5 0.5; 0.5 0.5')
z = np.matrix('0.75 0.25; 0.34 0.66')
output = []
for i in x.flat:
    for j in y.flat:
        for k in z.flat:
            output.append(i * j * k)
I need help solving this issue on a variable number of matrices. I have tried using
reduce(np.dot, arr)
But this is not what I want to do.
With A holding the list of input matrices, we can iteratively apply np.outer. Since np.outer flattens its inputs on its own, we don't need to do that ourselves; only a final flattening step is needed.
Thus, the solution would be -
A = [x,y,z,w]
out = A[0]
for i in A[1:]:
    out = np.outer(out, i)
out = out.ravel()
Note that the output would be an array. If needed as a matrix, simply wrap it with np.matrix() at the end.
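Incidentally, the reduce pattern from the question should also work here if np.dot is swapped for np.outer (a one-liner sketch, equivalent to the loop above thanks to np.outer's own flattening) -

from functools import reduce
out = reduce(np.outer, A).ravel()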
Sample run for 4 matrices -
In [38]: x = np.matrix('0.5')
...: y = np.matrix('0.15 0.25; 0.35 0.45')
...: z = np.matrix('0.75 0.25; 0.34 0.66')
...: w = np.matrix('0.45 0.15; 0.8 0.2')
...:
...: output = []
...: for i in x.flat:
...:     for j in y.flat:
...:         for k in z.flat:
...:             for l in w.flat:
...:                 output.append(i * j * k * l)
...:
In [64]: A = [x,y,z,w]
...: out = A[0]
...: for i in A[1:]:
...:     out = np.outer(out, i)
...: out = out.ravel()
...:
In [65]: np.allclose(output, out)
Out[65]: True
I have a 3D numpy array of shape (t, n1, n2):
x = np.random.rand(10, 2, 4)
I need to calculate another 3D array y which is of shape (t, n1, n1) such that:
y[0] = np.cov(x[0,:,:])
...and so on for all slices along the first axis.
So, a loopy implementation would be:
y = np.zeros((10,2,2))
for i in np.arange(x.shape[0]):
    y[i] = np.cov(x[i, :, :])
Is there any way to vectorize this so I can calculate all covariance matrices in one go? I tried doing:
x1 = x.swapaxes(1, 2)
y = np.dot(x, x1)
But it didn't work.
Digging into the numpy.cov source code and using its default parameters, it turns out that np.cov(x[i,:,:]) boils down to simply:
N = x.shape[2]
m = x[i,:,:]
m = m - np.sum(m, axis=1, keepdims=True) / N  # not in-place, so the view of x isn't modified
cov = np.dot(m, m.T) /(N - 1)
So, the task was to vectorize this loop, iterating through i and processing all of the data from x in one go. For that, we can use broadcasting at the mean-subtraction step. The final step performs a sum-reduction along the last axis for all slices of the first axis, which can be implemented efficiently in a vectorized manner with np.einsum. Thus, the final implementation comes to this -
N = x.shape[2]
m1 = x - x.sum(2,keepdims=1)/N
y_out = np.einsum('ijk,ilk->ijl',m1,m1) /(N - 1)
Runtime test
In [155]: def original_app(x):
...:     n = x.shape[0]
...:     y = np.zeros((n,2,2))
...:     for i in np.arange(x.shape[0]):
...:         y[i]=np.cov(x[i,:,:])
...:     return y
...:
...: def proposed_app(x):
...:     N = x.shape[2]
...:     m1 = x - x.sum(2,keepdims=1)/N
...:     out = np.einsum('ijk,ilk->ijl',m1,m1) / (N - 1)
...:     return out
...:
In [156]: # Setup inputs
...: n = 10000
...: x = np.random.rand(n,2,4)
...:
In [157]: np.allclose(original_app(x),proposed_app(x))
Out[157]: True # Results verified
In [158]: %timeit original_app(x)
1 loops, best of 3: 610 ms per loop
In [159]: %timeit proposed_app(x)
100 loops, best of 3: 6.32 ms per loop
Huge speedup there!
I have a 3-Dimensional numpy array A. I would like to multiply every element A[i,j,k] by w*( i / Lx + j / Ly + k / Lz ) where w, Lx, Ly and Lz are real numbers (floats). Performing this operation in a for loop is highly impractical, since I need to be able to scale this for large arrays and a for loop over the three indices ijk scales in O(N^3).
Is there an efficient way to perform an operation on each element of a numpy array that depends on the element's indices?
You can use broadcasting -
M,N,R = A.shape
p1 = np.arange(M)[:,None,None]/Lx
p2 = np.arange(N)[:,None]/Ly
p3 = np.arange(R)/Lz
out = A/(w*(p1 + p2 + p3))
You can also use np.ix_ for a more elegant solution -
M,N,R = A.shape
X,Y,Z = np.ix_(np.arange(M),np.arange(N),np.arange(R))
out = A/(w*((X/Lx) + (Y/Ly) + (Z/Lz)))
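For reference (a small illustration, not from the original answer), np.ix_ just returns open-grid index arrays with singleton axes, which is what lets the three terms broadcast out to the full (M, N, R) shape -

X, Y, Z = np.ix_(np.arange(2), np.arange(3), np.arange(4))
print(X.shape, Y.shape, Z.shape)   # (2, 1, 1) (1, 3, 1) (1, 1, 4)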
Runtime tests and output verification -
Function definitions:
def vectorized_app1(A, w, Lx, Ly, Lz):
    M,N,R = A.shape
    p1 = np.arange(M)[:,None,None]/Lx
    p2 = np.arange(N)[:,None]/Ly
    p3 = np.arange(R)/Lz
    return A/(w*(p1 + p2 + p3))

def vectorized_app2(A, w, Lx, Ly, Lz):
    M,N,R = A.shape
    X,Y,Z = np.ix_(np.arange(M),np.arange(N),np.arange(R))
    return A/(w*((X/Lx) + (Y/Ly) + (Z/Lz)))

def original_app(A, w, Lx, Ly, Lz):
    out = np.empty_like(A)
    M,N,R = A.shape
    for i in range(M):
        for j in range(N):
            for k in range(R):
                out[i,j,k] = A[i,j,k]/(w*( (i / Lx) + (j / Ly) + (k / Lz) ))
    return out
Timings:
In [197]: # Inputs
...: A = np.random.rand(100,100,100)
...: w, Lx, Ly, Lz = 2.3, 3.2, 4.2, 5.2
...:
In [198]: np.allclose(original_app(A,w,Lx,Ly,Lz),vectorized_app1(A,w,Lx,Ly,Lz))
Out[198]: True
In [199]: np.allclose(original_app(A,w,Lx,Ly,Lz),vectorized_app2(A,w,Lx,Ly,Lz))
Out[199]: True
In [200]: %timeit original_app(A, w, Lx, Ly, Lz )
1 loops, best of 3: 1.39 s per loop
In [201]: %timeit vectorized_app1(A, w, Lx, Ly, Lz )
10 loops, best of 3: 24.6 ms per loop
In [202]: %timeit vectorized_app2(A, w, Lx, Ly, Lz )
10 loops, best of 3: 24.2 ms per loop