vectorized implementation without using for loops

vectorized implementation without using for loops - python

I'm trying to implement this code but it's quite slow because of two for loops. Can anyone suggest vectorized version of this code, please?
import numpy as np
# Loop-based reference version: fill H with 1 wherever the grid point (u, v)
# lies within `thresh` of the centre (P/2, Q/2). This is deliberately the slow
# form (P*Q scalar calls) that the question asks to vectorize.
# NOTE: `dist` must already be defined when this loop executes.
P, Q = 1000, 1000
thresh = 100
H = np.zeros((P, Q))  # the original was missing the closing parenthesis
for u in range(P):
    for v in range(Q):
        if dist(u, v, P, Q) <= thresh:
            # Write into H (the array allocated above); the original wrote to
            # an undefined name, H_LP.
            H[u, v] = 1
def dist(u, v, p, q):
    """Return the Euclidean distance from (u, v) to the point (p / 2, q / 2).

    Only NumPy ufuncs and arithmetic are used, so `u` and `v` may be scalars
    or arrays; arrays are processed elementwise.
    """
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

Try this.
import numpy as np
# Vectorized construction: distance of every grid point (u, v) from the centre
# (P/2, Q/2), then a 0/1 array marking the points within `thresh`.
P, Q = 1000, 1000
thresh = 100
u = np.arange(P)
v = np.arange(Q)
# Outer sum of the squared row offsets and squared column offsets gives the
# full (P, Q) grid of squared distances without any Python loop.
dist_mat = np.sqrt(np.add.outer(np.square(u - P / 2), np.square(v - Q / 2)))
# Boolean mask converted to float64: 1.0 inside the radius, 0.0 outside.
H = (dist_mat <= thresh).astype(np.float64)

I think numba can speed up your code
import numpy as np
import numba
P, Q = 1000, 1000
thresh = 100
H = np.zeros((P, Q))  # the original was missing the closing parenthesis


# `dist` is compiled too and defined first: a nopython-compiled function can
# only call other compiled functions, not plain Python ones.
@numba.jit(nopython=True)
def dist(u, v, p, q):
    """Return the Euclidean distance from (u, v) to the point (p / 2, q / 2)."""
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))


@numba.jit(nopython=True)
def function(P, Q, thresh, H):
    """Set H[u, v] = 1 for every (u, v) within `thresh` of the centre (P/2, Q/2).

    H is modified in place and nothing is returned; call it as
    ``function(P, Q, thresh, H)``.
    """
    for u in range(P):
        for v in range(Q):
            if dist(u, v, P, Q) <= thresh:
                # Write into the H argument; the original wrote to an
                # undefined name, H_LP.
                H[u, v] = 1

import numpy as np
# `dist` is defined before its first use; in the original it was defined after
# the line that calls it, which raises NameError when run top to bottom.
def dist(u, v, p, q):
    """Return the Euclidean distance from (u, v) to the point (p / 2, q / 2)."""
    return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))


P, Q = 1000, 1000
thresh = 100
# idx_H[a, b] == (a, b): the row/column index pair of every grid cell.
idx_H = np.stack(np.indices((P, Q)), axis=-1)
# Boolean (P, Q) array: True where the cell is within `thresh` of the centre.
H_LP = dist(idx_H[..., 0], idx_H[..., 1], P, Q) <= thresh

Related

Speed-up computation on sub-matrices of a 4D tensor

I have an array whose input data is 1028* 24* 24*16. When I run the code below, it works very slowly. How can I speed this up? thanks
(I want to get 3*3 matrices from a large array.)
import itertools,math,time,random
import numpy as np
start=time.time()
def ls(x):
    """Return the largest entry of `x` after dividing `x` by the sum of all its entries."""
    x_p = x / np.sum(x)
    return np.max(x_p)
inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]
# Move the last axis next to the first and merge the two, so every 2-D
# (r, c) plane can be indexed with a single leading index.
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b*ch, r, c))
s=2
# NOTE: `r` is reused here. Above it held inputs.shape[1]; from here on it is
# the window size used in the slices below.
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
    for j in range(num_c):
        # One Python-level call to `ls` per plane for each (i, j) window
        # position: this map/list round trip is the slow part of the script.
        outs = np.array(list(map(ls, inputs[:, i*s:i*s+r, j*s:j*s+r])))
        ke[:, i, j] = outs
# Undo the merge and the transpose applied to `inputs` above.
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)

The original code can be wrapped (with some polishing) into the following function:
import numpy as np
def foo0(arr, s=2, r=3, nr=9, nc=9):
    """Return max(window) / sum(window) for strided windows of a 4-D array.

    `arr` is unpacked as a 4-D array; axes 1 and 2 are the ones the windows
    slide over. For each of the `nr` x `nc` window positions (i, j), the
    window covers rows ``i*s : i*s + r`` and columns ``j*s : j*s + r``.
    The result has shape ``(arr.shape[0], nr, nc, arr.shape[3])`` and dtype
    float32.

    This is the baseline version: one Python-level lambda call per plane.
    """
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    # Use distinct names for the shape. The original unpacked into `r`, which
    # overwrote the window-size parameter with the row count.
    b, rows, cols, ch = arr.shape
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, rows, cols))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j] = np.fromiter(
                map(
                    lambda x: np.max(x / np.sum(x)),
                    arr[:, i * s:i * s + r, j * s:j * s + r]),
                dtype=np.float32)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
While the explicit looping suggests that Numba could be applied here to pick some low-hanging fruit, unfortunately the function cannot be readily decorated without interacting with Python objects, which greatly reduces the speed-up.
Fortunately, the core computation can be readily vectorized and, as long as nr and nc are small enough, this optimization is sufficient:
def foo1(arr, s=2, r=3, nr=9, nc=9):
    """Return max(window) / sum(window) for strided windows of a 4-D array.

    Same contract as the loop-per-plane version, but each (i, j) window
    position is reduced for all planes at once with NumPy reductions.
    The window at position (i, j) covers rows ``i*s : i*s + r`` and columns
    ``j*s : j*s + r``. The result has shape
    ``(arr.shape[0], nr, nc, arr.shape[3])`` and dtype float32.
    """
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    # Use distinct names for the shape. The original unpacked into `r`, which
    # overwrote the window-size parameter with the row count.
    b, rows, cols, ch = arr.shape
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, rows, cols))
    result = np.zeros((b * ch, nr, nc), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s + r, j * s:j * s + r]
            # max(x / k) == max(x) / k for k > 0, so divide once per window
            # instead of once per element.
            result[:, i, j] = np.max(x, (-1, -2)) / np.sum(x, (-1, -2))
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
(The above foo1() is essentially equivalent to @ymmx's answer with some additional optimizations.)
Note that max(x / k) is the same as max(x) / k (for positive k, as is the case here), but the number of divisions is greatly reduced.
Actually, transposing and reshaping, while they may help with the computation speed, are not really necessary:
def foo2(arr, s=2, r=3, nr=9, nc=9):
    """Return max(window) / sum(window) for strided windows of a 4-D array.

    Works directly on the 4-D input (no transpose/reshape): windows slide
    over axes 1 and 2, and the reductions run over those two axes. The window
    at position (i, j) covers rows ``i*s : i*s + r`` and columns
    ``j*s : j*s + r``. The result has shape
    ``(arr.shape[0], nr, nc, arr.shape[3])`` and dtype float32.
    """
    # Only the first and last extents are needed. The original unpacked the
    # shape into `r`, which overwrote the window-size parameter.
    b, _, _, ch = arr.shape
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            x = arr[:, i * s:i * s + r, j * s:j * s + r, :]
            result[:, i, j, :] = np.max(x, (1, 2)) / np.sum(x, (1, 2))
    return result
The above is simpler to translate in Numba, but the speed gain for small nr/nc would be minimal (compared to the partially vectorized approach):
import numba as nb
@nb.njit
def sum_nb(arr):
    """Return the sum of the elements of `arr`, accumulated with an explicit loop."""
    result = 0
    for x in arr:
        result += x
    return result
@nb.njit
def max_nb(arr):
    """Return the largest element of `arr`; `arr` must contain at least one element."""
    result = arr[0]
    for x in arr[1:]:
        if x > result:
            result = x
    return result
@nb.njit
def _sum_max(arr):
    """Return max / sum over axes 1 and 2 of a 4-D array.

    The result has shape ``(arr.shape[0], arr.shape[3])`` and the dtype of
    `arr`.
    """
    b, r, c, ch = arr.shape
    res = np.empty((b, ch), dtype=arr.dtype)
    for i in range(b):
        for j in range(ch):
            # Flatten the 2-D plane so the 1-D helpers can iterate over it.
            x = arr[i, :, :, j].ravel()
            res[i, j] = max_nb(x) / sum_nb(x)
    return res
@nb.njit
def foo3(arr, s=2, r=3, nr=9, nc=9):
    """Return max(window) / sum(window) for strided windows of a 4-D array.

    Numba-compiled variant: each (i, j) window is reduced by `_sum_max`.
    The window at position (i, j) covers rows ``i*s : i*s + r`` and columns
    ``j*s : j*s + r``. The result has shape
    ``(arr.shape[0], nr, nc, arr.shape[3])`` and dtype float32.
    """
    # Only the first and last extents are needed. The original unpacked the
    # shape into `r`, which overwrote the window-size parameter.
    b, _, _, ch = arr.shape
    result = np.zeros((b, nr, nc, ch), dtype=np.float32)
    for i in range(nr):
        for j in range(nc):
            result[:, i, j, :] = _sum_max(arr[:, i * s:i * s + r, j * s:j * s + r, :])
    return result
Another option would be to keep the Numba-incompatible code outside of the main looping:
@nb.njit(fastmath=True)
def _foo4(arr, result, s, r, nr, nc):
    """Fill `result[k, i, j]` with max / sum of the (i, j) window of plane `arr[k]`.

    `arr` is indexed as a 3-D array and `result` as a 3-D array; `result` is
    written in place and also returned.
    """
    # The loop bounds come from `result` itself; this rebinds the `nr` and
    # `nc` arguments to result.shape[1] and result.shape[2].
    bch, nr, nc = result.shape
    for i in range(nr):
        for j in range(nc):
            for k in range(bch):
                x = arr[k, i * s:i * s + r, j * s:j * s + r].ravel()
                result[k, i, j] = max_nb(x) / sum_nb(x)
    return result
def foo4(arr, s=2, r=3, nr=9, nc=9):
    """Return max(window) / sum(window) for strided windows of a 4-D array.

    Pure-Python wrapper that does the transpose/reshape bookkeeping and hands
    the triple loop to the compiled `_foo4`. The result has shape
    ``(arr.shape[0], nr, nc, arr.shape[3])``; it is allocated with the
    default dtype of ``np.empty``.
    """
    forward_axes = (0, 3, 1, 2)
    backward_axes = (0, 2, 3, 1)
    # Use distinct names for the shape. The original unpacked into `r`, so the
    # row count rather than the window size was passed on to `_foo4`.
    b, rows, cols, ch = arr.shape
    arr = np.transpose(arr, forward_axes)
    arr = np.reshape(arr, (b * ch, rows, cols))
    result = np.empty((b * ch, nr, nc))
    result = _foo4(arr, result, s, r, nr, nc)
    result = np.reshape(result, (b, ch, nr, nc))
    result = np.transpose(result, backward_axes)
    return result
but again the speed gain would be minimal.
Note that a fully vectorized approach is unlikely to be efficient, because the objects inside the main loops are jagged.
To get some ideas of the relative speed:
funcs = foo0, foo1, foo2, foo3, foo4
arr = np.random.rand(100, 24, 24, 16)
timeds_n = {}
for p in range(1):
n = 10 ** p
k = 3
arr = np.random.rand(100, 24, 24, 16)
print(f"N = {arr.size}")
base = funcs[0](arr)
timeds_n[n] = []
for func in funcs:
res = func(arr)
timed = %timeit -r 1 -n 1 -q -o func(arr)
timeds_n[n].append(timed.best)
print(f"{func.__name__:>24} {np.allclose(base, res)} {timed.best:.9f}")
N = 921600
foo0 True 1.757508748
foo1 True 0.095540081
foo2 True 0.179208341
foo3 True 0.160671403
foo4 True 0.155691721

I think the issue is mainly the function ls which should be vectorized and the list / map that takes you time
import itertools,math,time,random
import numpy as np
start=time.time()
def ls(x):
    """Return, for each 2-D plane of a 3-D array, its max divided by its sum.

    `x` is indexed as (plane, row, column); every plane is normalised by the
    sum of its own entries and the largest normalised entry is returned, one
    value per plane.
    """
    # Sum over rows, then columns; [:, None, None] restores two trailing axes
    # so the per-plane totals broadcast against x.
    x_p = x / np.sum(np.sum(x, axis=1), axis=1)[:, None, None]
    return np.max(np.max(x_p, axis=1), axis=1)
inputs = np.random.rand(1028, 24, 24, 16)
b, r, c, ch = inputs.shape[0], inputs.shape[1], inputs.shape[2], inputs.shape[3]
# Move the last axis next to the first and merge the two, so every 2-D
# (r, c) plane can be indexed with a single leading index.
inputs = np.transpose(inputs, (0, 3, 1, 2))
inputs = np.reshape(inputs, (b*ch, r, c))
s=2
# NOTE: `r` is reused here. Above it held inputs.shape[1]; from here on it is
# the window size used in the slices below.
r=3
num_r=9
num_c=9
ke = np.zeros((b*ch, num_r, num_c), dtype=np.float32)
for i in range(num_r):
    print(i)
    for j in range(num_c):
        # Replaced per-plane call, kept for comparison with the question:
        # outs = np.array(list(map(ls, inputs[:, i*s:i*s+r, j*s:j*s+r])))
        outs = ls(inputs[:, i*s:i*s+r, j*s:j*s+r])
        ke[:, i, j] = outs
# Undo the merge and the transpose applied to `inputs` above.
ke = np.reshape(ke, (b, ch, num_r, num_c))
ke = np.transpose(ke, (0, 2, 3, 1))
print(time.time() - start)

Faster way to iterate through pixel using numpy with conditions?

def colorize(im, h, s, l_adjust):
    """Recolour `im` pixel by pixel with hue `h` and saturation `s`.

    For every pixel, a lightness value is obtained from `currentRGB` (a
    helper that is not part of this snippet) and converted to RGB with
    ``colorsys.hls_to_rgb(h, lum, s)``; alpha is set to 255.
    `l_adjust` is not used in this version.

    Returns an RGBA image built from the computed pixels.
    """
    result = Image.new('RGBA', im.size)
    pixin = np.copy(im)
    # np.array(result) is a separate array: writing to it does not touch
    # `result`.
    pixout = np.array(result)
    # >>> loop (the slow part the question is about) <<<
    for y in range(pixout.shape[1]):
        for x in range(pixout.shape[0]):
            lum = currentRGB(pixin[x, y][0], pixin[x, y][1], pixin[x, y][2])
            r, g, b = colorsys.hls_to_rgb(h, lum, s)
            r, g, b = int(r * 255.99), int(g * 255.99), int(b * 255.99)
            pixout[x, y] = (r, g, b, 255)
    # >>> loop end <<<
    # The original returned `result`, the blank image created above, which is
    # never modified; build the returned image from the filled pixel array.
    return Image.fromarray(pixout)
I am trying to compute the HSL value of every pixel in a frame of input video, but it takes too long: about 1.5 s per frame, and I want to bring it down to 0.3 s or less. Is there a faster way to do this without these two loops? I am looking for something like a LUT (look-up table), vectorization, or a NumPy shortcut that avoids the two loops. Thanks.
OR
Part 2 ->>
If I break the custom currentRGB() into the for loops it looks like :
def colorize(im, h, s, l_adjust):
    """Recolour `im` pixel by pixel with hue `h` and saturation `s`.

    For every pixel, a luminance is computed from the R, G, B channels
    (scaled by 1/255) with weights 0.2126 / 0.7152 / 0.0722, adjusted by
    `l_adjust`, and used as the lightness in
    ``colorsys.hls_to_rgb(h, l, s)``; alpha is set to 255.

    Returns the filled RGBA pixel array.
    """
    result = Image.new('RGBA', im.size)
    pixin = np.copy(im)
    pixout = np.array(result)
    for y in range(pixout.shape[1]):
        for x in range(pixout.shape[0]):
            currentR, currentG, currentB = pixin[x, y][0]/255 , pixin[x, y][1]/255, pixin[x, y][2]/255
            # luminance
            lum = (currentR * 0.2126) + (currentG * 0.7152) + (currentB * 0.0722)
            if l_adjust > 0:
                # Positive adjustment: compress the range, then shift it up.
                lum = lum * (1 - l_adjust)
                lum = lum + (1.0 - (1.0 - l_adjust))
            else:
                # Zero or negative adjustment: scale the luminance down.
                lum = lum * (l_adjust + 1)
            l = lum
            r, g, b = colorsys.hls_to_rgb(h, l, s)
            r, g, b = int(r * 255.99), int(g * 255.99), int(b * 255.99)
            pixout[x, y] = (r, g, b, 255)
    return pixout

You can use Numba to drastically speed the computation up. Here is the implementation:
import numba as nb
@nb.njit('float32(float32,float32,float32)')
def hue_to_rgb(p, q, t):
    """Return one RGB channel value for hue offset `t`, given the bounds `p` and `q`.

    `t` is first wrapped once into [0, 1]; the result is a piecewise-linear
    function of `t` that takes values between `p` and `q`.
    """
    if t < 0: t += 1
    if t > 1: t -= 1
    if t < 1./6: return p + (q - p) * 6 * t
    if t < 1./2: return q
    if t < 2./3: return p + (q - p) * (2./3 - t) * 6
    return p
@nb.njit('UniTuple(uint8,3)(float32,float32,float32)')
def hls_to_rgb(h, l, s):
    """Convert hue `h`, lightness `l` and saturation `s` to an (r, g, b) tuple.

    Each returned channel is ``int(value * 255.99)``; the declared return
    type is a tuple of three uint8.
    """
    if s == 0:
        # achromatic
        r = g = b = l
    else:
        q = l * (1 + s) if l < 0.5 else l + s - l * s
        p = 2 * l - q
        r = hue_to_rgb(p, q, h + 1./3)
        g = hue_to_rgb(p, q, h)
        b = hue_to_rgb(p, q, h - 1./3)
    return (int(r * 255.99), int(g * 255.99), int(b * 255.99))
@nb.njit('void(uint8[:,:,::1],uint8[:,:,::1],float32,float32,float32)', parallel=True)
def colorize_numba(pixin, pixout, h, s, l_adjust):
    """Fill `pixout` with the recoloured version of `pixin`.

    Both arrays are 3-D uint8 arrays as declared in the signature; channels
    0..2 of `pixin` are read and channels 0..3 of `pixout` are written
    (channel 3 is set to 255). Nothing is returned.
    """
    # The outer loop is a prange, so Numba may run it in parallel.
    for x in nb.prange(pixout.shape[0]):
        for y in range(pixout.shape[1]):
            currentR, currentG, currentB = pixin[x, y, 0]/255 , pixin[x, y, 1]/255, pixin[x, y, 2]/255
            # luminance
            lum = (currentR * 0.2126) + (currentG * 0.7152) + (currentB * 0.0722)
            if l_adjust > 0:
                lum = lum * (1 - l_adjust)
                lum = lum + (1.0 - (1.0 - l_adjust))
            else:
                lum = lum * (l_adjust + 1)
            l = lum
            r, g, b = hls_to_rgb(h, l, s)
            pixout[x, y, 0] = r
            pixout[x, y, 1] = g
            pixout[x, y, 2] = b
            pixout[x, y, 3] = 255
def colorize(im, h, s, l_adjust):
    """Recolour `im` with hue `h` and saturation `s`; return the RGBA pixel array.

    Allocates the input copy and the output buffer, then delegates the
    per-pixel work to the compiled `colorize_numba`, which fills the buffer
    in place.
    """
    result = Image.new('RGBA', im.size)
    pixin = np.copy(im)
    pixout = np.array(result)
    colorize_numba(pixin, pixout, h, s, l_adjust)
    return pixout
This optimized parallel implementation is about 2000 times faster than the original code on my 6-core machine (on 800x600 images). The hls_to_rgb implementation comes from this post. Note that the signature strings in the @nb.njit decorators are not mandatory, but they enable Numba to compile the functions ahead of time instead of at the first call. For more information about the types, please read the Numba documentation.

Wiki example for Arnoldi iteration only works for real matrices?

The Wikipedia entry for the Arnoldi method provides a Python example that produces basis of the Krylov subspace of a matrix A. Supposedly, if A is Hermitian (i.e. if A == A.conj().T) then the Hessenberg matrix h generated by this algorithm is tridiagonal (source). However, when I use the Wikipedia code on a real-world Hermitian matrix, the Hessenberg matrix is not at all tridiagonal. When I perform the computation on the real part of A (so that A == A.T) then I do get a tridiagonal Hessenberg matrix, so there seems to be a problem with the imaginary components of A. Does anybody know why the Wikipedia code doesn't produce the expected results?
Working example:
import numpy as np
import matplotlib.pyplot as plt
from scipy.linalg import circulant
def arnoldi_iteration(A, b, n):
    """Run up to `n` steps of the Arnoldi iteration on `A`, starting from `b`.

    Returns ``(Q, h)`` where `Q` is an (m, n + 1) complex array whose columns
    are the generated Krylov vectors and `h` is an (n + 1, n) complex array
    of the projection coefficients. If a candidate vector has norm at most
    1e-12, iteration stops early and the remaining columns stay zero.

    NOTE: this is the version the question is about. The projection below
    uses ``np.dot(Q[:, j], v)`` without conjugating ``Q[:, j]``; that line is
    intentionally left as posted.
    """
    m = A.shape[0]
    # Builtin `complex` instead of the `np.complex` alias, which newer NumPy
    # releases no longer provide.
    h = np.zeros((n + 1, n), dtype=complex)
    Q = np.zeros((m, n + 1), dtype=complex)
    eps = 1e-12  # If v is shorter than this threshold it is the zero vector
    q = b / np.linalg.norm(b)  # Normalize the input vector
    Q[:, 0] = q  # Use it as the first Krylov vector
    for k in range(n):
        v = A.dot(q)  # Generate a new candidate vector
        for j in range(k + 1):  # Subtract the projections on previous vectors
            h[j, k] = np.dot(Q[:, j], v)
            v = v - h[j, k] * Q[:, j]
        # Keep the norm as a real number so the threshold test below does not
        # have to compare a complex array entry with a float.
        v_norm = np.linalg.norm(v)
        h[k + 1, k] = v_norm
        if v_norm > eps:  # Add the produced vector to the list, unless
            q = v / v_norm  # the zero vector is produced.
            Q[:, k + 1] = q
        else:  # If that happens, stop iterating.
            return Q, h
    return Q, h
# Build the test matrix A
N = 16
I = np.eye(N)
k = np.fft.fftfreq(N, 1.0 / N) + 0.5
alpha = 2e2 * np.linspace(0.1, 1.0, N)
c = np.fft.fft(alpha) / N
C = circulant(c)
A = np.einsum("i, ij, j->ij", k, C, k)

# Check that A equals its own conjugate transpose (i.e. A is Hermitian)
print(np.allclose(A, A.conj().T))

# Random start vector, seeded so the run is reproducible
np.random.seed(0)
v = np.random.rand(N)

# Arnoldi iteration on the complex matrix, then on its real part only
_, h = arnoldi_iteration(A, v, N)
_, h2 = arnoldi_iteration(np.real(A), v, N)

# Show the magnitudes of both coefficient matrices side by side
for _panel, _hess, _title in ((121, h, "Complex A"), (122, h2, "Real A")):
    plt.subplot(_panel)
    plt.imshow(np.abs(_hess))
    plt.title(_title)
plt.tight_layout()
plt.show()
Result:

After browsing through some conference presentation slides, I realised that at some point Q had to be conjugated when A is complex. The correct algorithm is posted below for reference, with the code change marked (note that this correction has also been submitted to the Wikipedia entry):
import numpy as np
def arnoldi_iteration(A, b, n):
    """Run up to `n` steps of the Arnoldi iteration on `A`, starting from `b`.

    Returns ``(Q, h)`` where `Q` is an (m, n + 1) complex array whose columns
    are the generated Krylov vectors and `h` is an (n + 1, n) complex array
    of the projection coefficients. If a candidate vector has norm at most
    1e-12, iteration stops early and the remaining columns stay zero.

    The projections use the conjugate of the previous vectors, so the
    routine is also valid for complex `A`.
    """
    m = A.shape[0]
    # Builtin `complex` instead of the `np.complex` alias, which newer NumPy
    # releases no longer provide.
    h = np.zeros((n + 1, n), dtype=complex)
    Q = np.zeros((m, n + 1), dtype=complex)
    eps = 1e-12  # vectors with a norm at or below this count as zero
    q = b / np.linalg.norm(b)
    Q[:, 0] = q
    for k in range(n):
        v = A.dot(q)
        for j in range(k + 1):
            h[j, k] = np.dot(Q[:, j].conj(), v)  # <-- Q needs conjugation!
            v = v - h[j, k] * Q[:, j]
        # Keep the norm as a real number so the threshold test below does not
        # have to compare a complex array entry with a float.
        v_norm = np.linalg.norm(v)
        h[k + 1, k] = v_norm
        if v_norm > eps:
            q = v / v_norm
            Q[:, k + 1] = q
        else:
            # The new direction vanished: stop and return what was built.
            return Q, h
    return Q, h

root-finding algorithm for a complex polynomial equation in python

I am trying to solve for the following equation with a simple algorithm. I am not sure if the algorithm that I'm using is the best one or not but it is the only way that I could think of.
In this equation, everything other than P is known, and I am trying to solve for it. N is an array of counts, and i is the channel number. S(n) is the probability of having a certain n, and C is the binomial coefficient of (n, r). Pi is the probability in channel i, and Pj is the probability in the previous channels within distance D of i. The code itself is not working, but I believe that the main problem is in the way that I am trying to solve the equation.
import numpy as np
import matplotlib.pyplot as plt
import math as ms
from scipy.misc import derivative
import scipy as sp
def next_guess(f, x):
    """Return the next Newton iterate ``x - f(x) / f'(x)``.

    The slope is estimated with a central difference of half-width 0.01,
    computed inline so the function does not depend on
    ``scipy.misc.derivative`` (removed from recent SciPy releases).
    """
    dx = 0.01
    slop = (f(x + dx) - f(x - dx)) / (2 * dx)
    return x - float(f(x))/slop
def my_newton(f, guess):
    """Return the result of 30 Newton steps on `f`, starting from `guess`.

    There is no convergence test: exactly 30 iterations are always run.
    """
    for _ in range(30):
        #print(guess)
        guess = next_guess(f, guess)
    return guess
def binomial(n, r):
    """Return the binomial coefficient n! / (r! * (n - r)!) as a float.

    The result comes from a true division, so it is a float even though the
    mathematical value is an integer.
    """
    dif = ms.factorial(n - r)
    n = ms.factorial(n)
    r = ms.factorial(r)
    return (n/(r*dif))
def wrap_func(x, S = np.array([0.1, 0.5, 0.2, 0.1, 0.1]), D = 1, N = np.array([10, 15, 20, 1, 13])):
    """Return an array with one `my_newton` result per entry of `x`.

    For a scalar `x` the result has length 1; otherwise it has length
    ``x.shape[0]``. Entry `i` uses ``N[i]`` and the start guess ``i / 100``.
    """
    # np.ndim is 0 for Python floats/ints and NumPy scalars alike, which the
    # original exact type test (`type(x) == float`) did not cover.
    if np.ndim(x) == 0:
        z = np.zeros(1)
    else:
        z = np.zeros(x.shape[0])
    N_tot = N.sum()
    n_max = S.shape[0]
    for i in range(z.shape[0]):
        # D is now forwarded; the original always passed D = 1 and ignored
        # the argument.
        # NOTE(review): `func(...)` is evaluated here and its *value* is
        # passed to my_newton, which treats its first argument as a callable.
        # This looks like it should be a function of the unknown; confirm the
        # intended unknown before changing it.
        z[i] += my_newton(func(x, S = S, D = D, N = N[i], n_max = n_max, N_tot = N_tot, i = i), i/100)
    return z
def func(x, S = np.array([0.1, 0.5, 0.2, 0.1, 0.1]), D = 1, N = 0, n_max = 5, N_tot = 10, i = 0):
    """Return ``N_tot * y - N`` where `y` is accumulated over n < n_max and r < n.

    Each term multiplies the running sum of ``S[n]``, the running sum of
    ``binomial(n, r)``, ``x**r`` and ``(1 - x - summ_x(...))**(n - r)``.

    NOTE(review): the source of this snippet lost its indentation. The `y`
    update is placed inside the `r` loop because it uses `r`, which would be
    undefined on the first pass (n = 0) if the update sat after that loop.
    Confirm against the intended formula.
    """
    S_sum = 0
    binom_sum = 0
    y = 0
    # Independent of n and r, so evaluate it once instead of once per term.
    others = summ_x(x, D, i, S, N, n_max, N_tot)
    for n in range(n_max):
        S_sum += S[n]
        for r in range(n):
            binom_sum += binomial(n, r)
            y += S_sum * binom_sum * (x**r) * (1 - x - others)**(n-r)
    return N_tot * y - N
def summ_x(x, D, i, S, N, n_max, N_tot):
    """Return the sum of `func` over the channels j in ``range(max(i - D - 1, 0), i - 1)``.

    Returns 0 when `i` is 0 or when that range is empty.
    """
    j_min = max(i - D - 1, 0)
    j_max = i - 1
    x_values = 0
    if i == 0:
        return x_values
    for j in range(j_min, j_max):
        # Evaluate channel j. The original passed `i` here, so func -> summ_x
        # -> func repeated with identical arguments and never terminated.
        x_values += func(x, S, D, N, n_max, N_tot, j)
    return x_values
# Sample grid, probabilities and random counts used by the driver below.
x = np.linspace(0, 1, 1000)
S = np.array([0.1, 0.5, 0.2, 0.1, 0.1])
N = np.random.choice(50, size=1000)

# Newton iteration on wrap_func from a start value of 0.1.
print(my_newton(wrap_func, 0.1))

# Plot wrap_func over the grid together with a thin zero line.
plt.plot(x, wrap_func(x, S=S, D=1, N=N))
plt.axhline(0, lw=0.5, color='grey')
#plt.plot(my_newton(wrap_func, 1), wrap_func(my_newton(wrap_func, 1), S = S, D = 1, N = N), 'd')
plt.show()

Python Scipy Minimize Not Working

I am trying to minimize over the vectors x, y, but the optimizer only satisfies the constraints and does no actual minimization.
e.g.:
initial input: x=[0.2,0.3,0.5] (the elements sum to 1); result: res.x=[0.,0.3,0.5] — it hasn't changed at all!
# -*- coding:utf8-*-
import random
import numpy as np
from scipy import optimize
import networkx as nx
import matplotlib.pyplot as plt
def Ud(x, X, Aj, G, p):
    """Return ``-weight * sum_i x[i] * P_i`` for the path `Aj`.

    `Aj` lists node ids; its last entry selects the node whose "weight" is
    read from ``G.node``. For each pair ``(x[i], X[i])``, ``P_i`` is the
    product of ``1 - p[n - 1]`` over the nodes `n` shared by ``Aj[:-1]`` and
    ``X[i]`` (1 if they share none).
    """
    lost = G.node[Aj[-1]]["weight"]
    path_nodes = set(Aj[:-1])
    total = 0
    # Pair each weight x[i] with its own set X[i]. The original nested the
    # two loops, which made the result (sum of x) * (sum of P): constant
    # whenever sum(x) is fixed, so an optimizer constrained to sum(x) == 1
    # had nothing to minimize.
    for xi, Xi in zip(x, X):
        P = 1
        # N=[random.randint(90,120) for _ in range(0,1)]
        for n in path_nodes.intersection(Xi):
            P *= (1 - p[n - 1])
        total += xi * P
    return -lost * total
### objective function for defenders ###
def min_Ud(x, X, A, G, p):
    """Return the negated minimum of ``Ud(x, X, Aj, G, p)`` over all `Aj` in `A`.

    Returns ``-inf`` when `A` is empty.
    """
    # Named `lowest` so the builtin `min` is not shadowed.
    lowest = float("inf")
    for Aj in A:
        candidate = Ud(x, X, Aj, G, p)
        if candidate < lowest:
            lowest = candidate
    return -lowest
### objective function for attackers ###
def Ua(a, X, A, G, p):
    """Return the sum over paired ``(a[j], A[j])`` and every `Xi` in `X` of ``weight_j * a[j] * P``.

    ``weight_j`` is the "weight" of the last node of ``A[j]`` read from
    ``G.node``; ``P`` is the product of ``1 - p[n - 1]`` over the nodes `n`
    shared by ``A[j][:-1]`` and `Xi` (1 if they share none).
    """
    total = 0
    # Iterate the two sequences in lockstep. The original `for aj, Aj in a, A`
    # iterates over the 2-tuple (a, A) and tries to unpack each of them.
    for aj, Aj in zip(a, A):
        weight = G.node[Aj[-1]]["weight"]
        path_nodes = set(Aj[:-1])
        for Xi in X:
            P = 1
            for n in path_nodes.intersection(Xi):
                P *= (1 - p[n - 1])
            total += weight * aj * P
    return total
### fun for LP ###
def coreLP(X, A, G, p):
    """Run two constrained minimizations of `min_Ud` and return both solution vectors.

    Both runs use three variables bounded to [0, 1] with the equality
    constraint that the variables sum to 1; they differ only in the start
    point. The two result objects are printed, and ``(Ud_star.x, Ua_star.x)``
    is returned.
    """
    x0 = np.array([0.7, 0.2, 0.1])
    a0 = np.array([0.5, 0.2, 0.3])

    def c1(x):
        # Equality constraint: zero exactly when the entries sum to 1.
        return x.sum() - 1

    cons = ({'type': 'eq', 'fun': c1})
    Ud_star = optimize.minimize(min_Ud, x0, args=(X, A, G, p), constraints=cons, bounds=((0, 1), (0, 1), (0, 1)))
    # NOTE(review): this second run also minimizes min_Ud, only from a0.
    # Given its name, it presumably was meant to optimize the attacker
    # objective `Ua`; confirm before changing.
    Ua_star = optimize.minimize(min_Ud, a0, args=(X, A, G, p), constraints=cons, bounds=((0, 1), (0, 1), (0, 1)))
    # Call form of print: valid on Python 3 and prints the same single value
    # on Python 2.
    print(Ud_star)
    print(Ua_star)
    return Ud_star.x, Ua_star.x

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

vectorized implementation without using for loops - python

Try this. import numpy as np P,Q = 1000,1000 thresh = 100 u = np.arange(P) v = np.arange(Q) dist_mat = np.sqrt(((u - P/2)**2)[:, None] + ((v - Q/2)**2)[None, :]) H = np.zeros((P, Q)) H[dist_mat <= thresh] = 1

import numpy as np P,Q = 1000,1000 thresh = 100 idx_H = np.stack(np.indices((P, Q)), axis=-1) H_LP = dist(idx_H[..., 0], idx_H[..., 1], P, Q) <= thresh def dist(u, v, p, q): return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

Related

Speed-up computation on sub-matrices of a 4D tensor

Faster way to iterate through pixel using numpy with conditions?

Wiki example for Arnoldi iteration only works for real matrices?

root-finding algorithm for a complex polynomial equation in python

Python Scipy Minimize Not Working

Categories

Resources

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

vectorized implementation without using for loops - python

Try this. import numpy as np P,Q = 1000,1000 thresh = 100 u = np.arange(P) v = np.arange(Q) dist_mat = np.sqrt(((u - P/2)**2)[:, None] + ((v - Q/2)**2)[None, :]) H = np.zeros((P, Q)) H[dist_mat <= thresh] = 1

import numpy as np P,Q = 1000,1000 thresh = 100 idx_H = np.stack(np.indices((P, Q)), axis=-1) H_LP = dist(idx_H[..., 0], idx_H[..., 1], P, Q) <= thresh def dist(u, v, p, q): return np.sqrt(np.square(u - p / 2) + np.square(v - q / 2))

Related

Speed-up computation on sub-matrices of a 4D tensor

Faster way to iterate through pixel using numpy with conditions?

Wiki example for Arnoldi iteration only works for real matrices?

root-finding algorithm for a complex polynomial equation in python

Python Scipy Minimize Not Working

Categories

Resources

Try this. import numpy as np P,Q = 1000,1000 thresh = 100 u = np.arange(P) v = np.arange(Q) dist_mat = np.sqrt(((u - P/2)**2)[:, None] + ((v - Q/2)**2)[None, :]) H = np.zeros((P, Q)) H[dist_mat <= thresh] = 1