I would like to optimize this Python code with Cython:
def updated_centers(point, start, center):
return np.array([__cluster_mean(point[start[c]:start[c + 1]], center[c]) for c in range(center.shape[0])])
def __cluster_mean(point, center):
return (np.sum(point, axis=0) + center) / (point.shape[0] + 1)
My Cython code:
cimport cython
cimport numpy as np
import numpy as np
# C-compatible Numpy integer type.
DTYPE = np.intc
#cython.boundscheck(False) # Deactivate bounds checking
#cython.wraparound(False) # Deactivate negative indexing.
#cython.cdivision(True) # Deactivate division by 0 checking.
def updated_centers(double [:,:] point, int [:] label, double [:,:] center):
if (point.shape[0] != label.size) or (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]):
raise ValueError("Incompatible dimensions")
cdef Py_ssize_t i, c, j
cdef Py_ssize_t n = point.shape[0]
cdef Py_ssize_t m = point.shape[1]
cdef Py_ssize_t nc = center.shape[0]
# Updated centers. We accumulate point and center contributions into this array.
# Start by adding the (unscaled) center contributions.
new_center = np.zeros([nc, m])
new_center[:] = center
# Counter array. Will contain cluster sizes (including center, whose contribution
# is again added here) at the end of the point loop.
cluster_size = np.ones([nc], dtype=DTYPE)
# Add point contributions.
for i in range(n):
c = label[i]
cluster_size[c] += 1
for j in range(m):
new_center[c, j] += point[i, j]
# Scale center+point summation to be a mean.
for c in range(nc):
for j in range(m):
new_center[c, j] /= cluster_size[c]
return new_center
However, Cython is slower than python:
Python: %timeit f.updated_centers(point, start, center)
331 ms ± 11.9 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Cython: %timeit fx.updated_centers(point, label, center)
433 ms ± 14 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The HTML reveals that almost all lines are yellow: allocating the array, +=, /=. I expected Cython to be an order of magnitude faster. What am I doing wrong?
You need to tell Cython that new_center and cluster_size are arrays:
cdef double[:, :] new_center = np.zeros((nc, m))
...
cdef int[:] cluster_size = np.ones((nc,), dtype=DTYPE)
...
Without these type annotations Cython cannot generate efficient C code, and has to call into the Python interpreter when you access those arrays.This is why the lines in the HTML output of cython -a where you access these arrays were yellow.
With just these two small modifications we immediately see the speedup we want:
%timeit python_updated_centers(point, start, center)
392 ms ± 41.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit cython_updated_centers(point, start, center)
1.18 ms ± 145 µs per loop (mean ± std. dev. of 7 runs, 1000 loops each)
For such simple kernels, you can also use pythran to get nice speedups:
#pythran export updated_centers(float64 [:, :], int32 [:] , float64 [:, :] )
import numpy as np
def updated_centers(point, start, center):
return np.array([__cluster_mean(point[start[c]:start[c + 1]], center[c]) for c in range(center.shape[0])])
def __cluster_mean(point, center):
return (np.sum(point, axis=0) + center) / (point.shape[0] + 1)
Compiled with pythran updated_centers.py and one get the following timings:
Numpy code (same code, not compiled):
$ python -m perf timeit -s 'import numpy as np; n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.int32); center=np.random.rand(k, m); from updated_centers import updated_centers' 'updated_centers(point, start, center)'
.....................
Mean +- std dev: 271 ms +- 12 ms
Pythran (after compilation):
$ python -m perf timeit -s 'import numpy as np; n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.int32); center=np.random.rand(k, m); from updated_centers import updated_centers' 'updated_centers(point, start, center)'
.....................
Mean +- std dev: 12.8 ms +- 0.3 ms
The key is to write the Cython code like the Python code, to access arrays only when necessary.
cimport cython
cimport numpy as np
import numpy as np
# C-compatible Numpy integer type.
DTYPE = np.intc
#cython.boundscheck(False) # Deactivate bounds checking
#cython.wraparound(False) # Deactivate negative indexing.
#cython.cdivision(True) # Deactivate division by 0 checking.
def updated_centers(double [:, :] point, int [:] start, double [:, :] center):
"""Returns the updated list of cluster centers (damped center of mass Pahkira scheme). Cluster c
(and center[c]) corresponds to the point range point[start[c]:start[c+1]]."""
if (point.shape[1] != center.shape[1]) or (center.shape[0] > point.shape[0]) or (start.size != center.shape[0] + 1):
raise ValueError("Incompatible dimensions")
# Py_ssize_t is the proper C type for Python array indices.
cdef Py_ssize_t i, c, j, cluster_start, cluster_stop, cluster_size
cdef Py_ssize_t n = point.shape[0]
cdef Py_ssize_t m = point.shape[1]
cdef Py_ssize_t nc = center.shape[0]
cdef double center_of_mass
# Updated centers. We accumulate point and center contributions into this array.
# Start by adding the (unscaled) center contributions.
new_center = np.zeros([nc, m])
cluster_start = start[0]
for c in range(nc):
cluster_stop = start[c + 1]
cluster_size = cluster_stop - cluster_start + 1
for j in range(m):
center_of_mass = center[c, j]
for i in range(cluster_start, cluster_stop):
center_of_mass += point[i, j]
new_center[c, j] = center_of_mass / cluster_size
cluster_start = cluster_stop
return np.asarray(new_center)
With the same API we get
n, m = 100000, 5; k = n//2; point = np.random.rand(n, m); start = 2*np.arange(k+1, dtype=np.intc); center=np.random.rand(k, m);
%timeit fx.updated_centers(point, start, center)
31 ms ± 2.15 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit f.updated_centers(point, start, center)
734 ms ± 17.6 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Related
I have a somewhat contrived example to cytonize, where I want a function to:
accept a 1D numpy array of arbitrary length (~100'000 ÷ 1'000'000 np.float64's)
do some filtering on it
return results as a new [numpy?] array of the same length
The code and profiling is as follows:
%%cython -a
from libc.stdlib cimport malloc, free
from cython cimport boundscheck, wraparound
import numpy as np
#boundscheck(False)
#wraparound(False)
def func_memview(double[:] arr):
cdef:
int N = arr.shape[0], i
double *out_ptr = <double *> malloc(N * sizeof(double))
double[:] out = <double[:N]>out_ptr
for i in range(1, N):
if arr[i] > arr[i-1]:
out[i] = arr[i]
else:
out[i] = 0.
free(out_ptr)
return np.asarray(out)
My question is can I do any better with this?
As DavidW has pointed out, your code has some issues with memory management and it would be better to use a numpy-array directly:
%%cython
from cython cimport boundscheck, wraparound
import numpy as np
#boundscheck(False)
#wraparound(False)
def func_memview_correct(double[:] arr):
cdef:
int N = arr.shape[0], i
double[:] out = np.empty(N)
for i in range(1, N):
if arr[i] > arr[i-1]:
out[i] = arr[i]
else:
out[i] = 0.0
return np.asarray(out)
It is about as fast as the faulty original version:
import numpy as np
np.random.seed(0)
k= np.random.rand(5*10**7)
%timeit func_memview(k) # 413 ms ± 14.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit func_memview_correct(k) # 412 ms ± 15.5 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
The question is how this code could be made faster? Most obvious options are
Parallelization.
Using vectorization/SIMD instructions.
It is notoriously hard to ensure that the C-code generated by Cython gets vectorized, see for example this SO-post. For many compilers it is necessary to use contiguous memory view to improve the situation, i.e.:
%%cython -c=/O3
from cython cimport boundscheck, wraparound
import numpy as np
#boundscheck(False)
#wraparound(False)
def func_memview_correct_cont(double[::1] arr): // <---- HERE
cdef:
int N = arr.shape[0], i
double[::1] out = np.empty(N) // <--- HERE
for i in range(1, N):
if arr[i] > arr[i-1]:
out[i] = arr[i]
else:
out[i] = 0.0
return np.asarray(out)
On my machine it is not really much faster
%timeit func_memview_correct_cont(k) # 402 ms ± 11.2 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Other compilers might do better. However, I've often seen gcc and msvc struggling with producing optimal assembler for code typical for filtering (see for example this SO-question). Clang is much better at this, so the easiest solution would be probably to use numba:
import numba as nb
#nb.njit
def nb_func(arr):
N = arr.shape[0]
out = np.empty(N)
for i in range(1, N):
if arr[i] > arr[i-1]:
out[i] = arr[i]
else:
out[i] = 0.0
return out
which outperforms the cython code by almost factor of 3:
%timeit nb_func(k) # 151 ms ± 2.87 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
It is easy to parallelize the numba version using prange, but the win is not that much: parallelized version runs in 116ms on my machine.
To summarize: For such type of tasks my advice is to use numba. Using cython is trickier and the final performance will be down to the compiler used in the background.
I have a program for a simulation and inside the program I have a function. I have realized that the function consumes most time of simulation. So, I am trying to optimize the funcion first. The function is as follows
Julia version 1.1:
function fun_jul(M,ksi,xi,x)
F(n,x) = sin(n*pi*(x+1)/2)*cos(n*pi*(x+1)/2);
K = length(ksi);
Z = zeros(length(x),K);
for n in 1:M
for k in 1:K
for l in 1:length(x)
Z[l,k] += (1-(n/(M+1))^2)^xi*F(n,ksi[k])*F(n,x[l]);
end
end
end
return Z
end
I also rewrite the above function in python+numba for comparison as follows
Python+numba
import numpy as np
from numba import prange, jit
#jit(nopython=True, parallel=True)
def fun_py(M,ksi,xi,x):
K = len(ksi);
F = lambda nn,xx: np.sin(nn*np.pi*(xx+1)/2)*np.cos(nn*np.pi*(xx+1)/2);
Z = np.zeros((len(x),K));
for n in range(1,M+1):
for k in prange(0,K):
Z[:,k] += (1-(n/(M+1))**2)**xi*F(n,ksi[k])*F(n,x);
return Z
But Julia codes are very slow here are my results:
Julia results:
using BenchmarkTools
N=400; a=-0.5; b=0.5; x=range(a,b,length=N); cc=x; M = 2*N+100; xi = M/40;
#benchmark fun_jul(M,cc,xi,x)
BenchmarkTools.Trial:
memory estimate: 1.22 MiB
allocs estimate: 2
--------------
minimum time: 25.039 s (0.00% GC)
median time: 25.039 s (0.00% GC)
mean time: 25.039 s (0.00% GC)
maximum time: 25.039 s (0.00% GC)
--------------
samples: 1
evals/sample: 1
Python results:
N=400;a = -0.5;b = 0.5;x = np.linspace(a,b,N);cc = x;M = 2*N + 100;xi = M/40;
%timeit fun_py(M,cc,xi,x);
1.2 s ± 10.3 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Any help on improving the codes both for julia and python+numba would be appreciated.
Updated
Based on #Przemyslaw Szufel's answer and the other posts I have improved numba and julia codes. Now both are parallelized. Here are timings
Python+Numba times:
#jit(nopython=True, parallel=True)
def fun_py(M,ksi,xi,x):
K = len(ksi);
F = lambda nn,xx: np.sin(nn*np.pi*(xx+1)/2)*np.cos(nn*np.pi*(xx+1)/2);
Z = np.zeros((K,len(x)));
for n in range(1,M+1):
pw = (1-(n/(M+1))**2)**xi; f=F(n,x)
for k in prange(0,K):
Z[k,:] = Z[k,:] + pw*F(n,ksi[k])*f;
return Z
N=1000; a=-0.5; b=0.5; x=np.linspace(a,b,N); cc=x; M = 2*N+100; xi = M/40;
%timeit fun_py(M,cc,xi,x);
733 ms ± 13.1 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Julia times
N=1000; a=-0.5; b=0.5; x=range(a,b,length=N); cc=x; M = 2*N+100; xi = M/40;
#benchmark fun_jul2(M,cc,xi,x)
BenchmarkTools.Trial:
memory estimate: 40.31 MiB
allocs estimate: 6302
--------------
minimum time: 705.470 ms (0.17% GC)
median time: 726.403 ms (0.17% GC)
mean time: 729.032 ms (1.68% GC)
maximum time: 765.426 ms (5.27% GC)
--------------
samples: 7
evals/sample: 1
I got down to 300ms on a single thread (instead of 28s on my machine) with the following code.
You are using multi-threading for Numba. In Julia you should use parallel processing (multi-threading support is experimental fo Julia). It seems that your code is doing some kind of parameter sweep - such codes are very easy to parallelize but it usually requires some adjustments to your computational process.
Here is the code:
function fun_jul2(M,ksi,xi,x)
F(n,x) = sin(n*pi*(x+1))/2;
K = length(ksi);
L = length(x);
Z = zeros(length(x),K);
for n in 1:M
F_im1= [F(n,ksi[k]) for k in 1:K]
F_im2 = [F(n,x[l]) for l in 1:L]
pow = (1-(n/(M+1))^2)^xi
for k in 1:K
for l in 1:L
Z[l,k] += pow*F_im1[k]*F_im2[l];
end
end
end
Z
end
julia> fun_jul2(M,cc,xi,x) ≈ fun_jul(M,cc,xi,x)
true
julia> #time fun_jul2(M,cc,xi,x);
0.305269 seconds (1.81 k allocations: 6.934 MiB, 1.60% gc time)
** EDIT: with multithreading and inbounds suggested by DNF:
function fun_jul3(M,ksi,xi,x)
F(n,x) = sin(n*pi*(x+1))/2;
K = length(ksi);
L = length(x);
Z = zeros(length(x),K);
for n in 1:M
F_im1= [F(n,ksi[k]) for k in 1:K]
F_im2 = [F(n,x[l]) for l in 1:L]
pow = (1-(n/(M+1))^2)^xi
Threads.#threads for k in 1:K
for l in 1:L
#inbounds Z[l,k] += pow*F_im1[k]*F_im2[l];
end
end
end
Z
end
And now the running time (remember to run set JULIA_NUM_THREADS=4 or Linux equivalent before launching Julia):
julia> fun_jul2(M,cc,xi,x) ≈ fun_jul3(M,cc,xi,x)
true
julia> #time fun_jul3(M,cc,xi,x);
0.051470 seconds (2.71 k allocations: 6.989 MiB)
You could also try to further experiment with parallelizing of computing of F_im1 and F_im2.
You can do, or fail to do, loop optimization in any language that has loops. The major difference here is that the numba code is vectorized for the inner loop but the Julia code is not. To vectorize the Julia version, it is sometimes necessary to change operators to their vectorized versions with the ., so that + becomes .+ for example.
Since I cannot get Numba to install properly on my older Windows 10 machine, I ran the code versions below on free Linux versions on the Web. This means I had to use the Python interface for timeit(), not the command line.
Run in Jupyter at mybinder, probably with 1 thread since it is not specified. :
import timeit
timeit.timeit("""
#jit(nopython=True, parallel=True)
def fun_py(M,ksi,xi,x):
K = len(ksi);
F = lambda nn,xx: np.sin(nn*np.pi*(xx+1)/2)*np.cos(nn*np.pi*(xx+1)/2);
Z = np.zeros((len(x),K));
for n in range(1,M+1):
for k in prange(0,K):
Z[:,k] += (1-(n/(M+1))**2)**xi*F(n,ksi[k])*F(n,x);
return Z
N=400; a = -0.5; b = 0.5; x = np.linspace(a,b,N); cc = x;M = 2*N + 100; xi = M/40;
fun_py(M,cc,xi,x)
""", setup ="import numpy as np; from numba import prange, jit", number=5)
Out[1]: 61.07768889795989
Your machine must be a lot faster than Jupyter, ForBonder.
I ran this optimized julia code version below, in Jupyter on JuliaBox, 1 thread kernel specified:
using BenchmarkTools
F(n, x) = sinpi.(n * (x .+ 1) / 2) .* cospi.(n * (x .+ 1) / 2)
function fun_jul2(M, ksi, xi, x)
K = length(ksi)
Z = zeros(length(x), K)
for n in 1:M, k in 1:K
Z[:, k] .+= (1 - (n / (M + 1))^2)^xi * F(n, ksi[k]) * F(n, x)
end
return Z
end
const N=400; const a=-0.5; const b=0.5; const x=range(a,b,length=N);
const cc=x; const M = 2*N+100; const xi = M/40;
#btime fun_jul2(M, cc, xi, x)
8.076 s (1080002 allocations: 3.35 GiB)
For performance, just precompute the trigonometric part.
Indeed, sin is a costly operation:
%timeit np.sin(1.)
712 ns ± 2.22 ns per loop (mean ± std. dev. of 7 runs, 1000000 loops each)
%timeit 1.2*3.4
5.88 ns ± 0.016 ns per loop (mean ± std. dev. of 7 runs, 100000000 loops each)
In python :
#jit
def fun_py2(M,ksi,xi,x):
NN = np.arange(1,M+1)
Fksi = np.sin(np.pi*np.outer(NN,ksi+1))/2 # sin(a)cos(a) is sin(2a)/2
Fx = np.sin(np.pi*np.outer(NN,x+1))/2
U = (1-(NN/(M+1))**2)**xi
Z = np.zeros((len(x),len(ksi)))
for n in range(len(NN)):
for k in range(len(ksi)):
for l in range(len(x)):
Z[k,l] += U[n] * Fksi[n,k] * Fx[n,l];
return Z
For a 30x improvement:
np.allclose(fun_py(M,cc,xi,x),fun_py2(M,cc,xi,x))
True
%timeit fun_py(M,cc,xi,x)
1.14 s ± 4.47 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
%timeit fun_py2(M,cc,xi,x)
29.5 ms ± 375 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
This doesn't trig any parallelism. I suppose the same will occur for Julia.
Usually I'm able to match Numba's performance when using Cython. However, in this example I have failed to do so - Numba is about 4 times faster than my Cython's version.
Here the Cython-version:
%%cython -c=-march=native -c=-O3
cimport numpy as np
import numpy as np
cimport cython
#cython.boundscheck(False)
#cython.wraparound(False)
def cy_where(double[::1] df):
cdef int i
cdef int n = len(df)
cdef np.ndarray[dtype=double] output = np.empty(n, dtype=np.float64)
for i in range(n):
if df[i]>0.5:
output[i] = 2.0*df[i]
else:
output[i] = df[i]
return output
And here is the Numba-version:
import numba as nb
#nb.njit
def nb_where(df):
n = len(df)
output = np.empty(n, dtype=np.float64)
for i in range(n):
if df[i]>0.5:
output[i] = 2.0*df[i]
else:
output[i] = df[i]
return output
When tested, the Cython version is on par with numpy's where, but is clearly inferior to Numba:
#Python3.6 + Cython 0.28.3 + gcc-7.2
import numpy
np.random.seed(0)
n = 10000000
data = np.random.random(n)
assert (cy_where(data)==nb_where(data)).all()
assert (np.where(data>0.5,2*data, data)==nb_where(data)).all()
%timeit cy_where(data) # 179ms
%timeit nb_where(data) # 49ms (!!)
%timeit np.where(data>0.5,2*data, data) # 278 ms
What is the reason for Numba's performance and how can it be matched when using Cython?
As suggested by #max9111, eliminating stride by using continuous memory-view, which doesn't improve the performance much:
#cython.boundscheck(False)
#cython.wraparound(False)
def cy_where_cont(double[::1] df):
cdef int i
cdef int n = len(df)
cdef np.ndarray[dtype=double] output = np.empty(n, dtype=np.float64)
cdef double[::1] view = output # view as continuous!
for i in range(n):
if df[i]>0.5:
view[i] = 2.0*df[i]
else:
view[i] = df[i]
return output
%timeit cy_where_cont(data) # 165 ms
This seems to be completely driven by optimizations that LLVM is able to make. If I compile the cython example with clang, performance between the two examples is identical. For what it's worth, MSVC on windows shows a similar performance discrepancy to numba.
$ CC=clang ipython
<... setup code>
In [7]: %timeit cy_where(data) # 179ms
...: %timeit nb_where(data) # 49ms (!!)
30.8 ms ± 309 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
30.2 ms ± 498 µs per loop (mean ± std. dev. of 7 runs, 10 loops each)
Interestingly, compiling the original Numpy code with pythran, using clang as a backend, yields the same performance as the Numba version.
import numpy as np
#pythran export work(float64[])
def work(df):
return np.where(data>0.5,2*data, data)
Compiled with
CXX=clang++ CC=clang pythran pythran_work.py -O3 -march=native
and the benchmark session:
import numpy as np
np.random.seed(0)
n = 10000000
data = np.random.random(n)
import numba_work, pythran_work
%timeit numba_work.work(data)
12.7 ms ± 20 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
%timeit pythran_work.work(data)
12.7 ms ± 32.5 µs per loop (mean ± std. dev. of 7 runs, 100 loops each)
In my project (about clustering algorithms, specifically k-medoids) is crucial to be able to compute pairwise distances efficiently. I have a dataset of ~60,000 objects. The problem is, distances must be computed between inhomogeneous vectors, i.e. vectors which may differ in length (in that case, missing items are treated as if they were 0).
Here is a minimal working example:
# %%
MAX_LEN = 11
N = 100
import random
def manhattan_distance(vec1, vec2):
n1, n2 = len(vec1), len(vec2)
n = min(n1, n2)
dist = 0
for i in range(n):
dist += abs(vec1[i] - vec2[i])
if n1 > n2:
for i in range(n, n1):
dist += abs(vec1[i])
else:
for i in range(n, n2):
dist += abs(vec2[i])
return dist
def compute_distances():
n = len(data)
for i in range(n):
for j in range(n):
manhattan_distance(data[i], data[j])
data = []
for i in range(N):
data.append([])
for k in range(random.randint(5, MAX_LEN)):
data[i].append(random.randint(0, 10))
%timeit compute_distances()
import numpy as np
def manhattan_distance_np(vec1, vec2):
return np.absolute(vec1 - vec2).sum()
def compute_distances_np():
n = len(data)
for i in range(n):
for j in range(n):
manhattan_distance_np(data_np[i], data_np[j])
data_np = [np.append(np.asarray(d), np.zeros(MAX_LEN - len(d))) for d in data]
%timeit compute_distances_np()
I was testing my Python lists implementation versus a numpy implementation.
Here are the results (computation times):
Python lists: 79.6 ms ± 3.78 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
numpy arrays: 226 ms ± 7.18 ms per loop (mean ± std. dev. of 7 runs, 1 loop each)
Why is there such a huge difference? I supposed numpy arrays were really fast.
Is there a way to improve my code? Am I misunderstanding the inner workings of numpy?
Edit: I may need, in the future, to be able to use a custom distance function for pairwise distances computations. The method should work also for data sets of length 60'000 without running out of memory.
I believe you can just make your arrays dense and set the unused last elements to 0s.
import numpy as np
from scipy.spatial.distance import cdist, pdist, squareform
def batch_pdist(x, metric, batchsize=1000):
dists = np.zeros((len(x), len(x)))
for i in range(0, len(x), batchsize):
for j in range(0, len(x), batchsize):
dist_batch = cdist(x[i:i+batchsize], x[j:j+batchsize], metric=metric)
dists[i:i+batchsize, j:j+batchsize] = dist_batch
return dists
MIN_LEN = 5
MAX_LEN = 11
N = 10000
M = 10
data = []
data = np.zeros((N,MAX_LEN))
for i in range(N):
num_nonzero = np.random.randint(MIN_LEN, MAX_LEN)
data[i, :num_nonzero] = np.random.randint(0, M, num_nonzero)
dists = squareform(pdist(data, metric='cityblock'))
dists2 = batch_pdist(data, metric='cityblock', batchsize=500)
print((dists == dists2).all())
Timing Output:
%timeit squareform(pdist(data, metric='cityblock'))
43.8 µs ± 134 ns per loop (mean ± std. dev. of 7 runs, 10000 loops each)
Edit:
For a custom distance function see the very bottom of this documentation.
I finally found probably the most straightforward way to solve this problem without changing too much the code and rely solely on computations and not on memory (since that could be unfeasible for very large datasets).
Based on juanpa.arrivillaga suggestion, I tried numba, that is a library that speeds up array-oriented and math-heavy Python code and is targeted mainly at numpy. You can read a good guide on optimizing Python code here: https://jakevdp.github.io/blog/2015/02/24/optimizing-python-with-numpy-and-numba/.
MAX_LEN = 11
N = 100
# Pure Python lists implementation.
import random
def manhattan_distance(vec1, vec2):
n1, n2 = len(vec1), len(vec2)
n = min(n1, n2)
dist = 0
for i in range(n):
dist += abs(vec1[i] - vec2[i])
if n1 > n2:
for i in range(n, n1):
dist += abs(vec1[i])
else:
for i in range(n, n2):
dist += abs(vec2[i])
return dist
def compute_distances():
n = len(data)
for i in range(n):
for j in range(n):
manhattan_distance(data[i], data[j])
data = []
for i in range(N):
data.append([])
for k in range(random.randint(5, MAX_LEN)):
data[i].append(random.randint(0, 10))
%timeit compute_distances()
# numpy+numba implementation.
import numpy as np
from numba import jit
#jit
def manhattan_distance_np(vec1, vec2):
return np.absolute(vec1 - vec2).sum()
#jit
def compute_distances_np():
n = len(data)
for i in range(n):
for j in range(n):
manhattan_distance_np(data_np[i], data_np[j])
data_np = np.array([np.append(np.asarray(d), np.zeros(MAX_LEN - len(d))) for d in data])
%timeit compute_distances_np()
Timing output:
%timeit compute_distances()
78.4 ms ± 3.44 ms per loop (mean ± std. dev. of 7 runs, 10 loops each)
%timeit compute_distances_np()
4.1 ms ± 14.7 µs per loop (mean ± std. dev. of 7 runs, 1 loop each)
As you can see, the numpy with numba optimizations is about 19 times faster (with no other code optimization involved).
The purpose of this mathematical function is to compute a distance between two (or more) protein structures using dihedral angles:
It is very useful in structural biology, for example. And I already code this function in python using numpy, but the goal is to have a faster implementation. As computation time reference, I use the euclidean distance function available in the scikit-learn package.
Here the code I have for the moment:
import numpy as np
import numexpr as ne
from sklearn.metrics.pairwise import euclidean_distances
# We have 10000 structures with 100 dihedral angles
n = 10000
m = 100
# Generate some random data
c = np.random.rand(n,m)
# Generate random int number
x = np.random.randint(c.shape[0])
print c.shape, x
# First version with numpy of the dihedral_distances function
def dihedral_distances(a, b):
l = 1./a.shape[0]
return np.sqrt(l* np.sum((0.5)*(1. - np.cos(a-b)), axis=1))
# Accelerated version with numexpr
def dihedral_distances_ne(a, b):
l = 1./a.shape[0]
tmp = ne.evaluate('sum((0.5)*(1. - cos(a-b)), axis=1)')
return ne.evaluate('sqrt(l* tmp)')
# The function of reference I try to be close as possible
# in term of computation time
%timeit euclidean_distances(c[x,:], c)[0]
1000 loops, best of 3: 1.07 ms per loop
# Computation time of the first version of the dihedral_distances function
# We choose randomly 1 structure among the 10000 structures.
# And we compute the dihedral distance between this one and the others
%timeit dihedral_distances(c[x,:], c)
10 loops, best of 3: 21.5 ms per loop
# Computation time of the accelerated function with numexpr
%timeit dihedral_distances_ne(c[x,:], c)
100 loops, best of 3: 9.44 ms per loop
9.44 ms it's very fast, but it's very slow if you need to run it a million times. Now the question is, how to do that? What is the next step? Cython? PyOpenCL? I have some experience with PyOpenCL, however I never code something as elaborate as this one. I don't know if it's possible to compute the dihedral distances in one step on GPU as I do with numpy and how to proceed.
Thank you for helping me!
EDIT:
Thank you guys! I am currently working on the full solution and once it's finished I will put the code here.
CYTHON VERSION:
%load_ext cython
import numpy as np
np.random.seed(1234)
n = 10000
m = 100
c = np.random.rand(n,m)
x = np.random.randint(c.shape[0])
print c.shape, x
%%cython --compile-args=-fopenmp --link-args=-fopenmp --force
import numpy as np
cimport numpy as np
from libc.math cimport sqrt, cos
cimport cython
from cython.parallel cimport parallel, prange
# Define a function pointer to a metric
ctypedef double (*metric)(double[: ,::1], np.intp_t, np.intp_t)
cdef extern from "math.h" nogil:
double cos(double x)
double sqrt(double x)
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.cdivision(True)
cdef double dihedral_distances(double[:, ::1] a, np.intp_t i1, np.intp_t i2):
cdef double res
cdef int m
cdef int j
res = 0.
m = a.shape[1]
for j in range(m):
res += 1. - cos(a[i1, j] - a[i2, j])
res /= 2.*m
return sqrt(res)
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.cdivision(True)
cdef double dihedral_distances_p(double[:, ::1] a, np.intp_t i1, np.intp_t i2):
cdef double res
cdef int m
cdef int j
res = 0.
m = a.shape[1]
with nogil, parallel(num_threads=2):
for j in prange(m, schedule='dynamic'):
res += 1. - cos(a[i1, j] - a[i2, j])
res /= 2.*m
return sqrt(res)
#cython.boundscheck(False)
#cython.wraparound(False)
def pairwise(double[: ,::1] c not None, np.intp_t x, p = True):
cdef metric dist_func
if p:
dist_func = &dihedral_distances_p
else:
dist_func = &dihedral_distances
cdef np.intp_t i, n_structures
n_samples = c.shape[0]
cdef double[::1] res = np.empty(n_samples)
for i in range(n_samples):
res[i] = dist_func(c, x, i)
return res
%timeit pairwise(c, x, False)
100 loops, best of 3: 17 ms per loop
# Parallel version
%timeit pairwise(c, x, True)
10 loops, best of 3: 37.1 ms per loop
So I follow your link to create the cython version of the dihedral distances function. We gain some speed, not so much, but it is still slower than the numexpr version (17ms vs 9.44ms). So I tried to parallelize the function using prange and it is worse (37.1ms vs 17ms vs 9.4ms)!
Do I miss something?
If you're willing to use http://pythran.readthedocs.io/, you can leverage on the numpy implementation and get better performance than cython for that case:
#pythran export np_cos_norm(float[], float[])
import numpy as np
def np_cos_norm(a, b):
val = np.sum(1. - np.cos(a-b))
return np.sqrt(val / 2. / a.shape[0])
And compile it with:
pythran fast.py
To get an average x2 over the cython version.
If using:
pythran fast.py -march=native -DUSE_BOOST_SIMD -fopenmp
You'll get a vectorized, parallel version that runs slightly faster:
100000 loops, best of 3: 2.54 µs per loop
1000000 loops, best of 3: 674 ns per loop
100000 loops, best of 3: 16.9 µs per loop
100000 loops, best of 3: 4.31 µs per loop
10000 loops, best of 3: 176 µs per loop
10000 loops, best of 3: 42.9 µs per loop
(using the same testbed as ev-br)
Here's a quick-and-dirty try with cython, for just a pair of 1D arrays:
(in an IPython notebook)
%%cython
cimport cython
cimport numpy as np
cdef extern from "math.h":
double cos(double x) nogil
double sqrt(double x) nogil
def cos_norm(a, b):
return cos_norm_impl(a, b)
#cython.boundscheck(False)
#cython.wraparound(False)
#cython.cdivision(True)
cdef double cos_norm_impl(double[::1] a, double[::1] b) nogil:
cdef double res = 0., val
cdef int m = a.shape[0]
# XXX: shape of b not checked
cdef int j
for j in range(m):
val = a[j] - b[j]
res += 1. - cos(val)
res /= 2.*m
return sqrt(res)
Comparing with a straightforward numpy implementation,
def np_cos_norm(a, b):
val = np.add.reduce(1. - np.cos(a-b))
return np.sqrt(val / 2. / a.shape[0])
I get
np.random.seed(1234)
for n in [100, 1000, 10000]:
x = np.random.random(n)
y = np.random.random(n)
%timeit cos_norm(x, y)
%timeit np_cos_norm(x, y)
print '\n'
100000 loops, best of 3: 3.04 µs per loop
100000 loops, best of 3: 12.4 µs per loop
100000 loops, best of 3: 18.8 µs per loop
10000 loops, best of 3: 30.8 µs per loop
1000 loops, best of 3: 196 µs per loop
1000 loops, best of 3: 223 µs per loop
So, depending on the dimensionality of your vectors, you can get from a factor of 4 to nil of a speedup.
For computing pairwise distances, you can probably do much better, as shown in this blog post, but of course YMMV.