Vectorize a python loop over a numpy array

Vectorize a python loop over a numpy array - python

I need to speed up the processing of this loop as it is very slow. But I don't know how to vectorize it since the result of one value depends on the result of a previous value. Any suggestions?
import numpy as np

sig = np.random.randn(44100)
alpha = .9887
beta = .999
out = np.zeros_like(sig)

# Each out[n] depends on out[n-1], so the samples must be processed in order.
for n in range(1, len(sig)):
    if np.abs(sig[n]) >= out[n-1]:
        # |sig[n]| is at or above the previous output: blend toward it.
        out[n] = alpha * out[n-1] + (1 - alpha) * np.abs( sig[n] )
    else:
        # |sig[n]| is below the previous output: scale the previous output by beta.
        out[n] = beta * out[n-1]

Numba's just-in-time compiler should deal with indexing overhead you're facing pretty well by compiling the function to native code during first execution:
In [1]: %cpaste
Pasting code; enter '--' alone on the line to stop or use Ctrl-D.
:import numpy as np
:
:sig = np.random.randn(44100)
:alpha = .9887
:beta = .999
:
:def nonvectorized(sig):
: out = np.zeros_like(sig)
:
: for n in range(1, len(sig)):
: if np.abs(sig[n]) >= out[n-1]:
: out[n] = alpha * out[n-1] + (1 - alpha) * np.abs( sig[n] )
: else:
: out[n] = beta * out[n-1]
: return out
:--
In [2]: nonvectorized(sig)
Out[2]:
array([ 0. , 0.01862503, 0.04124917, ..., 1.2979579 ,
1.304247 , 1.30294275])
In [3]: %timeit nonvectorized(sig)
10 loops, best of 3: 80.2 ms per loop
In [4]: from numba import jit
In [5]: vectorized = jit(nonvectorized)
In [6]: np.allclose(vectorized(sig), nonvectorized(sig))
Out[6]: True
In [7]: %timeit vectorized(sig)
1000 loops, best of 3: 249 µs per loop
EDIT: as suggested in a comment, adding jit benchmarks. jit(nonvectorized) is creating a lightweight wrapper and thus is a cheap operation.
In [8]: %timeit jit(nonvectorized)
10000 loops, best of 3: 45.3 µs per loop
The function itself is compiled during the first execution (hence just-in-time) which takes a while, but probably not as much:
In [9]: %timeit jit(nonvectorized)(sig)
10 loops, best of 3: 169 ms per loop

Low vectorisation potential on a "forward-dependent-loop" code
Most of the potential for "vectorisation" parallelism is lost once the dependency is analysed: each out[n] needs out[n-1], so the iterations cannot be computed independently. (A JIT compiler cannot vectorise "against" such a dependence barrier either.)
you may pre-calculate some re-used values in a vectorised manner, but there is no direct python syntax manner ( without an external JIT-compiler workaround ) to arrange forward-shifting-dependence loop computation into your CPU vector-register aligned co-parallel computation:
from zmq import Stopwatch  # ok to use pyzmq 2.11 for [usec] .Stopwatch()

aStopWATCH = Stopwatch()  # a performance measurement .Stopwatch() instance

# Take the magnitudes once, up front, instead of calling np.abs() on every
# iteration. NOTE: np.abs(sig) returns a NEW array and rebinds the name; it
# does not reuse the old buffer (np.abs(sig, out=sig) would do that).
sig = np.abs(sig)
aConst = (1 - alpha)  # avoids many repetitive SUB(s) in the loop

for thisPtr in range(1, len(sig)):  # FORWARD-SHIFTING-DEPENDENCE LOOP:
    prevPtr = thisPtr - 1  # prevPtr->"previous" TimeSlice in out[] ( re-used 2 x len(sig) times )
    if sig[thisPtr] < out[prevPtr]:  # 1st re-use
        out[thisPtr] = out[prevPtr] * beta  # 2nd
    else:
        out[thisPtr] = out[prevPtr] * alpha + (aConst * sig[thisPtr])  # 2nd
A good example of vectorised speed-up can be seen in cases, where calculation strategy can be parallelised/broadcast along 1D, 2D or even 3D structure of the native numpy array. For a speedup of about 100x see an RGBA-2D matrix accelerated processing in Vectorised code for a PNG picture processing ( an OpenGL shader pipeline)
Performance still increased about 3x
Even this simple Python code revision runs about 2.8x faster (right now, i.e. without installing an ad-hoc JIT-optimising compiler):
>>> def aForwardShiftingDependenceLOOP(): # proposed code-revision
... aStopWATCH.start() # ||||||||||||||||||.start
... for thisPtr in range( 1, len( sig ) ):
... # |vvvvvvv|------------# FORWARD-SHIFTING-LOOP DEPENDENCE
... prevPtr = thisPtr - 1 #|vvvvvvv|--STEP-SHIFTING avoids Numpy syntax
... if ( sig[ thisPtr] < out[prevPtr] ):
... out[ thisPtr] = out[prevPtr] * beta
... else:
... out[ thisPtr] = out[prevPtr] * alpha + ( aConst * sig[thisPtr] )
... usec = aStopWATCH.stop() # ||||||||||||||||||.stop
... print usec, " [usec]"
>>> aForwardShiftingDependenceLOOP()
57593 [usec]
57879 [usec]
58085 [usec]
>>> def anOriginalForLOOP():
... aStopWATCH.start()
... for n in range( 1, len( sig ) ):
... if ( np.abs( sig[n] ) >= out[n-1] ):
... out[n] = out[n-1] * alpha + ( 1 - alpha ) * np.abs( sig[n] )
... else:
... out[n] = out[n-1] * beta
... usec = aStopWATCH.stop()
... print usec, " [usec]"
>>> anOriginalForLOOP()
164907 [usec]
165674 [usec]
165154 [usec]

Related

Fast way to calculate conditional function

What is the most fast way to calculate function like
# here x is just a number
def f(x):
    """Return log(x + 1) for x >= 0 and -log(1 - x) for x < 0 (x is a scalar)."""
    if x >= 0:
        return np.log(x+1)
    return -np.log(-x+1)
One possible way is:
# here x is an array
def loga(x):
    """Array version of f: log(x+1) where x >= 0, -log(1-x) where x < 0.

    Both candidate arrays are computed for every element before np.select
    picks one, so the branch that is later discarded can take the log of a
    non-positive number. Those warnings are suppressed because the affected
    values never reach the result.
    """
    cond = [x >= 0, x < 0]
    with np.errstate(divide="ignore", invalid="ignore"):
        choice = [np.log(x+1), -np.log(-x+1)]
    return np.select(cond, choice)
But seems numpy goes through array element by element.
Is there any way to use something conceptually similar to np.exp(x) to achieve better performance?

def f(x):
    """Return sign(x) * log(1 + |x|) for a scalar or an array.

    np.sign is used instead of x/abs(x): the division is 0/0 at x == 0
    (NaN for arrays, ZeroDivisionError for Python numbers), whereas the
    expected value there is 0.
    """
    return np.sign(x) * np.log(1 + np.abs(x))

In cases like these, masking helps -
def mask_vectorized_app(x):
    """Compute sign(x) * log(1 + |x|) by filling each sign's half separately.

    x must be a NumPy array. Only the elements that belong to a branch are
    passed to np.log, so no logarithm of a non-positive value is taken.
    """
    # np.empty_like(x) alone would inherit an integer dtype from integer input
    # and truncate the logarithms; promote to a floating dtype of at least the
    # same precision instead (float input keeps its dtype).
    out = np.empty_like(x, dtype=np.result_type(x.dtype, np.float16))
    mask = x >= 0
    mask_rev = ~mask
    out[mask] = np.log(x[mask] + 1)
    out[mask_rev] = -np.log(-x[mask_rev] + 1)
    return out
Introducing numexpr module helps us further.
import numexpr as ne
def mask_vectorized_numexpr_app(x):
    """Masked sign(x) * log(1 + |x|) with the logarithms evaluated by numexpr.

    x must be a NumPy array; `ne` is the numexpr module.
    """
    # Promote integer input to a floating dtype so the logs are not truncated.
    out = np.empty_like(x, dtype=np.result_type(x.dtype, np.float16))
    mask = x >= 0
    mask_rev = ~mask
    # The expression strings below refer to these two local names, so they
    # must not be renamed.
    x_masked = x[mask]
    x_rev_masked = x[mask_rev]
    out[mask] = ne.evaluate('log(x_masked+1)')
    out[mask_rev] = ne.evaluate('-log(-x_rev_masked+1)')
    return out
Inspired by @user2685079's post and using the logarithmic property log(A**B) = B*log(A), we can push the sign into the log computation, which allows us to do more of the work inside numexpr's evaluate expression, like so -
s = (-2*(x<0))+1 # np.sign(x)
out = ne.evaluate('log( (abs(x)+1)**s)')
Computing sign using comparison gives us s in another way -
s = (-2*(x<0))+1
Finally, we can push this into the numexpr evaluate expression -
def mask_vectorized_numexpr_app2(x):
    """Evaluate sign(x) * log(1 + |x|) in a single numexpr expression.

    The sign is folded into the logarithm as an exponent of +1 or -1,
    using log(a**s) == s * log(a).
    """
    return ne.evaluate('log( (abs(x)+1)**((-2*(x<0))+1))')
Runtime test
Loopy approach for comparison -
def loopy_app(x):
    """Reference implementation: apply the scalar f to every element of x."""
    # Promote integer input to a floating dtype so f's results are not truncated.
    out = np.empty_like(x, dtype=np.result_type(x.dtype, np.float16))
    for i, value in enumerate(x):
        out[i] = f(value)
    return out
Timings and verification -
In [141]: x = np.random.randn(100000)
...: print np.allclose(loopy_app(x), mask_vectorized_app(x))
...: print np.allclose(loopy_app(x), mask_vectorized_numexpr_app(x))
...: print np.allclose(loopy_app(x), mask_vectorized_numexpr_app2(x))
...:
True
True
True
In [142]: %timeit loopy_app(x)
...: %timeit mask_vectorized_numexpr_app(x)
...: %timeit mask_vectorized_numexpr_app2(x)
...:
10 loops, best of 3: 108 ms per loop
100 loops, best of 3: 3.6 ms per loop
1000 loops, best of 3: 942 µs per loop
Using @user2685079's solution, with np.sign replacing the first part, with and without numexpr evaluation -
In [143]: %timeit np.sign(x) * np.log(1+abs(x))
100 loops, best of 3: 3.26 ms per loop
In [144]: %timeit np.sign(x) * ne.evaluate('log(1+abs(x))')
1000 loops, best of 3: 1.66 ms per loop

Using numba
Numba gives you the power to speed up your applications with high performance functions written directly in Python. With a few annotations, array-oriented and math-heavy Python code can be just-in-time compiled to native machine instructions, similar in performance to C, C++ and Fortran, without having to switch languages or Python interpreters.
Numba works by generating optimized machine code using the LLVM compiler infrastructure at import time, runtime, or statically (using the included pycc tool). Numba supports compilation of Python to run on either CPU or GPU hardware, and is designed to integrate with the Python scientific software stack.
The Numba project is supported by Continuum Analytics and The Gordon and Betty Moore Foundation (Grant GBMF5423).
from numba import njit
import numpy as np
@njit
def pir(x):
    """Return sign(x) * log(1 + |x|) element-wise, as an explicit loop.

    The loop is written out so that numba's njit can compile it; without the
    decorator this is an ordinary (slow) Python loop.
    """
    a = np.empty_like(x)
    for i in range(a.size):
        x_ = x[i]
        _x = abs(x_)
        a[i] = np.sign(x_) * np.log(1 + _x)
    return a
Accuracy
np.isclose(pir(x), f(x)).all()
True
Timing
x = np.random.randn(100000)
# My proposal
%timeit pir(x)
1000 loops, best of 3: 881 µs per loop
# OP test
%timeit f(x)
1000 loops, best of 3: 1.26 ms per loop
# Divakar-1
%timeit mask_vectorized_numexpr_app(x)
100 loops, best of 3: 2.97 ms per loop
# Divakar-2
%timeit mask_vectorized_numexpr_app2(x)
1000 loops, best of 3: 621 µs per loop
Function definitions
from numba import njit
import numpy as np
@njit
def pir(x):
    """Return sign(x) * log(1 + |x|) element-wise, as an explicit loop.

    The loop is written out so that numba's njit can compile it; without the
    decorator this is an ordinary (slow) Python loop.
    """
    a = np.empty_like(x)
    for i in range(a.size):
        x_ = x[i]
        _x = abs(x_)
        a[i] = np.sign(x_) * np.log(1 + _x)
    return a
import numexpr as ne
def mask_vectorized_numexpr_app(x):
    """Masked sign(x) * log(1 + |x|) with the logarithms evaluated by numexpr.

    x must be a NumPy array; `ne` is the numexpr module.
    """
    # Promote integer input to a floating dtype so the logs are not truncated.
    out = np.empty_like(x, dtype=np.result_type(x.dtype, np.float16))
    mask = x >= 0
    mask_rev = ~mask
    # The expression strings below refer to these two local names, so they
    # must not be renamed.
    x_masked = x[mask]
    x_rev_masked = x[mask_rev]
    out[mask] = ne.evaluate('log(x_masked+1)')
    out[mask_rev] = ne.evaluate('-log(-x_rev_masked+1)')
    return out
def mask_vectorized_numexpr_app2(x):
    """Evaluate sign(x) * log(1 + |x|) in a single numexpr expression.

    The sign is folded into the logarithm as an exponent of +1 or -1,
    using log(a**s) == s * log(a).
    """
    return ne.evaluate('log( (abs(x)+1)**((-2*(x<0))+1))')
def f(x):
    """Return sign(x) * log(1 + |x|) for a scalar or an array.

    np.sign is used instead of x/abs(x): the division is 0/0 at x == 0
    (NaN for arrays, ZeroDivisionError for Python numbers), whereas the
    expected value there is 0.
    """
    return np.sign(x) * np.log(1 + np.abs(x))

You can slightly improve the speed of your second solution by using np.where instead of np.select:
def loga(x):
    """np.select version: log(x+1) where x >= 0, -log(1-x) where x < 0.

    Both candidate arrays are computed for every element before np.select
    picks one, so the branch that is later discarded can take the log of a
    non-positive number. Those warnings are suppressed because the affected
    values never reach the result.
    """
    cond = [x >= 0, x < 0]
    with np.errstate(divide="ignore", invalid="ignore"):
        choice = [np.log(x+1), -np.log(-x+1)]
    return np.select(cond, choice)
def logb(x):
    """np.where version: log(x+1) where x >= 0, -log(1-x) where x < 0.

    Both argument arrays are computed in full before np.where picks from
    them, so warnings from the discarded branch (log of a non-positive
    number) are suppressed; those values never reach the result.
    """
    with np.errstate(divide="ignore", invalid="ignore"):
        return np.where(x>=0, np.log(x+1), -np.log(-x+1))
In [16]: %timeit loga(arange(-1000,1000))
10000 loops, best of 3: 169 µs per loop
In [17]: %timeit logb(arange(-1000,1000))
10000 loops, best of 3: 98.3 µs per loop
In [18]: np.all(loga(arange(-1000,1000)) == logb(arange(-1000,1000)))
Out[18]: True

Python: rewrite a looping numpy math function to run on GPU

Can someone help me rewrite this one function (the doTheMath function) to do the calculations on the GPU? I have spent a few good days trying to get my head around it, but to no avail. Maybe somebody can help me rewrite this function in whatever way they see fit, as long as it gives the same result at the end. I tried to use @jit from numba, but for some reason it is actually much slower than running the code as usual. With a huge sample size, the goal is to decrease the execution time considerably, so naturally I believe the GPU is the fastest way to do it.
I'll explain a little what is actually happening. The real data, which looks almost identical as the sample data created in the code below is divided into sample sizes of approx 5.000.000 rows each sample or around 150MB per file. In total there are around 600.000.000 rows or 20GB of data. I must loop through this data, sample by sample and then row by row in each sample, take the last 2000 (or another) rows as of each line and run the doTheMath function which returns a result. That result is then saved back to the hardrive where I can do some other things with it with another program. As you can see below, I do not need all of the results of all the rows, only those bigger than a specific amount. If I run my function as it is right now in python I get about 62seconds per 1.000.000 rows. This is a very long time considering all the data and how fast it should be done with.
I must mention that I upload the real data file by file to the RAM with the help of data = joblib.load(file) so uploading the data is not the problem as it takes only about 0.29 seconds per file. Once uploaded I run the entire code below. What takes the longest time is the doTheMath function. I am willing to give all of my 500 reputation points I have on stackoverflow as a reward for somebody willing to help me rewrite this simple code to run on the GPU. My interest is specifically in the GPU, I really want to see how it is done on this problem at hand.
EDIT/UPDATE 1:
Here is a link to a small sample of the real data: data_csv.zip About 102000 rows of real data1 and 2000 rows for real data2a and data2b. Use minimumLimit = 400 on the real sample data
EDIT/UPDATE 2:
For those following this post, here is a short summary of the answers below. Up until now we have 4 answers to the original question. The ones offered by @Divakar are just tweaks to the original code. Of the two tweaks only the first one is actually applicable to this problem; the second one is a good tweak but does not apply here. Of the other three answers, two are CPU-based solutions and one is a TensorFlow-GPU attempt. The TensorFlow-GPU version by Paul Panzer seems promising, but when I actually run it on the GPU it is slower than the original, so the code still needs improvement.
The other two CPU-based solutions were submitted by @PaulPanzer (a pure numpy solution) and @MSeifert (a numba solution). Both solutions give very good results and both process data extremely fast compared to the original code. Of the two, the one submitted by Paul Panzer is faster. It processes about 1.000.000 rows in about 3 seconds. The only problem is with smaller batchSizes; this can be overcome by either switching to the numba solution offered by MSeifert, or even to the original code after all the tweaks that have been discussed below.
I am very happy and thankful to @PaulPanzer and @MSeifert for the work they did on their answers. Still, since this is a question about a GPU-based solution, I am waiting to see if anybody is willing to give a GPU version a try, to see how much faster the data can be processed on the GPU compared to the current CPU solutions. If there are no other answers outperforming @PaulPanzer's pure numpy solution, then I'll accept his answer as the right one and he gets the bounty :)
EDIT/UPDATE 3:
@Divakar has posted a new answer with a solution for the GPU. In my tests on real data, the speed is not even comparable to the CPU counterpart solutions: the GPU processes about 5.000.000 rows in about 1,5 seconds. This is incredible :) I am very excited about the GPU solution and I thank @Divakar for posting it. I also thank @PaulPanzer and @MSeifert for their CPU solutions :) Now my research continues at an incredible speed thanks to the GPU :)
import pandas as pd
import numpy as np
import time
def doTheMath(tmpData1, data2a, data2b):
    """Count the rows of one window whose normalised mean lies within bounds.

    tmpData1 -- window of rows with (at least) four columns A, B, C, D
    data2a   -- per-row upper limits, one per row of the window
    data2b   -- per-row lower limits, one per row of the window

    Each column is rescaled with the window's min of C and max of B, the four
    rescaled columns are averaged, and the rows with
    data2b <= average <= data2a are counted.
    """
    A = tmpData1[:, 0]
    B = tmpData1[:,1]
    C = tmpData1[:,2]
    D = tmpData1[:,3]
    Bmax = B.max()
    Cmin = C.min()
    dif = (Bmax - Cmin)
    abcd = ((((A - Cmin) / dif) + ((B - Cmin) / dif) + ((C - Cmin) / dif) + ((D - Cmin) / dif)) / 4)
    return np.where(((abcd <= data2a) & (abcd >= data2b)), 1, 0).sum()
#Declare variables
batchSize = 2000
sampleSize = 5000000
resultArray = []
minimumLimit = 490 #use 400 on the real sample data

#Create Random Sample Data
data1 = np.matrix(np.random.uniform(1, 100, (sampleSize + batchSize, 4)))
data2a = np.matrix(np.random.uniform(0, 1, (batchSize, 1))) #upper limit
data2b = np.matrix(np.random.uniform(0, 1, (batchSize, 1))) #lower limit
#approx. half of data2a will be smaller than data2b, but that is only in the sample data because it is randomly generated, NOT the real data. The real data2a is always higher than data2b.

#Loop through the data
t0 = time.time()
# Visit only the offsets at which a full batchSize-row window exists; the
# trailing partial windows were skipped by the old shape check anyway.
for rowNr in range(data1.shape[0] - batchSize + 1):
    tmp_df = data1[rowNr:rowNr + batchSize] #rolling window
    result = doTheMath(tmp_df, data2a, data2b)
    if result >= minimumLimit:
        resultArray.append([rowNr, result])
print('Runtime:', time.time() - t0)

#Save data results
# reshape(-1, 2) keeps two columns even when no window reached minimumLimit;
# without it an empty list becomes a 1-D array and the [:, 1] below fails.
resultArray = np.array(resultArray).reshape(-1, 2)
print(resultArray[:,1].sum())
resultArray = pd.DataFrame({'index':resultArray[:,0], 'result':resultArray[:,1]})
resultArray.to_csv("Result Array.csv", sep=';')
The PC specs I am working on:
GTX970(4gb) video card;
i7-4790K CPU 4.00Ghz;
16GB RAM;
a SSD drive
running Windows 7;
As a side question, would a second video card in SLI help on this problem?

Introduction and solution code
Well, you asked for it! So, listed in this post is an implementation with PyCUDA, which provides lightweight wrappers exposing most of CUDA's capabilities within the Python environment. We will use its SourceModule functionality, which lets us write and compile CUDA kernels while staying in the Python environment.
Getting to the problem at hand, among the computations involved, we have sliding maximum and minimum, few differences and divisions and comparisons. For the maximum and minimum parts, that involves block max finding (for each sliding window), we will use reduction-technique as discussed in some detail here. This would be done at block level. For the upper level iterations across sliding windows, we would use the grid level indexing into CUDA resources. For more info on this block and grid format, please refer to page-18. PyCUDA also supports builtins for computing reductions like max and min, but we lose control, specifically we intend to use specialized memory like shared and constant memory for leveraging GPU at its near to optimum level.
Listing out the PyCUDA-NumPy solution code -
1] PyCUDA part -
import pycuda.autoinit
import pycuda.driver as drv
import numpy as np
from pycuda.compiler import SourceModule
# CUDA source for the sliding-window count. One thread block handles one
# window offset (blockIdx.x); the window length L must satisfy
# TBP <= L <= 2*TBP because every thread handles element `tid` and, when it
# exists, element `tid + TBP`.
mod = SourceModule("""
#define TBP 1024 // THREADS_PER_BLOCK

// Block-wide reduction: out[0] = max of d1 and out[1] = min of d2 over the
// L elements starting at `offset`.
__device__ void get_Bmax_Cmin(float* out, float *d1, float *d2, int L, int offset)
{
    int tid = threadIdx.x;
    int inv = TBP;

    __shared__ float dS[TBP][2];

    dS[tid][0] = d1[tid+offset];
    dS[tid][1] = d2[tid+offset];
    __syncthreads();

    // Fold in the elements beyond the first TBP ones.
    if(tid<L-TBP)
    {
        dS[tid][0] = fmaxf(dS[tid][0] , d1[tid+inv+offset]);
        dS[tid][1] = fminf(dS[tid][1] , d2[tid+inv+offset]);
    }
    __syncthreads();
    inv = inv/2;

    // Tree reduction in shared memory.
    while(inv!=0)
    {
        if(tid<inv)
        {
            dS[tid][0] = fmaxf(dS[tid][0] , dS[tid+inv][0]);
            dS[tid][1] = fminf(dS[tid][1] , dS[tid+inv][1]);
        }
        __syncthreads();
        inv = inv/2;
    }
    __syncthreads();

    // The caller passes a per-thread array as `out`, so EVERY thread must copy
    // the reduced values; writing only from thread 0 would leave all other
    // threads reading uninitialised memory.
    out[0] = dS[0][0];
    out[1] = dS[0][1];
    __syncthreads();
}

// For window offset blockIdx.x, count the rows whose normalised mean of the
// four columns lies within [lowL, highL]; the count is stored in out[blockIdx.x].
__global__ void main1(float* out, float *d0, float *d1, float *d2, float *d3, float *lowL, float *highL, int *BLOCKLEN)
{
    int L = BLOCKLEN[0];
    int tid = threadIdx.x;
    int iterID = blockIdx.x;
    float Bmax_Cmin[2];
    int inv;
    float Cmin, dif;

    __shared__ float dS[TBP*2];

    get_Bmax_Cmin(Bmax_Cmin, d1, d2, L, iterID);
    Cmin = Bmax_Cmin[1];
    dif = (Bmax_Cmin[0] - Cmin);

    inv = TBP;

    dS[tid] = (d0[tid+iterID] + d1[tid+iterID] + d2[tid+iterID] + d3[tid+iterID] - 4.0*Cmin) / (4.0*dif);
    __syncthreads();

    if(tid<L-TBP)
        dS[tid+inv] = (d0[tid+inv+iterID] + d1[tid+inv+iterID] + d2[tid+inv+iterID] + d3[tid+inv+iterID] - 4.0*Cmin) / (4.0*dif);

    // Replace each value by 1/0 depending on whether it lies within its bounds.
    dS[tid] = ((dS[tid] >= lowL[tid]) & (dS[tid] <= highL[tid])) ? 1 : 0;
    __syncthreads();

    if(tid<L-TBP)
        dS[tid] += ((dS[tid+inv] >= lowL[tid+inv]) & (dS[tid+inv] <= highL[tid+inv])) ? 1 : 0;
    __syncthreads();

    // Sum the per-thread counts.
    inv = inv/2;
    while(inv!=0)
    {
        if(tid<inv)
            dS[tid] += dS[tid+inv];
        __syncthreads();
        inv = inv/2;
    }

    if(tid==0)
        out[iterID] = dS[0];
    __syncthreads();
}
""")
Please note that THREADS_PER_BLOCK (TBP) has to be set based on the batchSize. The rule of thumb here is to set TBP to the largest power of 2 that is smaller than batchSize. Thus, for batchSize = 2000, we need TBP = 1024.
2] NumPy part -
def gpu_app_v1(A, B, C, D, batchSize, minimumLimit):
    """Run the `main1` kernel over every full window and filter the counts.

    A, B, C, D   -- the four data columns as separate contiguous float32 arrays
    batchSize    -- window length; the kernel is compiled for TBP = 1024
                    threads and handles at most two elements per thread
    minimumLimit -- only windows whose count is >= this value are returned

    The per-row bounds are read from the module-level data2a (upper) and
    data2b (lower) arrays.

    Returns (idx, counts): the qualifying window offsets and their counts.
    Raises ValueError if batchSize does not fit the kernel's layout.
    """
    # Every thread reads element `tid` unconditionally and element `tid+1024`
    # when it exists, so other window lengths would be processed incorrectly.
    if not 1024 <= batchSize <= 2048:
        raise ValueError("batchSize must be between 1024 and 2048 for TBP = 1024")

    func1 = mod.get_function("main1")
    outlen = len(A)-batchSize+1

    # Set block and grid sizes
    BSZ = (1024,1,1)
    GSZ = (outlen,1)

    # Allocate float32 directly instead of converting a float64 array.
    dest = np.zeros(outlen, dtype=np.float32)
    N = np.int32(batchSize)
    func1(drv.Out(dest), drv.In(A), drv.In(B), drv.In(C), drv.In(D),
          drv.In(data2b), drv.In(data2a),
          drv.In(N), block=BSZ, grid=GSZ)
    idx = np.flatnonzero(dest >= minimumLimit)
    return idx, dest[idx]
Benchmarking
I have tested on GTX 960M. Please note that PyCUDA expects arrays to be of contiguous order. So, we need to slice the columns and make copies. I am expecting/assuming that the data could be read from the files such that the data is spread along rows instead of being as columns. Thus, keeping those out of the benchmarking function for now.
Original approach -
def org_app(data1, batchSize, minimumLimit):
    """CPU baseline: apply doTheMath to every full window of data1.

    Returns a list of [rowNr, result] pairs for the windows whose result is
    at least minimumLimit. The bounds come from the module-level data2a and
    data2b arrays.
    """
    resultArray = []
    for rowNr in range(data1.shape[0]-batchSize+1):
        tmp_df = data1[rowNr:rowNr + batchSize] #rolling window
        result = doTheMath(tmp_df, data2a, data2b)
        if result >= minimumLimit:
            resultArray.append([rowNr, result])
    return resultArray
Timings and verification -
In [2]: #Declare variables
...: batchSize = 2000
...: sampleSize = 50000
...: resultArray = []
...: minimumLimit = 490 #use 400 on the real sample data
...:
...: #Create Random Sample Data
...: data1 = np.random.uniform(1, 100000, (sampleSize + batchSize, 4)).astype(np.float32)
...: data2b = np.random.uniform(0, 1, (batchSize)).astype(np.float32)
...: data2a = data2b + np.random.uniform(0, 1, (batchSize)).astype(np.float32)
...:
...: # Make column copies
...: A = data1[:,0].copy()
...: B = data1[:,1].copy()
...: C = data1[:,2].copy()
...: D = data1[:,3].copy()
...:
...: gpu_out1,gpu_out2 = gpu_app_v1(A, B, C, D, batchSize, minimumLimit)
...: cpu_out1,cpu_out2 = np.array(org_app(data1, batchSize, minimumLimit)).T
...: print(np.allclose(gpu_out1, cpu_out1))
...: print(np.allclose(gpu_out2, cpu_out2))
...:
True
False
So, there's some differences between CPU and GPU countings. Let's investigate them -
In [7]: idx = np.flatnonzero(~np.isclose(gpu_out2, cpu_out2))
In [8]: idx
Out[8]: array([12776, 15208, 17620, 18326])
In [9]: gpu_out2[idx] - cpu_out2[idx]
Out[9]: array([-1., -1., 1., 1.])
There are four instances of non-matching counts. These are off by at most 1. Upon research, I came across some information on this. Basically, since we are using math intrinsics for the max and min computations, I think those are causing the last binary bit in the floating-point representation to be different from the CPU counterpart. This is termed ULP error and has been discussed in detail here and here.
Finally, putting the issue aside, let's get to the most important bit, the performance -
In [10]: %timeit org_app(data1, batchSize, minimumLimit)
1 loops, best of 3: 2.18 s per loop
In [11]: %timeit gpu_app_v1(A, B, C, D, batchSize, minimumLimit)
10 loops, best of 3: 82.5 ms per loop
In [12]: 2180.0/82.5
Out[12]: 26.424242424242426
Let's try with bigger datasets. With sampleSize = 500000, we get -
In [14]: %timeit org_app(data1, batchSize, minimumLimit)
1 loops, best of 3: 23.2 s per loop
In [15]: %timeit gpu_app_v1(A, B, C, D, batchSize, minimumLimit)
1 loops, best of 3: 821 ms per loop
In [16]: 23200.0/821
Out[16]: 28.25822168087698
So, the speedup stays constant at around 27.
Limitations :
1) We are using float32 numbers, as GPUs work best with those. Double precision specially on non-server GPUs aren't popular when it comes to performance and since you are working with such a GPU, I tested with float32.
Further improvement :
1) We could use faster constant memory to feed in data2a and data2b, rather than use global memory.

Tweak #1
It's usually advised to vectorize things when working with NumPy arrays. But with very large arrays, I think you are out of options there. So, to boost performance, a minor tweak is possible to optimize the last step of summing.
We could replace the step that makes an array of 1s and 0s and does summing :
np.where(((abcd <= data2a) & (abcd >= data2b)), 1, 0).sum()
with np.count_nonzero that works efficiently to count True values in a boolean array, instead of converting to 1s and 0s -
np.count_nonzero((abcd <= data2a) & (abcd >= data2b))
Runtime test -
In [45]: abcd = np.random.randint(11,99,(10000))
In [46]: data2a = np.random.randint(11,99,(10000))
In [47]: data2b = np.random.randint(11,99,(10000))
In [48]: %timeit np.where(((abcd <= data2a) & (abcd >= data2b)), 1, 0).sum()
10000 loops, best of 3: 81.8 µs per loop
In [49]: %timeit np.count_nonzero((abcd <= data2a) & (abcd >= data2b))
10000 loops, best of 3: 28.8 µs per loop
Tweak #2
Use a pre-computed reciprocal when dealing with cases that undergo implicit broadcasting. Some more info here. Thus, store reciprocal of dif and use that instead at the step :
((((A - Cmin) / dif) + ((B - Cmin) / dif) + ...
Sample test -
In [52]: A = np.random.rand(10000)
In [53]: dif = 0.5
In [54]: %timeit A/dif
10000 loops, best of 3: 25.8 µs per loop
In [55]: %timeit A*(1.0/dif)
100000 loops, best of 3: 7.94 µs per loop
You have four places using division by dif. So, hopefully this would bring out noticeable boost there too!

Before you start tweaking the target (GPU) or using anything else (i.e. parallel executions ), you might want to consider how to improve the already existing code. You used the numba-tag so I'll use it to improve the code: First we operate on arrays not on matrices:
data1 = np.array(np.random.uniform(1, 100, (sampleSize + batchSize, 4)))
data2a = np.array(np.random.uniform(0, 1, batchSize)) #upper limit
data2b = np.array(np.random.uniform(0, 1, batchSize)) #lower limit
Each time you call doTheMath you expect an integer back, however you use a lot of arrays and create a lot of intermediate arrays:
abcd = ((((A - Cmin) / dif) + ((B - Cmin) / dif) + ((C - Cmin) / dif) + ((D - Cmin) / dif)) / 4)
return np.where(((abcd <= data2a) & (abcd >= data2b)), 1, 0).sum()
This creates an intermediate array each step:
tmp1 = A-Cmin,
tmp2 = tmp1 / dif,
tmp3 = B - Cmin,
tmp4 = tmp3 / dif
... you get the gist.
However, this is a reduce function (array -> integer), so having a lot of intermediate arrays is unnecessary weight; just calculate the value on the fly.
import numba as nb
@nb.njit
def doTheMathNumba(tmpData, data2a, data2b):
    """Count the rows of one window whose normalised mean lies within bounds.

    Same result as the array-based doTheMath, but each row's value is
    computed inside the loop so no intermediate arrays are created.
    """
    Bmax = np.max(tmpData[:, 1])
    Cmin = np.min(tmpData[:, 2])
    diff = (Bmax - Cmin)
    idiff = 1 / diff  # reciprocal computed once, multiplied in the loop
    sum_ = 0
    for i in range(tmpData.shape[0]):
        # (A + B + C + D) / (4 * diff) - Cmin / diff
        val = (tmpData[i, 0] + tmpData[i, 1] + tmpData[i, 2] + tmpData[i, 3]) / 4 * idiff - Cmin * idiff
        if val <= data2a[i] and val >= data2b[i]:
            sum_ += 1
    return sum_
I did something else here to avoid multiple operations:
(((A - Cmin) / dif) + ((B - Cmin) / dif) + ((C - Cmin) / dif) + ((D - Cmin) / dif)) / 4
= ((A - Cmin + B - Cmin + C - Cmin + D - Cmin) / dif) / 4
= (A + B + C + D - 4 * Cmin) / (4 * dif)
= (A + B + C + D) / (4 * dif) - (Cmin / dif)
This actually cuts down the execution time by almost a factor of 10 on my computer:
%timeit doTheMath(tmp_df, data2a, data2b) # 1000 loops, best of 3: 446 µs per loop
%timeit doTheMathNumba(tmp_df, data2a, data2b) # 10000 loops, best of 3: 59 µs per loop
There are certainly also other improvements, for example using a rolling min/max to calculate Bmax and Cmin, that would make at least part of the calculation run in O(sampleSize) instead of O(samplesize * batchsize). This would also make it possible to reuse some of the (A + B + C + D) / (4 * dif) - (Cmin / dif) calculations because if Cmin and Bmax don't change for the next sample these values don't differ. It's a bit complicated to do because the comparisons differ. But definitely possible! See here:
import time
import numpy as np
import numba as nb
@nb.njit
def doTheMathNumba(abcd, data2a, data2b, Bmax, Cmin):
    """Count the rows of one window whose normalised mean lies within bounds.

    abcd       -- per-row sums A + B + C + D for the window
    data2a/b   -- per-row upper / lower limits
    Bmax, Cmin -- the window's max of B and min of C, supplied by the caller
    """
    diff = (Bmax - Cmin)
    idiff = 1 / diff
    quarter_idiff = 0.25 * idiff
    sum_ = 0
    for i in range(abcd.shape[0]):
        # (A + B + C + D) / (4 * diff) - Cmin / diff
        val = abcd[i] * quarter_idiff - Cmin * idiff
        if val <= data2a[i] and val >= data2b[i]:
            sum_ += 1
    return sum_
@nb.njit
def doloop(data1, data2a, data2b, abcd, Bmaxs, Cmins, batchSize, sampleSize, minimumLimit, resultArray):
    """Evaluate every full window and collect those reaching minimumLimit.

    For each window offset rowNr, doTheMathNumba is called with the window's
    slice of abcd and the precomputed Bmaxs[rowNr] / Cmins[rowNr]. Qualifying
    (rowNr, result) pairs are written into the preallocated resultArray,
    which must have at least as many rows as there are full windows.

    Returns the filled leading part of resultArray.
    """
    found = 0
    for rowNr in range(data1.shape[0]):
        # Once a window no longer fits, none of the later ones will either.
        if rowNr + batchSize > abcd.shape[0]:
            break
        result = doTheMathNumba(abcd[rowNr:rowNr + batchSize],
                                data2a, data2b, Bmaxs[rowNr], Cmins[rowNr])
        if result >= minimumLimit:
            resultArray[found, 0] = rowNr
            resultArray[found, 1] = result
            found += 1
    return resultArray[:found]
#Declare variables
batchSize = 2000
sampleSize = 50000
resultArray = []
minimumLimit = 490 #use 400 on the real sample data

data1 = np.array(np.random.uniform(1, 100, (sampleSize + batchSize, 4)))
data2a = np.array(np.random.uniform(0, 1, batchSize)) #upper limit
data2b = np.array(np.random.uniform(0, 1, batchSize)) #lower limit

from scipy import ndimage

t0 = time.time()
abcd = np.sum(data1, axis=1)
# The filters are centred by default (size // 2 samples to the left of the
# current one). origin = -(size // 2) moves the window fully to the right, so
# entry i is the max/min over rows i .. i + batchSize - 1, which is the window
# doloop pairs it with.
Bmaxs = ndimage.maximum_filter1d(data1[:, 1],
                                 size=batchSize,
                                 origin=-(batchSize // 2))
Cmins = ndimage.minimum_filter1d(data1[:, 2],
                                 size=batchSize,
                                 origin=-(batchSize // 2))

# One row per full window, so doloop can never write past the end even if
# every window qualifies.
result = np.zeros((data1.shape[0] - batchSize + 1, 2), dtype=np.int64)
# Keep only the filled part that doloop returns.
result = doloop(data1, data2a, data2b, abcd, Bmaxs, Cmins, batchSize, sampleSize, minimumLimit, result)
print('Runtime:', time.time() - t0)
This gives me a Runtime: 0.759593152999878 (after numba compiled the functions!), while your original had Runtime: 24.68975639343262. Now we're 30 times faster!
With your sample size it still takes Runtime: 60.187848806381226 but that's not too bad, right?
And even though I haven't done this myself, numba says that it's possible to write "Numba for CUDA GPUs", and it doesn't seem too complicated.

Here is some code to demonstrate what is possible by just tweaking the algorithm. It's pure numpy but on the sample data you posted gives a roughly 35x speedup over the original version (~1,000,000 samples in ~2.5sec on my rather modest machine):
>>> result_dict = master('run')
[('load', 0.82578349113464355), ('precomp', 0.028138399124145508), ('max/min', 0.24333405494689941), ('ABCD', 0.015314102172851562), ('main', 1.3356468677520752)]
TOTAL 2.44821691513
Tweaks used:
A+B+C+D, see my other answer
running min/max, including avoiding to compute (A+B+C+D - 4Cmin)/(4dif) multiple times with the same Cmin/dif.
These are more or less routine. That leaves the comparison with data2a/b which is expensive O(NK) where N is the number of samples and K is the size of the window.
Here one can take advantage of the relatively well-behaved data. Using the running min/max one can create variants of data2a/b that can be used to test a range of window offsets at a time, if the test fails all these offsets can be ruled out immediately, otherwise the range is bisected.
import numpy as np
import time
# global variables; they will hold the precomputed pre-screening filters
preA, preB = {}, {}
CHUNK_SIZES = None
def sliding_argmax(data, K=2000):
    """compute the argmax of data over a sliding window of width K

    returns:
        indices  -- indices into data
        switches -- window offsets at which the maximum changes
                    (strictly speaking: where the index of the maximum changes)
                    excludes 0 but includes maximum offset (len(data)-K+1)

    see last line of compute_pre_screening_filter for a recipe to convert
    this representation to the vector of maxima
    """
    N = len(data)
    last = np.argmax(data[:K])
    indices = [last]
    while True:
        # the K samples following the current maximum
        ahead = data[last + 1 : last + K + 1]
        ge = np.where(ahead > data[last])[0]
        if len(ge) > 0:
            # a larger sample enters before the current maximum leaves
            last += 1 + ge[0]
        elif last + K >= N:
            # the current maximum stays in every remaining window
            break
        else:
            # the current maximum drops out; continue with the best successor
            last += 1 + np.argmax(ahead)
        indices.append(last)
    indices = np.array(indices)
    # a larger successor takes over as soon as it enters the window; a smaller
    # one only once its predecessor has left
    switches = np.where(data[indices[1:]] > data[indices[:-1]],
                        indices[1:] + (1-K), indices[:-1] + 1)
    return indices, np.r_[switches, [len(data)-K+1]]
def compute_pre_screening_filter(bound, n_offs):
    """compute pre-screening filter for point-wise upper bound

    given a K-vector of upper bounds B and K+n_offs-1-vector data
    compute K+n_offs-1-vector filter such that for each index j
    if for any offset 0 <= o < n_offs and index 0 <= i < K such that
    o + i = j, the inequality B_i >= data_j holds then filter_j >= data_j

    therefore the number of data points below filter is an upper bound for
    the maximum number of points below bound in any K-window in data
    """
    # pad both ends with the smallest bound found near that end
    pad_l, pad_r = np.min(bound[:n_offs-1]), np.min(bound[1-n_offs:])
    padded = np.r_[pad_l+np.zeros(n_offs-1,), bound, pad_r+np.zeros(n_offs-1,)]
    indices, switches = sliding_argmax(padded, n_offs)
    # expand the (indices, switches) representation into the running maximum
    return padded[indices].repeat(np.diff(np.r_[[0], switches]))
def compute_all_pre_screening_filters(upper, lower, min_chnk=5, dyads=6):
    """compute upper and lower pre-screening filters for data blocks of
    sizes K+n_offs-1 where
    n_offs = min_chnk, 2min_chnk, ..., 2^(dyads-1)min_chnk

    the result is stored in global variables preA and preB
    """
    global CHUNK_SIZES
    CHUNK_SIZES = min_chnk * 2**np.arange(dyads)
    # key 1 holds the exact bounds, used for the final per-window counts
    preA[1] = upper
    preB[1] = lower
    for n in CHUNK_SIZES:
        preA[n] = compute_pre_screening_filter(upper, n)
        # a lower bound is handled as the upper bound of the negated data
        preB[n] = -compute_pre_screening_filter(-lower, n)
def test_bounds(block, counts, threshold=400):
"""test whether the windows fitting in the data block 'block' fall
within the bounds using pre-screening for efficient bulk rejection
array 'counts' will be overwritten with the counts of compliant samples
note that accurate counts will only be returned for above threshold
windows, because the analysis of bulk rejected windows is short-circuited
also note that bulk rejection only works for 'well behaved' data and
for example not on random numbers

block     -- 1-d samples; window l covers block[l:l+K], K = len(preA[1])
counts    -- output array with one entry per window, written in place
threshold -- a chunk whose pre-screened count is below this value is
rejected in bulk: every window of the chunk receives that count
reads the module-level preA, preB and CHUNK_SIZES; returns None
"""
# N: number of windows to test, K: window length (length of exact bounds)
N = len(counts)
K = len(preA[1])
# the first r windows do not fill a whole smallest-size chunk
r = N % CHUNK_SIZES[0]
# chop up N into as large as possible chunks with matching pre computed
# filters
# start with small and work upwards
# the r leftover windows are counted exactly against the bounds preA[1],
# preB[1]
counts[:r] = [np.count_nonzero((block[l:l+K] <= preA[1]) &
(block[l:l+K] >= preB[1]))
for l in range(r)]
# bisect tests a chunk of M = len(counts) windows (M + K - 1 samples)
# against the filters stored for chunk size M
def bisect(block, counts):
M = len(counts)
cnts = np.count_nonzero((block <= preA[M]) & (block >= preB[M]))
# bulk rejection: the pre-screened count is written to all M windows
if cnts < threshold:
counts[:] = cnts
return
# smallest chunk size: count every window exactly
elif M == CHUNK_SIZES[0]:
counts[:] = [np.count_nonzero((block[l:l+K] <= preA[1]) &
(block[l:l+K] >= preB[1]))
for l in range(M)]
# otherwise recurse into the two halves; after halving M, block[:-M] and
# block[M:] each hold M + K - 1 samples
else:
M //= 2
bisect(block[:-M], counts[:M])
bisect(block[M:], counts[M:])
# from here on N counts the smallest-size chunks still to be processed
N = N // CHUNK_SIZES[0]
# walk the chunk sizes upwards; a set low bit of N means one chunk of the
# current size M is processed, then N is halved for the next size
for M in CHUNK_SIZES:
if N % 2:
bisect(block[r:r+M+K-1], counts[r:r+M])
r += M
elif N == 0:
return
N //= 2
# NOTE(review): indentation is lost in this listing; this 'else' appears to
# belong to the 'for' loop (runs once all chunk sizes are used up) and
# handles what is left as 2*N chunks of the largest size -- confirm
else:
for j in range(2*N):
bisect(block[r:r+M+K-1], counts[r:r+M])
r += M
def analyse(data, use_pre_screening=True, min_chnk=5, dyads=6,
            threshold=400):
    """Count, for every K-window of the samples, the points within bounds.

    data -- tuple (samples, upper, lower); samples is a 2-d array whose
        column 1 supplies the running maximum and column 2 the running
        minimum used for normalisation, upper and lower are the K-vectors
        of point-wise bounds
    use_pre_screening -- if true, counting is delegated to test_bounds
        (bulk rejection, so counts of rejected windows are not exact);
        otherwise every window is counted directly
    min_chnk, dyads -- forwarded to compute_all_pre_screening_filters
    threshold -- minimum count for a window to be listed as valid; also
        forwarded to test_bounds

    Returns a dict with 'counts' (one count per window), 'valid' (indices
    of the windows with counts >= threshold) and 'timings' (durations of
    the four stages: precomp, max/min, ABCD, main).
    """
    samples, upper, lower = data
    N, K = samples.shape[0], upper.shape[0]
    times = [time.time()]
    if use_pre_screening:
        compute_all_pre_screening_filters(upper, lower, min_chnk, dyads)
    times.append(time.time())
    # compute switching points of max and min for running normalisation
    upper_inds, upper_swp = sliding_argmax(samples[:, 1], K)
    lower_inds, lower_swp = sliding_argmax(-samples[:, 2], K)
    times.append(time.time())
    # sum columns
    ABCD = samples.sum(axis=-1)
    times.append(time.time())
    counts = np.empty((N-K+1,), dtype=int)

    def count_windows(start, stop, u_scale, l_scale):
        # normalise the samples of windows start..stop-1 with the current
        # max/min and write the number of in-bounds points into counts
        block = (ABCD[start:stop+K-1] - 4*l_scale) \
                * (0.25 / (u_scale-l_scale))
        if use_pre_screening:
            test_bounds(block, counts[start:stop], threshold=threshold)
        else:
            counts[start:stop] = [
                np.count_nonzero((block[l:l+K] <= upper) &
                                 (block[l:l+K] >= lower))
                for l in range(stop - start)]

    # main loop
    # loop variables:
    offs = 0
    u_ind, u_scale, u_swp = 0, samples[upper_inds[0], 1], upper_swp[0]
    l_ind, l_scale, l_swp = 0, samples[lower_inds[0], 2], lower_swp[0]
    while True:
        # check which is switching next, min(C) or max(B)
        if u_swp > l_swp:
            # greedily take the largest block possible such that dif and
            # Cmin do not change
            count_windows(offs, l_swp, u_scale, l_scale)
            # book keeping
            l_ind += 1
            offs = l_swp
            l_swp = lower_swp[l_ind]
            l_scale = samples[lower_inds[l_ind], 2]
        else:
            count_windows(offs, u_swp, u_scale, l_scale)
            u_ind += 1
            if u_ind == len(upper_inds):
                # the last switch point is the total number of windows
                assert u_swp == N-K+1
                break
            offs = u_swp
            u_swp = upper_swp[u_ind]
            u_scale = samples[upper_inds[u_ind], 1]
    times.append(time.time())
    # use the caller's threshold here (it used to be hard-coded to 400,
    # which silently ignored e.g. threshold=490)
    return {'counts': counts, 'valid': np.where(counts >= threshold)[0],
            'timings': np.diff(times)}
def master(mode='calibrate', data='fake', use_pre_screening=True, nrep=3,
min_chnk=None, dyads=None):
"""load or generate data, then calibrate or run the analysis

mode -- 'calibrate': time analyse() for every combination of min_chnk and
dyads (each averaged over nrep repetitions), print the table and
return (timings, best min_chnk, best dyads)
'run': call analyse() once, print the stage timings and return its
result
data -- 'load' or 'fake': read data1.csv, data2a.csv and data2b.csv from
the working directory ('fake' additionally tiles data1 ten times)
'random': uniform random numbers
anything else raises ValueError
min_chnk, dyads -- in 'calibrate' mode sequences of candidate values, in
'run' mode single values; None selects the defaults set below
"""
t = time.time()
if data in ('fake', 'load'):
data1 = np.loadtxt('data1.csv', delimiter=';', skiprows=1,
usecols=[1,2,3,4])
data2a = np.loadtxt('data2a.csv', delimiter=';', skiprows=1,
usecols=[1])
data2b = np.loadtxt('data2b.csv', delimiter=';', skiprows=1,
usecols=[1])
if data == 'fake':
data1 = np.tile(data1, (10, 1))
# NOTE(review): indentation is lost in this listing; presumably this
# assignment applies to both 'fake' and 'load' -- confirm
threshold = 400
elif data == 'random':
data1 = np.random.random((102000, 4))
data2b = np.random.random(2000)
data2a = np.random.random(2000)
threshold = 490
if use_pre_screening or mode == 'calibrate':
print('WARNING: pre-screening not efficient on artificial data')
else:
raise ValueError("data mode {} not recognised".format(data))
# from here on 'data' is the (samples, upper, lower) tuple for analyse()
data = data1, data2a, data2b
t_load = time.time() - t
if mode == 'calibrate':
min_chnk = (2, 3, 4, 5, 6) if min_chnk is None else min_chnk
dyads = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10) if dyads is None else dyads
# timings[i, j]: mean main-loop time for min_chnk[i] and dyads[j]
timings = np.zeros((len(min_chnk), len(dyads)))
print('max bisect ' + ' '.join([
' n.a.' if dy == 0 else '{:7d}'.format(dy) for dy in dyads]),
end='')
for i, mc in enumerate(min_chnk):
print('\nmin chunk {}'.format(mc), end=' ')
for j, dy in enumerate(dyads):
for k in range(nrep):
# entry 3 of 'timings' is the main-loop stage
if dy == 0: # no pre-screening
timings[i, j] += analyse(
data, False, mc, dy, threshold)['timings'][3]
else:
timings[i, j] += analyse(
data, True, mc, dy, threshold)['timings'][3]
timings[i, j] /= nrep
print('{:7.3f}'.format(timings[i, j]), end=' ', flush=True)
# pick the combination with the smallest mean time
best_mc, best_dy = np.unravel_index(np.argmin(timings.ravel()),
timings.shape)
print('\nbest', min_chnk[best_mc], dyads[best_dy])
return timings, min_chnk[best_mc], dyads[best_dy]
if mode == 'run':
min_chnk = 2 if min_chnk is None else min_chnk
dyads = 5 if dyads is None else dyads
res = analyse(data, use_pre_screening, min_chnk, dyads, threshold)
# prepend the loading time to the stage timings reported by analyse()
times = np.r_[[t_load], res['timings']]
print(list(zip(('load', 'precomp', 'max/min', 'ABCD', 'main'), times)))
print('TOTAL', times.sum())
return res

This is technically off-topic (not GPU) but I'm sure you'll be interested.
There is one obvious and rather large saving:
Precompute A + B + C + D (not in the loop, on the whole data: data1.sum(axis=-1)), because abcd = ((A+B+C+D) - 4Cmin) / (4dif). This will save quite a few ops.
Surprised nobody spotted that one before ;-)
Edit:
There is another thing, though I suspect that's only in your example, not in your real data:
As it stands roughly half of data2a will be smaller than data2b. In these places your conditions on abcd cannot be both True, so you needn't even compute abcd there.
Edit:
One more tweak I used below but forgot to mention concerns computing the max (or min) over a moving window. When you move one point to the right, say, how likely is the max to change? There are only two things that can change it: the new point on the right is larger (happens roughly once in windowlength times, and even if it happens, you immediately know the new max) or the old max falls off the window on the left (also happens roughly once in windowlength times). Only in this last case do you have to search the entire window for the new max.
Edit:
Couldn't resist giving it a try in tensorflow. I don't have a GPU, so you yourself have to test it for speed. Put "gpu" for "cpu" on the marked line.
On cpu it is about half as fast as your original implementation (i.e. without Divakar's tweaks). Note that I've taken the liberty of changing the inputs from matrix to plain array. Currently tensorflow is a bit of a moving target, so make sure you have the right version. I used Python 3.6 and tf 0.12.1. If you do a pip3 install tensorflow-gpu today it might work.
import numpy as np
import time
import tensorflow as tf
# currently the max/min code is sequential
# thus
parallel_iterations = 1
# but you can put this in a separate loop, precompute and then try and run
# the remainder of doTheMathTF with a larger parallel_iterations
# tensorflow is quite capricious about its data types
ddf = tf.float64
ddi = tf.int32
def worker(data1, data2a, data2b):
"""count, for every rolling window over data1, the in-bounds samples

data1 is a 2-d array; data2a (upper bounds) and data2b (lower bounds) are
1-d arrays whose length is the window size.  For window i the row sums
ABCD[i:i+window] are normalised with the window maximum of column 1 and
the window minimum of column 2, and the entries with
data2b <= abcd <= data2a are counted.  Returns the array of counts.

Written against the TensorFlow version named in the surrounding text
(0.12); names such as tf.to_int32, TensorArray.pack and
tf.initialize_all_variables are that version's API.
"""
###################################
# CHANGE cpu to gpu in next line! #
###################################
with tf.device('/cpu:0'):
g = tf.Graph ()
with g.as_default():
# row sums are precomputed once with numpy
ABCD = tf.constant(data1.sum(axis=-1), dtype=ddf)
B = tf.constant(data1[:, 1], dtype=ddf)
C = tf.constant(data1[:, 2], dtype=ddf)
window = tf.constant(len(data2a))
# N: number of windows
N = tf.constant(data1.shape[0] - len(data2a) + 1, dtype=ddi)
data2a = tf.constant(data2a, dtype=ddf)
data2b = tf.constant(data2b, dtype=ddf)
# loop body: (i, Bmax, Bmaxind, Cmin, Cminind, out) -> next state
def doTheMathTF(i, Bmax, Bmaxind, Cmin, Cminind, out):
# most of the time we can keep the old max/min
# if the old max has left the window (index < i) search the whole
# window; otherwise only compare with the newly entered last sample
Bmaxind = tf.cond(Bmaxind<i,
lambda: i + tf.to_int32(
tf.argmax(B[i:i+window], axis=0)),
lambda: tf.cond(Bmax>B[i+window-1],
lambda: Bmaxind,
lambda: i+window-1))
# same scheme for the running minimum of column C
Cminind = tf.cond(Cminind<i,
lambda: i + tf.to_int32(
tf.argmin(C[i:i+window], axis=0)),
lambda: tf.cond(Cmin<C[i+window-1],
lambda: Cminind,
lambda: i+window-1))
Bmax = B[Bmaxind]
Cmin = C[Cminind]
# abcd = ((A+B+C+D) - 4*Cmin) / (4*(Bmax-Cmin))
abcd = (ABCD[i:i+window] - 4 * Cmin) * (1 / (4 * (Bmax-Cmin)))
out = out.write(i, tf.to_int32(
tf.count_nonzero(tf.logical_and(abcd <= data2a,
abcd >= data2b))))
return i + 1, Bmax, Bmaxind, Cmin, Cminind, out
with tf.Session(graph=g) as sess:
# NOTE: the names on the left swap Bmax/Bmaxind and Cmin/Cminind relative
# to the loop-variable order; only 'out' is used afterwards
# the initial indices of -1 force a full window search at i == 0
i, Bmaxind, Bmax, Cminind, Cmin, out = tf.while_loop(
lambda i, _1, _2, _3, _4, _5: i<N, doTheMathTF,
(tf.Variable(0, dtype=ddi), tf.Variable(0.0, dtype=ddf),
tf.Variable(-1, dtype=ddi),
tf.Variable(0.0, dtype=ddf), tf.Variable(-1, dtype=ddi),
tf.TensorArray(ddi, size=N)),
shape_invariants=None,
parallel_iterations=parallel_iterations,
back_prop=False)
# stack the per-window counts into one tensor and evaluate it
out = out.pack()
sess.run(tf.initialize_all_variables())
out, = sess.run((out,))
return out
# Declare variables
# batchSize is the rolling-window length, sampleSize + 1 the number of windows
batchSize = 2000
sampleSize = 50000#00
resultArray = []
# Create sample data: data1 has 4 columns, data2a/data2b one value per
# window position
data1 = np.random.uniform(1, 100, (sampleSize + batchSize, 4))
data2a = np.random.uniform(0, 1, (batchSize,))
data2b = np.random.uniform(0, 1, (batchSize,))
# time the tensorflow implementation
t0 = time.time()
out = worker(data1, data2a, data2b)
print('Runtime (tensorflow):', time.time() - t0)
# keep (window index, count) pairs for windows with at least 490 hits
good_indices, = np.where(out >= 490)
res_tf = np.c_[good_indices, out[good_indices]]
def doTheMath(tmpData1, data2a, data2b):
    """Count the rows of one window whose normalised mean lies in bounds.

    Each of the four columns of `tmpData1` is rescaled with the minimum of
    column 2 and the range (maximum of column 1 minus that minimum); the
    four rescaled columns are averaged row-wise.  Returns the number of
    rows whose average satisfies data2b <= average <= data2a.
    """
    columns = [tmpData1[:, k] for k in range(4)]
    lowest = columns[2].min()
    span = columns[1].max() - lowest
    scaled = [(col - lowest) / span for col in columns]
    abcd = (((scaled[0] + scaled[1]) + scaled[2]) + scaled[3]) / 4
    within = (abcd <= data2a) & (abcd >= data2b)
    return np.where(within, 1, 0).sum()
# Loop through the data with the original (reference) implementation
t0 = time.time()
for rowNr in range(sampleSize+1):
    tmp_df = data1[rowNr:rowNr + batchSize] #rolling window
    result = doTheMath(tmp_df, data2a, data2b)
    if (result >= 490):
        resultArray.append([rowNr , result])
print('Runtime (original):', time.time() - t0)
# check that both implementations agree; np.all replaces np.alltrue, which
# was removed in NumPy 2.0
print(np.all(np.array(resultArray)==res_tf))

performance loss after vectorization in numpy

I am writing a time consuming program. To reduce the time, I have tried my best to use numpy.dot instead of for loops.
However, I found vectorized program to have much worse performance than the for loop version:
import numpy as np
import datetime
# input arrays (all zeros / ones, only the shapes matter for the timing)
kpt_list = np.zeros((10000,20),dtype='float')
rpt_list = np.zeros((1000,20),dtype='float')
h_r = np.zeros((20,20,1000),dtype='complex')
r_ndegen = np.zeros(1000,dtype='float')
r_ndegen.fill(1)
# setup completed
# this is the vectorized version
# r_ndegen is tiled to shape (1000, 10000) to match the phase matrix
r_ndegen_tile = np.tile(r_ndegen.reshape(1000, 1), 10000)
start = datetime.datetime.now()
phase = np.exp(1j * np.dot(rpt_list, kpt_list.T))/r_ndegen_tile
# (20, 20, 1000) dot (1000, 10000) -> (20, 20, 10000)
kpt_data_1 = h_r.dot(phase)
end = datetime.datetime.now()
print((end-start).total_seconds())
# the result is 19.302483
# this is the for loop version
kpt_data_2 = np.zeros((20, 20, 10000), dtype='complex')
start = datetime.datetime.now()
# one k-point per iteration: phase is a length-1000 vector
for i in range(10000):
kpt = kpt_list[i, :]
phase = np.exp(1j * np.dot(kpt, rpt_list.T))/r_ndegen
kpt_data_2[:, :, i] = h_r.dot(phase)
end = datetime.datetime.now()
print((end-start).total_seconds())
# the result is 7.74583
What is happening here?

The first thing I suggest you do is break your script down into separate functions to make profiling and debugging easier:
def setup(n1=10000, n2=1000, n3=20, seed=None):
    """Generate random input arrays for the benchmark functions.

    Parameters
    ----------
    n1, n2, n3 : int
        Number of k-points, number of r-points and size of the trailing
        (and of the two leading h_r) dimensions.
    seed : int or None
        Seed for the private RandomState; a fixed seed gives reproducible
        arrays.

    Returns
    -------
    kpt_list : (n1, n3) float array
    rpt_list : (n2, n3) float array
    h_r : (n3, n3, n2) complex array
    r_ndegen : (n2,) float array
    """
    gen = np.random.RandomState(seed)
    # builtin float/complex replace the np.float/np.complex aliases, which
    # no longer exist in current NumPy releases
    kpt_list = gen.randn(n1, n3).astype(float)
    rpt_list = gen.randn(n2, n3).astype(float)
    h_r = (gen.randn(n3, n3, n2) + 1j*gen.randn(n3, n3, n2)).astype(complex)
    # one degeneracy weight per r-point (was hard-coded to 1000 entries)
    r_ndegen = gen.randn(n2).astype(float)
    return kpt_list, rpt_list, h_r, r_ndegen
def original_vec(*args, **kwargs):
    """Vectorized version from the question (slow N-d ``dot``).

    All arguments are forwarded to ``setup``.  Returns the array
    ``h_r.dot(phase)`` with one trailing entry per k-point.
    """
    kpt_list, rpt_list, h_r, r_ndegen = setup(*args, **kwargs)
    # tile the weights to one column per k-point; the sizes are taken from
    # the arrays instead of the literal 1000 / 10000
    r_ndegen_tile = np.tile(r_ndegen.reshape(-1, 1), kpt_list.shape[0])
    phase = np.exp(1j * np.dot(rpt_list, kpt_list.T)) / r_ndegen_tile
    # dot of a 3-d array with a 2-d array: this is the slow step
    kpt_data = h_r.dot(phase)
    return kpt_data
def original_loop(*args, **kwargs):
    """For-loop version from the question: one k-point per iteration.

    All arguments are forwarded to ``setup``.  Returns a complex array
    with the two leading dimensions of h_r and one trailing entry per
    k-point.
    """
    kpt_list, rpt_list, h_r, r_ndegen = setup(*args, **kwargs)
    # sizes are taken from the arrays instead of the literal 20 / 10000
    n_kpt = kpt_list.shape[0]
    kpt_data = np.zeros(h_r.shape[:2] + (n_kpt,), dtype='complex')
    for i in range(n_kpt):
        kpt = kpt_list[i, :]
        phase = np.exp(1j * np.dot(kpt, rpt_list.T)) / r_ndegen
        kpt_data[:, :, i] = h_r.dot(phase)
    return kpt_data
I would also highly recommend using random data rather than all-zero or all-one arrays, unless that's what your actual data looks like (!). This makes it much easier to check the correctness of your code - for example, if your last step is to multiply by a matrix of zeros then your output will always be all-zeros, regardless of whether or not there is a mistake earlier on in your code.
Next, I would run these functions through line_profiler to see where they are spending most of their time. In particular, for original_vec:
In [1]: %lprun -f original_vec original_vec()
Timer unit: 1e-06 s
Total time: 23.7598 s
File: <ipython-input-24-c57463f84aad>
Function: original_vec at line 12
Line # Hits Time Per Hit % Time Line Contents
==============================================================
12 def original_vec(*args, **kwargs):
13
14 1 86498 86498.0 0.4 kpt_list, rpt_list, h_r, r_ndegen = setup(*args, **kwargs)
15
16 1 69700 69700.0 0.3 r_ndegen_tile = np.tile(r_ndegen.reshape(1000, 1), 10000)
17 1 1331947 1331947.0 5.6 phase = np.exp(1j * np.dot(rpt_list, kpt_list.T)) / r_ndegen_tile
18 1 22271637 22271637.0 93.7 kpt_data = h_r.dot(phase)
19
20 1 4 4.0 0.0 return kpt_data
You can see that it spends 93% of its time computing the dot product between h_r and phase. Here, h_r is a (20, 20, 1000) array and phase is (1000, 10000). We're computing a sum product over the last dimension of h_r and the first dimension of phase (you could write this in einsum notation as ijk,kl->ijl).
The first two dimensions of h_r don't really matter here - we could just as easily reshape h_r into a (20*20, 1000) array before taking the dot product. It turns out that this reshaping operation by itself gives a huge performance improvement:
In [2]: %timeit h_r.dot(phase)
1 loop, best of 3: 22.6 s per loop
In [3]: %timeit h_r.reshape(-1, 1000).dot(phase)
1 loop, best of 3: 1.04 s per loop
I'm not entirely sure why this should be the case - I would have hoped that numpy's dot function would be smart enough to apply this simple optimization automatically. On my laptop the second case seems to use multiple threads whereas the first one doesn't, suggesting that it might not be calling multithreaded BLAS routines.
Here's a vectorized version that incorporates the reshaping operation:
def new_vec(*args, **kwargs):
    """Vectorized version that flattens h_r to 2-d before the product.

    All arguments are forwarded to ``setup``.  The result has the two
    leading dimensions of h_r and one trailing entry per k-point.
    """
    kpt_list, rpt_list, h_r, r_ndegen = setup(*args, **kwargs)
    angles = np.dot(rpt_list, kpt_list.T)
    # broadcasting divides every row by its weight, no tiling needed
    phase = np.exp(1j * angles) / r_ndegen[:, None]
    n_r = phase.shape[0]
    # collapse the two leading axes so that dot sees two 2-d operands
    flat = h_r.reshape(-1, n_r).dot(phase)
    restored_shape = h_r.shape[:2] + (-1,)
    return flat.reshape(restored_shape)
The -1 indices tell numpy to infer the size of those dimensions according to the other dimensions and the number of elements in the array. I've also used broadcasting to divide by r_ndegen, which eliminates the need for np.tile.
By using the same random input data, we can check that the new version gives the same result as the original:
In [4]: ans1 = original_loop(seed=0)
In [5]: ans2 = new_vec(seed=0)
In [6]: np.allclose(ans1, ans2)
Out[6]: True
Some performance benchmarks:
In [7]: %timeit original_loop()
1 loop, best of 3: 13.5 s per loop
In [8]: %timeit original_vec()
1 loop, best of 3: 24.1 s per loop
In [5]: %timeit new_vec()
1 loop, best of 3: 2.49 s per loop
Update:
I was curious about why np.dot was so much slower for the original (20, 20, 1000) h_r array, so I dug into the numpy source code. The logic implemented in multiarraymodule.c turns out to be shockingly simple:
#if defined(HAVE_CBLAS)
if (PyArray_NDIM(ap1) <= 2 && PyArray_NDIM(ap2) <= 2 &&
(NPY_DOUBLE == typenum || NPY_CDOUBLE == typenum ||
NPY_FLOAT == typenum || NPY_CFLOAT == typenum)) {
return cblas_matrixproduct(typenum, ap1, ap2, out);
}
#endif
In other words numpy just checks whether either of the input arrays has > 2 dimensions, and immediately falls back on a non-BLAS implementation of matrix-matrix multiplication. It seems like it shouldn't be too difficult to check whether the inner dimensions of the two arrays are compatible, and if so treat them as 2D and perform *gemm matrix-matrix multiplication on them. In fact there's an open feature request for this dating back to 2012, if any numpy devs are reading...
In the meantime, it's a nice performance trick to be aware of when multiplying tensors.
Update 2:
I forgot about np.tensordot. Since it calls the same underlying BLAS routines as np.dot on a 2D array, it can achieve the same performance bump, but without all those ugly reshape operations:
In [6]: %timeit np.tensordot(h_r, phase, axes=1)
1 loop, best of 3: 1.05 s per loop

I suspect the first operation is hitting the resource limit. Maybe you can benefit from these two questions: Efficient dot products of large memory-mapped arrays, and Dot product of huge arrays in numpy.

Faster way to do this using Python, Numpy?

I have a function in Python. I would like to make it a lot faster? Does anyone have any tips?
def garchModel(e2, omega=0.01, beta=0.1, gamma=0.8 ):
    """Evaluate the recursion sigma[i] = omega + beta*sigma[i-1] + gamma*e2[i-1].

    e2 -- sequence of input values (list or array)
    omega, beta, gamma -- coefficients; sigma[0] is set to omega

    Returns a float array of the same length as e2 (empty for empty input).
    """
    n = len(e2)
    sigma = np.empty(n)
    if n == 0:
        # nothing to initialise; sigma[0] would raise IndexError
        return sigma
    sigma[0] = omega
    # each value depends on the previous one, hence the explicit loop
    for i in range(1, n):
        sigma[i] = omega + beta * sigma[i-1] + gamma * e2[i-1]
    return sigma

The following code works, but there's too much trickery going on, I am not sure it is not depending on some undocumented implementation detail that could eventually break down:
from numpy.lib.stride_tricks import as_strided
from numpy.core.umath_tests import inner1d
def garch_model(e2, omega=0.01, beta=0.1, gamma=0.8):
"""loop-free attempt at sigma[i] = omega + beta*sigma[i-1] + gamma*e2[i-1]

e2 must support slicing and multiplication by a scalar (e.g. an ndarray).
Returns the array sigma with sigma[0] = omega.

NOTE(review): the last step writes into the same memory it reads from and
only yields the recursion if the rows are evaluated strictly in order with
each write visible to the next read.  NumPy releases since 1.13 document
that ufuncs copy overlapping operands, and inner1d lives in a private test
module -- verify against the plain loop before relying on this.
"""
n = len(e2)
sigma = np.empty((n,))
# start with the non-recursive part: omega + gamma*e2[i-1]
sigma[:] = omega
sigma[1:] += gamma * e2[:-1]
# overlapping view: row i is (sigma[i], sigma[i+1])
sigma_view = as_strided(sigma, shape=(n-1, 2), strides=sigma.strides*2)
# row-wise inner product with [beta, 1], i.e. beta*sigma[i] + sigma[i+1],
# stored to sigma[i+1]
inner1d(sigma_view, [beta, 1], out=sigma[1:])
return sigma
In [75]: e2 = np.random.rand(1e6)
In [76]: np.allclose(garchModel(e2), garch_model(e2))
Out[76]: True
In [77]: %timeit garchModel(e2)
1 loops, best of 3: 6.93 s per loop
In [78]: %timeit garch_model(e2)
100 loops, best of 3: 17.5 ms per loop

I have tried using Numba, which for my dataset gives a 200x improvement!
Thanks for all the suggestions above, but I can't get them to give me the correct answer. I will try to read up about linear filters, but it's Friday night now and I'm a bit too tired to take in anymore information.
from numba import autojit
# NOTE(review): the next line was presumably the decorator "@autojit" (the
# surrounding text credits Numba for the speed-up); as a plain comment it
# has no effect -- confirm and restore
#autojit
# same recursion as garchModel; note that the keyword order here is
# (beta, gamma, omega)
def garchModel2(e2, beta=0.1, gamma=0.8, omega=0.01, ):
sigma = np.empty( len( e2 ) )
sigma[0] = omega
# explicit loop: each value depends on the previous one
for i in range( 1, len(e2) ):
sigma[i] = omega + beta * sigma[ i-1 ] + gamma * e2[ i-1 ]
return sigma

This is a solution based on @stx2's idea. One potential problem is that beta**N may cause floating-point underflow or overflow when N becomes large (same with cumprod).
>>> def garchModel2(e2, omega=0.01, beta=0.1, gamma=0.8):
wt0=cumprod(array([beta,]*(len(e2)-1)))
wt1=cumsum(hstack((0.,wt0)))+1
wt2=hstack((wt0[::-1], 1.))*gamma
wt3=hstack((1, wt0))[::-1]*beta
pt1=hstack((0.,(array(e2)*wt2)[:-1]))
pt2=wt1*omega
return cumsum(pt1)/wt3+pt2
>>> garchModel([1,2,3,4,5])
array([ 0.01 , 0.811 , 1.6911 , 2.57911 , 3.467911])
>>> garchModel2([1,2,3,4,5])
array([ 0.01 , 0.811 , 1.6911 , 2.57911 , 3.467911])
>>> f1=lambda: garchModel2(range(5))
>>> f=lambda: garchModel(range(5))
>>> T=timeit.Timer('f()', 'from __main__ import f')
>>> T1=timeit.Timer('f1()', 'from __main__ import f1')
>>> T.timeit(1000)
0.01588106868331031
>>> T1.timeit(1000) #When e2 dimension is small, garchModel2 is slower
0.04536693909403766
>>> f1=lambda: garchModel2(range(10000))
>>> f=lambda: garchModel(range(10000))
>>> T.timeit(1000)
35.745981961394534
>>> T1.timeit(1000) #When e2 dimension is large, garchModel2 is faster
1.7330512676890066
>>> f1=lambda: garchModel2(range(1000000))
>>> f=lambda: garchModel(range(1000000))
>>> T.timeit(50)
167.33835501439427
>>> T1.timeit(50) #The difference is even bigger.
8.587259274572716
I didn't use beta**N but cumprod instead. ** will probably slow it down a lot.

Your calculation is a linear filter of the sequence omega + gamma*e2, so you can use scipy.signal.lfilter. Here's a version of your calculation, with appropriate tweaks of the initial conditions and input of the filter to generate the same output as garchModel:
import numpy as np
from scipy.signal import lfilter
def garch_lfilter(e2, omega=0.01, beta=0.1, gamma=0.8):
    """Evaluate the GARCH recursion with scipy.signal.lfilter.

    Computes sigma[0] = omega and
    sigma[i] = omega + beta*sigma[i-1] + gamma*e2[i-1] for i >= 1.
    `e2` must be an array (it is sliced and scaled).  Returns the array
    sigma, which has the same length as e2.
    """
    # y[k] = x[k] + beta*y[k-1]  <=>  numerator [1], denominator [1, -beta]
    numerator = [1]
    denominator = [1, -beta]
    # initial filter state chosen so that the first output includes
    # beta*omega, i.e. the "previous output" is omega
    state = np.array([beta*omega])
    sigma = np.empty(len(e2))
    sigma[0] = omega
    # the filter input is the non-recursive part of the recursion
    driving = omega + gamma*e2[:-1]
    filtered, _ = lfilter(numerator, denominator, driving, zi=state)
    sigma[1:] = filtered
    return sigma
Verify that it gives the same result as @Jaime's function:
In [6]: e2 = np.random.rand(1e6)
In [7]: np.allclose(garch_model(e2), garch_lfilter(e2))
Out[7]: True
It is a lot faster than garchModel, but not as fast as @Jaime's function.
Timing for @Jaime's garch_model:
In [8]: %timeit garch_model(e2)
10 loops, best of 3: 21.6 ms per loop
Timing for garch_lfilter:
In [9]: %timeit garch_lfilter(e2)
10 loops, best of 3: 26.8 ms per loop

As #jaime shows, there's a way. However, I don't know if there's a way to rewrite the function, make it much faster, and keep it simple.
An alternative approach then, is using optimization "magics", such as cython or numba.

paralellize loop over iter

I am having performance issues with my code.
Step # IIII consumes hours of time. I used to materialize the itertools.product before, but thanks to a user I don't do pro_data = product(array_b,array_a) anymore. This helped me with memory issues, but it is still heavily time consuming.
I would like to parallelize it with multithreading or multiprocessing; I am grateful for whatever you can suggest.
Explanation: I have two arrays that contain x and y values of particles. For each particle (defined by two coordinates) I want to calculate a function with another particle. For combinations I use the itertools.product method and loop over every particle. I run over 50000 particles in total, so I have N*N/2 combinations to calculate.
Thanks in advance
import numpy as np
import matplotlib.pyplot as plt
from itertools import product,combinations_with_replacement
def func(ar1,ar2,ar3,ar4): #example func that takes four arguments
    """Return ar1*ar2**22 + sin(ar3) + ar4."""
    power_term = ar1*ar2**22
    wave_term = np.sin(ar3)
    return (power_term+wave_term+ar4)


def newdist(a):
    """Apply func to a pair of 2-component items: a = (item0, item1)."""
    first, second = a[0], a[1]
    return func(first[0], first[1], second[0], second[1])
x_edges = np.logspace(-3,1, num=25) #prepare x-axis for histogram
# geometric bin centres and bin widths for the bar plot
x_mean = 10**((np.log10(x_edges[:-1])+np.log10(x_edges[1:]))/2)
x_width=x_edges[1:]-x_edges[:-1]
hist_data=np.zeros([len(x_edges)-1])
array1=np.random.uniform(0.,10.,100)
array2=np.random.uniform(0.,10.,100)
# one row of two coordinates per particle
array_a = np.dstack((array1,array1))[0]
array_b = np.dstack((array2,array2))[0]
# IIII
# evaluate newdist for every pair and accumulate the histogram
for i in product(array_a,array_b):
    (result,bins) = np.histogram(newdist(i),bins=x_edges)
    hist_data+=result
# astype works on Python 2 and 3; np.array(map(float, ...)) wraps the lazy
# map object on Python 3 instead of converting the values
hist_data = hist_data.astype(float)
plt.bar(x_mean,hist_data,width=x_width,color='r')
plt.show()
-----EDIT-----
I used this code now:
def mp_dist(array_a,array_b, d, bins): #d chunks AND processes
    """Histogram all pair values, one process per pair of chunks.

    array_a, array_b -- arrays with one row of two coordinates per particle
    d -- number of chunks per array; d*d processes are started
    bins -- histogram bin edges

    Trailing rows that do not fill a whole chunk are ignored.  Each worker
    passes its (4, a_chunk, b_chunk) block to vec_chunk (defined elsewhere;
    presumably it returns the histogram counts for `bins` -- confirm).
    Returns the sum of all worker results.
    """
    def worker(array_ab, out_q):
        """ push result in queue """
        out_q.put(vec_chunk(array_ab, bins))
    out_q = mp.Queue()
    a = np.swapaxes(array_a, 0 ,1)
    b = np.swapaxes(array_b, 0 ,1)
    array_size_a=len(array_a)-(len(array_a)%d)
    array_size_b=len(array_b)-(len(array_b)%d)
    # floor division keeps the chunk sizes integral on Python 3 as well
    a_chunk = array_size_a // d
    b_chunk = array_size_b // d
    procs = []
    #prepare arrays for mp
    array_ab = np.empty((4, a_chunk, b_chunk))
    for j in range(d):
        for k in range(d):
            array_ab[[0, 1]] = a[:, a_chunk * j:a_chunk * (j + 1), None]
            array_ab[[2, 3]] = b[:, None, b_chunk * k:b_chunk * (k + 1)]
            p = mp.Process(target=worker, args=(array_ab, out_q))
            procs.append(p)
            p.start()
    # start from zeros: np.empty holds arbitrary values and must not be
    # accumulated into
    resultarray = np.zeros(len(bins)-1)
    # every started process puts exactly one result, so collect one per
    # process (d*d in total), and do so before joining
    for _ in procs:
        resultarray+=out_q.get()
    # Wait for all worker processes to finish
    for pro in procs:
        pro.join()
    print(resultarray)
    return resultarray
The problem here is that I cannot control the number of processes. How can I use an mp.Pool() instead?
than

First, let's look at a straightforward vectorization of your problem. I have a feeling that you want your array_a and array_b to be exactly the same, i.e. the coordinates of the particles, but I am keeping them separate here.
I have turned your code into a function, to make timing easier:
def IIII(array_a, array_b, bins) :
    """Reference implementation: histogram newdist over all pairs.

    Loops over the Cartesian product of the rows of array_a and array_b,
    evaluates newdist on each pair and accumulates the histogram for the
    bin edges `bins`.  Returns the float array of counts.
    """
    hist_data=np.zeros([len(bins)-1])
    for i in product(array_a,array_b):
        # the returned edges are discarded so that `bins` is not rebound
        result, _ = np.histogram(newdist(i), bins=bins)
        hist_data+=result
    # astype works on Python 2 and 3, unlike np.array(map(float, ...))
    return hist_data.astype(float)
You can, by the way, generate your sample data in a less convoluted way as follows:
n = 100
array_a = np.random.uniform(0, 10, size=(n, 2))
array_b = np.random.uniform(0, 10, size=(n, 2))
So first we need to vectorize your func. I have done it so it can take any array of shape (4, ...). To spare memory, it is doing the calculation in place, and returning the first plane, i.e. array[0].
def func_vectorized(a) :
    """Evaluate a[0]*a[1]**22 + sin(a[2]) + a[3] in place.

    `a` has shape (4, ...).  To save memory the planes a[0], a[1] and a[2]
    are overwritten; the result is returned as a[0].
    """
    head, expo, angle, offset = a[0], a[1], a[2], a[3]
    np.power(expo, 22, out=expo)
    np.sin(angle, out=angle)
    np.multiply(head, expo, out=head)
    np.add(head, angle, out=head)
    np.add(head, offset, out=head)
    return a[0]
With this function in place, we can write a vectorized version of IIII:
def IIII_vec(array_a, array_b, bins) :
    """Vectorized pair histogram: all pairs evaluated in one array.

    Builds a (4, len(array_a), len(array_b)) array holding the two
    coordinates of every a-row (planes 0, 1) and of every b-row (planes
    2, 3), evaluates func_vectorized on it and returns the histogram
    counts for the bin edges `bins`.
    """
    n_a, n_b = len(array_a), len(array_b)
    grid = np.empty((4, n_a, n_b))
    a_cols = np.swapaxes(array_a, 0 ,1)
    b_cols = np.swapaxes(array_b, 0 ,1)
    # broadcast a along the second pair axis and b along the first
    grid[[0, 1]] = a_cols[:, :, None]
    grid[[2, 3]] = b_cols[:, None, :]
    values = func_vectorized(grid)
    counts, _ = np.histogram(values, bins=bins)
    return counts
With n = 100 points, they both return the same:
In [2]: h1 = IIII(array_a, array_b, x_edges)
In [3]: h2 = IIII_vec(array_a, array_b, x_edges)
In [4]: np.testing.assert_almost_equal(h1, h2)
But the timing differences are already very relevant:
In [5]: %timeit IIII(array_a, array_b, x_edges)
1 loops, best of 3: 654 ms per loop
In [6]: %timeit IIII_vec(array_a, array_b, x_edges)
100 loops, best of 3: 2.08 ms per loop
A 300x speedup!. If you try it again with longer sample data, n = 1000, you can see that they both scale equally bad, as n**2, so the 300x stays there:
In [10]: %timeit IIII(array_a, array_b, x_edges)
1 loops, best of 3: 68.2 s per loop
In [11]: %timeit IIII_vec(array_a, array_b, x_edges)
1 loops, best of 3: 229 ms per loop
So you are still looking at a good 10 min. of processing, which is not really that much when compared to the more than 2 days that your current solution would require.
Of course, for things to be so nice, you will need to fit a (4, 50000, 50000) array of floats into memory, something that my system cannot handle. But you can still keep things relatively fast by processing it in chunks. The following version of IIII_vec divides each array into d chunks. As written, the length of the array should be divisible by d. It wouldn't be too hard to overcome that limitation, but it would obfuscate the true purpose:
def IIII_vec_bis(array_a, array_b, bins, d=1) :
    """Chunked variant of the vectorized pair histogram.

    Each input is split into d chunks and the d*d chunk pairs are
    processed one after another in a single reused (4, a_chunk, b_chunk)
    work array, which bounds the memory needed.  The input lengths should
    be divisible by d; trailing rows that do not fill a chunk are not
    processed.  Returns the accumulated float histogram for the bin edges
    `bins`.
    """
    a = np.swapaxes(array_a, 0 ,1)
    b = np.swapaxes(array_b, 0 ,1)
    a_chunk = len(array_a) // d
    b_chunk = len(array_b) // d
    array_ab = np.empty((4, a_chunk, b_chunk))
    hist_data = np.zeros((len(bins) - 1,))
    # range instead of xrange: xrange does not exist on Python 3
    for j in range(d) :
        for k in range(d) :
            # refill the work array; func_vectorized overwrites it in place
            array_ab[[0, 1]] = a[:, a_chunk * j:a_chunk * (j + 1), None]
            array_ab[[2, 3]] = b[:, None, b_chunk * k:b_chunk * (k + 1)]
            newdist = func_vectorized(array_ab)
            hist, _ = np.histogram(newdist, bins=bins)
            hist_data += hist
    return hist_data
First, let's check that it really works:
In [4]: h1 = IIII_vec(array_a, array_b, x_edges)
In [5]: h2 = IIII_vec_bis(array_a, array_b, x_edges, d=10)
In [6]: np.testing.assert_almost_equal(h1, h2)
And now some timings. With n = 100:
In [7]: %timeit IIII_vec(array_a, array_b, x_edges)
100 loops, best of 3: 2.02 ms per loop
In [8]: %timeit IIII_vec_bis(array_a, array_b, x_edges, d=10)
100 loops, best of 3: 12 ms per loop
But as you start having to have a larger and larger array in memory, doing it in chunks starts to pay off. With n = 1000:
In [12]: %timeit IIII_vec(array_a, array_b, x_edges)
1 loops, best of 3: 223 ms per loop
In [13]: %timeit IIII_vec_bis(array_a, array_b, x_edges, d=10)
1 loops, best of 3: 208 ms per loop
With n = 10000 I can no longer call IIII_vec without an "array is too big" error, but the chunky version is still running:
In [18]: %timeit IIII_vec_bis(array_a, array_b, x_edges, d=10)
1 loops, best of 3: 21.8 s per loop
And just to show that it can be done, I have run it once with n = 50000:
In [23]: %timeit -n1 -r1 IIII_vec_bis(array_a, array_b, x_edges, d=50)
1 loops, best of 1: 543 s per loop
So a good 9 minutes of number crunching, which is not all that bad given it has computed 2.5 billion interactions.

Use vectorized numpy operations. Replace the for-loop over product() with a single newdist() call by creating arguments using meshgrid().
To parallelize the problem, compute newdist() on slices of array_a, array_b that correspond to subblocks of meshgrid(). Here's an example using slices and multiprocessing.
Here's another example to demonstrate the steps: python loop -> vectorized numpy version -> parallel:
#!/usr/bin/env python
from __future__ import division
import math
import multiprocessing as mp
import numpy as np
try:
from itertools import izip as zip
except ImportError:
zip = zip # Python 3
def pi_loop(x, y, npoints):
    """Estimate pi by Monte Carlo with a plain Python loop.

    Counts the points (x[i], y[i]) that lie strictly inside the unit
    circle and returns 4 * count / npoints.
    """
    # note: the method converges to pi very slowly.
    hits = 0
    for xx, yy in zip(x, y):
        if (xx**2 + yy**2) < 1:
            hits += 1
    return 4 * hits / npoints
def pi_vectorized(x, y, npoints):
    """Estimate pi by Monte Carlo with one vectorized numpy expression.

    x and y are arrays of coordinates; returns 4 times the number of
    points strictly inside the unit circle divided by npoints.
    """
    radius_sq = x**2 + y**2
    inside = radius_sq < 1
    return 4 * inside.sum() / npoints # or just .mean()
def mp_init(x_shared, y_shared):
    """Pool initializer: expose the shared buffers as module-level arrays.

    Wraps the two shared buffers in numpy arrays (no copy) and stores them
    in the globals mp_x and mp_y.
    """
    global mp_x, mp_y
    mp_x = np.frombuffer(x_shared) # no copy
    mp_y = np.frombuffer(y_shared)
def mp_pi(args):
    """Count the points of one slice that lie inside the unit circle.

    args is a (start, end) pair; the slice is taken from the module-level
    arrays mp_x and mp_y.
    """
    # perform computations on slices of mp_x, mp_y
    lo, hi = args
    window = slice(lo, hi)
    radius_sq = mp_x[window]**2 + mp_y[window]**2 # slicing makes no copy
    return (radius_sq < 1).sum()
def pi_parallel(x, y, npoints):
    """Estimate pi by Monte Carlo using a pool of worker processes.

    x and y are the shared buffers handed to every worker through mp_init;
    the index range 0..npoints is split into slices of 100000 points that
    the workers count with mp_pi.  Returns 4 * inside / npoints.
    """
    # compute pi using multiple processes
    pool = mp.Pool(initializer=mp_init, initargs=[x, y])
    try:
        step = 100000
        slices = ((start, start + step) for start in range(0, npoints, step))
        inside = sum(pool.imap_unordered(mp_pi, slices))
    finally:
        # release the worker processes instead of leaving that to garbage
        # collection; close/join works on Python 2 and 3
        pool.close()
        pool.join()
    return 4 * inside / npoints
def main():
    """Run the three pi estimators on the same random points and print,
    for each, the estimate and its absolute error."""
    npoints = 1000000
    # create shared arrays
    x_sh = mp.RawArray('d', npoints)
    y_sh = mp.RawArray('d', npoints)
    # numpy views on the shared memory, filled with the random coordinates
    x = np.frombuffer(x_sh)
    y = np.frombuffer(y_sh)
    x[:] = np.random.uniform(size=npoints)
    y[:] = np.random.uniform(size=npoints)
    # number of digits to print depends only on npoints
    precision = int(math.floor(math.log10(npoints)) / 2 - 1 + 0.5)
    cases = [(pi_loop, x, y),
             (pi_vectorized, x, y),
             (pi_parallel, x_sh, y_sh)]
    for estimator, a, b in cases:
        pi = estimator(a, b, npoints)
        print("%.*f %.1e" % (precision + 1, pi, abs(pi - math.pi)))


if __name__=="__main__":
    main()
Time performance for npoints = 10_000_000:
pi_loop pi_vectorized pi_parallel
32.6 0.159 0.069 # seconds
It shows that the main performance benefit is from converting the python loop to its vectorized numpy analog.

Develop Reference

Python is a programming language that lets you work quickly and integrate systems more effectively.

Vectorize a python loop over a numpy array - python

Related

Fast way to calculate conditional function

Python: rewrite a looping numpy math function to run on GPU

performance loss after vectorization in numpy

Faster way to do this using Python, Numpy?

paralellize loop over iter

Categories

Resources