I need to compute the Fourier transform of a 256-element float64 signal. The requirement is that these FFTs are invoked from inside a cuda.jit-compiled section and complete within 25 usec. Unfortunately, cuda.jit-compiled functions cannot call external libraries, so I wrote my own. Unfortunately, my single-core code is still far too slow (~250 usec on a Quadro P4000). Is there a better way?
I created a single-core FFT function that gives correct results, but it is about 10x too slow. I don't understand how to make good use of multiple cores.
---fft.py
from numba import cuda, boolean, void, int32, float32, float64, complex128
import math, sys, cmath

def _transform_radix2(vector, inverse, out):
    n = len(vector)
    levels = int32(math.log(float32(n)) / math.log(float32(2)))
    assert 2**levels == n  # error: Length is not a power of 2

    # uncomment either the Numba CUDA or the NumPy memory allocation
    # (intelligent conditional compilation??)
    exptable = cuda.local.array(1024, dtype=complex128)
    # exptable = np.zeros(1024, np.complex128)

    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    # twiddle factors
    coef = complex128((2j if inverse else -2j) * math.pi / n)
    for i in range(n // 2):
        exptable[i] = cmath.exp(i * coef)

    # bit-reversed addressing permutation
    for i in range(n):
        x = i
        y = 0
        for j in range(levels):
            y = (y << 1) | (x & 1)
            x >>= 1
        out[i] = vector[y]

    # iterative radix-2 Cooley-Tukey butterflies
    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        for i in range(0, n, size):
            k = 0
            for j in range(i, i + halfsize):
                temp = out[j + halfsize] * exptable[k]
                out[j + halfsize] = out[j] - temp
                out[j] += temp
                k += tablestep
        size *= 2

    scale = float64(n if inverse else 1)
    for i in range(n):
        out[i] = out[i] / scale  # the inverse requires a scaling

# now create the Numba CUDA device-function version to be called from a GPU kernel
gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)
---test.py
from numba import cuda, void, float64, complex128, boolean
import cupy as cp
import numpy as np
import timeit
import fft

@cuda.jit(void(float64[:], boolean, complex128[:]))
def fftbench(y, inverse, FT):
    Y = cuda.local.array(256, dtype=complex128)
    for i in range(len(y)):
        Y[i] = complex128(y[i])
    fft.gtransform_radix2(Y, False, FT)
str='\nbest [%2d/%2d] iterations, min:[%9.3f], max:[%9.3f], mean:[%9.3f], std:[%9.3f] usec'
a=[127.734375 ,130.87890625 ,132.1953125 ,129.62109375 ,118.6015625
,110.2890625 ,106.55078125 ,104.8203125 ,106.1875 ,109.328125
,113.5 ,118.6640625 ,125.71875 ,127.625 ,120.890625
,114.04296875 ,112.0078125 ,112.71484375 ,110.18359375 ,104.8828125
,104.47265625 ,106.65625 ,109.53515625 ,110.73828125 ,111.2421875
,112.28125 ,112.38671875 ,112.7734375 ,112.7421875 ,113.1328125
,113.24609375 ,113.15625 ,113.66015625 ,114.19921875 ,114.5
,114.5546875 ,115.09765625 ,115.2890625 ,115.7265625 ,115.41796875
,115.73828125 ,116. ,116.55078125 ,116.5625 ,116.33984375
,116.63671875 ,117.015625 ,117.25 ,117.41015625 ,117.6640625
,117.859375 ,117.91015625 ,118.38671875 ,118.51171875 ,118.69921875
,118.80859375 ,118.67578125 ,118.78125 ,118.49609375 ,119.0078125
,119.09375 ,119.15234375 ,119.33984375 ,119.31640625 ,119.6640625
,119.890625 ,119.80078125 ,119.69140625 ,119.65625 ,119.83984375
,119.9609375 ,120.15625 ,120.2734375 ,120.47265625 ,120.671875
,120.796875 ,120.4609375 ,121.1171875 ,121.35546875 ,120.94921875
,120.984375 ,121.35546875 ,120.87109375 ,120.8359375 ,121.2265625
,121.2109375 ,120.859375 ,121.17578125 ,121.60546875 ,121.84375
,121.5859375 ,121.6796875 ,121.671875 ,121.78125 ,121.796875
,121.8828125 ,121.9921875 ,121.8984375 ,122.1640625 ,121.9375
,122. ,122.3515625 ,122.359375 ,122.1875 ,122.01171875
,121.91015625 ,122.11328125 ,122.1171875 ,122.6484375 ,122.81640625
,122.33984375 ,122.265625 ,122.78125 ,122.44921875 ,122.34765625
,122.59765625 ,122.63671875 ,122.6796875 ,122.6171875 ,122.34375
,122.359375 ,122.7109375 ,122.83984375 ,122.546875 ,122.25390625
,122.06640625 ,122.578125 ,122.7109375 ,122.83203125 ,122.5390625
,122.2421875 ,122.06640625 ,122.265625 ,122.13671875 ,121.8046875
,121.87890625 ,121.88671875 ,122.2265625 ,121.63671875 ,121.14453125
,120.84375 ,120.390625 ,119.875 ,119.34765625 ,119.0390625
,118.4609375 ,117.828125 ,117.1953125 ,116.9921875 ,116.046875
,115.16015625 ,114.359375 ,113.1875 ,110.390625 ,108.41796875
,111.90234375 ,117.296875 ,127.0234375 ,147.58984375 ,158.625
,129.8515625 ,120.96484375 ,124.90234375 ,130.17578125 ,136.47265625
,143.9296875 ,150.24609375 ,141. ,117.71484375 ,109.80859375
,115.24609375 ,118.44140625 ,120.640625 ,120.9921875 ,111.828125
,101.6953125 ,111.21484375 ,114.91015625 ,115.2265625 ,118.21875
,125.3359375 ,139.44140625 ,139.76953125 ,135.84765625 ,137.3671875
,141.67578125 ,139.53125 ,136.44921875 ,135.08203125 ,135.7890625
,137.58203125 ,138.7265625 ,154.33203125 ,172.01171875 ,152.24609375
,129.8046875 ,125.59375 ,125.234375 ,127.32421875 ,132.8984375
,147.98828125 ,152.328125 ,153.7734375 ,155.09765625 ,156.66796875
,159.0546875 ,151.83203125 ,138.91796875 ,138.0546875 ,140.671875
,143.48046875 ,143.99609375 ,146.875 ,146.7578125 ,141.15234375
,141.5 ,140.76953125 ,140.8828125 ,145.5625 ,150.78125
,148.89453125 ,150.02734375 ,150.70703125 ,152.24609375 ,148.47265625
,131.95703125 ,125.40625 ,123.265625 ,123.57421875 ,129.859375
,135.6484375 ,144.51171875 ,155.05078125 ,158.4453125 ,140.8125
,100.08984375 ,104.29296875 ,128.55078125 ,139.9921875 ,143.38671875
,143.69921875 ,137.734375 ,124.48046875 ,116.73828125 ,114.84765625
,113.85546875 ,117.45703125 ,122.859375 ,125.8515625 ,133.22265625
,139.484375 ,135.75 ,122.69921875 ,115.7734375 ,116.9375
,127.57421875]
y1 = cp.zeros(len(a), cp.complex128)
FT1 = cp.zeros(len(a), cp.complex128)
for i in range(len(a)):
    y1[i] = a[i]  # convert to complex to feed the FFT

r = 1000
series = sorted(timeit.repeat("fftbench(y1, False, FT1)", number=1, repeat=r, globals=globals()))
series = series[0:r-5]
print(str % (len(series), r, 1e6*np.min(series), 1e6*np.max(series), 1e6*np.mean(series), 1e6*np.std(series)))
What I am looking for is a faster implementation, i.e. t << 25 usec.
The drawback of your algorithm is that, even on the GPU, it runs on a single core.
In order to understand how to design algorithms for Nvidia GPGPUs, I recommend looking at the CUDA C Programming Guide and at the Numba documentation to apply the code in Python.
Moreover, to understand what is wrong with your code, I recommend using the Nvidia profiler.
The following parts of the answer explain how to apply the basics to your example.
Run multiple threads
To improve performance, you first need to launch multiple threads that can run in parallel. CUDA handles threads as follows:
Threads are grouped into blocks of n threads (n <= 1024).
Each thread within the same block can be synchronized with the others and has access to a fast common memory space called "shared memory".
You can run multiple blocks in parallel in a "grid", but you lose the synchronization mechanism.
The syntax to launch multiple threads is the following:
fftbench[griddim, blockdim](y1, False, FT1)
To simplify, I will use only one block of 256 threads:
fftbench[1, 256](y1, False, FT1)
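If you have not used this launch syntax before, here is a tiny, self-contained kernel (my own illustration, not taken from the question; the name identify and the sizes are arbitrary) showing how each thread locates itself inside the grid:
import numpy as np
from numba import cuda

@cuda.jit
def identify(out):
    # global index = block offset + thread index within the block
    i = cuda.blockIdx.x * cuda.blockDim.x + cuda.threadIdx.x
    if i < out.size:
        out[i] = i

out = np.zeros(1024, dtype=np.int32)
identify[4, 256](out)  # a grid of 4 blocks, each with 256 threads
After the call, every element satisfies out[i] == i, which shows that all 1024 threads ran, each handling exactly one cell.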
Memory
To improve GPU performance it is important to look at where the data will be stored. There are three main spaces:
global memory: it is the "RAM" of your GPU; it is slow and has a high latency, and it is where all your arrays are placed when you send them to the GPU.
shared memory: it is a small, fast-access memory; all the threads of a block have access to the same shared memory.
local memory: physically it is the same as global memory, but each thread accesses its own local memory.
Typically, if you use the same data multiple times, you should try to store it in shared memory to avoid the latency of global memory.
In your code, you can store exptable in shared memory:
exptable = cuda.shared.array(1024, dtype=complex128)
and, if n is not too big, you may want to use a shared working array instead of writing directly to out:
working = cuda.shared.array(256, dtype=complex128)
Assign tasks to each thread
Of course, if you do not change your function, all the threads will do the same job and it will just slow down your program. In this example we will assign each thread to one cell of the array. To do so, we have to get the unique id of each thread within a block:
idx = cuda.threadIdx.x
Now we will be able to speed up the for loops; let's handle them one by one:
exptable = cuda.shared.array(1024, dtype=complex128)
...
for i in range(n // 2):
    exptable[i] = cmath.exp(i * coef)
Here is the goal: we want the first n/2 threads to fill this array; afterwards, all the threads will be able to use it.
So in this case, just replace the for loop with a condition on the thread idx:
if idx < n // 2:
    exptable[idx] = cmath.exp(idx * coef)
The last two loops are easier: each thread deals with one cell of the array:
for i in range(n):
    x = i
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    out[i] = vector[y]
becomes
x = idx
y = 0
for j in range(levels):
    y = (y << 1) | (x & 1)
    x >>= 1
working[idx] = vector[y]
and
for i in range(n):
    out[i] = out[i] / scale  # the inverse requires a scaling
becomes
out[idx]=working[idx]/scale # the inverse requires a scaling
I used the shared array working, but you can replace it with out if you want to use global memory.
Now let's look at the while loop. We said that we want each thread to deal with only one cell of the array, so we can try to parallelize the two for loops inside:
...
for i in range(0, n, size):
    k = 0
    for j in range(i, i + halfsize):
        temp = out[j + halfsize] * exptable[k]
        out[j + halfsize] = out[j] - temp
        out[j] += temp
        k += tablestep
...
To simplify, I will only use half of the threads: we take the first 128 threads and determine j as follows:
...
if idx < 128:
    j = (idx % halfsize) + size * (idx // halfsize)
...
k is:
k = tablestep*(idx%halfsize)
so we get the loop:
size = 2
while size <= n:
    halfsize = size // 2
    tablestep = n // size
    if idx < 128:
        j = (idx % halfsize) + size * (idx // halfsize)
        k = tablestep * (idx % halfsize)
        temp = working[j + halfsize] * exptable[k]
        working[j + halfsize] = working[j] - temp
        working[j] += temp
    size *= 2
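If you want to check that this index mapping visits exactly the same (j, k) pairs as the original nested loops, a quick host-side sanity check (my own sketch, not part of the answer; n and size are arbitrary example values) could look like this:
# compare the (j, k) pairs of the original nested loops with the per-thread indexing
n, size = 8, 4
halfsize, tablestep = size // 2, n // size

original = []
for i in range(0, n, size):
    k = 0
    for j in range(i, i + halfsize):
        original.append((j, k))
        k += tablestep

threaded = []
for idx in range(n // 2):
    j = (idx % halfsize) + size * (idx // halfsize)
    k = tablestep * (idx % halfsize)
    threaded.append((j, k))

assert sorted(original) == sorted(threaded)
Both versions produce the same set of butterfly indices, so each of the n/2 active threads handles exactly one butterfly per stage.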
Synchronization
Last but not least, we need to synchronize all these threads; the program will not work if we do not. On the GPU, threads may not run at the same time, so you can get issues when data produced by one thread is used by another one, for example:
exptable[0] is used by thread_2 before thread_0 has stored its value;
working[j + halfsize] is modified by another thread before you store it in temp.
To prevent this we can use the function:
cuda.syncthreads()
All the threads in the same block will reach this line before any of them executes the rest of the code.
In this example, you need to synchronize at two points: after the initialization of working and after each iteration of the while loop.
Then your code looks like this:
def _transform_radix2(vector, inverse, out):
    n = len(vector)
    levels = int32(math.log(float32(n)) / math.log(float32(2)))
    assert 2**levels == n  # error: Length is not a power of 2

    exptable = cuda.shared.array(1024, dtype=complex128)
    working = cuda.shared.array(256, dtype=complex128)
    assert (n // 2) <= len(exptable)  # error: FFT length > MAXFFTSIZE

    idx = cuda.threadIdx.x  # unique id of this thread within the block

    coef = complex128((2j if inverse else -2j) * math.pi / n)
    if idx < n // 2:
        exptable[idx] = cmath.exp(idx * coef)

    # bit-reversed addressing: each thread places one input element
    x = idx
    y = 0
    for j in range(levels):
        y = (y << 1) | (x & 1)
        x >>= 1
    working[idx] = vector[y]
    cuda.syncthreads()

    size = 2
    while size <= n:
        halfsize = size // 2
        tablestep = n // size
        if idx < 128:
            j = (idx % halfsize) + size * (idx // halfsize)
            k = tablestep * (idx % halfsize)
            temp = working[j + halfsize] * exptable[k]
            working[j + halfsize] = working[j] - temp
            working[j] += temp
        size *= 2
        cuda.syncthreads()

    scale = float64(n if inverse else 1)
    out[idx] = working[idx] / scale  # the inverse requires a scaling
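For completeness, here is a minimal host-side sketch of how this could be wrapped and launched (my own addition, not part of the original code; it assumes n = 256, one block of 256 threads, and that fft.py wraps the modified function exactly as before with gtransform_radix2 = cuda.jit(device=True)(_transform_radix2)):
import numpy as np
from numba import cuda, void, float64, boolean, complex128
import fft  # the module containing the modified _transform_radix2

@cuda.jit(void(float64[:], boolean, complex128[:]))
def fftbench(y, inverse, FT):
    # all 256 threads of the block cooperate on the same 256-point transform
    fft.gtransform_radix2(y, inverse, FT)

y1 = np.random.rand(256)                  # 256-sample float64 test signal
FT1 = np.zeros(256, dtype=np.complex128)
fftbench[1, 256](y1, False, FT1)          # one block of 256 threads
np.testing.assert_allclose(FT1, np.fft.fft(y1), rtol=1e-9, atol=1e-9)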
I feel that your question is a good way to introduce some basics about GPGPU computing, so I have tried to answer it in a didactic way. The final code is far from perfect and can be optimized a lot; I highly recommend you read the Programming Guide if you want to learn more about GPU optimizations.
I am optimizing a bottleneck section of my code: iterating a function a' = f(a), where a and a' are N-by-1 vectors, until max(abs(a' - a)) is sufficiently small.
I have put a Numba wrapper on f(a) and got a nice speedup over the most optimized pure NumPy version I was able to produce (it cut the runtime by about 50%).
I tried writing a C-compatible version of numpy.max(numpy.abs(aprime - a)), but it turns out this is slower! I actually lose back ALL of the gains I got from Numba-fying the first portion of the iteration!
Is there likely to be a way for Numba or Cython to improve upon numpy.max(numpy.abs(aprime - a))? I reproduce my code below for reference, where a is P0 and a' is Pprime:
EDIT: For me, it seems to be important to flatten() the inputs to maxabs(). When I do this, the performance is no worse than NumPy's. Then, when I do a "dry run" of the function outside the timing brackets as JoshAdel suggested, the loop with maxabs does slightly better than the loop with numpy.max(numpy.abs()).
from numba import jit
import numpy as np
### Preliminaries, to make the working example fully functional
n = 1200
Gammer = np.exp(-np.random.rand(n,n))
alpher = np.ones((n,1))
xxer = 10000*np.random.rand(n,1)
chii = 6.5
varkappa = 6.5
phi3 = 1.5
A = .5
sig = .2
mmer = np.dot(Gammer,xxer**phi3)
totalprod = A*alpher + (1-A)*mmer
Gammerchii = Gammer**chii
Gammerrats = Gammerchii[:,0].flatten()/Gammerchii[0,:].flatten()
Gammerrats[(Gammerchii[0,:].flatten() == 0) | (Gammerchii[:,0].flatten() == 0)] = 1.
P0 = (Gammerrats*(xxer[0]/totalprod[0])*(totalprod/xxer).flatten())**(1/(1+2*chii))
P0 *= n/np.sum(P0)
### End of preliminaries
### This is the function to produce a' = f(a)
@jit
def Piteration(P0, chii, sig, n, xxer, totalprod, Gammerrats, Gammerchii):
    Mac = np.zeros((n,))
    Pprime = np.zeros((n,))
    themacpow = 1 - (1/chii)*(sig/(1-sig))
    specialchiipow = 1/(1 + 2*chii)
    Psum = 0.
    for i in range(n):
        for j in range(n):
            Mac[j] += ((P0[i]/P0[j])**chii)*Gammerchii[i,j]*totalprod[j]
    for i in range(n):
        Pprime[i] = (Gammerrats[i]*(xxer[0]/totalprod[0])*(totalprod[i]/xxer[i])*((Mac[i]/Mac[0])**themacpow))**specialchiipow
        Psum += Pprime[i]
    Psum = n/Psum
    for i in range(n):
        Pprime[i] *= Psum
    return Pprime
### This is the function to find max(abs(aprime - a))
@jit
def maxabs(vec1, vec2, n):
    themax = 0.
    curdiff = 0.
    for i in range(n):
        curdiff = vec1[i] - vec2[i]
        if curdiff < 0:
            curdiff *= -1
        if curdiff > themax:
            themax = curdiff
    return themax
### This is the main loop
diff = 1000.
while diff > 1e-2:
    Pprime = Piteration(P0.flatten(), chii, sig, n, xxer.flatten(), totalprod.flatten(), Gammerrats.flatten(), Gammerchii)
    diff = maxabs(P0.flatten(), Pprime.flatten(), n)
    P0 = 1.*Pprime
When I time your maxabs function vs np.max(np.abs(vec1 - vec2)) for an array of shape (1200,), the numba version is ~2.6x faster using numba 0.32.0.
When you time the code, make sure you run your function once before timing it so that you don't include the time it takes to jit the code, which you only pay the first time. In general, using timeit and running multiple times takes care of this. I'm not sure how you did the timing, though, since I see almost no difference between using maxabs and the numpy call; most of the runtime seems to be in the call to Piteration.
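For example, a timing harness along these lines (a sketch with arbitrary array sizes, using a condensed version of your maxabs) performs the warm-up call before measuring, so compilation time is excluded:
import timeit
import numpy as np
from numba import jit

@jit(nopython=True)
def maxabs(vec1, vec2, n):
    themax = 0.
    for i in range(n):
        curdiff = abs(vec1[i] - vec2[i])
        if curdiff > themax:
            themax = curdiff
    return themax

v1 = np.random.rand(1200)
v2 = np.random.rand(1200)

maxabs(v1, v2, v1.size)  # warm-up call: compilation happens here, not inside the timing
t_numba = min(timeit.repeat(lambda: maxabs(v1, v2, v1.size), number=1000, repeat=5))
t_numpy = min(timeit.repeat(lambda: np.max(np.abs(v1 - v2)), number=1000, repeat=5))
# each t is the total time in seconds for 1000 calls, so *1e3 gives average usec per call
print("numba: %.1f us, numpy: %.1f us" % (1e3 * t_numba, 1e3 * t_numpy))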