Compute and fill an array in parallel - python

As part of a signal processing task, I am doing some computation per frequency step.
I have a frequencies list of length 513.
I have a 3D numpy array A of shape (81,81,513), where 513 is the number of frequency points. I then have an 81x81 matrix per frequency.
I want to apply some modification to each of those matrices, to end up with a modified version of A I'll name B here, which will also be of shape (81,81,513).
For that, I start by pre-allocating B with:
B = np.zeros_like(A)
I then loop over my frequencies and call a dothing function like:
for index, frequency in enumerate(frequencies):
    B[:,:,index] = dothing(A[:,:,index])
The problem is that dothing takes a lot of time, and running it sequentially over 513 frequency steps seems endless.
So I wanted to parallelize it. But even after reading a lot of docs and watching a lot of videos, I get lost in all the libraries and potential solutions.
Computations at all individual frequencies can be done independently. But in the end I need to assign everything back to B in the right order.
Any idea on how to do that?
Thanks in advance
Antoine

Here I would use a shared array using shared_memory, as there's no need to protect write access if no two loop iterations ever use the same memory address. I eliminated the second array to shorten the example (only construct a single shared array), and I re-ordered the array shape to better preserve memory-aligned access.
from multiprocessing import Pool
from multiprocessing.shared_memory import SharedMemory
import numpy as np
import numpy.typing as npt
from typing import Any
from time import sleep

def dothing(arr: np.ndarray, t_func: Any) -> np.ndarray:
    sleep(.05)  # simulate hard work
    return arr * 2

def dodothing(args: tuple[int, Any]):
    global arr
    index = args[0]
    t_func = args[1]
    arr[index] = dothing(arr[index], t_func)  # write result back to self to avoid need for 2 shared arrays

def init(shm: SharedMemory, shape: tuple[int, ...], dtype: npt.DTypeLike):
    global arr
    arr = np.ndarray(shape, dtype=dtype, buffer=shm.buf)

if __name__ == "__main__":
    _A = np.ones((513, 81, 81), np.float64)  # source data
    t_funcs = ["some transfer function"] * _A.shape[0]  # added example of passing some data + an index
    nbytes = _A.size * _A.itemsize
    dtype = _A.dtype
    shape = _A.shape
    shm = SharedMemory(create=True, size=nbytes)
    A = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    A[:] = _A[:]  # copy contents into shared A
    with Pool(initializer=init, initargs=(shm, shape, dtype)) as pool:
        pool.map(dodothing, enumerate(t_funcs))  # enumerate returns tuple[int, Any] each loop
    print(A.sum() / _A.sum())  # prove we multiplied all elements by 2
    shm.close()
    shm.unlink()
multiprocessing.Pool is a bit funny sometimes in what can be a valid argument to a target function, so I tend to share things like Lock, Queue, shared_memory etc. via the pool's initialization function, which accepts arguments just like Process does.
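The same initializer pattern works for the other primitives too. For example, a minimal sketch (the worker and names here are illustrative, not from the question) of handing a multiprocessing.Lock to every pool worker:

from multiprocessing import Pool, Lock

def init_lock(l):
    # stash the shared lock in a module-level global of each worker process
    global lock
    lock = l

def work(i):
    with lock:  # e.g. serialize access to a file or other shared resource
        print("task", i)
    return i

if __name__ == "__main__":
    with Pool(2, initializer=init_lock, initargs=(Lock(),)) as pool:
        pool.map(work, range(4))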

Related

Numba @guvectorise returns garbage values

The code below is a test for calculating distance between points in a periodic system.
import itertools
import time
import numpy as np
import numba
from numba import njit

@njit(cache=True)
def get_dr(i=np.array([]), j=np.array([]), cellsize=np.array([])):
    k = np.zeros(3, dtype=np.float64)
    for idx, _ in enumerate(cellsize):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.round((j[idx]-i[idx])/cellsize[idx])
    return np.linalg.norm(k)

@numba.guvectorize(["void(float64[:],float64[:],float64[:],float64)"],
                   "(m),(m),(m)->()", nopython=True, cache=True)
def get_dr_vec(i, j, cellsize, dr):
    dr = 0.0
    k = np.zeros(3, dtype=np.float64)
    for idx, _ in enumerate(cellsize):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.round((j[idx]-i[idx])/cellsize[idx])
    dr = np.sqrt(np.square(k[0])+np.square(k[1])+np.square(k[2]))

N, dim = 50, 3  # 50 particles in 3D
vec = np.random.random((N, dim))
cellsize = np.array([26.4, 26.4, 70.0])
rList = []; rList2 = []
start = time.perf_counter()
for (pI, pJ) in itertools.product(vec, vec):
    rList.append(get_dr(pI, pJ, cellsize))
end = time.perf_counter()
print("Time {:.3g}s".format(end-start))

newvec1 = []; newvec2 = []
start = time.perf_counter()
for (pI, pJ) in itertools.product(vec, vec):
    newvec1.append(pI)
    newvec2.append(pJ)
cellsizeVec = np.full(shape=np.shape(newvec1), fill_value=cellsize, dtype=float)
rList2 = get_dr_vec(np.array(newvec1), np.array(newvec2), cellsizeVec)
end = time.perf_counter()
print("Time {:.3g}s".format(end-start))
print(rList2)
exit()
Compared to get_dr() which shows the correct result, get_dr_vec() shows garbage and nan values. The function get_dr_vec() is calculating the correct value for dr, but it returns garbage values with correct dimensions. Can someone suggest any ideas on how to resolve this issue?
You made a small mistake in the guvectorize function call. Guvectorize does not want you to redefine the output variable; the output array/scalar must be filled in instead. The code below should work:
@numba.guvectorize(["void(float64[:],float64[:],float64[:],float64[:])"],
                   "(m),(m),(m)->()", nopython=True, cache=True)
def get_dr_vec(i, j, cellsize, dr):
    k = np.zeros(3, dtype=np.float64)
    for idx, _ in enumerate(cellsize):
        k[idx] = (j[idx]-i[idx]) - cellsize[idx]*np.round((j[idx]-i[idx])/cellsize[idx])
    # The mistake was on this line. You had "dr =", but it should be "dr[0] ="
    dr[0] = np.sqrt(np.square(k[0])+np.square(k[1])+np.square(k[2]))
The reason that dr = does not work is that guvectorize already allocates the dr array before your function is called. dr = rebinds the name to a brand-new array in a different place in memory, so when numba looks at the original place in memory, where it expects to find the results, it instead finds nothing. dr[0] = does work because it fills in the values at the original place in memory, where numba expects them to be.
If it is still not 100% clear, I recommend that you look through the numba documentation on this topic.
Those "garbage values" you were seeing were the contents of the output array that was never filled in, similar to what you would see if you called print(np.empty(10)).

Adding arrays to global array using multiprocessing

I've a global NumPy array ys_final and have defined a function that generates an array ys. The ys array will be generated based on an input parameter, and I want to add these ys arrays to the global array, i.e. ys_final = ys_final + ys.
The order of addition doesn't matter so I want to use Pool.apply_async() from the multiprocessing library but I can't write to the global array. The code for reference is:
import multiprocessing as mp

ys_final = np.zeros(len)

def ys_genrator(i):
    # code to generate ys array
    return ys

pool = mp.Pool(mp.cpu_count())
for i in range(3954):
    ys_final = ys_final + pool.apply_async(ys_genrator, args=(i)).get()
pool.close()
pool.join()
The above block of code keeps on running forever and nothing happens. I've tried mp.Process also and still I face the same problem. There I defined a target function that directly adds to the global array, but it is also not working as the block keeps running forever. Reference:
def func(i):
    # code to generate ys
    global ys_final
    ys_final = ys_final + ys

for i in range(3954):
    p = mp.Process(target=func, args=(i,))
    p.start()
    p.join()
Any suggestions will be really helpful.
EDIT:
My ys_genrator is a function for linear interpolation. Based on the parameter i, which is an index for rows in a 2D image, the function creates an array of interpolated amplitudes that will be superimposed with all the interpolated amplitudes from the image, so ys needs to be added to ys_final.
The variable len is the length of the interpolated array, which is the same for all rows.
For reference, a simpler version of ys_genrator(i) is as follows:
def ys_genrator(i):
    ys = np.ones(10) * i
    return ys
A few points:
pool.apply_async(ys_genrator, args=(i)) needs to be pool.apply_async(ys_genrator, args=(i,)). Note the comma after the i.
pool.apply_async(ys_genrator, args=(i,)).get() is exactly equivalent to pool.apply(ys_genrator, args=(i,)). That is, you will block because of your immediate call to get and you will have absolutely no parallelism. You would need to do all your calls to pool.apply_async, save the returned AsyncResult instances, and only then call get on these instances (a short sketch of this pattern appears after the example output below).
If you are running under Windows, you will have a problem. The code that creates new processes must be within a block governed by if __name__ == '__main__':
If you are running under something like Jupyter Notebook or iPython you will have a problem. The worker function, ys_genrator, would need to be in an external file and imported.
Using apply_async for submitting a lot of tasks is inefficient. You are better off using imap or imap_unordered, where the tasks get submitted in "chunks" and you can process the results one by one as they become available. But you must choose a "suitable" chunksize argument.
Any code you have at the global level, such as ys_final = np.zeros(len), will be executed by every sub-process if you are running under Windows, and this can be wasteful if the subprocesses do not need to "see" this variable. If they do need to see this variable, be aware that each process in the pool will be working with its own copy of the variable, so it had better be a read-only usage. Even then, it can be very wasteful of storage if the variable is large. There are ways of sharing such a variable across the processes, but it is not perfectly clear whether you need to (you haven't even defined the variable len). So it is difficult to give you improved code. However, it appears that your worker function does not need to "see" ys_final, so I will take a shot at an improved solution.
But be aware that if your function ys_genrator is very trivial, nothing will be gained by using multiprocessing because there is overhead in both creating the processing pool and in passing arguments from one process to another. Also, if ys_genrator is using numpy, this can also be a source of problems since numpy uses multiprocessing for some of its own functions and you are better off not mixing numpy with your own multiprocessing.
import multiprocessing as mp
import numpy as np

SIZE = 3

def ys_genrator(i):
    # code to generate ys array
    # for this dummy example all SIZE entries will end up with the same result:
    ys = [i] * SIZE  # for example: [1, 1, 1]
    return ys

def compute_chunksize(poolsize, iterable_size):
    chunksize, remainder = divmod(iterable_size, 4 * poolsize)
    if remainder:
        chunksize += 1
    return chunksize

if __name__ == '__main__':
    ys_final = np.zeros(SIZE)
    n_iterations = 3954
    poolsize = min(mp.cpu_count(), n_iterations)
    chunksize = compute_chunksize(poolsize, n_iterations)
    print('poolsize =', poolsize, 'chunksize =', chunksize)
    pool = mp.Pool(poolsize)
    for result in pool.imap_unordered(ys_genrator, range(n_iterations), chunksize):
        ys_final += result
    print(ys_final)
Prints:
poolsize = 8 chunksize = 124
[7815081. 7815081. 7815081.]
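For reference, the apply_async pattern from the second bullet above (submit everything first, then get) would look roughly like this, reusing ys_genrator and SIZE from the example; it is a sketch only, and usually slower than imap_unordered for many small tasks:

if __name__ == '__main__':
    ys_final = np.zeros(SIZE)
    n_iterations = 3954
    pool = mp.Pool()
    # submit all tasks first and keep the AsyncResult instances ...
    async_results = [pool.apply_async(ys_genrator, args=(i,)) for i in range(n_iterations)]
    # ... and only then block on the results
    for r in async_results:
        ys_final += r.get()
    pool.close()
    pool.join()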
Update
You can also just use:
for result in pool.map(ys_genrator, range(n_iterations)):
    ys_final += result
The issue is that when you use method map, the method wants to compute an efficient chunksize argument based on the size of the iterable argument (see my compute_chunksize function above, which is essentially what pool.map will use). But to do this, it will first have to convert the iterable to a list to get its size. If n_iterations is very large, this is not very efficient, although it's probably not a major issue for a size of 3954. Still, you would be better off using my compute_chunksize function in this case, since you know the size of the iterable, and then pass the chunksize argument explicitly to map, as I have done in the code using imap_unordered.
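That is, roughly:

chunksize = compute_chunksize(poolsize, n_iterations)
for result in pool.map(ys_genrator, range(n_iterations), chunksize=chunksize):
    ys_final += result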

Avoid simultaneously reading multiple files for a dask array

From a library, I get a function that reads a file and returns a numpy array.
I want to build a Dask array with multiple blocks from multiple files.
Each block is the result of calling the function on a file.
When I ask Dask to compute, will Dask ask the functions to read multiple files from the hard disk at the same time?
If that is the case, how to avoid that? My computer doesn't have a parallel file system.
Example:
import numpy as np
import dask.array as da
import dask
# Make test data
n = 2
m = 3
x = np.arange(n * m, dtype=np.int).reshape(n, m)
np.save('0.npy', x)
np.save('1.npy', x)
# np.load is a function that reads a file
# and returns a numpy array.
# Build delayed
y = [dask.delayed(np.load)('%d.npy' % i)
     for i in range(2)]
# Build individual Dask arrays.
# I can get the shape of each numpy array without
# reading the whole file.
z = [da.from_delayed(a, (n, m), np.int) for a in y]
# Combine the dask arrays
w = da.vstack(z)
print(w.compute())
You could use a distributed Lock primitive, so that your loader function does acquire-read-release.
import numpy as np
import dask
import distributed

read_lock = distributed.Lock('numpy-read')

@dask.delayed
def load_numpy(lock, fn):
    lock.acquire()
    out = np.load(fn)
    lock.release()
    return out

y = [load_numpy(read_lock, '%d.npy' % i) for i in range(2)]
Also, da.from_array accepts a lock, so you could maybe create the individual arrays from np.load directly, supplying the lock.
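A rough sketch of that idea, assuming the files can be memory-mapped so nothing is read until Dask pulls a chunk (n, m and the file names are from the question's example):

import numpy as np
import dask.array as da
import distributed

read_lock = distributed.Lock('numpy-read')

# memory-map the files; the actual reads happen chunk by chunk under the lock
z = [da.from_array(np.load('%d.npy' % i, mmap_mode='r'),
                   chunks=(n, m), lock=read_lock)
     for i in range(2)]
w = da.vstack(z)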
Alternatively, you could assign a single unit of resource to the worker (with multiple threads), and then compute (or persist) with a requirement of one unit per file-read task, as in the example in the linked doc.
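With the resources approach, a rough sketch (the exact syntax is in the linked docs; the scheduler address and the "IO" resource name are placeholders): start the worker with dask-worker scheduler-address:8786 --resources "IO=1", then require one unit per task when computing:

from distributed import Client

client = Client('scheduler-address:8786')  # worker started with --resources "IO=1"
# every file-read task needs one unit of "IO", so reads never overlap
result = w.compute(resources={'IO': 1})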
Response to comment: to_hdf wasn't specified in the question, so I am not sure why it is being questioned now; however, you can use da.store(compute=False) with an h5py.File, and then specify the resource to use when calling compute. Note that this does not materialise the data into memory.
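A sketch of that, reusing w from the question (the dataset name and the "IO" resource requirement are illustrative):

import h5py
import dask.array as da

with h5py.File('out.h5', 'w') as f:
    dset = f.create_dataset('data', shape=w.shape, dtype=w.dtype)
    # build the store graph lazily; nothing is pulled into memory here
    store_task = da.store(w, dset, compute=False)
    # run it, again limiting file reads to one at a time via the resource
    store_task.compute(resources={'IO': 1})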

Parallelize loop over numpy rows

I need to apply the same function onto every row in a numpy array and store the result again in a numpy array.
# states will contain results of function applied to a row in array
states = np.empty_like(array)
for i, ar in enumerate(array):
    states[i] = function(ar, *args)
# do some other stuff on states
function does some non-trivial filtering of my data and returns an array when the conditions are True and when they are False. function can either be pure python or cython compiled. The filtering operations on the rows are complicated and can depend on previous values in the row, which means I can't operate on the whole array in an element-by-element fashion.
Is there a way to do something like this in dask for example?
Dask solution
You could do this with dask.array by chunking the array by row, calling map_blocks, then computing the result:
import dask.array as da

ar = ...
x = da.from_array(ar, chunks=(1, ar.shape[1]))
x = x.map_blocks(function, *args)
states = x.compute()
By default this will use threads, you can use processes in the following way
from dask.multiprocessing import get
states = x.compute(get=get)
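In newer Dask versions the get= keyword has been removed; the rough equivalent is the scheduler argument:

states = x.compute(scheduler='processes')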
Pool solution
However, dask is probably overkill for embarrassingly parallel computations like this; you could get by with a threadpool
from multiprocessing.pool import ThreadPool
pool = ThreadPool()

ar = ...
states = np.empty_like(ar)

def f(i):
    states[i] = function(ar[i], *args)

pool.map(f, range(len(ar)))
And you could switch to processes with the following change
from multiprocessing import Pool
pool = Pool()
Turn your function into a universal function: http://docs.scipy.org/doc/numpy/reference/ufuncs.html.
Then: states = function(array, *args).

numpy -- Transform non-contiguous data to contiguous data in place

Consider the following code:
import numpy as np
a = np.zeros(50)
a[10:20:2] = 1
b = c = a[10:40:4]
print b.flags # You'll see that b and c are not C_CONTIGUOUS or F_CONTIGUOUS
My question:
Is there a way (with only a reference to b) to make both b and c contiguous?
It is completely fine if np.may_share_memory(b,a) returns False after this operation.
Things which are close, but don't quite work out are: np.ascontiguousarray/np.asfortranarray as they will return a new array.
My use case is that I have very large 3D fields stored in a subclass of a numpy.ndarray. In order to save memory, I would like to chop those fields down to the portion of the domain that I am actually interested in processing:
a = a[ix1:ix2,iy1:iy2,iz1:iz2]
Slicing for the subclass is somewhat more restricted than slicing of ndarray objects, but this should work and it will "do the right thing" -- the various custom meta-data attached on the subclass will be transformed/preserved as expected. Unfortunately, since this returns a view, numpy won't free the big array afterward so I don't actually save any memory here.
To be completely clear, I'm looking to accomplish 2 things:
preserve the metadata on my class instance. slicing will work, but I'm not sure about other forms of copying.
make it so that the original array is free to be garbage collected
According to Alex Martelli:
"The only really reliable way to ensure that a large but temporary use of memory DOES return all resources to the system when it's done, is to have that use happen in a subprocess, which does the memory-hungry work then terminates."
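A minimal sketch of that subprocess approach (the loader function and file name here are hypothetical): do the memory-hungry load and slice in a worker process, and send only the small contiguous copy back to the parent.

import numpy as np
from multiprocessing import Pool

def load_and_crop(args):
    # hypothetical loader: read the big field in the child process,
    # slice out the region of interest and return only that copy
    path, (ix1, ix2, iy1, iy2, iz1, iz2) = args
    big = np.load(path)
    return np.ascontiguousarray(big[ix1:ix2, iy1:iy2, iz1:iz2])

if __name__ == '__main__':
    with Pool(1) as pool:
        # the child terminates when the pool closes, so the big array's
        # memory is returned to the OS; only the small copy survives here
        small = pool.apply(load_and_crop, (('field.npy', (0, 10, 0, 10, 0, 10)),))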
However, the following appears to free at least some of the memory:
Warning: my way of measuring free memory is Linux-specific:
import time
import numpy as np

def free_memory():
    """
    Return free memory available, including buffer and cached memory
    """
    total = 0
    with open('/proc/meminfo', 'r') as f:
        for line in f:
            line = line.strip()
            if any(line.startswith(field) for field in ('MemFree', 'Buffers', 'Cached')):
                field, amount, unit = line.split()
                amount = int(amount)
                if unit != 'kB':
                    raise ValueError(
                        'Unknown unit {u!r} in /proc/meminfo'.format(u=unit))
                total += amount
    return total

def gen_change_in_memory():
    """
    https://stackoverflow.com/a/14446011/190597 (unutbu)
    """
    f = free_memory()
    diff = 0
    while True:
        yield diff
        f2 = free_memory()
        diff = f - f2
        f = f2

change_in_memory = gen_change_in_memory().next
Before allocating the large array:
print(change_in_memory())
# 0
a = np.zeros(500000)
a[10:20:2] = 1
b = c = a[10:40:4]
After allocating the large array:
print(change_in_memory())
# 3844 # KiB
a[:len(b)] = b
b = a[:len(b)]
a.resize(len(b), refcheck=0)
time.sleep(1)
Free memory increases after resizing:
print(change_in_memory())
# -3708 # KiB
You can do this in cython:
In [1]: %load_ext cythonmagic

In [2]: %%cython
cimport numpy as np
np.import_array()

def to_c_contiguous(np.ndarray a):
    cdef np.ndarray new
    cdef int dim, i
    new = a.copy()
    dim = np.PyArray_NDIM(new)
    for i in range(dim):
        np.PyArray_STRIDES(a)[i] = np.PyArray_STRIDES(new)[i]
    a.data = new.data
    np.PyArray_UpdateFlags(a, np.NPY_C_CONTIGUOUS)
    np.set_array_base(a, new)

In [8]:
import sys
import numpy as np
a = np.random.rand(10, 10, 10)
b = c = a[::2, 1::3, 2::4]
d = a[::2, 1::3, 2::4]
print sys.getrefcount(a)
to_c_contiguous(b)
print sys.getrefcount(a)
print np.all(b==d)
The output is:
4
3
True
to_c_contiguous(a) will create a C-contiguous copy of a and make it the base of a.
After the call to to_c_contiguous(b), the refcount of a is decreased, and when the refcount of a becomes 0, it will be freed.
I would claim the correct way to accomplish the 2 things you listed is by np.copying the slices you create.
Of course, in order for that to work correctly, you would need to define an appropriate __array_finalize__. You weren't very clear about why you decided to avoid it in the first place, but my feeling is that you should define it. (how did you solve the bx**2 problem without using __array_finalize__?)
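For completeness, a minimal sketch of what such a subclass might look like (the class and attribute names are illustrative), so that slicing and copying both carry the metadata along:

import numpy as np

class Field(np.ndarray):
    """Minimal ndarray subclass that carries a 'meta' attribute."""

    def __new__(cls, input_array, meta=None):
        obj = np.asarray(input_array).view(cls)
        obj.meta = meta
        return obj

    def __array_finalize__(self, obj):
        # called for views, slices and copies; propagate the metadata
        if obj is None:
            return
        self.meta = getattr(obj, 'meta', None)

a = Field(np.zeros((40, 40, 40)), meta={'dx': 0.1})
b = a[10:20, 10:20, 10:20].copy()  # independent data, metadata preserved
del a                              # the original array can now be garbage collected
assert isinstance(b, Field) and b.meta == {'dx': 0.1}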
