I am working with some code in Python and MPI4PY that is throwing a strange error. When I try to run the code below it throws the following:
ERROR; return code from pthread_create() is 11
Error detail: Resource temporarily unavailable
sh: fork: retry: Resource temporarily unavailable
/home/sfortney/anaconda/lib/python2.7/site-packages/numexpr/cpuinfo.py:40: UserWarning: [Errno 11] Resource temporarily unavailable
warnings.warn(str(e), UserWarning, stacklevel=stacklevel)
I scaled up this code from a simpler, working MPI4PY script, which I have also posted below. From my research on this error it seems that I am creating too many threads. This seems odd to me, as I am not calling any threading, just multiple processors (my basic understanding is that threads are an intra-core phenomenon that wouldn't come into play if I were just calling multiple cores and doing one thing on each; sorry if this is not true).
I can't make sense of why the code at the bottom works perfectly while the code immediately below it, which uses the same structure, does not. Why would the code below be running into thread constraints? And where in the code is it even creating multiple threads?
I have posted the whole code below so the error can be reproduced. If it is relevant, I am running this on a 32-core Linux box.
#to run this call "mpiexec -n 10 python par_implement_wavefront.py" in terminal
from __future__ import division
import pandas as pd
import numpy as np
import itertools
import os
from itertools import chain, combinations
from operator import add
from collections import Counter
home="/home/sfortney"
np.set_printoptions(precision=2, suppress=True)
#choose dimensionality and granularity
dim=2
gran=5
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
command_buffer = np.zeros(3) # first entry is boolean, second is tuple with objective function inputs, third is array index
result_buffer=np.zeros(3) # first position is node,
if rank==0:
    #defining all of our functions we will need on the root node first

    #makes ax1 into the axes of an n dim array
    def axis_fitter(arr, dim, gran, start=1, stop=101):
        ax1=np.linspace(start,stop, num=gran)
        for i in range(dim):
            indexlist=[0]*dim
            indexlist[i]= slice(None)
            arr[indexlist]=ax1
        return arr

    #this is used to make the inital queues
    #fix me to work with nan's!
    def queue_init(arr):
        queue=[]
        queueposs=[]
        queuedone=np.argwhere(arr >0).tolist()
        return queue,queueposs,queuedone

    #this is used in the queue updating function
    def queue_sorter(queue):
        queue.sort(key=lambda x: np.linalg.norm(np.array(x)))
        # using the L1 norm
        # queue.sort(key=lambda x: sum(x))
        return queue

    #this finds all the indicies to the "back" of our box
    def back_index(dim):
        standardbasis=[]
        for i in range(dim):
            vec=[0]*dim
            vec[i]=vec[i]+1
            standardbasis.append(vec)

        powerset=[]
        for z in chain.from_iterable(combinations(standardbasis,r) for r in range(len(standardbasis)+1)):
            powerset.append(z)

        powersetnew=[]
        for i in range(len(powerset)):
            powersetnew.append([sum(x) for x in zip(*list(powerset[i]))])
        powersetnew.remove([])
        powersetnew=[[i*(-1) for i in x] for x in powersetnew]
        return powersetnew

    #this takes a completed index and updates our queue of possible values
    #as well as our done queue
    def queue_update(queue,queueposs,queuedone, arr,dim,comp_idx=[0,0]):
        queuedone.append(comp_idx)
        if comp_idx==[0,0]:
            init_index=[1]*dim
            queue.append(init_index)
            for i in range(dim):
                poss_index=[1]*dim
                poss_index[i]=2
                queueposs.append(poss_index)
            return queue,queueposs,queuedone
        else:
            queuedone.append(comp_idx)
            try:
                queueposs.remove(comp_idx)
            except:
                pass
            for i in range(dim):
                new_idx=comp_idx[:]
                new_idx[i]=new_idx[i]+1
                back_list=back_index(dim)
                back_list2=[]
                for x in back_list:
                    back_list2.append(list(np.add(np.asarray(new_idx),np.asarray(x))))
                if set(tuple(x) for x in back_list2).issubset(set(tuple(x) for x in queuedone)):
                    queueposs.append(new_idx)
            queueposs=list(set(tuple(x) for x in queueposs)-set(tuple(x) for x in queuedone))
            queueposs=[list(x) for x in queueposs]
            queueposs=queue_sorter(queueposs)
            try:
                for x in range(len(queueposs)):
                    queueappender=(queueposs).pop(x)
                    queue.append(queueappender)
            except:
                print "queueposs empty"
            queue=queue_sorter(queue)
            return queue,queueposs,queuedone

    #this function makes it so we dont have to pass the whole array through MPI but only the pertinent information
    def objectivefuncprimer(arr, queue_elem, dim):
        inputs=back_index(dim)
        inputs2=[]
        for x in inputs:
            inputs2.append(list(np.add(np.asarray(queue_elem),np.asarray(x))))
        inputs3=[]
        for x in range(len(inputs2)):
            inputs3.append(arr[tuple(inputs2[x])])
        return inputs3

    #this function takes a value and an index and assigns the array that value at the index
    def arrupdater(val,idx):
        arr[tuple(idx)]=val
        return arr, idx

    #########Initializing
    all_finished=False

    #make our empty array
    sizer=tuple([gran]*dim)
    arr=np.zeros(shape=sizer)
    nodes_avail=range(1, size) # 0 is not a worker

    #assumes axes all start at same place
    ax1=np.linspace(20,30, num=gran)
    arr=axis_fitter(arr, dim, gran)

    #fitting axes and initializing queues
    arr=axis_fitter(arr, dim, gran, start=20, stop=30)
    queue,queueposs,queuedone =queue_init(arr)

    #running first updater
    queue,queueposs,queuedone=queue_update(queue,queueposs,queuedone,arr,dim)

    def sender(queue):
        send_num=min(len(queue),len(nodes_avail))
        for k in range(send_num):
            node=nodes_avail.pop()
            queue_elem=queue.pop(k)
            command_buffer[0]=int(all_finished)
            command_buffer[1]=queue_elem
            command_buffer[2]=objectivefuncprimer(arr,queue_elem,dim)
            comm.Send(command_buffer, dest=node)

    while all_finished==False:
        sender(queue)
        comm.Recv(result_buffer,source=MPI.ANY_SOURCE)
        arr,comp_idx=arrupdater(result_buffer[1],result_buffer[2])
        queue,queueposs,queuedone=queue_update(queue,queueposs,queuedone,arr,dim,comp_idx)
        nodes_avail.append(result_buffer[0])
        if len(queuedone)==gran**2:
            for n in range(1, size):
                comm.Send(np.array([True,0,0]), dest=n)
            all_finished=True
    print arr

if rank>0:
    all_finished_worker=False

    #this test function will only work in 2d
    def objectivefunc2d_2(inputs):
        #this will be important for more complicated functions later
        #backnum=(2**dim)-1
        val=sum(inputs)
        return val

    while all_finished_worker==False:
        comm.Recv(command_buffer, source=0)
        all_finished_worker=bool(command_buffer[0])
        if all_finished_worker==False:
            result=objectivefunc2d_2(command_buffer[2])
            # print str(result) +" from "+str(rank)
            result_buffer=np.array([rank,result,command_buffer[1]])
            comm.Send(result_buffer, dest=0)
This code works and has the same basic structure as the code above, but on a much, much simpler example.
from __future__ import division
import numpy as np
import os
from itertools import chain, combinations
from mpi4py import MPI
from mpi4py.MPI import ANY_SOURCE
comm = MPI.COMM_WORLD
rank = comm.Get_rank()
size = comm.Get_size()
command_buffer = np.zeros(2) # first entry is boolean, rest is data
result_buffer=np.zeros(2) # first position is node, rest is data
if rank==0:
    all_finished=False
    nodes_avail=range(1, size) # 0 is not a worker
    arr=[]
    q=range(20)

    def primer(q):
        return int(all_finished),q

    def sender(q):
        send_num=min(len(q),len(nodes_avail))
        for k in range(send_num):
            node=nodes_avail.pop()
            queue_init=q.pop()
            command_buffer[0]=primer(queue_init)[0]
            command_buffer[1]=primer(queue_init)[1]
            comm.Send(command_buffer, dest=node)

    while all_finished==False:
        sender(q)
        # update q
        comm.Recv(result_buffer,source=MPI.ANY_SOURCE)
        arr.append(result_buffer[1])
        nodes_avail.append(result_buffer[0])
        if len(arr)==20:
            for n in range(1, size):
                comm.Send(np.array([True,0]), dest=n)
            all_finished=True
    print arr

if rank>0:
    all_finished_worker=False
    while all_finished_worker==False:
        comm.Recv(command_buffer, source=0)
        all_finished_worker=bool(command_buffer[0])
        if all_finished_worker==False:
            result=command_buffer[1]*2
            # print str(result) +" from "+str(rank)
            result_buffer=np.array([rank,result])
            comm.Send(result_buffer, dest=0)
Related
As part of a signal processing task, I am doing some computation per frequency step.
I have a frequencies list of length 513.
I have a 3D numpy array A of shape (81,81,513), where 513 is the number of frequency points. I then have an 81x81 matrix per frequency.
I want to apply some modification to each of those matrices, to end up with a modified version of A, which I'll call B here, and which will also be of shape (81,81,513).
For that, I start by pre-allocating B with:
B = np.zeros_like(A)
I then loop over my frequencies and call a dothing function like:
for index, frequency in enumerate(frequencies):
B[:,:,index] = dothing(A[:,:,index])
The problem is that dothing takes a lot of time, and running it sequentially over 513 frequency steps seems endless.
So I wanted to parallelize it. But even after reading a lot of docs and watching a lot of videos, I get lost in all the libraries and potential solutions.
Computations at all individual frequencies can be done independently. But in the end I need to assign everything back to B in the right order.
Any idea on how to do that?
Thanks in advance
Antoine
Here I would use a shared array using shared_memory, as there's no need to protect write access if no two loop iterations ever use the same memory address. I eliminated the second array to shorten the example (only construct a single shared array), and I re-ordered the array shape to better preserve memory-aligned access.
from multiprocessing import Pool
from multiprocessing.shared_memory import SharedMemory
import numpy as np
import numpy.typing as npt
from typing import Any
from time import sleep
def dothing(arr: np.ndarray, t_func: Any) -> np.ndarray:
    sleep(.05) #simulate hard work
    return arr * 2

def dodothing(args: tuple[int, Any]):
    global arr
    index = args[0]
    t_func = args[1]
    arr[index] = dothing(arr[index], t_func) #write result back to self to avoid need for 2 shared arrays

def init(shm: SharedMemory, shape: tuple[int, ...], dtype: npt.DTypeLike):
    global arr
    arr = np.ndarray(shape, dtype=dtype, buffer=shm.buf)

if __name__ == "__main__":
    _A = np.ones((513,81,81), np.float64) #source data
    t_funcs = ["some transfer function"] * _A.shape[0] #added example of passing some data + an index

    nbytes = _A.size * _A.itemsize
    dtype = _A.dtype
    shape = _A.shape

    shm = SharedMemory(create=True, size=nbytes)
    A = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
    A[:] = _A[:] #copy contents into shared A

    with Pool(initializer=init, initargs=(shm, shape, dtype)) as pool:
        pool.map(dodothing, enumerate(t_funcs)) #enumerate returns tuple[int,Any] each loop

    print(A.sum()/_A.sum()) #prove we multiplied all elements by 2

    shm.close()
    shm.unlink()
multiprocessing.Pool is a bit funny sometimes in what can be a valid argument to a target function, so I tend to share things like Lock, Queue, shared_memory etc. via the pool's initialization function, which accepts arguments just like Process does.
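As a further illustration of that pattern, here is a minimal sketch (the names are made up, not from the code above) of sharing a Lock through the pool's initializer rather than through the map arguments:
from multiprocessing import Pool, Lock

lock = None  # set in each worker by the initializer

def init_worker(l):
    # store the shared Lock in a module-level global inside each worker process
    global lock
    lock = l

def work(i):
    with lock:  # serialize access to some shared resource
        print("task", i)

if __name__ == "__main__":
    with Pool(2, initializer=init_worker, initargs=(Lock(),)) as pool:
        pool.map(work, range(4))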
I am working on hyperspectral images. To reduce the noise in the image I am using wavelet transformation with the pywt package. When I do this normally (serial processing) it works smoothly. But when I try to implement parallel processing using multiple cores for the wavelet transformation on the image, I have to pass certain parameters like
wavelet family
thresholding value
threshold technique (hard/soft)
But I am not able to pass these parameters using the pool object; I can pass only the data as an argument when I use pool.imap(). And when I use pool.apply_async() it takes much more time, and the order of the output is also not the same. Here I am adding the code for reference:
import matplotlib.pyplot as plt
import numpy as np
import multiprocessing as mp
import os
import time
from math import log10, sqrt
import pywt
import tifffile
def spec_trans(d,wav_fam,threshold_val,thresh_type):
    data=np.array(d,dtype=np.float64)
    data_dec=decomposition(data,wav_fam)
    data_t=thresholding(data_dec,threshold_val,thresh_type)
    data_rec=reconstruction(data_t,wav_fam)
    return data_rec

if __name__ == '__main__':
    #input
    X=tifffile.imread('data/Classification/university.tif')

    #take paramaters
    threshold_val=float(input("Enter the value for image thresholding: "))
    print("The available wavelet functions:",pywt.wavelist())
    wav_fam=input("Choose a wavelet function for transformation: ")
    threshold_type=['hard','soft']
    print("The available wavelet functions:",threshold_type)
    thresh_type=input("Choose a type for threshholding technique: ")

    start=time.time()
    p = mp.Pool(4)
    jobs=[]
    for dataBand in xmp:
        jobs.append(p.apply_async(spec_trans,args=(dataBand,wav_fam,threshold_val,thresh_type)))

    transformedX=[]
    for jobBit in jobs:
        transformedX.append(jobBit.get())

    end=time.time()
    p.close()
Also, when I am using the 'soft' technique for thresholding I am getting the following warning:
C:\Users\Sawon\anaconda3\lib\site-packages\pywt\_thresholding.py:25: RuntimeWarning: invalid value encountered in multiply
thresholded = data * thresholded
The results of serial execution and parallel execution should be more or less the same, but here I am getting slightly different results.
Any suggestion to modify the code would be helpful.
Thanks
[This isn't a direct answer to the question, but a clearer follow up query than trying the below via the small comment box]
As a quick check, pass in an iterator counter to spec_trans and return it back out (as well as your result) - and push it into a separate list, transformedXseq or something - and then compare to your input sequence. i.e.
def spec_trans(d,wav_fam,threshold_val,thresh_type, iCount):
    data=np.array(d,dtype=np.float64)
    data_dec=decomposition(data,wav_fam)
    data_t=thresholding(data_dec,threshold_val,thresh_type)
    data_rec=reconstruction(data_t,wav_fam)
    return data_rec, iCount
and then within main
jobs=[]
iJobs = 0
for dataBand in xmp:
    jobs.append(p.apply_async(spec_trans,args=(dataBand,wav_fam,threshold_val,thresh_type, iJobs)))
    iJobs = iJobs + 1

transformedX=[]
transformedXseq=[]
for jobBit in jobs:
    res = jobBit.get()
    transformedX.append(res[0])
    transformedXseq.append(res[1])
... and check the list transformedXseq to see if you've gathered the jobs back up in the sequence you submitted them. It should match!
Assuming wav_fam, threshold_val and thresh_type do not vary from call to call, first arrange for these arguments to be the first arguments to worker function spec_trans:
def spec_trans(wav_fam, threshold_val, thresh_type, d):
Now I don't see where in your pool-creation block you have defined xmp, but presumably this is an iterable. You need to modify this code as follows:
from functools import partial
def compute_chunksize(pool_size, iterable_size):
    chunksize, remainder = divmod(iterable_size, 4 * pool_size)
    if remainder:
        chunksize += 1
    return chunksize

if __name__ == '__main__':
    X=tifffile.imread('data/Classification/university.tif')

    #take paramaters
    threshold_val=float(input("Enter the value for image thresholding: "))
    print("The available wavelet functions:",pywt.wavelist())
    wav_fam=input("Choose a wavelet function for transformation: ")
    threshold_type=['hard','soft']
    print("The available wavelet functions:",threshold_type)
    thresh_type=input("Choose a type for threshholding technique: ")

    start=time.time()
    p = mp.Pool(4)
    # first 3 arguments to spec_trans will be wav_fam, threshold_val and thresh_type
    worker = partial(spec_trans, wav_fam, threshold_val, thresh_type)
    suitable_chunksize = compute_chunksize(4, len(xmp))
    transformedX = list(p.imap(worker, xmp, chunksize=suitable_chunksize))
    end=time.time()
To obtain improved performance over using apply_async, you must use a "suitable chunksize" value with imap. Function compute_chunksize can be used for computing such a value based on the size of your pool, which is 4, and the size of the iterable being passed to imap, which would be len(xmp). If the size of xmp is small enough such that the chunksize value computed is 1, I don't really see how imap would be significantly more performant over apply_async.
Of course, you might as well just use:
transformedX = p.map(worker, xmp)
And let the pool compute its own suitable chunksize. imap has an advantage over map when the iterable is very large and not already a list. For map to compute a suitable chunksize it would first have to convert the iterable to a list just to get its length and this could be memory inefficient. But if you know the length (or approximate length) of the iterable, then by using imap you can explicitly set a chunksize without having to convert the iterable to a list. The other advantage of imap_unordered over map is that you can process the results for the individual tasks as they become available whereas with map you only get results when all the submitted tasks are complete.
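If result order matters but you also want to consume results as they finish, one option (a sketch only, reusing spec_trans, wav_fam, threshold_val, thresh_type and xmp from above, and assuming xmp is a sequence) is to carry each item's index through imap_unordered and put results back in place:
from functools import partial
from multiprocessing import Pool

def spec_trans_indexed(wav_fam, threshold_val, thresh_type, indexed_band):
    # hypothetical wrapper: unpack (index, band), run the real worker, return the index with the result
    i, band = indexed_band
    return i, spec_trans(band, wav_fam, threshold_val, thresh_type)

if __name__ == '__main__':
    worker_i = partial(spec_trans_indexed, wav_fam, threshold_val, thresh_type)
    transformedX = [None] * len(xmp)
    with Pool(4) as p:
        for i, res in p.imap_unordered(worker_i, enumerate(xmp)):
            transformedX[i] = res  # restore the original order by index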
Update
If you want to catch possible exceptions thrown by individual tasks submitted to your worker function, then stick with using imap, and use the following code to iterate the results returned by imap:
#transformedX = list(p.imap(worker, xmp, chunksize=suitable_chunksize))
transformedX = []
results = p.imap(worker, xmp, chunksize=suitable_chunksize)
import traceback
while True:
    try:
        result = next(results)
    except StopIteration: # no more results
        break
    except Exception as e:
        print('Exception occurred:', e)
        traceback.print_exc() # print stacktrace
    else:
        transformedX.append(result)
I'm trying to repeatedly run a function that requires a few positional arguments and involves random number generation (to generate many samples of a distribution). For a MWE, I think this captures everything:
import numpy as np
import multiprocessing as mup
from functools import partial
def rarr(xsize, ysize, k):
    return np.random.rand(xsize, ysize)

def clever_array(nsamp, xsize=100, ysize=100, ncores=None):
    np.random.seed()
    if ncores is None:
        p = mup.Pool()
    else:
        p = mup.Pool(ncores)
    out = p.map_async( partial(rarr, xsize, ysize), range(nsamp))
    p.close()
    return np.array(out.get())
Note that the final positional argument for rarr() is just a dummy variable, since I am using map_async(), which requires an iterable. Now if I run %timeit clever_array(500, ncores = 1) I get 208 ms, whereas %timeit clever_array(500, ncores = 5) takes 149 ms. So there is definitely some kind of parallelism happening (the speedup isn't terribly impressive for this MWE but is decent in my real code).
However, I'm wondering a few things -- is there a more natural implementation other than the dummy variable for rarr() passed as an iterable to map_async to run this many times? Is there any obvious way to pass the xsize and ysize args to rarr() other than partial()? And is there any way to ensure different results from the different cores other than initializing a different random.seed() every time?
Thanks for any help!
Typically when we use multiprocessing we would expect different results from each invocation of a function; it therefore doesn't quite make sense to call the same function many times while it draws from the same global random state. In order to ensure the randomness of the sampling output, it is best to separate the random state (seed) from the function itself. The best approach, as recommended by the official numpy docs, is to use a np.random.Generator object, created via np.random.default_rng([seed]). With that we can modify your code to
import numpy as np
import multiprocessing as mup
from functools import partial
def rarr(xsize, ysize, rng):
    return rng.random((xsize, ysize))

def clever_array(nsamp, xsize=100, ysize=100, ncores=None):
    if ncores is None:
        p = mup.Pool()
    else:
        p = mup.Pool(ncores)
    out = p.map_async(partial(rarr, xsize, ysize), map(np.random.default_rng, range(nsamp)))
    p.close()
    return np.array(out.get())
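A related option, if reproducible runs are also desired, is numpy's SeedSequence, which can spawn statistically independent child seeds from a single master seed (a sketch, one generator per sample):
import numpy as np

def make_rngs(nsamp, seed=None):
    # one master SeedSequence, spawned into nsamp independent children
    children = np.random.SeedSequence(seed).spawn(nsamp)
    return [np.random.default_rng(child) for child in children]

# e.g. out = p.map_async(partial(rarr, xsize, ysize), make_rngs(nsamp))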
The following for loop is part of an iterative simulation process and is the main bottleneck in terms of computation time:
import numpy as np
class Simulation(object):

    def __init__(self,n_int):
        self.n_int = n_int

    def loop(self):
        for itr in range(self.n_int):

            #some preceeding code which updates rows_list and diff with every itr

            cols_red_list = []
            rows_list = list(range(2500)) #row idx for diff where negative element is known to appear
            diff = np.random.uniform(-1.323, 3.780, (2500, 300)) #np.random.uniform is just used as toy example

            for row in rows_list:
                col = next(idx for idx, val in enumerate(diff[row,:]) if val < 0)
                cols_red_list.append(col)

            # some subsequent code which uses the cols_red_list data

sim1 = Simulation(n_int=10)
sim1.loop()
Hence, I tried to parallelize it using the multiprocessing package, in the hope of reducing the computation time:
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
def crossings(row, diff):
    return next(idx for idx, val in enumerate(diff[row,:]) if val < 0)

class Simulation(object):

    def __init__(self,n_int):
        self.n_int = n_int

    def loop(self):
        for itr in range(self.n_int):

            #some preceeding code which updates rows_list and diff with every

            rows_list = list(range(2500))
            diff = np.random.uniform(-1, 1, (2500, 300))

            if __name__ == '__main__':
                num_of_workers = cpu_count()
                print('number of CPUs : ', num_of_workers)
                pool = Pool(num_of_workers)
                cols_red_list = pool.map(partial(crossings,diff = diff), rows_list)
                pool.close()
                print(len(cols_red_list))

            # some subsequent code which uses the cols_red_list data

sim1 = Simulation(n_int=10)
sim1.loop()
Unfortunately, the parallelization turns out to be much slower than the sequential piece of code.
Hence my question: Did I use the multiprocessing package properly in this particular example? Are there alternative ways to parallelize the above-mentioned for loop?
Disclaimer: As you're trying to reduce the runtime of your code through parallelisation, this doesn't strictly answer your question but it might still be a good learning opportunity.
As a golden rule, before moving to multiprocessing to improve
performance (execution time), one should first optimise the
single-threaded case.
Your
rows_list = list(range(2500))
Generates the numbers 0 to 2499 (that's the range) and stores them in memory (list), which requires time to do the allocation of the required memory and the actual write. You then only use these predictable values once each, by reading them from memory (which also takes time), in a predictable order:
for row in rows_list:
This is particularly relevant to the runtime of your loop function as you do it repeatedly (for itr in range(n_int):).
Instead, consider generating the number only when you need it, without an intermediate store (which conceptually removes any need to access RAM):
for row in range(2500):
Secondly, on top of sharing the same issue (unnecessary accesses to memory), the following:
diff = np.random.uniform(-1, 1, (2500, 300))
# ...
col = next(idx for idx, val in enumerate(diff[row,:]) if val < 0)
seems to me to be optimisable at the level of math (or logic).
What you're trying to do is get a random variable (that col index) by defining it as "the first time I encounter a random variable in [-1;1] that is lower than 0". But notice that figuring out if a random variable with a uniform distribution over [-α;α] is negative, is the same as having a random variable over {0,1} (i.e. a bool).
Therefore, you're now working with bools instead of floats and you don't even have to do the comparison (val < 0) as you already have a bool. This potentially makes the code much faster. Using the same idea as for rows_list, you can generate that bool only when you need it; testing it until it is True (or False, choose one, it doesn't matter obviously). By doing so, you only generate as many random bools as you need, not more and not less (BTW, what happens in your code if all 300 elements in the row are negative? ;) ):
import itertools
import random

for _ in range(n_int):
    cols_red_list = []
    for row in range(2500):
        col = next(i for i in itertools.count() if random.getrandbits(1))
        cols_red_list.append(col)
or, with list comprehension:
cols_red_list = [next(i for i in count() if getrandbits(1))
                 for _ in range(2500)]
I'm sure that, through proper statistical analysis, you even can express that col random variable as a non-uniform variable over [0;limit[, allowing you to compute it much faster.
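To make that concrete, here is a sketch under the assumption that each element is an independent fair coin flip (which is what a uniform draw over [-α;α] gives you when you only ask about the sign): the column index of the first negative element then follows a geometric distribution, so the whole list can be drawn in one vectorised call:
import numpy as np

rng = np.random.default_rng()
# Generator.geometric returns the number of trials until the first success (>= 1),
# so subtract 1 to get a 0-based column index, matching the loop above.
cols_red_list = (rng.geometric(p=0.5, size=2500) - 1).tolist()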
Please test the performance of an "optimized" version of your single-threaded implementation first. If the runtime is still not acceptable, you should then look into multithreading.
multiprocessing uses system processes (not threads!) for parallelization, which require expensive IPC (inter-process communication) to share data.
This bites you in two spots:
diff = np.random.uniform(-1, 1, (2500, 300)) creates a large matrix which is expensive to pickle/copy to another process
rows_list = list(range(2500)) creates a smaller list, but the same applies here.
To avoid this expensive IPC, you have one and a half choices:
If on a POSIX-compliant system, initialize your variables on the module level, that way each process gets a quick-and-dirty copy of the required data. This is not scalable as it requires POSIX, weird architecture (you probably don't want to put everything on the module level), and doesn't support sharing changes to that data.
Use shared memory. This only supports mostly primitive data types, but mp.Array should cover your needs.
The second problem is that setting up a pool is expensive, as num_cpu processes need to be started. Your workload is small enough to be negligible compared to this overhead. A good practice is to only create one pool and reuse it.
Here is a quick-and-dirty example of the POSIX only solution:
import numpy as np
from multiprocessing import Pool, cpu_count
from functools import partial
n_int = 10
rows_list = np.array(range(2500))
diff = np.random.uniform(-1, 1, (2500, 300))
def crossings(row, diff):
    return next(idx for idx, val in enumerate(diff[row,:]) if val < 0)

def workload(_):
    cols_red_list = [crossings(row, diff) for row in rows_list]
    print(len(cols_red_list))

class Simulation(object):

    def loop(self):
        num_of_workers = cpu_count()
        with Pool(num_of_workers) as pool:
            pool.map(workload, range(10))
            pool.close()

sim1 = Simulation()
sim1.loop()
For me (and my two cores) this is roughly twice as fast as the sequential version.
Update with shared memory:
import numpy as np
from multiprocessing import Pool, cpu_count, Array
from functools import partial
n_int = 10
ROW_COUNT = 2500
### WORKER
diff = None
result = None
def init_worker(*args):
    global diff, result
    (diff, result) = args

def crossings(i):
    result[i] = next(idx for idx, val in enumerate(diff[i*300:(i+1)*300]) if val < 0)

### MAIN

class Simulation():

    def loop(self):
        num_of_workers = cpu_count()

        diff = Array('d', range(ROW_COUNT*300), lock=False)
        result = Array('i', ROW_COUNT, lock=False)

        # Shared memory needs to be passed when workers are spawned
        pool = Pool(num_of_workers, initializer=init_worker, initargs=(diff, result))

        for i in range(n_int):
            # SLOW, I assume you use a different source of values anyway.
            diff[:] = np.random.uniform(-1, 1, ROW_COUNT*300)

            pool.map(partial(crossings), range(ROW_COUNT))
            print(len(result))

        pool.close()

sim1 = Simulation()
sim1.loop()
A few notes:
Shared memory needs to be set up at worker creation, so it's global anyway.
This still isn't faster than the sequential version, but that's mainly due to random.uniform needing to be copied entirely into shared memory. I assume that are just values for testing, and in reality you'd fill it differently anyway.
I only pass indices to the worker, and use them to read and write values to the shared memory.
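As a further note, since the shared Array is a flat buffer, each worker can also wrap it in a NumPy view and keep the original 2D indexing instead of slicing by hand (a sketch reusing ROW_COUNT and the lock=False arrays from above):
import numpy as np

def init_worker(shared_diff, shared_result):
    global diff2d, result
    # zero-copy 2D view over the flat shared buffer
    diff2d = np.frombuffer(shared_diff, dtype=np.float64).reshape(ROW_COUNT, 300)
    result = shared_result

def crossings(i):
    result[i] = next(idx for idx, val in enumerate(diff2d[i, :]) if val < 0)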
I am trying to combine the solutions provided in both of these SO answers - Using threading to slice an array into chunks and perform calculation on each chunk and reassemble the returned arrays into one array and Pass multiple parameters to concurrent.futures.Executor.map?. I have a numpy array that I chunk into segments and I want each chunk to be sent to a separate thread and an additional argument to be sent along with the chunk of the original array. This additional argument is a constant and will not change. The performCalc is a function that will take two arguments -one the chunk of the original numpy array and a constant.
First solution I tried
import psutil
import numpy as np
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
def main():
    testThread()

def testThread():
    minLat = -65.76892
    maxLat = 66.23587
    minLon = -178.81404
    maxLon = 176.2949

    latGrid = np.arange(minLat,maxLat,0.05)
    lonGrid = np.arange(minLon,maxLon,0.05)

    gridLon,gridLat = np.meshgrid(latGrid,lonGrid)
    grid_points = np.c_[gridLon.ravel(),gridLat.ravel()]

    n_jobs = psutil.cpu_count(logical=False)
    chunk = np.array_split(grid_points,n_jobs,axis=0)

    x = ThreadPoolExecutor(max_workers=n_jobs)

    maxDistance = 4.3
    func = partial(performCalc,chunk)
    args = [chunk,maxDistance]

    # This prints 4.3 twice although there are four cores in the system
    results = x.map(func,args)

    # This prints 4.3 four times correctly
    results1 = x.map(performTest,chunk)

def performCalc(chunk,maxDistance):
    print(maxDistance)
    return chunk

def performTest(chunk):
    print("test")

main()
So performCalc() prints 4.3 twice even though the number of cores in the system is 4, while performTest() prints "test" four times, correctly. I am not able to figure out the reason for this error.
Also, I am sure the way I set up the functools.partial call is incorrect.
1) There are four chunks of the original numpy array.
2) Each chunk is to be paired with maxDistance and sent to performCalc()
3) There will be four threads that will print maxDistance and will return parts of the total result which will be returned in one array
Where am I going wrong ?
UPDATE
I tried using the lambda approach as well
results = x.map(lambda p:performCalc(*p),args)
but this prints nothing.
Using the solution provided by user mkorvas, as shown here - How to pass a function with more than one argument to python concurrent.futures.ProcessPoolExecutor.map()? - I was able to solve my problem, as shown in the solution here -
import psutil
import numpy as np
import sys
from concurrent.futures import ThreadPoolExecutor
from functools import partial
def main():
    testThread()

def testThread():
    minLat = -65.76892
    maxLat = 66.23587
    minLon = -178.81404
    maxLon = 176.2949

    latGrid = np.arange(minLat,maxLat,0.05)
    lonGrid = np.arange(minLon,maxLon,0.05)
    print(latGrid.shape,lonGrid.shape)

    gridLon,gridLat = np.meshgrid(latGrid,lonGrid)
    grid_points = np.c_[gridLon.ravel(),gridLat.ravel()]
    print(grid_points.shape)

    n_jobs = psutil.cpu_count(logical=False)
    chunk = np.array_split(grid_points,n_jobs,axis=0)

    x = ThreadPoolExecutor(max_workers=n_jobs)

    maxDistance = 4.3
    func = partial(performCalc,maxDistance)
    results = x.map(func,chunk)

def performCalc(maxDistance,chunk):
    print(maxDistance)
    return chunk

main()
What one apparently needs to do (and I do not know why; maybe somebody can clarify in another answer) is switch the order of the inputs to the function performCalc(), as shown here -
def performCalc(maxDistance,chunk):
    print(maxDistance)
    return chunk
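For what it's worth, the likely reason (my understanding of functools.partial, not anything specific to this code): partial freezes the arguments you give it as the leftmost positional arguments, and Executor.map supplies one item from the iterable per call as the next positional argument. So the frozen constant has to come first in the signature and the iterated chunk last. A minimal sketch:
from concurrent.futures import ThreadPoolExecutor
from functools import partial

def work(constant, item):
    # 'constant' is frozen by partial; 'item' is filled in by map from the iterable
    return constant * item

with ThreadPoolExecutor(max_workers=4) as pool:
    func = partial(work, 10)                 # each call becomes work(10, <item>)
    print(list(pool.map(func, [1, 2, 3])))   # [10, 20, 30]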