I'm trying to use nvtx markers along with multiprocessing pool in Python, but when only a child process calls an annotated function the operation doesn't appear in the profiling report. Is there any way to get around this, or is this a limitation of python processes? Here's some example code to replicate:
import os
import time
from multiprocessing import Pool, shared_memory
import numpy as np
import nvtx
N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)
@nvtx.annotate(color="red")
def create_shm_array(signal):
    """Copy ``signal`` into a newly created shared-memory segment.

    Args:
        signal: NumPy array whose bytes are copied into the segment.

    Returns:
        The ``SharedMemory`` handle for the new segment. The caller is
        responsible for closing and unlinking it.
    """
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm
def worker(shm_name):
    """Attach to a shared-memory segment and run ``expensive_op`` on it.

    Args:
        shm_name: Name of an existing segment holding ``N_SAMPLES`` complex
            values.

    Returns:
        The value returned by ``expensive_op`` for the shared signal.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    result = expensive_op(sig)
    # Drop the array view first: close() raises BufferError while the
    # segment's buffer is still exported to a NumPy array.
    del sig
    shm.close()
    return result
@nvtx.annotate(color="blue")
def expensive_op(sig):
    """Simulate a slow computation: sleep two seconds, then sum ``sig``."""
    time.sleep(2)
    return np.sum(sig)
def clean_shm(shm_name):
    """Attach to the segment called ``shm_name``, then close and unlink it."""
    segment = shared_memory.SharedMemory(name=shm_name, create=False)
    segment.close()
    segment.unlink()
if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    # One call from the parent process, before any workers exist.
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    try:
        with Pool(os.cpu_count()) as p:
            p.map(worker, [shared_mem.name] * 2)
    finally:
        # Release the segment even when the pool raises. The parent's own
        # handle is closed here; clean_shm() then unlinks the segment.
        shared_mem.close()
        clean_shm(shared_mem.name)
Here's the Nvidia Nsight Systems timeline. The marker appears during the first call from the parent process, but does not appear when the function is called by the child processes.
By default (on Linux), Python's multiprocessing forks new processes. We need it to spawn them instead. Working code below.
import os
import time
from multiprocessing import Pool, shared_memory, get_context
import numpy as np
import nvtx
N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)
@nvtx.annotate(color="red")
def create_shm_array(signal):
    """Copy ``signal`` into a newly created shared-memory segment.

    Args:
        signal: NumPy array whose bytes are copied into the segment.

    Returns:
        The ``SharedMemory`` handle for the new segment. The caller is
        responsible for closing and unlinking it.
    """
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm
def worker(shm_name):
    """Attach to a shared-memory segment and run ``expensive_op`` on it.

    Args:
        shm_name: Name of an existing segment holding ``N_SAMPLES`` complex
            values.

    Returns:
        The value returned by ``expensive_op`` for the shared signal.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    result = expensive_op(sig)
    # Drop the array view first: close() raises BufferError while the
    # segment's buffer is still exported to a NumPy array.
    del sig
    shm.close()
    return result
@nvtx.annotate(color="blue")
def expensive_op(sig):
    """Simulate a slow computation: sleep two seconds, then sum ``sig``."""
    time.sleep(2)
    return np.sum(sig)
def clean_shm(shm_name):
    """Attach to the segment called ``shm_name``, then close and unlink it."""
    segment = shared_memory.SharedMemory(name=shm_name, create=False)
    segment.close()
    segment.unlink()
if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    # One call from the parent process, before any workers exist.
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    try:
        # "spawn" starts each worker as a fresh interpreter instead of forking.
        with get_context("spawn").Pool(os.cpu_count()) as p:
            p.map(worker, [shared_mem.name] * 2)
    finally:
        # Release the segment even when the pool raises. The parent's own
        # handle is closed here; clean_shm() then unlinks the segment.
        shared_mem.close()
        clean_shm(shared_mem.name)
Related
I want to have multiple processes read from a different row of a numpy array in parallel to speed things up. However, when I run the following code, the first process to reach func throws an error as if var is no longer in scope. Why is this happening?
import numpy as np
import multiprocessing as mp
num_procs = 16
num_points = 2500000
def init_worker(X):
    """Pool initializer: store ``X`` in the module-level global ``var``."""
    globals()["var"] = X
def func(proc):
    """Read every element of row ``proc`` from the buffer held in ``var``.

    The buffer is viewed as a ``(num_procs, num_points)`` array. Nothing is
    returned; each element is only read.
    """
    grid = np.frombuffer(var).reshape((num_procs, num_points))
    for col in range(num_points):
        z = grid[proc][col]
if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # Shared buffer of doubles, large enough for the whole matrix.
    X = mp.RawArray('d', num_procs*num_points)
    # NumPy view over the shared buffer, filled from ``data``.
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    # init_worker runs once in every worker and sets the global ``var`` there.
    pool = mp.Pool(processes=4, initializer=init_worker, initargs=(X,))
    for proc in range(num_procs):
        # NOTE: func(proc) is evaluated here, in the parent process, and only
        # its return value is handed to apply_async. The parent never ran
        # init_worker, so ``var`` is undefined at this point.
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
Traceback (most recent call last):
File "parallel_test.py", line 26, in <module>
pool.apply_async(func(proc))
File "parallel_test.py", line 13, in func
X_np = np.frombuffer(var).reshape((num_procs, num_points))
NameError: global name 'var' is not defined
Update:
For some reason, if I use Pool.map instead of the for loop with Pool.apply_async, it seems to work. I don’t understand why though.
Any reason to not declare X as global in the top-level scope? This eliminates the NameError.
import numpy as np
import multiprocessing as mp
num_procs = 16
num_points = 25000000
def func(proc):
    """Read every element of row ``proc`` from the global buffer ``X``.

    The buffer is viewed as a ``(num_procs, num_points)`` array. Nothing is
    returned; each element is only read.
    """
    grid = np.frombuffer(X).reshape((num_procs, num_points))
    for col in range(num_points):
        z = grid[proc][col]
if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # ``global`` is a no-op at module level; X is a module global either way.
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    # NOTE(review): workers only see X if they inherit this module's globals
    # (fork start method). Under spawn, pass X through a Pool initializer.
    pool = mp.Pool(processes=4 )
    # Pass the callable and its arguments separately so func runs in the
    # workers. Writing func(proc) would run it here in the parent instead.
    pending = [pool.apply_async(func, (proc,)) for proc in range(num_procs)]
    pool.close()
    pool.join()
    # get() re-raises any exception raised in a worker, which apply_async
    # would otherwise drop silently.
    for task in pending:
        task.get()
When I run a reduced instance of this problem, n=20:
import numpy as np
import multiprocessing as mp
num_procs = 4
num_points = 5
def func(proc):
    """Read every element of row ``proc`` from the global buffer ``X``.

    The buffer is viewed as a ``(num_procs, num_points)`` array. Nothing is
    returned; each element is only read.
    """
    grid = np.frombuffer(X).reshape((num_procs, num_points))
    for col in range(num_points):
        z = grid[proc][col]
if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # ``global`` is a no-op at module level; X is a module global either way.
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    # NOTE(review): workers only see X if they inherit this module's globals
    # (fork start method). Under spawn, pass X through a Pool initializer.
    pool = mp.Pool(processes=4 )
    # Pass the callable and its arguments separately so func runs in the
    # workers. Writing func(proc) would run it here in the parent instead.
    pending = [pool.apply_async(func, (proc,)) for proc in range(num_procs)]
    pool.close()
    pool.join()
    # get() re-raises any exception raised in a worker, which apply_async
    # would otherwise drop silently.
    for task in pending:
        task.get()
    # Dump the shared buffer, one value per line.
    print("\n".join(map(str, X)))
I get the following output:
-0.6346037804619162
1.1005724710066107
0.33458763357165255
0.6409345714971889
0.7124888766851982
0.36760459213332963
0.23593304931386933
-0.8668969562941349
-0.8842756219923469
0.005979036105620422
1.386422154089567
-0.8770988782214508
0.25187448339771057
-0.2473967968471952
-0.4909708883978521
0.5423521489750244
0.018749603867333802
0.035304792504378055
1.3263872668956616
1.0199839603892742
You haven't provided a sample of the expected output. Does this look similar to what you expect?
I want to make calls to pool.apply_async(func) and accumulate the results as soon as they are available without waiting for each other.
import multiprocessing
import numpy as np
chrNames=['chr1','chr2','chr3']
sims=[1,2,3]
def accumulate_chrBased_simBased_result(chrBased_simBased_result,accumulatedSignalArray,accumulatedCountArray):
    """Add one task result into the running totals, in place.

    ``chrBased_simBased_result[0]`` is added to ``accumulatedSignalArray`` and
    ``chrBased_simBased_result[1]`` to ``accumulatedCountArray``. Returns None.
    """
    accumulatedSignalArray += chrBased_simBased_result[0]
    accumulatedCountArray += chrBased_simBased_result[1]
def func(chrName,simNum):
    """Print the task label and build its two result arrays.

    Returns:
        A two-element list: a float array and an int array, each of length
        10000 and filled with ``simNum``.
    """
    print('%s %d' %(chrName,simNum))
    length = (10000,)
    return [np.full(length, simNum, dtype=kind) for kind in (float, int)]
if __name__ == '__main__':
    # Running totals that every task result is added into.
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    for chrName in chrNames:
        for simNum in sims:
            result= pool.apply_async(func, (chrName,simNum,))
            # NOTE: result.get() blocks until this task has finished, so the
            # next task is not submitted until the current one is done.
            accumulate_chrBased_simBased_result(result.get(),accumulatedSignalArray,accumulatedCountArray)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)
In this way, each pool.apply_async call waits for other call to end.
Is there a way to get rid of this waiting for each other?
You are using result.get() on each iteration, and making the main process wait for the function to be ready in doing so.
Please find below a working version, with prints showing that accumulation is done when "func" is ready, and adding random sleeps to ensure sizable execution time differences.
import multiprocessing
import numpy as np
from time import time, sleep
from random import random
chrNames=['chr1','chr2','chr3']
sims=[1,2,3]
def accumulate_chrBased_simBased_result(chrBased_simBased_result,accumulatedSignalArray,accumulatedCountArray):
    """Add one task result into the running totals, in place.

    ``chrBased_simBased_result[0]`` is added to ``accumulatedSignalArray`` and
    ``chrBased_simBased_result[1]`` to ``accumulatedCountArray``. Returns None.
    """
    accumulatedSignalArray += chrBased_simBased_result[0]
    accumulatedCountArray += chrBased_simBased_result[1]
def func(chrName,simNum,max_delay=5):
    """Sleep a random time, then build and return the task's result arrays.

    Args:
        chrName: Label printed once the arrays are built.
        simNum: Fill value for both arrays.
        max_delay: Upper bound, in seconds, of the random sleep. The default
            of 5 matches the previously hard-coded value.

    Returns:
        A two-element list: a float array and an int array, each of length
        10000 and filled with ``simNum``.
    """
    result=[]
    # Random delay so that tasks finish at visibly different times.
    sleep(random()*max_delay)
    signal_array=np.full((10000,), simNum, dtype=float)
    count_array = np.full((10000,), simNum, dtype=int)
    result.append(signal_array)
    result.append(count_array)
    print('%s %d' %(chrName,simNum))
    return result
if __name__ == '__main__':
    # Running totals that every task result is added into.
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    # Submit every task up front; nothing blocks here.
    results = [pool.apply_async(func, (chrName,simNum,))
               for chrName in chrNames
               for simNum in sims]
    for i in results:
        print(i)
    # Poll the outstanding tasks and accumulate each one once it is ready.
    while results:
        still_running = []
        for r in results:
            if r.ready():
                print('{} is ready'.format(r))
                accumulate_chrBased_simBased_result(r.get(),accumulatedSignalArray,accumulatedCountArray)
            else:
                still_running.append(r)
        results = still_running
        if results:
            # Yield briefly between sweeps instead of spinning a core.
            sleep(0.01)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)
I'm trying to alter a dictionary in python inside a process pool environment, but the dictionary isn't changed when the pool finishes.
Here's a minimal example of the problem (the output batch_input is all zeros, although inside per_batch_build it changes the relevant values)
from multiprocessing import Pool, freeze_support
import numpy as np
import itertools
def test_process():
    """Run ``per_batch_build`` once per batch index in a process pool.

    Returns:
        The tuple ``(batch_input, batch_model_dist)`` as held by the calling
        process. Each task receives pickled copies of these objects, so
        writes made inside the workers do not show up in the returned values.
    """
    batch_size = 2
    batch_input = {'part_evecs': np.zeros((2, 10, 10)),
                   'model_evecs': np.zeros((2, 10, 10)),
                   }
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    # Builtin zip replaces itertools.izip, which does not exist on Python 3.
    # zip stops at the shortest input, so the endless repeat() iterators are
    # cut off after batch_size items.
    batch_output = pool.map(per_batch_build, zip(itertools.repeat(batch_input),
                                                 itertools.repeat(batch_model_dist),
                                                 list(range(batch_size))))
    pool.close()
    pool.join()
    return batch_input, batch_model_dist
# #profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    """Fill slot ``i_batch`` of each array with a 10x10 block of ones.

    Args:
        tuple_input: ``(batch_input, batch_model_dist, i_batch)``.

    Returns None; the arrays it was given are modified in place.
    """
    inputs, model_dist, slot = tuple_input
    model_dist[slot] = np.ones((10, 10))
    for key in ('part_evecs', 'model_evecs'):
        inputs[key][slot] = np.ones((10, 10))
But unfortunately batch_input, batch_model_dist and batch_part_dist are all zeros after the pool finishes, although batch_input is not zero when printed inside per_batch_build.
Using the solutions provided from previous discussions, the result stays the same (the output arrays are all zeros)
from multiprocessing import Pool, freeze_support, Manager, Array
import numpy as np
import itertools
import ctypes
def test_process():
    """Run ``per_batch_build`` once per batch index in a process pool.

    ``batch_input`` is a manager dict whose single entry is a NumPy view over
    a shared ``Array``.

    Returns:
        The tuple ``(batch_input, batch_model_dist)`` as held by the calling
        process.
    """
    manager = Manager()
    # 2*10*10 doubles in shared memory, viewed as a (2, 10, 10) array.
    shared_array_base = Array(ctypes.c_double, [0] * (2*10*10))
    shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
    shared_array = shared_array.reshape((2,10,10))
    batch_size = 2
    batch_input = manager.dict({'part_evecs': shared_array,
                                # 'model_evecs': np.zeros((2, 10, 10)),
                                })
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    # Builtin zip replaces itertools.izip, which does not exist on Python 3.
    # zip stops at the shortest input, so the endless repeat() iterators are
    # cut off after batch_size items.
    batch_output = pool.map(per_batch_build, zip(itertools.repeat(batch_input),
                                                 itertools.repeat(batch_model_dist),
                                                 list(range(batch_size))))
    pool.close()
    pool.join()
    return batch_input, batch_model_dist
# #profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    """Fill slot ``i_batch`` of the given arrays with a 10x10 block of ones.

    Args:
        tuple_input: ``(batch_input, batch_model_dist, i_batch)``.

    Returns None.
    """
    inputs, model_dist, slot = tuple_input
    block_shape = (10, 10)
    model_dist[slot] = np.ones(block_shape)
    inputs['part_evecs'][slot] = np.ones(block_shape)
    # batch_input['model_evecs'][i_batch] = np.ones((10,10))
You are changing a copy of the object created inside per_batch_build. You are naming them identically in both functions so it may be confusing.
Add
print(id(batch_model_dist))
inside both functions and see for yourself.
[Edit]
I should probably also link related response, for example:
Is shared readonly data copied to different processes for multiprocessing?
I am writing a simple python script that I need to scale to many threads. For simplicity, I have replaced the actual function I need to use with a matrix matrix multiply. I am having trouble getting my code to scale with the number of processors. Any advice to help me get the correct speedup would be helpful! My code and results are as follows:
import numpy as np
import time
import math
from multiprocessing.dummy import Pool
res = 4
#we must iterate over all of these values
wavektests = np.linspace(.1,2.5,res)
omegaratios = np.linspace(.1,2.5,res)
wavekmat,omegamat = np.meshgrid(wavektests,omegaratios)
def solve_for_omegaratio( ind ):
    """Do one unit of stand-in work for flat grid index ``ind``.

    Args:
        ind: Flat index into the ``res`` x ``res`` grid.

    Returns:
        The scalar ``x_ind + y_ind**2.0``.
    """
    # Flat index -> (row, column) position in the grid.
    y_ind, x_ind = divmod(ind, res)

    # Values for this run
    wavek = wavektests[x_ind]
    omega = omegaratios[y_ind]

    # Stand-in workload (replaces the real function): cube a random matrix.
    randmat = np.random.rand(4000,4000)
    nop = np.linalg.matrix_power(randmat,3)

    # Scalar result for this grid point
    return x_ind + y_ind**2.0
list_ind = range(res**2)

#Serial code execution
# time.clock() was removed in Python 3.8; time.process_time() is the
# CPU-time replacement.
t0_proc = time.process_time()
t0_wall = time.time()
threads = 0
dispersion = map( solve_for_omegaratio , list_ind)
displist = list(dispersion)
t1_proc = time.process_time()
t1_wall = time.time()
print('serial execution')
print('wall clock time = ',t1_wall-t0_wall)
print('processor clock time = ',t1_proc-t0_proc)
print('------------------------------------------------')

#Using pool defaults
t0_proc = time.process_time()
t0_wall = time.time()
if __name__ == '__main__':
    # Pool comes from multiprocessing.dummy, so these are threads.
    pool = Pool()
    dispersion = pool.map( solve_for_omegaratio , list_ind)
    displist = list(dispersion)
    t1_proc = time.process_time()
    t1_wall = time.time()
    # The original ``pool.close`` only named the method without calling it.
    pool.close()
    pool.join()
print('num of threads = default')
print('wall clock time = ',t1_wall-t0_wall)
print('processor clock time = ',t1_proc-t0_proc)
print('------------------------------------------------')

# Using 4 threads
t0_proc = time.process_time()
t0_wall = time.time()
threads = 4
if __name__ == '__main__':
    pool = Pool(threads)
    dispersion = pool.map( solve_for_omegaratio , list_ind)
    displist = list(dispersion)
    t1_proc = time.process_time()
    t1_wall = time.time()
    pool.close()
    pool.join()
print('num of threads = ' + str(threads))
print('wall clock time = ',t1_wall-t0_wall)
print('processor clock time = ',t1_proc-t0_proc)
print('------------------------------------------------')
Results:
serial execution
wall clock time = 66.1561758518219
processor clock time = 129.16376499999998
------------------------------------------------
num of threads = default
wall clock time = 81.86436200141907
processor clock time = 263.45369
------------------------------------------------
num of threads = 4
wall clock time = 77.63390111923218
processor clock time = 260.66285300000004
------------------------------------------------
Because Python has a GIL (https://wiki.python.org/moin/GlobalInterpreterLock), "python-native" threads can't execute truly concurrently and thus can't improve the performance of CPU-bound tasks like math. They can be used to parallelize I/O-bound tasks effectively (e.g. API calls which spend almost all their time waiting for network I/O). Using multiprocessing's process-backed Pool rather than multiprocessing.dummy's thread-backed implementation will create multiple processes, not threads, which will be able to run concurrently (at the cost of significant memory overhead).
I use minimize from the Scipy module on Python 3.4, specifically:
# SLSQP run with an analytic gradient, per-variable bounds, and a callback
# that is invoked by the optimizer.
resultats = minimize(
    margin_rate,
    iniprices,
    method='SLSQP',
    jac=margin_rate_deriv,
    bounds=pricebounds,
    options={'disp': True, 'maxiter': 2000},
    callback=iter_report_margin_rate,
)
The maximum number of iterations can be set (as above), but is there a way to tell minimize to stop searching for a solution after a given set time? I looked at the general options of minimize as well as the specific options of the SLSQP solver, but could not work it out.
Thanks
You can use the callback argument to raise a warning or exception if the execution time exceeds some threshold:
import numpy as np
from scipy.optimize import minimize, rosen
import time
import warnings
class TookTooLong(Warning):
    """Warning category used when the optimization exceeds its time limit."""
class MinimizeStopper(object):
    """Optimizer callback that warns once a time budget has been used up.

    NOTE(review): ``warnings.warn`` only raises when the warning filter for
    ``TookTooLong`` is set to "error"; with the default filters the caller
    keeps running after the warning. Confirm that is the intended behavior.
    """

    def __init__(self, max_sec=60):
        """Start the clock; ``max_sec`` is the time budget in seconds."""
        self.max_sec = max_sec
        self.start = time.time()

    def __call__(self, xk=None):
        """Report elapsed time, or warn if the budget is exceeded.

        ``xk`` is accepted for the callback signature and is not used.
        """
        elapsed = time.time() - self.start
        if elapsed <= self.max_sec:
            # you might want to report other stuff here
            print("Elapsed: %.3f sec" % elapsed)
            return
        warnings.warn("Terminating optimization: time limit reached",
                      TookTooLong)
# example usage: the stopper's clock starts when it is constructed
x0 = [1.3, 0.7, 0.8, 1.9, 1.2]
stopper = MinimizeStopper(1E-3)
res = minimize(rosen, x0, method='Nelder-Mead', callback=stopper)
No. What you can do is start the optimizer in a separate process, keep track of how long it has been running and terminate it if necessary:
from __future__ import print_function
from multiprocessing import Process, Queue
import time
import random
def f(param, queue, sleep_range=(1, 10)):
    """Stand-in for the minimization: sleep, then put a result on ``queue``.

    Args:
        param: Number that is multiplied by the sleep duration.
        queue: Object with a ``put`` method that receives the result.
        sleep_range: Inclusive ``(low, high)`` bounds, in whole seconds, of
            the random sleep. The default matches the original 1 to 10.
    """
    #do the minimization and add result to queue
    #res = minimize(param)
    #queue.put(res)
    #to make this a working example I'll just sleep
    #a random amount of time
    sleep_amount = random.randint(*sleep_range)
    time.sleep(sleep_amount)
    res = param*sleep_amount
    queue.put(res)


# The guard keeps child processes from re-running this driver when the
# module is imported by a spawned worker.
if __name__ == "__main__":
    q = Queue()
    p = Process(target=f, args=(2.2, q))
    max_time = 3
    p.start()
    # Wait at most max_time seconds for the worker to finish.
    p.join(timeout=max_time)
    if p.is_alive():
        #process didn't finish in time so we terminate it
        p.terminate()
        # Reap the terminated child.
        p.join()
        result = None
    else:
        # NOTE(review): joining before get() is fine for a small result like
        # this one; a large queued object should be read before joining.
        result = q.get()
    print(result)