Python multiprocessing with shared RawArray

I want multiple processes to each read a different row of a numpy array in parallel, to speed things up. However, when I run the following code, the first process to reach func throws an error as if var were no longer in scope. Why is this happening?
import numpy as np
import multiprocessing as mp

num_procs = 16
num_points = 2500000

def init_worker(X):
    global var
    var = X

def func(proc):
    X_np = np.frombuffer(var).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4, initializer=init_worker, initargs=(X,))
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
Traceback (most recent call last):
  File "parallel_test.py", line 26, in <module>
    pool.apply_async(func(proc))
  File "parallel_test.py", line 13, in func
    X_np = np.frombuffer(var).reshape((num_procs, num_points))
NameError: global name 'var' is not defined
Update:
For some reason, if I use Pool.map instead of the for loop with Pool.apply_async, it seems to work. I don’t understand why though.

Any reason to not declare X as global in the top-level scope? This eliminates the NameError.
import numpy as np
import multiprocessing as mp

num_procs = 16
num_points = 25000000

def func(proc):
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4)
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
When I run a reduced instance of this problem, n=20:
import numpy as np
import multiprocessing as mp

num_procs = 4
num_points = 5

def func(proc):
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    for y in range(num_points):
        z = X_np[proc][y]

if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    global X
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4)
    for proc in range(num_procs):
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
    print("\n".join(map(str, X)))
I get the following output:
-0.6346037804619162
1.1005724710066107
0.33458763357165255
0.6409345714971889
0.7124888766851982
0.36760459213332963
0.23593304931386933
-0.8668969562941349
-0.8842756219923469
0.005979036105620422
1.386422154089567
-0.8770988782214508
0.25187448339771057
-0.2473967968471952
-0.4909708883978521
0.5423521489750244
0.018749603867333802
0.035304792504378055
1.3263872668956616
1.0199839603892742
You haven't provided a sample of the expected output. Does this look similar to what you expect?
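A note on the Update above: apply_async expects a callable plus an argument tuple, so pool.apply_async(func(proc)) evaluates func(proc) immediately in the parent process, where init_worker never ran and var is therefore undefined; Pool.map, by contrast, ships func itself to the workers. A minimal, self-contained sketch of the intended call shape (smaller sizes, and a returned sum added purely for illustration):

import numpy as np
import multiprocessing as mp

num_procs = 4
num_points = 10

def init_worker(shared):
    # Runs once in each worker; stashes the shared buffer in a module-level global.
    global var
    var = shared

def func(proc):
    X_np = np.frombuffer(var).reshape((num_procs, num_points))
    return X_np[proc].sum()

if __name__ == '__main__':
    X = mp.RawArray('d', num_procs * num_points)
    np.copyto(np.frombuffer(X).reshape((num_procs, num_points)),
              np.random.randn(num_procs, num_points))
    pool = mp.Pool(processes=4, initializer=init_worker, initargs=(X,))
    # Pass the callable and its arguments separately; the call then runs in a
    # worker process where init_worker has already defined var.
    async_results = [pool.apply_async(func, args=(proc,)) for proc in range(num_procs)]
    print([r.get() for r in async_results])
    pool.close()
    pool.join()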

Related

nvtx markers with Python Multiprocessing

I'm trying to use nvtx markers together with a multiprocessing pool in Python, but when an annotated function is called only from a child process, the operation doesn't appear in the profiling report. Is there any way around this, or is this a limitation of Python processes? Here's some example code to reproduce the issue:
import os
import time
from multiprocessing import Pool, shared_memory
import numpy as np
import nvtx

N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)

@nvtx.annotate(color="red")
def create_shm_array(signal):
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm

def worker(shm_name):
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    return expensive_op(sig)

@nvtx.annotate(color="blue")
def expensive_op(sig):
    time.sleep(2)
    return np.sum(sig)

def clean_shm(shm_name):
    shm = shared_memory.SharedMemory(name=shm_name)
    shm.close()
    shm.unlink()

if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    with Pool(os.cpu_count()) as p:
        p.map(worker, [shared_mem.name] * 2)
    clean_shm(shared_mem.name)
Here's the Nvidia Nsight Systems timeline: the marker appears during the first call from the parent process, but does not appear when the function is called by the child processes.
By default (on Linux), Python's multiprocessing forks new processes. We need it to spawn them instead. Working code below.
import os
import time
from multiprocessing import Pool, shared_memory, get_context
import numpy as np
import nvtx

N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)

@nvtx.annotate(color="red")
def create_shm_array(signal):
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm

def worker(shm_name):
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    return expensive_op(sig)

@nvtx.annotate(color="blue")
def expensive_op(sig):
    time.sleep(2)
    return np.sum(sig)

def clean_shm(shm_name):
    shm = shared_memory.SharedMemory(name=shm_name)
    shm.close()
    shm.unlink()

if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    with get_context("spawn").Pool(os.cpu_count()) as p:
        p.map(worker, [shared_mem.name] * 2)
    clean_shm(shared_mem.name)
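An alternative, if restructuring around get_context is inconvenient, is to set the start method globally once before any pool is created. A minimal sketch with a placeholder workload (work stands in for the annotated functions above):

import multiprocessing as mp
import os

def work(x):
    # Placeholder workload; in the real code this would be the nvtx-annotated worker.
    return x * x

if __name__ == "__main__":
    # Make every Pool created afterwards use spawned workers instead of forked ones.
    # set_start_method may only be called once, before any pool or process is started.
    mp.set_start_method("spawn")
    with mp.Pool(os.cpu_count()) as p:
        print(p.map(work, range(4)))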

How to improve file reading speed?

I read data slices from a large file. Reading a 400 MB image took 4 seconds, although the disk can read the whole file in about 1 second, and the program does very little computation. How can I improve the speed?
from opentile import OpenTile
import time
import traceback
import os

os.environ.setdefault('TURBOJPEG', 'C:/lib/')
try:
    tiler = OpenTile.open('svs800.svs')
except:
    traceback.print_exc()
s = tiler.get_level(0)
tile_size = str(s.tiled_size).split("x")
time1 = time.time()

from multiprocessing.pool import ThreadPool

def get_data(s):
    # This function reads a piece of binary data from a certain position of the image
    # and then adds the header data
    return tiler.get_tile(0, 0, 0, (s[0], s[1]))

pool = ThreadPool(5)
y = pool.map(get_data, [(i, j) for i in range(int(tile_size[0])) for j in range(int(tile_size[1]))])
print("tiles", len(y))
time2 = time.time()
print(time2 - time1)  # elapsed seconds
Simple sequential approach:
from opentile import OpenTile
import os
import time

def timer(func):
    def wrapper(*args, **kwargs):
        start = time.perf_counter()
        result = func(*args, **kwargs)
        end = time.perf_counter()
        print(f'{func.__name__} {end-start:.4f}s')
        return result
    return wrapper

os.environ.setdefault('TURBOJPEG', '/opt/libjpeg-turbo')

@timer
def open_svs(filename):
    return OpenTile.open(filename)

@timer
def get_data(tiler, x, y):
    return [tiler.get_tile(0, 0, 0, (x_, y_)) for x_ in range(x) for y_ in range(y)]

tiler = open_svs('18959.svs')
x, y = map(int, str(tiler.get_level(0).tiled_size).split('x'))
data = get_data(tiler, x, y)
assert len(data) == x * y
Output:
open_svs 0.0082s
get_data 0.5843s
Note:
x, y values for this file are 183 and 114 respectively. The file size is 563,271,749 bytes.

Converting from ThreadPool to ProcessPoolExecutor

I have the following code, which I would like to convert from using ThreadPool to ProcessPoolExecutor, since it is all CPU-intensive calculation; when I observe the CPU monitor, I note that my 8-core processor is only using a single thread.
import datetime
from multiprocessing.dummy import Pool as ThreadPool

def thread_run(q, clients_credit_array, clients_terr_array,
               freq_small_list, freq_large_list, clients, year, admin):
    claim_id = []
    claim_client_id = []
    claim_company_id = []
    claim_year = []
    claim_type = []
    claim_closed = []
    claim_cnt = []
    claim_amount = []
    print(datetime.datetime.utcnow())
    i = 0
    client_cnt = 1000
    loop_incr = 8
    while i < client_cnt:
        ind_rng = range(i, min((i + loop_incr), (client_cnt)), 1)
        call_var = []
        for q in ind_rng:
            call_var.append((q,
                             clients_credit_array,
                             clients_terr_array,
                             freq_small_list,
                             freq_large_list,
                             clients,
                             year,
                             admin))
        pool = ThreadPool(len(call_var))
        results = pool.map(call_claim, call_var)
        pool.close()
        pool.join()
        for result in results:
            if result[0] == []:
                pass
            else:
                r = 0
                if r < len(result[0]):
                    claim_index += 1
                    claim_id.append(claim_index)
                    claim_client_id.append(result[0][r])
                    claim_company_id.append(result[1][r])
                    claim_year.append(result[2][r])
                    claim_type.append(result[3][r])
                    claim_closed.append(result[4][r])
                    claim_cnt.append(result[5][r])
                    claim_amount.append(result[6][r])
                    r += 1
        i += loop_incr
    print(datetime.datetime.utcnow())
The difficulty I am having, however, is that when I modify the code as follows, I get error messages:
from concurrent.futures import ProcessPoolExecutor as PThreadPool
pool = PThreadPool(max_workers=len(call_var))
#pool = ThreadPool(len(call_var))
results = pool.map(call_claim, call_var)
#pool.close()
#pool.join()
I had to remove the pool.close() and pool.join() calls because they generated errors. But with them removed, my code was not using the processors in parallel, and it ran much longer and slower than originally. What am I missing?
As was pointed out in the comments, it is common to use an Executor as a context manager, which removes the need for join or close calls. Below is a simplified example to illustrate the concept.
Example:
import concurrent.futures
import random
import time
import os

values = [1, 2, 3, 4, 5]

def times_two(n):
    time.sleep(random.randrange(1, 5))
    print("pid:", os.getpid())
    return n * 2

def main():
    with concurrent.futures.ProcessPoolExecutor() as executor:
        results = executor.map(times_two, values)
        for one_result in results:
            print(one_result)

if __name__ == "__main__":
    main()
Output:
pid: 396
pid: 8904
pid: 25440
pid: 20592
pid: 14636
2
4
6
8
10
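If a with-block does not fit the surrounding code, calling shutdown(wait=True) explicitly plays the role that close() plus join() play for multiprocessing.Pool. A minimal sketch (square is just a stand-in workload):

import concurrent.futures

def square(n):
    return n * n

if __name__ == "__main__":
    executor = concurrent.futures.ProcessPoolExecutor()
    results = list(executor.map(square, range(5)))
    # shutdown(wait=True) blocks until all submitted work has finished,
    # much like pool.close() followed by pool.join().
    executor.shutdown(wait=True)
    print(results)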

How to accumulate results from pool.apply_async call?

I want to make calls to pool.apply_async(func) and accumulate the results as soon as they are available without waiting for each other.
import multiprocessing
import numpy as np

chrNames = ['chr1', 'chr2', 'chr3']
sims = [1, 2, 3]

def accumulate_chrBased_simBased_result(chrBased_simBased_result, accumulatedSignalArray, accumulatedCountArray):
    signalArray = chrBased_simBased_result[0]
    countArray = chrBased_simBased_result[1]
    accumulatedSignalArray += signalArray
    accumulatedCountArray += countArray

def func(chrName, simNum):
    print('%s %d' % (chrName, simNum))
    result = []
    signal_array = np.full((10000,), simNum, dtype=float)
    count_array = np.full((10000,), simNum, dtype=int)
    result.append(signal_array)
    result.append(count_array)
    return result

if __name__ == '__main__':
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    for chrName in chrNames:
        for simNum in sims:
            result = pool.apply_async(func, (chrName, simNum,))
            accumulate_chrBased_simBased_result(result.get(), accumulatedSignalArray, accumulatedCountArray)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)
Written this way, each pool.apply_async call waits for the previous call to finish.
Is there a way to get rid of this waiting?
You are calling result.get() on each iteration, which makes the main process wait for that task to finish before submitting the next one.
Below is a working version, with prints showing that accumulation happens as soon as each func call is ready, and with random sleeps added so the execution times differ noticeably.
import multiprocessing
import numpy as np
from time import time, sleep
from random import random

chrNames = ['chr1', 'chr2', 'chr3']
sims = [1, 2, 3]

def accumulate_chrBased_simBased_result(chrBased_simBased_result, accumulatedSignalArray, accumulatedCountArray):
    signalArray = chrBased_simBased_result[0]
    countArray = chrBased_simBased_result[1]
    accumulatedSignalArray += signalArray
    accumulatedCountArray += countArray

def func(chrName, simNum):
    result = []
    sleep(random() * 5)
    signal_array = np.full((10000,), simNum, dtype=float)
    count_array = np.full((10000,), simNum, dtype=int)
    result.append(signal_array)
    result.append(count_array)
    print('%s %d' % (chrName, simNum))
    return result

if __name__ == '__main__':
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    results = []
    for chrName in chrNames:
        for simNum in sims:
            results.append(pool.apply_async(func, (chrName, simNum,)))
    for i in results:
        print(i)
    while results:
        for r in results[:]:
            if r.ready():
                print('{} is ready'.format(r))
                accumulate_chrBased_simBased_result(r.get(), accumulatedSignalArray, accumulatedCountArray)
                results.remove(r)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)
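An alternative to polling with ready() is the callback argument of apply_async, which hands each result to a function in the main process as soon as it is available. A simplified, self-contained sketch (smaller problem, a single loop variable, and a placeholder worker, purely for illustration):

import multiprocessing
import numpy as np

accumulatedSignalArray = np.zeros((10000,), dtype=float)

def func(simNum):
    # Stand-in for the real per-chromosome, per-simulation work.
    return np.full((10000,), simNum, dtype=float)

def accumulate(signalArray):
    # Called in the main process as soon as each task finishes, so no task
    # has to wait for the others before being accumulated.
    np.add(accumulatedSignalArray, signalArray, out=accumulatedSignalArray)

if __name__ == '__main__':
    pool = multiprocessing.Pool(multiprocessing.cpu_count())
    for simNum in [1, 2, 3]:
        pool.apply_async(func, (simNum,), callback=accumulate)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)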

Alter dictionary values within Process pool in python

I'm trying to alter a dictionary in Python from inside a process pool, but the dictionary isn't changed when the pool finishes.
Here's a minimal example of the problem (the returned batch_input is all zeros, even though per_batch_build changes the relevant values):
from multiprocessing import Pool, freeze_support
import numpy as np
import itertools

def test_process():
    batch_size = 2
    batch_input = {'part_evecs': np.zeros((2, 10, 10)),
                   'model_evecs': np.zeros((2, 10, 10)),
                   }
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    batch_output = pool.map(per_batch_build, itertools.izip(itertools.repeat(batch_input),
                                                            itertools.repeat(batch_model_dist),
                                                            list(range(batch_size))))
    pool.close()
    pool.join()
    return batch_input, batch_model_dist

# @profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    batch_input, batch_model_dist, i_batch = tuple_input
    batch_model_dist[i_batch] = np.ones((10, 10))
    batch_input['part_evecs'][i_batch] = np.ones((10, 10))
    batch_input['model_evecs'][i_batch] = np.ones((10, 10))
Unfortunately batch_input, batch_model_dist and batch_part_dist are all zeros, even though printing batch_input inside per_batch_build shows that it is not zero.
Using the solutions suggested in previous discussions, the result stays the same (the output arrays are all zeros):
from multiprocessing import Pool, freeze_support, Manager, Array
import numpy as np
import itertools
import ctypes

def test_process():
    manager = Manager()
    shared_array_base = Array(ctypes.c_double, [0] * (2 * 10 * 10))
    shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
    shared_array = shared_array.reshape((2, 10, 10))
    batch_size = 2
    batch_input = manager.dict({'part_evecs': shared_array,
                                # 'model_evecs': np.zeros((2, 10, 10)),
                                })
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    batch_output = pool.map(per_batch_build, itertools.izip(itertools.repeat(batch_input),
                                                            itertools.repeat(batch_model_dist),
                                                            list(range(batch_size))))
    pool.close()
    pool.join()
    return batch_input, batch_model_dist

# @profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    batch_input, batch_model_dist, i_batch = tuple_input
    batch_model_dist[i_batch] = np.ones((10, 10))
    batch_input['part_evecs'][i_batch] = np.ones((10, 10))
    # batch_input['model_evecs'][i_batch] = np.ones((10, 10))
You are changing a copy of the object inside per_batch_build. Because you use the same names in both functions, this may be confusing.
Add
print(id(batch_model_dist))
inside both functions and see for yourself.
[Edit]
I should probably also link a related answer, for example:
Is shared readonly data copied to different processes for multiprocessing?
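One common workaround, sketched below with the same shapes as in the question, is to have each worker return its piece and let the parent write the results back into the original objects (the worker body is a placeholder; the names mirror the question's):

from multiprocessing import Pool
import numpy as np

def per_batch_build(i_batch):
    # Each worker builds and returns its own slices; nothing is mutated in place.
    return i_batch, np.ones((10, 10)), np.ones((10, 10)), np.ones((10, 10))

if __name__ == '__main__':
    batch_size = 2
    batch_input = {'part_evecs': np.zeros((batch_size, 10, 10)),
                   'model_evecs': np.zeros((batch_size, 10, 10))}
    batch_model_dist = np.zeros((batch_size, 10, 10))
    with Pool(4) as pool:
        for i_batch, part, model, dist in pool.map(per_batch_build, range(batch_size)):
            # Reassemble in the parent process, where the original objects live.
            batch_input['part_evecs'][i_batch] = part
            batch_input['model_evecs'][i_batch] = model
            batch_model_dist[i_batch] = dist
    print(batch_model_dist.sum())  # now non-zero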
