Alter dictionary values within Process pool in python - python

I'm trying to alter a dictionary in python inside a process pool environment, but the dictionary isn't changed when the pool finishes.
Here's a minimal example of the problem (the output batch_input is all zeros, although inside per_batch_build it changes the relevant values)
from multiprocessing import Pool, freeze_support
import numpy as np
import itertools
def test_process():
    """Run ``per_batch_build`` once per batch index in a 4-worker pool.

    Returns:
        tuple: ``(batch_input, batch_model_dist)`` as held by the parent
        process.

    NOTE: each task receives its own copy of the arrays, so writes made
    inside the workers do not reach the objects returned here; they stay
    all zeros. That is the behaviour this example is meant to show.
    """
    batch_size = 2
    batch_input = {
        'part_evecs': np.zeros((2, 10, 10)),
        'model_evecs': np.zeros((2, 10, 10)),
    }
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    try:
        # zip() stops at the shortest iterable (the index range), so the
        # endless repeat() iterators are consumed exactly batch_size times.
        # zip works on both Python 2 and 3, unlike itertools.izip.
        pool.map(per_batch_build, zip(itertools.repeat(batch_input),
                                      itertools.repeat(batch_model_dist),
                                      range(batch_size)))
    finally:
        pool.close()
        pool.join()
    return batch_input, batch_model_dist
# #profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    """Fill one batch slot of every array with ones, in place.

    Args:
        tuple_input: ``(batch_input, batch_model_dist, i_batch)``.
            ``batch_input`` maps ``'part_evecs'`` and ``'model_evecs'`` to
            arrays indexable by ``i_batch``; ``batch_model_dist`` is an
            array indexable by ``i_batch``.

    Returns:
        None. The arrays it was handed are modified in place.
    """
    batch_input, batch_model_dist, i_batch = tuple_input
    # A scalar broadcasts over the slot whatever its shape, so the function
    # is no longer tied to 10x10 slots.
    batch_model_dist[i_batch] = 1
    batch_input['part_evecs'][i_batch] = 1
    batch_input['model_evecs'][i_batch] = 1
Unfortunately, batch_input, batch_model_dist and batch_part_dist are all still zeros after the pool finishes, even though printing batch_input inside per_batch_build shows non-zero values.
Using the solutions provided from previous discussions, the result stays the same (the output arrays are all zeros)
from multiprocessing import Pool, freeze_support, Manager, Array
import numpy as np
import itertools
import ctypes
def test_process():
    """Variant of the experiment using a Manager dict and a shared ``Array``.

    Returns:
        tuple: ``(batch_input, batch_model_dist)`` as held by the parent
        process.

    NOTE: this still reproduces the reported behaviour (all zeros on
    return). ``batch_model_dist`` is an ordinary array, and the NumPy view
    stored in the manager dict is presumably transferred by value rather
    than as shared memory -- confirm against the multiprocessing docs.
    """
    manager = Manager()
    shared_array_base = Array(ctypes.c_double, [0] * (2 * 10 * 10))
    shared_array = np.ctypeslib.as_array(shared_array_base.get_obj())
    shared_array = shared_array.reshape((2, 10, 10))
    batch_size = 2
    batch_input = manager.dict({
        'part_evecs': shared_array,
        # 'model_evecs': np.zeros((2, 10, 10)),
    })
    batch_model_dist = np.zeros((2, 10, 10))
    pool = Pool(4)
    try:
        # zip() stops at the shortest iterable (the index range); it works
        # on both Python 2 and 3, unlike itertools.izip.
        pool.map(per_batch_build, zip(itertools.repeat(batch_input),
                                      itertools.repeat(batch_model_dist),
                                      range(batch_size)))
    finally:
        pool.close()
        pool.join()
    return batch_input, batch_model_dist
# #profile
# def per_batch_build(batch_input, batch_model_dist, batch_part_dist, dataset, i_batch):
def per_batch_build(tuple_input):
    """Fill one batch slot of the distance and ``part_evecs`` arrays with ones.

    Args:
        tuple_input: ``(batch_input, batch_model_dist, i_batch)``.
            ``batch_input`` maps ``'part_evecs'`` to an array indexable by
            ``i_batch``; ``batch_model_dist`` is an array indexable by
            ``i_batch``.

    Returns:
        None. The arrays it was handed are modified in place.
    """
    batch_input, batch_model_dist, i_batch = tuple_input
    # A scalar broadcasts over the slot whatever its shape.
    batch_model_dist[i_batch] = 1
    batch_input['part_evecs'][i_batch] = 1
    # batch_input['model_evecs'][i_batch] = np.ones((10,10))

You are changing a copy of the object created inside per_batch_build. You are naming them identically in both functions so it may be confusing.
Add
print(id(batch_model_dist))
inside both functions and see for yourself.
[Edit]
I should probably also link a related answer, for example:
Is shared readonly data copied to different processes for multiprocessing?

Related

nvtx markers with Python Multiprocessing

I'm trying to use nvtx markers along with multiprocessing pool in Python, but when only a child process calls an annotated function the operation doesn't appear in the profiling report. Is there any way to get around this, or is this a limitation of python processes? Here's some example code to replicate:
import os
import time
from multiprocessing import Pool, shared_memory
import numpy as np
import nvtx
N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)
@nvtx.annotate(color="red")
def create_shm_array(signal):
    """Copy ``signal`` into a newly created shared-memory segment.

    Args:
        signal: NumPy array whose bytes are published to other processes.

    Returns:
        The ``SharedMemory`` handle. Its ``name`` lets other processes
        attach; the caller is responsible for closing and unlinking it.
    """
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm
def worker(shm_name):
    """Attach to the shared signal by name and run ``expensive_op`` on it.

    Args:
        shm_name: Name of an existing shared-memory segment holding
            ``N_SAMPLES`` complex values.

    Returns:
        Whatever ``expensive_op`` returns for the shared signal.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    result = expensive_op(sig)
    # Drop the array view first: the mapping cannot be closed while a
    # buffer export of shm.buf is still alive.
    del sig
    shm.close()
    return result
@nvtx.annotate(color="blue")
def expensive_op(sig):
    """Stand-in for a slow computation: sleep two seconds, then sum ``sig``."""
    time.sleep(2)
    return np.sum(sig)
def clean_shm(shm_name):
    """Attach to the named shared-memory segment, then close and unlink it.

    Args:
        shm_name: Name of the segment to destroy.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    shm.close()
    shm.unlink()
if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    # One call from the parent process, for comparison with the workers.
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    try:
        with Pool(os.cpu_count()) as p:
            p.map(worker, [shared_mem.name] * 2)
    finally:
        # Release the parent's own mapping, then destroy the segment even
        # if the pool raised.
        shared_mem.close()
        clean_shm(shared_mem.name)
Here's the Nvidia Nsight Systems Timeline. The Marker appears during the first call from the parent process, but does not appear when called by the child processes
By default, python multiprocessing forks new processes. We need it to spawn them. Working code below.
import os
import time
from multiprocessing import Pool, shared_memory, get_context
import numpy as np
import nvtx
N_SAMPLES = int(1e6)
SIGNAL = np.random.randn(N_SAMPLES) + 1j * np.random.randn(N_SAMPLES)
@nvtx.annotate(color="red")
def create_shm_array(signal):
    """Copy ``signal`` into a newly created shared-memory segment.

    Args:
        signal: NumPy array whose bytes are published to other processes.

    Returns:
        The ``SharedMemory`` handle. Its ``name`` lets other processes
        attach; the caller is responsible for closing and unlinking it.
    """
    # Store the signal in shared memory to share across processes
    shm = shared_memory.SharedMemory(create=True, size=signal.nbytes)
    shared_array = np.ndarray(signal.shape, dtype=signal.dtype, buffer=shm.buf)
    shared_array[:] = signal[:]
    return shm
def worker(shm_name):
    """Attach to the shared signal by name and run ``expensive_op`` on it.

    Args:
        shm_name: Name of an existing shared-memory segment holding
            ``N_SAMPLES`` complex values.

    Returns:
        Whatever ``expensive_op`` returns for the shared signal.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    sig = np.ndarray((N_SAMPLES,), dtype=complex, buffer=shm.buf)
    result = expensive_op(sig)
    # Drop the array view first: the mapping cannot be closed while a
    # buffer export of shm.buf is still alive.
    del sig
    shm.close()
    return result
@nvtx.annotate(color="blue")
def expensive_op(sig):
    """Stand-in for a slow computation: sleep two seconds, then sum ``sig``."""
    time.sleep(2)
    return np.sum(sig)
def clean_shm(shm_name):
    """Attach to the named shared-memory segment, then close and unlink it.

    Args:
        shm_name: Name of the segment to destroy.
    """
    shm = shared_memory.SharedMemory(name=shm_name)
    shm.close()
    shm.unlink()
if __name__ == "__main__":
    print(f"Total num_bytes: {SIGNAL.nbytes} B | {SIGNAL.nbytes / 1e9} GB")
    # One call from the parent process, for comparison with the workers.
    test = np.random.randn(10)
    expensive_op(test)
    shared_mem = create_shm_array(SIGNAL)
    try:
        # Workers are spawned rather than forked; this is the change that
        # makes the annotations from child processes show up.
        with get_context("spawn").Pool(os.cpu_count()) as p:
            p.map(worker, [shared_mem.name] * 2)
    finally:
        # Release the parent's own mapping, then destroy the segment even
        # if the pool raised.
        shared_mem.close()
        clean_shm(shared_mem.name)

Python concurrent.futures.ProcessPoolExecutor() not executing methods inside objects

I am trying to execute methods from two objects concurrently for a computer vision task. My idea is to use two different feature detectors to compute their respective feature descriptions inside a base class.
In this regard, I built the following toy example to understand python concurrent.futures.ProcessPoolExecutor class.
When executed, the first part of the code runs as expected with 20 Heartbeat (10 from each method executed 10 times in total) strings printed out with the sum for two objects coming out correctly as 100, -100.
But in the second half of the code, it appears the ProcessPoolExecutor is not running the do_math(self, numx) method at all. What am I doing wrong here?
With best,
Azmyin
import numpy as np
import concurrent.futures as cf
import time
def current_milli_time():
    """Return the current time as a tick in milliseconds.

    The value is ``time.time()`` scaled to milliseconds and rounded to the
    nearest integer.
    """
    return round(time.time() * 1000)
class masterClass(object):
    """Toy accumulator used to compare serial and process-pool execution."""

    super_multiplier = 1  # Class variable

    def __init__(self, ls):
        """Initialise from ``ls``: ``ls[0]`` is stored as ``var1`` and
        ``ls[1]`` as the starting value of ``sumx``."""
        self.var1 = ls[0]
        self.sumx = ls[1]

    def __rep__(self):
        """Print the current ``sumx`` (called explicitly; not ``__repr__``)."""
        print(f"sumx value -- {self.sumx}")

    def apply_sup_mult(self, var_in):
        """Add ``var_in * super_multiplier`` to ``sumx``, then sleep 25 ms
        and print a heartbeat line."""
        self.sumx = self.sumx + (var_in * masterClass.super_multiplier)
        time.sleep(0.025)
        print("Hearbeat!!")

    # This is a regular method
    def do_math(self, numx):
        """Apply ``numx`` to the running sum and return the updated ``sumx``.

        Returning the value lets a caller that runs this method in another
        process read the result from the future, since attribute changes
        made there do not reach the caller's instance.
        """
        self.apply_sup_mult(numx)
        return self.sumx
if __name__ == "__main__":
    # The guard matters for the process pool: with the "spawn" start method
    # each worker re-imports this module, and without the guard it would
    # re-run the whole script instead of just picking up the definitions.
    ls = [10, 0]
    ls2 = [-10, 0]
    numx = 10
    obj1 = masterClass(ls)
    obj2 = masterClass(ls2)
    t1 = current_milli_time()
    # Run methods one by one
    for _ in range(numx):
        obj1.do_math(ls[0])
        obj2.do_math(ls2[0])
    obj1.__rep__()
    obj2.__rep__()
    t2 = current_milli_time()
    print(f"Time taken -- {t2 - t1} ms")
    print()

    ## Using multiprocessing to concurrently run two methods
    # Intentionally reinitialize objects (the second one must be obj2)
    obj1 = masterClass(ls)
    obj2 = masterClass(ls2)
    t1 = current_milli_time()
    resx = []
    with cf.ProcessPoolExecutor() as executor:
        for _ in range(numx):
            resx.append(executor.submit(obj1.do_math, ls[0]))
            resx.append(executor.submit(obj2.do_math, ls2[0]))
        # Wait for every task; result() re-raises any exception from a
        # worker instead of letting it pass silently.
        for future in cf.as_completed(resx):
            future.result()
    # State of sumx. Each submitted bound method runs on a copy of the
    # object in a worker process, so obj1/obj2 here keep their initial sumx.
    obj1.__rep__()
    obj2.__rep__()
    t2 = current_milli_time()
    print(f"Time taken -- {t2 - t1} ms")

xarray: Larger than memory array using map_blocks dumping results into .zarr store

I am trying to parallelize an operation that generates a very large numpy array and usually blows up the memory of a machine that is running it.
What I came up with is the following workflow:
Use Dask to generate a lazy zero filled array
Use X-Array to generate a DataArray, using the previous lazy zero array with its appropriate coordinates etc...
Using DataArray.map_blocks I call on a function write_values that gets a subset of a Numpy array from a separate file and then insert them into the appropriate location in the xarray.DataArray.
Lazily convert to xarray.Dataset with a name for the DataArray
Then I attempt to store into disk via to_zarr
First: Is this the appropriate way to handle an operation that loops through the blocks in a chunked array?
Second: When I run this program, it executes while blowing up my memory, this could be due to the amount of tasks created via Dask? How can I optimize to never hit the memory limit of my machine.
Third: After this code runs, I get a zarr store written to disk, but it does not seem to contain the values I get from the external function. Is this the right way to change values in the disk-stored array?
Problem: My function that writes the .zarr into disk, does not write the values from the numpy_returning_volume. I am thinking that it could be that I need to write the values while in the map_blocks function?
Thank you!
Fully working example:
import dask.array as da
import xarray as xr
import numpy as np
import pathlib
from dask.diagnostics import ProgressBar
class NumpyReturningVolume():
    """In-memory 3-D volume of random floats that serves sub-blocks."""

    def __init__(self, shape=(500, 1000, 100)):
        """Create the volume.

        Args:
            shape: Three-element shape of the random volume. Defaults to
                the size used in the example.
        """
        # self.data = da.random.random_sample([50000, 50000, 50000])
        self.data = np.random.random_sample(shape)

    def num_i(self):
        """Return the length of the first axis."""
        return self.data.shape[0]

    def num_j(self):
        """Return the length of the second axis."""
        return self.data.shape[1]

    def num_k(self):
        """Return the length of the third axis."""
        return self.data.shape[2]

    def get_float(self, start_coordinate, end_coordinate):
        """Return the sub-block ``[start, end)`` along each of the 3 axes.

        Args:
            start_coordinate: Inclusive start index per axis.
            end_coordinate: Exclusive end index per axis.
        """
        i0, j0, k0 = start_coordinate[0], start_coordinate[1], start_coordinate[2]
        i1, j1, k1 = end_coordinate[0], end_coordinate[1], end_coordinate[2]
        return self.data[i0:i1, j0:j1, k0:k1]
def write_values(chunk, **kwargs):
    """Replace a chunk's data with the matching block of the source volume.

    Args:
        chunk: Block with ``track``, ``bin`` and ``time`` coordinates; the
            first and last value of each coordinate give the block bounds.
        **kwargs: Must contain ``volume``, an object whose
            ``get_float(start, end)`` returns the data for those bounds.

    Returns:
        The same ``chunk`` object with its ``data`` replaced.
    """
    axes = ("track", "bin", "time")
    start_coordinate = tuple(chunk.coords[axis].values[0] for axis in axes)
    # +1 turns the last coordinate value into an exclusive end bound.
    end_coordinate = tuple(chunk.coords[axis].values[-1] + 1 for axis in axes)
    volume_data = kwargs["volume"].get_float(start_coordinate, end_coordinate)
    chunk.data = volume_data
    return chunk
seismic_file_path = pathlib.Path("./")
seismic_file_name = "TEST_FILE.ds"
store_path = seismic_file_path.parent.joinpath(
    seismic_file_name + "_test.zarr")
numpy_returning_volume = NumpyReturningVolume()
dimensions = ('track', 'bin', 'time')
track_coords = np.arange(0, numpy_returning_volume.num_i(), 1, dtype=np.uint32)
bin_coords = np.arange(0, numpy_returning_volume.num_j(), 1, dtype=np.uint32)
time_coords = np.arange(0, numpy_returning_volume.num_k(), 1, dtype=np.uint32)
# Lazy placeholder with the same shape as the source volume.
empty_arr = da.empty(
    shape=(
        numpy_returning_volume.num_i(),
        numpy_returning_volume.num_j(),
        numpy_returning_volume.num_k(),
    ),
    dtype=np.float32)
xarray_data = xr.DataArray(
    empty_arr,
    name="seis",
    coords={'track': track_coords, 'bin': bin_coords, 'time': time_coords},
    dims=dimensions)
# NOTE: the result of map_blocks is computed here and then thrown away, so
# the dataset written below still wraps the empty placeholder. This is the
# problem the question describes.
xarray_data.map_blocks(
    write_values,
    kwargs={"volume": numpy_returning_volume},
    template=xarray_data).compute()
xarray_data = xarray_data.to_dataset(name="seis")
delayed_results = xarray_data.to_zarr(str(store_path), compute=False)
with ProgressBar():
    delayed_results.compute()
OMG! I just realized that my problem was the simplest thing in the world! I just needed to set a variable equal to the result of map blocks and everything works. Here is the complete working script if anyone is interested. It generates a 6GB dataset though
import dask.array as da
import xarray as xr
import numpy as np
import pathlib
from dask.diagnostics import ProgressBar
class NumpyReturningVolume():
    """Lazy 3-D volume of random floats that serves computed sub-blocks."""

    def __init__(self, shape=(1000, 2000, 1000)):
        """Create the volume.

        Args:
            shape: Three-element shape of the random volume. Defaults to
                the size used in the example.
        """
        self.data = da.random.random_sample(shape)
        # self.data = np.random.random_sample([500, 1000, 100])

    def num_i(self):
        """Return the length of the first axis."""
        return self.data.shape[0]

    def num_j(self):
        """Return the length of the second axis."""
        return self.data.shape[1]

    def num_k(self):
        """Return the length of the third axis."""
        return self.data.shape[2]

    def get_float(self, start_coordinate, end_coordinate):
        """Return the computed sub-block ``[start, end)`` along each axis.

        Args:
            start_coordinate: Inclusive start index per axis.
            end_coordinate: Exclusive end index per axis.
        """
        i0, j0, k0 = start_coordinate[0], start_coordinate[1], start_coordinate[2]
        i1, j1, k1 = end_coordinate[0], end_coordinate[1], end_coordinate[2]
        # Only the requested block is materialised.
        return self.data[i0:i1, j0:j1, k0:k1].compute()
def write_values(chunk, **kwargs):
    """Replace a chunk's data with the matching block of the source volume.

    Args:
        chunk: Block with ``track``, ``bin`` and ``time`` coordinates; the
            first and last value of each coordinate give the block bounds.
        **kwargs: Must contain ``volume``, an object whose
            ``get_float(start, end)`` returns the data for those bounds.

    Returns:
        The same ``chunk`` object with its ``data`` replaced.
    """
    axes = ("track", "bin", "time")
    start_coordinate = tuple(chunk.coords[axis].values[0] for axis in axes)
    # +1 turns the last coordinate value into an exclusive end bound.
    end_coordinate = tuple(chunk.coords[axis].values[-1] + 1 for axis in axes)
    volume_data = kwargs["volume"].get_float(start_coordinate, end_coordinate)
    chunk.data = volume_data
    return chunk
seismic_file_path = pathlib.Path("./")
seismic_file_name = "TEST_FILE.ds"
store_path = seismic_file_path.parent.joinpath(
    seismic_file_name + "_test.zarr")
numpy_returning_volume = NumpyReturningVolume()
dimensions = ('track', 'bin', 'time')
track_coords = np.arange(0, numpy_returning_volume.num_i(), 1, dtype=np.uint32)
bin_coords = np.arange(0, numpy_returning_volume.num_j(), 1, dtype=np.uint32)
time_coords = np.arange(0, numpy_returning_volume.num_k(), 1, dtype=np.uint32)
# Lazy placeholder with the same shape as the source volume.
empty_arr = da.empty(
    shape=(
        numpy_returning_volume.num_i(),
        numpy_returning_volume.num_j(),
        numpy_returning_volume.num_k(),
    ),
    dtype=np.float32)
xarray_data = xr.DataArray(
    empty_arr,
    name="seis",
    coords={'track': track_coords, 'bin': bin_coords, 'time': time_coords},
    dims=dimensions)
# This xarray_data = is what I was missing!!
# map_blocks returns a new lazy array; nothing is computed until the store
# is written below.
xarray_data = xarray_data.map_blocks(
    write_values,
    kwargs={"volume": numpy_returning_volume},
    template=xarray_data)
xarray_data = xarray_data.to_dataset(name="seis")
delayed_results = xarray_data.to_zarr(str(store_path), compute=False)
with ProgressBar():
    delayed_results.compute()

Python multiprocessing with shared RawArray

I want to have multiple processes read from a different row of a numpy array in parallel to speed things up. However, when I run the following code, the first process to reach func throws an error as if var is no longer in scope. Why is this happening?
import numpy as np
import multiprocessing as mp
num_procs = 16
num_points = 2500000
def init_worker(X):
    """Pool initializer: store the shared buffer in the module global ``var``.

    Args:
        X: The shared buffer that ``func`` later reads through ``var``.
    """
    global var
    var = X
def func(proc):
    """Read every element of row ``proc`` from the shared buffer.

    Relies on the module global ``var`` having been set by ``init_worker``
    in the process where this runs.

    Args:
        proc: Row index into the ``(num_procs, num_points)`` view.
    """
    X_np = np.frombuffer(var).reshape((num_procs, num_points))
    row = X_np[proc]
    for y in range(num_points):
        z = row[y]
if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # Shared, lock-free buffer of doubles, filled through a NumPy view.
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    pool = mp.Pool(processes=4, initializer=init_worker, initargs=(X,))
    for proc in range(num_procs):
        # NOTE: func(proc) is evaluated right here in the parent process,
        # where init_worker never ran, which produces the NameError shown
        # in the traceback. Left as posted; the form that dispatches to a
        # worker is pool.apply_async(func, (proc,)).
        pool.apply_async(func(proc))
    pool.close()
    pool.join()
Traceback (most recent call last):
File "parallel_test.py", line 26, in <module>
pool.apply_async(func(proc))
File "parallel_test.py", line 13, in func
X_np = np.frombuffer(var).reshape((num_procs, num_points))
NameError: global name 'var' is not defined
Update:
For some reason, if I use Pool.map instead of the for loop with Pool.apply_async, it seems to work. I don’t understand why though.
Any reason to not declare X as global in the top-level scope? This eliminates the NameError.
import numpy as np
import multiprocessing as mp
num_procs = 16
num_points = 25000000
def func(proc):
    """Read every element of row ``proc`` from the shared buffer ``X``.

    Expects a module global ``X`` holding the shared buffer in the process
    where this runs.

    Args:
        proc: Row index into the ``(num_procs, num_points)`` view.
    """
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    row = X_np[proc]
    for y in range(num_points):
        z = row[y]
def _init_worker(shared):
    """Pool initializer: publish the shared buffer as the worker's global X."""
    global X
    X = shared


if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # Assigning at module level already makes X a global; no `global`
    # statement is needed here.
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    # The initializer hands X to every worker explicitly, so func finds it
    # whether workers are forked or spawned.
    pool = mp.Pool(processes=4, initializer=_init_worker, initargs=(X,))
    # Pass the callable and its arguments separately: apply_async(func(proc))
    # would run func in the parent and submit its return value instead.
    pending = [pool.apply_async(func, (proc,)) for proc in range(num_procs)]
    pool.close()
    pool.join()
    for result in pending:
        result.get()  # re-raises any exception from the worker
When I run a reduced instance of this problem, n=20:
import numpy as np
import multiprocessing as mp
num_procs = 4
num_points = 5
def func(proc):
    """Read every element of row ``proc`` from the shared buffer ``X``.

    Expects a module global ``X`` holding the shared buffer in the process
    where this runs.

    Args:
        proc: Row index into the ``(num_procs, num_points)`` view.
    """
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    row = X_np[proc]
    for y in range(num_points):
        z = row[y]
def _init_worker(shared):
    """Pool initializer: publish the shared buffer as the worker's global X."""
    global X
    X = shared


if __name__ == '__main__':
    data = np.random.randn(num_procs, num_points)
    # Assigning at module level already makes X a global; no `global`
    # statement is needed here.
    X = mp.RawArray('d', num_procs*num_points)
    X_np = np.frombuffer(X).reshape((num_procs, num_points))
    np.copyto(X_np, data)
    # The initializer hands X to every worker explicitly, so func finds it
    # whether workers are forked or spawned.
    pool = mp.Pool(processes=4, initializer=_init_worker, initargs=(X,))
    # Pass the callable and its arguments separately: apply_async(func(proc))
    # would run func in the parent and submit its return value instead.
    pending = [pool.apply_async(func, (proc,)) for proc in range(num_procs)]
    pool.close()
    pool.join()
    for result in pending:
        result.get()  # re-raises any exception from the worker
    print("\n".join(map(str, X)))
I get the following output:
-0.6346037804619162
1.1005724710066107
0.33458763357165255
0.6409345714971889
0.7124888766851982
0.36760459213332963
0.23593304931386933
-0.8668969562941349
-0.8842756219923469
0.005979036105620422
1.386422154089567
-0.8770988782214508
0.25187448339771057
-0.2473967968471952
-0.4909708883978521
0.5423521489750244
0.018749603867333802
0.035304792504378055
1.3263872668956616
1.0199839603892742
You haven't provided a sample of the expected output. Does this look similar to what you expect?

How to accumulate results from pool.apply_async call?

I want to make calls to pool.apply_async(func) and accumulate the results as soon as they are available without waiting for each other.
import multiprocessing
import numpy as np
chrNames=['chr1','chr2','chr3']
sims=[1,2,3]
def accumulate_chrBased_simBased_result(chrBased_simBased_result,accumulatedSignalArray,accumulatedCountArray):
    """Add one result's arrays onto the running totals, in place.

    Args:
        chrBased_simBased_result: Sequence whose first item is the signal
            array and whose second item is the count array.
        accumulatedSignalArray: Running signal total; updated in place.
        accumulatedCountArray: Running count total; updated in place.
    """
    signalArray = chrBased_simBased_result[0]
    countArray = chrBased_simBased_result[1]
    accumulatedSignalArray += signalArray
    accumulatedCountArray += countArray
def func(chrName,simNum):
    """Build the signal and count arrays for one chromosome/simulation pair.

    Args:
        chrName: Chromosome label; only used in the printed progress line.
        simNum: Simulation number; every array element is set to it.

    Returns:
        list: ``[signal_array, count_array]``, each of length 10000, with
        float and int dtype respectively.
    """
    print('%s %d' %(chrName,simNum))
    signal_array = np.full((10000,), simNum, dtype=float)
    count_array = np.full((10000,), simNum, dtype=int)
    return [signal_array, count_array]
if __name__ == '__main__':
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    for chrName in chrNames:
        for simNum in sims:
            result = pool.apply_async(func, (chrName,simNum,))
            # NOTE: result.get() blocks until this task has finished, so
            # the next task is not submitted until then and the calls run
            # one after another. This is the behaviour the question asks
            # about.
            accumulate_chrBased_simBased_result(result.get(),accumulatedSignalArray,accumulatedCountArray)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)
In this way, each pool.apply_async call waits for other call to end.
Is there a way to get rid of this waiting for each other?
You are using result.get() on each iteration, and making the main process wait for the function to be ready in doing so.
Please find below a working version, with prints showing that accumulation is done when "func" is ready, and adding random sleeps to ensure sizable execution time differences.
import multiprocessing
import numpy as np
from time import time, sleep
from random import random
chrNames=['chr1','chr2','chr3']
sims=[1,2,3]
def accumulate_chrBased_simBased_result(chrBased_simBased_result,accumulatedSignalArray,accumulatedCountArray):
    """Add one result's arrays onto the running totals, in place.

    Args:
        chrBased_simBased_result: Sequence whose first item is the signal
            array and whose second item is the count array.
        accumulatedSignalArray: Running signal total; updated in place.
        accumulatedCountArray: Running count total; updated in place.
    """
    signalArray = chrBased_simBased_result[0]
    countArray = chrBased_simBased_result[1]
    accumulatedSignalArray += signalArray
    accumulatedCountArray += countArray
def func(chrName,simNum,max_delay=5):
    """Build the signal and count arrays after a random pause.

    The pause makes tasks finish at visibly different times.

    Args:
        chrName: Chromosome label; only used in the printed progress line.
        simNum: Simulation number; every array element is set to it.
        max_delay: Upper bound, in seconds, of the random pause. Defaults
            to 5; pass 0 to skip the wait.

    Returns:
        list: ``[signal_array, count_array]``, each of length 10000, with
        float and int dtype respectively.
    """
    sleep(random()*max_delay)
    signal_array = np.full((10000,), simNum, dtype=float)
    count_array = np.full((10000,), simNum, dtype=int)
    print('%s %d' %(chrName,simNum))
    return [signal_array, count_array]
if __name__ == '__main__':
    accumulatedSignalArray = np.zeros((10000,), dtype=float)
    accumulatedCountArray = np.zeros((10000,), dtype=int)
    numofProcesses = multiprocessing.cpu_count()
    pool = multiprocessing.Pool(numofProcesses)
    # Submit everything up front so the tasks run concurrently.
    results = []
    for chrName in chrNames:
        for simNum in sims:
            results.append(pool.apply_async(func, (chrName,simNum,)))
    for i in results:
        print(i)
    # Sweep the pending results and accumulate whichever ones are ready.
    while results:
        # Iterate over a copy because finished entries are removed.
        for r in results[:]:
            if r.ready():
                print('{} is ready'.format(r))
                accumulate_chrBased_simBased_result(r.get(),accumulatedSignalArray,accumulatedCountArray)
                results.remove(r)
        # Short pause between sweeps so the loop does not spin a CPU core
        # while the workers are still busy.
        sleep(0.01)
    pool.close()
    pool.join()
    print(accumulatedSignalArray)
    print(accumulatedCountArray)

Categories

Resources