So I am trying to implement a solution that was already described here, but I am changing it a bit. Instead of just modifying the array with operations, I am trying to read from NetCDF files using xarray and then write to a shared numpy array with the multiprocessing module.
I feel as though I am getting pretty close, but something is going wrong. I have pasted a small, reproducible copy/paste example below. As you can see, when I run the processes, they can all read the files that I created, but they do not correctly update the shared numpy array that I am trying to write to. Any help would be appreciated.
Code
import ctypes
import logging
import multiprocessing as mp
import xarray as xr
from contextlib import closing
import numpy as np

info = mp.get_logger().info

def main():
    data = np.arange(10)
    for i in range(4):
        ds = xr.Dataset({'x': data})
        ds.to_netcdf('test_{}.nc'.format(i))
        ds.close()

    logger = mp.log_to_stderr()
    logger.setLevel(logging.INFO)

    # create shared array
    N, M = 4, 10
    shared_arr = mp.Array(ctypes.c_float, N * M)
    arr = tonumpyarray(shared_arr, dtype=np.float32)
    arr = arr.reshape((N, M))

    # Fill with random values
    arr[:, :] = np.zeros((N, M))
    arr_orig = arr.copy()

    files = ['test_0.nc', 'test_1.nc', 'test_2.nc', 'test_3.nc']

    parameter_tuples = [
        (files[0], 0),
        (files[1], 1),
        (files[2], 2),
        (files[3], 3)
    ]

    # write to arr from different processes
    with closing(mp.Pool(initializer=init, initargs=(shared_arr,))) as p:
        # many processes access different slices of the same array
        p.map_async(g, parameter_tuples)
    p.join()

    print(arr_orig)
    print(tonumpyarray(shared_arr, np.float32).reshape(N, M))

def init(shared_arr_):
    global shared_arr
    shared_arr = shared_arr_  # must be inherited, not passed as an argument

def tonumpyarray(mp_arr, dtype=np.float64):
    return np.frombuffer(mp_arr.get_obj(), dtype)

def g(params):
    """no synchronization."""
    print("Current File Name: ", params[0])
    tmp_dataset = xr.open_dataset(params[0])
    print(tmp_dataset["x"].data[:])
    arr = tonumpyarray(shared_arr)
    arr[params[1], :] = tmp_dataset["x"].data[:]
    tmp_dataset.close()

if __name__ == '__main__':
    mp.freeze_support()
    main()
What's wrong?
1. You forgot to reshape back after tonumpyarray.
2. You used the wrong dtype in tonumpyarray (the shared array holds c_float, but the helper defaulted to np.float64).
Code
import ctypes
import logging
import multiprocessing as mp
import xarray as xr
from contextlib import closing
import numpy as np

info = mp.get_logger().info

def main():
    data = np.arange(10)
    for i in range(4):
        ds = xr.Dataset({'x': data})
        ds.to_netcdf('test_{}.nc'.format(i))
        ds.close()

    logger = mp.log_to_stderr()
    logger.setLevel(logging.INFO)

    # create shared array
    N, M = 4, 10
    shared_arr = mp.Array(ctypes.c_float, N * M)
    arr = tonumpyarray(shared_arr, dtype=np.float32)
    arr = arr.reshape((N, M))

    # Fill with random values
    arr[:, :] = np.zeros((N, M))
    arr_orig = arr.copy()

    files = ['test_0.nc', 'test_1.nc', 'test_2.nc', 'test_3.nc']

    parameter_tuples = [
        (files[0], 0),
        (files[1], 1),
        (files[2], 2),
        (files[3], 3)
    ]

    # write to arr from different processes
    with closing(mp.Pool(initializer=init, initargs=(shared_arr, N, M))) as p:
        # many processes access different slices of the same array
        p.map_async(g, parameter_tuples)
    p.join()

    print(arr_orig)
    print(tonumpyarray(shared_arr, np.float32).reshape(N, M))

def init(shared_arr_, N_, M_):  # add shape
    global shared_arr
    global N, M
    shared_arr = shared_arr_  # must be inherited, not passed as an argument
    N = N_
    M = M_

def tonumpyarray(mp_arr, dtype=np.float32):  # change type
    return np.frombuffer(mp_arr.get_obj(), dtype)

def g(params):
    """no synchronization."""
    print("Current File Name: ", params[0])
    tmp_dataset = xr.open_dataset(params[0])
    print(tmp_dataset["x"].data[:])
    arr = tonumpyarray(shared_arr).reshape(N, M)  # reshape
    arr[params[1], :] = tmp_dataset["x"].data[:]
    tmp_dataset.close()

if __name__ == '__main__':
    mp.freeze_support()
    main()
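As a side note (not part of the original question or answer), here is a minimal sketch of why the wrong dtype silently breaks things in the first version: the shared buffer holds 32-bit floats, so viewing it as float64 reinterprets every pair of floats as one meaningless double and halves the element count.

import ctypes
import multiprocessing as mp
import numpy as np

shared = mp.Array(ctypes.c_float, 4)  # 4 * 32-bit floats = 16 bytes
np.frombuffer(shared.get_obj(), dtype=np.float32)[:] = [1, 2, 3, 4]

print(np.frombuffer(shared.get_obj(), dtype=np.float32))  # [1. 2. 3. 4.]
print(np.frombuffer(shared.get_obj(), dtype=np.float64))  # 2 garbage doubles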
Related
I'm trying to take advantage of multiprocessing in Python, so I ran some tests and found that the multiprocessing code runs much slower than the plain version. What am I doing wrong?
Here is the test script:
import numpy as np
from datetime import datetime
from multiprocessing import Pool

def some_func(argv):
    x = argv[0]
    y = argv[1]
    return np.sum(x * y)

def other_func(argv):
    x = argv[0]
    y = argv[1]
    f1 = np.fft.rfft(x)
    f2 = np.fft.rfft(y)
    CC = np.fft.irfft(f1 * np.conj(f2))
    return CC

N = 20000
X = np.random.randint(0, 10, size=(N, N))
Y = np.random.randint(0, 10, size=(N, N))
output_check = np.zeros(N)

D1 = datetime.now()
for k in range(len(X)):
    output_check[k] = np.max(some_func((X[k], Y[k])))
print('Plain: ', datetime.now() - D1)

output = np.zeros(N)
D1 = datetime.now()
with Pool(10) as pool:  # CPUs
    for ind, res in enumerate(pool.imap(some_func, zip(X, Y), chunksize=1)):
        output[ind] = np.max(res)
    pool.close()
    pool.join()
print('Pool: ', datetime.now() - D1)
Output:
Plain: 0:00:00.904062
Pool: 0:00:15.386251
Why is the difference so big? What is consuming the time?
I have 80 CPUs available and have tried different pool sizes and chunksizes...
The actual function is more complex (like other_func); with it I get almost the same time for the plain and parallel code, but still no speed-up :(
The input is a BIG 3D numpy array, and I need a pairwise convolution of its elements.
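For what it's worth, here is a minimal sketch (not part of the original post; it reuses the names from the snippet above) of where the time is likely going: with chunksize=1, every task pickles one row of X and one row of Y (tens to hundreds of kilobytes at N = 20000) and ships it through a pipe to a worker, while np.sum(x * y) itself takes well under a millisecond, so serialization and IPC dominate. A larger chunksize amortizes that overhead, although for work this cheap the plain loop may still win:

import numpy as np
from datetime import datetime
from multiprocessing import Pool

def some_func(argv):
    x, y = argv
    return np.sum(x * y)

if __name__ == '__main__':
    N = 20000
    X = np.random.randint(0, 10, size=(N, N))
    Y = np.random.randint(0, 10, size=(N, N))

    output = np.zeros(N)
    D1 = datetime.now()
    with Pool(10) as pool:
        # batch many (row_x, row_y) pairs into each pickled message instead of one
        for ind, res in enumerate(pool.imap(some_func, zip(X, Y), chunksize=500)):
            output[ind] = np.max(res)
    print('Pool, chunksize=500: ', datetime.now() - D1)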
I have 3 numpy matrices:
One contains the pixel positions in X (x_pos), another the pixel positions in Y (y_pos), and the last one the pixel values (p_value).
I would like to use these three matrices to build a result image.
With loops I have this result:
# Resulting image
res = np.zeros((128, 128, 3), dtype=np.uint8)

for i in range(x_pos.shape[0]):
    for j in range(x_pos.shape[1]):
        # Get coordinates
        x = x_pos[i][j]
        y = y_pos[i][j]
        res[y, x] = p_value[i][j]
With large matrices (2048×2048) this code already takes a lot of time. Is it possible to optimize it without using a nested loop?
Note that the positions in the x_pos and y_pos matrices do not necessarily follow each other; there may be holes or duplicate values.
It should be possible using np.meshgrid
i = np.arange(0, x_pos.shape[0])
j = np.arange(0, x_pos.shape[1])
i_1, j_1 = np.meshgrid(i, j, indexing='ij')
res[y_pos.ravel(), x_pos.ravel()] = p_value[i_1.ravel(), j_1.ravel()]
First use consistent numpy 2d array indexing:
x = x_pos[i,j]
y = y_pos[i,j]
res[y,x] = p_value[i,j]
Now instead of scalar i,j use arrays
i = np.arange(n); j = np.arange(m)
You didn't provide a [mcve], so I won't try to demonstrate that this works.
Thanks to @hpaulj's and @ai2ys's answers, the problem is solved.
Here is a comparison of the results in terms of execution speed:
import numpy as np
import cv2
import time

m_size = 4096
m_x = np.random.randint(0, m_size, (m_size, m_size), dtype=np.uint16)
m_y = np.random.randint(0, m_size, (m_size, m_size), dtype=np.uint16)
p_value = np.ones((m_size, m_size), dtype=np.uint8)

# Meshgrid method:
out = np.zeros((m_size, m_size), dtype=np.uint8)
start = time.time()
i = np.arange(0, m_x.shape[0])
j = np.arange(0, m_x.shape[1])
i_1, j_1 = np.meshgrid(i, j, indexing='ij')
out[m_x.ravel(), m_y.ravel()] = p_value[i_1.ravel(), j_1.ravel()]
end = time.time()
print("Meshgrid: {} s".format(end - start))

# No for loop method:
out = np.zeros((m_size, m_size), dtype=np.uint8)
start = time.time()
i = np.arange(m_x.shape[0])
j = np.arange(m_y.shape[1])
x = m_x[i, j]
y = m_y[i, j]
out[x, y] = p_value[i, j]
end = time.time()
print("No loop: {} s".format(end - start))

# For loop method:
out = np.zeros((m_size, m_size), dtype=np.uint8)
start = time.time()
for i in range(m_x.shape[0]):
    for j in range(m_y.shape[1]):
        x = m_x[i, j]
        y = m_y[i, j]
        out[x, y] = p_value[i, j]
end = time.time()
print("Nested loop: {} s".format(end - start))

# Output:
Meshgrid: 0.4837045669555664 s
No loop: 0.3600656986236572 s
Nested loop: 13.10097336769104 s
I'm experimenting with Python's multiprocessing.shared_memory and multiprocessing.managers.SharedMemoryManager. I've written the following minimal example to get a feel for the process:
from multiprocessing import shared_memory
from multiprocessing import Pool
from multiprocessing.managers import SharedMemoryManager
import numpy as np

def main():
    x = np.random.randn(10_000_000)
    n_jobs = 25
    n_workers = 4
    with SharedMemoryManager() as smm, Pool(n_workers) as pool:
        shm = smm.SharedMemory(x.nbytes)
        y = np.ndarray(x.shape, dtype=x.dtype, buffer=shm.buf)
        y[:] = x[:]
        sample_means = pool.starmap(func, [(shm.name, x.shape, x.dtype)] * n_jobs)

def func(shm_name, shape, dtype):
    shm = shared_memory.SharedMemory(name=shm_name, create=False)
    x = np.ndarray(shape, dtype, buffer=shm.buf)
    sample_mean = np.random.choice(x, size=10_000, replace=False).mean()
    return sample_mean

if __name__ == '__main__':
    main()
So using the shared_memory functionality we avoid making duplicates of the large numpy array x, which is good. However, it seems to me that the line y[:] = x[:] actually creates a copy of x (in y). True, it happens only once (not once per process), but my question is: is it avoidable?
If x is a huge dataset I want to share between my workers, I'd rather not make any copies of it at all. Is that possible?
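Not from the original post, but here is one way to avoid the extra copy, assuming the data can be generated (or read) straight into the shared block instead of building a private x first. The Generator.standard_normal(out=...) call writes directly into the shm-backed array, so only one resident copy ever exists; func is essentially the one from the snippet above, repeated so the sketch is self-contained:

from multiprocessing import Pool, shared_memory
from multiprocessing.managers import SharedMemoryManager
import numpy as np

def func(shm_name, shape, dtype):
    # attach to the existing block and sample from it, as in the original example
    shm = shared_memory.SharedMemory(name=shm_name, create=False)
    x = np.ndarray(shape, dtype, buffer=shm.buf)
    return np.random.choice(x, size=10_000, replace=False).mean()

def main():
    shape, dtype = (10_000_000,), np.float64
    n_jobs, n_workers = 25, 4
    with SharedMemoryManager() as smm, Pool(n_workers) as pool:
        shm = smm.SharedMemory(int(np.prod(shape)) * np.dtype(dtype).itemsize)
        y = np.ndarray(shape, dtype=dtype, buffer=shm.buf)
        # fill the shared buffer in place instead of copying a private array into it
        np.random.default_rng().standard_normal(out=y)
        sample_means = pool.starmap(func, [(shm.name, shape, dtype)] * n_jobs)

if __name__ == '__main__':
    main()

If the data comes from disk instead, the same idea applies: read it directly into y (chunk by chunk if necessary) rather than into a temporary array first.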
Trying to accelerate a DP algorithm in Python, numba seemed like an appropriate candidate.
I'm broadcasting a subtraction between a 2D array and a 1D array, which yields a 3D array. I then use .argmin() along the third dimension to obtain a 2D array. This works just fine with plain numpy, but not with numba.
Toy code reproducing the issue:
from numba import jit
import numpy as np

inflow = np.arange(1, 0, -0.01)                      # Dim [T]
actions = np.arange(0, 1, 0.05)                      # Dim [M]
start_lvl = np.random.rand(500).reshape(-1, 1) * 49  # Dim [N x 1]
disc_lvl = np.arange(0, 1000)                        # Dim [O]

@jit(nopython=True)
def my_func(disc_lvl, actions, start_lvl, inflow):
    for i in range(0, 100):
        # Calculate new level at time i
        new_lvl = start_lvl + inflow[i] + actions      # Dim [N x M]
        # For each new_level element, find closest discretized level
        diff = (disc_lvl - new_lvl[:, :, np.newaxis])  # Dim [N x M x O]
        idx_lvl = abs(diff).argmin(axis=2)             # Dim [N x M]
    return True

# function works fine without numba
success = my_func(disc_lvl, actions, start_lvl, inflow)
Why doesn't the code above run? It does when the @jit(nopython=True) decorator is removed.
Is there a workaround to make this calculation work with numba?
I've tried variants with numpy repeat and expand_dims, as well as explicitly declaring the input types of the jitted function, without success.
There are a few things you need to change to make it work:
Adding a dimension with arr[:, :, None] looks like a getitem call to Numba, so prefer using reshape instead.
Use np.abs instead of the built-in abs.
argmin with the axis keyword argument is not implemented. Prefer explicit loops, which Numba is designed to optimize.
With all of this fixed, you can run the jitted function:
from numba import jit
import numpy as np

inflow = np.arange(1, 0, -0.01)                      # Dim [T]
actions = np.arange(0, 1, 0.05)                      # Dim [M]
start_lvl = np.random.rand(500).reshape(-1, 1) * 49  # Dim [N x 1]
disc_lvl = np.arange(0, 1000)                        # Dim [O]

@jit(nopython=True)
def my_func(disc_lvl, actions, start_lvl, inflow):
    for i in range(0, 100):
        # Calculate new level at time i
        new_lvl = start_lvl + inflow[i] + actions  # Dim [N x M]
        # For each new_level element, find closest discretized level
        new_lvl_3d = new_lvl.reshape(*new_lvl.shape, 1)
        diff = np.abs(disc_lvl - new_lvl_3d)       # Dim [N x M x O]
        idx_lvl = np.empty(new_lvl.shape)
        for i in range(diff.shape[0]):
            for j in range(diff.shape[1]):
                idx_lvl[i, j] = diff[i, j, :].argmin()
    return True

# function works fine without numba
success = my_func(disc_lvl, actions, start_lvl, inflow)
Below is the corrected code from my first post, which you can execute with and without numba's jitted mode (by removing the line that starts with @jit). I observed a speed increase of about a factor of 2 for this example.
from numba import jit
import numpy as np
import datetime as dt

inflow = np.arange(1, 0, -0.01)                      # Dim [T]
nbTime = np.shape(inflow)[0]
actions = np.arange(0, 1, 0.01)                      # Dim [M]
start_lvl = np.random.rand(500).reshape(-1, 1) * 49  # Dim [N x 1]
disc_lvl = np.arange(0, 1000)                        # Dim [O]

@jit(nopython=True)
def my_func(nbTime, disc_lvl, actions, start_lvl, inflow):
    # Initialize result
    res = np.empty((nbTime, np.shape(start_lvl)[0], np.shape(actions)[0]))
    for t in range(0, nbTime):
        # Calculate new level at time t
        new_lvl = start_lvl + inflow[t] + actions  # Dim [N x M]
        print(t)
        # For each new_level element, find closest discretized level
        new_lvl_3d = new_lvl.reshape(*new_lvl.shape, 1)
        diff = np.abs(disc_lvl - new_lvl_3d)       # Dim [N x M x O]
        idx_lvl = np.empty(new_lvl.shape)
        for i in range(diff.shape[0]):
            for j in range(diff.shape[1]):
                idx_lvl[i, j] = diff[i, j, :].argmin()
        res[t, :, :] = idx_lvl
    return res

# Call function and print running time
start_time = dt.datetime.now()
result = my_func(nbTime, disc_lvl, actions, start_lvl, inflow)
print('Execution time :', (dt.datetime.now() - start_time))
I'm trying to multiply two big matrices under a memory limit using hdf5 (pytables),
but numpy.dot gives me an error:
ValueError: array is too big
Do I need to do the matrix multiplication myself, maybe blockwise, or is there another Python function similar to numpy.dot?
import numpy as np
import time
import tables
import cProfile
import numexpr as ne

n_row = 10000
n_col = 100
n_batch = 10

rows = n_row
cols = n_col
batches = n_batch
atom = tables.UInt8Atom()  #?
filters = tables.Filters(complevel=9, complib='blosc')  # tune parameters
fileName_a = r'C:\carray_a.h5'
shape_a = (rows * batches, cols)  # predefined size
h5f_a = tables.open_file(fileName_a, 'w')
ca_a = h5f_a.create_carray(h5f_a.root, 'carray', atom, shape_a, filters=filters)
for i in range(batches):
    data = np.random.rand(rows, cols)
    ca_a[i*rows:(i+1)*rows] = data[:]
#h5f_0.close()

rows = n_col
cols = n_row
batches = n_batch
fileName_b = r'C:\carray_b.h5'
shape_b = (rows, cols * batches)  # predefined size
h5f_b = tables.open_file(fileName_b, 'w')
ca_b = h5f_b.create_carray(h5f_b.root, 'carray', atom, shape_b, filters=filters)
# need to batch by cols
sz = rows // batches
for i in range(batches):
    data = np.random.rand(sz, cols * batches)
    ca_b[i*sz:(i+1)*sz] = data[:]
#h5f_1.close()

rows = n_batch * n_row
cols = n_batch * n_row
fileName_c = r'C:\carray_c.h5'
shape_c = (rows, cols)  # predefined size
h5f_c = tables.open_file(fileName_c, 'w')
ca_c = h5f_c.create_carray(h5f_c.root, 'carray', atom, shape_c, filters=filters)

a = h5f_a.root.carray  #[:]
b = h5f_b.root.carray  #[:]
c = h5f_c.root.carray

t0 = time.time()
c = np.dot(a, b)  # error if array is big
print(time.time() - t0)
Update: here is the code. Interestingly, using hdf5 it works even faster.
import numpy as np
import tables
import time

sz = 100        # chunk size
n_row = 10000   # m
n_col = 1000    # n

# for arbitrary size
A = np.random.rand(n_row, n_col)
B = np.random.rand(n_col, n_row)
# A = np.random.randint(5, size=(n_row, n_col))
# B = np.random.randint(5, size=(n_col, n_row))

# using numpy array
# C = np.zeros((n_row, n_row))

# using hdf5
fileName_C = 'CArray_C.h5'
atom = tables.Float32Atom()
shape = (A.shape[0], B.shape[1])
Nchunk = 128  # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk

h5f_C = tables.open_file(fileName_C, 'w')
C = h5f_C.create_carray(h5f_C.root, 'CArray', atom, shape, chunkshape=chunkshape)

sz = block_size

t0 = time.time()
for i in range(0, A.shape[0], sz):
    for j in range(0, B.shape[1], sz):
        for k in range(0, A.shape[1], sz):
            C[i:i+sz, j:j+sz] += np.dot(A[i:i+sz, k:k+sz], B[k:k+sz, j:j+sz])
print(time.time() - t0)

t0 = time.time()
res = np.dot(A, B)
print(time.time() - t0)

print(C == res)
h5f_C.close()
I don't know of an np.dot that works without loading the arrays into memory. I think blocking would work pretty well. Create an output array (called "c" below) as a pytables CArray and fill it in blocks. You should choose the chunkshape when you create it to match your blocking scheme. Something like:
atom = tables.Float32Atom()  # you have UInt8Atom() above. do you mean that?
shape = (a.shape[0], b.shape[1])

# you can vary block_size and chunkshape independently, but I would
# aim to have block_size an integer multiple of chunkshape
# your mileage may vary and depends on the array size and how you'll
# access it in the future.
Nchunk = 128  # ?
chunkshape = (Nchunk, Nchunk)
chunk_multiple = 1
block_size = chunk_multiple * Nchunk

c = h5f.create_carray(h5f.root, 'c', atom, shape, chunkshape=chunkshape)

for i_start in range(0, a.shape[0], block_size):
    for j_start in range(0, b.shape[1], block_size):
        for k_start in range(0, a.shape[1], block_size):
            c[i_start:i_start + block_size, j_start:j_start + block_size] += \
                np.dot(a[i_start:i_start + block_size, k_start:k_start + block_size],
                       b[k_start:k_start + block_size, j_start:j_start + block_size])