Python multiprocessing global numpy arrays

I have following script:
import multiprocessing
import numpy as np

max_number = 100000
minimums = np.full((max_number), np.inf, dtype=np.float32)
data = np.zeros((max_number, 128, 128, 128), dtype=np.uint8)

def worker(start, end):
    for in_idx in range(start, end):
        value = data[start:end][in_idx]  # compute something using this array
        minimums[in_idx] = value

def main():
    jobs = []
    num_jobs = 5
    for i in range(num_jobs):
        start = int(i * (1000 / num_jobs))
        end = int(start + (1000 / num_jobs))
        p = multiprocessing.Process(name=('worker_' + str(i)), target=worker, args=(start, end))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()
    print(jobs)

if __name__ == '__main__':
    main()
How can I ensure that the numpy array is global and can be accessed by each worker? Each worker uses a different part of the numpy array.

import numpy as np
import multiprocessing as mp

ar = np.zeros((5,5))

def callback_function(result):
    x, y, data = result
    ar[x, y] = data

def worker(num):
    data = ar[num, num] + 3
    return num, num, data

def apply_async_with_callback():
    pool = mp.Pool(processes=5)
    for i in range(5):
        pool.apply_async(worker, args=(i,), callback=callback_function)
    pool.close()
    pool.join()
    print("Multiprocessing done!")

if __name__ == '__main__':
    ar = np.ones((5,5))  # This rebinds the global ar, so the ones array is the one actually used
    apply_async_with_callback()
Explanation: you set up your data array, your worker function, and your callback function. The pool starts the given number of independent worker processes, and each worker can handle more than one task. The callback runs in the parent process and writes each result back into the array.
The __name__ == '__main__' guard protects the lines below it from being run on each import.
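If the arrays are too large to copy into every worker, one option (not part of the original answer, and assuming Python 3.8+) is multiprocessing.shared_memory: the parent allocates a block, wraps it in a numpy array, and each worker attaches to the same block by name. A minimal sketch with a stand-in shape:

import numpy as np
from multiprocessing import Process
from multiprocessing import shared_memory

SHAPE = (1000, 128)   # stand-in for the real (max_number, 128, 128, 128) shape
DTYPE = np.uint8

def worker(shm_name, start, end):
    # attach to the block created by the parent and view it as a numpy array
    shm = shared_memory.SharedMemory(name=shm_name)
    data = np.ndarray(SHAPE, dtype=DTYPE, buffer=shm.buf)
    print(data[start:end].sum())   # compute something on this slice
    shm.close()

if __name__ == '__main__':
    nbytes = int(np.prod(SHAPE)) * np.dtype(DTYPE).itemsize
    shm = shared_memory.SharedMemory(create=True, size=nbytes)
    data = np.ndarray(SHAPE, dtype=DTYPE, buffer=shm.buf)
    data[:] = 1   # fill with real data here
    jobs = [Process(target=worker, args=(shm.name, i * 250, (i + 1) * 250)) for i in range(4)]
    for p in jobs:
        p.start()
    for p in jobs:
        p.join()
    shm.close()
    shm.unlink()   # free the shared block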

Related

Improving execution time for multiprocessing

I am trying to improve the running time of this code, without success so far. Even with 32 processes it takes around 5 minutes. Do you have any suggestions for speeding it up? Here evaluated_f_bool_func_lst is a list with 2**24 elements; each element is a single-character string, either '1' or '0' (a "binary list").
from sage.all import *
import time
from multiprocessing import Pool
import multiprocessing

def create_ext_component_function_i(dim, chunk_i, chunk_size, evaluated_f_bool_func_lst):
    sum_y_str = []
    for y in range(chunk_i, chunk_i + chunk_size):
        prod = ""
        for i in range(dim):
            minus1 = ((-1)**(1&(y>>(i))))
            prod += f'(1-{str(minus1)}*x[{str(i)}])*'
        sum_y_str.append(f'{prod}{evaluated_f_bool_func_lst[y]}')
    return "+".join(sum_y_str)

def create_ext_component_function(dim, evaluated_f_bool_func_lst):
    chunk_size = (2**dim) // 32   # integer division, so range() gets an int
    pool = Pool(32)
    results = []
    for i in range(0, 2**dim, chunk_size):
        results.append(pool.apply_async(create_ext_component_function_i, args=(dim, i, chunk_size, evaluated_f_bool_func_lst)))
    pool.close()
    pool.join()
    join_results = [result.get() for result in results]
    print("+".join(join_results))
    return 0

if __name__ == '__main__':
    evaluated_f_bool_func_lst = load("evaluated_f_bool_func_lst.obj")
    dim = 24
    create_ext_component_function(dim, evaluated_f_bool_func_lst)
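No answer is recorded here, but one thing worth noting (my own suggestion, with hypothetical helper names): apply_async pickles evaluated_f_bool_func_lst and ships it to a worker for every single chunk, which is expensive for a 2**24-element list. A Pool initializer can send the list to each worker process once instead. A rough sketch under that assumption, with placeholder data:

from multiprocessing import Pool

_shared_lst = None

def _init_worker(lst):
    # runs once per worker process; keeps the big list as a per-process global
    global _shared_lst
    _shared_lst = lst

def build_chunk(args):
    dim, start, chunk_size = args
    terms = []
    for y in range(start, start + chunk_size):
        prod = "".join(f'(1-{(-1)**((y >> i) & 1)}*x[{i}])*' for i in range(dim))
        terms.append(f'{prod}{_shared_lst[y]}')
    return "+".join(terms)

if __name__ == '__main__':
    dim = 24
    evaluated_f_bool_func_lst = ['0'] * (2 ** dim)   # placeholder for the real data
    chunk_size = (2 ** dim) // 32
    with Pool(32, initializer=_init_worker, initargs=(evaluated_f_bool_func_lst,)) as pool:
        parts = pool.map(build_chunk,
                         [(dim, s, chunk_size) for s in range(0, 2 ** dim, chunk_size)])
    print("+".join(parts))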

multiprocessing.Process does not run processes in parallel

I tried to run some very simple multiprocessing code, but it is still processed serially.
I have tried to run it on Mac (macOS 10.13) and Linux (Ubuntu 18.04) with Python 2 and 3, and in both environments I had the same problem.
The function _process has to receive numpy arrays as arguments, so I decided to use multiprocessing.Process instead of multiprocessing.Pool.map() and multiprocessing.Pool.apply_async(), because pickling breaks when pool.map() is used on a method of a class: https://stackoverflow.com/a/21345308/4755986
import time
from multiprocessing import Process, Queue
import numpy as np

class model:
    def __init__(self):
        self.results = []
        self.jobs = []
        self.start = time.time()

    def _process(self, x, y, z):
        j = 0
        for i in range(10**8):
            j = i + j
        return j

    def work(self, X, Y, Z, result_queue):
        start = time.time() - self.start
        result = self._process(X, Y, Z)
        result_queue.put(result)
        print(result)
        end = time.time() - self.start
        print('start time: ', start)
        print('end time:', end)
        # return result_queue

    def fit(self, num):
        for i in range(num):
            X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
            result_queue = Queue()
            p = Process(target=self.work, args=(X, Y, Z, result_queue))
            self.jobs.append(p)
            p.start()
            print('ChildProcess...', i)
            result = result_queue.get()
            self.results.append(result)
        for p in self.jobs:
            p.join()
            p.close()
        return self.results

R = model()
k = R.fit(10)
print(k)
The start and end time of each process is printed, and the second process only starts after the first process has finished. This is strange, because each process should automatically be assigned to a different core and run in parallel.
result = result_queue.get()
result_queue.get() will block if the queue is empty. An item is only added when a process finishes, hence the next process is spawned only after the previous one has finished.
Below is a version that does spawn all 10 processes at once. I've marked the sections I've changed:
import time
from multiprocessing import Process, Queue
import numpy as np

class model:
    def __init__(self):
        self.results = []
        self.jobs = []
        self.start = time.time()

    def _process(self, x, y, z):
        j = 0
        for i in range(10**8):
            j = i + j
        return j

    def work(self, X, Y, Z, result_queue):
        start = time.time() - self.start
        result = self._process(X, Y, Z)
        result_queue.put(result)
        print(result)
        end = time.time() - self.start
        print('start time: ', start)
        print('end time:', end)
        # return result_queue

    def fit(self, num):
        result_queue = Queue()                      # <----- one shared queue for all workers
        for i in range(num):
            X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
            p = Process(target=self.work, args=(X, Y, Z, result_queue))
            self.jobs.append(p)
            p.start()
            print('ChildProcess...', i)
            #result = result_queue.get()            # <--- This blocks
            #self.results.append(result)
        for p in self.jobs:
            p.join()                                # safe here: each worker only puts one small item
            p.close()
        while not result_queue.empty():             # <----- drain the shared queue
            self.results.append(result_queue.get())
        return self.results

R = model()
k = R.fit(10)
print(k)
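As an aside (my own sketch, not part of the answer): in Python 3 you can often avoid the Queue bookkeeping entirely by giving Pool.map a module-level function, which pickles the numpy arrays as ordinary arguments. Hypothetical names throughout:

import numpy as np
from multiprocessing import Pool

def heavy_work(arrays):
    # stand-in for the real computation on the three arrays
    x, y, z = arrays
    j = 0
    for i in range(10 ** 6):
        j = i + j
    return j + x.sum() + y.sum() + z.sum()

if __name__ == '__main__':
    tasks = [(np.ones([5, 5]), np.ones([3, 3]), np.ones([2, 2])) for _ in range(10)]
    with Pool() as pool:
        results = pool.map(heavy_work, tasks)   # the 10 tasks run in parallel
    print(results)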

Multi-core processing hangs

My code looks like the following. It seems to be "hanging" during the proc.join() loop. If I create the dataframe df with 10 records, the whole thing completes quickly, but starting at 10000 records (as shown), the program seems to just hang. I am using htop to watch the CPU cores, and I do see all of them spike to 100%, but long after they have dropped back to 0% the program still doesn't continue. Any ideas on what I'm doing wrong?
import pandas as pd
import numpy as np
import multiprocessing
from multiprocessing import Process, Queue

def do_something(df, partition, q):
    for index in partition:
        q.put([v for v in df.iloc[index]])

def start_parallel_processing(df, partitions):
    q = Queue()
    procs = []
    results = []
    for partition in partitions:
        proc = Process(target=do_something, args=(df, partition, q))
        proc.start()
        procs.extend([proc])
    for i in range(len(partitions)):
        results.append(q.get(True))
    for proc in procs:
        proc.join()
    return results

num_cpus = multiprocessing.cpu_count()
df = pd.DataFrame([(x, x+1) for x in range(10000)], columns=['x','y'])
partitions = np.array_split(df.index, num_cpus)
results = start_parallel_processing(df, partitions)
len(results)
It appears the plain multiprocessing.Queue doesn't behave the way you want here: each worker puts one item per row, but only len(partitions) items are ever taken off, and joining processes whose queue buffers are still full is a documented way to deadlock. Using a Manager().Queue() and draining it fully avoids the hang.
I have added some print statements to show the code flow.
You can still polish your code to use a Pool() instead of managing num_cpus processes yourself.
import pandas as pd
import numpy as np
import multiprocessing
import pprint
from multiprocessing import Process, Queue, Manager

def do_something(df, partition, q):
    # print "do_something " + str(len(partition)) + " times"
    for index in partition:
        # print index
        for v in df.iloc[index]:
            #print "sending v to queue: " + str(len(df.iloc[index]))
            q.put(v, False)
    print "task_done(), qsize is "+ str(q.qsize())

def start_parallel_processing(df, partitions):
    m = Manager()
    q = m.Queue()
    procs = []
    results = []
    print "START: launching "+ str(len(partitions)) + " process(es)"
    index = 0
    for partition in partitions:
        print "launching "+ str(len(partitions)) + " process"
        proc = Process(target=do_something, args=(df, partition, q))
        procs.extend([proc])
        proc.start()
        index += 1
        print "launched "+ str(index) + "/" + str(len(partitions)) + " process(es)"
    # join first, then drain: with a Manager queue the workers can finish
    # independently of the parent reading their results
    process_count = 0
    for proc in procs:
        process_count += 1
        print "joining "+ str(process_count) + "/" + str(len(procs)) + " process(es)"
        proc.join()
    while True:
        try:
            results.append(q.get(block=False))
        except:  # queue is empty, everything has been drained
            print "QUEUE END"
            break
    print pprint.pformat(results)
    return results

num_cpus = multiprocessing.cpu_count()
df = pd.DataFrame([(x, x+1) for x in range(10000)], columns=['x','y'])
partitions = np.array_split(df.index, num_cpus)
results = start_parallel_processing(df, partitions)
print "len(results) is: "+ str(len(results))

Why is a single process pool faster than serialized implementation in this python code?

I'm experimenting with multiprocessing in Python. I know that it can be slower than a serial computation; that is not the point of my post.
I'm just wondering why a single-process pool is faster than the serial computation of my basic problem. Shouldn't these times be roughly the same?
Here is the code:
import time
import multiprocessing as mp
import matplotlib.pyplot as plt

def func(x):
    return x*x*x

def multi_proc(nb_procs):
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    pool.map_async(func, range(1, 10000000))
    toc = time.time()
    return toc-tic

def single_core():
    tic = time.time()
    [func(x) for x in range(1, 10000000)]
    toc = time.time()
    return toc-tic

if __name__ == '__main__':
    sc_times = [0]
    mc_times = [0]
    print('single core computation')
    sc_constant_time = single_core()
    print('{} secs'.format(sc_constant_time))
    for nb_procs in range(1, 12):
        print('computing for {} processes...'.format(nb_procs))
        time_elapsed = (multi_proc(nb_procs))
        print('{} secs'.format(time_elapsed))
        mc_times.append(time_elapsed)
    sc_times = [sc_constant_time for _ in mc_times]
    plt.plot(sc_times, 'r--')
    plt.plot(mc_times, 'b--')
    plt.xlabel('nb procs')
    plt.ylabel('time (s)')
    plt.show()
And the plot of times per number of processes (red = serial computation, blue = multiprocessing):
EDIT 1:
I modified my code as Sidhnarth Gupta indicated, and here is the new code I have. I changed my func for no reason.
import time
import multiprocessing as mp
import matplotlib.pyplot as plt
import random

def func(x):
    return random.choice(['a', 'b', 'c', 'd', 'e', 'f', 'g'])

def multi_proc(nb_procs, nb_iter):
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    pool.map_async(func, range(1, nb_iter)).get()
    toc = time.time()
    return toc-tic

def single_core(nb_iter):
    tic = time.time()
    [func(x) for x in range(1, nb_iter)]
    toc = time.time()
    return toc-tic

if __name__ == '__main__':
    # configure
    nb_iter = 100000
    max_procs = 16
    sc_times = [0]
    mc_times = [0]
    # multi proc calls
    for nb_procs in range(1, max_procs):
        print('computing for {} processes...'.format(nb_procs))
        time_elapsed = (multi_proc(nb_procs, nb_iter))
        print('{} secs'.format(time_elapsed))
        mc_times.append(time_elapsed)
    # single proc call
    print('single core computation')
    for nb in range(1, len(mc_times)):
        print('{}...'.format(nb))
        sc_times.append(single_core(nb_iter))
    # average time
    average_time = sum(sc_times)/len(sc_times)
    print('average time on single core: {} secs'.format(average_time))
    # plot
    plt.plot(sc_times, 'r--')
    plt.plot(mc_times, 'b--')
    plt.xlabel('nb procs')
    plt.ylabel('time (s)')
    plt.show()
Here is the new plot I have:
I think I can now say that I have increased my program's speed by using multiprocessing.
Your current code for measuring the time taken by multiprocessing is actually measuring the time taken to submit the tasks to the pool: map_async() returns immediately and the processing happens asynchronously, without blocking the calling thread.
I tried your program with the following changes:
def multi_proc(nb_procs):
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    pool.map_async(func, range(1, 10000000)).get()
    toc = time.time()
    return toc-tic
and
def multi_proc(nb_procs):
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    pool.map(func, range(1, 10000000))
    toc = time.time()
    return toc-tic
Both of them take significantly more time than the serial computation.
Also, when creating such graphs, you should consider calling the single_core() function for every data point instead of reusing the same measurement multiple times; you will see significant variance in the time it takes from run to run.
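As a further aside (my own addition, not part of the answer): for a function as cheap as func, most of the wall time goes into shipping arguments and results between processes; Pool.map's chunksize parameter batches that traffic. A small sketch:

import time
import multiprocessing as mp

def func(x):
    return x * x * x

if __name__ == '__main__':
    with mp.Pool(processes=4) as pool:
        tic = time.time()
        pool.map(func, range(1, 10000000), chunksize=100000)  # larger chunks, less IPC overhead
        print('pool.map with chunksize: {} secs'.format(time.time() - tic))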

How to let a multi-processing python application quit cleanly

When I run a Python script that uses multiprocessing, I find it hard to get it to stop cleanly when it receives Ctrl-C. Ctrl-C has to be pressed multiple times and all sorts of error messages appear on the screen.
How can you make a Python script that uses multiprocessing quit cleanly when it receives Ctrl-C?
Take this script for example:
import numpy as np, time
from multiprocessing import Pool

def countconvolve(N):
    np.random.seed()  # ensure seed is random
    count = 0
    iters = 1000000   # 1 million
    l = 12
    k = 12
    l0 = l + k - 1
    for n in range(N):
        t = np.random.choice(np.array([-1,1], dtype=np.int8), size=l0 * iters)
        v = np.random.choice(np.array([-1,1], dtype=np.int8), size=l * iters)
        for i in xrange(iters):
            if (not np.convolve(v[(l*i):(l*(i+1))],
                                t[(l0*i):(l0*(i+1))], 'valid').any()):
                count += 1
    return count

if __name__ == '__main__':
    start = time.clock()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    res = pool.map(countconvolve, [N] * num_processes)
    print res, sum(res)
    print (time.clock() - start)
Jon's solution is probably better, but here is a version using a signal handler. I tried it in a VirtualBox VM, which was extremely slow, but it worked. I hope it helps.
import numpy as np, time
from multiprocessing import Pool
import signal

# define pool as global
pool = None

def term_signal_handler(signum, frame):
    global pool
    print 'CTRL-C pressed'
    try:
        pool.close()
        pool.join()
    except AttributeError:
        print 'Pool has been already closed'

def countconvolve(N):
    np.random.seed()  # ensure seed is random
    count = 0
    iters = 1000000   # 1 million
    l = 12
    k = 12
    l0 = l + k - 1
    for n in range(N):
        t = np.random.choice(np.array([-1,1], dtype=np.int8), size=l0 * iters)
        v = np.random.choice(np.array([-1,1], dtype=np.int8), size=l * iters)
        for i in xrange(iters):
            if (not np.convolve(v[(l*i):(l*(i+1))], t[(l0*i):(l0*(i+1))], 'valid').any()):
                count += 1
    return count

if __name__ == '__main__':
    # Register the signal handler
    signal.signal(signal.SIGINT, term_signal_handler)
    start = time.clock()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    res = pool.map(countconvolve, [N] * num_processes)
    print res, sum(res)
    print (time.clock() - start)
I believe the try/except approach mentioned in a similar post here on SO could be adapted to cover it.
If you wrap the pool.map call in a try/except and then call terminate and join, I think that would do it.
[Edit]
Some experimentation suggests something along these lines works well:
from multiprocessing import Pool
import random
import time

def countconvolve(N):
    count = 0  # ensure count is defined even if we are interrupted mid-sleep
    try:
        sleepTime = random.randint(0,5)
        time.sleep(sleepTime)
        count = sleepTime
    except KeyboardInterrupt as e:
        pass
    return count

if __name__ == '__main__':
    random.seed(0)
    start = time.clock()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    try:
        res = pool.map(countconvolve, [N] * num_processes)
        print res, sum(res)
        print (time.clock() - start)
    except KeyboardInterrupt as e:
        print 'Stopping..'
        pool.terminate()
        pool.join()
I simplified your example somewhat to avoid having to load numpy on my machine to test it, but the critical part is the two try/except blocks which handle the Ctrl-C key presses.
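Another common pattern (my own sketch, not from either answer): make the workers ignore SIGINT via a pool initializer so only the parent sees Ctrl-C, then terminate the pool from the parent's KeyboardInterrupt handler.

import signal
import time
from multiprocessing import Pool

def ignore_sigint():
    # workers ignore Ctrl-C; only the parent handles it
    signal.signal(signal.SIGINT, signal.SIG_IGN)

def slow_task(n):
    time.sleep(n)
    return n

if __name__ == '__main__':
    pool = Pool(processes=4, initializer=ignore_sigint)
    try:
        print(pool.map(slow_task, [2] * 8))
        pool.close()
    except KeyboardInterrupt:
        print('Stopping..')
        pool.terminate()
    pool.join()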
