When I run a python script that uses multiprocessing I find it hard to get it to stop cleanly when it receives Ctrl-C. Ctrl-C has to be pressed multiple times and all sorts of error messages appear on the screen.
How can you make a python script that uses multiprocessing and quits
cleanly when it receives a Ctrl-C ?
Take this script for example
import numpy as np, time
from multiprocessing import Pool
def countconvolve(N, iters=1000000):
    """Count random sign-vector pairs whose 'valid' convolution is all zero.

    In each of ``N`` rounds, ``iters`` pairs are drawn with entries from
    {-1, 1}: ``v`` of length 12 and ``t`` of length 23. A pair is counted when
    every entry of ``np.convolve(v, t, 'valid')`` is zero.

    Args:
        N: Number of rounds to run.
        iters: Number of pairs drawn per round (defaults to one million).

    Returns:
        How many pairs, over all rounds, had an all-zero convolution.
    """
    np.random.seed()  # ensure seed is random
    count = 0
    l = 12
    k = 12
    l0 = l + k - 1
    signs = np.array([-1, 1], dtype=np.int8)
    for _ in range(N):
        # One row per pair; row i is the same slice the flat array would give.
        t = np.random.choice(signs, size=l0 * iters).reshape(iters, l0)
        v = np.random.choice(signs, size=l * iters).reshape(iters, l)
        # range() rather than xrange(), which only exists on Python 2.
        for i in range(iters):
            if not np.convolve(v[i], t[i], 'valid').any():
                count += 1
    return count
if __name__ == '__main__':
    # Run the same workload in every pool worker and report the counts.
    # time.time() is used for the elapsed time because time.clock() no
    # longer exists on Python 3.8+.
    start = time.time()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    res = pool.map(countconvolve, [N] * num_processes)
    # Explicit formatting prints the same text on Python 2 and Python 3.
    print('{} {}'.format(res, sum(res)))
    print(time.time() - start)
Jon's solution is probably better, but here it is using a signal handler. I tried it in a VBox VM which was extremely slow, but worked. I hope it will help.
import numpy as np, time
from multiprocessing import Pool
import signal
# Module-level handle so the signal handler can reach the pool that the main
# block creates.
pool = None


def term_signal_handler(signum, frame):
    """Handle SIGINT by stopping the worker pool and exiting the process.

    Args:
        signum: Number of the delivered signal (unused).
        frame: Stack frame that was interrupted (unused).

    Raises:
        SystemExit: Always, so that a ``pool.map`` call blocked in the main
            thread is abandoned instead of waiting for results that will
            never arrive once the workers are gone.
    """
    print('CTRL-C pressed')
    if pool is None:
        # This process has no pool object to shut down.
        print('Pool has been already closed')
    else:
        # terminate() stops the workers right away; close() would let every
        # queued task run to completion before join() returned.
        pool.terminate()
        pool.join()
    raise SystemExit(1)
def countconvolve(N, iters=1000000):
    """Count random sign-vector pairs whose 'valid' convolution is all zero.

    Each of the ``N`` rounds draws ``iters`` pairs with entries from {-1, 1}
    (``v`` has length 12, ``t`` has length 23) and counts the pairs for which
    ``np.convolve(v, t, 'valid')`` contains no non-zero entry.

    Args:
        N: Number of rounds to run.
        iters: Number of pairs drawn per round (defaults to one million).

    Returns:
        The total number of pairs with an all-zero convolution.
    """
    np.random.seed()  # ensure seed is random
    l = 12
    k = 12
    l0 = l + k - 1
    choices = np.array([-1, 1], dtype=np.int8)
    count = 0
    for _ in range(N):
        t = np.random.choice(choices, size=l0 * iters)
        v = np.random.choice(choices, size=l * iters)
        # range() works on Python 2 and 3; xrange() is Python 2 only.
        for i in range(iters):
            v_part = v[l * i:l * (i + 1)]
            t_part = t[l0 * i:l0 * (i + 1)]
            if not np.convolve(v_part, t_part, 'valid').any():
                count += 1
    return count
if __name__ == '__main__':
    # Register the signal handler
    signal.signal(signal.SIGINT, term_signal_handler)
    # time.time() replaces time.clock(), which was removed in Python 3.8.
    start = time.time()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    res = pool.map(countconvolve, [N] * num_processes)
    # Explicit formatting prints the same text on Python 2 and Python 3.
    print('{} {}'.format(res, sum(res)))
    print(time.time() - start)
I believe the try-except approach mentioned in a similar post here on SO could be adapted to cover it.
If you wrap the pool.map call in a try-except and then call terminate and join, I think that would do it.
[Edit]
Some experimentation suggests something along these lines works well:
from multiprocessing import Pool
import random
import time
def countconvolve(N):
    """Stand-in workload: sleep a random 0-5 seconds and return that duration.

    Args:
        N: Not used by this simplified version.

    Returns:
        The number of seconds slept, or 0 when Ctrl-C interrupted the sleep.
    """
    # Bound up front so an interrupt before the assignment below cannot leave
    # `count` undefined at the return statement.
    count = 0
    try:
        sleepTime = random.randint(0, 5)
        time.sleep(sleepTime)
        count = sleepTime
    except KeyboardInterrupt:
        # Swallow Ctrl-C in the worker; the parent handles the shutdown.
        pass
    return count
if __name__ == '__main__':
    random.seed(0)
    # time.time() replaces time.clock(), which was removed in Python 3.8.
    start = time.time()
    num_processes = 8
    N = 13
    pool = Pool(processes=num_processes)
    try:
        res = pool.map(countconvolve, [N] * num_processes)
        # Explicit formatting prints the same text on Python 2 and Python 3.
        print('{} {}'.format(res, sum(res)))
        print(time.time() - start)
    except KeyboardInterrupt:
        print('Stopping..')
        # Stop the workers right away instead of leaving them running.
        pool.terminate()
    else:
        pool.close()
    # Wait for the worker processes to exit in both cases.
    pool.join()
I simplified your example somewhat to avoid having to load numpy on my machine to test but the critical part is the two try-except calls which handle the CTRL+C key presses.
Related
I am trying to reduce the running time of this code, so far without success. Even when run with 32 processes it takes about 5 minutes. Do you have any suggestions for speeding it up? Here evaluated_f_bool_func_lst is a list with 2**24 elements. Each element is a one-character string, either '1' or '0' (a "binary list").
from sage.all import *
import time
from multiprocessing import Pool
import multiprocessing
def create_ext_component_function_i(dim, chunk_i, chunk_size, evaluated_f_bool_func_lst):
    """Build the '+'-joined terms for one chunk of inputs.

    For every ``y`` in ``[chunk_i, chunk_i + chunk_size)`` one term is
    produced. It consists of one factor per bit index ``i < dim`` --
    ``(1-1*x[i])*`` when bit ``i`` of ``y`` is 0 and ``(1--1*x[i])*`` when it
    is 1 -- followed by ``evaluated_f_bool_func_lst[y]``.

    Args:
        dim: Number of bits, i.e. number of factors per term.
        chunk_i: First value of ``y`` in this chunk.
        chunk_size: Number of consecutive ``y`` values to process.
        evaluated_f_bool_func_lst: Sequence indexed by ``y`` whose entries
            are appended to the corresponding term.

    Returns:
        All terms of the chunk joined with ``"+"``; the empty string when the
        chunk is empty.
    """
    # A factor's text depends only on (bit index, bit value), so the 2 * dim
    # possible strings are formatted once instead of once per (y, i) pair.
    factors = [(f'(1-1*x[{i}])*', f'(1--1*x[{i}])*') for i in range(dim)]
    terms = []
    for y in range(chunk_i, chunk_i + chunk_size):
        # join() avoids rebuilding the growing string on every `+=`.
        prod = "".join(factors[i][(y >> i) & 1] for i in range(dim))
        terms.append(f'{prod}{evaluated_f_bool_func_lst[y]}')
    return "+".join(terms)
def create_ext_component_function(dim, evaluated_f_bool_func_lst, num_workers=32):
    """Build the full expression for ``2**dim`` inputs in parallel and print it.

    The range ``[0, 2**dim)`` is split into contiguous chunks, each handled by
    ``create_ext_component_function_i`` in a pool worker; the chunk results
    are joined with ``"+"`` and printed.

    Args:
        dim: Number of bits per input.
        evaluated_f_bool_func_lst: Sequence with one entry per input value.
        num_workers: Pool size and target number of chunks (default 32, the
            value that was previously hard-coded).

    Returns:
        Always 0.
    """
    total = 2 ** dim
    # Integer ceiling division: `/` would yield a float on Python 3, which
    # range() rejects, and a step of 0 must be avoided when total < num_workers.
    chunk_size = max(1, -(-total // num_workers))
    pool = Pool(num_workers)
    try:
        pending = [
            pool.apply_async(
                create_ext_component_function_i,
                # Clamp the last chunk so it never runs past the input list.
                args=(dim, offset, min(chunk_size, total - offset), evaluated_f_bool_func_lst),
            )
            for offset in range(0, total, chunk_size)
        ]
        pool.close()
        pool.join()
        join_results = [task.get() for task in pending]
    finally:
        # Make sure no worker outlives this call, even after an error.
        pool.terminate()
    print("+".join(join_results))
    return 0
if __name__ == '__main__':
    # Load the list of evaluated values from disk and build the expression
    # for 24 variables.
    dim = 24
    evaluated_f_bool_func_lst = load("evaluated_f_bool_func_lst.obj")
    create_ext_component_function(dim, evaluated_f_bool_func_lst)
I tried to run a very simple multiprocessing code, but the code is still serially processed.
I have tried to run it on Mac(macOS 10.13) and Linux(Ubuntu 18.04) with python 2 and 3, but in both environments I had the same problem.
The function _process has to receive numpy arrays as arguments, so I decided to use multiprocessing.Process instead of multiprocessing.Pool.map() or multiprocessing.Pool.apply_async(), because pickling breaks when pool.map() is used inside a class. https://stackoverflow.com/a/21345308/4755986
import time
from multiprocessing import Process, Queue
import numpy as np
class model:
    """Runs a CPU-bound loop in child processes and collects their results."""

    def __init__(self):
        # Reference point for the start/end times printed by work().
        self.start = time.time()
        self.jobs = []
        self.results = []

    def _process(self, x, y, z):
        """Return the sum of 0 .. 10**8 - 1; x, y and z are not used."""
        total = 0
        for value in range(10**8):
            total = value + total
        return total

    def work(self, X, Y, Z, result_queue):
        """Run ``_process``, put its result on ``result_queue``, print timings."""
        began = time.time() - self.start
        outcome = self._process(X, Y, Z)
        result_queue.put(outcome)
        print(outcome)
        finished = time.time() - self.start
        print( 'start time: ', began)
        print('end time:', finished)

    def fit(self, num):
        """Start ``num`` child processes and return the collected results."""
        for index in range(num):
            X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
            result_queue = Queue()
            child = Process(target=self.work, args=(X, Y, Z, result_queue))
            self.jobs.append(child)
            child.start()
            print( 'ChildProcess...', index)
            # Wait for this child's result before the next loop iteration.
            self.results.append(result_queue.get())
        for child in self.jobs:
            child.join()
            child.close()
        return self.results
if __name__ == '__main__':
    # Guard the entry point: when child processes import this module (as the
    # "spawn" start method does), they must not start the run again.
    R = model()
    k = R.fit(10)
    print(k)
The start and end time of each process are printed, and the second process only starts after the first one has finished. This is strange, because each process should automatically be assigned to a different core and run in parallel.
result = result_queue.get()
result_queue.get() will block if it is empty. An item will only be added when a process finishes, hence the next process will be spawned only if the previous has finished.
Below is a version that does spawn 10 processes at once. I've marked the section I've added:
import time
from multiprocessing import Process, Queue
import numpy as np
class model:
    """Runs a CPU-bound loop in several child processes at the same time.

    Instance attributes (set in ``__init__``):
        results: Values received from the children so far.
        jobs: Every ``Process`` started by ``fit``.
        start: Creation time of the instance; zero point for printed timings.
    """

    def __init__(self):
        self.results = []
        self.jobs = []
        self.start = time.time()

    def _process(self, x, y, z):
        """Return the sum of 0 .. 10**8 - 1; x, y and z are not used."""
        j = 0
        for i in range(10**8):
            j = i + j
        return j

    def work(self, X, Y, Z, result_queue):
        """Run ``_process``, put its result on ``result_queue``, print timings."""
        start = time.time() - self.start
        result = self._process(X, Y, Z)
        result_queue.put(result)
        print(result)
        end = time.time() - self.start
        print( 'start time: ', start)
        print('end time:', end)

    def fit(self, num):
        """Start ``num`` children in parallel and gather one result from each.

        Args:
            num: Number of child processes to start.

        Returns:
            ``self.results`` with ``num`` new values appended, in the order
            in which the children delivered them.
        """
        # One queue shared by all children; with a fresh queue per child only
        # the last one would still be referenced after the loop.
        result_queue = Queue()
        for i in range(num):
            X, Y, Z = np.ones([5,5]), np.ones([3,3]), np.ones([2,2])
            p = Process(target=self.work, args = (X,Y,Z, result_queue))
            self.jobs.append(p)
            p.start()
            print( 'ChildProcess...',i)
        # A Queue cannot be iterated with a for loop, so fetch exactly one
        # item per child. Draining before join() also keeps a child from
        # waiting on data it has queued but that nobody has read yet.
        for _ in range(num):                         # <-----
            self.results.append(result_queue.get())  # <-----
        for p in self.jobs:
            p.join()
        return self.results
if __name__ == '__main__':
    # Guard the entry point: when child processes import this module (as the
    # "spawn" start method does), they must not start the run again.
    R = model()
    k = R.fit(10)
    print(k)
I've defined a (test) Function in Python, which I am using to understand the different computation time that might be required to execute the code - using normal code (without using multi-processing or multi-threading), and then implementing each of them one by one.
Function (for Basic Usage):
from random import randint as rInt
def highComputationFunction(rangeNumber):
    """Grow a large integer over ``rangeNumber`` steps and divide 10**100 by it.

    Each step doubles the running value and adds a random integer between
    ``rangeNumber**2`` and ``rangeNumber**3`` (inclusive).

    Args:
        rangeNumber: Number of steps; also sets the bounds of the random term.

    Returns:
        ``10**100`` floor-divided by the final running value.
    """
    lower = rangeNumber**2
    upper = rangeNumber**3
    count_ = 0
    for _ in range(rangeNumber):
        count_ = 2*count_ + rInt(lower, upper)
    return 10**100 // count_
Also, for Multi-Processing & Multi-Threading, I wanted to return the result of the thread to my Parent Function, so modified it like this:
from random import randint as rInt
def highComputationFunction(rangeNumber, result):
    """Grow a large integer over ``rangeNumber`` steps and divide 10**100 by it.

    Args:
        rangeNumber: Number of steps; the random term added at each step lies
            between ``rangeNumber**2`` and ``rangeNumber**3`` (inclusive).
        result: Accepted but never read or written by this function.

    Returns:
        ``10**100`` floor-divided by the final running value.
    """
    running = 0
    step = 0
    while step < rangeNumber:
        running = running*2 + rInt(rangeNumber**2, rangeNumber**3)
        step += 1
    count_ = 10**100 // running
    return count_
Looking into the CPU Usage for each of the main function as below:
import time
if __name__ == '__main__':
    # Run the three problem sizes one after another, timing each of them and
    # the run as a whole.
    startTime = time.time()
    inTime = startTime
    for rangeNumber in (10000, 100000, 1000000):
        coumputedNum = float(round(highComputationFunction(rangeNumber)//100**5000, 3))
        elapsed = round(time.time() - inTime, 2)
        print('\tFunction of {} Executed in: {} seconds. Result = {}'.format(rangeNumber, elapsed, coumputedNum))
        # The next size is timed from here.
        inTime = time.time()
    print('Total Execution Time: {}'.format(round(time.time() - startTime, 2)))
This was executed in approximately 46 Seconds in Total. One output is as Below:
# python understandComputation.py
# Function of 10000 Executed in: 0.03 seconds. Result = 0.0
# Function of 100000 Executed in: 0.91 seconds. Result = 0.0
# Function of 1000000 Executed in: 45.49 seconds. Result = 0.0
# Total Execution Time: 46.44
Executed the same thing with Multi-Threading:
import time
import threading
if __name__ == '__main__':
    # Start one thread per problem size, wait for all of them, then report
    # the total elapsed time.
    startTime = time.time()
    result_ = 0
    threadList = []
    for size in (10000, 100000, 1000000):
        curThread = threading.Thread(target=highComputationFunction, args=(size, result_))
        curThread.start()
        print('\tThread for {} Started.'.format(size))
        threadList.append(curThread)
        # Adds result_ to itself; it starts at 0, so it stays 0.
        result_ += result_
    for started in threadList:
        started.join()
    elapsed = round(time.time() - startTime, 2)
    print('Total Function Executed in: {} seconds. Result = {}'.format(elapsed, result_))
For Multi-Processing:
import time
import multiprocessing
if __name__ == '__main__':
    # Start one process per problem size, wait for all of them, then report
    # the total elapsed time.
    startTime = time.time()
    result_ = 0
    procList = []
    for size in (10000, 100000, 1000000):
        curProc = multiprocessing.Process(target=highComputationFunction, args=(size, result_))
        curProc.start()
        print('\tProcess for {} Started.'.format(size))
        procList.append(curProc)
        # Adds result_ to itself; it starts at 0, so it stays 0.
        result_ += result_
    for started in procList:
        started.join()
    elapsed = round(time.time() - startTime, 2)
    print('Total Function Executed in: {} seconds. Result = {}'.format(elapsed, result_))
With both of these implementations, producing the output took longer than expected:
# python understandComputation.py
# Thread for 10000 Started.
# Thread for 100000 Started.
# Thread for 1000000 Started.
# Total Function Executed in: 47.04 seconds. Result = 0
# python understandComputation.py
# Process for 10000 Started.
# Process for 100000 Started.
# Process for 1000000 Started.
# Total Function Executed in: 47.21 seconds. Result = 0
Please tell me whether something is wrong with my implementation. I expected the multi-threading and multi-processing versions to finish in less than 45.5 seconds, which is the time the serial code needs for the 1000000 case alone, but I am not getting that result.
I have following script:
import multiprocessing

import numpy as np

max_number = 100000
minimums = np.full((max_number), np.inf, dtype=np.float32)
data = np.zeros((max_number, 128, 128, 128), dtype=np.uint8)


def worker(array, start, end):
    """Process entries ``start`` .. ``end - 1`` of ``array``.

    Args:
        array: Array whose first axis is indexed by entry number.
        start: First entry to process.
        end: One past the last entry to process.
    """
    for in_idx in range(start, end):
        # Index the array directly with the absolute entry number; slicing
        # [start:end] first and then indexing with in_idx runs past the end
        # of the slice for every chunk after the first.
        # compute something using this array -- min() is a placeholder
        # reduction; TODO confirm the intended computation.
        value = array[in_idx].min()
        minimums[in_idx] = value


def main():
    """Split the first 1000 entries across five worker processes."""
    jobs = []
    num_jobs = 5
    for i in range(num_jobs):
        start = int(i * (1000 / num_jobs))
        end = int(start + (1000 / num_jobs))
        # worker() takes the array as its first argument, so pass it along.
        p = multiprocessing.Process(name=('worker_' + str(i)), target=worker, args=(data, start, end))
        jobs.append(p)
        p.start()
    for proc in jobs:
        proc.join()
    # NOTE(review): each child is a separate process, so its writes to
    # `minimums` are presumably not visible here -- confirm and use shared
    # memory if the parent needs the values.
    print(jobs)


# The entry point must come after the definitions it uses.
if __name__ == '__main__':
    main()
How can I ensure that the numpy array is global and can be accessed by each worker? Each worker uses a different part of the numpy array
import numpy as np
import multiprocessing as mp
# Result grid that callback_function writes into.
ar = np.zeros((5,5))


def callback_function(result):
    """Store one worker result in ``ar``.

    Args:
        result: ``(x, y, data)`` triple; ``data`` is written to ``ar[x, y]``.
    """
    row, col, value = result
    ar[row, col] = value
def worker(num):
    """Return ``(num, num, ar[num, num] + 3)`` for the diagonal cell ``num``."""
    return num, num, ar[num, num] + 3
def apply_async_with_callback():
    """Run ``worker`` for indices 0-4 in a pool and record results via callback.

    Each finished task hands its return value to ``callback_function``. The
    function returns once every task has completed.

    Raises:
        Exception: Whatever a worker task raised, re-raised by ``get()``.
    """
    pool = mp.Pool(processes=5)
    pending = [
        pool.apply_async(worker, args=(i,), callback=callback_function)
        for i in range(5)
    ]
    pool.close()
    pool.join()
    # Without get() an exception raised inside a worker is dropped silently.
    for task in pending:
        task.get()
    # Function-call form prints the same text on Python 2 and Python 3.
    print("Multiprocessing done!")
if __name__ == '__main__':
    # Rebind the module-level `ar` to an array of ones before the pool is
    # started; this assignment replaces the zeros array created above.
    ar = np.ones((5,5))
    apply_async_with_callback()
Explanation: You set up your data array, your worker function and your callback function. The number of processes in the pool determines the number of independent workers, and each worker can handle more than one task. The callback writes each result back to the array.
The `if __name__ == '__main__':` guard prevents the lines that follow it from being run whenever the module is imported.
I'm experimenting with multiprocessing in Python. I know that it can be slower than serial computation; that is not the point of my post.
I'm just wondering why a single-process pool is faster than the serial computation of my basic problem. Shouldn't these times be the same?
Here is the code:
import time
import multiprocessing as mp
import matplotlib.pyplot as plt
def func(x):
    """Return ``x`` multiplied by itself twice (its cube)."""
    squared = x*x
    return squared*x
def multi_proc(nb_procs):
    """Time creating a pool of ``nb_procs`` workers and submitting the map job.

    The asynchronous result is not waited on, so the returned duration covers
    only the pool creation and the ``map_async`` call itself.

    Args:
        nb_procs: Number of worker processes in the pool.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    began = time.time()
    workers = mp.Pool(processes=nb_procs)
    workers.map_async(func, range(1, 10000000))
    return time.time() - began
def single_core():
    """Time applying ``func`` to 1 .. 9999999 in this process.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    began = time.time()
    # The list is built and discarded; only the elapsed time is of interest.
    _ = [func(value) for value in range(1, 10000000)]
    return time.time() - began
if __name__ == '__main__':
    # Time the serial run once, then the pool version for 1 to 11 processes,
    # and plot both series against the number of processes.
    print('single core computation')
    sc_constant_time = single_core()
    print('{} secs'.format(sc_constant_time))

    mc_times = [0]
    for nb_procs in range(1, 12):
        print('computing for {} processes...'.format(nb_procs))
        time_elapsed = multi_proc(nb_procs)
        print('{} secs'.format(time_elapsed))
        mc_times.append(time_elapsed)

    # Flat reference line: the single-core time repeated for every x value.
    sc_times = [sc_constant_time] * len(mc_times)

    plt.plot(sc_times, 'r--')
    plt.plot(mc_times, 'b--')
    plt.xlabel('nb procs')
    plt.ylabel('time (s)')
    plt.show()
And the plot of times per number of processes (red = serial computation, blue = multiprocessing):
EDIT 1:
I modified my code as Sidhnarth Gupta indicated, and here is the new code I have. I changed my func for no reason.
import time
import multiprocessing as mp
import matplotlib.pyplot as plt
import random
def func(x):
    """Return a random letter from 'a' to 'g'; ``x`` is ignored."""
    letters = 'abcdefg'
    return random.choice(letters)
def multi_proc(nb_procs, nb_iter):
    """Time mapping ``func`` over ``range(1, nb_iter)`` with a process pool.

    Args:
        nb_procs: Number of worker processes in the pool.
        nb_iter: Exclusive upper bound of the mapped range.

    Returns:
        Elapsed wall-clock time in seconds, covering pool creation and the
        complete map (the result is waited for with ``get()``).
    """
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    try:
        pool.map_async(func, range(1, nb_iter)).get()
        toc = time.time()
    finally:
        # Shut the pool down outside the timed region; otherwise every call
        # leaves its worker processes running.
        pool.terminate()
        pool.join()
    return toc-tic
def single_core(nb_iter):
    """Time applying ``func`` to ``range(1, nb_iter)`` in this process.

    Args:
        nb_iter: Exclusive upper bound of the range.

    Returns:
        Elapsed wall-clock time in seconds.
    """
    began = time.time()
    # The list is built and discarded; only the elapsed time is of interest.
    _ = [func(value) for value in range(1, nb_iter)]
    return time.time() - began
if __name__ == '__main__':
    # configure
    nb_iter = 100000
    max_procs = 16
    # Both series start with a 0 placeholder so that list index == nb procs.
    sc_times = [0]
    mc_times = [0]
    # multi proc calls
    for nb_procs in range(1, max_procs):
        print('computing for {} processes...'.format(nb_procs))
        time_elapsed = (multi_proc(nb_procs, nb_iter))
        print('{} secs'.format(time_elapsed))
        mc_times.append(time_elapsed)
    # single proc call
    print('single core computation')
    for nb in range(1, len(mc_times)):
        print('{}...'.format(nb))
        sc_times.append(single_core(nb_iter))
    # average time: skip the leading placeholder, which is not a measurement
    # and would pull the average down.
    measured = sc_times[1:]
    average_time = sum(measured)/len(measured) if measured else 0.0
    print('average time on single core: {} secs'.format(average_time))
    # plot
    plt.plot(sc_times, 'r--')
    plt.plot(mc_times, 'b--')
    plt.xlabel('nb procs')
    plt.ylabel('time (s)')
    plt.show()
Here is the new plot I have:
I think I can now say that I have increased my program's speed by using multiprocessing.
Your current code for timing the multiprocessing version actually measures only the time it takes to submit the task to the pool. The processing itself happens asynchronously, without blocking the calling thread.
I tried your program with following changes:
def multi_proc(nb_procs):
    """Time mapping ``func`` over 1 .. 9999999 with ``map_async(...).get()``.

    Args:
        nb_procs: Number of worker processes in the pool.

    Returns:
        Elapsed wall-clock time in seconds, covering pool creation and the
        complete map.
    """
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    try:
        # get() blocks until every result is available.
        pool.map_async(func, range(1, 10000000)).get()
        toc = time.time()
    finally:
        # Shut the pool down outside the timed region so repeated calls do
        # not pile up worker processes.
        pool.terminate()
        pool.join()
    return toc-tic
and
def multi_proc(nb_procs):
    """Time mapping ``func`` over 1 .. 9999999 with the blocking ``map``.

    Args:
        nb_procs: Number of worker processes in the pool.

    Returns:
        Elapsed wall-clock time in seconds, covering pool creation and the
        complete map.
    """
    tic = time.time()
    pool = mp.Pool(processes=nb_procs)
    try:
        pool.map(func, range(1, 10000000))
        toc = time.time()
    finally:
        # Shut the pool down outside the timed region so repeated calls do
        # not pile up worker processes.
        pool.terminate()
        pool.join()
    return toc-tic
Both of them take significantly more time than the serialised computation.
Also, when creating such graphs, you should call the single_core() function afresh for every data point instead of plotting the same value multiple times. You will see significant variance in the time it takes.