I am trying my hands on python multiprocessing. I want a couple of processes which are independent to each other to run in parallel and as they return check if the process was successful or not using ApplyAsync.successful() utility. However when I call successful in the callback to my subprocess the script hangs.
import multiprocessing as mp
import time
result_map = {}
def foo_pool(x):
time.sleep(2)
print x
return x
result_list = []
def log_result(result):
print result_map[result].successful() #hangs
result_list.append(result)
def apply_async_with_callback():
pool = mp.Pool()
for i in range(10):
result_map[i] = pool.apply_async(foo_pool, args = (i, ), callback = log_result)
pool.close()
pool.join()
print(result_list)
if __name__ == '__main__':
apply_async_with_callback()
You don't need to check successful() because the callback is only called when the result was successful.
Following is the relevant code (multiprocessing/pool.py - AsyncResult)
def _set(self, i, obj):
self._success, self._value = obj
if self._callback and self._success: # <-----
self._callback(self._value) # <-----
self._cond.acquire()
try:
self._ready = True
self._cond.notify()
finally:
self._cond.release()
del self._cache[self._job]
Related
I have been trying to get my code to work for many days,
I am desperate.
I've scoured the internet, but I still can't find it.
I have a text file encoded in "latin-1" of 9GB -> 737 022 387 lines, each line contains a string.
I would like to read each line and send them in an http PUT request that waits for a response, and returns TRUE or FALSE if the response is 200 or 400
The PUT request takes about 1 to 3 seconds, so to speed up the processing time I would like to use either a Thread or a multiprocessing.
To start, I simulate my PUT request with a sleep of 3 seconds.
and even that I can't get it to work
This code split my string into char, i don't know why...
from multiprocessing import Pool
from time import sleep
def process_line(line):
sleep(3)
print(line)
return True
if __name__ == "__main__":
pool = Pool(2)
peon = open(r'D:\txtFile',encoding="latin-1")
for line in peon:
res = pool.map(process_line,line )
print(res)
This give error : TypeError: process_line() takes 1 positional argument but 17 were given
import multiprocessing
from multiprocessing import Pool
from time import sleep
def process_line(line):
sleep(3)
print(line)
return True
if __name__ == "__main__":
pool = Pool(2)
with open(r"d:\txtFile",encoding="latin-1") as file:
res = pool.apply(process_line,file.readline() )
print(res)
that : Crash the computer
from multiprocessing import Pool
from time import sleep
def process_line(line):
sleep(3)
print(line)
return True
if __name__ == "__main__":
pool = Pool(2)
peon = open(r'D:\txtFile',encoding="latin-1")
for line in peon:
res = pool.map(process_line,peon )
print(res)
Although the problem seems unrealistic though. shooting 737,022,387 requests! calculate how many months it'll take from single computer!!
Still, Better way to do this task is to read line by line from file in a separate thread and insert into a queue. And then multi-process the queue.
Solution 1:
from multiprocessing import Queue, Process
from threading import Thread
from time import sleep
urls_queue = Queue()
max_process = 4
def read_urls():
with open('urls_file.txt', 'r') as f:
for url in f:
urls_queue.put(url.strip())
print('put url: {}'.format(url.strip()))
# put DONE to tell send_request_processor to exit
for i in range(max_process):
urls_queue.put("DONE")
def send_request(url):
print('send request: {}'.format(url))
sleep(1)
print('recv response: {}'.format(url))
def send_request_processor():
print('start send request processor')
while True:
url = urls_queue.get()
if url == "DONE":
break
else:
send_request(url)
def main():
file_reader_thread = Thread(target=read_urls)
file_reader_thread.start()
procs = []
for i in range(max_process):
p = Process(target=send_request_processor)
procs.append(p)
p.start()
for p in procs:
p.join()
print('all done')
# wait for all tasks in the queue
file_reader_thread.join()
if __name__ == '__main__':
main()
Demo: https://onlinegdb.com/Elfo5bGFz
Solution 2:
You can use tornado asynchronous networking library
from tornado import gen
from tornado.ioloop import IOLoop
from tornado.queues import Queue
q = Queue(maxsize=2)
async def consumer():
async for item in q:
try:
print('Doing work on %s' % item)
await gen.sleep(0.01)
finally:
q.task_done()
async def producer():
with open('urls_file.txt', 'r') as f:
for url in f:
await q.put(url)
print('Put %s' % item)
async def main():
# Start consumer without waiting (since it never finishes).
IOLoop.current().spawn_callback(consumer)
await producer() # Wait for producer to put all tasks.
await q.join() # Wait for consumer to finish all tasks.
print('Done')
# producer and consumer can run in parallel
IOLoop.current().run_sync(main)
Using method multiprocessing.pool.imap is a step in the right direction but the problem is that with so much input you will be feeding the input task queue faster than the processing pool can take the tasks off the queue and return results. Consequently, the task queue will continue to grow and you will exhaust memory. What is needed is a way to "throttle" method imap so that it blocks once the task queue size has N tasks on it. I think a reasonable value for N as a default is twice the pool size to ensure that when a pool process completes work on a task there will be no delay for it to find another task to work on. Hence we create classes BoundedQueueProcessPool (multiprocessing) and BoundedQueueThreadPool (multithreading):
import multiprocessing.pool
import multiprocessing
import threading
class ImapResult():
def __init__(self, semaphore, result):
self._semaphore = semaphore
self.it = result.__iter__()
def __iter__(self):
return self
def __next__(self):
try:
elem = self.it.__next__()
self._semaphore.release()
return elem
except StopIteration:
raise
except:
self._semaphore.release()
raise
class BoundedQueuePool:
def __init__(self, limit, semaphore):
self._limit = limit
self._semaphore = semaphore
def release(self, result, callback=None):
self._semaphore.release()
if callback:
callback(result)
def apply_async(self, func, args=(), kwds={}, callback=None, error_callback=None):
self._semaphore.acquire()
callback_fn = self.release if callback is None else lambda result: self.release(result, callback=callback)
error_callback_fn = self.release if error_callback is None else lambda result: self.release(result, callback=callback)
return super().apply_async(func, args, kwds, callback=callback_fn, error_callback=error_callback_fn)
def imap(self, func, iterable, chunksize=1):
def new_iterable(iterable):
for elem in iterable:
self._semaphore.acquire()
yield elem
if chunksize > self._limit:
raise ValueError(f'chunksize argument exceeds {self._limit}')
result = super().imap(func, new_iterable(iterable), chunksize)
return ImapResult(self._semaphore, result)
def imap_unordered(self, func, iterable, chunksize=1):
def new_iterable(iterable):
for elem in iterable:
self._semaphore.acquire()
yield elem
if chunksize > self._limit:
raise ValueError(f'chunksize argument exceeds {self._limit}')
result = super().imap_unordered(func, new_iterable(iterable), chunksize)
return ImapResult(self._semaphore, result)
class BoundedQueueProcessPool(BoundedQueuePool, multiprocessing.pool.Pool):
def __init__(self, *args, max_waiting_tasks=None, **kwargs):
multiprocessing.pool.Pool.__init__(self, *args, **kwargs)
if max_waiting_tasks is None:
max_waiting_tasks = self._processes
elif max_waiting_tasks < 0:
raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
limit = self._processes + max_waiting_tasks
BoundedQueuePool.__init__(self, limit, multiprocessing.BoundedSemaphore(limit))
class BoundedQueueThreadPool(BoundedQueuePool, multiprocessing.pool.ThreadPool):
def __init__(self, *args, max_waiting_tasks=None, **kwargs):
multiprocessing.pool.ThreadPool.__init__(self, *args, **kwargs)
if max_waiting_tasks is None:
max_waiting_tasks = self._processes
elif max_waiting_tasks < 0:
raise ValueError(f'Invalid negative max_waiting_tasks value: {max_waiting_tasks}')
limit = self._processes + max_waiting_tasks
BoundedQueuePool.__init__(self, limit, threading.BoundedSemaphore(limit))
#######################################################################
from time import sleep
def process_line(line):
sleep(3)
# the lines already have line end characters:
print(line, end='')
return True
if __name__ == "__main__":
pool = BoundedQueueProcessPool(2)
with open("test.txt") as file:
for res in pool.imap(process_line, file):
#print(res)
pass
pool.close()
pool.join()
According to https://docs.python.org/3/library/multiprocessing.html
multiprocessing forks (for *nix) to create a worker process to execute tasks. We can verify this by setting up a global variable in a module prior to the fork.
If the worker function imports that module and finds the variable present, then the process memory has been copied. And so it is:
import os
def f(x):
import sys
return sys._mypid # <<< value is returned by subprocess!
def set_state():
import sys
sys._mypid = os.getpid()
def g():
from multiprocessing import Pool
pool = Pool(4)
try:
for z in pool.imap(f, range(1000)):
print(z)
finally:
pool.close()
pool.join()
if __name__=='__main__':
set_state()
g()
However, if things work this way, what business does multiprocessing have in serializing the work function, f?
In this example:
import os
def set_state():
import sys
sys._mypid = os.getpid()
def g():
def f(x):
import sys
return sys._mypid
from multiprocessing import Pool
pool = Pool(4)
try:
for z in pool.imap(f, range(1000)):
print(z)
finally:
pool.close()
pool.join()
if __name__=='__main__':
set_state()
g()
we get:
AttributeError: Can't pickle local object 'g.<locals>.f'
Stackoverflow and the internet is full of ways to work around this. (Python's standard pickle function can handle functions, but not function with closure data).
But why do we get here? A copy-on-write version of f is in the forked process's memory. Why does it need to be serialized at all?
Derp -- it has to be this way because:
pool = Pool(4) <<< processes created here
for z in pool.imap(f, range(1000)): <<< reference to function
FYI... anyone wanting to fork, where the new process has access to the function (and thereby avoids serializing the function), can follow this pattern:
import collections
import multiprocessing as mp
import os
import pickle
import threading
_STATUS_DATA = 0
_STATUS_ERR = 1
_STATUS_POISON = 2
Message = collections.namedtuple(
"Message",
["status",
"payload",
"sequence_id"
]
)
def parallel_map(
target,
args,
num_processes,
inq_maxsize=None,
outq_maxsize=None,
serialize=pickle.dumps,
deserialize=pickle.loads,
start_method="fork",
preserve_order=True,
):
"""
:param target: Target function
:param args: Iterable of single parameter arguments for target.
:param num_processes: Number of processes.
:param inq_maxsize:
:param outq_maxsize:
:param serialize:
:param deserialize:
:param start_method:
:param preserve_order: If true result are returns in the order received by args. Otherwise,
first result is returned first
:return:
"""
if inq_maxsize is None: inq_maxsize=10*num_processes
if outq_maxsize is None: outq_maxsize=10*num_processes
inq = mp.Queue(maxsize=inq_maxsize)
outq = mp.Queue(maxsize=outq_maxsize)
poison = serialize(Message(_STATUS_POISON, None, -1))
deserialize(poison) # Test
def work():
while True:
obj = inq.get()
# print("{} - GET .. OK".format(os.getpid()))
# inq.task_done()
try:
msg = deserialize(obj)
assert isinstance(msg, Message)
if msg.status==_STATUS_POISON:
outq.put(serialize(Message(_STATUS_POISON,None,msg.sequence_id)))
# print("{} - RETURN POISON .. OK".format(os.getpid()))
return
else:
args, kw = msg.payload
result = target(*args,**kw)
outq.put(serialize(Message(_STATUS_DATA,result,msg.sequence_id)))
except Exception as e:
try:
outq.put(serialize(Message(_STATUS_ERR,e,msg.sequence_id)))
except Exception as e2:
try:
outq.put(serialize(Message(_STATUS_ERR,None,-1)))
# outq.put(serialize(1,Exception("Unable to serialize response")))
# TODO. Log exception
except Exception as e3:
pass
if start_method == "thread":
_start_method = threading.Thread
else:
_start_method = mp.get_context('fork').Process
processes = [
_start_method(
target=work,
name="parallel_map.work"
)
for _ in range(num_processes)]
for p in processes:
p.start()
quitting = []
def quit_processes():
if not quitting:
quitting.append(1)
# Send poison pills - kill child processes
for _ in range(num_processes):
inq.put(poison)
nsent = [0]
def send():
# Send the data
for seq_id, arg in enumerate(args):
obj = ((arg,), {})
inq.put(serialize(Message(_STATUS_DATA, obj, seq_id)))
nsent[0] += 1
quit_processes()
# Publish
sender = threading.Thread(
target=send,
name="parallel_map.sender",
daemon=True)
sender.start()
try:
# Consume
nquit = [0]
buffer = {}
nyielded = 0
while True:
result = outq.get() # Waiting here
# outq.task_done()
msg = deserialize(result)
assert isinstance(msg, Message)
if msg.status == _STATUS_POISON:
nquit[0]+=1
# print(">>> QUIT ACK {}".format(nquit[0]))
if nquit[0]>=num_processes:
break
else:
assert msg.sequence_id>=0
if preserve_order:
buffer[msg.sequence_id] = msg
while True:
if nyielded not in buffer:
break
msg = buffer.pop(nyielded)
nyielded += 1
if msg.status==_STATUS_ERR:
if isinstance(msg.payload, Exception):
raise msg.payload
else:
raise Exception("Unexpected exception")
else:
assert msg.status==_STATUS_DATA
yield msg.payload
else:
if msg.status==_STATUS_ERR:
if isinstance(msg.payload, Exception):
raise msg.payload
else:
raise Exception("Unexpected exception")
else:
assert msg.status==_STATUS_DATA
yield msg.payload
# if nyielded == nsent:
# break
except Exception as e:
raise
finally:
if not quitting:
quit_processes()
sender.join()
for p in processes:
p.join()
def f(x):
time.sleep(0.01)
if x ==-1:
raise Exception("Boo")
return x
Usage:
def f(x):
time.sleep(0.01)
if x ==-1:
raise Exception("Boo")
return x
for result in parallel_map(target=f, <<< not serialized
args=range(100),
num_processes=8,
start_method="fork"):
pass
... with that caveat: for every thread you have in your program when you fork, a puppy dies.
Here is my prime factorization program,i added a callback function in pool.apply_async(findK, args=(N,begin,end)),a message prompt out prime factorization is over when factorization is over,it works fine.
import math
import multiprocessing
def findK(N,begin,end):
for k in range(begin,end):
if N% k == 0:
print(N,"=" ,k ,"*", N/k)
return True
return False
def prompt(result):
if result:
print("prime factorization is over")
def mainFun(N,process_num):
pool = multiprocessing.Pool(process_num)
for i in range(process_num):
if i ==0 :
begin =2
else:
begin = int(math.sqrt(N)/process_num*i)+1
end = int(math.sqrt(N)/process_num*(i+1))
pool.apply_async(findK, args=(N,begin,end) , callback = prompt)
pool.close()
pool.join()
if __name__ == "__main__":
N = 684568031001583853
process_num = 16
mainFun(N,process_num)
Now i want to change the callback function in apply_async,to change prompt into a shutdown function to kill all other process.
def prompt(result):
if result:
pool.terminate()
The pool instance is not defined in prompt scope or passed into prompt.
pool.terminate() can't work in prompt function.
How to pass multiprocessing.Pool instance to apply_async'callback function ?
(I have made it done in class format,just to add a class method and call self.pool.terminate can kill all other process,
how to do the job in function format?)
if not set pool as global variable, can pool be passed into callback function?
Passing extra arguments to the callback function is not supported. Yet you have plenty of elegant ways to workaround that.
You can encapsulate your pool logic into an object:
class Executor:
def __init__(self, process_num):
self.pool = multiprocessing.Pool(process_num)
def prompt(self, result):
if result:
print("prime factorization is over")
self.pool.terminate()
def schedule(self, function, args):
self.pool.apply_async(function, args=args, callback=self.prompt)
def wait(self):
self.pool.close()
self.pool.join()
def main(N,process_num):
executor = Executor(process_num)
for i in range(process_num):
...
executor.schedule(findK, (N,begin,end))
executor.wait()
Or you can use the concurrent.futures.Executor implementation which returns a Future object. You just append the pool to the Future object before setting the callback.
def prompt(future):
if future.result():
print("prime factorization is over")
future.pool_executor.shutdown(wait=False)
def main(N,process_num):
executor = concurrent.futures.ProcessPoolExecutor(max_workers=process_num)
for i in range(process_num):
...
future = executor.submit(findK, N,begin,end)
future.pool_executor = executor
future.add_done_callback(prompt)
You can simply define a local close function as a callback:
import math
import multiprocessing
def findK(N, begin, end):
for k in range(begin, end):
if N % k == 0:
print(N, "=", k, "*", N / k)
return True
return False
def mainFun(N, process_num):
pool = multiprocessing.Pool(process_num)
def close(result):
if result:
print("prime factorization is over")
pool.terminate()
for i in range(process_num):
if i == 0:
begin = 2
else:
begin = int(math.sqrt(N) / process_num * i) + 1
end = int(math.sqrt(N) / process_num * (i + 1))
pool.apply_async(findK, args=(N, begin, end), callback=close)
pool.close()
pool.join()
if __name__ == "__main__":
N = 684568031001583853
process_num = 16
mainFun(N, process_num)
You can also use a partial function from functool, with
import functools
def close_pool(pool, results):
if result:
pool.terminate()
def mainFun(N, process_num):
pool = multiprocessing.Pool(process_num)
close = funtools.partial(close_pool, pool)
....
You need to have pool end up in prompt's environment. One possibility is to move pool into the global scope (though this isn't really best-practice). This appears to work:
import math
import multiprocessing
pool = None
def findK(N,begin,end):
for k in range(begin,end):
if N% k == 0:
print(N,"=" ,k ,"*", N/k)
return True
return False
def prompt(result):
if result:
print("prime factorization is over")
pool.terminate()
def mainFun(N,process_num):
global pool
pool = multiprocessing.Pool(process_num)
for i in range(process_num):
if i ==0 :
begin =2
else:
begin = int(math.sqrt(N)/process_num*i)+1
end = int(math.sqrt(N)/process_num*(i+1))
pool.apply_async(findK, args=(N,begin,end) , callback = prompt)
pool.close()
pool.join()
if __name__ == "__main__":
N = 684568031001583853
process_num = 16
mainFun(N,process_num)
I swear I saw the following in an example somewhere, but now I can't find that example and this isn't working. The __call__ class function never gets called.
EDIT: Code updated
pool.map appears to start the QueueWriter instance and the __call__ function is reached. However, the workers never seem to start or at least no results are pulled from the queue. Is my queue set up the right way? Why do the workers not fire off?
import multiprocessing as mp
import os
import random
class QueueWriter(object):
def __init__(self, **kwargs):
self.grid = kwargs.get("grid")
self.path = kwargs.get("path")
def __call__(self, q):
print self.path
log = open(self.path, "a", 1)
log.write("QueueWriter called.\n")
while 1:
res = q.get()
if res == 'kill':
self.log.write("QueueWriter received 'kill' message. Closing Writer.\n")
break
else:
self.log.write("This is where I'd write: {0} to grid file.\n".format(res))
log.close()
log = None
class Worker(object):
def __init__(self, **kwargs):
self.queue = kwargs.get("queue")
self.grid = kwargs.get("grid")
def __call__(self, idx):
res = self.workhorse(self, idx)
self.queue.put((idx,res))
return res
def workhorse(self,idx):
#in reality a fairly complex operation
return self.grid[idx] ** self.grid[idx]
if __name__ == '__main__':
# log = open(os.path.expanduser('~/minimal.log'), 'w',1)
path = os.path.expanduser('~/minimal.log')
pool = mp.Pool(mp.cpu_count())
manager = mp.Manager()
q = manager.Queue()
grid = [random.random() for _ in xrange(10000)]
# in actuality grid is a shared resource, read by Workers and written
# to by QueueWriter
qWriter = QueueWriter(grid=grid, path=path)
watcher = pool.map(qWriter, (q,),1)
wrkr = Worker(queue=q,grid=grid)
result = pool.map(wrkr, range(10000), 1)
result.get()
q.put('kill')
pool.close()
pool.join()
So the log does indeed print the initialization message, but then __call__ function is never called. Is this one of those pickling issues I've seen discussed so often? I've found answers about class member functions, but what about class instances?
At the gentle and patient prodding of martineau (thanks!) I think I've ironed out the problems. I have yet to apply it to my original code, but it is working in the example above and I'll start new questions for future implementation problems.
So in addition to changing where in the code the target file (the log, in this example) gets opened, I also started the QueueWriter instance as a single multiprocessing process rather than using pool.map. As martineau pointed out the map call blocks until the qWriter.__call__() returns and this prevented the workers from being called.
There were some other bugs in the code above, but those were incidental and fixed below:
import multiprocessing as mp
import os
import random
class QueueWriter(object):
def __init__(self, **kwargs):
self.grid = kwargs.get("grid")
self.path = kwargs.get("path")
def __call__(self, q):
print self.path
log = open(self.path, "a", 1)
log.write("QueueWriter called.\n")
while 1:
res = q.get()
if res == 'kill':
log.write("QueueWriter received 'kill' message. Closing Writer.\n")
break
else:
log.write("This is where I'd write: {0} to grid file.\n".format(res))
log.close()
log = None
class Worker(object):
def __init__(self, **kwargs):
self.queue = kwargs.get("queue")
self.grid = kwargs.get("grid")
def __call__(self, idx):
res = self.workhorse(idx)
self.queue.put((idx,res))
return res
def workhorse(self,idx):
#in reality a fairly complex operation
return self.grid[idx] ** self.grid[idx]
if __name__ == '__main__':
# log = open(os.path.expanduser('~/minimal.log'), 'w',1)
path = os.path.expanduser('~/minimal.log')
pool = mp.Pool(mp.cpu_count())
manager = mp.Manager()
q = manager.Queue()
grid = [random.random() for _ in xrange(10000)]
# in actuality grid is a shared resource, read by Workers and written
# to by QueueWriter
qWriter = QueueWriter(grid=grid, path=path)
# watcher = pool.map(qWriter, (q,),1)
# Start the writer as a single process rather than a pool
p = mp.Process(target=qWriter, args=(q,))
p.start()
wrkr = Worker(queue=q,grid=grid)
result = pool.map(wrkr, range(10000), 1)
# result.get()
# not required for pool
q.put('kill')
pool.close()
p.join()
pool.join()
I use multiprocessing lib to test python multi process, but I meet some problems. I have test code 1:
import multiprocessing
def test(name):
print 'processing.....'
tmp = 0
for i in xrange(1000000000):
tmp += i
print 'process done'
if __name__ == '__main__':
pools = multiprocessing.Pool()
for i in xrange(2):
pools.apply_async(test)
pools.close()
pools.join()
result is:
processing
processing
done
done
Code 2:
import multiprocessing
class Test:
def test(name):
print 'processing.....'
tmp = 0
for i in xrange(1000000000):
tmp += i
print 'process done'
if __name__ == '__main__':
t = Test()
pools = multiprocessing.Pool()
for i in xrange(4):
pools.apply_async(t.test)
pools.close()
pools.join()
this result is nothing, this pools don't call t.test! I can't understand what happended. Why is this?
instead of using pool, you can simply collect the jobs in a list:
import multiprocessing
class Test(multiprocessing.Process):
def run(self):
print 'processing.....'
tmp = 0
for i in xrange(10):
tmp += i
print 'process done'
return 1
if __name__ == '__main__':
jobs = []
for i in range(5):
t = Test()
jobs.append(t)
t.start()
the list jobs will be able to tell you if the process has finished or not, ultimately giving you the same effect as using pool.
if you wanna make sure that all jobs are done:
if __name__ == '__main__':
jobs = []
for i in range(5):
t = Test()
jobs.append(t)
t.start()
not_done = any(job.is_alive() for job in jobs)
while not_done:
not_done = any(job.is_alive() for job in jobs)
print 'job all done'