I have the following piece of code to multiplex blocking generators:
import datetime
import time
import queue
import threading


def blocking1():
    while True:
        time.sleep(1)
        result = "Block1: {}".format(datetime.datetime.now())
        yield result


def blocking2():
    while True:
        time.sleep(2)
        result = "Block2: {}".format(datetime.datetime.now())
        yield result


def multiplex(generators):
    if len(generators) == 1:
        return generators[0]
    elif len(generators) > 1:
        q = queue.Queue()

        def run_one(src):
            for e in src: q.put(e)

        def run_all():
            threads = []
            for src in generators:
                t = threading.Thread(target=run_one, args=(src,))
                t.start()
                threads.append(t)
            for t in threads: t.join()
            q.put(StopIteration)

        threading.Thread(target=run_all).start()
        while True:
            e = q.get()
            if e is StopIteration:
                return
            yield e
    else:
        return []


if __name__ == "__main__":
    # tasks = [("map1: {}".format(e) for e in blocking1()), ("map2: {}".format(e) for e in blocking2())]
    tasks = [("map1: {}".format(e) for e in blocking1())]
    for e in multiplex(tasks):
        print(e)
I wanted to be clever: if there is only one generator, skip the thread spawning entirely and just return that single generator (after all, the types still match).
However, it doesn't work like that. The program terminates immediately, as if multiplex had returned an empty generator.
What's funny is that the following works (the map1... output is displayed):
import datetime
import time
import queue
import threading


def blocking1():
    while True:
        time.sleep(1)
        result = "Block1: {}".format(datetime.datetime.now())
        yield result


def blocking2():
    while True:
        time.sleep(2)
        result = "Block2: {}".format(datetime.datetime.now())
        yield result


def multiplex(generators):
    if len(generators) == 1:
        return generators[0]
    else:
        return []


if __name__ == "__main__":
    # tasks = [("map1: {}".format(e) for e in blocking1()), ("map2: {}".format(e) for e in blocking2())]
    tasks = [("map1: {}".format(e) for e in blocking1())]
    for e in multiplex(tasks):
        print(e)
The only difference is the removal of the elif part...
Could someone help me understand what is going on, please?
I'm using Python 3.5.3.
You can't (usefully) return a value from a function that also does a yield anywhere in its body (even if the return and the yield occur in separate blocks of code that can never run during the same execution of the function). If there is a yield anywhere in the function, you are writing a generator function rather than a normal one.
A good fix for this is to yield from your lone generator if you're only given one:
def multiplex(generators):
    if len(generators) == 1:
        yield from generators[0]  # because this is a generator function, we need to yield here
    elif len(generators) > 1:
        ...  # there's a yield in here causing the whole thing to be a generator function!
The problem is that you're returning a generator instead of iterating over it.
Replace
return generators[0]
with
yield from generators[0]
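To see what actually happened in the original version, here is a minimal sketch (the function and variable names are mine, for illustration): in Python 3, a return <value> inside a generator function does not hand the value back to the caller; it ends iteration and stashes the value on the resulting StopIteration.

def multiplex_sketch(generators):
    if len(generators) == 1:
        return generators[0]   # value goes onto StopIteration, not back to the caller
    yield from generators[0]   # this yield is what makes multiplex_sketch a generator function


gen = multiplex_sketch([iter(["a", "b"])])
print(list(gen))               # [] -- the outer generator finishes immediately

gen = multiplex_sketch([iter(["a", "b"])])
try:
    next(gen)
except StopIteration as stop:
    print(stop.value)          # <list_iterator ...> -- the "returned" inner iterator ends up here

That is why the single-generator branch behaved like an empty generator, while dropping the elif removed the yield and turned multiplex back into an ordinary function.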
Related
I have a multiprocessing setup that handles a long-running task by appending all calculated values to lst. It looks roughly like this:
from multiprocessing import Pool
from time import sleep


def fun(_):
    lst = []  # list that will be returned
    for i in range(200):
        lst.append(i)
        if not i % 10:
            sleep(0.1)  # 'long task', cause a KeyboardInterrupt in this time
    return lst


if __name__ == '__main__':
    master = []
    processes = 2
    for result in Pool(processes).imap_unordered(fun, range(processes)):
        master.append(result)
    print(master)
I want to be able to cause a KeyboardInterrupt and have the processes return the lists they have worked on so far, even if they are not done yet, since each iteration just adds a new sublist.
(My actual data looks roughly like lst = ([], [[], ...], [[], ...]); every empty list contains ints only, and the actual function would return lst1, lst2, lst3.)
I have tried wrapping the whole main part in try/except like so:
try:
    for result in Pool(processes).imap_unordered(fun, range(processes)):
        master.append(result)
except KeyboardInterrupt:
    # somehow retrieve the values here
    pass
However, I have not found a working solution this way.
How can I tell the processes that it's time to exit early and return their current results?
Edit to show the actual structure:
main.py:
from other import Other
from multiprocessing import Pool


class Something:
    def __init__(self):
        pass  # stuff here

    def spawner(self):
        for result in Pool(processes=self.processes).imap_unordered(self.loop, range(self.processes)):
            pass  # do stuff with the data

    def loop(self, _):
        # setup stuff
        Other(setup_stuff).start()
other.py
class Other:
    def __init__(self):
        pass  # more stuff

    def start(self):
        lst1, lst2, lst3 = [], [], []
        for _ in range(self.episodes):
            pass  # do the actual computation
        return lst1, lst2, lst3
Maybe you can use a multiprocessing.Queue instead of a list to return values: set up one queue at the beginning, and all processes will write to it.
At the end, read all values from the queue.
from time import sleep
from multiprocessing import Pool, Queue

q = None


def set_global_data(queue):
    global q
    q = queue


def fun(_):
    for i in range(200):
        q.put_nowait(i)
        if not i % 10:
            sleep(0.1)  # 'long task', cause a KeyboardInterrupt in this time
    # nothing is returned


if __name__ == "__main__":
    master = Queue()
    processes = 2
    try:
        with Pool(processes, set_global_data, (master,)) as p:
            for result in p.imap_unordered(fun, range(processes)):
                pass
    except KeyboardInterrupt:
        pass

    while not master.empty():
        v = master.get_nowait()
        print(v)
EDIT: With multiple files:
main.py
from other import Other
from multiprocessing import Pool, Queue


class Something:
    def __init__(self):
        pass  # stuff here

    def spawner(self):
        master = Queue()
        try:
            with Pool(2, Something.set_global_data, (master,)) as p:
                for _ in p.imap_unordered(self.loop, range(2)):
                    pass
        except KeyboardInterrupt:
            pass

        while not master.empty():
            v = master.get_nowait()
            print(v)

    def loop(self, _):
        # setup stuff
        Other().start()

    @staticmethod
    def set_global_data(queue):
        Other.q = queue


s = Something()
s.spawner()
other.py
from time import sleep


class Other:
    q = None

    def __init__(self):
        pass  # more stuff

    def start(self):
        for i in range(200):
            Other.q.put_nowait(i)
            if not i % 10:
                sleep(0.1)
I have the following code where I am trying to call functions with different timeouts. It might happen that the first function times out while the second one would have finished within its specified time.
import time
from concurrent.futures import ThreadPoolExecutor


def test1(a, b, c):
    time.sleep(5)
    d = a + b + c
    print(d)


def test2(a, b):
    time.sleep(5)
    d = a + b
    print(d)


with ThreadPoolExecutor(max_workers=1) as executor1:
    try:
        executor1.submit(test1, 1, 2, 3).result(timeout=1)
    except:
        executor1.shutdown(wait=False)
        print("Pass")

with ThreadPoolExecutor(max_workers=1) as executor2:
    try:
        executor2.submit(test2, 1, 2).result(timeout=8)
    except:
        executor2.shutdown(wait=False)
        print("Pass-2")
Expected Output
Pass
3
Actual Output
Pass
6
3
What I'd like is to stop the execution of the first executor as soon as there is a timeout, and then continue with the next executor.
I finally implemented it using the approach from this link. The final code is shared below:
import time
from concurrent.futures import ThreadPoolExecutor
import ctypes


def terminate_thread(thread):
    """Terminates a python thread from another thread.

    :param thread: a threading.Thread instance
    """
    if not thread.is_alive():
        return

    exc = ctypes.py_object(SystemExit)
    res = ctypes.pythonapi.PyThreadState_SetAsyncExc(
        ctypes.c_long(thread.ident), exc)
    if res == 0:
        raise ValueError("nonexistent thread id")
    elif res > 1:
        # """if it returns a number greater than one, you're in trouble,
        # and you should call it again with exc=NULL to revert the effect"""
        ctypes.pythonapi.PyThreadState_SetAsyncExc(thread.ident, None)
        raise SystemError("PyThreadState_SetAsyncExc failed")


def test1(a, b, c):
    time.sleep(5)
    d = a + b + c
    print(d)


def test2(a, b):
    time.sleep(5)
    d = a + b
    print(d)


with ThreadPoolExecutor(max_workers=1) as executor:
    try:
        executor.submit(test1, 1, 2, 3).result(timeout=1)
    except:
        executor.shutdown(wait=False)
        for t in executor._threads:
            terminate_thread(t)
        print("Pass")

with ThreadPoolExecutor(max_workers=1) as executor:
    try:
        executor.submit(test2, 1, 2).result(timeout=8)
    except:
        executor.shutdown(wait=False)
        for t in executor._threads:
            terminate_thread(t)
        print("Pass-2")
According to https://docs.python.org/3/library/multiprocessing.html
multiprocessing forks (on *nix) to create a worker process that executes tasks. We can verify this by setting up a global variable in a module prior to the fork.
If the worker function imports that module and finds the variable present, then the process memory has been copied. And so it is:
import os


def f(x):
    import sys
    return sys._mypid  # <<< value is returned by subprocess!


def set_state():
    import sys
    sys._mypid = os.getpid()


def g():
    from multiprocessing import Pool
    pool = Pool(4)
    try:
        for z in pool.imap(f, range(1000)):
            print(z)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    set_state()
    g()
However, if things work this way, what business does multiprocessing have in serializing the work function, f?
In this example:
import os


def set_state():
    import sys
    sys._mypid = os.getpid()


def g():
    def f(x):
        import sys
        return sys._mypid

    from multiprocessing import Pool
    pool = Pool(4)
    try:
        for z in pool.imap(f, range(1000)):
            print(z)
    finally:
        pool.close()
        pool.join()


if __name__ == '__main__':
    set_state()
    g()
we get:
AttributeError: Can't pickle local object 'g.<locals>.f'
Stack Overflow and the internet are full of ways to work around this. (Python's standard pickle module can handle functions, but not functions with closure data.)
But why do we get here? A copy-on-write version of f is in the forked process's memory. Why does it need to be serialized at all?
Derp -- it has to be this way because:
pool = Pool(4)                         # <<< processes created here

for z in pool.imap(f, range(1000)):    # <<< reference to function
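Spelled out a little more: the worker processes are forked the moment Pool(4) runs, before f is ever handed to imap, so afterwards f can only reach them over a pipe, which means pickling it. And pickle serializes a function by its module-qualified name rather than by value, which the (forked) worker can resolve for a module-level function but not for g.<locals>.f. A small sketch of that distinction (function names here are mine, for illustration):

import pickle


def top_level(x):
    return x


def outer():
    def local(x):
        return x
    return local


# A module-level function pickles as little more than a reference by name,
# which the worker resolves on its side after unpickling:
print(pickle.dumps(top_level))

# A function defined inside another function has no importable name:
try:
    pickle.dumps(outer())
except (AttributeError, pickle.PicklingError) as e:
    print(e)  # Can't pickle local object 'outer.<locals>.local'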
FYI... anyone wanting to fork in a way where the new process already has access to the function (and thereby avoids serializing it) can follow this pattern:
import collections
import multiprocessing as mp
import os
import pickle
import threading
import time

_STATUS_DATA = 0
_STATUS_ERR = 1
_STATUS_POISON = 2

Message = collections.namedtuple(
    "Message",
    ["status",
     "payload",
     "sequence_id"
     ]
)


def parallel_map(
        target,
        args,
        num_processes,
        inq_maxsize=None,
        outq_maxsize=None,
        serialize=pickle.dumps,
        deserialize=pickle.loads,
        start_method="fork",
        preserve_order=True,
):
    """
    :param target: Target function
    :param args: Iterable of single parameter arguments for target.
    :param num_processes: Number of processes.
    :param inq_maxsize:
    :param outq_maxsize:
    :param serialize:
    :param deserialize:
    :param start_method:
    :param preserve_order: If true, results are returned in the order given by args.
        Otherwise, results are returned as they become available.
    :return:
    """
    if inq_maxsize is None: inq_maxsize = 10 * num_processes
    if outq_maxsize is None: outq_maxsize = 10 * num_processes
    inq = mp.Queue(maxsize=inq_maxsize)
    outq = mp.Queue(maxsize=outq_maxsize)
    poison = serialize(Message(_STATUS_POISON, None, -1))
    deserialize(poison)  # Test

    def work():
        while True:
            obj = inq.get()
            # print("{} - GET .. OK".format(os.getpid()))
            # inq.task_done()
            try:
                msg = deserialize(obj)
                assert isinstance(msg, Message)
                if msg.status == _STATUS_POISON:
                    outq.put(serialize(Message(_STATUS_POISON, None, msg.sequence_id)))
                    # print("{} - RETURN POISON .. OK".format(os.getpid()))
                    return
                else:
                    args, kw = msg.payload
                    result = target(*args, **kw)
                    outq.put(serialize(Message(_STATUS_DATA, result, msg.sequence_id)))
            except Exception as e:
                try:
                    outq.put(serialize(Message(_STATUS_ERR, e, msg.sequence_id)))
                except Exception as e2:
                    try:
                        outq.put(serialize(Message(_STATUS_ERR, None, -1)))
                        # outq.put(serialize(1,Exception("Unable to serialize response")))
                        # TODO. Log exception
                    except Exception as e3:
                        pass

    if start_method == "thread":
        _start_method = threading.Thread
    else:
        _start_method = mp.get_context('fork').Process

    processes = [
        _start_method(
            target=work,
            name="parallel_map.work"
        )
        for _ in range(num_processes)]

    for p in processes:
        p.start()

    quitting = []

    def quit_processes():
        if not quitting:
            quitting.append(1)
            # Send poison pills - kill child processes
            for _ in range(num_processes):
                inq.put(poison)

    nsent = [0]

    def send():
        # Send the data
        for seq_id, arg in enumerate(args):
            obj = ((arg,), {})
            inq.put(serialize(Message(_STATUS_DATA, obj, seq_id)))
            nsent[0] += 1
        quit_processes()

    # Publish
    sender = threading.Thread(
        target=send,
        name="parallel_map.sender",
        daemon=True)
    sender.start()

    try:
        # Consume
        nquit = [0]
        buffer = {}
        nyielded = 0
        while True:
            result = outq.get()  # Waiting here
            # outq.task_done()
            msg = deserialize(result)
            assert isinstance(msg, Message)
            if msg.status == _STATUS_POISON:
                nquit[0] += 1
                # print(">>> QUIT ACK {}".format(nquit[0]))
                if nquit[0] >= num_processes:
                    break
            else:
                assert msg.sequence_id >= 0
                if preserve_order:
                    buffer[msg.sequence_id] = msg
                    while True:
                        if nyielded not in buffer:
                            break
                        msg = buffer.pop(nyielded)
                        nyielded += 1
                        if msg.status == _STATUS_ERR:
                            if isinstance(msg.payload, Exception):
                                raise msg.payload
                            else:
                                raise Exception("Unexpected exception")
                        else:
                            assert msg.status == _STATUS_DATA
                            yield msg.payload
                else:
                    if msg.status == _STATUS_ERR:
                        if isinstance(msg.payload, Exception):
                            raise msg.payload
                        else:
                            raise Exception("Unexpected exception")
                    else:
                        assert msg.status == _STATUS_DATA
                        yield msg.payload
            # if nyielded == nsent:
            #     break
    except Exception as e:
        raise
    finally:
        if not quitting:
            quit_processes()
        sender.join()
        for p in processes:
            p.join()


def f(x):
    time.sleep(0.01)
    if x == -1:
        raise Exception("Boo")
    return x
Usage:
def f(x):
    time.sleep(0.01)
    if x == -1:
        raise Exception("Boo")
    return x


for result in parallel_map(target=f,  # <<< not serialized
                           args=range(100),
                           num_processes=8,
                           start_method="fork"):
    pass
... with that caveat: for every thread you have in your program when you fork, a puppy dies.
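One small usage note on the helper above: because parallel_map also accepts start_method="thread", you can sanity-check a target function without forking at all (a sketch against the code above; the list() is just to collect the generator eagerly):

results = list(parallel_map(target=f, args=range(10),
                            num_processes=2, start_method="thread"))
print(results)  # [0, 1, ..., 9] since preserve_order defaults to True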
I have implemented a parser like this:
import multiprocessing
import time


def foo(i):
    try:
        pass  # some codes
    except Exception, e:
        print e


def worker(i):
    foo(i)
    time.sleep(i)
    return i


if __name__ == "__main__":
    pool = multiprocessing.Pool(processes=4)
    result = pool.map_async(worker, range(15))
    while not result.ready():
        print("num left: {}".format(result._number_left))
        time.sleep(1)
    real_result = result.get()
    pool.close()
    pool.join()
My parser actually finishes all the processes, but the results are not available; i.e., it's still inside the while loop, printing num left: 2. How do I stop this? And I don't actually need the value of the real_result variable.
I'm running Ubuntu 14.04, Python 2.7.
The corresponding part of my code looks like this:
async_args = ((date, kw_dict) for date in dates)
pool = Pool(processes=4)
no_rec = []


def check_for_exit(msg):
    print msg
    if last_date in msg:
        print 'Terminating the pool'
        pool.terminate()


try:
    result = pool.map_async(parse_date_range, async_args)
    while not result.ready():
        print("num left: {}".format(result._number_left))
        sleep(1)
    real_result = result.get(5)
    passed_dates = []
    for x, y in real_result:
        passed_dates.append(x)
        if y:
            no_rec.append(y[0])
    # if last_date in passed_dates:
    #     print 'Terminating the pool'
    #     pool.terminate()
    pool.close()
except:
    print 'Pool error'
    pool.terminate()
    print traceback.format_exc()
finally:
    pool.join()
My bet is that you have a faulty parse_date_range,
which causes a worker process to terminate without producing any result or Python exception.
Probably libc's exit is called by a C module/library due to a really nasty error.
This code reproduces the infinite loop you observe:
import sys
import multiprocessing
import time


def parse_date_range(i):
    if i == 5:
        sys.exit(1)  # or raise SystemExit;
                     # other exceptions are handled by the pool
    time.sleep(i/19.)
    return i


if __name__ == "__main__":
    pool = multiprocessing.Pool(4)
    result = pool.map_async(parse_date_range, range(15))
    while not result.ready():
        print("num left: {}".format(result._number_left))
        time.sleep(1)
    real_result = result.get()
    pool.close()
    pool.join()
Hope this'll help.
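If that diagnosis is right, one way to make the failure visible instead of hanging forever (a sketch of my own, reusing the names from your snippet) is to wrap the real worker so that even a SystemExit comes back to the parent as an ordinary result:

def safe_parse_date_range(arg):
    # SystemExit inherits from BaseException, not Exception, so a plain
    # "except Exception" would not stop a stray sys.exit() from killing the worker
    try:
        return ('ok', parse_date_range(arg))
    except BaseException as e:
        return ('failed', arg, repr(e))


result = pool.map_async(safe_parse_date_range, async_args)

This only helps for Python-level exits; if a C extension calls libc's exit directly, the worker process still dies without a trace and the fix has to happen inside parse_date_range itself.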
Can I run multiple threads, each running its own copy of a coroutine?
For example, if I change the threaded function from this tutorial to
@coroutine
def threaded(count, target):
    messages = Queue()

    def run_target():
        while True:
            item = messages.get()
            if item is GeneratorExit:
                target.close()
                return
            else:
                target.send(item)

    for i in xrange(count):
        Thread(target=run_target).start()

    try:
        while True:
            item = (yield)
            messages.put(item)
    except GeneratorExit:
        messages.put(GeneratorExit)
Does that really work? How do I verify whether it is working?
I think I got it fixed; I needed to change the function to something like this for it to work:
@coroutine
def _threaded(self, count, target_func):
    """
    Given a target coroutine, spawn $count threads to run copies of them. In
    order to properly use this, do not call the coroutine before calling this,
    e.g.

    @coroutine
    def foo(self):
        ...

    def bar(self):
        ...
        self._threaded(10, self.foo)  # <- do not call self.foo,
                                      #    just the reference

    @param count The number of threads to spawn
    @param target_func The reference to the target coroutine
    @returns The subnet mask
    """
    result = None
    messages = Queue()

    def default_target_run(index):
        target = target_func()
        while True:
            item = messages.get()
            if item is GeneratorExit:
                target.close()
                return
            else:
                target.send({'index': index, 'item': item})

    # ensure code is testable
    target_run = default_target_run
    try:
        target_run = self._threaded.target_run
    except AttributeError:
        pass

    result = ThreadPool(count).map_async(target_run, range(count))
    try:
        while True:
            item = (yield)
            messages.put(item)
    except GeneratorExit:
        # allow all threads to quit
        # by making sure all of them receives the exit message
        for i in xrange(count):
            messages.put(GeneratorExit)
        result.ready()
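To answer the "how do I verify it" part, here is a self-contained Python 3 sketch of the same idea (the names and the trivial coroutine decorator are mine): each worker thread builds its own copy of the target coroutine, and a recording sink lets you check after close() that every item was handled and that more than one thread did the handling.

import threading
from queue import Queue
from multiprocessing.pool import ThreadPool


def coroutine(func):
    # standard priming decorator: advance the generator to its first yield
    def start(*args, **kwargs):
        gen = func(*args, **kwargs)
        next(gen)
        return gen
    return start


seen = []                 # (thread name, item) pairs recorded by the workers
seen_lock = threading.Lock()


@coroutine
def recorder():
    # every worker thread gets its own private copy of this coroutine
    while True:
        payload = (yield)
        with seen_lock:
            seen.append((threading.current_thread().name, payload['item']))


@coroutine
def threaded(count, target_func):
    # standalone version of _threaded above: note target_func(), not target_func
    messages = Queue()

    def run_target(index):
        target = target_func()
        while True:
            item = messages.get()
            if item is GeneratorExit:
                target.close()
                return
            target.send({'index': index, 'item': item})

    result = ThreadPool(count).map_async(run_target, range(count))
    try:
        while True:
            item = (yield)
            messages.put(item)
    except GeneratorExit:
        for _ in range(count):
            messages.put(GeneratorExit)
        result.wait()  # block until every worker has drained the queue


fanout = threaded(4, recorder)
for i in range(100):
    fanout.send(i)
fanout.close()

print(len(seen))                        # 100: every item was handled exactly once
print(len({name for name, _ in seen}))  # usually > 1: several threads shared the work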