I have a bit of code that gets the title of an MP3 file:
def getTitle(fileName):
    print "getTitle"
    audio = MP3(fileName)
    try:
        sTitle = str(audio["TIT2"])
    except KeyError:
        sTitle = os.path.basename(fileName)
    sTitle = replace_all(sTitle)  # remove special chars
    return sTitle
I would call this function with
sTitle = getTitle("SomeSong.mp3")
To solve another problem, I wanted to spawn this on its own thread, so I altered my call to
threadTitle = Thread(target=getTitle("SomeSong.mp3"))
threadTitle.start()
This correctly calls the function and solves my other problem, but now I can't figure out how to get the return value of sTitle from the function into Main.
I would make a new class that extends Thread, so that you can get anything you want out of it at any time.
from threading import Thread

class GetTitleThread(Thread):
    def __init__(self, fileName):
        self.sTitle = None
        self.fileName = fileName
        super(GetTitleThread, self).__init__()

    def run(self):
        print "getTitle"
        audio = MP3(self.fileName)
        try:
            self.sTitle = str(audio["TIT2"])
        except KeyError:
            self.sTitle = os.path.basename(self.fileName)
        self.sTitle = replace_all(self.sTitle)  # remove special chars

if __name__ == '__main__':
    t = GetTitleThread('SomeSong.mp3')
    t.start()
    t.join()
    print t.sTitle
One way to do it is to use a wrapper storing the result:
def wrapper(func, args, res):
    res.append(func(*args))

res = []
t = threading.Thread(
    target=wrapper, args=(getTitle, ("SomeSong.mp3",), res))
t.start()
t.join()
print res[0]
This decorator comfortably runs any function in a thread while taking care of its return value or exception:
def threading_func(f):
    """Decorator for running a function in a thread and handling its return
    value or exception"""
    def start(*args, **kw):
        def run():
            try:
                th.ret = f(*args, **kw)
            except:
                th.exc = sys.exc_info()
        def get(timeout=None):
            th.join(timeout)
            if th.exc:
                raise th.exc[0], th.exc[1], th.exc[2]  # py2
                ## raise th.exc[1].with_traceback(th.exc[2])  # py3
            return th.ret
        th = threading.Thread(None, run)
        th.exc = None
        th.get = get
        th.start()
        return th
    return start
Usage Examples
def f(x):
    return 2.5 * x

th = threading_func(f)(4)
print("still running?:", th.is_alive())
print("result:", th.get(timeout=1.0))

@threading_func
def th_mul(a, b):
    return a * b

th = th_mul("text", 2.5)
try:
    print(th.get())
except TypeError:
    print("exception thrown ok.")
Related
According to https://docs.python.org/3/library/multiprocessing.html,
multiprocessing forks (on *nix) to create a worker process that executes tasks. We can verify this by setting a global variable in a module prior to the fork.
If the worker function imports that module and finds the variable present, then the process memory has been copied. And so it is:
import os

def f(x):
    import sys
    return sys._mypid  # <<< value is returned by subprocess!

def set_state():
    import sys
    sys._mypid = os.getpid()

def g():
    from multiprocessing import Pool
    pool = Pool(4)
    try:
        for z in pool.imap(f, range(1000)):
            print(z)
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    set_state()
    g()
However, if things work this way, what business does multiprocessing have in serializing the work function, f?
In this example:
import os

def set_state():
    import sys
    sys._mypid = os.getpid()

def g():
    def f(x):
        import sys
        return sys._mypid

    from multiprocessing import Pool
    pool = Pool(4)
    try:
        for z in pool.imap(f, range(1000)):
            print(z)
    finally:
        pool.close()
        pool.join()

if __name__ == '__main__':
    set_state()
    g()
we get:
AttributeError: Can't pickle local object 'g.<locals>.f'
Stack Overflow and the internet are full of ways to work around this. (Python's standard pickle can handle module-level functions by reference, but not local functions or functions with closure data.)
But why do we end up here at all? A copy-on-write version of f is already in the forked process's memory. Why does it need to be serialized at all?
Derp -- it has to be this way because of the order of operations:
pool = Pool(4)                        # <<< worker processes created (forked) here
for z in pool.imap(f, range(1000)):   # <<< reference to f only passed here, after the fork
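To make the pickling constraint concrete, here is a small sketch (mine, not from the original post): pickle stores a module-level function by its qualified name, while a function defined inside another function cannot be looked up that way and fails.
import pickle

def top_level(x):
    return x * 2

def make_local():
    def local(x):
        return x * 2
    return local

# A module-level function pickles fine: only a reference like "__main__.top_level" is stored.
blob = pickle.dumps(top_level)
print(pickle.loads(blob)(21))  # 42

# A locally defined function cannot be resolved by name, so pickling it fails,
# which is exactly the AttributeError that multiprocessing reported above.
try:
    pickle.dumps(make_local())
except (pickle.PicklingError, AttributeError) as err:
    print("cannot pickle local function:", err)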
FYI: anyone wanting to fork so that the new process already has access to the function (and thereby avoids serializing the function) can follow this pattern:
import collections
import multiprocessing as mp
import os
import pickle
import threading

_STATUS_DATA = 0
_STATUS_ERR = 1
_STATUS_POISON = 2

Message = collections.namedtuple(
    "Message",
    ["status",
     "payload",
     "sequence_id"
     ]
)

def parallel_map(
        target,
        args,
        num_processes,
        inq_maxsize=None,
        outq_maxsize=None,
        serialize=pickle.dumps,
        deserialize=pickle.loads,
        start_method="fork",
        preserve_order=True,
):
    """
    :param target: Target function
    :param args: Iterable of single parameter arguments for target.
    :param num_processes: Number of processes.
    :param inq_maxsize:
    :param outq_maxsize:
    :param serialize:
    :param deserialize:
    :param start_method:
    :param preserve_order: If True, results are yielded in the order of args.
        Otherwise, results are yielded as they complete.
    :return:
    """
    if inq_maxsize is None: inq_maxsize = 10 * num_processes
    if outq_maxsize is None: outq_maxsize = 10 * num_processes
    inq = mp.Queue(maxsize=inq_maxsize)
    outq = mp.Queue(maxsize=outq_maxsize)
    poison = serialize(Message(_STATUS_POISON, None, -1))
    deserialize(poison)  # Test

    def work():
        while True:
            obj = inq.get()
            # print("{} - GET .. OK".format(os.getpid()))
            # inq.task_done()
            try:
                msg = deserialize(obj)
                assert isinstance(msg, Message)
                if msg.status == _STATUS_POISON:
                    outq.put(serialize(Message(_STATUS_POISON, None, msg.sequence_id)))
                    # print("{} - RETURN POISON .. OK".format(os.getpid()))
                    return
                else:
                    args, kw = msg.payload
                    result = target(*args, **kw)
                    outq.put(serialize(Message(_STATUS_DATA, result, msg.sequence_id)))
            except Exception as e:
                try:
                    outq.put(serialize(Message(_STATUS_ERR, e, msg.sequence_id)))
                except Exception as e2:
                    try:
                        outq.put(serialize(Message(_STATUS_ERR, None, -1)))
                        # outq.put(serialize(1, Exception("Unable to serialize response")))
                        # TODO. Log exception
                    except Exception as e3:
                        pass

    if start_method == "thread":
        _start_method = threading.Thread
    else:
        _start_method = mp.get_context('fork').Process

    processes = [
        _start_method(
            target=work,
            name="parallel_map.work"
        )
        for _ in range(num_processes)]

    for p in processes:
        p.start()

    quitting = []

    def quit_processes():
        if not quitting:
            quitting.append(1)
            # Send poison pills - kill child processes
            for _ in range(num_processes):
                inq.put(poison)

    nsent = [0]

    def send():
        # Send the data
        for seq_id, arg in enumerate(args):
            obj = ((arg,), {})
            inq.put(serialize(Message(_STATUS_DATA, obj, seq_id)))
            nsent[0] += 1
        quit_processes()

    # Publish
    sender = threading.Thread(
        target=send,
        name="parallel_map.sender",
        daemon=True)
    sender.start()

    try:
        # Consume
        nquit = [0]
        buffer = {}
        nyielded = 0
        while True:
            result = outq.get()  # Waiting here
            # outq.task_done()
            msg = deserialize(result)
            assert isinstance(msg, Message)
            if msg.status == _STATUS_POISON:
                nquit[0] += 1
                # print(">>> QUIT ACK {}".format(nquit[0]))
                if nquit[0] >= num_processes:
                    break
            else:
                assert msg.sequence_id >= 0
                if preserve_order:
                    buffer[msg.sequence_id] = msg
                    while True:
                        if nyielded not in buffer:
                            break
                        msg = buffer.pop(nyielded)
                        nyielded += 1
                        if msg.status == _STATUS_ERR:
                            if isinstance(msg.payload, Exception):
                                raise msg.payload
                            else:
                                raise Exception("Unexpected exception")
                        else:
                            assert msg.status == _STATUS_DATA
                            yield msg.payload
                else:
                    if msg.status == _STATUS_ERR:
                        if isinstance(msg.payload, Exception):
                            raise msg.payload
                        else:
                            raise Exception("Unexpected exception")
                    else:
                        assert msg.status == _STATUS_DATA
                        yield msg.payload
            # if nyielded == nsent:
            #     break
    except Exception as e:
        raise
    finally:
        if not quitting:
            quit_processes()
        sender.join()
        for p in processes:
            p.join()
Usage:
import time

def f(x):
    time.sleep(0.01)
    if x == -1:
        raise Exception("Boo")
    return x

for result in parallel_map(target=f,  # <<< not serialized
                           args=range(100),
                           num_processes=8,
                           start_method="fork"):
    pass
... with that caveat: for every thread you have in your program when you fork, a puppy dies. (Only the thread that calls fork survives in the child process, so any lock held by another thread at fork time stays locked forever.)
I'm new to Python and trying to remove/trim the gevent stack trace output when an exception is raised. I read somewhere that I can do this with AsyncResult, but I can't figure out how to use it.
Here is an example I started with and iterated on to make it similar to the real code I'm troubleshooting, but I got stuck in the last phase when I tried to add my_decor to work().
Any help fixing this is much appreciated.
from gevent.event import AsyncResult
import gevent
from functools import wraps

def my_decor(k, *args, **kwargs):
    @wraps(k)
    def wrapper(*args, **kwargs):
        r = AsyncResult()
        try:
            value = k()
        except Exception as e:
            r.set_exception(e)
        else:
            r.set(value)
        return r.exception or r.value
    result = gevent.spawn(wrapper, k)
    return result

def f():
    def foo():
        if True:
            raise Exception('tttttttt')
    return foo

def p():
    def bar():
        if True:
            raise Exception('ppppppppppppp')
    return bar

@my_decor
def work():
    foo1 = gevent.spawn(f())
    bar1 = gevent.spawn(p())
    gevent.joinall([foo1, bar1])
    return foo1.get() or bar1.get()
Found the answer; figured it might help those with the same problem.
from gevent.event import AsyncResult
import gevent
from functools import wraps

def my_decor(k):
    @wraps(k)
    def wrapper(*args, **kwargs):
        r = AsyncResult()
        try:
            value = k(*args, **kwargs)
        except Exception as e:
            r.set_exception(e)
        else:
            r.set(value)
        return r.exception or r.value
    return wrapper

def f(msg):
    @my_decor
    def foo():
        if True:
            raise Exception('tttttttt %s' % msg)
        # print('test')
    return foo

def p(msg):
    @my_decor
    def bar():
        if True:
            raise Exception('ppppppppppppp %s', msg)
    return bar

def work():
    test = "test"
    seti = "set"
    foo1 = gevent.spawn(f(test))  # returns a function that the coroutine uses
    bar1 = gevent.spawn(p(seti))
    gevent.joinall([foo1, bar1])
    return foo1.get() or bar1.get()

res = work()
print res
I'm trying to find a way to start a new Process and get its output if it takes less than X seconds. If the process takes more time, I would like to ignore the result, kill the Process, and carry on.
I basically need to add a timer to the code below. Not sure if there's a better way to do it; I'm open to a different and better solution.
from multiprocessing import Process, Queue

def f(q):
    # Ugly work
    q.put(['hello', 'world'])

if __name__ == '__main__':
    q = Queue()
    p = Process(target=f, args=(q,))
    p.start()
    print q.get()
    p.join()
Thanks!
You may find the following module useful in your case:
Module
#! /usr/bin/env python3
"""Allow functions to be wrapped in a timeout API.

Since code can take a long time to run and may need to terminate before
finishing, this module provides a set_timeout decorator to wrap functions."""
__author__ = 'Stephen "Zero" Chappell ' \
             '<stephen.paul.chappell@atlantis-zero.net>'
__date__ = '18 December 2017'
__version__ = 1, 0, 1
__all__ = [
    'set_timeout',
    'run_with_timeout'
]

import multiprocessing
import sys
import time

DEFAULT_TIMEOUT = 60


def set_timeout(limit=None):
    """Return a wrapper that provides a timeout API for callers."""
    if limit is None:
        limit = DEFAULT_TIMEOUT
    _Timeout.validate_limit(limit)

    def wrapper(entry_point):
        return _Timeout(entry_point, limit)

    return wrapper


def run_with_timeout(limit, polling_interval, entry_point, *args, **kwargs):
    """Execute a callable object and automatically poll for results."""
    engine = set_timeout(limit)(entry_point)
    engine(*args, **kwargs)
    while engine.ready is False:
        time.sleep(polling_interval)
    return engine.value


def _target(queue, entry_point, *args, **kwargs):
    """Help with multiprocessing calls by being a top-level module function."""
    # noinspection PyPep8,PyBroadException
    try:
        queue.put((True, entry_point(*args, **kwargs)))
    except:
        queue.put((False, sys.exc_info()[1]))


class _Timeout:
    """_Timeout(entry_point, limit) -> _Timeout instance"""

    def __init__(self, entry_point, limit):
        """Initialize the _Timeout instance with all needed attributes."""
        self.__entry_point = entry_point
        self.__limit = limit
        self.__queue = multiprocessing.Queue()
        self.__process = multiprocessing.Process()
        self.__timeout = time.monotonic()

    def __call__(self, *args, **kwargs):
        """Begin execution of the entry point in a separate process."""
        self.cancel()
        self.__queue = multiprocessing.Queue(1)
        self.__process = multiprocessing.Process(
            target=_target,
            args=(self.__queue, self.__entry_point) + args,
            kwargs=kwargs
        )
        self.__process.daemon = True
        self.__process.start()
        self.__timeout = time.monotonic() + self.__limit

    def cancel(self):
        """Terminate execution if possible."""
        if self.__process.is_alive():
            self.__process.terminate()

    @property
    def ready(self):
        """Property letting callers know if a returned value is available."""
        if self.__queue.full():
            return True
        elif not self.__queue.empty():
            return True
        elif self.__timeout < time.monotonic():
            self.cancel()
        else:
            return False

    @property
    def value(self):
        """Property that retrieves a returned value if available."""
        if self.ready is True:
            valid, value = self.__queue.get()
            if valid:
                return value
            raise value
        raise TimeoutError('execution timed out before terminating')

    @property
    def limit(self):
        """Property controlling what the timeout period is in seconds."""
        return self.__limit

    @limit.setter
    def limit(self, value):
        self.validate_limit(value)
        self.__limit = value

    @staticmethod
    def validate_limit(value):
        """Verify that the limit's value is not too low."""
        if value <= 0:
            raise ValueError('limit must be greater than zero')
The following example demonstrates its usage:
Example
from time import sleep


def main():
    timeout_after_four_seconds = set_timeout(4)
    # create copies of a function that have a timeout
    a = timeout_after_four_seconds(do_something)
    b = timeout_after_four_seconds(do_something)
    c = timeout_after_four_seconds(do_something)
    # execute the functions in separate processes
    a('Hello', 1)
    b('World', 5)
    c('Jacob', 3)
    # poll the functions to find out what they returned
    results = [a, b, c]
    polling = set(results)
    while polling:
        for process, name in zip(results, 'abc'):
            if process in polling:
                ready = process.ready
                if ready is True:  # if the function returned
                    print(name, 'returned', process.value)
                    polling.remove(process)
                elif ready is None:  # if the function took too long
                    print(name, 'reached timeout')
                    polling.remove(process)
                else:  # if the function is running
                    assert ready is False, 'ready must be True, False, or None'
        sleep(0.1)
    print('Done.')


def do_something(data, work):
    sleep(work)
    print(data)
    return work


if __name__ == '__main__':
    main()
Does the process you are running involve a loop?
If so, you can record a timestamp before starting the loop and include an if statement within the loop that calls sys.exit() to terminate the script once the current timestamp differs from the recorded start timestamp by more than x seconds.
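A minimal sketch of that idea (mine; the loop body and the limit are placeholders):
import sys
import time

TIME_LIMIT = 10  # "x seconds", placeholder value
start = time.time()

while True:
    # ... do one unit of the real work here ...
    if time.time() - start > TIME_LIMIT:
        sys.exit("time limit exceeded")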
All you need to adapt the queue example from the docs to your case is to pass the timeout to the q.get() call and terminate the process on timeout:
from Queue import Empty

...
try:
    print q.get(timeout=timeout)
except Empty:  # no value, timeout occurred
    p.terminate()
    q = None  # the queue might be corrupted after the `terminate()` call
p.join()
Using a Pipe might be more lightweight; otherwise the code is the same (you could use .poll(timeout) to find out whether there is data to receive).
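Here is a rough sketch of the Pipe variant (mine, not from the original answer), with the question's f rewritten to send on a connection instead of putting on a Queue:
from multiprocessing import Process, Pipe

def f(conn):
    # Ugly work
    conn.send(['hello', 'world'])
    conn.close()

if __name__ == '__main__':
    timeout = 5  # seconds, placeholder value
    parent_conn, child_conn = Pipe()
    p = Process(target=f, args=(child_conn,))
    p.start()
    if parent_conn.poll(timeout):  # wait up to `timeout` seconds for data
        print(parent_conn.recv())
    else:                          # nothing arrived in time: give up and kill the worker
        p.terminate()
    p.join()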
I am trying my hand at Python multiprocessing. I want a couple of processes that are independent of each other to run in parallel and, as they return, to check whether each was successful using the successful() method of the AsyncResult returned by apply_async(). However, when I call successful() in the callback of my subprocess, the script hangs.
import multiprocessing as mp
import time

result_map = {}

def foo_pool(x):
    time.sleep(2)
    print x
    return x

result_list = []

def log_result(result):
    print result_map[result].successful()  # hangs
    result_list.append(result)

def apply_async_with_callback():
    pool = mp.Pool()
    for i in range(10):
        result_map[i] = pool.apply_async(foo_pool, args=(i,), callback=log_result)
    pool.close()
    pool.join()
    print(result_list)

if __name__ == '__main__':
    apply_async_with_callback()
You don't need to check successful() because the callback is only called when the result was successful.
Here is the relevant code from multiprocessing/pool.py (AsyncResult):
def _set(self, i, obj):
    self._success, self._value = obj
    if self._callback and self._success:  # <-----
        self._callback(self._value)       # <-----
    self._cond.acquire()
    try:
        self._ready = True
        self._cond.notify()
    finally:
        self._cond.release()
    del self._cache[self._job]
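For illustration, a minimal sketch (mine) of the question's code with the successful() check taken out of the callback; after pool.join() the stored AsyncResult objects are ready, so successful() is safe to call there if you still want it:
import multiprocessing as mp
import time

result_list = []

def foo_pool(x):
    time.sleep(2)
    return x

def log_result(result):
    # only reached for successful tasks, so no successful() check is needed here
    result_list.append(result)

def apply_async_with_callback():
    pool = mp.Pool()
    async_results = [pool.apply_async(foo_pool, args=(i,), callback=log_result)
                     for i in range(10)]
    pool.close()
    pool.join()
    print([r.successful() for r in async_results])  # all ready after join()
    print(result_list)

if __name__ == '__main__':
    apply_async_with_callback()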
I have a small script that polls a database to check the status of certain jobs. I decided to use APScheduler to handle the looping call, and I created a decorator to time out a function if it takes too long. The issue I am having is that the decorator is applied to methods inside a class, and even though I create two instances of the class inside two different functions, they always have the same start_time. I thought that if I moved the decorator inside my class and initialized the start_time in the __init__ call, it would update the start_time per instance of the class. But when I moved the decorator inside the class and assigned self.start_time = datetime.now(), the start time updates on each call of the class and thus never times out. The example of the decorator inside the class is also below.
def timeout(start, min_to_wait):
    def decorator(func):
        def _handle_timeout():
            scheduler.shutdown(wait=False)

        @wraps(func)
        def wrapper(*args, **kwargs):
            expire = start + timedelta(minutes=min_to_wait)
            now = datetime.now()
            if now > expire:
                _handle_timeout()
            return func(*args, **kwargs)
        return wrapper
    return decorator
class Job(object):
    def __init__(self, name, run_id, results):
        self.name = name
        self.run_id = object_id
        self.results = results
        self.parcel_id = None
        self.status = None

    start_time = datetime.now()

    @timeout(start_time, config.WAIT_TIME)
    def wait_for_results(self):
        if self.results:
            self.pack_id = self.results[0].get('parcel_id')
            self.status = self.results[0].get('status')
            return self.results[0]
        else:
            return False

    @timeout(start_time, config.WORK_TIME)
    def is_done(self):
        status = self.results[0].get('status')
        status_map = {'done': True,
                      'failed': FailedError,
                      'lost': LostError}

        def _get_or_throw(s, map_obj):
            value = map_obj.get(s)
            if s in ['failed', 'lost']:
                raise value(s)
            else:
                self.status = s
                return s
        return _get_or_throw(status, status_map)


def job_1(mssql, postgres, runid):
    res = get_results(mssql, config.MSSQL, first_query_to_call)
    first_job = Job('first_job', runid, res)
    step_two = pack_job.wait_for_results()
    if step_two:
        try:
            logger.info(first_job)
            if first_job.is_done() == 'done':
                scheduler.remove_job('first_job')
                scheduler.add_job(lambda: job_two(mssql,
                    postgres, first_job.object_id, runid),
                    'interval', seconds=config.POLL_RATE, id='second_job')
        except LostError as e:
            logger.error(e, exc_info=True)
            scheduler.shutdown(wait=False)
        except FailedError as e:
            logger.error(e, exc_info=True)
            scheduler.shutdown(wait=False)


def job_two(mssql, postgres, object_id, runid):
    res = get_results(mssql, config.MSSQL, some_other_query_to_run, object_id)
    second_job = Job('second_job', runid, res)
    step_two = second_job.wait_for_results()
    if step_two:
        try:
            logger.info(second_job)
            if second_job.is_done() == 'done':
                scheduler.remove_job('second_job')
        except LostError as e:
            logger.error(e, exc_info=True)
            scheduler.shutdown(wait=False)
        except FailedError as e:
            logger.error(e, exc_info=True)
            scheduler.shutdown(wait=False)


if __name__ == '__main__':
    runid = sys.argv[1:]
    if runid:
        runid = runid[0]
    scheduler = BlockingScheduler()
    run_job = scheduler.add_job(lambda: job_one(pymssql, psycopg2, runid), 'interval', seconds=config.POLL_RATE, id='first_job')
Attempt to move the decorator inside the class:
class Job(object):
    def __init__(self, name, run_id, results):
        self.name = name
        self.run_id = run_id
        self.results = results
        self.pack_id = None
        self.status = None
        self.start_time = datetime.now()

    def timeout(min_to_wait):
        def decorator(func):
            def _handle_timeout():
                scheduler.shutdown(wait=False)

            @wraps(func)
            def wrapper(self, *args, **kwargs):
                print '**'
                print self.start_time
                print ''
                expire = self.start_time + timedelta(minutes=min_to_wait)
                now = datetime.now()
                if now > expire:
                    _handle_timeout()
                return func(self, *args, **kwargs)
            return wrapper
        return decorator
Here is example output from when I use the above decorator:
**
self start time: 2014-10-28 08:57:11.947026
**
self start time: 2014-10-28 08:57:16.976828
**
self start time: 2014-10-28 08:57:21.989064
The start_time needs to stay the same, or else I can't time out the function.
In the first example, your start time is initialised when the class statement is executed, which in your case is when the module is first imported into the interpreter.
In the second example, the start time is initialized when the class is instantiated. It should not change from one method call to another for the same Job instance. Of course, if you keep creating new instances, the start time will be different for each instance.
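A tiny sketch (mine, not from the answer) of that difference: a class-body attribute is evaluated once, when the class statement runs, while an attribute assigned in __init__ is evaluated again for every new instance.
from datetime import datetime
import time

class ClassLevel(object):
    start_time = datetime.now()           # evaluated once, when the class statement runs

class InstanceLevel(object):
    def __init__(self):
        self.start_time = datetime.now()  # evaluated again for every new instance

a = ClassLevel()
time.sleep(1)
b = ClassLevel()
print(a.start_time == b.start_time)  # True: both instances share the class attribute

c = InstanceLevel()
time.sleep(1)
d = InstanceLevel()
print(c.start_time == d.start_time)  # False: each instance got its own timestamp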
Now, you didn't post the code that uses your Job class, so it's hard to tell what the right solution would be.