Here is some code. How do I change the current multi-process part so that it runs in a single process?
import multiprocessing

def process(self):
    if product != 0:
        if self.mlist.check(file_dictionary):
            self.mlist.patch(file_dictionary)
        process = multiprocessing.Process(target=self.mlist.job, args=(file_dictionary, targ))
        self.multiprocess_list.append(process)
        process.start()

def wait(self):
    process_list = self.multiprocess_list
    for i in process_list:
        i.join(2)
Next time, please provide a minimal reproducible example like this:
import multiprocessing

def doSomething(k: str, v: int):
    print(f"key={k}, value={v}")

if __name__ == "__main__":
    data = {"a": 1, "b": 2, "c": 3}
    processes = []
    for key, value in data.items():
        process = multiprocessing.Process(target=doSomething, args=(key, value))
        processes.append(process)
    for process in processes:
        process.start()
    for process in processes:
        process.join()
Now, replace the import multiprocessing line with:
class multiprocessing:  # Fake multiprocessing, not multiprocessing at all
    def __init__(self, target, args):
        self.target = target
        self.args = args

    @staticmethod
    def Process(target, args):
        return multiprocessing(target, args)

    def start(self):
        self.target(*self.args)

    def join(self, timeout=None):  # accept the same optional timeout as the real API
        pass
The idea is that you provide exactly the same interface (same names) but do no multiprocessing at all, so you can switch to and from real multiprocessing easily:
multiprocessing is no longer a module but a class
multiprocessing.Process is no longer the constructor of a process object, but a static method
start() just calls the target synchronously
join() does nothing, since the target has already run
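As a quick illustration (my sketch, not part of the original answer), here is the minimal example from above run against the fake class; it assumes the fake multiprocessing class is defined in the same file instead of importing the real module:

# Assumes the fake `multiprocessing` class above is in scope
# (no `import multiprocessing`), so every "process" runs synchronously.
def doSomething(k: str, v: int):
    print(f"key={k}, value={v}")

data = {"a": 1, "b": 2, "c": 3}
processes = []
for key, value in data.items():
    process = multiprocessing.Process(target=doSomething, args=(key, value))
    processes.append(process)
for process in processes:
    process.start()   # runs doSomething immediately, in this process
for process in processes:
    process.join()    # no-op; the work already happened in start()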
I'm trying to create multiple processes in which each process takes in a group of threads that it will start. I keep getting this error:
TypeError: cannot pickle '_thread.lock' object
Here is a basic example of what I am trying to achieve:
import time
import threading
import multiprocessing

def threading_func(i):
    print(f'Starting Function {i}')
    time.sleep(1)
    print(f'Ending Function {i}')

def process_func(threads):
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

thread_list = [threading.Thread(target=threading_func, args=(i,)) for i in range(1, 9)]
processes = [multiprocessing.Process(target=process_func, args=([thread],)) for thread in thread_list]
for process in processes:
    process.start()
for process in processes:
    process.join()
I am aware that the arguments passed into a Process instance must be serializable. The real question, then, is how can I make a Thread object serializable?
There is no simple way of getting a Thread instance to be serializable. An alternative is to pass to each process the arguments it needs to create the threads in its own address space. This can be done rather painlessly with a class called MyThread that behaves somewhat like the Thread class, although it is meant to be used with the target argument since it does not have a run method:
import time
import threading
import multiprocessing

class MyThread:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.thread = None

    def start(self):
        self.thread = threading.Thread(*self.args, **self.kwargs)
        self.thread.start()

    def join(self):
        self.thread.join()

def threading_func(i):
    print(f'Starting Function {i}')
    time.sleep(1)
    print(f'Ending Function {i}')

def process_func(threads):
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

# Required for Windows:
if __name__ == '__main__':
    thread_list = [MyThread(target=threading_func, args=(i,)) for i in range(1, 9)]
    processes = [multiprocessing.Process(target=process_func, args=([thread],)) for thread in thread_list]
    for process in processes:
        process.start()
    for process in processes:
        process.join()
A version using a more generic MyThread class
import time
import threading
import multiprocessing

class MyThread:
    def __init__(self, *args, **kwargs):
        self.args = args
        self.kwargs = kwargs
        self.thread = None

    def start(self):
        if 'target' in self.kwargs:
            self.thread = threading.Thread(*self.args, **self.kwargs)
        else:
            self.thread = threading.Thread(*self.args, **self.kwargs, target=self.run)
        self.thread.start()

    def join(self):
        self.thread.join()

    def run(self, *args, **kwargs):
        """
        This method would need to be overridden if this class is not initialized
        with the `target` keyword and you wanted to perform something useful
        """
        pass

class T(MyThread):
    def run(self, i):
        print(f'Starting Function {i}')
        time.sleep(1)
        print(f'Ending Function {i}')

def process_func(threads):
    for thread in threads:
        thread.start()
    for thread in threads:
        thread.join()

# Required for Windows:
if __name__ == '__main__':
    thread_list = [T(args=(i,)) for i in range(1, 9)]
    processes = [multiprocessing.Process(target=process_func, args=([thread],)) for thread in thread_list]
    for process in processes:
        process.start()
    for process in processes:
        process.join()
After encountering some probable memory leaks in a long-running multithreaded script, I found out about maxtasksperchild, which can be used in a multiprocessing Pool like this:
import multiprocessing

with multiprocessing.Pool(processes=32, maxtasksperchild=x) as pool:
    pool.imap(function, stuff)
Is something similar possible for the Threadpool (multiprocessing.pool.ThreadPool)?
As the answer by noxdafox says, there is no way to do this through the parent class, but you can use the threading module to control the maximum number of tasks per child. Since you want something like multiprocessing.pool.ThreadPool, and the threading module is similar, you can do it along these lines:
import threading

def split_processing(yourlist, num_splits=4):
    '''
    yourlist   = list you want to process across several threads.
    num_splits = number of threads to split the work across.
    '''
    split_size = len(yourlist) // num_splits
    threads = []
    for i in range(num_splits):
        start = i * split_size
        end = len(yourlist) if i + 1 == num_splits else (i + 1) * split_size
        # `function` is your worker; it gets the full list plus the slice
        # bounds this thread is responsible for.
        threads.append(threading.Thread(target=function, args=(yourlist, start, end)))
        threads[-1].start()

    # wait for all threads to finish
    for t in threads:
        t.join()
Let's say yourlist has 100 items. Then:
if num_splits = 10, there are 10 threads and each thread handles 10 tasks;
if num_splits = 5, there are 5 threads and each thread handles 20 tasks;
if num_splits = 50, there are 50 threads and each thread handles 2 tasks;
and so on.
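For completeness, function is not defined in the snippet above; a hypothetical worker matching the (yourlist, start, end) signature could look like this:

def function(yourlist, start, end):
    # Hypothetical worker: each thread handles only its assigned slice.
    for item in yourlist[start:end]:
        print(item)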
Looking at the multiprocessing.pool.ThreadPool implementation, it becomes evident that the maxtasksperchild parameter is not propagated to the parent multiprocessing.Pool class. The multiprocessing.pool.ThreadPool implementation has never been completed, hence it lacks a few features (as well as tests and documentation).
The pebble package implements a ThreadPool which supports restarting workers after a given number of tasks have been processed.
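If you want to stay within the standard library, one rough workaround (my sketch, not taken from either answer) is to throw the whole ThreadPool away after a fixed number of tasks and create a fresh one, which approximates what maxtasksperchild does per worker:

from itertools import islice
from multiprocessing.pool import ThreadPool

def imap_with_restarts(function, iterable, processes=32, tasks_per_pool=1000):
    # Process the input in chunks; each chunk gets a brand-new pool, so the
    # threads (and whatever state they accumulate) are discarded every
    # tasks_per_pool items.
    it = iter(iterable)
    while True:
        chunk = list(islice(it, tasks_per_pool))
        if not chunk:
            break
        with ThreadPool(processes=processes) as pool:
            for result in pool.imap(function, chunk):
                yield result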
I wanted a ThreadPool that will run a new task as soon as another task in the pool completes (i.e. maxtasksperchild=1). I decided to write a small "ThreadPool" class that creates a new thread for every task. As soon as a task in the pool completes, another thread is created for the next value in the iterable passed to the map method. The map method blocks until all values in the passed iterable have been processed and their threads have returned.
import threading

class ThreadPool():
    def __init__(self, processes=20):
        self.processes = processes
        self.threads = [Thread() for _ in range(0, processes)]

    def get_dead_threads(self):
        dead = []
        for thread in self.threads:
            if not thread.is_alive():
                dead.append(thread)
        return dead

    def is_thread_running(self):
        return len(self.get_dead_threads()) < self.processes

    def map(self, func, values):
        attempted_count = 0
        values_iter = iter(values)
        # loop until all values have been attempted to be processed and
        # all threads are finished running
        while (attempted_count < len(values) or self.is_thread_running()):
            for thread in self.get_dead_threads():
                try:
                    # run thread with the next value
                    value = next(values_iter)
                    attempted_count += 1
                    thread.run(func, value)
                except StopIteration:
                    break

    def __enter__(self):
        return self

    def __exit__(self, exc_type, exc_value, exc_tb):
        pass

class Thread():
    def __init__(self):
        self.thread = None

    def run(self, target, *args, **kwargs):
        self.thread = threading.Thread(target=target,
                                       args=args,
                                       kwargs=kwargs)
        self.thread.start()

    def is_alive(self):
        if self.thread:
            return self.thread.is_alive()
        else:
            return False
You can use it like this:
def run_job(value, mp_queue=None):
    # do something with value
    value += 1

with ThreadPool(processes=2) as pool:
    pool.map(run_job, [1, 2, 3, 4, 5])
I'm trying to inherit a subclass from multiprocessing.Process, which will have a queue for each instance, so that the queue can be used to catch the return value of the target.
But the problem is that multiprocessing.Process.start() uses its own Popen machinery (https://github.com/python/cpython/blob/master/Lib/multiprocessing/process.py) to create a process and run the target inside it. Is there a way to overload this without defining/overloading the entire Process class?
This is what I'm trying to do:
class Mprocessor(multiprocessing.Process):
    def __init__(self, **kwargs):
        multiprocessing.Process.__init__(self, **kwargs)
        self._ret = Queue.Queue()

    def run(self):
        self._ret.put(multiprocessing.Process.run(self))

    def getReturn(self):
        if self._ret.empty():
            return None
        return self._ret.get()
Here I try to create a queue inside the class.
I override the run method so that when it is executed, the return value(s) of the target are put into the queue.
I have a getReturn method which is called in the main function using the Mprocessor class. This method should only be called after Mprocessor.is_alive() (which is defined for multiprocessing.Process) returns False.
But this mechanism is not working: when I call Mprocessor.start(), it creates a child process which runs the target in its own address space.
I want to know if there's a way to use the queue inside the start method to get the return value, and avoid the target having to take a queue argument just to communicate.
I wanted to generalize this module.
I don't want my methods to be written so that they need a queue to hand back their return value.
I want a module that can be applied to any function, because I am planning to have a manager method which takes a dict["process_name/ID" : methods/targets] and a dict["process_name/ID" : [argument_list]], creates a process for each of these targets, and returns a dict["process_name/ID" : (return tuple,)].
Any ideas will be welcomed.
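To illustrate why the plain-queue approach above cannot work (my sketch, not part of the question): the child process gets its own copy of the queue object, so anything it puts there never reaches the parent, and with the spawn start method it fails even earlier because the queue's internal lock cannot be pickled.

import multiprocessing
import queue  # Queue in Python 2

class Demo(multiprocessing.Process):
    def __init__(self):
        multiprocessing.Process.__init__(self)
        self._ret = queue.Queue()   # ordinary, non-shared queue

    def run(self):
        self._ret.put(42)           # lands in the child's copy only

if __name__ == '__main__':
    p = Demo()
    p.start()
    p.join()
    # With the fork start method this prints True: the parent never sees the value.
    # With the spawn start method, start() fails with
    # "TypeError: cannot pickle '_thread.lock' object".
    print(p._ret.empty())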
EDIT
Manager function:
def Processor_call(func=None, func_args=None):
    if sorted(func.keys()) != sorted(func_args.keys()):
        print "Names in func dict and args dict don't match"
        return None
    process_list = multiprocessing.Queue()
    for i in func.keys():
        p = Mprocessor(name=i, target=func[i], args=tuple(func_args[i]))
        process_list.put(p)
        p.start()
    return_dict = {}
    while not process_list.empty():
        process_wait = process_list.get()
        if not process_wait.is_alive():
            process_wait.join()
            if process_wait.exitcode == 0:
                return_dict[process_wait.name] = process_wait.getReturn()
            else:
                print "Error in process %s, status not available" % process_wait.name
        else:
            # not finished yet, put it back and check again later
            process_list.put(process_wait)
    return return_dict
EDIT: The target function should look like this:

def sum(a, b):
    return a + b

I don't want to pass a queue into the function and return values through the queue.
I want to make a common module so that any existing method can use multiprocessing without any change to its definition, so the interface with other modules is maintained.
I don't want a function to be designed only to be run as a process; I want a common interface so that other modules can also use this function as a normal method, without bothering to read from a queue to get the return value.
Comment: ... so that I'll get the return value from the process started from start method
This will work for me, for instance:
class Mprocessor
class Mprocessor(multiprocessing.Process):
    def __init__(self, queue, **kwargs):
        multiprocessing.Process.__init__(self, **kwargs)
        self._ret = queue

    def run(self):
        return_value = self._target(*self._args)
        self._ret.put((self.name, return_value))
        time.sleep(0.25)
        exit(0)
Start processes and wait for return values
def Processor_call(func=None, func_args=None):
    print('func=%s, func_args=%s' % (func, func_args))
    ret_q = multiprocessing.Manager().Queue()
    process_list = []
    for i in func.keys():
        p = Mprocessor(name=i, target=func[i], args=(func_args[i],), queue=ret_q)
        p.start()
        process_list.append(p)
        time.sleep(0.1)

    print('Block __main__ until all processes terminated')
    for p in process_list:
        p.join()

    print('Aggregate all return values')
    return_dict = {}
    while not ret_q.empty():
        p_name, value = ret_q.get()
        return_dict[p_name] = value
    return return_dict
__main__
if __name__ == '__main__':
    rd = Processor_call({'f1': f1, 'f2': f1}, {'f1': 1, 'f2': 2})
    print('rd=%s' % rd)
Output:
func={'f1': <function f1 at 0x...>, 'f2': <function f1 at 0x...>}, func_args={'f1': 1, 'f2': 2}
pid:4501 start 2
pid:4501 running
pid:4500 start 1
pid:4500 running
Block __main__ until all processes terminated
pid:4501 running
pid:4500 running
pid:4501 running
pid:4500 running
pid:4501 Terminate
pid:4500 Terminate
Aggregate all return values
rd={'f1': 1, 'f2': 2}
Tested with Python 3.4.2 and 2.7.9
Question: Is it possible to inherit multiprocessing.Process to communicate with the main process?
Yes, it's possible. But not by using a class object, since your process uses its own copy of the class object.
You have to use a global Queue object and pass it to your process.
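A minimal sketch of that advice (names are illustrative, not from the answer): create the Queue in the parent, pass it to the process when you construct it, and read the result afterwards.

import multiprocessing

def worker(a, b, result_queue):
    # Put the "return value" on the queue instead of returning it.
    result_queue.put(a + b)

if __name__ == '__main__':
    q = multiprocessing.Queue()
    p = multiprocessing.Process(target=worker, args=(2, 3, q))
    p.start()
    result = q.get()   # blocks until the child has produced the value
    p.join()
    print(result)      # 5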
How can I get the following to work? The main point is that I want to run a method (and not a function) asynchronously.
from multiprocessing import Pool

class Async:
    def __init__(self, pool):
        self.pool = pool
        self.run()

    def run(self):
        p.apply_async(self.f, (10, ))

    def f(self, x):
        print x*x

if __name__ == '__main__':
    p = Pool(5)
    a = Async(p)
    p.close()
    p.join()
This prints nothing.
The problem appears to be due to the fact that multiprocessing needs to pickle self.f, and bound methods are not picklable (in Python 2). There is a discussion of how to solve the problem here.
apply_async apparently catches the exception and stores it inside the AsyncResult it returns; that's why nothing is printed. If get() is called on that result, the exception is raised.
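As a sketch of that diagnosis (the question's code slightly rearranged so the AsyncResult is kept), calling get() on the result re-raises whatever error occurred instead of letting it disappear:

from multiprocessing import Pool

class Async:
    def __init__(self, pool):
        self.pool = pool
        self.result = self.pool.apply_async(self.f, (10,))

    def f(self, x):
        print(x * x)

if __name__ == '__main__':
    p = Pool(5)
    a = Async(p)
    try:
        a.result.get(timeout=5)   # re-raises the pickling error hit while sending the task
    except Exception as exc:
        print('apply_async failed:', exc)
    p.close()
    p.join()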
It's definitely possible to thread class methods using a thread pool in Python 2; the following program did what I would expect.
#!/usr/bin/env python

from multiprocessing.pool import ThreadPool

class TestAsync():
    def __init__(self):
        pool = ThreadPool(processes=2)
        async_completions = []
        for a in range(2):
            async_completions.append(pool.apply_async(self.print_int, (a,)))
        for completion in async_completions:
            res = completion.get()
            print("res = %d" % res)

    def print_int(self, value):
        print(value)
        return (value * 10)

a = TestAsync()
Is there any way to have a pub/sub pattern using multiprocessing data structures? In other words, I would like to have something like a queue, except that the publisher can send a single command to multiple workers simultaneously.
You can create your own data structure to implement a simple pub/sub pattern using a wrapper around multiprocessing.Queue:
import os
import multiprocessing
from functools import wraps

def ensure_parent(func):
    @wraps(func)
    def inner(self, *args, **kwargs):
        if os.getpid() != self._creator_pid:
            raise RuntimeError("{} can only be called in the "
                               "parent.".format(func.__name__))
        return func(self, *args, **kwargs)
    return inner

class PublishQueue(object):
    def __init__(self):
        self._queues = []
        self._creator_pid = os.getpid()

    def __getstate__(self):
        self_dict = self.__dict__.copy()  # copy so pickling doesn't clear the parent's list
        self_dict['_queues'] = []
        return self_dict

    def __setstate__(self, state):
        self.__dict__.update(state)

    @ensure_parent
    def register(self):
        q = multiprocessing.Queue()
        self._queues.append(q)
        return q

    @ensure_parent
    def publish(self, val):
        for q in self._queues:
            q.put(val)

def worker(q):
    for item in iter(q.get, None):
        print("got item {} in process {}".format(item, os.getpid()))

if __name__ == "__main__":
    q = PublishQueue()
    processes = []
    for _ in range(3):
        p = multiprocessing.Process(target=worker, args=(q.register(),))
        p.start()
        processes.append(p)

    q.publish('1')
    q.publish(2)
    q.publish(None)  # Shut down workers
    for p in processes:
        p.join()
Output:
got item 1 in process 4383
got item 2 in process 4383
got item 1 in process 4381
got item 2 in process 4381
got item 1 in process 4382
got item 2 in process 4382
This pattern will work well as long as the parent process is the only one doing the publishing, you register a subscription queue for each worker in the parent, and you pass that subscription queue to the worker process through its multiprocessing.Process constructor. These limitations exist because a multiprocessing.Queue can only be shared with a child by handing it over at process-creation time; it cannot be pickled and sent later. If you need to pass a subscription queue to an already running worker, you'll need to tweak the implementation to use a multiprocessing.Manager().Queue() instead, as sketched below.
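As a rough sketch of that tweak (my code, not part of the original answer): a Manager-backed variant hands out queue proxies, which can be pickled and therefore delivered to workers that are already running, for example through another queue.

import multiprocessing

class ManagedPublishQueue(object):
    """Like PublishQueue above, but register() returns a manager queue proxy
    that can be pickled and sent to an already running worker."""
    def __init__(self):
        self._manager = multiprocessing.Manager()
        self._queues = []

    def register(self):
        q = self._manager.Queue()
        self._queues.append(q)
        return q

    def publish(self, val):
        for q in self._queues:
            q.put(val)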