multiprocessing - Pool.imap is consuming my iterator - python

I have an extremely huge iterator returning massive amounts of data (file contents). Consuming the iterator hence effectively eats up all my RAM in seconds. Generally, pythons multiprocessing.Pool().imap(...) claims to iterate lazily. That means that it gets a value from the iterator, passes it on to the worker and then waits for the worker to finish. This would be exactly what I want.
However, for some reason it keeps on retrieving values from the iterator, even if the maximum number of workers are already running. This is my code:
class FileNameIterator(object): # Support class for TupleIterator
def __init__(self,path):
self.scanner = scandir.scandir(path)
def __iter__(self):
return self
def next(self):
while True:
direntry = self.scanner.next()
if os.path.isfile(direntry.path):
return direntry.name
class ContentIterator(object): # Support class for TupleIterator
def __init__(self,filenameiter,path):
self.filenameiter = filenameiter
self.path = path
def __iter__(self):
return self
def next(self):
print "<iter> reading content." # TODO: remove
with open(self.path + "\\" + self.filenameiter.next()) as f:
r = f.read()
f.close()
return r
class TupleIterator(object): # Basically izip with working __len__
def __init__(self,path):
self.fnameiter = FileNameIterator(path)
self.cntiter = ContentIterator(FileNameIterator(path),path)
self.path = path
def __iter__(self):
return self
def next(self):
return self.fnameiter.next(), self.cntiter.next()
def __len__(self):
return len([name for name in os.listdir(self.path) if os.path.isfile(os.path.join(self.path, name))])
pool = ThreadPool(12) # Has same API as Pool(), but allows memory sharing
itr = TupleIterator(_path) # Basically izip with working __len__
with open(_datafile, 'w') as datafile: # Workers write results to a file
pool.imap_unordered(worker, itr,len(itr)/12) # This does not work as expected
pool.close()
pool.join()
datafile.close()
I have the workers printing a message when they start and finish, and the iterator printing when it reads a file. This shows that the iterator continuously reads files way faster than the workers can process them.
How do I fix this? Is the imap(..) function working as it should and I'm just misunderstanding how it should be working?

Related

Python Multiprocessing Pool as Decorator

I'm working on code where I frequently have to use python's multiprocessing Pool class. This results in a ton of code that looks like this:
import time
from multiprocessing import Pool
from functools import partial
def test_func(x):
time.sleep(1)
return x
def test_func_parallel(iterable, processes):
p = Pool(processes=processes)
output = p.map(test_func, iterable)
p.close()
return output
This can be made more general:
def parallel(func, iterable, **kwargs):
func = partial(func, **kwargs)
p = Pool(processes=6)
out = p.map(func, iterable)
p.close()
return out
This works, but adding a parallel wrapper to every other function complicates the code. What I'd really like is to get this working as a decorator. Something like this:
def parallel(num_processes):
def parallel_decorator(func, num_processes=num_processes):
def parallel_wrapper(iterable, **kwargs):
func = partial(func, **kwargs)
p = Pool(processes=num_processes)
output = p.map(func, iterable)
p.close()
return output
return parallel_wrapper
return parallel_decorator
Which could be used as follows:
#parallel(6)
def test_func(x):
time.sleep(1)
return x
This fails for pickle reasons
Can't pickle <function test1 at 0x117473268>: it's not the same object as __main__.test1
I've read a few posts on related issues, but they all implement a solution where the multiprocessing is executed outside the decorator. Does anyone know a way to make this work?
If you don't mind to not use the syntactic sugar for decorators (# symbol), something like this should work:
import functools
import time
from multiprocessing import Pool
def parallel(func=None, **options):
if func is None:
return functools.partial(parallel, **options)
def wrapper(iterable, **kwargs):
processes = options["processes"]
with Pool(processes) as pool:
result = pool.map(func, iterable)
return result
return wrapper
def test(i):
time.sleep(1)
print(f"{i}: {i * i}")
test_parallel = parallel(test, processes=6)
def main():
test_parallel(range(10))
if __name__ == "__main__":
main()
I have the same problem. It revolves around how Pool() objects are implemented. So, it is going to work fine with a normal wrapper but not with a Decorator. The workaround is to define your own Pool()-like implementation using Process().
This can be very tricky to optimize but if you are a Decorator enthusiast here is a (dirty) example:
# something to do
args = range(10)
def parallel(function):
""" An alternative implementation to
multiprocessing.Pool().map() using
multiprocessing.Process(). """
def interfacer(args):
""" The wrapper function. """
# required libraries
from multiprocessing import (Queue, Process)
from os import cpu_count
# process control
## maximum number of processes required
max_processes = len(args)
## maximum numer of processes running
max_threads = cpu_count() - 1
""" Since there is no Pool() around
we need to take care of the processes
ourselves. If there is nothing for a
processes to do, it is going to await
for an input, if there are too many of
them, the processor shall suffer. """
# communications
## things to do
inbasket = Queue()
## things done
outbasket = Queue()
""" I am thinking asynchronouly,
there is probably a better way of
doing this. """
# populate inputs
for each in args:
## put arguments into the basket
inbasket.put(each)
def doer():
""" Feeds the targeted/decorated
'function' with data from the baskets and
collets the results.
This blind function helps the
implementation to generalize over any
iterable. """
outbasket.put(function(inbasket.get()))
return(True)
def run(processes = max_threads):
""" Create a certain number of
Process()s and runs each one.
There is room for improvements here. """
# the process pool
factory = list()
# populate the process pool
for each in range(processes):
factory.append(Process(target = doer))
# execute in process pool
for each in factory:
each.start()
each.join()
each.close()
return(True)
""" Now we need to manage the processes,
and prevent them for overwhelm the CPU.
That is the tricky part that Pool() does
so well. """
while max_processes:
# as long as there is something to do
if (max_processes - max_threads) >= 0:
run(max_threads)
max_processes -= max_threads
else:
# play it safe
run(1)
max_processes -= 1
# undo the queue and give me back the list of 'dones'
return([outbasket.get() for each in range(outbasket.qsize())])
return(interfacer)
#parallel
def test(x):
return(x**2)
print(test(args))
Probably this code is inefficient, but gives an idea.

How can you feed an iterable to multiple consumers in constant space?

How can you feed an iterable to multiple consumers in constant space?
TLDR
Write an implementation which passes the following test in CONSTANT SPACE, while
treating min, max and sum as black boxes.
def testit(implementation, N):
assert implementation(range(N), min, max, sum) == (0, N-1, N*(N-1)//2)
Discussion
We love iterators because they let us process streams of data lazily,
allowing the treatment of huge amounts of data in CONSTANT SPACE.
def source_summary(source, summary):
return summary(source)
N = 10 ** 8
print(source_summary(range(N), min))
print(source_summary(range(N), max))
print(source_summary(range(N), sum))
Each line took a few seconds to execute, but used very little memory. However,
It did require 3 separate traversals of the source. So this will not work if
your source is a network connection, data acquisition hardware, etc. unless you cache all the data somewhere, losing the CONSTANT SPACE requirement.
Here's a version which demonstrates this problem
def source_summaries(source, *summaries):
from itertools import tee
return tuple(map(source_summary, tee(source, len(summaries)),
summaries))
testit(source_summaries, N)
print('OK')
The test passes, but tee had to keep a copy of all the data, so the space usage goes up from O(1) to O(N).
How can you obtain the results in a single traversal with constant memory?
It is, of course, possible to pass the test given at the top, with O(1) space usage, by cheating:
using knowledge of the specific iterator-consumers that the test uses. But
that is not the point: source_summaries should work with any iterator
consumables such as set, collections.Counter, ''.join, including any
and all that may be written in the future. The implementation must treat them
as black boxes.
To be clear: the only knowledge available about the consumers is that each one consumes one iterable and returns one result. Using any other knowledge about the consumer is cheating.
Ideas
[EDIT: I have posted an implementation of this idea as an answer]
I can imagine a solution (which I really don't like) that uses
preemptive threading
a custom iterator linking the consumer to the source
Let's call the custom iterator link.
For each consumer, run
result = consumer(<link instance for this thread>)
<link instance for this thread>.set_result(result)
on a separate thread.
On the main thread, something along the lines of
for item in source:
for l in links:
l.push(item)
for l in links:
l.stop()
for thread in threads:
thread.join()
return tuple(link.get_result, links)
link.__next__ blocks until the link instance receives
.push(item) in which case it returns the item
.stop() in which case it raises StopIteration
The data races look like a nightmare. You'd need a queue for the pushes, and probably a sentinel object would need to be placed in the queue by link.stop() ... and a bunch of other things I'm overlooking.
I would prefer to use cooperative threading, but consumer(link) seems to be
unavoidably un-cooperative.
Do you have any less messy suggestions?
Here is an alternative implementation of your idea. It uses cooperative multi-threading. As you suggested, the key point is to use multi-threading and having the iterators __next__ method block until all threads have consumed the current iterate.
In addition, the iterator contains an (optional) buffer of constant size. With this buffer we can read the source in chunks and avoid a lot of the locking/synchronization.
My implementation also handles the case in which some consumers stop iterating before reaching the end of the iterator.
import threading
class BufferedMultiIter:
def __init__(self, source, n, bufsize = 1):
'''`source` is an iterator or iterable,
`n` is the number of threads that will interact with this iterator,
`bufsize` is the size of the internal buffer. The iterator will read
and buffer elements from `source` in chunks of `bufsize`. The bigger
the buffer is, the better the performance but also the bigger the
(constant) space requirement.
'''
self._source = iter(source)
self._n = n
# Condition variable for synchronization
self._cond = threading.Condition()
# Buffered values
bufsize = max(bufsize, 1)
self._buffer = [None] * bufsize
self._buffered = 0
self._next = threading.local()
# State variables to implement the "wait for buffer to get refilled"
# protocol
self._serial = 0
self._waiting = 0
# True if we reached the end of the source
self._stop = False
# Was the thread killed (for error handling)?
self._killed = False
def _fill_buffer(self):
'''Refill the internal buffer.'''
self._buffered = 0
while self._buffered < len(self._buffer):
try:
self._buffer[self._buffered] = next(self._source)
self._buffered += 1
except StopIteration:
self._stop = True
break
# Explicitly clear the unused part of the buffer to release
# references as early as possible
for i in range(self._buffered, len(self._buffer)):
self._buffer[i] = None
self._waiting = 0
self._serial += 1
def register_thread(self):
'''Register a thread.
Each thread that wants to access this iterator must first register
with the iterator. It is an error to register the same thread more
than once. It is an error to access this iterator with a thread that
was not registered (with the exception of calling `kill`). It is an
error to register more threads than the number that was passed to the
constructor.
'''
self._next.i = 0
def unregister_thread(self):
'''Unregister a thread from this iterator.
This should be called when a thread is done using the iterator.
It catches the case in which a consumer does not consume all the
elements from the iterator but exits early.
'''
assert hasattr(self._next, 'i')
delattr(self._next, 'i')
with self._cond:
assert self._n > 0
self._n -= 1
if self._waiting == self._n:
self._fill_buffer()
self._cond.notify_all()
def kill(self):
'''Forcibly kill this iterator.
This will wake up all threads currently blocked in `__next__` and
will have them raise a `StopIteration`.
This function should be called in case of error to terminate all
threads as fast as possible.
'''
self._cond.acquire()
self._killed = True
self._stop = True
self._cond.notify_all()
self._cond.release()
def __iter__(self): return self
def __next__(self):
if self._next.i == self._buffered:
# We read everything from the buffer.
# Wait until all other threads have also consumed the buffer
# completely and then refill it.
with self._cond:
old = self._serial
self._waiting += 1
if self._waiting == self._n:
self._fill_buffer()
self._cond.notify_all()
else:
# Wait until the serial number changes. A change in
# serial number indicates that another thread has filled
# the buffer
while self._serial == old and not self._killed:
self._cond.wait()
# Start at beginning of newly filled buffer
self._next.i = 0
if self._killed:
raise StopIteration
k = self._next.i
if k == self._buffered and self._stop:
raise StopIteration
value = self._buffer[k]
self._next.i = k + 1
return value
class NotAll:
'''A consumer that does not consume all the elements from the source.'''
def __init__(self, limit):
self._limit = limit
self._consumed = 0
def __call__(self, it):
last = None
for k in it:
last = k
self._consumed += 1
if self._consumed >= self._limit:
break
return last
def multi_iter(iterable, *consumers, **kwargs):
'''Iterate using multiple consumers.
Each value in `iterable` is presented to each of the `consumers`.
The function returns a tuple with the results of all `consumers`.
There is an optional `bufsize` argument. This controls the internal
buffer size. The bigger the buffer, the better the performance, but also
the bigger the (constant) space requirement of the operation.
NOTE: This will spawn a new thread for each consumer! The iteration is
multi-threaded and happens in parallel for each element.
'''
n = len(consumers)
it = BufferedMultiIter(iterable, n, kwargs.get('bufsize', 1))
threads = list() # List with **running** threads
result = [None] * n
def thread_func(i, c):
it.register_thread()
result[i] = c(it)
it.unregister_thread()
try:
for c in consumers:
t = threading.Thread(target = thread_func, args = (len(threads), c))
t.start()
threads.append(t)
except:
# Here we should forcibly kill all the threads but there is not
# t.kill() function or similar. So the best we can do is stop the
# iterator
it.kill()
finally:
while len(threads) > 0:
t = threads.pop(-1)
t.join()
return tuple(result)
from time import time
N = 10 ** 7
notall1 = NotAll(1)
notall1000 = NotAll(1000)
start1 = time()
res1 = (min(range(N)), max(range(N)), sum(range(N)), NotAll(1)(range(N)),
NotAll(1000)(range(N)))
stop1 = time()
print('5 iterators: %s %.2f' % (str(res1), stop1 - start1))
for p in range(5):
start2 = time()
res2 = multi_iter(range(N), min, max, sum, NotAll(1), NotAll(1000),
bufsize = 2**p)
stop2 = time()
print('multi_iter%d: %s %.2f' % (p, str(res2), stop2 - start2))
The timings are again horrible but you can see how using a constant size buffer improves things significantly:
5 iterators: (0, 9999999, 49999995000000, 0, 999) 0.71
multi_iter0: (0, 9999999, 49999995000000, 0, 999) 342.36
multi_iter1: (0, 9999999, 49999995000000, 0, 999) 264.71
multi_iter2: (0, 9999999, 49999995000000, 0, 999) 151.06
multi_iter3: (0, 9999999, 49999995000000, 0, 999) 95.79
multi_iter4: (0, 9999999, 49999995000000, 0, 999) 72.79
Maybe this can serve as a source of ideas for a good implementation.
Here is an implementation of the preemptive threading solution outlined in the original question.
[EDIT: There is a serious problem with this implementation. [EDIT, now fixed, using a solution inspired by Daniel Junglas.]
Consumers which do not iterate through the whole iterable, will cause a space leak in the queue inside Link. For example:
def exceeds_10(iterable):
for item in iterable:
if item > 10:
return True
return False
if you use this as one of the consumers and use the source range(10**6), it will stop removing items from the queue inside Link after the first 11 items, leaving approximately 10**6 items to be accumulated in the queue!
]
class Link:
def __init__(self, queue):
self.queue = queue
def __iter__(self):
return self
def __next__(self):
item = self.queue.get()
if item is FINISHED:
raise StopIteration
return item
def put(self, item):
self.queue.put(item)
def stop(self):
self.queue.put(FINISHED)
def consumer_not_listening_any_more(self):
self.__class__ = ClosedLink
class ClosedLink:
def put(self, _): pass
def stop(self) : pass
class FINISHED: pass
def make_thread(link, consumer, future):
from threading import Thread
return Thread(target = lambda: on_thread(link, consumer, future))
def on_thread(link, consumer, future):
future.set_result(consumer(link))
link.consumer_not_listening_any_more()
def source_summaries_PREEMPTIVE_THREAD(source, *consumers):
from queue import SimpleQueue as Queue
from asyncio import Future
links = tuple(Link(Queue()) for _ in consumers)
futures = tuple( Future() for _ in consumers)
threads = tuple(map(make_thread, links, consumers, futures))
for thread in threads:
thread.start()
for item in source:
for link in links:
link.put(item)
for link in links:
link.stop()
for t in threads:
t.join()
return tuple(f.result() for f in futures)
It works, but (unsirprisingly) with a horrible degradation in performance:
def time(thunk):
from time import time
start = time()
thunk()
stop = time()
return stop - start
N = 10 ** 7
t = time(lambda: testit(source_summaries, N))
print(f'old: {N} in {t:5.1f} s')
t = time(lambda: testit(source_summaries_PREEMPTIVE_THREAD, N))
print(f'new: {N} in {t:5.1f} s')
giving
old: 10000000 in 1.2 s
new: 10000000 in 30.1 s
So, even though this is a theoretical solution, it is not a practical one[*].
Consequently, I think that this approach is a dead end, unless there's a way of persuading consumer to yield cooperatively (as opposed to forcing it to yield preemptively) in
def on_thread(link, consumer, future):
future.set_result(consumer(link))
... but that seems fundamentally impossible. Would love to be proven wrong.
[*] This is actually a bit harsh: the test does absolutely nothing with trivial data; if this were part of a larger computation which performed heavy calculations on the elements, then this approach could be genuinely useful.

Iterable multiprocessing Queue not exiting

import multiprocessing.queues as queues
import multiprocessing
class I(queues.Queue):
def __init__(self, maxsize=0):
super(I, self).__init__(maxsize)
self.length = 0
def __iter__(self):
return self
def put(self, obj, block=True, timeout=None):
super(I, self).put(obj,block,timeout)
self.length += 1
def get(self, block = True, timeout = None):
self.length -= 1
return super(I, self).get(block, timeout)
def __len__(self):
return self.length
def next(self):
item = self.get()
if item == 'Done':
raise StopIteration
return item
def thisworker(item):
print 'got this item: %s' % item
return item
q=I()
q.put(1)
q.put('Done')
the_pool = multiprocessing.Pool(1)
print the_pool.map(thisworker, q)
I'm trying to create an iterable queue to use with multiprocessing pool map.
The idea is that the function thisworker would append some items to the queue until a condition is met and then exit after putting 'Done' in the queue (I've not done it here in this code yet)
But, this code never completes, it always hangs up.
I'm not able to debug the real cause.
Request your help
PS: I've used self.length because the map_async method called from under the_pool.map requires to use the length of the iterable to form a variable: chunksize, which will be used to get tasks from the pool.
The problem is that you're treating 'Done' as a special-case item in the Queue, which indicates that the iteration should stop. So, if you iterate over the Queue using a for loop with your example, all that will be returned is 1. However, you're claiming that the length of the Queue is 2. This is screwing up the map code, which is relying on that length to accurately represent the number of items in the iterable in order to know when all the results have returned from the workers:
class MapResult(ApplyResult):
def __init__(self, cache, chunksize, length, callback):
ApplyResult.__init__(self, cache, callback)
...
# _number_left is used to know when the MapResult is done
self._number_left = length//chunksize + bool(length % chunksize)
So, you need to make the length actually be accurate. You can do that a few ways, but I would recommend not requiring a sentinel to be loaded into the Queue at all, and use get_nowait instead:
import multiprocessing.queues as queues
import multiprocessing
from Queue import Empty
class I(queues.Queue):
def __init__(self, maxsize=0):
super(I, self).__init__(maxsize)
self.length = 0
... <snip>
def next(self):
try:
item = self.get_nowait()
except Empty:
raise StopIteration
return item
def thisworker(item):
print 'got this item: %s' % item
return item
q=I()
q.put(1)
the_pool = multiprocessing.Pool(1)
print the_pool.map(thisworker, q)
Also, note that this approach isn't process safe. The length attribute will only be correct if you only put into the Queue from a single process, and then never put again after sending the Queue to a worker process. It also won't work in Python 3 without adjusting the imports and implementation, because the constructor for multiprocessing.queues.Queue has changed.
Instead of subclassing multiprocessing.queues.Queue, I would recommend using the iter built-in to iterate over the Queue:
q = multiprocessing.Queue()
q.put(1)
q.put(2)
q.put(None) # None is our sentinel, you could use 'Done', if you wanted
the_pool.map(thisworker, iter(q.get, None)) # This will call q.get() until None is returned
This will work on all versions of Python, is much less code, and is process-safe.
Edit:
Based on the requirements you mentioned in the comment to my answer, I think you're better off using imap instead of map, so that you don't need to know the length of the Queue at all. The reality is, you can't accurately determine that, and in fact the length may end up growing as you're iterating. If you use imap exclusively, then doing something similar to your original approach will work fine:
import multiprocessing
class I(object):
def __init__(self, maxsize=0):
self.q = multiprocessing.Queue(maxsize)
def __getattr__(self, attr):
if hasattr(self.q, attr):
return getattr(self.q, attr)
def __iter__(self):
return self
def next(self):
item = self.q.get()
if item == 'Done':
raise StopIteration
return item
def thisworker(item):
if item == 1:
q.put(3)
if item == 2:
q.put('Done')
print 'got this item: %s' % item
return item
q=I()
q.put(1)
q.put(2)
q.put(5)
the_pool = multiprocessing.Pool(2) # 2 workers
print list(the_pool.imap(thisworker, q))
Output:
got this item: 1
got this item: 5
got this item: 3
got this item: 2
[1, 2, 5, 3]
I got rid of the code that worried about the length, and used delegation instead of inheritance, for better Python 3.x compatibility.
Note that my original suggestion, to use iter(q.get, <sentinel>), still works here, too, as long as you use imap instead of map.

python threading in a loop

I have a project that requires a bunch of large matrices, which are stored in ~200 MB files, to be cross-correlated (i.e. FFT * conj(FFT)) with each other. The number of files is such that I can't just load them all up and then do my processing. On the other hand, reading in each file as I need it is slower than I'd like.
what I have so far is something like:
result=0
for i in xrange(N_files):
f1 = file_reader(file_list[i])
############################################################################
# here I want to have file_reader go start reading the next file I'll need #
############################################################################
in_place_processing(f1)
for j in xrange(i+1,N_files):
f2 = file_reader(file_list[j])
##################################################################
# here I want to have file_reader go start reading the next file #
##################################################################
in_place_processing(f2)
result += processing_function(f1,f2)
So basically, I just want to have two threads that will each read a file, give it to me when I ask for it (or as soon as it's done after I ask for it), and then go start reading the next file for when I ask for it. The object the file_reader returns is rather large and complicated, so I'm not sure if multiprocessing is the way to go here...
I've read about threading and queues but can't seem to figure out the part where I ask the thread to go read the file and can proceed with the program while it does. I don't want the threads to simply go about their business in the background -- am I missing a detail here, or is threading not the way to go?
Below is an example of using the multiprocessing module that will spawn off child processes to call your file_reader method and queue up their results. The queue should block when full, so you can control the number of read ahead's you'd like to perform with the QUEUE_SIZE constant.
This utilizes a standard Producer/Consumer model of multiprocess communication, with the child processes act as Producers, with the main thread being the Consumer. The join method call in the class destructor ensures the child process resources are cleaned up properly. There are some print statements interspersed for demonstration purposes.
Additionally, I added the ability for the QueuedFileReader class to offload work to a worker thread or run in the main thread, rather than using a child process, for comparison. This is done by specifying the mode parameter at class initialization to MODE_THREADS or MODE_SYNCHRONOUS, respectively.
import multiprocessing as mp
import Queue
import threading
import time
QUEUE_SIZE = 2 #buffer size of queue
## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]
def file_reader(filename):
time.sleep(.1)
result = (filename,'processed')
return result
def in_place_processing(f):
time.sleep(.2)
def processing_function(f1,f2):
print f1, f2
return id(f1) & id(f2)
MODE_SYNCHRONOUS = 0 #file_reader called in main thread synchronously
MODE_THREADS = 1 #file_reader executed in worker thread
MODE_PROCESS = 2 #file_reader executed in child_process
##################################################
## Class to encapsulate multiprocessing objects.
class QueuedFileReader():
def __init__(self, idlist, mode=MODE_PROCESS):
self.mode = mode
self.idlist = idlist
if mode == MODE_PROCESS:
self.queue = mp.Queue(QUEUE_SIZE)
self.process = mp.Process(target=QueuedFileReader.worker,
args=(self.queue,idlist))
self.process.start()
elif mode == MODE_THREADS:
self.queue = Queue.Queue(QUEUE_SIZE)
self.thread = threading.Thread(target=QueuedFileReader.worker,
args=(self.queue,idlist))
self.thread.start()
#staticmethod
def worker(queue, idlist):
for i in idlist:
queue.put((i, file_reader(file_list[i])))
print id(queue), 'queued', file_list[i]
queue.put('done')
def __iter__(self):
if self.mode == MODE_SYNCHRONOUS:
self.index = 0
return self
def next(self):
if self.mode == MODE_SYNCHRONOUS:
if self.index == len(self.idlist): raise StopIteration
q = (self.idlist[self.index],
file_reader(file_list[self.idlist[self.index]]))
self.index += 1
else:
q = self.queue.get()
if q == 'done': raise StopIteration
return q
def __del__(self):
if self.mode == MODE_PROCESS:
self.process.join()
elif self.mode == MODE_THREADS:
self.thread.join()
#mode = MODE_PROCESS
mode = MODE_THREADS
#mode = MODE_SYNCHRONOUS
result = 0
for i, f1 in QueuedFileReader(range(N_files),mode):
in_place_processing(f1)
for j, f2 in QueuedFileReader(range(i+1,N_files),mode):
in_place_processing(f2)
result += processing_function(f1,f2)
If your intermediate values are too large to pass through the Queue, you can execute each iteration of the outer loop in its own process. A handy way to do that would be using the Pool class in multiprocessing as in the example below.
import multiprocessing as mp
import time
## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]
def file_reader(filename):
time.sleep(.1)
result = (filename,'processed')
return result
def in_place_processing(f):
time.sleep(.2)
def processing_function(f1,f2):
print f1, f2
return id(f1) & id(f2)
def file_task(file_index):
print file_index
f1 = file_reader(file_list[file_index])
in_place_processing(f1)
task_result = 0
for j in range(file_index+1, N_files):
f2 = file_reader(file_list[j])
in_place_processing(f2)
task_result += processing_function(f1,f2)
return task_result
pool = mp.Pool(processes=None) #processes default to mp.cpu_count()
result = 0
for file_result in pool.map(file_task, range(N_files)):
result += file_result
print 'result', result
#or simply
#result = sum(pool.map(file_task, range(N_files)))

Dynamic processes in Python

I have a question concerning Python multiprocessing. I am trying to take a dataset, break into chunks, and pass those chunks to concurrently running processes. I need to transform large tables of data using simple calculations (eg. electrical resistance -> temperature for a thermistor).
The code listed below almost works as desired, but it doesn't seem to be spawning any new processes (or if so only one at a time).
from multiprocessing import Process
class Worker(Process):
# example data transform
def process(self, x): return (x * 2) / 3
def __init__(self, list):
self.data = list
self.result = map(self.process, self.data)
super(Worker, self).__init__()
if __name__ == '__main__':
start = datetime.datetime.now()
dataset = range(10000) # null dataset
processes = 3
for i in range(processes):
chunk = int(math.floor(len(dataset) / float(processes)))
if i + 1 == processes:
remainder = len(dataset) % processes
else: remainder = 0
tmp = dataset[i * chunk : (i + 1) * chunk + remainder]
exec('worker'+str(i)+' = Worker(tmp)')
exec('worker'+str(i)+'.start()')
for i in range(processes):
exec('worker'+str(i)+'.join()')
# just a placeholder to make sure the initial values of the set are as expected
exec('print worker'+str(i)+'.result[0]')
No need to send the number of chunks to each process, just use get_nowait() and handle the eventual Queue.Empty exception. Every process will get different amounts of CPU time and this should keep them all busy.
import multiprocessing, Queue
class Worker(multiprocessing.Process):
def process(self, x):
for i in range(15):
x += (float(i) / 2.6)
return x
def __init__(self, input, output):
self.input = input
self.output = output
super(Worker, self).__init__()
def run(self):
try:
while True:
self.output.put(self.process(self.input.get_nowait()))
except Queue.Empty:
pass
if name == 'main':
dataset = range(10)
processes = multiprocessing.cpu_count()
input = multiprocessing.Queue()
output = multiprocessing.Queue()
for obj in dataset:
input.put(obj)
for i in range(processes):
Worker(input, output).start()
for i in range(len(dataset)):
print output.get()
You haven't overridden the run method. There are two ways with processes (or threads) to have it execute code:
Create a process specifying target
Subclass the process, overriding the run method.
Overriding __init__ just means your process is all dressed up with nowhere to go. It should be used to give it attributes that it needs to perform what it needs to perform, but it shouldn't specify the task to be performed.
In your code, all the heavy lifting is done in this line:
exec('worker'+str(i)+' = Worker(tmp)')
and nothing is done here:
exec('worker'+str(i)+'.start()')
So checking the results with exec('print worker'+str(i)+'.result[0]') should give you something meaningful, but only because the code you want to be executed has been executed, but on process construction, not on process start.
Try this:
class Worker(Process):
# example data transform
def process(self, x): return (x * 2) / 3
def __init__(self, list):
self.data = list
self.result = []
super(Worker, self).__init__()
def run(self):
self.result = map(self.process, self.data)
EDIT:
Okay... so I was just flying based on my threading instincts here, and they were all wrong. What we both didn't understand about processes is that you can't directly share variables. Whatever you pass to a new process to start is read, copied, and gone forever. Unless you use one of the two standard ways to share data: queues and pipes. I've played around a little bit trying to get your code to work, but so far no luck. I think that will put you on the right track.
Ok, so it looks like the list was not thread safe, and I have moved to using a Queue (although it appears to be much slower). This code essentially accomplishes what I was trying to do:
import math, multiprocessing
class Worker(multiprocessing.Process):
def process(self, x):
for i in range(15):
x += (float(i) / 2.6)
return x
def __init__(self, input, output, chunksize):
self.input = input
self.output = output
self.chunksize = chunksize
super(Worker, self).__init__()
def run(self):
for x in range(self.chunksize):
self.output.put(self.process(self.input.get()))
if __name__ == '__main__':
dataset = range(10)
processes = multiprocessing.cpu_count()
input = multiprocessing.Queue()
output = multiprocessing.Queue()
for obj in dataset:
input.put(obj)
for i in range(processes):
chunk = int(math.floor(len(dataset) / float(processes)))
if i + 1 == processes:
remainder = len(dataset) % processes
else: remainder = 0
Worker(input, output, chunk + remainder).start()
for i in range(len(dataset)):
print output.get()

Categories

Resources