I am having trouble with the multiprocessing module. I am using a Pool of workers with its map method to analyze lots of files concurrently. Each time a file has been processed I would like to have a counter updated so that I can keep track of how many files remain to be processed. Here is sample code:
import os
import multiprocessing
counter = 0
def analyze(file):
# Analyze the file.
global counter
counter += 1
print counter
if __name__ == '__main__':
files = os.listdir('/some/directory')
pool = multiprocessing.Pool(4)
pool.map(analyze, files)
I cannot find a solution for this.
The problem is that the counter variable is not shared between your processes: each separate process is creating its own local instance and incrementing that.
See this section of the documentation for some techniques you can employ to share state between your processes. In your case, you might want to share a Value instance between your workers.
Here's a working version of your example (with some dummy input data). Note it uses global values which I would really try to avoid in practice:
from multiprocessing import Pool, Value
from time import sleep
counter = None
def init(args):
''' store the counter for later use '''
global counter
counter = args
def analyze_data(args):
''' increment the global counter, do something with the input '''
global counter
# += operation is not atomic, so we need to get a lock:
with counter.get_lock():
counter.value += 1
print counter.value
return args * 10
if __name__ == '__main__':
#inputs = os.listdir(some_directory)
#
# initialize a cross-process counter and the input lists
#
counter = Value('i', 0)
inputs = [1, 2, 3, 4]
#
# create the pool of workers, ensuring each one receives the counter
# as it starts.
#
p = Pool(initializer = init, initargs = (counter, ))
i = p.map_async(analyze_data, inputs, chunksize = 1)
i.wait()
print i.get()
Counter class without the race-condition bug:
class Counter(object):
def __init__(self):
self.val = multiprocessing.Value('i', 0)
def increment(self, n=1):
with self.val.get_lock():
self.val.value += n
    @property
def value(self):
return self.val.value
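A minimal usage sketch of this Counter with a Pool (my addition, following the same initializer pattern as the example above; init_counter and do_work are made-up names, and the Counter class above is assumed to be defined in the same module):
from multiprocessing import Pool

counter = None

def init_counter(shared_counter):
    # store the shared Counter in each worker process
    global counter
    counter = shared_counter

def do_work(item):
    # hypothetical work function: bump the shared counter and return a result
    counter.increment()
    return item * 2

if __name__ == '__main__':
    shared = Counter()  # the Counter class defined above
    with Pool(4, initializer=init_counter, initargs=(shared,)) as pool:
        pool.map(do_work, range(10))
    print(shared.value)  # expected: 10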
An extremely simple example, changed from jkp's answer:
from multiprocessing import Pool, Value
from time import sleep
counter = Value('i', 0)
def f(x):
global counter
with counter.get_lock():
counter.value += 1
print("counter.value:", counter.value)
sleep(1)
return x
with Pool(4) as p:
r = p.map(f, range(1000*1000))
A faster Counter class that avoids using the built-in lock of Value twice:
class Counter(object):
def __init__(self, initval=0):
self.val = multiprocessing.RawValue('i', initval)
self.lock = multiprocessing.Lock()
def increment(self):
with self.lock:
self.val.value += 1
    @property
def value(self):
return self.val.value
https://eli.thegreenplace.net/2012/01/04/shared-counter-with-pythons-multiprocessing
https://docs.python.org/2/library/multiprocessing.html#multiprocessing.sharedctypes.Value
https://docs.python.org/2/library/multiprocessing.html#multiprocessing.sharedctypes.RawValue
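A brief usage sketch (not from the linked article) that passes one instance of this faster Counter to several Process workers; the work function and counts are illustrative:
import multiprocessing

def work(counter, n):
    # every process increments the same shared counter n times
    for _ in range(n):
        counter.increment()

if __name__ == '__main__':
    counter = Counter()  # the RawValue-based Counter defined above
    procs = [multiprocessing.Process(target=work, args=(counter, 1000))
             for _ in range(4)]
    for p in procs:
        p.start()
    for p in procs:
        p.join()
    print(counter.value)  # expected: 4000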
Here is a solution to your problem based on a different approach from that proposed in the other answers. It uses message passing with multiprocessing.Queue objects (instead of shared memory with multiprocessing.Value objects) and process-safe (atomic) built-in increment and decrement operators += and -= (instead of introducing custom increment and decrement methods) since you asked for it.
First, we define a class Subject for instantiating an object that will be local to the parent process and whose attributes are to be incremented or decremented:
import multiprocessing
class Subject:
def __init__(self):
self.x = 0
self.y = 0
Next, we define a class Proxy for instantiating an object that will be the remote proxy through which the child processes will request the parent process to retrieve or update the attributes of the Subject object. The interprocess communication will use two multiprocessing.Queue attributes, one for exchanging requests and one for exchanging responses. Requests are of the form (sender, action, *args) where sender is the sender name, action is the action name ('get', 'set', 'increment', or 'decrement' the value of an attribute), and args is the argument tuple. Responses are of the form value (to 'get' requests):
class Proxy(Subject):
def __init__(self, request_queue, response_queue):
self.__request_queue = request_queue
self.__response_queue = response_queue
def _getter(self, target):
sender = multiprocessing.current_process().name
self.__request_queue.put((sender, 'get', target))
return Decorator(self.__response_queue.get())
def _setter(self, target, value):
sender = multiprocessing.current_process().name
action = getattr(value, 'action', 'set')
self.__request_queue.put((sender, action, target, value))
    @property
def x(self):
return self._getter('x')
    @property
def y(self):
return self._getter('y')
    @x.setter
def x(self, value):
self._setter('x', value)
    @y.setter
def y(self, value):
self._setter('y', value)
Then, we define the class Decorator to decorate the int objects returned by the getters of a Proxy object in order to inform its setters whether the increment or decrement operators += and -= have been used by adding an action attribute, in which case the setters request an 'increment' or 'decrement' operation instead of a 'set' operation. The increment and decrement operators += and -= call the corresponding augmented assignment special methods __iadd__ and __isub__ if they are defined, and fall back on the assignment special methods __add__ and __sub__ which are always defined for int objects (e.g. proxy.x += value is equivalent to proxy.x = proxy.x.__iadd__(value) which is equivalent to proxy.x = type(proxy).x.__get__(proxy).__iadd__(value) which is equivalent to type(proxy).x.__set__(proxy, type(proxy).x.__get__(proxy).__iadd__(value))):
class Decorator(int):
def __iadd__(self, other):
value = Decorator(other)
value.action = 'increment'
return value
def __isub__(self, other):
value = Decorator(other)
value.action = 'decrement'
return value
Then, we define the function worker that will be run in the child processes and request the increment and decrement operations:
def worker(proxy):
proxy.x += 1
proxy.y -= 1
Finally, we define a single request queue to send requests to the parent process, and multiple response queues to send responses to the child processes:
if __name__ == '__main__':
subject = Subject()
request_queue = multiprocessing.Queue()
response_queues = {}
processes = []
for index in range(4):
sender = 'child {}'.format(index)
response_queues[sender] = multiprocessing.Queue()
proxy = Proxy(request_queue, response_queues[sender])
process = multiprocessing.Process(
target=worker, args=(proxy,), name=sender)
processes.append(process)
running = len(processes)
for process in processes:
process.start()
while subject.x != 4 or subject.y != -4:
sender, action, *args = request_queue.get()
print(sender, 'requested', action, *args)
if action == 'get':
response_queues[sender].put(getattr(subject, args[0]))
elif action == 'set':
setattr(subject, args[0], args[1])
elif action == 'increment':
setattr(subject, args[0], getattr(subject, args[0]) + args[1])
elif action == 'decrement':
setattr(subject, args[0], getattr(subject, args[0]) - args[1])
for process in processes:
process.join()
The program is guaranteed to terminate when += and -= are process-safe. If you remove process-safety by commenting the corresponding __iadd__ or __isub__ of Decorator then the program will only terminate by chance (e.g. proxy.x += value is equivalent to proxy.x = proxy.x.__iadd__(value) but falls back to proxy.x = proxy.x.__add__(value) if __iadd__ is not defined, which is equivalent to proxy.x = proxy.x + value which is equivalent to proxy.x = type(proxy).x.__get__(proxy) + value which is equivalent to type(proxy).x.__set__(proxy, type(proxy).x.__get__(proxy) + value), so the action attribute is not added and the setter requests a 'set' operation instead of an 'increment' operation).
Example process-safe session (atomic += and -=):
child 0 requested get x
child 0 requested increment x 1
child 0 requested get y
child 0 requested decrement y 1
child 3 requested get x
child 3 requested increment x 1
child 3 requested get y
child 2 requested get x
child 3 requested decrement y 1
child 1 requested get x
child 2 requested increment x 1
child 2 requested get y
child 2 requested decrement y 1
child 1 requested increment x 1
child 1 requested get y
child 1 requested decrement y 1
Example process-unsafe session (non-atomic += and -=):
child 2 requested get x
child 1 requested get x
child 0 requested get x
child 2 requested set x 1
child 2 requested get y
child 1 requested set x 1
child 1 requested get y
child 2 requested set y -1
child 1 requested set y -1
child 0 requested set x 1
child 0 requested get y
child 0 requested set y -2
child 3 requested get x
child 3 requested set x 2
child 3 requested get y
child 3 requested set y -3 # the program stalls here
A more sophisticated solution based on lock-free atomic operations, as shown in the example from the atomics library README:
from multiprocessing import Process, shared_memory
import atomics
def fn(shmem_name: str, width: int, n: int) -> None:
shmem = shared_memory.SharedMemory(name=shmem_name)
buf = shmem.buf[:width]
with atomics.atomicview(buffer=buf, atype=atomics.INT) as a:
for _ in range(n):
a.inc()
del buf
shmem.close()
if __name__ == "__main__":
# setup
width = 4
shmem = shared_memory.SharedMemory(create=True, size=width)
buf = shmem.buf[:width]
total = 10_000
# run processes to completion
p1 = Process(target=fn, args=(shmem.name, width, total // 2))
p2 = Process(target=fn, args=(shmem.name, width, total // 2))
p1.start(), p2.start()
p1.join(), p2.join()
# print results and cleanup
with atomics.atomicview(buffer=buf, atype=atomics.INT) as a:
print(f"a[{a.load()}] == total[{total}]")
del buf
shmem.close()
shmem.unlink()
(atomics can be installed via pip install atomics on most major platforms)
This is a different solution and the simplest to my taste.
The reasoning is that you create an empty list and append to it each time your function executes, then print len(list) to check progress.
Here is an example based on your code:
import os
import multiprocessing
counter = []
def analyze(file):
# Analyze the file.
counter.append(' ')
print len(counter)
if __name__ == '__main__':
files = os.listdir('/some/directory')
pool = multiprocessing.Pool(4)
pool.map(analyze, files)
For future visitors, the hack to add a counter to multiprocessing is as follows:
from multiprocessing.pool import ThreadPool
counter = []
def your_function(url):
    # function/process
    counter.append(' ')  # you can append anything
    return len(counter)

pool = ThreadPool()
result = pool.map(your_function, urls)
Hope this will help.
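If the workers are separate processes (multiprocessing.Pool) rather than threads, each process gets its own copy of the list; a hedged alternative sketch is to count completed tasks in the parent as results arrive, reusing the analyze function and directory from the original question:
import os
import multiprocessing

def analyze(file):
    # Analyze the file.
    return file

if __name__ == '__main__':
    files = os.listdir('/some/directory')
    pool = multiprocessing.Pool(4)
    done = 0
    for _ in pool.imap_unordered(analyze, files):
        done += 1
        print('%d of %d files processed' % (done, len(files)))
    pool.close()
    pool.join()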
I'm working on a progress bar in PyQt5, so I use a thread and a pool together:
import threading
import multiprocessing as mp
from queue import Queue
def multi(x):
return x*x
def pooler(q):
with mp.Pool() as pool:
count = 0
        for i in pool.imap_unordered(multi, range(100)):
print(count, i)
count += 1
q.put(count)
def main():
q = Queue()
    t = threading.Thread(target=pooler, args=(q,))
t.start()
print('start')
process = 0
while process < 100:
process = q.get()
print('p',process)
if __name__ == '__main__':
main()
I put this in a QThread worker and it works with acceptable latency.
Related
I wish to have a list of Queues shared between processes. The idea is that from a "main" process, I can pipe whatever information I want to one of the other processes, but the number of other processes isn't determined.
I cannot create the Queue in the "main" process. I am simulating a decentralised system and creating the Queue in the main process does not fit this paradigm. As such, the Queues must be created within the other processes.
This poses a difficulty, as I can't find how to share these Queues with the main process. I have a managed list using multiprocessing.Manager, but if I append a multiprocessing.Queue to it, I get:
RuntimeError: Queue objects should only be shared between processes
through inheritance
Appending a standard data type such as an integer works just fine.
MRE below:
import multiprocessing as mp
from time import sleep
class test:
def __init__(self, qlist):
self.qlist = qlist
self.q = mp.Queue()
qlist.append(4)
self.next = None
self.run()
def run(self):
while True:
val = self.q.get()
if val == 1:
                p = mp.Process(target = test, args=(self.qlist, ))
p.start()
else:
print(val)
if __name__ == '__main__':
manager = mp.Manager()
qlist = manager.list()
p = mp.Process(target = test, args=(qlist, ))
p.start()
sleep(0.5)
print(qlist)
p.join()
The idea would be that in the if __name__ == '__main__': code, I could look through qlist and select one of the Queues to pipe information to, such as: qlist[2].put(1) to add a test object or qlist[3].put("Hello") to print "Hello".
The best-case scenario would be to have a list of test objects (where each test object has its self.q attribute for accessing its Queue) that I could access from the "main" process, but I'm even less sure of how to do that, hence why I'm asking about the Queues.
Any help with this would be greatly appreciated.
You can definitely create queue instances in the main process; this occurs in your test.__init__ method with the statement self.q = mp.Queue(), which runs in the main process. The problem is that a multiprocessing queue cannot be added to a managed list. Here is your program, slightly modified so that it does not attempt to add the queues to a managed list. I have also made your test class (now renamed Test) a subclass of Process, and it will now terminate:
import multiprocessing as mp
class Test(mp.Process):
def __init__(self, value):
mp.Process.__init__(self)
self.value = value
self.q = mp.Queue()
self.q.put(value)
self.next = None
def run(self):
value = self.q.get()
print('value = ', value)
value -= 1
if value > 0:
p = Test(value).start()
if __name__ == '__main__':
p = Test(4).start()
Prints:
value = 4
value = 3
value = 2
value = 1
If you want to maintain a list of objects, then it would be better if Test is not a subclass of Process:
import multiprocessing as mp
class Test():
def __init__(self, lst, value):
lst.append(self)
self.lst = lst
self.value = value
self.q = mp.Queue()
self.q.put(value)
self.next = None
def run(self):
value = self.q.get()
print('value = ', value)
value -= 1
if value > 0:
test = Test(self.lst, value)
p = mp.Process(target=test.run).start()
if __name__ == '__main__':
manager = mp.Manager()
lst = manager.list()
test = Test(lst, 4)
p = mp.Process(target=test.run).start()
import time
time.sleep(3)
print(lst)
Prints:
value = 4
value = 3
value = 2
value = 1
[<__mp_main__.Test object at 0x0000028E6DAD5DC0>, <__mp_main__.Test object at 0x0000028E6DAD5FA0>, <__mp_main__.Test object at 0x0000028E6DAD5E50>, <__mp_main__.Test object at 0x0000028E6DAD5D90>]
But here is a big BUT:
Each of those objects "live" in a different address space and the references can only have meaning when accessed from the original address space they were created in. So this is pretty useless:
import multiprocessing as mp
class Test():
def __init__(self, lst, value):
lst.append(self)
self.lst = lst
self.value = value
self.q = mp.Queue()
self.q.put(value)
self.next = None
def run(self):
value = self.q.get()
print('value = ', value)
value -= 1
if value > 0:
test = Test(self.lst, value)
p = mp.Process(target=test.run).start()
if __name__ == '__main__':
manager = mp.Manager()
lst = manager.list()
test = Test(lst, 4)
p = mp.Process(target=test.run).start()
import time
time.sleep(3)
print(test, test.__class__, test.value)
print(lst)
for elem in lst:
print(type(elem))
print(elem.value)
Prints:
value = 4
value = 3
value = 2
value = 1
<__main__.Test object at 0x0000020E52E6A640> <class '__main__.Test'> 4
[<__mp_main__.Test object at 0x0000016827704DC0>, <__mp_main__.Test object at 0x0000016827704FA0>, <__mp_main__.Test object at 0x0000016827704250>, <__mp_main__.Test object at 0x0000016827704D90>]
<class '__main__.Test'>
Traceback (most recent call last):
File "C:\Ron\test\test.py", line 31, in <module>
print(elem.value)
AttributeError: 'Test' object has no attribute 'value'
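If a shared list of queues is really needed, one possible workaround (my own sketch, not part of the answer above, assuming Python 3.6+ where proxy objects can be nested) is to store manager.Queue() proxies in the managed list; unlike plain multiprocessing.Queue objects, these can be shared this way:
import multiprocessing as mp

def child(qlist, index):
    # fetch this child's queue from the managed list and wait for a message
    q = qlist[index]
    print(index, 'received', q.get())

if __name__ == '__main__':
    manager = mp.Manager()
    qlist = manager.list()
    for i in range(3):
        qlist.append(manager.Queue())  # manager-backed queues can be stored in a managed list
    procs = [mp.Process(target=child, args=(qlist, i)) for i in range(3)]
    for p in procs:
        p.start()
    for i in range(3):
        qlist[i].put('hello %d' % i)
    for p in procs:
        p.join()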
I am currently trying to get into Python.
To explain the code below: it compares two Roulette strategies over many runs.
The color doubling strategy without knowing which colors were hit before the start.
The color doubling strategy knowing that red was hit 10 times before, so I start with the start value times 2^10.
The "Player" class implements both strategies. The global "globalBal_1" and "globalBal_2" variables count the profit for each strategy.
But the algorithm itself should not be the problem. The main problem is that when I call the calculating function "run" directly, it delivers results, but the multiprocessing processes for some reason do not change the global "globalBal_1" and "globalBal_2" variables and thus don't deliver results. Rather, they keep the value "0" that I declared initially.
What am I doing wrong there? I'm fairly new to multiprocessing and to Python itself.
Edit:
Expected values for "globalBal_1" and "globalBal_2" are about half of "rounds", so in this case they should be about "500,000" (per process it is 500,000 / number of processes).
But the actual results for the multiprocessing runs are "0".
Code:
from numpy.random import randint
import time
from multiprocessing import Process
threads = 4
rounds = int(1000000 / threads)
globalBal_1 = 0
globalBal_2 = 0
class Player:
def __init__(self):
self.balance_1 = 0
self.balance_2 = 0
def strat_1(self, sequence):
counter = 0
for i in range(len(sequence) - 1):
if sequence[i]:
counter += 1
self.balance_1 += counter
def strat_2(self, sequence):
for i in range(len(sequence) - 1 - 1):
if sequence[i] == 1:
return
if sequence[len(sequence) - 1]:
self.balance_2 += 2 ** (len(sequence) - 0)
def getBal_1(self):
return self.balance_1
def getBal_2(self):
return self.balance_2
def run(count):
p1 = Player()
print("Inside run func")
global globalBal_1, globalBal_2
for i in range(count):
rolls = randint(0, 2, 10)
p1.strat_1(rolls)
p1.strat_2(rolls)
globalBal_1 += p1.getBal_1()
globalBal_2 += p1.getBal_2()
print("Finished run func")
if __name__ == '__main__':
start = time.time()
procs = [Process(target=run, args=(rounds,)) for t in range(threads)]
for p in procs:
p.start()
for p in procs:
p.join()
tempEnd = time.time()
print("Multiprocessing result:")
print(globalBal_1, globalBal_2, tempEnd - start)
print("\nSingle process:")
run(rounds)
end = time.time()
print(globalBal_1, globalBal_2, end - start)
Solution thanks to @mirmo and @Bing Wang:
def runMulti(count, result1, result2):
p1 = Player()
for i in range(count):
rolls = randint(0, 2, 10)
p1.strat_1(rolls)
p1.strat_2(rolls)
result1.value += p1.getBal_1()
result2.value += p1.getBal_2()
[...]
profit1 = Value('i', 0)
profit2 = Value('i', 0)
procs = [Process(target=runMulti, args=(rounds, profit1, profit2)) for t in range(threads)]
Please always include the actual and expected output.
The global variables are not updated simply because there are now 4 separate processes (hence the name multiprocessing), which have no access to the global variables you have created, or more specifically, no access to the global variables of the parent process.
Either return the value from each process and add the results up at the end, use a queue, or, as mentioned, use a shared object.
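A minimal sketch of the first option (return per-process totals and sum them in the parent); it assumes the Player class, numpy's randint, and the threads and rounds values from the question:
from multiprocessing import Pool

def run_and_return(count):
    # play the rounds and return the two balances instead of touching globals
    p1 = Player()
    for _ in range(count):
        rolls = randint(0, 2, 10)
        p1.strat_1(rolls)
        p1.strat_2(rolls)
    return p1.getBal_1(), p1.getBal_2()

if __name__ == '__main__':
    with Pool(threads) as pool:
        totals = pool.map(run_and_return, [rounds] * threads)
    globalBal_1 = sum(t[0] for t in totals)
    globalBal_2 = sum(t[1] for t in totals)
    print(globalBal_1, globalBal_2)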
I am using the class provided by Python Thread Pool (Python recipe) to simulate thread pooling. I am trying to increment the value counter in the function test. The problem is that it remains 0. I used the lock approach explained in "Is this simple python code thread safe", but it still isn't working.
Source code
#! /usr/bin/python
# -*- coding: utf-8 -*-
from Queue import Queue
from threading import Thread
import threading
lock = threading.Lock()
class Worker(Thread):
"""Thread executing tasks from a given tasks queue"""
def __init__(self, tasks):
Thread.__init__(self)
self.tasks = tasks
self.daemon = True
self.start()
def run(self):
while True:
func, args, kargs = self.tasks.get()
try: func(*args, **kargs)
except Exception, e: print e
self.tasks.task_done()
class ThreadPool:
"""Pool of threads consuming tasks from a queue"""
def __init__(self, num_threads):
self.tasks = Queue(num_threads)
for _ in range(num_threads): Worker(self.tasks)
def add_task(self, func, *args, **kargs):
"""Add a task to the queue"""
self.tasks.put((func, args, kargs))
def wait_completion(self):
"""Wait for completion of all the tasks in the queue"""
self.tasks.join()
def exp1_thread(counter):
with lock:
print counter
counter = counter + 1
def test():
# 1) Init a Thread pool with the desired number of threads
pool = ThreadPool(6)
counter = 0
for i in range(0, 10):
pool.add_task(exp1_thread,counter)
# 3) Wait for completion
pool.wait_completion()
if __name__ == "__main__":
test()
output
counter 0
counter 0
counter 0
counter 0
counter 0
counter 0
counter 0
counter 0
counter 0
counter 0
The reason why you are getting all zeros is that the integer value counter is passed by value to the threads: each thread receives a copy of the counter and then goes to town.
You can fix this by finding a way to pass the value by reference.
Option 1. Define counter as a list you pass around:
def exp1_thread(counter):
with lock:
print counter[0]
counter[0] = counter[0] + 1
def test():
# 1) Init a Thread pool with the desired number of threads
pool = ThreadPool(6)
counter = [0]
for i in range(0, 10):
pool.add_task(exp1_thread, counter)
# 3) Wait for completion
pool.wait_completion()
Option 2. Create an object you pass around.
class Counter:
def __init__(self, initial_count):
self.count = initial_count
def exp1_thread(counter):
with lock:
print counter.count
counter.count = counter.count + 1
def test():
# 1) Init a Thread pool with the desired number of threads
pool = ThreadPool(6)
counter = Counter(0)
for i in range(0, 10):
pool.add_task(exp1_thread, counter)
# 3) Wait for completion
pool.wait_completion()
This has nothing to do with threading. The actual reason is that int is immutable in Python.
A function that just increments an int would not have the desired effect.
def inc(x):
x +=1
y = 0
inc(y)
print y # 0
If you want to increment the number you can store it in a mutable datatype (such as a list or dict) and manipulate the list.
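For example, the same kind of increment works as expected when the counter lives in a one-element list (a small sketch of the suggestion above):
def inc(counter):
    counter[0] += 1

y = [0]
inc(y)
print(y[0])  # 1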
ints work differently from dictionaries; I'm sure someone well versed in Python internals can explain the difference. This is the approach I usually use and it works. As I just realized, it's because you're passing the object (dict, list, etc.) and not the value itself.
Either declare your variables as globals (but be careful) or use, say, a dictionary with individual key slots for the different threads and sum them up at the end.
from threading import *
from time import sleep
myMap = {'counter' : 0}
class worker(Thread):
def __init__(self, counterMap):
Thread.__init__(self)
self.counterMap = counterMap
self.start()
def run(self):
self.counterMap['counter'] += 1
worker(myMap)
sleep(0.2)
worker(myMap)
sleep(0.2)
print(myMap)
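A small sketch of the per-thread key slots mentioned above (the names are made up): each thread writes only to its own slot, and the slots are summed at the end:
from threading import Thread

counts = {}

class CountingWorker(Thread):
    def __init__(self, key, n):
        Thread.__init__(self)
        self.key = key
        self.n = n

    def run(self):
        # only this thread ever writes to counts[self.key], so no lock is needed
        for _ in range(self.n):
            counts[self.key] = counts.get(self.key, 0) + 1

workers = [CountingWorker('thread-%d' % i, 100) for i in range(4)]
for w in workers:
    w.start()
for w in workers:
    w.join()
print(sum(counts.values()))  # 400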
I have a project that requires a bunch of large matrices, which are stored in ~200 MB files, to be cross-correlated (i.e. FFT * conj(FFT)) with each other. The number of files is such that I can't just load them all up and then do my processing. On the other hand, reading in each file as I need it is slower than I'd like.
What I have so far is something like:
result=0
for i in xrange(N_files):
f1 = file_reader(file_list[i])
############################################################################
# here I want to have file_reader go start reading the next file I'll need #
############################################################################
in_place_processing(f1)
for j in xrange(i+1,N_files):
f2 = file_reader(file_list[j])
##################################################################
# here I want to have file_reader go start reading the next file #
##################################################################
in_place_processing(f2)
result += processing_function(f1,f2)
So basically, I just want to have two threads that will each read a file, give it to me when I ask for it (or as soon as it's done after I ask for it), and then go start reading the next file for when I ask for it. The object the file_reader returns is rather large and complicated, so I'm not sure if multiprocessing is the way to go here...
I've read about threading and queues but can't seem to figure out the part where I ask the thread to go read the file and can proceed with the program while it does. I don't want the threads to simply go about their business in the background -- am I missing a detail here, or is threading not the way to go?
Below is an example of using the multiprocessing module that will spawn off child processes to call your file_reader method and queue up their results. The queue should block when full, so you can control the number of read-aheads you'd like to perform with the QUEUE_SIZE constant.
This utilizes a standard Producer/Consumer model of multiprocess communication, with the child processes acting as Producers and the main thread as the Consumer. The join method call in the class destructor ensures the child process resources are cleaned up properly. There are some print statements interspersed for demonstration purposes.
Additionally, I added the ability for the QueuedFileReader class to offload work to a worker thread or run in the main thread, rather than using a child process, for comparison. This is done by setting the mode parameter at class initialization to MODE_THREADS or MODE_SYNCHRONOUS, respectively.
import multiprocessing as mp
import Queue
import threading
import time
QUEUE_SIZE = 2 #buffer size of queue
## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]
def file_reader(filename):
time.sleep(.1)
result = (filename,'processed')
return result
def in_place_processing(f):
time.sleep(.2)
def processing_function(f1,f2):
print f1, f2
return id(f1) & id(f2)
MODE_SYNCHRONOUS = 0 #file_reader called in main thread synchronously
MODE_THREADS = 1 #file_reader executed in worker thread
MODE_PROCESS = 2 #file_reader executed in child_process
##################################################
## Class to encapsulate multiprocessing objects.
class QueuedFileReader():
def __init__(self, idlist, mode=MODE_PROCESS):
self.mode = mode
self.idlist = idlist
if mode == MODE_PROCESS:
self.queue = mp.Queue(QUEUE_SIZE)
self.process = mp.Process(target=QueuedFileReader.worker,
args=(self.queue,idlist))
self.process.start()
elif mode == MODE_THREADS:
self.queue = Queue.Queue(QUEUE_SIZE)
self.thread = threading.Thread(target=QueuedFileReader.worker,
args=(self.queue,idlist))
self.thread.start()
    @staticmethod
def worker(queue, idlist):
for i in idlist:
queue.put((i, file_reader(file_list[i])))
print id(queue), 'queued', file_list[i]
queue.put('done')
def __iter__(self):
if self.mode == MODE_SYNCHRONOUS:
self.index = 0
return self
def next(self):
if self.mode == MODE_SYNCHRONOUS:
if self.index == len(self.idlist): raise StopIteration
q = (self.idlist[self.index],
file_reader(file_list[self.idlist[self.index]]))
self.index += 1
else:
q = self.queue.get()
if q == 'done': raise StopIteration
return q
def __del__(self):
if self.mode == MODE_PROCESS:
self.process.join()
elif self.mode == MODE_THREADS:
self.thread.join()
#mode = MODE_PROCESS
mode = MODE_THREADS
#mode = MODE_SYNCHRONOUS
result = 0
for i, f1 in QueuedFileReader(range(N_files),mode):
in_place_processing(f1)
for j, f2 in QueuedFileReader(range(i+1,N_files),mode):
in_place_processing(f2)
result += processing_function(f1,f2)
If your intermediate values are too large to pass through the Queue, you can execute each iteration of the outer loop in its own process. A handy way to do that would be using the Pool class in multiprocessing as in the example below.
import multiprocessing as mp
import time
## Placeholder for your functions and variables
N_files = 10
file_list = ['file %d' % i for i in range(N_files)]
def file_reader(filename):
time.sleep(.1)
result = (filename,'processed')
return result
def in_place_processing(f):
time.sleep(.2)
def processing_function(f1,f2):
print f1, f2
return id(f1) & id(f2)
def file_task(file_index):
print file_index
f1 = file_reader(file_list[file_index])
in_place_processing(f1)
task_result = 0
for j in range(file_index+1, N_files):
f2 = file_reader(file_list[j])
in_place_processing(f2)
task_result += processing_function(f1,f2)
return task_result
pool = mp.Pool(processes=None) #processes default to mp.cpu_count()
result = 0
for file_result in pool.map(file_task, range(N_files)):
result += file_result
print 'result', result
#or simply
#result = sum(pool.map(file_task, range(N_files)))
How can I implement a conditional lock in a threaded application? For instance, I have
30 threads that are calling a function, and most of the time all threads can access it simultaneously, but depending on the function input there can be a condition when only one thread can do that one thing. (If the value for the input is repeated and some thread is still working on it, then I need a lock.)
I know that there is a threading module with RLock(), but I don't know how to use it in the way I described in the first part.
Edit: The question is actually about how to prevent any two threads from running the same function with the same argument at the same time. (Thanks to David for helping me formulate my question :) )
Try this: have a lock in the module where your function is, and if the input to the function is such that locking is required, acquire the lock inside the function. Otherwise don't.
l = threading.RLock()
def fn(arg):
if arg == arg_that_needs_lock:
l.acquire()
try:
# do stuff
finally:
l.release()
else:
# do other stuff
EDIT:
As far as I can tell now, the question is actually about how to prevent any two threads from running the same function with the same argument at the same time. There's no problem with two threads running the same function with different arguments at the same time, though. The simple method to do this, if all valid arguments to the function can be dictionary keys, is to create a dictionary of arguments to locks:
import threading
dict_lock = threading.RLock()
locks = {}
def fn_dict(arg):
dict_lock.acquire()
try:
        if arg not in locks:
locks[arg] = threading.RLock()
l = locks[arg]
finally:
dict_lock.release()
l.acquire()
try:
# do stuff
finally:
l.release()
If your function can be called with many different arguments, though, that amounts to a lot of locks. Probably a better way is to have a set of all arguments with which the function is currently executing, and have the contents of that set protected by a lock. I think this should work:
set_condition = threading.Condition()
current_args = set()
def fn_set(arg):
set_condition.acquire()
try:
while arg in current_args:
set_condition.wait()
current_args.add(arg)
finally:
set_condition.release()
# do stuff
set_condition.acquire()
try:
current_args.remove(arg)
set_condition.notifyAll()
finally:
set_condition.release()
It sounds like you want something similar to a Readers-Writer lock.
This is probably not what you want, but might be a clue:
from __future__ import with_statement
import threading
def RWLock(readers = 1, writers = 1):
m = _Monitor(readers, writers)
return (_RWLock(m.r_increment, m.r_decrement), _RWLock(m.w_increment, m.w_decrement))
class _RWLock(object):
def __init__(self, inc, dec):
self.inc = inc
self.dec = dec
def acquire(self):
self.inc()
def release(self):
self.dec()
def __enter__(self):
self.inc()
    def __exit__(self, exc_type, exc_value, traceback):
self.dec()
class _Monitor(object):
def __init__(self, max_readers, max_writers):
self.max_readers = max_readers
self.max_writers = max_writers
self.readers = 0
self.writers = 0
self.monitor = threading.Condition()
def r_increment(self):
with self.monitor:
while self.writers > 0 and self.readers < self.max_readers:
self.monitor.wait()
self.readers += 1
self.monitor.notify()
def r_decrement(self):
with self.monitor:
while self.writers > 0:
self.monitor.wait()
assert(self.readers > 0)
self.readers -= 1
self.monitor.notify()
def w_increment(self):
with self.monitor:
while self.readers > 0 and self.writers < self.max_writers:
self.monitor.wait()
self.writers += 1
self.monitor.notify()
def w_decrement(self):
with self.monitor:
assert(self.writers > 0)
self.writers -= 1
self.monitor.notify()
if __name__ == '__main__':
rl, wl = RWLock()
wl.acquire()
wl.release()
rl.acquire()
rl.release()
(Unfortunately not tested)