Python - appending to same file from multiple threads - python

I'm writing an app that appends lines to the same file from multiple threads.
I have a problem in which some lines are appended without a new line.
Any solution for this?
class PathThread(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def printfiles(self, p):
for path, dirs, files in os.walk(p):
for f in files:
print(f, file=output)
def run(self):
while True:
path = self.queue.get()
self.printfiles(path)
self.queue.task_done()
pathqueue = Queue.Queue()
paths = getThisFromSomeWhere()
output = codecs.open('file', 'a')
# spawn threads
for i in range(0, 5):
t = PathThread(pathqueue)
t.setDaemon(True)
t.start()
# add paths to queue
for path in paths:
pathqueue.put(path)
# wait for queue to get empty
pathqueue.join()

The solution is to write to the file in one thread only.
import Queue # or queue in Python 3
import threading
class PrintThread(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def printfiles(self, p):
for path, dirs, files in os.walk(p):
for f in files:
print(f, file=output)
def run(self):
while True:
result = self.queue.get()
self.printfiles(result)
self.queue.task_done()
class ProcessThread(threading.Thread):
def __init__(self, in_queue, out_queue):
threading.Thread.__init__(self)
self.in_queue = in_queue
self.out_queue = out_queue
def run(self):
while True:
path = self.in_queue.get()
result = self.process(path)
self.out_queue.put(result)
self.in_queue.task_done()
def process(self, path):
# Do the processing job here
pathqueue = Queue.Queue()
resultqueue = Queue.Queue()
paths = getThisFromSomeWhere()
output = codecs.open('file', 'a')
# spawn threads to process
for i in range(0, 5):
t = ProcessThread(pathqueue, resultqueue)
t.setDaemon(True)
t.start()
# spawn threads to print
t = PrintThread(resultqueue)
t.setDaemon(True)
t.start()
# add paths to queue
for path in paths:
pathqueue.put(path)
# wait for queue to get empty
pathqueue.join()
resultqueue.join()

the fact that you never see jumbled text on the same line or new lines in the middle of a line is a clue that you actually dont need to syncronize appending to the file. the problem is that you use print to write to a single file handle. i suspect print is actually doing 2 operations to the file handle in one call and those operations are racing between the threads. basically print is doing something like:
file_handle.write('whatever_text_you_pass_it')
file_handle.write(os.linesep)
and because different threads are doing this simultaneously on the same file handle sometimes one thread will get in the first write and the other thread will then get in its first write and then you'll get two carriage returns in a row. or really any permutation of these.
the simplest way to get around this is to stop using print and just use write directly. try something like this:
output.write(f + os.linesep)
this still seems dangerous to me. im not sure what gaurantees you can expect with all the threads using the same file handle object and contending for its internal buffer. personally id side step the whole issue and just have every thread get its own file handle. also note that this works because the default for write buffer flushes is line-buffered, so when it does a flush to the file it ends on an os.linesep. to force it to use line-buffered send a 1 as the third argument of open. you can test it out like this:
#!/usr/bin/env python
import os
import sys
import threading
def hello(file_name, message, count):
with open(file_name, 'a', 1) as f:
for i in range(0, count):
f.write(message + os.linesep)
if __name__ == '__main__':
#start a file
with open('some.txt', 'w') as f:
f.write('this is the beginning' + os.linesep)
#make 10 threads write a million lines to the same file at the same time
threads = []
for i in range(0, 10):
threads.append(threading.Thread(target=hello, args=('some.txt', 'hey im thread %d' % i, 1000000)))
threads[-1].start()
for t in threads:
t.join()
#check what the heck the file had
uniq_lines = set()
with open('some.txt', 'r') as f:
for l in f:
uniq_lines.add(l)
for u in uniq_lines:
sys.stdout.write(u)
The output looks like this:
hey im thread 6
hey im thread 7
hey im thread 9
hey im thread 8
hey im thread 3
this is the beginning
hey im thread 5
hey im thread 4
hey im thread 1
hey im thread 0
hey im thread 2

And maybe some more newlines where they shouldn't be?
You should have in mind the fact that a shared resource should not be accessed by more than one thread at a time or otherwise unpredictable consequences might happen (it's called using 'atomic operations' while using threads).
Take a look at this page for a little intuition: Thread Synchronization Mechanisms in Python

Related

python threading running process in backend

I was trying to make some text report file from some data source which takes enormous time and to simulate this I wrote the following code
I planned to do it using thread and thought t.daemon = True would
solve the purpose, but the program doesn't exit till the operation is
complete
import random
import threading
import time
import logging
logging.basicConfig(level=logging.DEBUG,
format='(%(threadName)-10s) %(message)s',
)
def worker():
"""thread worker function"""
t = threading.currentThread()
tag = random.randint(1, 64)
file_name = "/tmp/t-%d.txt" % (tag)
logging.debug('started writing file - %s', file_name)
f = open(file_name, 'w')
for x in xrange(2 ** tag): # total no of lines is 2**tag
f.write("%d\n" % x)
logging.debug('ending')
f.close()
return
# to simulate 5 files
for i in range(5):
t = threading.Thread(target=worker)
t.daemon = True
t.start()
main_thread = threading.currentThread()
for t in threading.enumerate():
if t is main_thread:
continue
logging.debug('joining %s', t.getName())
t.join()
When I removed t.join() then some of the data written till program exits and the program
exits quickly, but adding t.join() keeps program running till end. Is there any way to exit from program but the
process should still be running to complete the task in backend.
You aren't looking for a daemon. In fact you want to make sure your process isn't a daemon because it will get killed once that's all that's left and your program exists. You are looking to detach your thread.
Note: lowered max to 28 in case I forgot to kill processes (and so it won't take my entire disk). You will need to kill each process individually if you want them to stop! ie "kill 13345" if you had the message "exiting main 13345" (where that thread is over 2**25)
Also note: thread joining will keep going until the end because your program is not done running and is waiting to join the threads.
Here's what you want:
import logging
import random
import multiprocessing
import time
import sys
#Make sure you don't write to stdout after this program stopped running and sub-processes are logging!
logging.basicConfig(level=logging.DEBUG,
format='(%(threadName)-10s) %(message)s',
)
def detach():
p = multiprocessing.current_process()
name = "worker" + str(p.pid)
cc = multiprocessing.Process(name=name, target=worker)
cc.daemon = False
cc.start()
logging.debug('Detached process: %s %s', p.name, p.pid)
sys.stdout.flush()
def worker():
"""thread worker function"""
#Should probably make sure there isn't already a thread processing this file already...
tag = random.randint(5, 33) #Stop at 33 to make sure we don't take over the harddrive (8GB)
file_name = "/tmp/t-%d.txt" % (tag)
if tag > 26:
logging.warning('\n\nThe detached process resulting from this may need to be killed by hand.\n')
logging.debug('started writing file - %s', file_name)
#Changed your code to use "with", available in any recent python version
with open(file_name, 'w') as f:
for x in xrange(2 ** tag): # total no of lines is 2**tag
f.write("%d\n" % x)
return
#Stackoverflow: Keep scrolling to see more code!
# to simulate 5 files
for i in range(5):
t = multiprocessing.Process(target=detach)
t.daemon = False
t.start()
time.sleep(0.5)
t.terminate()
logging.debug("Terminating main program")

Many threads to write log file at same time in Python

I am writing a script to retrieve WMI info from many computers at the same time then write this info in a text file:
f = open("results.txt", 'w+') ## to clean the results file before the start
def filesize(asset):
f = open("results.txt", 'a+')
c = wmi.WMI(asset)
wql = 'SELECT FileSize,Name FROM CIM_DataFile where (Drive="D:" OR Drive="E:") and Caption like "%file%"'
for item in c.query(wql):
print >> f, item.Name.split("\\")[2].strip().upper(), str(item.FileSize)
class myThread (threading.Thread):
def __init__(self,name):
threading.Thread.__init__(self)
self.name = name
def run(self):
pythoncom.CoInitialize ()
print "Starting " + self.name
filesize(self.name)
print "Exiting " + self.name
thread1 = myThread('10.24.2.31')
thread2 = myThread('10.24.2.32')
thread3 = myThread('10.24.2.33')
thread4 = myThread('10.24.2.34')
thread1.start()
thread2.start()
thread3.start()
thread4.start()
The problem is that all threads writing at the same time.
You can simply create your own locking mechanism to ensure that only one thread is ever writing to a file.
import threading
lock = threading.Lock()
def write_to_file(f, text, file_size):
lock.acquire() # thread blocks at this line until it can obtain lock
# in this section, only one thread can be present at a time.
print >> f, text, file_size
lock.release()
def filesize(asset):
f = open("results.txt", 'a+')
c = wmi.WMI(asset)
wql = 'SELECT FileSize,Name FROM CIM_DataFile where (Drive="D:" OR Drive="E:") and Caption like "%file%"'
for item in c.query(wql):
write_to_file(f, item.Name.split("\\")[2].strip().upper(), str(item.FileSize))
You may want to consider placing the lock around the entire for loop for item in c.query(wql): to allow each thread to do a larger chunk of work before releasing the lock.
print is not thread safe. Use the logging module instead (which is):
import logging
import threading
import time
FORMAT = '[%(levelname)s] (%(threadName)-10s) %(message)s'
logging.basicConfig(level=logging.DEBUG,
format=FORMAT)
file_handler = logging.FileHandler('results.log')
file_handler.setFormatter(logging.Formatter(FORMAT))
logging.getLogger().addHandler(file_handler)
def worker():
logging.info('Starting')
time.sleep(2)
logging.info('Exiting')
t1 = threading.Thread(target=worker)
t2 = threading.Thread(target=worker)
t1.start()
t2.start()
Output (and contents of results.log):
[INFO] (Thread-1 ) Starting
[INFO] (Thread-2 ) Starting
[INFO] (Thread-1 ) Exiting
[INFO] (Thread-2 ) Exiting
Instead of using the default name (Thread-n), you can set your own name using the name keyword argument, which the %(threadName) formatting directive then will then use:
t = threading.Thread(name="My worker thread", target=worker)
(This example was adapted from an example from Doug Hellmann's excellent article about the threading module)
For another solution, use a Pool to calculate data, returning it to the parent process. This parent then writes all data to a file. Since there's only one proc writing to the file at a time, there's no need for additional locking.
Note the following uses a pool of processes, not threads. This makes the code much simpler and easier than putting something together using the threading module. (There is a ThreadPool object, but it's not documented.)
source
import glob, os, time
from multiprocessing import Pool
def filesize(path):
time.sleep(0.1)
return (path, os.path.getsize(path))
paths = glob.glob('*.py')
pool = Pool() # default: proc per CPU
with open("results.txt", 'w+') as dataf:
for (apath, asize) in pool.imap_unordered(
filesize, paths,
):
print >>dataf, apath,asize
output in results.txt
zwrap.py 122
usercustomize.py 38
tpending.py 2345
msimple4.py 385
parse2.py 499

Python Thread communication not working

I have two threads, Reader and Writer.
The Writer gets data from the network and sends it then over a socket to some executable. When this is done the writer should block up to 70 seconds which I specify with a Event.wait(askrate).
This should give the executable enough time to compute the result and then submit the output. If the computation is finished I used Event.set() to release the lock on the Writer
thread so that it can read the next data that is forwared to the executeable and so on.
The problem that I have is, that the Writer thread still keeps reading data while the Reader thread is waiting for the result coming through the serial interface.
Anyone an idea why this blocking meachnism is not proberly working between these two threads?
askrate = 70
s = socket.socket(socket.AF_UNIX, socket.SOCK_STREAM)
s.connect("/tmp/demo_socket")
class Reader(Thread):
def __init__(self):
Thread.__init__(self)
self.daemon = True
def run(self):
while True:
nonce = s.recv(4)
if len(nonce) == 4:
submitter = Submitter(writer.block, nonce)
#submit result and release thread lock in Writer class
golden.set()
class Writer(Thread):
def __init__(self):
Thread.__init__(self)
self.daemon = True
def run(self):
while True:
work = bc.getwork()
self.block = work['data']
self.midstate = work['midstate']
payload = self.midstate.decode('hex') + self.block.decode('hex')
s.send(payload)
result = golden.wait(askrate)
if result:
golden.clear()
golden = Event()
reader = Reader()
writer = Writer()
reader.start()
writer.start()
I'm pretty sure that it's not how you are supposed to use AF_UNIX sockets. You are supposed to open the pseudo-file twice (from the same of different processes); then writes to one side appear as reads on the other side, and vice-versa. In your code, you open the pseudo-file only once. Any write is probably blocking, waiting for another process to open the pseudo-file a second time.
In your case, you should use socket.socketpair(), which returns you two sockets at once, playing the role of the two ends. Use one end in each thread.

Threaded scripts stall after ending without closing

Hopefully this is just something small im doing wrong as these are some of my first threaded scripts using queues. Basically after running through it stops and sits there but wont exit.
import threading
import Queue
class Words(threading.Thread):
def __init__(self):
threading.Thread.__init__(self)
self.queue = Queue.Queue()
def word(self):
read = open('words.txt')
for word in read:
word = word.replace("\n","")
self.queue.put(word)
read.close()
for i in range(5):
t = self.run()
t.setDaemon(True)
t.start()
self.queue.join()
def run(self):
while True:
word = self.queue.get()
print word
self.queue.task_done()
if __name__ == '__main__':
Word = Words()
Word.word()
You are using threads incorrectly in a couple of ways in your code:
First, the code seems to be built on the incorrect assumption that the one Thread subclass object you have can spawn all of the threads you need to do the work. On the contrary, the Thread documentation says that start "must be called at most once per Thread object". In the case of the word method, this is the self reference.
However, it would not be useful to call self.start() because that would spawn a single thread to consume the queue, and you would gain nothing from threading. Since word would have to construct new instances of Words anyway to initiate multiple threads, and the queue object will need to be accessed by multiple Words instances, it would be useful to have both of those separate from the Words object. For example, word could be a function outside of the Words object that starts like:
def word():
queue = Queue.Queue()
read = open('words.txt')
for word in read:
word = word.replace("\n","")
self.put(word)
read.close()
#...
This would also mean that Words would have to take the queue object as a parameter so that multiple instances would share the same queue:
class Words(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
Second, your thread function (run) is an infinite loop, so the thread will never terminate. Since you are only running the queue consumer threads after you have added all items to the queue, you should not have a problem terminating the thread once the queue is empty, like so:
def run(self):
while True:
try:
word = self.queue.get(False)
except Queue.Empty:
break
print word
self.queue.task_done()
It is useful to use exceptions here because otherwise the queue could empty out and then the thread could try to get from it and it would end up waiting forever for an item to be added.
Third, in your for loop you call self.run(), which passes control to the run method, which then processes the entire queue and returns None after the method is changed to terminate. The following lines would throw exceptions because t would be assigned the value None. Since you want to spawn other threads to do the work, you should do t = Word(queue) to get a new word thread and then t.start() to start. So, the code when put together should be
class Words(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
try:
word = self.queue.get(False)
except Queue.Empty:
break
print word
self.queue.task_done()
def word():
queue = Queue.Queue()
read = open('words.txt')
for word in read:
word = word.replace("\n","")
self.put(word)
read.close()
for i in range(5):
t = Word()
t.setDaemon(True)
t.start()
queue.join()
if __name__=='__main__':
word()
It looks to me like you're mixing up a number of different aspects of threads, when you really just need a simple solution. As far as I can tell, the for i in range(5): loop never gets past the first iteration because you run the thread and it gets caught in an infinite loop.
Here's how I would do it:
import threading
import Queue
class Worker(threading.Thread):
def __init__(self, queue):
threading.Thread.__init__(self)
self.queue = queue
def run(self):
while True:
# try to dequeue a word from the queue
try:
word = self.queue.get_nowait()
# if there's nothing in the queue, break because we're done
except Queue.Empty:
break
# if the 'try' was successful at getting a word, print it
print word
def fill_queue(queue):
read = open('words.txt')
for word in read:
word = word.replace("\n", "")
queue.put(word)
read.close()
if __name__ == "__main__":
# create empty queue
queue = Queue.Queue()
# fill the queue with work
fill_queue(queue)
# create 5 worker threads
threads = []
for i in range(5):
threads.append(Worker(queue))
# start threads
for thread in threads:
thread.start()
# join threads once they finish
for thread in threads:
thread.join()
If you would like to read over some examples of threaded code in Python, the following recipes might be able teach you some basics regarding the subject. Some of them are demonstrations, and others are programs:
mthread.py (2)
mthread.py (1)
Thread Syncronizer
Bounded Buffer Example (1)
Bounded Buffer Example (2)
Port Forwarding
Module For Running Simple Proxies
Proxy Example
Paint 2.0
spots (2)
Directory Pruner 2

Using Python's Multiprocessing module to execute simultaneous and separate SEAWAT/MODFLOW model runs

I'm trying to complete 100 model runs on my 8-processor 64-bit Windows 7 machine. I'd like to run 7 instances of the model concurrently to decrease my total run time (approx. 9.5 min per model run). I've looked at several threads pertaining to the Multiprocessing module of Python, but am still missing something.
Using the multiprocessing module
How to spawn parallel child processes on a multi-processor system?
Python Multiprocessing queue
My Process:
I have 100 different parameter sets I'd like to run through SEAWAT/MODFLOW to compare the results. I have pre-built the model input files for each model run and stored them in their own directories. What I'd like to be able to do is have 7 models running at a time until all realizations have been completed. There needn't be communication between processes or display of results. So far I have only been able to spawn the models sequentially:
import os,subprocess
import multiprocessing as mp
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
files = []
for f in os.listdir(ws + r'\fieldgen\reals'):
if f.endswith('.npy'):
files.append(f)
## def work(cmd):
## return subprocess.call(cmd, shell=False)
def run(f,def_param=ws):
real = f.split('_')[2].split('.')[0]
print 'Realization %s' % real
mf2k = r'c:\modflow\mf2k.1_19\bin\mf2k.exe '
mf2k5 = r'c:\modflow\MF2005_1_8\bin\mf2005.exe '
seawatV4 = r'c:\modflow\swt_v4_00_04\exe\swt_v4.exe '
seawatV4x64 = r'c:\modflow\swt_v4_00_04\exe\swt_v4x64.exe '
exe = seawatV4x64
swt_nam = ws + r'\reals\real%s\ss\ss.nam_swt' % real
os.system( exe + swt_nam )
if __name__ == '__main__':
p = mp.Pool(processes=mp.cpu_count()-1) #-leave 1 processor available for system and other processes
tasks = range(len(files))
results = []
for f in files:
r = p.map_async(run(f), tasks, callback=results.append)
I changed the if __name__ == 'main': to the following in hopes it would fix the lack of parallelism I feel is being imparted on the above script by the for loop. However, the model fails to even run (no Python error):
if __name__ == '__main__':
p = mp.Pool(processes=mp.cpu_count()-1) #-leave 1 processor available for system and other processes
p.map_async(run,((files[f],) for f in range(len(files))))
Any and all help is greatly appreciated!
EDIT 3/26/2012 13:31 EST
Using the "Manual Pool" method in #J.F. Sebastian's answer below I get parallel execution of my external .exe. Model realizations are called up in batches of 8 at a time, but it doesn't wait for those 8 runs to complete before calling up the next batch and so on:
from __future__ import print_function
import os,subprocess,sys
import multiprocessing as mp
from Queue import Queue
from threading import Thread
def run(f,ws):
real = f.split('_')[-1].split('.')[0]
print('Realization %s' % real)
seawatV4x64 = r'c:\modflow\swt_v4_00_04\exe\swt_v4x64.exe '
swt_nam = ws + r'\reals\real%s\ss\ss.nam_swt' % real
subprocess.check_call([seawatV4x64, swt_nam])
def worker(queue):
"""Process files from the queue."""
for args in iter(queue.get, None):
try:
run(*args)
except Exception as e: # catch exceptions to avoid exiting the
# thread prematurely
print('%r failed: %s' % (args, e,), file=sys.stderr)
def main():
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
wdir = os.path.join(ws, r'fieldgen\reals')
q = Queue()
for f in os.listdir(wdir):
if f.endswith('.npy'):
q.put_nowait((os.path.join(wdir, f), ws))
# start threads
threads = [Thread(target=worker, args=(q,)) for _ in range(8)]
for t in threads:
t.daemon = True # threads die if the program dies
t.start()
for _ in threads: q.put_nowait(None) # signal no more files
for t in threads: t.join() # wait for completion
if __name__ == '__main__':
mp.freeze_support() # optional if the program is not frozen
main()
No error traceback is available. The run() function performs its duty when called upon a single model realization file as with mutiple files. The only difference is that with multiple files, it is called len(files) times though each of the instances immediately closes and only one model run is allowed to finish at which time the script exits gracefully (exit code 0).
Adding some print statements to main() reveals some information about active thread-counts as well as thread status (note that this is a test on only 8 of the realization files to make the screenshot more manageable, theoretically all 8 files should be run concurrently, however the behavior continues where they are spawn and immediately die except one):
def main():
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
wdir = os.path.join(ws, r'fieldgen\test')
q = Queue()
for f in os.listdir(wdir):
if f.endswith('.npy'):
q.put_nowait((os.path.join(wdir, f), ws))
# start threads
threads = [Thread(target=worker, args=(q,)) for _ in range(mp.cpu_count())]
for t in threads:
t.daemon = True # threads die if the program dies
t.start()
print('Active Count a',threading.activeCount())
for _ in threads:
print(_)
q.put_nowait(None) # signal no more files
for t in threads:
print(t)
t.join() # wait for completion
print('Active Count b',threading.activeCount())
**The line which reads "D:\\Data\\Users..." is the error information thrown when I manually stop the model from running to completion. Once I stop the model running, the remaining thread status lines get reported and the script exits.
EDIT 3/26/2012 16:24 EST
SEAWAT does allow concurrent execution as I've done this in the past, spawning instances manually using iPython and launching from each model file folder. This time around, I'm launching all model runs from a single location, namely the directory where my script resides. It looks like the culprit may be in the way SEAWAT is saving some of the output. When SEAWAT is run, it immediately creates files pertaining to the model run. One of these files is not being saved to the directory in which the model realization is located, but in the top directory where the script is located. This is preventing any subsequent threads from saving the same file name in the same location (which they all want to do since these filenames are generic and non-specific to each realization). The SEAWAT windows were not staying open long enough for me to read or even see that there was an error message, I only realized this when I went back and tried to run the code using iPython which directly displays the printout from SEAWAT instead of opening a new window to run the program.
I am accepting #J.F. Sebastian's answer as it is likely that once I resolve this model-executable issue, the threading code he has provided will get me where I need to be.
FINAL CODE
Added cwd argument in subprocess.check_call to start each instance of SEAWAT in its own directory. Very key.
from __future__ import print_function
import os,subprocess,sys
import multiprocessing as mp
from Queue import Queue
from threading import Thread
import threading
def run(f,ws):
real = f.split('_')[-1].split('.')[0]
print('Realization %s' % real)
seawatV4x64 = r'c:\modflow\swt_v4_00_04\exe\swt_v4x64.exe '
cwd = ws + r'\reals\real%s\ss' % real
swt_nam = ws + r'\reals\real%s\ss\ss.nam_swt' % real
subprocess.check_call([seawatV4x64, swt_nam],cwd=cwd)
def worker(queue):
"""Process files from the queue."""
for args in iter(queue.get, None):
try:
run(*args)
except Exception as e: # catch exceptions to avoid exiting the
# thread prematurely
print('%r failed: %s' % (args, e,), file=sys.stderr)
def main():
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
wdir = os.path.join(ws, r'fieldgen\reals')
q = Queue()
for f in os.listdir(wdir):
if f.endswith('.npy'):
q.put_nowait((os.path.join(wdir, f), ws))
# start threads
threads = [Thread(target=worker, args=(q,)) for _ in range(mp.cpu_count()-1)]
for t in threads:
t.daemon = True # threads die if the program dies
t.start()
for _ in threads: q.put_nowait(None) # signal no more files
for t in threads: t.join() # wait for completion
if __name__ == '__main__':
mp.freeze_support() # optional if the program is not frozen
main()
I don't see any computations in the Python code. If you just need to execute several external programs in parallel it is sufficient to use subprocess to run the programs and threading module to maintain constant number of processes running, but the simplest code is using multiprocessing.Pool:
#!/usr/bin/env python
import os
import multiprocessing as mp
def run(filename_def_param):
filename, def_param = filename_def_param # unpack arguments
... # call external program on `filename`
def safe_run(*args, **kwargs):
"""Call run(), catch exceptions."""
try: run(*args, **kwargs)
except Exception as e:
print("error: %s run(*%r, **%r)" % (e, args, kwargs))
def main():
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
workdir = os.path.join(ws, r'fieldgen\reals')
files = ((os.path.join(workdir, f), ws)
for f in os.listdir(workdir) if f.endswith('.npy'))
# start processes
pool = mp.Pool() # use all available CPUs
pool.map(safe_run, files)
if __name__=="__main__":
mp.freeze_support() # optional if the program is not frozen
main()
If there are many files then pool.map() could be replaced by for _ in pool.imap_unordered(safe_run, files): pass.
There is also mutiprocessing.dummy.Pool that provides the same interface as multiprocessing.Pool but uses threads instead of processes that might be more appropriate in this case.
You don't need to keep some CPUs free. Just use a command that starts your executables with a low priority (on Linux it is a nice program).
ThreadPoolExecutor example
concurrent.futures.ThreadPoolExecutor would be both simple and sufficient but it requires 3rd-party dependency on Python 2.x (it is in the stdlib since Python 3.2).
#!/usr/bin/env python
import os
import concurrent.futures
def run(filename, def_param):
... # call external program on `filename`
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
wdir = os.path.join(ws, r'fieldgen\reals')
files = (os.path.join(wdir, f) for f in os.listdir(wdir) if f.endswith('.npy'))
# start threads
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
future_to_file = dict((executor.submit(run, f, ws), f) for f in files)
for future in concurrent.futures.as_completed(future_to_file):
f = future_to_file[future]
if future.exception() is not None:
print('%r generated an exception: %s' % (f, future.exception()))
# run() doesn't return anything so `future.result()` is always `None`
Or if we ignore exceptions raised by run():
from itertools import repeat
... # the same
# start threads
with concurrent.futures.ThreadPoolExecutor(max_workers=8) as executor:
executor.map(run, files, repeat(ws))
# run() doesn't return anything so `map()` results can be ignored
subprocess + threading (manual pool) solution
#!/usr/bin/env python
from __future__ import print_function
import os
import subprocess
import sys
from Queue import Queue
from threading import Thread
def run(filename, def_param):
... # define exe, swt_nam
subprocess.check_call([exe, swt_nam]) # run external program
def worker(queue):
"""Process files from the queue."""
for args in iter(queue.get, None):
try:
run(*args)
except Exception as e: # catch exceptions to avoid exiting the
# thread prematurely
print('%r failed: %s' % (args, e,), file=sys.stderr)
# start threads
q = Queue()
threads = [Thread(target=worker, args=(q,)) for _ in range(8)]
for t in threads:
t.daemon = True # threads die if the program dies
t.start()
# populate files
ws = r'D:\Data\Users\jbellino\Project\stJohnsDeepening\model\xsec_a'
wdir = os.path.join(ws, r'fieldgen\reals')
for f in os.listdir(wdir):
if f.endswith('.npy'):
q.put_nowait((os.path.join(wdir, f), ws))
for _ in threads: q.put_nowait(None) # signal no more files
for t in threads: t.join() # wait for completion
Here is my way to maintain the minimum x number of threads in the memory. Its an combination of threading and multiprocessing modules. It may be unusual to other techniques like respected fellow members have explained above BUT may be worth considerable. For the sake of explanation, I am taking a scenario of crawling a minimum of 5 websites at a time.
so here it is:-
#importing dependencies.
from multiprocessing import Process
from threading import Thread
import threading
# Crawler function
def crawler(domain):
# define crawler technique here.
output.write(scrapeddata + "\n")
pass
Next is threadController function. This function will control the flow of threads to the main memory. It will keep activating the threads to maintain the threadNum "minimum" limit ie. 5. Also it won't exit until, all Active threads(acitveCount) are finished up.
It will maintain a minimum of threadNum(5) startProcess function threads (these threads will eventually start the Processes from the processList while joining them with a time out of 60 seconds). After staring threadController, there would be 2 threads which are not included in the above limit of 5 ie. the Main thread and the threadController thread itself. thats why threading.activeCount() != 2 has been used.
def threadController():
print "Thread count before child thread starts is:-", threading.activeCount(), len(processList)
# staring first thread. This will make the activeCount=3
Thread(target = startProcess).start()
# loop while thread List is not empty OR active threads have not finished up.
while len(processList) != 0 or threading.activeCount() != 2:
if (threading.activeCount() < (threadNum + 2) and # if count of active threads are less than the Minimum AND
len(processList) != 0): # processList is not empty
Thread(target = startProcess).start() # This line would start startThreads function as a seperate thread **
startProcess function, as a separate thread, would start Processes from the processlist. The purpose of this function (**started as a different thread) is that It would become a parent thread for Processes. So when It will join them with a timeout of 60 seconds, this would stop the startProcess thread to move ahead but this won't stop threadController to perform. So this way, threadController will work as required.
def startProcess():
pr = processList.pop(0)
pr.start()
pr.join(60.00) # joining the thread with time out of 60 seconds as a float.
if __name__ == '__main__':
# a file holding a list of domains
domains = open("Domains.txt", "r").read().split("\n")
output = open("test.txt", "a")
processList = [] # thread list
threadNum = 5 # number of thread initiated processes to be run at one time
# making process List
for r in range(0, len(domains), 1):
domain = domains[r].strip()
p = Process(target = crawler, args = (domain,))
processList.append(p) # making a list of performer threads.
# starting the threadController as a seperate thread.
mt = Thread(target = threadController)
mt.start()
mt.join() # won't let go next until threadController thread finishes.
output.close()
print "Done"
Besides maintaining a minimum number of threads in the memory, my aim was to also have something which could avoid stuck threads or processes in the memory. I did this using the time out function.
My apologies for any typing mistake.
I hope this construction would help anyone in this world.
Regards,
Vikas Gautam

Categories

Resources