I wrote a simple app in Java that takes a list of paths and generates a file listing all the file paths under those folders.
If I have paths.txt that has:
c:\folder1\
c:\folder2\
...
...
c:\folder1000\
The app runs a recursive function on each path, multithreaded, and produces a file with all the file paths under those folders.
Now I want to write this app in Python.
I've written a simple app that uses os.walk() to run through a given folder and print the filepaths to output.
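Roughly, a minimal sequential version looks like this (just a sketch; paths.txt and output.txt are placeholder names, not necessarily what the original app uses):
import os

def walk_one(root, out):
    # write every file path under one root folder to the open file object out
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames:
            print(os.path.join(dirpath, name), file=out)

with open('paths.txt') as f, open('output.txt', 'w') as out:
    for root in f.read().split():
        walk_one(root, out)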
Now I want to run it in parallel, and I've seen that Python has modules for this:
threading and multiprocessing.
What is the best way to do this, and how would it be done?
Here is a multiprocessing solution:
from multiprocessing.pool import Pool
from multiprocessing import JoinableQueue as Queue
import os

def explore_path(path):
    directories = []
    nondirectories = []
    for filename in os.listdir(path):
        fullname = os.path.join(path, filename)
        if os.path.isdir(fullname):
            directories.append(fullname)
        else:
            nondirectories.append(filename)
    outputfile = path.replace(os.sep, '_') + '.txt'
    with open(outputfile, 'w') as f:
        for filename in nondirectories:
            print(filename, file=f)
    return directories

def parallel_worker():
    while True:
        path = unsearched.get()
        dirs = explore_path(path)
        for newdir in dirs:
            unsearched.put(newdir)
        unsearched.task_done()

# acquire the list of paths
with open('paths.txt') as f:
    paths = f.read().split()

unsearched = Queue()
for path in paths:
    unsearched.put(path)

with Pool(5) as pool:
    for i in range(5):
        pool.apply_async(parallel_worker)
    unsearched.join()

print('Done')
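Note that this relies on the worker processes inheriting the global unsearched queue, so it only works where processes are started by fork (e.g. Linux). Since the original question starts from a flat list of top-level folders in paths.txt, a simpler variation is to hand each whole folder to a pool worker that runs os.walk itself; the sketch below assumes that layout, and output.txt is a placeholder name:
import os
from multiprocessing import Pool

def walk_one(root):
    # collect every file path under a single top-level folder
    found = []
    for dirpath, dirnames, filenames in os.walk(root):
        for name in filenames:
            found.append(os.path.join(dirpath, name))
    return found

if __name__ == '__main__':
    with open('paths.txt') as f:
        roots = f.read().split()
    with Pool(5) as pool, open('output.txt', 'w') as out:
        # each worker walks one top-level folder; results arrive as workers finish
        for filepaths in pool.imap_unordered(walk_one, roots):
            for filepath in filepaths:
                out.write(filepath + '\n')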
This is a pattern for threads in Python which has been useful to me. I'm not sure how much threading will increase your performance due to the way threads work in CPython (the GIL), though directory traversal is mostly I/O-bound, so it may still help.
import threading
import queue
import os

class PathThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def printfiles(self, p):
        for path, dirs, files in os.walk(p):
            for f in files:
                print(os.path.join(path, f))

    def run(self):
        while True:
            path = self.queue.get()
            self.printfiles(path)
            self.queue.task_done()

# thread-safe queue
pathqueue = queue.Queue()
paths = ["foo", "bar", "baz"]

# spawn threads
for i in range(5):
    t = PathThread(pathqueue)
    t.daemon = True
    t.start()

# add paths to queue
for path in paths:
    pathqueue.put(path)

# wait for the queue to empty
pathqueue.join()
Even threading can be pretty helpful for directory traversal. I use the following code for traversing a SharePoint tree, getting a pretty significant speedup with about 50 threads.
This particular program returns (path, data) pairs for all XML files in a directory structure, and can easily be adapted for your use.
(This is cut and pasted from my program; some additional editing is needed.)
#imports and a minimal ScanError stub added to make this excerpt more self-contained
import queue
import sys
import threading
import time
import traceback
from datetime import datetime
from os import listdir
from os.path import getmtime, isdir, join

class ScanError(Exception):
    """Stand-in for the original program's ScanError exception."""
    pass

#unique string for passing error messages
ERROR = '\xffERROR\xff'

class ScanWorker(threading.Thread):
    """Worker class for scanning directory structures.
    pathQueue: queue for pathnames of directories
    resultQueue: results of processFile, pairs of (path, data) to be updated
    """
    lock = threading.Lock()
    dirCount = 0

    def __init__(self, pathQueue, resultQueue):
        self.pathQueue = pathQueue
        self.resultQueue = resultQueue
        super().__init__()

    def run(self):
        """Worker thread.
        Get a directory, process it, and put new directories on the
        queue."""
        try:
            while True:
                self.processDir(self.pathQueue.get())
                self.pathQueue.task_done()
        except Exception as e:
            #pass on exception to main thread
            description = traceback.format_exception(*sys.exc_info())
            description.insert(0,
                "Error in thread {}:\n".format(
                    threading.current_thread().name))
            self.resultQueue.put((ERROR, description))
            self.pathQueue.task_done()

    def processDir(self, top):
        """Visit a directory.
        Call self.processFile on every file, and queue the directories.
        """
        #Wait and retry a few times in case of network errors.
        #SharePoint is not reliable, gives errors for no reason
        for retryCount in range(30):
            try:
                names = listdir(top)
                break
            except OSError as e:
                if e.errno in (2, 22):
                    lastError = e
                    print(end="L", flush=True)
                    time.sleep(1)
                else:
                    raise
        else:
            print("List: too many retries")
            raise lastError
        #it is not important to worry about race conditions here
        self.__class__.dirCount += 1
        #process contents
        for name in names:
            if isdir(join(top, name)): self.pathQueue.put(join(top, name))
            else: self.processFile(join(top, name))

    def processFile(self, path):
        """Get XML file."""
        #only xml files
        if not path.lower().endswith('.xml'): return
        filemtime = datetime.fromtimestamp(getmtime(path))
        #SharePoint is not reliable, gives errors for no reason; just retry
        for retryCount in range(30):
            try:
                data = open(path, 'rb').read()
                break
            except OSError as e:
                if e.errno in (2, 22):
                    lastError = e
                    print(end="R", flush=True)
                    time.sleep(1)
                else:
                    raise
        else:
            print("Read: too many retries")
            raise lastError
        self.resultQueue.put((path, data))

class Scanner:
    """Interface to the ScanWorkers.
    SharePoint is pretty fast compared to its delay and handles 50 workers well.
    Make sure you only create one instance of Scanner!
    """
    def __init__(self, workers):
        #don't restrict the path queue length; that causes deadlock
        #we use a LIFO queue to get a more depth-first-like search,
        #reducing average queue length and hopefully improving server caching
        self.pathQueue = queue.LifoQueue()
        #this is the output queue to the main thread
        self.resultQueue = queue.Queue(5)
        self.workers = workers
        #start workers
        for i in range(workers):
            t = ScanWorker(self.pathQueue, self.resultQueue)
            t.daemon = True
            t.start()

    def startWorkers(self, path):
        #add counter
        self.added = 0
        #and go
        self.pathQueue.put(path)

    def processResult(self, wait=True):
        """Get an element from the result queue, and add to the zip file."""
        path, data = self.resultQueue.get(block=wait)
        if path == ERROR:
            #worker raised an alarm; stop scanning
            #pass on description
            raise ScanError(data)
        <do whatever you want to do with the file>
        self.resultQueue.task_done()
        self.added += 1

#main
#threads (worker count) and rootpath are set elsewhere in the original program
try:
    #set up
    scanner = Scanner(threads)
    scanner.startWorkers(rootpath)
    pathQueue, resultQueue = scanner.pathQueue, scanner.resultQueue
    #scanner is rolling; wait for it to finish
    with pathQueue.all_tasks_done:
        while pathQueue.unfinished_tasks:
            #tasks are still running
            #process results
            while True:
                try: scanner.processResult(wait=False)
                except queue.Empty: break
            #no new files found; check if the scanner is ready
            done = pathQueue.all_tasks_done.wait(timeout=1)
            if not done:
                #not yet; print something while we wait
                print(
                    "\rProcessed {} files from {} directories [{} {}] "
                    .format(
                        scanner.added,
                        ScanWorker.dirCount,
                        pathQueue.unfinished_tasks,
                        resultQueue.unfinished_tasks,
                    ), end='\r')
    #just to make sure everybody is ready: join the path queue
    pathQueue.join()
    #process the rest of the result queue
    while resultQueue.unfinished_tasks: scanner.processResult(wait=True)
    #go to a new line to prevent overwriting progress messages
    print()
except ScanError as e:
    print()
    print(*e.args[0], end='')
    print("Process interrupted.")
except KeyboardInterrupt:
    print("\nProcess interrupted.")
    print()
Related
I am writing a Python script that processes an operation through a ThreadPoolExecutor. My requirement is to stop script execution if an exception is raised by any of the workers. I have tried the exit method, but it only stops the particular thread, not the whole script.
Below is the piece of code:
def create_dump():
    tenants = get_all_tenants()
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.map(process_create_dump, tenants, chunksize=1)
    file = open(dump_file_path, 'a')
    json.dump(json_dump, file, indent=1)
    file.close()

def process_create_dump(tenant):
    r_components = dict()
    print("processing.....%s" % tenant)
    try:
        add_clients(r_components, tenant)
    except Exception:
        print("Unexpected exception occurred while processing")
        exit(1)
I found a solution using threading.Event and the shutdown method of ThreadPoolExecutor. Below is the code:
event_object = threading.Event()

def create_dump():
    tenants = get_all_tenants()
    with ThreadPoolExecutor(max_workers=8) as executor:
        executor.submit(terminate_executor, executor)
        executor.map(process_create_dump, tenants, chunksize=1)
        executor.submit(set_event_on_success)
    file = open(dump_file_path, 'a')
    json.dump(json_dump, file, indent=1)
    file.close()

def process_create_dump(tenant):
    r_components = dict()
    print("processing.....%s" % tenant)
    try:
        add_clients(r_components, tenant)
    except Exception:
        event_object.set()

def terminate_executor(executor):
    event_object.wait()
    executor.shutdown(wait=False, cancel_futures=True)

def set_event_on_success():
    while not is_process_completed():
        pass
    event_object.set()
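For completeness, another common pattern (not from the original answer) is to submit the tasks individually and call result() on each future, which re-raises a worker's exception in the main thread, provided the worker lets the exception propagate instead of catching it. A rough sketch, reusing the names get_all_tenants and process_create_dump from above; note that tasks already running still finish before the executor shuts down unless you also cancel the pending ones (cancel_futures, Python 3.9+):
from concurrent.futures import ThreadPoolExecutor

def create_dump_strict():
    tenants = get_all_tenants()
    with ThreadPoolExecutor(max_workers=8) as executor:
        futures = [executor.submit(process_create_dump, t) for t in tenants]
        for future in futures:
            # result() re-raises any exception raised inside the worker,
            # which aborts this loop on the first failure
            future.result()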
I have a problem with multiprocessing in Python. In the code below I start 7 workers (multiprocessing.Process) and one result thread (threading.Thread). Before and after processing the data (extracting some metadata from files), I run:
lsof | grep ' <user> ' | grep 'python3'
and I see some open handles such as:
python3 17291 ivo DEL REG 0,20 5288943 /dev/shm/ZMcs2H
python3 17291 ivo DEL REG 0,20 5288942 /dev/shm/3iMR4q
python3 17291 ivo DEL REG 0,20 5288941 /dev/shm/XPYh79
When I run the multiprocessing step many times in a loop (processing a continuous stream of messages), I eventually get
OSError: [Errno 24] Too many open files
Am I doing something wrong with the multiprocessing package?
def worker_process_results(meta_queue, res_dict):
    while True:
        try:
            (path, meta) = meta_queue.get()
            res_dict[path] = meta
        finally:
            meta_queue.task_done()

def multiprocess_get_metadata(paths, thread_count=7):
    """ Scan files for metadata (multiprocessing). """
    file_queue = multiprocessing.JoinableQueue()
    meta_queue = multiprocessing.JoinableQueue()
    res_dict = dict()

    # result thread
    meta_thread = threading.Thread(target=lambda: worker_process_results(meta_queue, res_dict))
    meta_thread.daemon = True
    meta_thread.start()

    workers = []
    for _ in range(0, min(thread_count, len(paths))):
        worker = MetaDataWorker(file_queue, meta_queue)
        worker.daemon = True
        worker.start()
        workers.append(worker)

    for path in paths:
        file_queue.put(path)

    file_queue.join()
    meta_queue.join()

    for x in workers:
        x.terminate()

    return res_dict

class MetaDataWorker(multiprocessing.Process):
    ''' Use library to get meta data from file. '''
    def __init__(self, file_queue, meta_queue):
        ''' Constructor. '''
        super().__init__()
        self.file_queue = file_queue
        self.meta_queue = meta_queue

    def run(self):
        """ Run. """
        while True:
            try:
                path = self.file_queue.get()
                meta = getmetadata(path)
                meta = None
                self.meta_queue.put((path, meta))
            except Exception as err:
                print("Thread end.")
                print("{0}".format(err))
            finally:
                self.file_queue.task_done()
Already solved: I needed to send termination signals to the workers and to the result thread so their otherwise never-ending loops would stop.
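The fix itself isn't shown, but the usual way to send such an ending signal is a sentinel value that tells each worker to leave its loop. A minimal, self-contained sketch of the idea (the None sentinel and the toy work are assumptions, not the poster's actual code):
import multiprocessing

SENTINEL = None  # any unique object works as the end-of-work marker

def worker(task_queue, result_queue):
    while True:
        item = task_queue.get()
        if item is SENTINEL:
            break  # producer is done; leave the loop so the process can exit
        result_queue.put(item * 2)  # stand-in for real work such as getmetadata()

if __name__ == '__main__':
    tasks = multiprocessing.Queue()
    results = multiprocessing.Queue()
    procs = [multiprocessing.Process(target=worker, args=(tasks, results))
             for _ in range(3)]
    for p in procs:
        p.start()
    for i in range(10):
        tasks.put(i)
    for _ in procs:
        tasks.put(SENTINEL)       # one sentinel per worker
    for _ in range(10):
        print(results.get())      # collect exactly as many results as tasks
    for p in procs:
        p.join()                  # workers exit cleanly instead of being terminated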
I'm trying to implement basic multiprocessing and I've run into an issue. The Python script is below.
import time, sys, random, threading
from multiprocessing import Process
from Queue import Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue(10)
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        item = append_queue.get()
        database.append(item)
        print("Appended to database in %.4f seconds" % database.append_time)
        append_queue.task_done()
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()

    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())

    append_queue.join()
    #append_queue_process.join()
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
The AnalyzeFrequency analyzes the frequencies of words in a file and get_results() returns a sorted list of said words and frequencies. The list is very large, perhaps 10000 items.
This list is then passed to the add_to_append_queue method, which adds it to a queue. The process_append_queue function takes the items one by one and adds the frequencies to a "database". This operation takes a bit longer than the actual analysis in main(), so I am trying to use a separate process for it. When I do this with the threading module, everything works perfectly fine, no errors. When I use Process, the script hangs at item = append_queue.get().
Could someone please explain what is happening here, and perhaps direct me toward a fix?
All answers appreciated!
UPDATE
The pickle error was my fault; it was just a typo. Now I am using the Queue class from multiprocessing, but the append_queue.get() method still hangs.
NEW CODE
import time, sys, random
from multiprocessing import Process, Queue
from FrequencyAnalysis import FrequencyStore, AnalyzeFrequency

append_queue = Queue()
database = FrequencyStore()

def add_to_append_queue(_list):
    append_queue.put(_list)

def process_append_queue():
    while True:
        database.append(append_queue.get())
        print("Appended to database in %.4f seconds" % database.append_time)
    return

def main():
    database.load_db()
    print("Database loaded in %.4f seconds" % database.load_time)
    append_queue_process = Process(target=process_append_queue)
    append_queue_process.daemon = True
    append_queue_process.start()
    #t = threading.Thread(target=process_append_queue)
    #t.daemon = True
    #t.start()

    while True:
        path = raw_input("file: ")
        if path == "exit":
            break
        a = AnalyzeFrequency(path)
        a.analyze()
        print("Analyzed file in %.4f seconds" % a._time)
        add_to_append_queue(a.get_results())

    #append_queue.join()
    #append_queue_process.join()
    print str(append_queue.qsize())
    database.save_db()
    print("Database saved in %.4f seconds" % database.save_time)
    sys.exit(0)

if __name__=="__main__":
    main()
UPDATE 2
This is the database code:
class FrequencyStore:

    def __init__(self):
        self.sorter = Sorter()
        self.db = {}
        self.load_time = -1
        self.save_time = -1
        self.append_time = -1
        self.sort_time = -1

    def load_db(self):
        start_time = time.time()
        try:
            file = open("results.txt", 'r')
        except:
            raise IOError
        self.db = {}
        for line in file:
            word, count = line.strip("\n").split("=")
            self.db[word] = int(count)
        file.close()
        self.load_time = time.time() - start_time

    def save_db(self):
        start_time = time.time()
        _db = []
        for key in self.db:
            _db.append([key, self.db[key]])
        _db = self.sort(_db)
        try:
            file = open("results.txt", 'w')
        except:
            raise IOError
        file.truncate(0)
        for x in _db:
            file.write(x[0] + "=" + str(x[1]) + "\n")
        file.close()
        self.save_time = time.time() - start_time

    def create_sorted_db(self):
        _temp_db = []
        for key in self.db:
            _temp_db.append([key, self.db[key]])
        _temp_db = self.sort(_temp_db)
        _temp_db.reverse()
        return _temp_db

    def get_db(self):
        return self.db

    def sort(self, _list):
        start_time = time.time()
        _list = self.sorter.mergesort(_list)
        _list.reverse()
        self.sort_time = time.time() - start_time
        return _list

    def append(self, _list):
        start_time = time.time()
        for x in _list:
            if x[0] not in self.db:
                self.db[x[0]] = x[1]
            else:
                self.db[x[0]] += x[1]
        self.append_time = time.time() - start_time
Comments suggest you're trying to run this on Windows. As I said in a comment,
If you're running this on Windows, it can't work - Windows doesn't have fork(), so each process gets its own Queue and they have nothing to do with each other. The entire module is imported "from scratch" by each process on Windows. You'll need to create the Queue in main(), and pass it as an argument to the worker function.
Here's a fleshed-out version of what you need to do to make it portable, although I removed all the database stuff because it's irrelevant to the problems you've described so far. I also removed the daemon fiddling, because that's usually just a lazy way to avoid shutting things down cleanly, and as often as not it will come back to bite you later:
def process_append_queue(append_queue):
    while True:
        x = append_queue.get()
        if x is None:
            break
        print("processed %d" % x)
    print("worker done")

def main():
    import multiprocessing as mp
    append_queue = mp.Queue(10)
    append_queue_process = mp.Process(target=process_append_queue, args=(append_queue,))
    append_queue_process.start()
    for i in range(100):
        append_queue.put(i)
    append_queue.put(None)  # tell worker we're done
    append_queue_process.join()

if __name__=="__main__":
    main()
The output is the "obvious" stuff:
processed 0
processed 1
processed 2
processed 3
processed 4
...
processed 96
processed 97
processed 98
processed 99
worker done
Note: because Windows doesn't (can't) fork(), it's impossible for worker processes to inherit any Python object on Windows. Each process runs the entire program from its start. That's why your original program couldn't work: each process created its own Queue, wholly unrelated to the Queue in the other process. In the approach shown above, only the main process creates a Queue, and the main process passes it (as an argument) to the worker process.
queue.Queue is thread-safe, but doesn't work across processes. This is quite easy to fix, though. Instead of:
from multiprocessing import Process
from Queue import Queue
You want:
from multiprocessing import Process, Queue
The purpose of my program is to download a file with threads. I define a unit size and use len/unit threads, where len is the length of the file being downloaded.
Using my program the file does get downloaded, but the threads are not stopping, and I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime

class MyThread(threading.Thread):
    def __init__(self,func,args,name=''):
        threading.Thread.__init__(self);
        self.func = func;
        self.args = args;
        self.name = name;
    def run(self):
        apply(self.func,self.args);

url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF';
request = urllib2.Request(url);
response = urllib2.urlopen(request);
meta = response.info();
response.close();
unit = 1000000;
flen = int(meta.getheaders('Content-Length')[0]);
print flen;
if flen%unit == 0:
    bs = flen/unit;
else :
    bs = flen/unit+1;
blocks = range(bs);
cnt = {};
for i in blocks:
    cnt[i]=i;

def getStr(i):
    try:
        print 'Thread %d start.'%(i,);
        fout = open('a.zip','wb');
        fout.seek(i*unit,0);
        if (i+1)*unit > flen:
            request.add_header('Range','bytes=%d-%d'%(i*unit,flen-1));
        else :
            request.add_header('Range','bytes=%d-%d'%(i*unit,(i+1)*unit-1));
        #opener = urllib2.build_opener();
        #buf = opener.open(request).read();
        resp = urllib2.urlopen(request);
        buf = resp.read();
        fout.write(buf);
    except BaseException:
        print 'Error';
    finally :
        #opener.close();
        fout.flush();
        fout.close();
        del cnt[i];
        # filelen = os.path.getsize('a.zip');
        print 'Thread %d ended.'%(i),
        print cnt;
        # print 'progress : %4.2f'%(filelen*100.0/flen,),'%';

def main():
    print 'download at:',ctime();
    threads = [];
    for i in blocks:
        t = MyThread(getStr,(blocks[i],),getStr.__name__);
        threads.append(t);
    for i in blocks:
        threads[i].start();
    for i in blocks:
        # print 'this is the %d thread;'%(i,);
        threads[i].join();
    #print 'size:',os.path.getsize('a.zip');
    print 'download done at:',ctime();

if __name__=='__main__':
    main();
Could someone please help me understand why the threads aren't stopping?
I can't really address your code example because it is quite messy and hard to follow, but a potential reason you are seeing the threads not end is that a request will stall out and never finish. urllib2 allows you to specify a timeout for how long you will allow the request to take.
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Make the HTTP requests have a timeout. If the timeout expires, try again or put the work back into the queue.
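To illustrate the timeout point, here is a small sketch in the same Python 2 style as the question (the URL and the retry policy are just placeholders):
import socket
import urllib2

def fetch_with_timeout(url, timeout=10, retries=3):
    # retry a few times; the timeout makes a stalled request give up instead of hanging forever
    for attempt in range(retries):
        try:
            return urllib2.urlopen(url, timeout=timeout).read()
        except (urllib2.URLError, socket.timeout):
            continue
    raise IOError("gave up on %s after %d attempts" % (url, retries))

# example usage:
data = fetch_with_timeout('http://example.com/somefile')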
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue

def worker(queue, results, lock):
    local_results = []
    while True:
        val = queue.get()
        if val is None:
            break
        # pretend to do work
        time.sleep(.1)
        local_results.append(val)

    with lock:
        results.extend(local_results)
        print threading.current_thread().name, "Done!"

num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []

for i in xrange(100):
    queue.put(i)

for _ in xrange(num_workers):
    # Use None as a sentinel to signal the threads to end
    queue.put(None)
    t = threading.Thread(target=worker, args=(queue, results, lock))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print sorted(results)
print "All done"
I have a list.
symbols = ('GGP', 'JPM', 'AIG', 'AMZN','GGP', 'rx', 'jnj', 'osip')
URL = "http://www.Xxxx_symbol=%s"
def fetch(symbols):
    try:
        url = URL % '+'.join(symbols)
        fp = urllib2.urlopen(url)
        try:
            data = fp.read()
        finally:
            fp.close()
        return data
    except Exception as e:
        print "No Internet Access"
I am trying to multithread the fetch process (with 4 threads), not multiprocess it, and not use Twisted. The output of the URL fetch is CSV with 7 lines of header info that I want to get rid of. I would like to loop over the symbols and write each one to its own file. I have used this fetch code before, and I can pass it a symbol list that has a single element.
This should get you started:
from threading import Thread, Lock

data = {}
data_lock = Lock()

class Fetcher(Thread):
    def __init__(self, symbol):
        Thread.__init__(self)
        self.symbol = symbol

    def run(self):
        # put the code from fetch() here
        # replace 'data = fp.read()' with the following
        tmp = fp.read()
        data_lock.acquire()
        data[self.symbol] = tmp
        data_lock.release()

# Start a new Fetcher thread like this:
fetcher = Fetcher(symbol)
fetcher.start()

# To wait for the thread to finish, use Thread.join():
fetcher.join()
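To tie the skeleton together with the question's fetch code, here is a rough sketch of one way it might look, again in Python 2; it starts one thread per symbol rather than a fixed pool of 4, drops the 7 CSV header lines mentioned in the question, and writes each symbol to its own file. The URL template comes from the question, and the file naming is an assumption:
import urllib2
from threading import Thread, Lock

URL = "http://www.Xxxx_symbol=%s"
symbols = ('GGP', 'JPM', 'AIG', 'AMZN', 'GGP', 'rx', 'jnj', 'osip')

data = {}
data_lock = Lock()

class Fetcher(Thread):
    def __init__(self, symbol):
        Thread.__init__(self)
        self.symbol = symbol

    def run(self):
        try:
            fp = urllib2.urlopen(URL % self.symbol)
            try:
                tmp = fp.read()
            finally:
                fp.close()
        except Exception:
            print "No Internet Access"
            return
        # drop the 7 header lines of the CSV
        body = '\n'.join(tmp.splitlines()[7:])
        with data_lock:
            data[self.symbol] = body

fetchers = [Fetcher(s) for s in symbols]
for f in fetchers:
    f.start()
for f in fetchers:
    f.join()

# write each symbol to its own file
for symbol, body in data.items():
    with open(symbol + '.csv', 'w') as out:
        out.write(body)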