Python multiprocessing and too many open files

I have a problem with multiprocessing in Python. In the code below I start 7 workers (multiprocessing.Process) and one result thread (threading.Thread). Before and after processing the data (extracting some metadata from files), I run:
lsof | grep ' <user> ' | grep 'python3'
and I see open handles such as:
python3 17291 ivo DEL REG 0,20 5288943 /dev/shm/ZMcs2H
python3 17291 ivo DEL REG 0,20 5288942 /dev/shm/3iMR4q
python3 17291 ivo DEL REG 0,20 5288941 /dev/shm/XPYh79
When I run the multiprocessing code many times in a loop (processing a continuous stream of messages), I eventually get
OSError: [Errno 24] Too many open files
Is there something wrong with how I am using the multiprocessing package?
def worker_process_results(meta_queue, res_dict):
    while True:
        try:
            (path, meta) = meta_queue.get()
            res_dict[path] = meta
        finally:
            meta_queue.task_done()

def multiprocess_get_metadata(paths, thread_count=7):
    """ Scan files for metadata (multiprocessing). """
    file_queue = multiprocessing.JoinableQueue()
    meta_queue = multiprocessing.JoinableQueue()
    res_dict = dict()

    # result thread
    meta_thread = threading.Thread(target=lambda: worker_process_results(meta_queue, res_dict))
    meta_thread.daemon = True
    meta_thread.start()

    workers = []
    for _ in range(0, min(thread_count, len(paths))):
        worker = MetaDataWorker(file_queue, meta_queue)
        worker.daemon = True
        worker.start()
        workers.append(worker)

    for path in paths:
        file_queue.put(path)

    file_queue.join()
    meta_queue.join()

    for x in workers:
        x.terminate()

    return res_dict

class MetaDataWorker(multiprocessing.Process):
    ''' Use library to get meta data from file. '''

    def __init__(self, file_queue, meta_queue):
        ''' Constructor. '''
        super().__init__()
        self.file_queue = file_queue
        self.meta_queue = meta_queue

    def run(self):
        """ Run. """
        while True:
            try:
                path = self.file_queue.get()
                meta = getmetadata(path)
                meta = None
                self.meta_queue.put((path, meta))
            except Exception as err:
                print("Thread end.")
                print("{0}".format(err))
            finally:
                self.file_queue.task_done()

Already solved: I needed to send ending signals (sentinels) to the workers and to the result thread to stop their never-ending loops.
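For reference, a minimal sketch of that sentinel pattern (not the original code; the SENTINEL name and the placeholder metadata are illustrative). Each worker leaves its loop when it reads the sentinel, so nothing stays blocked on get() and no terminate() calls are needed:

import multiprocessing

SENTINEL = None  # the "ending signal"

def worker(file_queue, meta_queue):
    # Pull paths until the sentinel arrives, then leave the loop cleanly.
    while True:
        path = file_queue.get()
        if path is SENTINEL:
            break
        meta = {"length": len(path)}  # placeholder for the real metadata extraction
        meta_queue.put((path, meta))

def main():
    paths = ["a.txt", "b.txt", "c.txt"]
    file_queue = multiprocessing.Queue()
    meta_queue = multiprocessing.Queue()

    workers = [multiprocessing.Process(target=worker, args=(file_queue, meta_queue))
               for _ in range(3)]
    for w in workers:
        w.start()

    for path in paths:
        file_queue.put(path)
    for _ in workers:        # one sentinel per worker
        file_queue.put(SENTINEL)

    # Collect exactly one result per path before joining, so the
    # workers' queue buffers are drained and join() cannot block.
    results = dict(meta_queue.get() for _ in paths)

    for w in workers:
        w.join()             # workers exit on their own; no terminate() needed
    print(results)

if __name__ == '__main__':
    main()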

Related

Multi-processed file reading in python

I want a file to be read in a multi-processed way. Each process will be a class instance doing some specific task, but the class that opens the file will be a singleton.
We don't want to split the entire file up front across the processes; instead, each process should asynchronously read a batch of lines from the text file, with no two processes ever receiving the same line.
I know this is quite a task, and I have tried multiple ways to achieve it without success.
Below is the code snippet where I tried to achieve the above:
class ListFromTextFile():
    def __init__(self, config: str):
        self.file_pointer = open(file_path, "r")
        self.batch_size = config["batch_size"]
        self.__lock = Lock()
        self.__lock_aquire = False

    def get_list_of_pids(self):
        function_name = "get_list_of_pids"
        pids = []
        try:
            for line in self.file_pointer:
                if self.__lock_aquire:
                    pid = line.strip("\n")
                    pids.append(pid)
                else:
                    self.__lock.acquire()
                    log_message(function_name, "Lock aquired !!!")
                    self.__lock_aquire = True
                if len(pids) == self.batch_size:
                    self.__lock.release()
                    log_message(function_name, "Lock released !!!")
                    self.__lock_aquire = False
                    yield pids
                    pids = []
            if len(pids):
                self.__lock.release()
                log_message(function_name, "Lock released !!!")
                yield pids
        except Exception as e:
            err = str(e)
            trace_back = traceback.format_exc()
            log_exception()(self.function_name, err, trace_back)
Here ListFromTextFile is a singleton class, and processes will call get_list_of_pids(), which yields a list of pids. The requirement is that every batch generated and yielded across all processes must be unique.
Any suggestions to improve the above code, or new ideas, are welcome.
What you want is not necessarily a singleton but rather a class instance that can be shared among multiple processes. The most straightforward way I know of doing this (other people might have other ideas) is to create a managed class from ListFromTextFile. I would first modify its definition so that the method get_list_of_pids is not a generator function (generators cannot be pickled). Instead, you call it repeatedly to return the next batch of pids until an empty list is returned, signifying end of file on the input. Since the managed object runs inside the manager's process and its method calls are handled by threads, it is sufficient to run get_list_of_pids as a critical section guarded by a threading.Lock.
For demo purposes my input file, test.txt, consists of 7 lines:
1
2
3
4
5
6
7
The Demo
from threading import Lock
from multiprocessing import Process, current_process
from multiprocessing.managers import BaseManager

file_path = 'test.txt'

class ListFromTextFile():
    def __init__(self, config: dict):
        self.file_pointer = open(file_path, "r")
        self.batch_size = config["batch_size"]
        self.eof = False
        self.__lock = Lock()

    def get_list_of_pids(self):
        function_name = "get_list_of_pids"
        pids = []
        if self.eof:
            # Returning an empty list signifies end of file:
            return pids
        with self.__lock:
            while True:
                line = self.file_pointer.readline()
                if line == '':
                    self.eof = True
                    # Return what we have, if anything.
                    return pids
                pids.append(line.strip('\n'))
                if len(pids) == self.batch_size:
                    return pids

class MyManager(BaseManager):
    pass

def worker(reader):
    import time

    while True:
        pids = reader.get_list_of_pids()
        if not pids:
            # Empty list returned -> end of file
            return
        print(f'Current pid: {current_process().pid}, pids: {pids}', flush=True)
        time.sleep(.2)  # Give the other process a chance to run

def demo():
    MyManager.register('ListFromTextFile', ListFromTextFile)
    with MyManager() as manager:
        config = {'batch_size': 3}
        reader = manager.ListFromTextFile(config)
        p1 = Process(target=worker, args=(reader,))
        p2 = Process(target=worker, args=(reader,))
        p1.start()
        p2.start()
        p1.join()
        p2.join()

if __name__ == '__main__':
    demo()
Prints:
Current pid: 1492, pids: ['1', '2', '3']
Current pid: 29072, pids: ['4', '5', '6']
Current pid: 1492, pids: ['7']
But Is There an Alternate Approach?
If each process does the processing for a single batch and then terminates (those details have not been provided), the main process can do all the batching and provide each batch to a multiprocessing pool:
from multiprocessing import Pool, current_process

file_path = 'test.txt'

def create_batch(config):
    batch_size = config["batch_size"]
    pids = []
    with open(file_path, "r") as f:
        for line in f:
            pids.append(line.strip('\n'))
            if len(pids) == batch_size:
                yield pids
                pids = []
    if pids:
        yield pids

def worker(pids):
    import time

    print(f'Current pid: {current_process().pid}, pids: {pids}', flush=True)
    time.sleep(.2)  # Give the other process a chance to run

def demo():
    pool = Pool(2)
    # Lazily evaluate the iterable using imap_unordered
    # as a memory-saving technique. Specify a chunksize if
    # the number of batches is huge:
    config = {'batch_size': 3}
    pool.imap_unordered(worker, create_batch(config))
    # Wait for all tasks to complete:
    pool.close()
    pool.join()

if __name__ == '__main__':
    demo()
Prints:
Current pid: 4316, pids: ['1', '2', '3']
Current pid: 13424, pids: ['4', '5', '6']
Current pid: 4316, pids: ['7']

Can't read input.txt file correctly using a multithreaded scraper in Python

I use the input file IG#Input.txt which contains:
qwdlab
qwdwe
The script is supposed to get all usernames from these Instagram hashtags, but it does not work when there is more than one hashtag in the input.txt file. What do I need to change to make it work correctly together with the multithreading?
The intended script workflow is:
Reading hashtags/lines from the input.txt file
Scraping them with multiple threads
Writing the output to the outputIG.txt file
Code:
import threading
from instaloader import Instaloader
import time

# Classes for threading
class LockedIterator(object):
    def __init__(self, it):
        self.lock = threading.Lock()
        self.it = it.__iter__()

    def __iter__(self):
        return self

    def __next__(self):
        self.lock.acquire()
        try:
            return self.it.__next__()
        finally:
            self.lock.release()

# input()
input = open("IG#Input.txt", "r", encoding='utf-8')
HASHTAG = input.read()
p = HASHTAG.split('\n')
PROFILE = p[:]

posts = Instaloader(sleep=False).get_hashtag_posts(HASHTAG)
posts = LockedIterator(posts)
output = open("outputIG.txt", "w+")

# Main
def worker():
    for ind in range(len(PROFILE)):
        pro = PROFILE[ind]
        try:
            for post in posts:
                print(post.owner_username)
                output.write(post.owner_username + '\n')
        except Exception as e:
            print(e)
            raise

# Start threading
threads = []
for i in range(4):
    t = threading.Thread(target=worker)
    threads.append(t)
    t.start()

for t in threads:
    t.join()

output.close()
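One likely problem in the code above is that HASHTAG = input.read() passes the entire file content (both hashtags plus the newline) to get_hashtag_posts() as a single string. A minimal sketch of one way to restructure it, using a work queue so each hashtag is scraped by exactly one thread (this only reuses the instaloader calls already shown above and is untested; names such as scrape_hashtags are illustrative):

import threading
import queue
from instaloader import Instaloader

def scrape_hashtags(input_path="IG#Input.txt", output_path="outputIG.txt", thread_count=4):
    # One hashtag per line; skip blank lines so a trailing newline is harmless.
    with open(input_path, encoding='utf-8') as f:
        hashtags = [line.strip() for line in f if line.strip()]

    work = queue.Queue()
    for tag in hashtags:
        work.put(tag)

    write_lock = threading.Lock()  # serialize writes to the shared output file
    out = open(output_path, "w", encoding='utf-8')

    def worker():
        loader = Instaloader(sleep=False)  # one instance per thread
        while True:
            try:
                tag = work.get_nowait()    # each hashtag goes to exactly one thread
            except queue.Empty:
                return
            try:
                for post in loader.get_hashtag_posts(tag):
                    with write_lock:
                        out.write(post.owner_username + '\n')
            except Exception as e:
                print(e)

    threads = [threading.Thread(target=worker) for _ in range(thread_count)]
    for t in threads:
        t.start()
    for t in threads:
        t.join()
    out.close()

if __name__ == '__main__':
    scrape_hashtags()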

Threads not stopping in Python

The purpose of my program is to download a file with threads. I define a unit size and use len/unit threads, where len is the length of the file that is going to be downloaded.
With my program the file does get downloaded, but the threads do not stop, and I can't find the reason why.
This is my code...
#! /usr/bin/python
import urllib2
import threading
import os
from time import ctime

class MyThread(threading.Thread):
    def __init__(self, func, args, name=''):
        threading.Thread.__init__(self)
        self.func = func
        self.args = args
        self.name = name

    def run(self):
        apply(self.func, self.args)

url = 'http://ubuntuone.com/1SHQeCAQWgIjUP2945hkZF'
request = urllib2.Request(url)
response = urllib2.urlopen(request)
meta = response.info()
response.close()

unit = 1000000
flen = int(meta.getheaders('Content-Length')[0])
print flen

if flen % unit == 0:
    bs = flen / unit
else:
    bs = flen / unit + 1

blocks = range(bs)
cnt = {}
for i in blocks:
    cnt[i] = i

def getStr(i):
    try:
        print 'Thread %d start.' % (i,)
        fout = open('a.zip', 'wb')
        fout.seek(i * unit, 0)
        if (i + 1) * unit > flen:
            request.add_header('Range', 'bytes=%d-%d' % (i * unit, flen - 1))
        else:
            request.add_header('Range', 'bytes=%d-%d' % (i * unit, (i + 1) * unit - 1))
        #opener = urllib2.build_opener()
        #buf = opener.open(request).read()
        resp = urllib2.urlopen(request)
        buf = resp.read()
        fout.write(buf)
    except BaseException:
        print 'Error'
    finally:
        #opener.close()
        fout.flush()
        fout.close()
        del cnt[i]
        # filelen = os.path.getsize('a.zip')
        print 'Thread %d ended.' % (i),
        print cnt
        # print 'progress : %4.2f' % (filelen * 100.0 / flen,), '%'

def main():
    print 'download at:', ctime()
    threads = []
    for i in blocks:
        t = MyThread(getStr, (blocks[i],), getStr.__name__)
        threads.append(t)
    for i in blocks:
        threads[i].start()
    for i in blocks:
        # print 'this is the %d thread;' % (i,)
        threads[i].join()
    #print 'size:', os.path.getsize('a.zip')
    print 'download done at:', ctime()

if __name__ == '__main__':
    main()
Could someone please help me understand why the threads aren't stopping?
I can't really address your code example because it is quite messy and hard to follow, but a potential reason the threads never end is that a request stalls and never finishes. urllib2 lets you specify a timeout for how long you will allow a request to take.
What I would recommend for your own code is that you split your work up into a queue, start a fixed number of threads (instead of a variable number), and let the worker threads pick up work until it is done. Give the HTTP requests a timeout. If the timeout expires, try again or put the work back into the queue.
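For the timeout part, a minimal sketch (Python 2, to match the code above; the URL and the 10-second value are placeholders):

import socket
import urllib2

request = urllib2.Request('http://example.com/somefile')
try:
    # If the server stalls for more than 10 seconds, this raises
    # instead of hanging forever.
    resp = urllib2.urlopen(request, timeout=10)
    buf = resp.read()
except (urllib2.URLError, socket.timeout) as e:
    # Retry here, or put the block back on the work queue.
    print 'request failed or timed out:', e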
Here is a generic example of how to use a queue, a fixed number of workers and a sync primitive between them:
import threading
import time
from Queue import Queue

def worker(queue, results, lock):
    local_results = []
    while True:
        val = queue.get()
        if val is None:
            break
        # pretend to do work
        time.sleep(.1)
        local_results.append(val)
    with lock:
        results.extend(local_results)
    print threading.current_thread().name, "Done!"

num_workers = 4
threads = []
queue = Queue()
lock = threading.Lock()
results = []

for i in xrange(100):
    queue.put(i)

for _ in xrange(num_workers):
    # Use None as a sentinel to signal the threads to end
    queue.put(None)
    t = threading.Thread(target=worker, args=(queue, results, lock))
    t.start()
    threads.append(t)

for t in threads:
    t.join()

print sorted(results)
print "All done"

How do I run os.walk in parallel in Python?

I wrote a simple app in Java that takes a list of paths and generates a file with all the file paths under that original list.
If I have paths.txt that has:
c:\folder1\
c:\folder2\
...
...
c:\folder1000\
My app runs the recursive traversal on each path using multiple threads, and produces a file with all the file paths under these folders.
Now I want to write this app in Python.
I've written a simple app that uses os.walk() to run through a given folder and print the filepaths to output.
Now I want to run it in parallel, and I've seen that Python has some modules for this:
threading and multiprocessing.
What is the best way to do this, and how is it done?
Here is a multiprocessing solution:
from multiprocessing.pool import Pool
from multiprocessing import JoinableQueue as Queue
import os

def explore_path(path):
    directories = []
    nondirectories = []
    for filename in os.listdir(path):
        fullname = os.path.join(path, filename)
        if os.path.isdir(fullname):
            directories.append(fullname)
        else:
            nondirectories.append(filename)
    outputfile = path.replace(os.sep, '_') + '.txt'
    with open(outputfile, 'w') as f:
        for filename in nondirectories:
            print(filename, file=f)
    return directories

def parallel_worker():
    while True:
        path = unsearched.get()
        dirs = explore_path(path)
        for newdir in dirs:
            unsearched.put(newdir)
        unsearched.task_done()

# acquire the list of paths
with open('paths.txt') as f:
    paths = f.read().split()

unsearched = Queue()
for path in paths:
    unsearched.put(path)

with Pool(5) as pool:
    for i in range(5):
        pool.apply_async(parallel_worker)
    unsearched.join()

print('Done')
This is a pattern for threads in Python that has been useful to me. I'm not sure whether threading will increase your performance, though, given the way threads work in CPython (the GIL).
import threading
import Queue
import os

class PathThread(threading.Thread):
    def __init__(self, queue):
        threading.Thread.__init__(self)
        self.queue = queue

    def printfiles(self, p):
        for path, dirs, files in os.walk(p):
            for f in files:
                print path + "/" + f

    def run(self):
        while True:
            path = self.queue.get()
            self.printfiles(path)
            self.queue.task_done()

# threadsafe queue
pathqueue = Queue.Queue()
paths = ["foo", "bar", "baz"]

# spawn threads
for i in range(0, 5):
    t = PathThread(pathqueue)
    t.setDaemon(True)
    t.start()

# add paths to queue
for path in paths:
    pathqueue.put(path)

# wait for queue to get empty
pathqueue.join()
Even threading can be pretty helpful for directory traversal. I use the following code for traversing a SharePoint tree, getting a pretty significant speedup with about 50 threads.
This particular program returns (path, data) pairs for all xml files in a directory structure, and can be simply expanded for your use.
(This is cut and pasted from my program; some additional editing is needed.)
#unique string for error passing error messages
ERROR = '\xffERROR\xff'

class ScanWorker(threading.Thread):
    """Worker class for scanning directory structures.
    pathQueue: queue for pathnames of directories
    resultQueue: results of processFile, pairs of (path, data) to be updated
    """
    lock = threading.Lock()
    dirCount = 0

    def __init__(self, pathQueue, resultQueue):
        self.pathQueue = pathQueue
        self.resultQueue = resultQueue
        super().__init__()

    def run(self):
        """Worker thread.
        Get a directory, process it, and put new directories on the
        queue."""
        try:
            while True:
                self.processDir(self.pathQueue.get())
                self.pathQueue.task_done()
        except Exception as e:
            #pass on exception to main thread
            description = traceback.format_exception(*sys.exc_info())
            description.insert(0,
                "Error in thread {}:\n".format(
                    threading.current_thread().name))
            self.resultQueue.put((ERROR, description))
            self.pathQueue.task_done()

    def processDir(self, top):
        """Visit a directory
        Call self.processFile on every file, and queue the directories.
        """
        #Wait and retry a few times in case of network errors.
        #SharePoint is not reliable, gives errors for no reason
        for retryCount in range(30):
            try:
                names = listdir(top)
                break
            except OSError as e:
                if e.errno in (2, 22):
                    lastError = e
                    print(end="L", flush=True)
                    time.sleep(1)
                else:
                    raise
        else:
            print("List: too many retries")
            raise lastError
        #it is not important to worry about race conditions here
        self.__class__.dirCount += 1
        #process contents
        for name in names:
            if isdir(join(top, name)): self.pathQueue.put(join(top, name))
            else: self.processFile(join(top, name))

    def processFile(self, path):
        """Get XML file.
        """
        #only xml files
        if not path.lower().endswith('.xml'): return
        filemtime = datetime.fromtimestamp(getmtime(path))
        #SharePoint is not reliable, gives errors for no reason; just retry
        for retryCount in range(30):
            try:
                data = open(path, 'rb').read()
                break
            except OSError as e:
                if e.errno in (2, 22):
                    lastError = e
                    print(end="R", flush=True)
                    time.sleep(1)
                else:
                    raise
        else:
            print("Read: too many retries")
            raise lastError
        self.resultQueue.put((path, data))

class Scanner:
    """Interface to the ScanWorkers
    Sharepoint is pretty fast compared to its delay and handles 50 workers well
    Make sure you only create one instance of Scanner!
    """
    def __init__(self, workers):
        #don't restrict the path queue length; this causes deadlock
        #we use a LIFO queue to get more depth-first like search
        #reducing average queue length and hopefully improving server caching
        self.pathQueue = queue.LifoQueue()
        #this is the output queue to the main thread
        self.resultQueue = queue.Queue(5)
        self.workers = workers
        #start workers
        for i in range(workers):
            t = ScanWorker(self.pathQueue, self.resultQueue)
            t.setDaemon(True)
            t.start()

    def startWorkers(self, path):
        #add counter
        self.added = 0
        #and go
        self.pathQueue.put(path)

    def processResult(self, wait=True):
        """Get an element from the result queue, and add to the zip file."""
        path, data = self.resultQueue.get(block=wait)
        if path == ERROR:
            #process gave alarm; stop scanning
            #pass on description
            raise ScanError(data)
        <do whatever you want to do with the file>
        self.resultQueue.task_done()
        self.added += 1

#main
try:
    #set up
    scanner = Scanner(threads)
    scanner.startWorkers(rootpath)
    pathQueue, resultQueue = scanner.pathQueue, scanner.resultQueue
    #scanner is rolling; wait for it to finish
    with pathQueue.all_tasks_done:
        while pathQueue.unfinished_tasks:
            #tasks are still running
            #process results
            while True:
                try: scanner.processResult(wait=False)
                except queue.Empty: break
            #no new files found; check if scanner is ready
            done = pathQueue.all_tasks_done.wait(timeout=1)
            if not done:
                #Not yet; print something while we wait
                print(
                    "\rProcessed {} files from {} directories [{} {}] "
                    .format(
                        scanner.added,
                        ScanWorker.dirCount,
                        pathQueue.unfinished_tasks,
                        resultQueue.unfinished_tasks,
                    ), end='\r')
    #just to make sure everybody is ready: join the path queue
    pathQueue.join()
    #process remaining of result queue
    while resultQueue.unfinished_tasks: scanner.processResult(wait=True)
    #go to new line to prevent overwriting progress messages
    print()
except ScanError as e:
    print()
    print(*e.args[0], end='')
    print("Process interrupted.")
except KeyboardInterrupt:
    print("\nProcess interrupted.")
    print()

multiprocessing module not spawning new processes

I am using the multiprocessing module in Python to spawn new processes, one for each year between 2000 and 2012. This was running successfully until last week. Now the code runs without throwing any errors and seems to spawn new processes, but it does not start them simultaneously. The machine I am running this on uses Ubuntu, has plenty of memory, and has 24 processors.
The processes seem to run sequentially instead of in parallel. There have been no code changes in the past 3 months, so I suspect it is an environment issue but am clueless about where to start debugging. Any suggestions?
Is it possible that some default kernel setting prevents simultaneous execution of code? Some Python setting?
Code:
class ForEachPerson(multiprocessing.Process):
    """This class contains the funcs for the main processing."""

    def __init__(self, year_queue, result_queue, dict_of_files, all, today):
        multiprocessing.Process.__init__(self)
        self.work_queue = year_queue
        self.result_queue = result_queue
        self.kill_received = False
        self.dict = dict_of_files
        self.all = all
        self.today = today

    def run(self):
        while not self.kill_received:
            try:
                year = self.work_queue.get_nowait()
                year_start_date = year[0]
                year_end_date = year[1]
                split = year_end_date.year
            except Queue.Empty:
                self.result_queue.close()
                return
            if self.all:
                try:
                    null_pids = self.dict["null_pids"]
                except KeyError:
                    null_pids = []
            #For each employee calculate the data and write to file.
            today = self.today
            hie = hie_util.Build()
            hie_op = open("output.csv", "wb")
            hierarchy_op.write("....\n")
            /* do function */
            ............
            hierarchy_op.close()
            timestr = ("%s End writing for %s"
                       % (str(datetime.datetime.now()), str(year)))
            self.result_queue.put(timestr)

def Manage(years, dict_of_files, num_processes, all, today):
    """Responsible for creating & assigning tasks to worker processes."""
    #load up year queue
    year_queue = multiprocessing.Queue()
    for year in years:
        year_queue.put(year)
    if num_processes > len(years):
        num_processes = len(years)
    # queue to pass to workers to store the results
    result_queue = multiprocessing.Queue()
    # spawn workers
    workers = []
    for i in range(num_processes):
        worker = ForEachPerson(year_queue, result_queue, dict_of_files, all, today)
        logging.info("Worker spawned for processor " + str(i + 1))
        worker.start()
        workers.append(worker)
    # collect results off the queue
    logging.info("results being collected")
    results = []
    while len(results) < len(years):
        try:
            result = result_queue.get()
            logging.info(str(result[0]))
            results.append(result[1])
        except Queue.Empty:
            pass
    count = 0
    for worker in workers:
        logging.info("Terminating worker: " + str(count))
        worker.terminate()
        count += 1
    return results

def RunHie():
    """Main control flow for building."""
    logging.info("Start ")
    sql_instance = hie_sql.SQLExportImport()
    sql_instance.RunEtl()
    # gather list of dates
    date_full_list = DailyDates()
    dict_of_files = ReadFiles()
    # calculate hierarchy - run
    num_processes = multiprocessing.cpu_count() - 1
    results = Manage(date_full_list, dict_of_files, num_processes, 0, today[1])
    logging.info("End")
